From: Mike Snitzer Date: Thu, 8 Feb 2024 21:55:29 +0000 (-0600) Subject: dm vdo: move indexer files into sub-directory X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=17b1a73feaf31bea77010e226c8e434cba1f95ea;p=linux.git dm vdo: move indexer files into sub-directory The goal is to assist high-level understanding of which code is conceptually specific to VDO's indexer. Signed-off-by: Mike Snitzer Signed-off-by: Matthew Sakai --- diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile index 32266ab04cc19..502a7a0acbdbb 100644 --- a/drivers/md/dm-vdo/Makefile +++ b/drivers/md/dm-vdo/Makefile @@ -1,50 +1,39 @@ # SPDX-License-Identifier: GPL-2.0-only +ccflags-y := -I$(srctree)/$(src) -I$(srctree)/$(src)/indexer + obj-$(CONFIG_DM_VDO) += dm-vdo.o dm-vdo-objs := \ action-manager.o \ admin-state.o \ block-map.o \ - chapter-index.o \ completion.o \ - config.o \ data-vio.o \ dedupe.o \ - delta-index.o \ dm-vdo-target.o \ dump.o \ encodings.o \ errors.o \ flush.o \ funnel-queue.o \ - funnel-requestqueue.o \ funnel-workqueue.o \ - geometry.o \ - index-layout.o \ - index.o \ - index-page-map.o \ - index-session.o \ int-map.o \ - io-factory.o \ io-submitter.o \ logger.o \ logical-zone.o \ memory-alloc.o \ message-stats.o \ murmurhash3.o \ - open-chapter.o \ packer.o \ permassert.o \ physical-zone.o \ pool-sysfs.o \ pool-sysfs-stats.o \ priority-table.o \ - radix-sort.o \ recovery-journal.o \ repair.o \ slab-depot.o \ - sparse-cache.o \ status-codes.o \ string-utils.o \ sysfs.o \ @@ -54,6 +43,19 @@ dm-vdo-objs := \ uds-sysfs.o \ vdo.o \ vio.o \ - volume-index.o \ - volume.o \ - wait-queue.o + wait-queue.o \ + indexer/chapter-index.o \ + indexer/config.o \ + indexer/delta-index.o \ + indexer/funnel-requestqueue.o \ + indexer/geometry.o \ + indexer/index.o \ + indexer/index-layout.o \ + indexer/index-page-map.o \ + indexer/index-session.o \ + indexer/io-factory.o \ + indexer/open-chapter.o \ + indexer/radix-sort.o \ + indexer/sparse-cache.o \ + indexer/volume.o \ + indexer/volume-index.o diff --git a/drivers/md/dm-vdo/chapter-index.c b/drivers/md/dm-vdo/chapter-index.c deleted file mode 100644 index 9b9185c2c2374..0000000000000 --- a/drivers/md/dm-vdo/chapter-index.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "chapter-index.h" - -#include "errors.h" -#include "hash-utils.h" -#include "indexer.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" - -int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, - const struct index_geometry *geometry, u64 volume_nonce) -{ - int result; - size_t memory_size; - struct open_chapter_index *index; - - result = uds_allocate(1, struct open_chapter_index, "open chapter index", &index); - if (result != UDS_SUCCESS) - return result; - - /* - * The delta index will rebalance delta lists when memory gets tight, - * so give the chapter index one extra page. 
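- * (With a hypothetical geometry of 6 index pages per chapter and 4096-byte
- * pages, this is (6 + 1) * 4096 = 28672 bytes rather than 24576.)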
- */ - memory_size = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page); - index->geometry = geometry; - index->volume_nonce = volume_nonce; - result = uds_initialize_delta_index(&index->delta_index, 1, - geometry->delta_lists_per_chapter, - geometry->chapter_mean_delta, - geometry->chapter_payload_bits, - memory_size, 'm'); - if (result != UDS_SUCCESS) { - uds_free(index); - return result; - } - - index->memory_size = index->delta_index.memory_size + sizeof(struct open_chapter_index); - *chapter_index = index; - return UDS_SUCCESS; -} - -void uds_free_open_chapter_index(struct open_chapter_index *chapter_index) -{ - if (chapter_index == NULL) - return; - - uds_uninitialize_delta_index(&chapter_index->delta_index); - uds_free(chapter_index); -} - -/* Re-initialize an open chapter index for a new chapter. */ -void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, - u64 virtual_chapter_number) -{ - uds_reset_delta_index(&chapter_index->delta_index); - chapter_index->virtual_chapter_number = virtual_chapter_number; -} - -static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address) -{ - return (!entry->at_end) && (entry->key == address); -} - -/* Associate a record name with the record page containing its metadata. */ -int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, - const struct uds_record_name *name, - u32 page_number) -{ - int result; - struct delta_index_entry entry; - u32 address; - u32 list_number; - const u8 *found_name; - bool found; - const struct index_geometry *geometry = chapter_index->geometry; - u64 chapter_number = chapter_index->virtual_chapter_number; - u32 record_pages = geometry->record_pages_per_chapter; - - result = ASSERT(page_number < record_pages, - "Page number within chapter (%u) exceeds the maximum value %u", - page_number, record_pages); - if (result != UDS_SUCCESS) - return UDS_INVALID_ARGUMENT; - - address = uds_hash_to_chapter_delta_address(name, geometry); - list_number = uds_hash_to_chapter_delta_list(name, geometry); - result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number, - address, name->name, &entry); - if (result != UDS_SUCCESS) - return result; - - found = was_entry_found(&entry, address); - result = ASSERT(!(found && entry.is_collision), - "Chunk appears more than once in chapter %llu", - (unsigned long long) chapter_number); - if (result != UDS_SUCCESS) - return UDS_BAD_STATE; - - found_name = (found ? name->name : NULL); - return uds_put_delta_index_entry(&entry, address, page_number, found_name); -} - -/* - * Pack a section of an open chapter index into a chapter index page. A range of delta lists - * (starting with a specified list index) is copied from the open chapter index into a memory page. - * The number of lists copied onto the page is returned to the caller on success. 
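- *
- * A caller packing a whole chapter might loop roughly like this (a sketch only, not the actual
- * caller code; the variable names are hypothetical, and per-page buffers and error handling are
- * omitted):
- *
- *   const struct index_geometry *geometry = chapter_index->geometry;
- *   u32 page, packed, first = 0;
- *
- *   for (page = 0; page < geometry->index_pages_per_chapter; page++) {
- *       bool last = (page == geometry->index_pages_per_chapter - 1);
- *
- *       uds_pack_open_chapter_index_page(chapter_index, page_memory, first, last, &packed);
- *       first += packed;
- *   }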
- * - * @chapter_index: The open chapter index - * @memory: The memory page to use - * @first_list: The first delta list number to be copied - * @last_page: If true, this is the last page of the chapter index and all the remaining lists must - * be packed onto this page - * @lists_packed: The number of delta lists that were packed onto this page - */ -int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, - u8 *memory, u32 first_list, bool last_page, - u32 *lists_packed) -{ - int result; - struct delta_index *delta_index = &chapter_index->delta_index; - struct delta_index_stats stats; - u64 nonce = chapter_index->volume_nonce; - u64 chapter_number = chapter_index->virtual_chapter_number; - const struct index_geometry *geometry = chapter_index->geometry; - u32 list_count = geometry->delta_lists_per_chapter; - unsigned int removals = 0; - struct delta_index_entry entry; - u32 next_list; - s32 list_number; - - for (;;) { - result = uds_pack_delta_index_page(delta_index, nonce, memory, - geometry->bytes_per_page, - chapter_number, first_list, - lists_packed); - if (result != UDS_SUCCESS) - return result; - - if ((first_list + *lists_packed) == list_count) { - /* All lists are packed. */ - break; - } else if (*lists_packed == 0) { - /* - * The next delta list does not fit on a page. This delta list will be - * removed. - */ - } else if (last_page) { - /* - * This is the last page and there are lists left unpacked, but all of the - * remaining lists must fit on the page. Find a list that contains entries - * and remove the entire list. Try the first list that does not fit. If it - * is empty, we will select the last list that already fits and has any - * entries. - */ - } else { - /* This page is done. */ - break; - } - - if (removals == 0) { - uds_get_delta_index_stats(delta_index, &stats); - uds_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions", - (unsigned long long) chapter_number, - (unsigned long long) stats.record_count, - (unsigned long long) stats.collision_count); - } - - list_number = *lists_packed; - do { - if (list_number < 0) - return UDS_OVERFLOW; - - next_list = first_list + list_number--, - result = uds_start_delta_index_search(delta_index, next_list, 0, - &entry); - if (result != UDS_SUCCESS) - return result; - - result = uds_next_delta_index_entry(&entry); - if (result != UDS_SUCCESS) - return result; - } while (entry.at_end); - - do { - result = uds_remove_delta_index_entry(&entry); - if (result != UDS_SUCCESS) - return result; - - removals++; - } while (!entry.at_end); - } - - if (removals > 0) { - uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", - (unsigned long long) chapter_number, removals); - } - - return UDS_SUCCESS; -} - -/* Make a new chapter index page, initializing it with the data from a given index_page buffer. */ -int uds_initialize_chapter_index_page(struct delta_index_page *index_page, - const struct index_geometry *geometry, - u8 *page_buffer, u64 volume_nonce) -{ - return uds_initialize_delta_index_page(index_page, volume_nonce, - geometry->chapter_mean_delta, - geometry->chapter_payload_bits, - page_buffer, geometry->bytes_per_page); -} - -/* Validate a chapter index page read during rebuild. 
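- * This walks every delta list on the page and checks that each entry's record page number is
- * within geometry->record_pages_per_chapter; it returns UDS_CORRUPT_DATA (deliberately not
- * logged as an error) when the page holds data from a volume region not yet written.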
*/ -int uds_validate_chapter_index_page(const struct delta_index_page *index_page, - const struct index_geometry *geometry) -{ - int result; - const struct delta_index *delta_index = &index_page->delta_index; - u32 first = index_page->lowest_list_number; - u32 last = index_page->highest_list_number; - u32 list_number; - - /* We walk every delta list from start to finish. */ - for (list_number = first; list_number <= last; list_number++) { - struct delta_index_entry entry; - - result = uds_start_delta_index_search(delta_index, list_number - first, - 0, &entry); - if (result != UDS_SUCCESS) - return result; - - for (;;) { - result = uds_next_delta_index_entry(&entry); - if (result != UDS_SUCCESS) { - /* - * A random bit stream is highly likely to arrive here when we go - * past the end of the delta list. - */ - return result; - } - - if (entry.at_end) - break; - - /* Also make sure that the record page field contains a plausible value. */ - if (uds_get_delta_entry_value(&entry) >= - geometry->record_pages_per_chapter) { - /* - * Do not log this as an error. It happens in normal operation when - * we are doing a rebuild but haven't written the entire volume - * once. - */ - return UDS_CORRUPT_DATA; - } - } - } - return UDS_SUCCESS; -} - -/* - * Search a chapter index page for a record name, returning the record page number that may contain - * the name. - */ -int uds_search_chapter_index_page(struct delta_index_page *index_page, - const struct index_geometry *geometry, - const struct uds_record_name *name, - u16 *record_page_ptr) -{ - int result; - struct delta_index *delta_index = &index_page->delta_index; - u32 address = uds_hash_to_chapter_delta_address(name, geometry); - u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry); - u32 sub_list_number = delta_list_number - index_page->lowest_list_number; - struct delta_index_entry entry; - - result = uds_get_delta_index_entry(delta_index, sub_list_number, address, - name->name, &entry); - if (result != UDS_SUCCESS) - return result; - - if (was_entry_found(&entry, address)) - *record_page_ptr = uds_get_delta_entry_value(&entry); - else - *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; - - return UDS_SUCCESS; -} diff --git a/drivers/md/dm-vdo/chapter-index.h b/drivers/md/dm-vdo/chapter-index.h deleted file mode 100644 index be8bf2b675b1c..0000000000000 --- a/drivers/md/dm-vdo/chapter-index.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_CHAPTER_INDEX_H -#define UDS_CHAPTER_INDEX_H - -#include - -#include "delta-index.h" -#include "geometry.h" - -/* - * A chapter index for an open chapter is a mutable structure that tracks all the records that have - * been added to the chapter. A chapter index for a closed chapter is similar except that it is - * immutable because the contents of a closed chapter can never change, and the immutable structure - * is more efficient. Both types of chapter index are implemented with a delta index. - */ - -/* The value returned when no entry is found in the chapter index. 
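- * Callers of uds_search_chapter_index_page() compare the returned record page number against
- * this sentinel; a result of NO_CHAPTER_INDEX_ENTRY means the name is not in the chapter.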
*/ -#define NO_CHAPTER_INDEX_ENTRY U16_MAX - -struct open_chapter_index { - const struct index_geometry *geometry; - struct delta_index delta_index; - u64 virtual_chapter_number; - u64 volume_nonce; - size_t memory_size; -}; - -int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index, - const struct index_geometry *geometry, - u64 volume_nonce); - -void uds_free_open_chapter_index(struct open_chapter_index *chapter_index); - -void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, - u64 virtual_chapter_number); - -int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, - const struct uds_record_name *name, - u32 page_number); - -int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, - u8 *memory, u32 first_list, - bool last_page, u32 *lists_packed); - -int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page, - const struct index_geometry *geometry, - u8 *page_buffer, u64 volume_nonce); - -int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page, - const struct index_geometry *geometry); - -int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page, - const struct index_geometry *geometry, - const struct uds_record_name *name, - u16 *record_page_ptr); - -#endif /* UDS_CHAPTER_INDEX_H */ diff --git a/drivers/md/dm-vdo/config.c b/drivers/md/dm-vdo/config.c deleted file mode 100644 index 0bf315e7b5d13..0000000000000 --- a/drivers/md/dm-vdo/config.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "config.h" - -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "string-utils.h" -#include "thread-utils.h" - -static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC"; -static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02"; -static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02"; - -enum { - DEFAULT_VOLUME_READ_THREADS = 2, - MAX_VOLUME_READ_THREADS = 16, - INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, - INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1, -}; - -static bool is_version(const u8 *version, u8 *buffer) -{ - return memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0; -} - -static bool are_matching_configurations(struct uds_configuration *saved_config, - struct index_geometry *saved_geometry, - struct uds_configuration *user) -{ - struct index_geometry *geometry = user->geometry; - bool result = true; - - if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) { - uds_log_error("Record pages per chapter (%u) does not match (%u)", - saved_geometry->record_pages_per_chapter, - geometry->record_pages_per_chapter); - result = false; - } - - if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) { - uds_log_error("Chapter count (%u) does not match (%u)", - saved_geometry->chapters_per_volume, - geometry->chapters_per_volume); - result = false; - } - - if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) { - uds_log_error("Sparse chapter count (%u) does not match (%u)", - saved_geometry->sparse_chapters_per_volume, - geometry->sparse_chapters_per_volume); - result = false; - } - - if (saved_config->cache_chapters != user->cache_chapters) { - uds_log_error("Cache size (%u) does not match (%u)", - saved_config->cache_chapters, user->cache_chapters); - result = false; - } - - if 
(saved_config->volume_index_mean_delta != user->volume_index_mean_delta) { - uds_log_error("Volume index mean delta (%u) does not match (%u)", - saved_config->volume_index_mean_delta, - user->volume_index_mean_delta); - result = false; - } - - if (saved_geometry->bytes_per_page != geometry->bytes_per_page) { - uds_log_error("Bytes per page value (%zu) does not match (%zu)", - saved_geometry->bytes_per_page, geometry->bytes_per_page); - result = false; - } - - if (saved_config->sparse_sample_rate != user->sparse_sample_rate) { - uds_log_error("Sparse sample rate (%u) does not match (%u)", - saved_config->sparse_sample_rate, - user->sparse_sample_rate); - result = false; - } - - if (saved_config->nonce != user->nonce) { - uds_log_error("Nonce (%llu) does not match (%llu)", - (unsigned long long) saved_config->nonce, - (unsigned long long) user->nonce); - result = false; - } - - return result; -} - -/* Read the configuration and validate it against the provided one. */ -int uds_validate_config_contents(struct buffered_reader *reader, - struct uds_configuration *user_config) -{ - int result; - struct uds_configuration config; - struct index_geometry geometry; - u8 version_buffer[INDEX_CONFIG_VERSION_LENGTH]; - u32 bytes_per_page; - u8 buffer[sizeof(struct uds_configuration_6_02)]; - size_t offset = 0; - - result = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) - return result; - - result = uds_read_from_buffered_reader(reader, version_buffer, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read index config version"); - - if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) && - !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "unsupported configuration version: '%.*s'", - INDEX_CONFIG_VERSION_LENGTH, - version_buffer); - } - - result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read config data"); - - decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter); - decode_u32_le(buffer, &offset, &geometry.chapters_per_volume); - decode_u32_le(buffer, &offset, &geometry.sparse_chapters_per_volume); - decode_u32_le(buffer, &offset, &config.cache_chapters); - offset += sizeof(u32); - decode_u32_le(buffer, &offset, &config.volume_index_mean_delta); - decode_u32_le(buffer, &offset, &bytes_per_page); - geometry.bytes_per_page = bytes_per_page; - decode_u32_le(buffer, &offset, &config.sparse_sample_rate); - decode_u64_le(buffer, &offset, &config.nonce); - - result = ASSERT(offset == sizeof(struct uds_configuration_6_02), - "%zu bytes read but not decoded", - sizeof(struct uds_configuration_6_02) - offset); - if (result != UDS_SUCCESS) - return UDS_CORRUPT_DATA; - - if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) { - user_config->geometry->remapped_virtual = 0; - user_config->geometry->remapped_physical = 0; - } else { - u8 remapping[sizeof(u64) + sizeof(u64)]; - - result = uds_read_from_buffered_reader(reader, remapping, - sizeof(remapping)); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read converted config"); - - offset = 0; - decode_u64_le(remapping, &offset, - &user_config->geometry->remapped_virtual); - decode_u64_le(remapping, &offset, - &user_config->geometry->remapped_physical); - } - - if (!are_matching_configurations(&config, &geometry, user_config)) { - 
uds_log_warning("Supplied configuration does not match save"); - return UDS_NO_INDEX; - } - - return UDS_SUCCESS; -} - -/* - * Write the configuration to stable storage. If the superblock version is < 4, write the 6.02 - * version; otherwise write the 8.02 version, indicating the configuration is for an index that has - * been reduced by one chapter. - */ -int uds_write_config_contents(struct buffered_writer *writer, - struct uds_configuration *config, u32 version) -{ - int result; - struct index_geometry *geometry = config->geometry; - u8 buffer[sizeof(struct uds_configuration_8_02)]; - size_t offset = 0; - - result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) - return result; - - /* - * If version is < 4, the index has not been reduced by a chapter so it must be written out - * as version 6.02 so that it is still compatible with older versions of UDS. - */ - if (version >= 4) { - result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_8_02, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) - return result; - } else { - result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_6_02, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) - return result; - } - - encode_u32_le(buffer, &offset, geometry->record_pages_per_chapter); - encode_u32_le(buffer, &offset, geometry->chapters_per_volume); - encode_u32_le(buffer, &offset, geometry->sparse_chapters_per_volume); - encode_u32_le(buffer, &offset, config->cache_chapters); - encode_u32_le(buffer, &offset, 0); - encode_u32_le(buffer, &offset, config->volume_index_mean_delta); - encode_u32_le(buffer, &offset, geometry->bytes_per_page); - encode_u32_le(buffer, &offset, config->sparse_sample_rate); - encode_u64_le(buffer, &offset, config->nonce); - - result = ASSERT(offset == sizeof(struct uds_configuration_6_02), - "%zu bytes encoded, of %zu expected", offset, - sizeof(struct uds_configuration_6_02)); - if (result != UDS_SUCCESS) - return result; - - if (version >= 4) { - encode_u64_le(buffer, &offset, geometry->remapped_virtual); - encode_u64_le(buffer, &offset, geometry->remapped_physical); - } - - return uds_write_to_buffered_writer(writer, buffer, offset); -} - -/* Compute configuration parameters that depend on memory size. 
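- * Worked example (assuming the default of 1024 chapters per volume from geometry.h, which is
- * not shown here): mem_gb == 2 with sparse == true gives base_chapters = 2048, so
- * sparse_chapters_per_volume = (19 * 2048) / 2 = 19456 and
- * chapters_per_volume = 2048 * 10 = 20480, i.e. 95% of the chapters are sparse.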
*/ -static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse, - u32 *chapters_per_volume, u32 *record_pages_per_chapter, - u32 *sparse_chapters_per_volume) -{ - u32 reduced_chapters = 0; - u32 base_chapters; - - if (mem_gb == UDS_MEMORY_CONFIG_256MB) { - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) { - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) { - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) { - base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) { - reduced_chapters = 1; - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) { - reduced_chapters = 1; - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) { - reduced_chapters = 1; - base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; - *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) && - (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) { - reduced_chapters = 1; - base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) * - DEFAULT_CHAPTERS_PER_VOLUME); - *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else { - uds_log_error("received invalid memory size"); - return -EINVAL; - } - - if (sparse) { - /* Make 95% of chapters sparse, allowing 10x more records. */ - *sparse_chapters_per_volume = (19 * base_chapters) / 2; - base_chapters *= 10; - } else { - *sparse_chapters_per_volume = 0; - } - - *chapters_per_volume = base_chapters - reduced_chapters; - return UDS_SUCCESS; -} - -static unsigned int __must_check normalize_zone_count(unsigned int requested) -{ - unsigned int zone_count = requested; - - if (zone_count == 0) - zone_count = num_online_cpus() / 2; - - if (zone_count < 1) - zone_count = 1; - - if (zone_count > MAX_ZONES) - zone_count = MAX_ZONES; - - uds_log_info("Using %u indexing zone%s for concurrency.", - zone_count, zone_count == 1 ? 
"" : "s"); - return zone_count; -} - -static unsigned int __must_check normalize_read_threads(unsigned int requested) -{ - unsigned int read_threads = requested; - - if (read_threads < 1) - read_threads = DEFAULT_VOLUME_READ_THREADS; - - if (read_threads > MAX_VOLUME_READ_THREADS) - read_threads = MAX_VOLUME_READ_THREADS; - - return read_threads; -} - -int uds_make_configuration(const struct uds_parameters *params, - struct uds_configuration **config_ptr) -{ - struct uds_configuration *config; - u32 chapters_per_volume = 0; - u32 record_pages_per_chapter = 0; - u32 sparse_chapters_per_volume = 0; - int result; - - result = compute_memory_sizes(params->memory_size, params->sparse, - &chapters_per_volume, &record_pages_per_chapter, - &sparse_chapters_per_volume); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(1, struct uds_configuration, __func__, &config); - if (result != UDS_SUCCESS) - return result; - - result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter, - chapters_per_volume, sparse_chapters_per_volume, - 0, 0, &config->geometry); - if (result != UDS_SUCCESS) { - uds_free_configuration(config); - return result; - } - - config->zone_count = normalize_zone_count(params->zone_count); - config->read_threads = normalize_read_threads(params->read_threads); - - config->cache_chapters = DEFAULT_CACHE_CHAPTERS; - config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA; - config->sparse_sample_rate = (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0); - config->nonce = params->nonce; - config->bdev = params->bdev; - config->offset = params->offset; - config->size = params->size; - - *config_ptr = config; - return UDS_SUCCESS; -} - -void uds_free_configuration(struct uds_configuration *config) -{ - if (config != NULL) { - uds_free_index_geometry(config->geometry); - uds_free(config); - } -} - -void uds_log_configuration(struct uds_configuration *config) -{ - struct index_geometry *geometry = config->geometry; - - uds_log_debug("Configuration:"); - uds_log_debug(" Record pages per chapter: %10u", geometry->record_pages_per_chapter); - uds_log_debug(" Chapters per volume: %10u", geometry->chapters_per_volume); - uds_log_debug(" Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume); - uds_log_debug(" Cache size (chapters): %10u", config->cache_chapters); - uds_log_debug(" Volume index mean delta: %10u", config->volume_index_mean_delta); - uds_log_debug(" Bytes per page: %10zu", geometry->bytes_per_page); - uds_log_debug(" Sparse sample rate: %10u", config->sparse_sample_rate); - uds_log_debug(" Nonce: %llu", (unsigned long long) config->nonce); -} diff --git a/drivers/md/dm-vdo/config.h b/drivers/md/dm-vdo/config.h deleted file mode 100644 index 08507dc2f7a14..0000000000000 --- a/drivers/md/dm-vdo/config.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_CONFIG_H -#define UDS_CONFIG_H - -#include "geometry.h" -#include "indexer.h" -#include "io-factory.h" - -/* - * The uds_configuration records a variety of parameters used to configure a new UDS index. Some - * parameters are provided by the client, while others are fixed or derived from user-supplied - * values. It is created when an index is created, and it is recorded in the index metadata. - */ - -enum { - DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096, - DEFAULT_CACHE_CHAPTERS = 7, - DEFAULT_SPARSE_SAMPLE_RATE = 32, - MAX_ZONES = 16, -}; - -/* A set of configuration parameters for the indexer. 
*/ -struct uds_configuration { - /* Storage device for the index */ - struct block_device *bdev; - - /* The maximum allowable size of the index */ - size_t size; - - /* The offset where the index should start */ - off_t offset; - - /* Parameters for the volume */ - - /* The volume layout */ - struct index_geometry *geometry; - - /* Index owner's nonce */ - u64 nonce; - - /* The number of threads used to process index requests */ - unsigned int zone_count; - - /* The number of threads used to read volume pages */ - unsigned int read_threads; - - /* Size of the page cache and sparse chapter index cache in chapters */ - u32 cache_chapters; - - /* Parameters for the volume index */ - - /* The mean delta for the volume index */ - u32 volume_index_mean_delta; - - /* Sampling rate for sparse indexing */ - u32 sparse_sample_rate; -}; - -/* On-disk structure of data for a version 8.02 index. */ -struct uds_configuration_8_02 { - /* Smaller (16), Small (64) or large (256) indices */ - u32 record_pages_per_chapter; - /* Total number of chapters per volume */ - u32 chapters_per_volume; - /* Number of sparse chapters per volume */ - u32 sparse_chapters_per_volume; - /* Size of the page cache, in chapters */ - u32 cache_chapters; - /* Unused field */ - u32 unused; - /* The volume index mean delta to use */ - u32 volume_index_mean_delta; - /* Size of a page, used for both record pages and index pages */ - u32 bytes_per_page; - /* Sampling rate for sparse indexing */ - u32 sparse_sample_rate; - /* Index owner's nonce */ - u64 nonce; - /* Virtual chapter remapped from physical chapter 0 */ - u64 remapped_virtual; - /* New physical chapter which remapped chapter was moved to */ - u64 remapped_physical; -} __packed; - -/* On-disk structure of data for a version 6.02 index. 
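- * This is the 8.02 layout above minus the two remapping fields: eight u32 values followed by a
- * u64 nonce, so 40 packed bytes on disk where the 8.02 format uses 56.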
*/ -struct uds_configuration_6_02 { - /* Smaller (16), Small (64) or large (256) indices */ - u32 record_pages_per_chapter; - /* Total number of chapters per volume */ - u32 chapters_per_volume; - /* Number of sparse chapters per volume */ - u32 sparse_chapters_per_volume; - /* Size of the page cache, in chapters */ - u32 cache_chapters; - /* Unused field */ - u32 unused; - /* The volume index mean delta to use */ - u32 volume_index_mean_delta; - /* Size of a page, used for both record pages and index pages */ - u32 bytes_per_page; - /* Sampling rate for sparse indexing */ - u32 sparse_sample_rate; - /* Index owner's nonce */ - u64 nonce; -} __packed; - -int __must_check uds_make_configuration(const struct uds_parameters *params, - struct uds_configuration **config_ptr); - -void uds_free_configuration(struct uds_configuration *config); - -int __must_check uds_validate_config_contents(struct buffered_reader *reader, - struct uds_configuration *config); - -int __must_check uds_write_config_contents(struct buffered_writer *writer, - struct uds_configuration *config, u32 version); - -void uds_log_configuration(struct uds_configuration *config); - -#endif /* UDS_CONFIG_H */ diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h index e7729623a6bb5..44fd0d8ccb769 100644 --- a/drivers/md/dm-vdo/data-vio.h +++ b/drivers/md/dm-vdo/data-vio.h @@ -10,9 +10,10 @@ #include #include -#include "indexer.h" #include "permassert.h" +#include "indexer.h" + #include "block-map.h" #include "completion.h" #include "constants.h" diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 942a50ef8b0d8..9468d7fad4435 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -126,13 +126,14 @@ #include #include -#include "indexer.h" #include "logger.h" #include "memory-alloc.h" #include "numeric.h" #include "permassert.h" #include "string-utils.h" +#include "indexer.h" + #include "action-manager.h" #include "admin-state.h" #include "completion.h" diff --git a/drivers/md/dm-vdo/delta-index.c b/drivers/md/dm-vdo/delta-index.c deleted file mode 100644 index 66f51b5f8fd21..0000000000000 --- a/drivers/md/dm-vdo/delta-index.c +++ /dev/null @@ -1,1987 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ -#include "delta-index.h" - -#include -#include -#include -#include -#include - -#include "config.h" -#include "cpu.h" -#include "errors.h" -#include "indexer.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" -#include "string-utils.h" -#include "time-utils.h" - -/* - * The entries in a delta index could be stored in a single delta list, but to reduce search times - * and update costs it uses multiple delta lists. These lists are stored in a single chunk of - * memory managed by the delta_zone structure. The delta_zone can move the data around within its - * memory, so the location of each delta list is recorded as a bit offset into the memory. Because - * the volume index can contain over a million delta lists, we want to be efficient with the size - * of the delta list header information. This information is encoded into 16 bytes per list. The - * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to - * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are - * sufficient to store the size of a delta list. - * - * Each delta list is stored as a bit stream. 
Within the delta list encoding, bits and bytes are
- * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
- * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
- * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
- * number corresponds to its index in the array.
- *
- * A standard delta list entry is stored as a fixed length payload (the value) followed by a
- * variable length key (the delta). A collision entry is used when two block names have the same
- * delta list address. A collision entry always follows a standard entry for the hash with which it
- * collides, and is encoded with DELTA == 0 with an additional 256-bit field at the end, containing
- * the full block name. An entry with a delta of 0 at the beginning of a delta list indicates a
- * normal entry.
- *
- * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
- * used by small deltas. The Huffman code is specified by three parameters, which can be computed
- * from the desired mean delta when the index is full. (See compute_coding_constants() for
- * details.)
- *
- * The bit field utilities used to read and write delta entries assume that it is possible to read
- * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
- * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
- * numbered 1 to N, and the guard lists are numbered 0 and N+1. The functions that decode the bit
- * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
- * delta list could cause this step to run off the end of the delta_zone memory, so as extra
- * protection against this happening, the tail guard list is set to all ones.
- *
- * The delta_index supports two different forms. The mutable form is created by
- * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
- * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
- * cached) chapter index pages. The immutable form does not allocate delta list headers or
- * temporary offsets, and thus is somewhat more memory efficient.
- */
-
-/*
- * This is the largest field size supported by get_field() and set_field(). Any field that is
- * larger is not guaranteed to fit in a single byte-aligned u32.
- */
-enum {
-	MAX_FIELD_BITS = (sizeof(u32) - 1) * BITS_PER_BYTE + 1,
-};
-
-/*
- * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
- * is larger is not guaranteed to fit in a single byte-aligned u64.
- */
-enum {
-	MAX_BIG_FIELD_BITS = (sizeof(u64) - 1) * BITS_PER_BYTE + 1,
-};
-
-/*
- * This is the number of guard bytes needed at the end of the memory byte array when using the bit
- * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
- * bytes beyond the end of the desired field. The definition is written to make it clear how this
- * value is derived.
- */
-enum {
-	POST_FIELD_GUARD_BYTES = sizeof(u64) - 1,
-};
-
-/* The number of guard bits that are needed in the tail guard list */
-enum {
-	GUARD_BITS = POST_FIELD_GUARD_BYTES * BITS_PER_BYTE
-};
-
-/*
- * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
- * buffer of this size can be used with move_bits().
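- * For U16_MAX-bit delta lists and 7 guard bytes, the definition below evaluates to
- * (65535 + 8) / 8 + 7 = 8199 bytes.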
- */ -enum { - DELTA_LIST_MAX_BYTE_COUNT = - ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES) -}; - -/* The number of extra bytes and bits needed to store a collision entry */ -enum { - COLLISION_BYTES = UDS_RECORD_NAME_SIZE, - COLLISION_BITS = COLLISION_BYTES * BITS_PER_BYTE -}; - -/* - * Immutable delta lists are packed into pages containing a header that encodes the delta list - * information into 19 bits per list (64KB bit offset). - */ - -enum { IMMUTABLE_HEADER_SIZE = 19 }; - -/* - * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a - * number to increment when the format of the data changes. - */ - -enum { - MAGIC_SIZE = 8, -}; - -static const char DELTA_INDEX_MAGIC[] = "DI-00002"; - -struct delta_index_header { - char magic[MAGIC_SIZE]; - u32 zone_number; - u32 zone_count; - u32 first_list; - u32 list_count; - u64 record_count; - u64 collision_count; -}; - -/* - * Header data used for immutable delta index pages. This data is followed by the delta list offset - * table. - */ -struct delta_page_header { - /* Externally-defined nonce */ - u64 nonce; - /* The virtual chapter number */ - u64 virtual_chapter_number; - /* Index of the first delta list on the page */ - u16 first_list; - /* Number of delta lists on the page */ - u16 list_count; -} __packed; - -static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list) -{ - return delta_list->start / BITS_PER_BYTE; -} - -static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list) -{ - unsigned int bit_offset = delta_list->start % BITS_PER_BYTE; - - return BITS_TO_BYTES(bit_offset + delta_list->size); -} - -static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first, - u32 last) -{ - struct delta_list *delta_list; - u64 new_start; - - if (first == last) { - /* Only one list is moving, and we know there is space. */ - delta_list = &delta_zone->delta_lists[first]; - new_start = delta_zone->new_offsets[first]; - if (delta_list->start != new_start) { - u64 source; - u64 destination; - - source = get_delta_list_byte_start(delta_list); - delta_list->start = new_start; - destination = get_delta_list_byte_start(delta_list); - memmove(delta_zone->memory + destination, - delta_zone->memory + source, - get_delta_list_byte_size(delta_list)); - } - } else { - /* - * There is more than one list. Divide the problem in half, and use recursive calls - * to process each half. Note that after this computation, first <= middle, and - * middle < last. - */ - u32 middle = (first + last) / 2; - - delta_list = &delta_zone->delta_lists[middle]; - new_start = delta_zone->new_offsets[middle]; - - /* - * The direction that our middle list is moving determines which half of the - * problem must be processed first. - */ - if (new_start > delta_list->start) { - rebalance_delta_zone(delta_zone, middle + 1, last); - rebalance_delta_zone(delta_zone, first, middle); - } else { - rebalance_delta_zone(delta_zone, first, middle); - rebalance_delta_zone(delta_zone, middle + 1, last); - } - } -} - -static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size) -{ - /* Round up so that each zone is a multiple of 64K in size. */ - enum { - ALLOC_BOUNDARY = 64 * 1024, - }; - - return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; -} - -void uds_reset_delta_index(const struct delta_index *delta_index) -{ - unsigned int z; - - /* - * Initialize all delta lists to be empty. 
We keep 2 extra delta list descriptors, one - * before the first real entry and one after so that we don't need to bounds check the - * array access when calculating preceding and following gap sizes. - */ - for (z = 0; z < delta_index->zone_count; z++) { - u64 list_bits; - u64 spacing; - u64 offset; - unsigned int i; - struct delta_zone *zone = &delta_index->delta_zones[z]; - struct delta_list *delta_lists = zone->delta_lists; - - /* Zeroing the delta list headers initializes the head guard list correctly. */ - memset(delta_lists, 0, - (zone->list_count + 2) * sizeof(struct delta_list)); - - /* Set all the bits in the end guard list. */ - list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS; - delta_lists[zone->list_count + 1].start = list_bits; - delta_lists[zone->list_count + 1].size = GUARD_BITS; - memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0, - POST_FIELD_GUARD_BYTES); - - /* Evenly space out the real delta lists by setting regular offsets. */ - spacing = list_bits / zone->list_count; - offset = spacing / 2; - for (i = 1; i <= zone->list_count; i++) { - delta_lists[i].start = offset; - offset += spacing; - } - - /* Update the statistics. */ - zone->discard_count += zone->record_count; - zone->record_count = 0; - zone->collision_count = 0; - } -} - -/* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by - * three parameters: - * - * MINBITS The number of bits in the smallest code - * BASE The number of values coded using a code of length MINBITS - * INCR The number of values coded by using one additional bit - * - * These parameters are related by this equation: - * - * BASE + INCR == 1 << MINBITS - * - * The math for the Huffman code of an exponential distribution says that - * - * INCR = log(2) * MEAN_DELTA - * - * Then use the smallest MINBITS value so that - * - * (1 << MINBITS) > INCR - * - * And then - * - * BASE = (1 << MINBITS) - INCR - * - * Now the index can generate a code such that - * - The first BASE values code using MINBITS bits. - * - The next INCR values code using MINBITS+1 bits. - * - The next INCR values code using MINBITS+2 bits. - * - (and so on). - */ -static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys) -{ - /* - * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use - * floating point, use a really good integer approximation. - */ - *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL; - *min_bits = bits_per(*incr_keys + 1); - *min_keys = (1 << *min_bits) - *incr_keys; -} - -void uds_uninitialize_delta_index(struct delta_index *delta_index) -{ - unsigned int z; - - if (delta_index->delta_zones == NULL) - return; - - for (z = 0; z < delta_index->zone_count; z++) { - uds_free(uds_forget(delta_index->delta_zones[z].new_offsets)); - uds_free(uds_forget(delta_index->delta_zones[z].delta_lists)); - uds_free(uds_forget(delta_index->delta_zones[z].memory)); - } - - uds_free(delta_index->delta_zones); - memset(delta_index, 0, sizeof(struct delta_index)); -} - -static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, - u32 first_list, u32 list_count, u32 mean_delta, - u32 payload_bits, u8 tag) -{ - int result; - - result = uds_allocate(size, u8, "delta list", &delta_zone->memory); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(list_count + 2, u64, "delta list temp", - &delta_zone->new_offsets); - if (result != UDS_SUCCESS) - return result; - - /* Allocate the delta lists. 
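- * Two extra entries hold the head and tail guard lists (numbered 0 and list_count + 1)
- * described at the top of this file.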
*/ - result = uds_allocate(list_count + 2, struct delta_list, "delta lists", - &delta_zone->delta_lists); - if (result != UDS_SUCCESS) - return result; - - compute_coding_constants(mean_delta, &delta_zone->min_bits, - &delta_zone->min_keys, &delta_zone->incr_keys); - delta_zone->value_bits = payload_bits; - delta_zone->buffered_writer = NULL; - delta_zone->size = size; - delta_zone->rebalance_time = 0; - delta_zone->rebalance_count = 0; - delta_zone->record_count = 0; - delta_zone->collision_count = 0; - delta_zone->discard_count = 0; - delta_zone->overflow_count = 0; - delta_zone->first_list = first_list; - delta_zone->list_count = list_count; - delta_zone->tag = tag; - - return UDS_SUCCESS; -} - -int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count, - u32 list_count, u32 mean_delta, u32 payload_bits, - size_t memory_size, u8 tag) -{ - int result; - unsigned int z; - size_t zone_memory; - - result = uds_allocate(zone_count, struct delta_zone, "Delta Index Zones", - &delta_index->delta_zones); - if (result != UDS_SUCCESS) - return result; - - delta_index->zone_count = zone_count; - delta_index->list_count = list_count; - delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count); - delta_index->memory_size = 0; - delta_index->mutable = true; - delta_index->tag = tag; - - for (z = 0; z < zone_count; z++) { - u32 lists_in_zone = delta_index->lists_per_zone; - u32 first_list_in_zone = z * lists_in_zone; - - if (z == zone_count - 1) { - /* - * The last zone gets fewer lists if zone_count doesn't evenly divide - * list_count. We'll have an underflow if the assertion below doesn't hold. - */ - if (delta_index->list_count <= first_list_in_zone) { - uds_uninitialize_delta_index(delta_index); - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "%u delta lists not enough for %u zones", - list_count, zone_count); - } - lists_in_zone = delta_index->list_count - first_list_in_zone; - } - - zone_memory = get_zone_memory_size(zone_count, memory_size); - result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory, - first_list_in_zone, lists_in_zone, - mean_delta, payload_bits, tag); - if (result != UDS_SUCCESS) { - uds_uninitialize_delta_index(delta_index); - return result; - } - - delta_index->memory_size += - (sizeof(struct delta_zone) + zone_memory + - (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64))); - } - - uds_reset_delta_index(delta_index); - return UDS_SUCCESS; -} - -/* Read a bit field from an arbitrary bit boundary. */ -static inline u32 get_field(const u8 *memory, u64 offset, u8 size) -{ - const void *addr = memory + offset / BITS_PER_BYTE; - - return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1); -} - -/* Write a bit field to an arbitrary bit boundary. */ -static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size) -{ - void *addr = memory + offset / BITS_PER_BYTE; - int shift = offset % BITS_PER_BYTE; - u32 data = get_unaligned_le32(addr); - - data &= ~(((1 << size) - 1) << shift); - data |= value << shift; - put_unaligned_le32(data, addr); -} - -/* Get the bit offset to the immutable delta list header. */ -static inline u32 get_immutable_header_offset(u32 list_number) -{ - return sizeof(struct delta_page_header) * BITS_PER_BYTE + - list_number * IMMUTABLE_HEADER_SIZE; -} - -/* Get the bit offset to the start of the immutable delta list bit stream. 
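- * For list n this reads 19-bit entry n of the offset table that immediately follows the
- * delta_page_header; entry list_count holds the end offset of the last list on the page.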
*/ -static inline u32 get_immutable_start(const u8 *memory, u32 list_number) -{ - return get_field(memory, get_immutable_header_offset(list_number), - IMMUTABLE_HEADER_SIZE); -} - -/* Set the bit offset to the start of the immutable delta list bit stream. */ -static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start) -{ - set_field(start, memory, get_immutable_header_offset(list_number), - IMMUTABLE_HEADER_SIZE); -} - -static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce, - u8 *memory, size_t memory_size) -{ - unsigned int i; - - /* - * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the - * entire volume at least once. - */ - if (nonce != expected_nonce) - return false; - - /* Verify that the number of delta lists can fit in the page. */ - if (list_count > ((memory_size - sizeof(struct delta_page_header)) * - BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE)) - return false; - - /* - * Verify that the first delta list is immediately after the last delta - * list header. - */ - if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1)) - return false; - - /* Verify that the lists are in the correct order. */ - for (i = 0; i < list_count; i++) { - if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1)) - return false; - } - - /* - * Verify that the last list ends on the page, and that there is room - * for the post-field guard bits. - */ - if (get_immutable_start(memory, list_count) > - (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE) - return false; - - /* Verify that the guard bytes are correctly set to all ones. */ - for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { - if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0) - return false; - } - - /* All verifications passed. */ - return true; -} - -/* Initialize a delta index page to refer to a supplied page. */ -int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page, - u64 expected_nonce, u32 mean_delta, u32 payload_bits, - u8 *memory, size_t memory_size) -{ - u64 nonce; - u64 vcn; - u64 first_list; - u64 list_count; - struct delta_page_header *header = (struct delta_page_header *) memory; - struct delta_zone *delta_zone = &delta_index_page->delta_zone; - const u8 *nonce_addr = (const u8 *) &header->nonce; - const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number; - const u8 *first_list_addr = (const u8 *) &header->first_list; - const u8 *list_count_addr = (const u8 *) &header->list_count; - - /* First assume that the header is little endian. */ - nonce = get_unaligned_le64(nonce_addr); - vcn = get_unaligned_le64(vcn_addr); - first_list = get_unaligned_le16(first_list_addr); - list_count = get_unaligned_le16(list_count_addr); - if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, - memory_size)) { - /* If that fails, try big endian. */ - nonce = get_unaligned_be64(nonce_addr); - vcn = get_unaligned_be64(vcn_addr); - first_list = get_unaligned_be16(first_list_addr); - list_count = get_unaligned_be16(list_count_addr); - if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, - memory_size)) { - /* - * Both attempts failed. Do not log this as an error, because it can happen - * during a rebuild if we haven't written the entire volume at least once. 
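- * (The little-endian attempt above falls back to big endian because a valid page may
- * have been written on a platform of either byte order.)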
- */ - return UDS_CORRUPT_DATA; - } - } - - delta_index_page->delta_index.delta_zones = delta_zone; - delta_index_page->delta_index.zone_count = 1; - delta_index_page->delta_index.list_count = list_count; - delta_index_page->delta_index.lists_per_zone = list_count; - delta_index_page->delta_index.mutable = false; - delta_index_page->delta_index.tag = 'p'; - delta_index_page->virtual_chapter_number = vcn; - delta_index_page->lowest_list_number = first_list; - delta_index_page->highest_list_number = first_list + list_count - 1; - - compute_coding_constants(mean_delta, &delta_zone->min_bits, - &delta_zone->min_keys, &delta_zone->incr_keys); - delta_zone->value_bits = payload_bits; - delta_zone->memory = memory; - delta_zone->delta_lists = NULL; - delta_zone->new_offsets = NULL; - delta_zone->buffered_writer = NULL; - delta_zone->size = memory_size; - delta_zone->rebalance_time = 0; - delta_zone->rebalance_count = 0; - delta_zone->record_count = 0; - delta_zone->collision_count = 0; - delta_zone->discard_count = 0; - delta_zone->overflow_count = 0; - delta_zone->first_list = 0; - delta_zone->list_count = list_count; - delta_zone->tag = 'p'; - - return UDS_SUCCESS; -} - -/* Read a large bit field from an arbitrary bit boundary. */ -static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size) -{ - const void *addr = memory + offset / BITS_PER_BYTE; - - return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1); -} - -/* Write a large bit field to an arbitrary bit boundary. */ -static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size) -{ - void *addr = memory + offset / BITS_PER_BYTE; - u8 shift = offset % BITS_PER_BYTE; - u64 data = get_unaligned_le64(addr); - - data &= ~(((1UL << size) - 1) << shift); - data |= value << shift; - put_unaligned_le64(data, addr); -} - -/* Set a sequence of bits to all zeros. */ -static inline void set_zero(u8 *memory, u64 offset, u32 size) -{ - if (size > 0) { - u8 *addr = memory + offset / BITS_PER_BYTE; - u8 shift = offset % BITS_PER_BYTE; - u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size; - - *addr++ &= ~(((1 << count) - 1) << shift); - for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE) - *addr++ = 0; - - if (size > 0) - *addr &= 0xFF << size; - } -} - -/* - * Move several bits from a higher to a lower address, moving the lower addressed bits first. The - * size and memory offsets are measured in bits. - */ -static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) -{ - const u8 *source; - u8 *destination; - u8 offset; - u8 count; - u64 field; - - /* Start by moving one field that ends on a to int boundary. */ - count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32))); - field = get_big_field(from, from_offset, count); - set_big_field(field, to, to_offset, count); - from_offset += count; - to_offset += count; - size -= count; - - /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ - offset = from_offset % BITS_PER_TYPE(u32); - source = from + (from_offset - offset) / BITS_PER_BYTE; - destination = to + to_offset / BITS_PER_BYTE; - while (size > MAX_BIG_FIELD_BITS) { - put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); - source += sizeof(u32); - destination += sizeof(u32); - from_offset += BITS_PER_TYPE(u32); - to_offset += BITS_PER_TYPE(u32); - size -= BITS_PER_TYPE(u32); - } - - /* Finish up by moving any remaining bits. 
*/ - if (size > 0) { - field = get_big_field(from, from_offset, size); - set_big_field(field, to, to_offset, size); - } -} - -/* - * Move several bits from a lower to a higher address, moving the higher addressed bits first. The - * size and memory offsets are measured in bits. - */ -static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) -{ - const u8 *source; - u8 *destination; - u8 offset; - u8 count; - u64 field; - - /* Start by moving one field that begins on a destination int boundary. */ - count = (to_offset + size) % BITS_PER_TYPE(u32); - if (count > 0) { - size -= count; - field = get_big_field(from, from_offset + size, count); - set_big_field(field, to, to_offset + size, count); - } - - /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ - offset = (from_offset + size) % BITS_PER_TYPE(u32); - source = from + (from_offset + size - offset) / BITS_PER_BYTE; - destination = to + (to_offset + size) / BITS_PER_BYTE; - while (size > MAX_BIG_FIELD_BITS) { - source -= sizeof(u32); - destination -= sizeof(u32); - size -= BITS_PER_TYPE(u32); - put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); - } - - /* Finish up by moving any remaining bits. */ - if (size > 0) { - field = get_big_field(from, from_offset, size); - set_big_field(field, to, to_offset, size); - } -} - -/* - * Move bits from one field to another. When the fields overlap, behave as if we first move all the - * bits from the source to a temporary value, and then move all the bits from the temporary value - * to the destination. The size and memory offsets are measured in bits. - */ -static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) -{ - u64 field; - - /* A small move doesn't require special handling. */ - if (size <= MAX_BIG_FIELD_BITS) { - if (size > 0) { - field = get_big_field(from, from_offset, size); - set_big_field(field, to, to_offset, size); - } - - return; - } - - if (from_offset > to_offset) - move_bits_down(from, from_offset, to, to_offset, size); - else - move_bits_up(from, from_offset, to, to_offset, size); -} - -/* - * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta - * lists (starting with a specified list index) is copied from the mutable delta index into a - * memory page used in the immutable index. The number of lists copied onto the page is returned in - * list_count. - */ -int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce, - u8 *memory, size_t memory_size, u64 virtual_chapter_number, - u32 first_list, u32 *list_count) -{ - const struct delta_zone *delta_zone; - struct delta_list *delta_lists; - u32 max_lists; - u32 n_lists = 0; - u32 offset; - u32 i; - int free_bits; - int bits; - struct delta_page_header *header; - - delta_zone = &delta_index->delta_zones[0]; - delta_lists = &delta_zone->delta_lists[first_list + 1]; - max_lists = delta_index->list_count - first_list; - - /* - * Compute how many lists will fit on the page. Subtract the size of the fixed header, one - * delta list offset, and the guard bytes from the page size to determine how much space is - * available for delta lists. - */ - free_bits = memory_size * BITS_PER_BYTE; - free_bits -= get_immutable_header_offset(1); - free_bits -= GUARD_BITS; - if (free_bits < IMMUTABLE_HEADER_SIZE) { - /* This page is too small to store any delta lists. 
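- * (With the 20-byte packed header, one 19-bit offset entry, 56 guard bits, and one more
- * 19-bit entry required for a single list, any page smaller than 32 bytes lands here.)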
*/ - return uds_log_error_strerror(UDS_OVERFLOW, - "Chapter Index Page of %zu bytes is too small", - memory_size); - } - - while (n_lists < max_lists) { - /* Each list requires a delta list offset and the list data. */ - bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size; - if (bits > free_bits) - break; - - n_lists++; - free_bits -= bits; - } - - *list_count = n_lists; - - header = (struct delta_page_header *) memory; - put_unaligned_le64(header_nonce, (u8 *) &header->nonce); - put_unaligned_le64(virtual_chapter_number, - (u8 *) &header->virtual_chapter_number); - put_unaligned_le16(first_list, (u8 *) &header->first_list); - put_unaligned_le16(n_lists, (u8 *) &header->list_count); - - /* Construct the delta list offset table. */ - offset = get_immutable_header_offset(n_lists + 1); - set_immutable_start(memory, 0, offset); - for (i = 0; i < n_lists; i++) { - offset += delta_lists[i].size; - set_immutable_start(memory, i + 1, offset); - } - - /* Copy the delta list data onto the memory page. */ - for (i = 0; i < n_lists; i++) { - move_bits(delta_zone->memory, delta_lists[i].start, memory, - get_immutable_start(memory, i), delta_lists[i].size); - } - - /* Set all the bits in the guard bytes. */ - memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0, - POST_FIELD_GUARD_BYTES); - return UDS_SUCCESS; -} - -/* Compute the new offsets of the delta lists. */ -static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index, - size_t growing_size, size_t used_space) -{ - size_t spacing; - u32 i; - struct delta_list *delta_lists = delta_zone->delta_lists; - u32 tail_guard_index = delta_zone->list_count + 1; - - spacing = (delta_zone->size - used_space) / delta_zone->list_count; - delta_zone->new_offsets[0] = 0; - for (i = 0; i <= delta_zone->list_count; i++) { - delta_zone->new_offsets[i + 1] = - (delta_zone->new_offsets[i] + - get_delta_list_byte_size(&delta_lists[i]) + spacing); - delta_zone->new_offsets[i] *= BITS_PER_BYTE; - delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE; - if (i == 0) - delta_zone->new_offsets[i + 1] -= spacing / 2; - if (i + 1 == growing_index) - delta_zone->new_offsets[i + 1] += growing_size; - } - - delta_zone->new_offsets[tail_guard_index] = - (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size); -} - -static void rebalance_lists(struct delta_zone *delta_zone) -{ - struct delta_list *delta_lists; - u32 i; - size_t used_space = 0; - - /* Extend and balance memory to receive the delta lists */ - delta_lists = delta_zone->delta_lists; - for (i = 0; i <= delta_zone->list_count + 1; i++) - used_space += get_delta_list_byte_size(&delta_lists[i]); - - compute_new_list_offsets(delta_zone, 0, 0, used_space); - for (i = 1; i <= delta_zone->list_count + 1; i++) - delta_lists[i].start = delta_zone->new_offsets[i]; -} - -/* Start restoring a delta index from multiple input streams. */ -int uds_start_restoring_delta_index(struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - int result; - unsigned int zone_count = reader_count; - u64 record_count = 0; - u64 collision_count = 0; - u32 first_list[MAX_ZONES]; - u32 list_count[MAX_ZONES]; - unsigned int z; - u32 list_next = 0; - const struct delta_zone *delta_zone; - - /* Read and validate each header. 
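-	 * Judging from the decode sequence below, each save stream begins
-	 * with a fixed little-endian header:
-	 *
-	 *   magic[MAGIC_SIZE] | zone_number (u32) | zone_count (u32) |
-	 *   first_list (u32) | list_count (u32) |
-	 *   record_count (u64) | collision_count (u64)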
*/ - for (z = 0; z < zone_count; z++) { - struct delta_index_header header; - u8 buffer[sizeof(struct delta_index_header)]; - size_t offset = 0; - - result = uds_read_from_buffered_reader(buffered_readers[z], buffer, - sizeof(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta index header"); - } - - memcpy(&header.magic, buffer, MAGIC_SIZE); - offset += MAGIC_SIZE; - decode_u32_le(buffer, &offset, &header.zone_number); - decode_u32_le(buffer, &offset, &header.zone_count); - decode_u32_le(buffer, &offset, &header.first_list); - decode_u32_le(buffer, &offset, &header.list_count); - decode_u64_le(buffer, &offset, &header.record_count); - decode_u64_le(buffer, &offset, &header.collision_count); - - result = ASSERT(offset == sizeof(struct delta_index_header), - "%zu bytes decoded of %zu expected", offset, - sizeof(struct delta_index_header)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta index header"); - } - - if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index file has bad magic number"); - } - - if (zone_count != header.zone_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index files contain mismatched zone counts (%u,%u)", - zone_count, header.zone_count); - } - - if (header.zone_number != z) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index zone %u found in slot %u", - header.zone_number, z); - } - - first_list[z] = header.first_list; - list_count[z] = header.list_count; - record_count += header.record_count; - collision_count += header.collision_count; - - if (first_list[z] != list_next) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index file for zone %u starts with list %u instead of list %u", - z, first_list[z], list_next); - } - - list_next += list_count[z]; - } - - if (list_next != delta_index->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index files contain %u delta lists instead of %u delta lists", - list_next, delta_index->list_count); - } - - if (collision_count > record_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "delta index files contain %llu collisions and %llu records", - (unsigned long long) collision_count, - (unsigned long long) record_count); - } - - uds_reset_delta_index(delta_index); - delta_index->delta_zones[0].record_count = record_count; - delta_index->delta_zones[0].collision_count = collision_count; - - /* Read the delta lists and distribute them to the proper zones. */ - for (z = 0; z < zone_count; z++) { - u32 i; - - delta_index->load_lists[z] = 0; - for (i = 0; i < list_count[z]; i++) { - u16 delta_list_size; - u32 list_number; - unsigned int zone_number; - u8 size_data[sizeof(u16)]; - - result = uds_read_from_buffered_reader(buffered_readers[z], - size_data, - sizeof(size_data)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta index size"); - } - - delta_list_size = get_unaligned_le16(size_data); - if (delta_list_size > 0) - delta_index->load_lists[z] += 1; - - list_number = first_list[z] + i; - zone_number = list_number / delta_index->lists_per_zone; - delta_zone = &delta_index->delta_zones[zone_number]; - list_number -= delta_zone->first_list; - delta_zone->delta_lists[list_number + 1].size = delta_list_size; - } - } - - /* Prepare each zone to start receiving the delta list data. 
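-	 * Rebalancing spreads the restored list sizes across each zone's
-	 * memory with even padding between them, so the list data read next
-	 * can be copied straight into place.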
*/ - for (z = 0; z < delta_index->zone_count; z++) - rebalance_lists(&delta_index->delta_zones[z]); - - return UDS_SUCCESS; -} - -static int restore_delta_list_to_zone(struct delta_zone *delta_zone, - const struct delta_list_save_info *save_info, - const u8 *data) -{ - struct delta_list *delta_list; - u16 bit_count; - u16 byte_count; - u32 list_number = save_info->index - delta_zone->first_list; - - if (list_number >= delta_zone->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "invalid delta list number %u not in range [%u,%u)", - save_info->index, delta_zone->first_list, - delta_zone->first_list + delta_zone->list_count); - } - - delta_list = &delta_zone->delta_lists[list_number + 1]; - if (delta_list->size == 0) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "unexpected delta list number %u", - save_info->index); - } - - bit_count = delta_list->size + save_info->bit_offset; - byte_count = BITS_TO_BYTES(bit_count); - if (save_info->byte_count != byte_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "unexpected delta list size %u != %u", - save_info->byte_count, byte_count); - } - - move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start, - delta_list->size); - return UDS_SUCCESS; -} - -static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone, - struct buffered_reader *buffered_reader, u8 *data) -{ - int result; - struct delta_list_save_info save_info; - u8 buffer[sizeof(struct delta_list_save_info)]; - unsigned int new_zone; - - result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta list data"); - } - - save_info = (struct delta_list_save_info) { - .tag = buffer[0], - .bit_offset = buffer[1], - .byte_count = get_unaligned_le16(&buffer[2]), - .index = get_unaligned_le32(&buffer[4]), - }; - - if ((save_info.bit_offset >= BITS_PER_BYTE) || - (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "corrupt delta list data"); - } - - /* Make sure the data is intended for this delta index. */ - if (save_info.tag != delta_index->tag) - return UDS_CORRUPT_DATA; - - if (save_info.index >= delta_index->list_count) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "invalid delta list number %u of %u", - save_info.index, - delta_index->list_count); - } - - result = uds_read_from_buffered_reader(buffered_reader, data, - save_info.byte_count); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta list data"); - } - - delta_index->load_lists[load_zone] -= 1; - new_zone = save_info.index / delta_index->lists_per_zone; - return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone], - &save_info, data); -} - -/* Restore delta lists from saved data. 
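- * (A usage sketch: a full restore is uds_start_restoring_delta_index()
- * followed by this function, optionally followed by
- * uds_check_guard_delta_lists() to verify the guard records that
- * terminate each stream.)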
*/ -int uds_finish_restoring_delta_index(struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - int result; - int saved_result = UDS_SUCCESS; - unsigned int z; - u8 *data; - - result = uds_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); - if (result != UDS_SUCCESS) - return result; - - for (z = 0; z < reader_count; z++) { - while (delta_index->load_lists[z] > 0) { - result = restore_delta_list_data(delta_index, z, - buffered_readers[z], data); - if (result != UDS_SUCCESS) { - saved_result = result; - break; - } - } - } - - uds_free(data); - return saved_result; -} - -int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - int result; - unsigned int z; - u8 buffer[sizeof(struct delta_list_save_info)]; - - for (z = 0; z < reader_count; z++) { - result = uds_read_from_buffered_reader(buffered_readers[z], buffer, - sizeof(buffer)); - if (result != UDS_SUCCESS) - return result; - - if (buffer[0] != 'z') - return UDS_CORRUPT_DATA; - } - - return UDS_SUCCESS; -} - -static int flush_delta_list(struct delta_zone *zone, u32 flush_index) -{ - struct delta_list *delta_list; - u8 buffer[sizeof(struct delta_list_save_info)]; - int result; - - delta_list = &zone->delta_lists[flush_index + 1]; - - buffer[0] = zone->tag; - buffer[1] = delta_list->start % BITS_PER_BYTE; - put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]); - put_unaligned_le32(zone->first_list + flush_index, &buffer[4]); - - result = uds_write_to_buffered_writer(zone->buffered_writer, buffer, - sizeof(buffer)); - if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, "failed to write delta list memory"); - return result; - } - - result = uds_write_to_buffered_writer(zone->buffered_writer, - zone->memory + get_delta_list_byte_start(delta_list), - get_delta_list_byte_size(delta_list)); - if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "failed to write delta list memory"); - - return result; -} - -/* Start saving a delta index zone to a buffered output stream. 
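- * Judging from the writes below and in flush_delta_list(), each zone's
- * save stream is laid out as:
- *
- *   delta_index_header
- *   one u16 size per delta list in the zone
- *   for each non-empty list: a delta_list_save_info, then the list bytes
- *
- * with a guard delta_list_save_info (tag 'z') written separately by
- * uds_write_guard_delta_list() to terminate the save.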
*/ -int uds_start_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number, - struct buffered_writer *buffered_writer) -{ - int result; - u32 i; - struct delta_zone *delta_zone; - u8 buffer[sizeof(struct delta_index_header)]; - size_t offset = 0; - - delta_zone = &delta_index->delta_zones[zone_number]; - memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE); - offset += MAGIC_SIZE; - encode_u32_le(buffer, &offset, zone_number); - encode_u32_le(buffer, &offset, delta_index->zone_count); - encode_u32_le(buffer, &offset, delta_zone->first_list); - encode_u32_le(buffer, &offset, delta_zone->list_count); - encode_u64_le(buffer, &offset, delta_zone->record_count); - encode_u64_le(buffer, &offset, delta_zone->collision_count); - - result = ASSERT(offset == sizeof(struct delta_index_header), - "%zu bytes encoded of %zu expected", offset, - sizeof(struct delta_index_header)); - if (result != UDS_SUCCESS) - return result; - - result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); - if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, - "failed to write delta index header"); - - for (i = 0; i < delta_zone->list_count; i++) { - u8 data[sizeof(u16)]; - struct delta_list *delta_list; - - delta_list = &delta_zone->delta_lists[i + 1]; - put_unaligned_le16(delta_list->size, data); - result = uds_write_to_buffered_writer(buffered_writer, data, - sizeof(data)); - if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, - "failed to write delta list size"); - } - - delta_zone->buffered_writer = buffered_writer; - return UDS_SUCCESS; -} - -int uds_finish_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number) -{ - int result; - int first_error = UDS_SUCCESS; - u32 i; - struct delta_zone *delta_zone; - struct delta_list *delta_list; - - delta_zone = &delta_index->delta_zones[zone_number]; - for (i = 0; i < delta_zone->list_count; i++) { - delta_list = &delta_zone->delta_lists[i + 1]; - if (delta_list->size > 0) { - result = flush_delta_list(delta_zone, i); - if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS)) - first_error = result; - } - } - - delta_zone->buffered_writer = NULL; - return first_error; -} - -int uds_write_guard_delta_list(struct buffered_writer *buffered_writer) -{ - int result; - u8 buffer[sizeof(struct delta_list_save_info)]; - - memset(buffer, 0, sizeof(struct delta_list_save_info)); - buffer[0] = 'z'; - - result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "failed to write guard delta list"); - - return UDS_SUCCESS; -} - -size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size) -{ - /* One zone will use at least as much memory as other zone counts. */ - return (sizeof(struct delta_index_header) + - list_count * (sizeof(struct delta_list_save_info) + 1) + - get_zone_memory_size(1, memory_size)); -} - -static int assert_not_at_end(const struct delta_index_entry *delta_entry) -{ - int result = ASSERT(!delta_entry->at_end, - "operation is invalid because the list entry is at the end of the delta list"); - if (result != UDS_SUCCESS) - result = UDS_BAD_STATE; - - return result; -} - -/* - * Prepare to search for an entry in the specified delta list. - * - * This is always the first function to be called when dealing with delta index entries. It is - * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. 
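- * (uds_get_delta_index_entry() below is the canonical caller: it starts a
- * search and advances until at_end is set or the wanted key is reached.)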
The - * fields of the delta_index_entry argument will be set up for iteration, but will not contain an - * entry from the list. - */ -int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number, - u32 key, struct delta_index_entry *delta_entry) -{ - int result; - unsigned int zone_number; - struct delta_zone *delta_zone; - struct delta_list *delta_list; - - result = ASSERT((list_number < delta_index->list_count), - "Delta list number (%u) is out of range (%u)", list_number, - delta_index->list_count); - if (result != UDS_SUCCESS) - return UDS_CORRUPT_DATA; - - zone_number = list_number / delta_index->lists_per_zone; - delta_zone = &delta_index->delta_zones[zone_number]; - list_number -= delta_zone->first_list; - result = ASSERT((list_number < delta_zone->list_count), - "Delta list number (%u) is out of range (%u) for zone (%u)", - list_number, delta_zone->list_count, zone_number); - if (result != UDS_SUCCESS) - return UDS_CORRUPT_DATA; - - if (delta_index->mutable) { - delta_list = &delta_zone->delta_lists[list_number + 1]; - } else { - u32 end_offset; - - /* - * Translate the immutable delta list header into a temporary - * full delta list header. - */ - delta_list = &delta_entry->temp_delta_list; - delta_list->start = get_immutable_start(delta_zone->memory, list_number); - end_offset = get_immutable_start(delta_zone->memory, list_number + 1); - delta_list->size = end_offset - delta_list->start; - delta_list->save_key = 0; - delta_list->save_offset = 0; - } - - if (key > delta_list->save_key) { - delta_entry->key = delta_list->save_key; - delta_entry->offset = delta_list->save_offset; - } else { - delta_entry->key = 0; - delta_entry->offset = 0; - if (key == 0) { - /* - * This usually means we're about to walk the entire delta list, so get all - * of it into the CPU cache. - */ - uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE], - delta_list->size / BITS_PER_BYTE, false); - } - } - - delta_entry->at_end = false; - delta_entry->delta_zone = delta_zone; - delta_entry->delta_list = delta_list; - delta_entry->entry_bits = 0; - delta_entry->is_collision = false; - delta_entry->list_number = list_number; - delta_entry->list_overflow = false; - delta_entry->value_bits = delta_zone->value_bits; - return UDS_SUCCESS; -} - -static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry) -{ - return delta_entry->delta_list->start + delta_entry->offset; -} - -/* - * Decode a delta index entry delta value. The delta_index_entry basically describes the previous - * list entry, and has had its offset field changed to point to the subsequent entry. We decode the - * bit stream and update the delta_list_entry to describe the entry. 
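- * The encoding read here (and produced by encode_delta()) is value_bits of
- * payload followed by a key field of min_bits. Key fields below min_keys
- * stand alone; larger deltas append a unary extension in which each zero
- * bit before the terminating one bit adds incr_keys to the decoded delta.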
- */ -static inline void decode_delta(struct delta_index_entry *delta_entry) -{ - int key_bits; - u32 delta; - const struct delta_zone *delta_zone = delta_entry->delta_zone; - const u8 *memory = delta_zone->memory; - u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; - const u8 *addr = memory + delta_offset / BITS_PER_BYTE; - int offset = delta_offset % BITS_PER_BYTE; - u32 data = get_unaligned_le32(addr) >> offset; - - addr += sizeof(u32); - key_bits = delta_zone->min_bits; - delta = data & ((1 << key_bits) - 1); - if (delta >= delta_zone->min_keys) { - data >>= key_bits; - if (data == 0) { - key_bits = sizeof(u32) * BITS_PER_BYTE - offset; - while ((data = get_unaligned_le32(addr)) == 0) { - addr += sizeof(u32); - key_bits += sizeof(u32) * BITS_PER_BYTE; - } - } - key_bits += ffs(data); - delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys); - } - delta_entry->delta = delta; - delta_entry->key += delta; - - /* Check for a collision, a delta of zero after the start. */ - if (unlikely((delta == 0) && (delta_entry->offset > 0))) { - delta_entry->is_collision = true; - delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS; - } else { - delta_entry->is_collision = false; - delta_entry->entry_bits = delta_entry->value_bits + key_bits; - } -} - -noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry) -{ - int result; - const struct delta_list *delta_list; - u32 next_offset; - u16 size; - - result = assert_not_at_end(delta_entry); - if (result != UDS_SUCCESS) - return result; - - delta_list = delta_entry->delta_list; - delta_entry->offset += delta_entry->entry_bits; - size = delta_list->size; - if (unlikely(delta_entry->offset >= size)) { - delta_entry->at_end = true; - delta_entry->delta = 0; - delta_entry->is_collision = false; - result = ASSERT((delta_entry->offset == size), - "next offset past end of delta list"); - if (result != UDS_SUCCESS) - result = UDS_CORRUPT_DATA; - - return result; - } - - decode_delta(delta_entry); - - next_offset = delta_entry->offset + delta_entry->entry_bits; - if (next_offset > size) { - /* - * This is not an assertion because uds_validate_chapter_index_page() wants to - * handle this error. 
- */ - uds_log_warning("Decoded past the end of the delta list"); - return UDS_CORRUPT_DATA; - } - - return UDS_SUCCESS; -} - -int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry) -{ - int result; - struct delta_list *delta_list = delta_entry->delta_list; - - result = ASSERT(!delta_entry->is_collision, "entry is not a collision"); - if (result != UDS_SUCCESS) - return result; - - delta_list->save_key = delta_entry->key - delta_entry->delta; - delta_list->save_offset = delta_entry->offset; - return UDS_SUCCESS; -} - -static void set_delta(struct delta_index_entry *delta_entry, u32 delta) -{ - const struct delta_zone *delta_zone = delta_entry->delta_zone; - u32 key_bits = (delta_zone->min_bits + - ((delta_zone->incr_keys - delta_zone->min_keys + delta) / - delta_zone->incr_keys)); - - delta_entry->delta = delta; - delta_entry->entry_bits = delta_entry->value_bits + key_bits; -} - -static void get_collision_name(const struct delta_index_entry *entry, u8 *name) -{ - u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; - const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; - int size = COLLISION_BYTES; - int shift = offset % BITS_PER_BYTE; - - while (--size >= 0) - *name++ = get_unaligned_le16(addr++) >> shift; -} - -static void set_collision_name(const struct delta_index_entry *entry, const u8 *name) -{ - u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; - u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; - int size = COLLISION_BYTES; - int shift = offset % BITS_PER_BYTE; - u16 mask = ~((u16) 0xFF << shift); - u16 data; - - while (--size >= 0) { - data = (get_unaligned_le16(addr) & mask) | (*name++ << shift); - put_unaligned_le16(data, addr++); - } -} - -int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number, - u32 key, const u8 *name, - struct delta_index_entry *delta_entry) -{ - int result; - - result = uds_start_delta_index_search(delta_index, list_number, key, - delta_entry); - if (result != UDS_SUCCESS) - return result; - - do { - result = uds_next_delta_index_entry(delta_entry); - if (result != UDS_SUCCESS) - return result; - } while (!delta_entry->at_end && (key > delta_entry->key)); - - result = uds_remember_delta_index_offset(delta_entry); - if (result != UDS_SUCCESS) - return result; - - if (!delta_entry->at_end && (key == delta_entry->key)) { - struct delta_index_entry collision_entry = *delta_entry; - - for (;;) { - u8 full_name[COLLISION_BYTES]; - - result = uds_next_delta_index_entry(&collision_entry); - if (result != UDS_SUCCESS) - return result; - - if (collision_entry.at_end || !collision_entry.is_collision) - break; - - get_collision_name(&collision_entry, full_name); - if (memcmp(full_name, name, COLLISION_BYTES) == 0) { - *delta_entry = collision_entry; - break; - } - } - } - - return UDS_SUCCESS; -} - -int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name) -{ - int result; - - result = assert_not_at_end(delta_entry); - if (result != UDS_SUCCESS) - return result; - - result = ASSERT(delta_entry->is_collision, - "Cannot get full block name from a non-collision delta index entry"); - if (result != UDS_SUCCESS) - return UDS_BAD_STATE; - - get_collision_name(delta_entry, name); - return UDS_SUCCESS; -} - -u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry) -{ - return get_field(delta_entry->delta_zone->memory, - get_delta_entry_offset(delta_entry), delta_entry->value_bits); -} - 
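-/*
- * A minimal lookup sketch, assuming the caller supplies index, list_number,
- * key, and name (hypothetical locals, not part of this file):
- *
- *	struct delta_index_entry entry;
- *	u32 value = 0;
- *	int result;
- *
- *	result = uds_get_delta_index_entry(index, list_number, key, name,
- *					   &entry);
- *	if ((result == UDS_SUCCESS) && !entry.at_end && (entry.key == key))
- *		value = uds_get_delta_entry_value(&entry);
- */
-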
-static int assert_mutable_entry(const struct delta_index_entry *delta_entry) -{ - int result = ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list), - "delta index is mutable"); - if (result != UDS_SUCCESS) - result = UDS_BAD_STATE; - - return result; -} - -int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value) -{ - int result; - u32 value_mask = (1 << delta_entry->value_bits) - 1; - - result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) - return result; - - result = assert_not_at_end(delta_entry); - if (result != UDS_SUCCESS) - return result; - - result = ASSERT((value & value_mask) == value, - "Value (%u) being set in a delta index is too large (must fit in %u bits)", - value, delta_entry->value_bits); - if (result != UDS_SUCCESS) - return UDS_INVALID_ARGUMENT; - - set_field(value, delta_entry->delta_zone->memory, - get_delta_entry_offset(delta_entry), delta_entry->value_bits); - return UDS_SUCCESS; -} - -/* - * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated - * by growing_index, then rebalancing the lists in the new chunk. - */ -static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index, - size_t growing_size) -{ - ktime_t start_time; - ktime_t end_time; - struct delta_list *delta_lists; - u32 i; - size_t used_space; - - - /* Calculate the amount of space that is or will be in use. */ - start_time = current_time_ns(CLOCK_MONOTONIC); - delta_lists = delta_zone->delta_lists; - used_space = growing_size; - for (i = 0; i <= delta_zone->list_count + 1; i++) - used_space += get_delta_list_byte_size(&delta_lists[i]); - - if (delta_zone->size < used_space) - return UDS_OVERFLOW; - - /* Compute the new offsets of the delta lists. */ - compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space); - - /* - * When we rebalance the delta list, we will include the end guard list in the rebalancing. - * It contains the end guard data, which must be copied. - */ - rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1); - end_time = current_time_ns(CLOCK_MONOTONIC); - delta_zone->rebalance_count++; - delta_zone->rebalance_time += ktime_sub(end_time, start_time); - return UDS_SUCCESS; -} - -static int insert_bits(struct delta_index_entry *delta_entry, u16 size) -{ - u64 free_before; - u64 free_after; - u64 source; - u64 destination; - u32 count; - bool before_flag; - u8 *memory; - struct delta_zone *delta_zone = delta_entry->delta_zone; - struct delta_list *delta_list = delta_entry->delta_list; - /* Compute bits in use before and after the inserted bits. */ - u32 total_size = delta_list->size; - u32 before_size = delta_entry->offset; - u32 after_size = total_size - delta_entry->offset; - - if (total_size + size > U16_MAX) { - delta_entry->list_overflow = true; - delta_zone->overflow_count++; - return UDS_OVERFLOW; - } - - /* Compute bits available before and after the delta list. */ - free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); - free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); - - if ((size <= free_before) && (size <= free_after)) { - /* - * We have enough space to use either before or after the list. Select the smaller - * amount of data. If it is exactly the same, try to take from the larger amount of - * free space. 
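-		 * When before_flag ends up true the list grows downward: its
-		 * start moves back by size bits and the bits before the entry
-		 * are shifted down. Otherwise it grows upward into the free
-		 * space after it.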
- */ - if (before_size < after_size) - before_flag = true; - else if (after_size < before_size) - before_flag = false; - else - before_flag = free_before > free_after; - } else if (size <= free_before) { - /* There is space before but not after. */ - before_flag = true; - } else if (size <= free_after) { - /* There is space after but not before. */ - before_flag = false; - } else { - /* - * Neither of the surrounding spaces is large enough for this request. Extend - * and/or rebalance the delta list memory choosing to move the least amount of - * data. - */ - int result; - u32 growing_index = delta_entry->list_number + 1; - - before_flag = before_size < after_size; - if (!before_flag) - growing_index++; - result = extend_delta_zone(delta_zone, growing_index, - BITS_TO_BYTES(size)); - if (result != UDS_SUCCESS) - return result; - } - - delta_list->size += size; - if (before_flag) { - source = delta_list->start; - destination = source - size; - delta_list->start -= size; - count = before_size; - } else { - source = delta_list->start + delta_entry->offset; - destination = source + size; - count = after_size; - } - - memory = delta_zone->memory; - move_bits(memory, source, memory, destination, count); - return UDS_SUCCESS; -} - -static void encode_delta(const struct delta_index_entry *delta_entry) -{ - u32 temp; - u32 t1; - u32 t2; - u64 offset; - const struct delta_zone *delta_zone = delta_entry->delta_zone; - u8 *memory = delta_zone->memory; - - offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; - if (delta_entry->delta < delta_zone->min_keys) { - set_field(delta_entry->delta, memory, offset, delta_zone->min_bits); - return; - } - - temp = delta_entry->delta - delta_zone->min_keys; - t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys; - t2 = temp / delta_zone->incr_keys; - set_field(t1, memory, offset, delta_zone->min_bits); - set_zero(memory, offset + delta_zone->min_bits, t2); - set_field(1, memory, offset + delta_zone->min_bits + t2, 1); -} - -static void encode_entry(const struct delta_index_entry *delta_entry, u32 value, - const u8 *name) -{ - u8 *memory = delta_entry->delta_zone->memory; - u64 offset = get_delta_entry_offset(delta_entry); - - set_field(value, memory, offset, delta_entry->value_bits); - encode_delta(delta_entry); - if (name != NULL) - set_collision_name(delta_entry, name); -} - -/* - * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must - * be provided. - */ -int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value, - const u8 *name) -{ - int result; - struct delta_zone *delta_zone; - - result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) - return result; - - if (delta_entry->is_collision) { - /* - * The caller wants us to insert a collision entry onto a collision entry. This - * happens when we find a collision and attempt to add the name again to the index. - * This is normally a fatal error unless we are replaying a closed chapter while we - * are rebuilding a volume index. - */ - return UDS_DUPLICATE_NAME; - } - - if (delta_entry->offset < delta_entry->delta_list->save_offset) { - /* - * The saved entry offset is after the new entry and will no longer be valid, so - * replace it with the insertion point. - */ - result = uds_remember_delta_index_offset(delta_entry); - if (result != UDS_SUCCESS) - return result; - } - - if (name != NULL) { - /* Insert a collision entry which is placed after this entry. 
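-		 * A collision entry is encoded as a delta of zero followed by
-		 * the full COLLISION_BYTES name, so entry_bits grows by
-		 * COLLISION_BITS.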
*/ - result = assert_not_at_end(delta_entry); - if (result != UDS_SUCCESS) - return result; - - result = ASSERT((key == delta_entry->key), - "incorrect key for collision entry"); - if (result != UDS_SUCCESS) - return result; - - delta_entry->offset += delta_entry->entry_bits; - set_delta(delta_entry, 0); - delta_entry->is_collision = true; - delta_entry->entry_bits += COLLISION_BITS; - result = insert_bits(delta_entry, delta_entry->entry_bits); - } else if (delta_entry->at_end) { - /* Insert a new entry at the end of the delta list. */ - result = ASSERT((key >= delta_entry->key), "key past end of list"); - if (result != UDS_SUCCESS) - return result; - - set_delta(delta_entry, key - delta_entry->key); - delta_entry->key = key; - delta_entry->at_end = false; - result = insert_bits(delta_entry, delta_entry->entry_bits); - } else { - u16 old_entry_size; - u16 additional_size; - struct delta_index_entry next_entry; - u32 next_value; - - /* - * Insert a new entry which requires the delta in the following entry to be - * updated. - */ - result = ASSERT((key < delta_entry->key), - "key precedes following entry"); - if (result != UDS_SUCCESS) - return result; - - result = ASSERT((key >= delta_entry->key - delta_entry->delta), - "key effects following entry's delta"); - if (result != UDS_SUCCESS) - return result; - - old_entry_size = delta_entry->entry_bits; - next_entry = *delta_entry; - next_value = uds_get_delta_entry_value(&next_entry); - set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta)); - delta_entry->key = key; - set_delta(&next_entry, next_entry.key - key); - next_entry.offset += delta_entry->entry_bits; - /* The two new entries are always bigger than the single entry being replaced. */ - additional_size = (delta_entry->entry_bits + - next_entry.entry_bits - old_entry_size); - result = insert_bits(delta_entry, additional_size); - if (result != UDS_SUCCESS) - return result; - - encode_entry(&next_entry, next_value, NULL); - } - - if (result != UDS_SUCCESS) - return result; - - encode_entry(delta_entry, value, name); - delta_zone = delta_entry->delta_zone; - delta_zone->record_count++; - delta_zone->collision_count += delta_entry->is_collision ? 1 : 0; - return UDS_SUCCESS; -} - -static void delete_bits(const struct delta_index_entry *delta_entry, int size) -{ - u64 source; - u64 destination; - u32 count; - bool before_flag; - struct delta_list *delta_list = delta_entry->delta_list; - u8 *memory = delta_entry->delta_zone->memory; - /* Compute bits retained before and after the deleted bits. */ - u32 total_size = delta_list->size; - u32 before_size = delta_entry->offset; - u32 after_size = total_size - delta_entry->offset - size; - - /* - * Determine whether to add to the available space either before or after the delta list. - * We prefer to move the least amount of data. If it is exactly the same, try to add to the - * smaller amount of free space. 
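-	 * When before_flag ends up true, the bits before the entry slide up
-	 * and the freed space joins the gap before the list; otherwise the
-	 * tail slides down and the space joins the gap after the list.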
- */ - if (before_size < after_size) { - before_flag = true; - } else if (after_size < before_size) { - before_flag = false; - } else { - u64 free_before = - (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); - u64 free_after = - (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); - - before_flag = (free_before < free_after); - } - - delta_list->size -= size; - if (before_flag) { - source = delta_list->start; - destination = source + size; - delta_list->start += size; - count = before_size; - } else { - destination = delta_list->start + delta_entry->offset; - source = destination + size; - count = after_size; - } - - move_bits(memory, source, memory, destination, count); -} - -int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry) -{ - int result; - struct delta_index_entry next_entry; - struct delta_zone *delta_zone; - struct delta_list *delta_list; - - result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) - return result; - - next_entry = *delta_entry; - result = uds_next_delta_index_entry(&next_entry); - if (result != UDS_SUCCESS) - return result; - - delta_zone = delta_entry->delta_zone; - - if (delta_entry->is_collision) { - /* This is a collision entry, so just remove it. */ - delete_bits(delta_entry, delta_entry->entry_bits); - next_entry.offset = delta_entry->offset; - delta_zone->collision_count -= 1; - } else if (next_entry.at_end) { - /* This entry is at the end of the list, so just remove it. */ - delete_bits(delta_entry, delta_entry->entry_bits); - next_entry.key -= delta_entry->delta; - next_entry.offset = delta_entry->offset; - } else { - /* The delta in the next entry needs to be updated. */ - u32 next_value = uds_get_delta_entry_value(&next_entry); - u16 old_size = delta_entry->entry_bits + next_entry.entry_bits; - - if (next_entry.is_collision) { - next_entry.is_collision = false; - delta_zone->collision_count -= 1; - } - - set_delta(&next_entry, delta_entry->delta + next_entry.delta); - next_entry.offset = delta_entry->offset; - /* The one new entry is always smaller than the two entries being replaced. */ - delete_bits(delta_entry, old_size - next_entry.entry_bits); - encode_entry(&next_entry, next_value, NULL); - } - - delta_zone->record_count--; - delta_zone->discard_count++; - *delta_entry = next_entry; - - delta_list = delta_entry->delta_list; - if (delta_entry->offset < delta_list->save_offset) { - /* The saved entry offset is no longer valid. 
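-		 * Clearing save_key and save_offset forces the next search of
-		 * this list to begin again from the head.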
*/ - delta_list->save_key = 0; - delta_list->save_offset = 0; - } - - return UDS_SUCCESS; -} - -void uds_get_delta_index_stats(const struct delta_index *delta_index, - struct delta_index_stats *stats) -{ - unsigned int z; - const struct delta_zone *delta_zone; - - memset(stats, 0, sizeof(struct delta_index_stats)); - for (z = 0; z < delta_index->zone_count; z++) { - delta_zone = &delta_index->delta_zones[z]; - stats->rebalance_time += delta_zone->rebalance_time; - stats->rebalance_count += delta_zone->rebalance_count; - stats->record_count += delta_zone->record_count; - stats->collision_count += delta_zone->collision_count; - stats->discard_count += delta_zone->discard_count; - stats->overflow_count += delta_zone->overflow_count; - stats->list_count += delta_zone->list_count; - } -} - -size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits) -{ - u16 min_bits; - u32 incr_keys; - u32 min_keys; - - compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys); - /* On average, each delta is encoded into about min_bits + 1.5 bits. */ - return entry_count * (payload_bits + min_bits + 1) + entry_count / 2; -} - -u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, - u32 payload_bits, size_t bytes_per_page) -{ - unsigned int bits_per_delta_list; - unsigned int bits_per_page; - size_t bits_per_index; - - /* Compute the expected number of bits needed for all the entries. */ - bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta, - payload_bits); - bits_per_delta_list = bits_per_index / list_count; - - /* Add in the immutable delta list headers. */ - bits_per_index += list_count * IMMUTABLE_HEADER_SIZE; - /* Compute the number of usable bits on an immutable index page. */ - bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE); - /* - * Reduce the bits per page by one immutable delta list header and one delta list to - * account for internal fragmentation. - */ - bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list; - /* Now compute the number of pages needed. */ - return DIV_ROUND_UP(bits_per_index, bits_per_page); -} - -void uds_log_delta_index_entry(struct delta_index_entry *delta_entry) -{ - uds_log_ratelimit(uds_log_info, - "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", - delta_entry->list_number, delta_entry->key, - delta_entry->offset, delta_entry->at_end ? " end" : "", - delta_entry->is_collision ? " collision" : "", - delta_entry->delta_list->size, - delta_entry->list_overflow ? " overflow" : ""); - delta_entry->list_overflow = false; -} diff --git a/drivers/md/dm-vdo/delta-index.h b/drivers/md/dm-vdo/delta-index.h deleted file mode 100644 index b3b38fb440bfd..0000000000000 --- a/drivers/md/dm-vdo/delta-index.h +++ /dev/null @@ -1,278 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_DELTA_INDEX_H -#define UDS_DELTA_INDEX_H - -#include - -#include "config.h" -#include "io-factory.h" -#include "numeric.h" -#include "time-utils.h" - -/* - * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the - * value). The entries are sorted by address, and only the delta between successive addresses is - * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are - * therefore exponentially distributed. - * - * A delta_index can either be mutable or immutable depending on its expected use. 
The immutable - * form of a delta index is used for the indexes of closed chapters committed to the volume. The - * mutable form of a delta index is used by the volume index, and also by the chapter index in an - * open chapter. Like the index as a whole, each mutable delta index is divided into a number of - * independent zones. - */ - -struct delta_list { - /* The offset of the delta list start, in bits */ - u64 start; - /* The number of bits in the delta list */ - u16 size; - /* Where the last search "found" the key, in bits */ - u16 save_offset; - /* The key for the record just before save_offset */ - u32 save_key; -}; - -struct delta_zone { - /* The delta list memory */ - u8 *memory; - /* The delta list headers */ - struct delta_list *delta_lists; - /* Temporary starts of delta lists */ - u64 *new_offsets; - /* Buffered writer for saving an index */ - struct buffered_writer *buffered_writer; - /* The size of delta list memory */ - size_t size; - /* Nanoseconds spent rebalancing */ - ktime_t rebalance_time; - /* Number of memory rebalances */ - u32 rebalance_count; - /* The number of bits in a stored value */ - u8 value_bits; - /* The number of bits in the minimal key code */ - u16 min_bits; - /* The number of keys used in a minimal code */ - u32 min_keys; - /* The number of keys used for another code bit */ - u32 incr_keys; - /* The number of records in the index */ - u64 record_count; - /* The number of collision records */ - u64 collision_count; - /* The number of records removed */ - u64 discard_count; - /* The number of UDS_OVERFLOW errors detected */ - u64 overflow_count; - /* The index of the first delta list */ - u32 first_list; - /* The number of delta lists */ - u32 list_count; - /* Tag belonging to this delta index */ - u8 tag; -} __aligned(L1_CACHE_BYTES); - -struct delta_list_save_info { - /* Tag identifying which delta index this list is in */ - u8 tag; - /* Bit offset of the start of the list data */ - u8 bit_offset; - /* Number of bytes of list data */ - u16 byte_count; - /* The delta list number within the delta index */ - u32 index; -} __packed; - -struct delta_index { - /* The zones */ - struct delta_zone *delta_zones; - /* The number of zones */ - unsigned int zone_count; - /* The number of delta lists */ - u32 list_count; - /* Maximum lists per zone */ - u32 lists_per_zone; - /* Total memory allocated to this index */ - size_t memory_size; - /* The number of non-empty lists at load time per zone */ - u32 load_lists[MAX_ZONES]; - /* True if this index is mutable */ - bool mutable; - /* Tag belonging to this delta index */ - u8 tag; -}; - -/* - * A delta_index_page describes a single page of a chapter index. The delta_index field allows the - * page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter - * index page as a single zone index, and without the need to do an additional memory allocation. - */ -struct delta_index_page { - struct delta_index delta_index; - /* These values are loaded from the delta_page_header */ - u32 lowest_list_number; - u32 highest_list_number; - u64 virtual_chapter_number; - /* This structure describes the single zone of a delta index page. */ - struct delta_zone delta_zone; -}; - -/* - * Notes on the delta_index_entries: - * - * The fields documented as "public" can be read by any code that uses a delta_index. The fields - * documented as "private" carry information between delta_index method calls and should not be - * used outside the delta_index module. 
- * - * (1) The delta_index_entry is used like an iterator when searching a delta list. - * - * (2) It is also the result of a successful search and can be used to refer to the element found - * by the search. - * - * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion - * point for a new record. - * - * (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new - * record at the end of the list. - * - * (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a - * collision entry in the list, and the delta_list entry can be used a a reference to this - * entry. - * - * (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a - * non-collision entry in the list. Such delta_list entries can be used as a reference to a - * found entry, or an insertion point for a non-collision entry before this entry, or an - * insertion point for a collision entry that collides with this entry. - */ -struct delta_index_entry { - /* Public fields */ - /* The key for this entry */ - u32 key; - /* We are after the last list entry */ - bool at_end; - /* This record is a collision */ - bool is_collision; - - /* Private fields */ - /* This delta list overflowed */ - bool list_overflow; - /* The number of bits used for the value */ - u8 value_bits; - /* The number of bits used for the entire entry */ - u16 entry_bits; - /* The delta index zone */ - struct delta_zone *delta_zone; - /* The delta list containing the entry */ - struct delta_list *delta_list; - /* The delta list number */ - u32 list_number; - /* Bit offset of this entry within the list */ - u16 offset; - /* The delta between this and previous entry */ - u32 delta; - /* Temporary delta list for immutable indices */ - struct delta_list temp_delta_list; -}; - -struct delta_index_stats { - /* Number of bytes allocated */ - size_t memory_allocated; - /* Nanoseconds spent rebalancing */ - ktime_t rebalance_time; - /* Number of memory rebalances */ - u32 rebalance_count; - /* The number of records in the index */ - u64 record_count; - /* The number of collision records */ - u64 collision_count; - /* The number of records removed */ - u64 discard_count; - /* The number of UDS_OVERFLOW errors detected */ - u64 overflow_count; - /* The number of delta lists */ - u32 list_count; -}; - -int __must_check uds_initialize_delta_index(struct delta_index *delta_index, - unsigned int zone_count, u32 list_count, - u32 mean_delta, u32 payload_bits, - size_t memory_size, u8 tag); - -int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page, - u64 expected_nonce, u32 mean_delta, - u32 payload_bits, u8 *memory, - size_t memory_size); - -void uds_uninitialize_delta_index(struct delta_index *delta_index); - -void uds_reset_delta_index(const struct delta_index *delta_index); - -int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index, - u64 header_nonce, u8 *memory, - size_t memory_size, - u64 virtual_chapter_number, u32 first_list, - u32 *list_count); - -int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count); - -int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count); - -int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers, - unsigned int reader_count); 
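-
-/*
- * A hedged save-sequence sketch for a single zone, assuming the caller owns
- * a buffered_writer "writer":
- *
- *	uds_start_saving_delta_index(index, 0, writer);
- *	uds_finish_saving_delta_index(index, 0);
- *	uds_write_guard_delta_list(writer);
- */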
- -int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number, - struct buffered_writer *buffered_writer); - -int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number); - -int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer); - -size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count, - size_t memory_size); - -int __must_check uds_start_delta_index_search(const struct delta_index *delta_index, - u32 list_number, u32 key, - struct delta_index_entry *iterator); - -int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry); - -int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry); - -int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index, - u32 list_number, u32 key, const u8 *name, - struct delta_index_entry *delta_entry); - -int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, - u8 *name); - -u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry); - -int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value); - -int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, - u32 value, const u8 *name); - -int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry); - -void uds_get_delta_index_stats(const struct delta_index *delta_index, - struct delta_index_stats *stats); - -size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, - u32 payload_bits); - -u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, - u32 payload_bits, size_t bytes_per_page); - -void uds_log_delta_index_entry(struct delta_index_entry *delta_entry); - -#endif /* UDS_DELTA_INDEX_H */ diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c deleted file mode 100644 index d2b49e39550c9..0000000000000 --- a/drivers/md/dm-vdo/funnel-requestqueue.c +++ /dev/null @@ -1,279 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "funnel-requestqueue.h" - -#include -#include -#include - -#include "funnel-queue.h" -#include "logger.h" -#include "memory-alloc.h" -#include "thread-utils.h" - -/* - * This queue will attempt to handle requests in reasonably sized batches instead of reacting - * immediately to each new request. The wait time between batches is dynamically adjusted up or - * down to try to balance responsiveness against wasted thread run time. - * - * If the wait time becomes long enough, the queue will become dormant and must be explicitly - * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel - * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a - * wakeup of the worker thread. - * - * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to - * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before - * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or - * more memory barriers have been issued. (Preparing to sleep uses spin locks.) 
Even if the funnel - * queue's "next" field update isn't visible yet to make the entry accessible, its existence will - * kick the worker thread out of dormant mode and back into timer-based mode. - * - * Unbatched requests are used to communicate between different zone threads and will also cause - * the queue to awaken immediately. - */ - -enum { - NANOSECOND = 1, - MICROSECOND = 1000 * NANOSECOND, - MILLISECOND = 1000 * MICROSECOND, - DEFAULT_WAIT_TIME = 20 * MICROSECOND, - MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2, - MAXIMUM_WAIT_TIME = MILLISECOND, - MINIMUM_BATCH = 32, - MAXIMUM_BATCH = 64, -}; - -struct uds_request_queue { - /* Wait queue for synchronizing producers and consumer */ - struct wait_queue_head wait_head; - /* Function to process a request */ - uds_request_queue_processor_fn processor; - /* Queue of new incoming requests */ - struct funnel_queue *main_queue; - /* Queue of old requests to retry */ - struct funnel_queue *retry_queue; - /* The thread id of the worker thread */ - struct thread *thread; - /* True if the worker was started */ - bool started; - /* When true, requests can be enqueued */ - bool running; - /* A flag set when the worker is waiting without a timeout */ - atomic_t dormant; -}; - -static inline struct uds_request *poll_queues(struct uds_request_queue *queue) -{ - struct funnel_queue_entry *entry; - - entry = uds_funnel_queue_poll(queue->retry_queue); - if (entry != NULL) - return container_of(entry, struct uds_request, queue_link); - - entry = uds_funnel_queue_poll(queue->main_queue); - if (entry != NULL) - return container_of(entry, struct uds_request, queue_link); - - return NULL; -} - -static inline bool are_queues_idle(struct uds_request_queue *queue) -{ - return uds_is_funnel_queue_idle(queue->retry_queue) && - uds_is_funnel_queue_idle(queue->main_queue); -} - -/* - * Determine if there is a next request to process, and return it if there is. Also return flags - * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether - * the thread did sleep before returning a new request. - */ -static inline bool dequeue_request(struct uds_request_queue *queue, - struct uds_request **request_ptr, bool *waited_ptr) -{ - struct uds_request *request = poll_queues(queue); - - if (request != NULL) { - *request_ptr = request; - return true; - } - - if (!READ_ONCE(queue->running)) { - /* Wake the worker thread so it can exit. */ - *request_ptr = NULL; - return true; - } - - *request_ptr = NULL; - *waited_ptr = true; - return false; -} - -static void wait_for_request(struct uds_request_queue *queue, bool dormant, - unsigned long timeout, struct uds_request **request, - bool *waited) -{ - if (dormant) { - wait_event_interruptible(queue->wait_head, - (dequeue_request(queue, request, waited) || - !are_queues_idle(queue))); - return; - } - - wait_event_interruptible_hrtimeout(queue->wait_head, - dequeue_request(queue, request, waited), - ns_to_ktime(timeout)); -} - -static void request_queue_worker(void *arg) -{ - struct uds_request_queue *queue = arg; - struct uds_request *request = NULL; - unsigned long time_batch = DEFAULT_WAIT_TIME; - bool dormant = atomic_read(&queue->dormant); - bool waited = false; - long current_batch = 0; - - for (;;) { - wait_for_request(queue, dormant, time_batch, &request, &waited); - if (likely(request != NULL)) { - current_batch++; - queue->processor(request); - } else if (!READ_ONCE(queue->running)) { - break; - } - - if (dormant) { - /* - * The queue has been roused from dormancy. 
Clear the flag so enqueuers can - * stop broadcasting. No fence is needed for this transition. - */ - atomic_set(&queue->dormant, false); - dormant = false; - time_batch = DEFAULT_WAIT_TIME; - } else if (waited) { - /* - * We waited for this request to show up. Adjust the wait time to smooth - * out the batch size. - */ - if (current_batch < MINIMUM_BATCH) { - /* - * If the last batch of requests was too small, increase the wait - * time. - */ - time_batch += time_batch / 4; - if (time_batch >= MAXIMUM_WAIT_TIME) { - atomic_set(&queue->dormant, true); - dormant = true; - } - } else if (current_batch > MAXIMUM_BATCH) { - /* - * If the last batch of requests was too large, decrease the wait - * time. - */ - time_batch -= time_batch / 4; - if (time_batch < MINIMUM_WAIT_TIME) - time_batch = MINIMUM_WAIT_TIME; - } - current_batch = 0; - } - } - - /* - * Ensure that we process any remaining requests that were enqueued before trying to shut - * down. The corresponding write barrier is in uds_request_queue_finish(). - */ - smp_rmb(); - while ((request = poll_queues(queue)) != NULL) - queue->processor(request); -} - -int uds_make_request_queue(const char *queue_name, - uds_request_queue_processor_fn processor, - struct uds_request_queue **queue_ptr) -{ - int result; - struct uds_request_queue *queue; - - result = uds_allocate(1, struct uds_request_queue, __func__, &queue); - if (result != UDS_SUCCESS) - return result; - - queue->processor = processor; - queue->running = true; - atomic_set(&queue->dormant, false); - init_waitqueue_head(&queue->wait_head); - - result = uds_make_funnel_queue(&queue->main_queue); - if (result != UDS_SUCCESS) { - uds_request_queue_finish(queue); - return result; - } - - result = uds_make_funnel_queue(&queue->retry_queue); - if (result != UDS_SUCCESS) { - uds_request_queue_finish(queue); - return result; - } - - result = vdo_create_thread(request_queue_worker, queue, queue_name, - &queue->thread); - if (result != UDS_SUCCESS) { - uds_request_queue_finish(queue); - return result; - } - - queue->started = true; - *queue_ptr = queue; - return UDS_SUCCESS; -} - -static inline void wake_up_worker(struct uds_request_queue *queue) -{ - if (wq_has_sleeper(&queue->wait_head)) - wake_up(&queue->wait_head); -} - -void uds_request_queue_enqueue(struct uds_request_queue *queue, - struct uds_request *request) -{ - struct funnel_queue *sub_queue; - bool unbatched = request->unbatched; - - sub_queue = request->requeued ? queue->retry_queue : queue->main_queue; - uds_funnel_queue_put(sub_queue, &request->queue_link); - - /* - * We must wake the worker thread when it is dormant. A read fence isn't needed here since - * we know the queue operation acts as one. - */ - if (atomic_read(&queue->dormant) || unbatched) - wake_up_worker(queue); -} - -void uds_request_queue_finish(struct uds_request_queue *queue) -{ - if (queue == NULL) - return; - - /* - * This memory barrier ensures that any requests we queued will be seen. The point is that - * when dequeue_request() sees the following update to the running flag, it will also be - * able to see any change we made to a next field in the funnel queue entry. The - * corresponding read barrier is in request_queue_worker(). 
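-	 * In short: a worker that reads running == false is then guaranteed
-	 * to see every entry already placed on either queue, and the drain
-	 * loop at the end of request_queue_worker() processes them.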
- */ - smp_wmb(); - WRITE_ONCE(queue->running, false); - - if (queue->started) { - wake_up_worker(queue); - vdo_join_threads(queue->thread); - } - - uds_free_funnel_queue(queue->main_queue); - uds_free_funnel_queue(queue->retry_queue); - uds_free(queue); -} diff --git a/drivers/md/dm-vdo/funnel-requestqueue.h b/drivers/md/dm-vdo/funnel-requestqueue.h deleted file mode 100644 index 9b0f53939b4dd..0000000000000 --- a/drivers/md/dm-vdo/funnel-requestqueue.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_REQUEST_QUEUE_H -#define UDS_REQUEST_QUEUE_H - -#include "indexer.h" - -/* - * A simple request queue which will handle new requests in the order in which they are received, - * and will attempt to handle requeued requests before new ones. However, the nature of the - * implementation means that it cannot guarantee this ordering; the prioritization is merely a - * hint. - */ - -struct uds_request_queue; - -typedef void (*uds_request_queue_processor_fn)(struct uds_request *); - -int __must_check uds_make_request_queue(const char *queue_name, - uds_request_queue_processor_fn processor, - struct uds_request_queue **queue_ptr); - -void uds_request_queue_enqueue(struct uds_request_queue *queue, - struct uds_request *request); - -void uds_request_queue_finish(struct uds_request_queue *queue); - -#endif /* UDS_REQUEST_QUEUE_H */ diff --git a/drivers/md/dm-vdo/geometry.c b/drivers/md/dm-vdo/geometry.c deleted file mode 100644 index 04c07195a01c6..0000000000000 --- a/drivers/md/dm-vdo/geometry.c +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "geometry.h" - -#include -#include - -#include "delta-index.h" -#include "errors.h" -#include "indexer.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" - -/* - * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a - * fixed number of fixed-size pages. The volume layout is defined by two constants and four - * parameters. The constants are that index records are 32 bytes long (16-byte block name plus - * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters - * are the number of bytes in a page, the number of record pages in a chapter, the number of - * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can - * derive the rest of the layout and other index properties. - * - * The index volume is sized by its maximum memory footprint. For a dense index, the persistent - * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent - * storage is about 100 times the size of the memory footprint. - * - * For a small index with a memory footprint less than 1GB, there are three possible memory - * configurations: 0.25GB, 0.5GB and 0.75GB. The default geometry for each is 1024 index records - * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter - * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For - * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5 - * GB for the persistent storage and 256 MB of RAM. 
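
Worked numbers for that default 0.25 GB case (a standalone sketch; the 4 KB figure is the data block size each record represents, everything else comes from the parameters just described):

#include <stdio.h>

int main(void)
{
        unsigned long bytes_per_page = 32768;            /* 1024 records * 32 B */
        unsigned long records_per_page = bytes_per_page / 32;
        unsigned long record_pages_per_chapter = 64;     /* small-index default */
        unsigned long chapters_per_volume = 1024;
        unsigned long long records_per_volume =
                (unsigned long long) records_per_page *
                record_pages_per_chapter * chapters_per_volume;

        /* 67,108,864 records, each naming a 4 KB block: a 256 GB window. */
        printf("records per volume: %llu\n", records_per_volume);
        printf("deduplication window: %llu GB\n",
               records_per_volume * 4096 / (1024ULL * 1024 * 1024));
        return 0;
}
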
- * - * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024 - * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024 - * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window - * of 1 TB using about 9GB of persistent storage and 1 GB of RAM. - * - * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times - * as many chapters as the corresponding non-sparse volume, which provides 10 times the - * deduplication window while using 10 times as much persistent storage as the equivalent - * non-sparse volume with the same memory footprint. - * - * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters - * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual - * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter. - * This remapping is expressed by storing which virtual chapter was remapped, and which physical - * chapter it was moved to. - */ - -int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, - u32 chapters_per_volume, u32 sparse_chapters_per_volume, - u64 remapped_virtual, u64 remapped_physical, - struct index_geometry **geometry_ptr) -{ - int result; - struct index_geometry *geometry; - - result = uds_allocate(1, struct index_geometry, "geometry", &geometry); - if (result != UDS_SUCCESS) - return result; - - geometry->bytes_per_page = bytes_per_page; - geometry->record_pages_per_chapter = record_pages_per_chapter; - geometry->chapters_per_volume = chapters_per_volume; - geometry->sparse_chapters_per_volume = sparse_chapters_per_volume; - geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume; - geometry->remapped_virtual = remapped_virtual; - geometry->remapped_physical = remapped_physical; - - geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD; - geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter; - geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume; - - geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; - geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1); - /* - * We want 1 delta list for every 64 records in the chapter. - * The "| 077" ensures that the chapter_delta_list_bits computation - * does not underflow. - */ - geometry->chapter_delta_list_bits = - bits_per((geometry->records_per_chapter - 1) | 077) - 6; - geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits; - /* We need enough address bits to achieve the desired mean delta. 
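
To make the list sizing concrete, here is a small sketch of the computation above, assuming bits_per(x) means "number of bits needed to represent x" (as the UDS helper does):

#include <stdio.h>

static unsigned int bits_per(unsigned long long x)
{
        unsigned int bits = 1;

        while (x >>= 1)
                bits++;
        return bits;
}

int main(void)
{
        unsigned long records_per_chapter = 65536;      /* small-index default */
        unsigned int list_bits =
                bits_per((records_per_chapter - 1) | 077) - 6;

        /* 65536 records -> 16 bits -> list_bits 10 -> 1024 lists, i.e. one
         * list per 64 records; the "| 077" floor keeps list_bits >= 0 even
         * for an implausibly tiny chapter. */
        printf("delta lists: %u (one per %lu records)\n",
               1U << list_bits, records_per_chapter >> list_bits);
        return 0;
}
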
*/ - geometry->chapter_address_bits = - (DEFAULT_CHAPTER_MEAN_DELTA_BITS - - geometry->chapter_delta_list_bits + - bits_per(geometry->records_per_chapter - 1)); - geometry->index_pages_per_chapter = - uds_get_delta_index_page_count(geometry->records_per_chapter, - geometry->delta_lists_per_chapter, - geometry->chapter_mean_delta, - geometry->chapter_payload_bits, - bytes_per_page); - - geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter; - geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume; - geometry->bytes_per_volume = - bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME); - - *geometry_ptr = geometry; - return UDS_SUCCESS; -} - -int uds_copy_index_geometry(struct index_geometry *source, - struct index_geometry **geometry_ptr) -{ - return uds_make_index_geometry(source->bytes_per_page, - source->record_pages_per_chapter, - source->chapters_per_volume, - source->sparse_chapters_per_volume, - source->remapped_virtual, source->remapped_physical, - geometry_ptr); -} - -void uds_free_index_geometry(struct index_geometry *geometry) -{ - uds_free(geometry); -} - -u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, - u64 virtual_chapter) -{ - u64 delta; - - if (!uds_is_reduced_index_geometry(geometry)) - return virtual_chapter % geometry->chapters_per_volume; - - if (likely(virtual_chapter > geometry->remapped_virtual)) { - delta = virtual_chapter - geometry->remapped_virtual; - if (likely(delta > geometry->remapped_physical)) - return delta % geometry->chapters_per_volume; - else - return delta - 1; - } - - if (virtual_chapter == geometry->remapped_virtual) - return geometry->remapped_physical; - - delta = geometry->remapped_virtual - virtual_chapter; - if (delta < geometry->chapters_per_volume) - return geometry->chapters_per_volume - delta; - - /* This chapter is so old the answer doesn't matter. */ - return 0; -} - -/* Check whether any sparse chapters are in use. */ -bool uds_has_sparse_chapters(const struct index_geometry *geometry, - u64 oldest_virtual_chapter, u64 newest_virtual_chapter) -{ - return uds_is_sparse_index_geometry(geometry) && - ((newest_virtual_chapter - oldest_virtual_chapter + 1) > - geometry->dense_chapters_per_volume); -} - -bool uds_is_chapter_sparse(const struct index_geometry *geometry, - u64 oldest_virtual_chapter, u64 newest_virtual_chapter, - u64 virtual_chapter_number) -{ - return uds_has_sparse_chapters(geometry, oldest_virtual_chapter, - newest_virtual_chapter) && - ((virtual_chapter_number + geometry->dense_chapters_per_volume) <= - newest_virtual_chapter); -} - -/* Calculate how many chapters to expire after opening the newest chapter. */ -u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter) -{ - /* If the index isn't full yet, don't expire anything. */ - if (newest_chapter < geometry->chapters_per_volume) - return 0; - - /* If a chapter is out of order... */ - if (geometry->remapped_physical > 0) { - u64 oldest_chapter = newest_chapter - geometry->chapters_per_volume; - - /* - * ... expire an extra chapter when expiring the moved chapter to free physical - * space for the new chapter ... - */ - if (oldest_chapter == geometry->remapped_virtual) - return 2; - - /* - * ... but don't expire anything when the new chapter will use the physical chapter - * freed by expiring the moved chapter. 
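
The three expiry outcomes are easiest to see with concrete values; this standalone sketch mirrors the decision below using hypothetical converted-volume numbers (1023 chapters per volume, virtual chapter 1500 remapped to physical chapter 7):

#include <stdio.h>

static unsigned int to_expire(unsigned long long newest)
{
        const unsigned long long chapters_per_volume = 1023;
        const unsigned long long remapped_virtual = 1500;
        const unsigned long long remapped_physical = 7;
        unsigned long long oldest;

        if (newest < chapters_per_volume)
                return 0;                       /* index not yet full */
        oldest = newest - chapters_per_volume;
        if (oldest == remapped_virtual)
                return 2;                       /* moved chapter: expire two */
        if (oldest == remapped_virtual + remapped_physical)
                return 0;                       /* reuse the freed slot */
        return 1;                               /* the normal case */
}

int main(void)
{
        unsigned long long newest;

        /* Prints 2 at newest 2523, 0 at 2530, and 1 everywhere else. */
        for (newest = 2522; newest <= 2531; newest++)
                printf("newest %llu: expire %u\n", newest, to_expire(newest));
        return 0;
}
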
- */ - if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical)) - return 0; - } - - /* Normally, just expire one. */ - return 1; -} diff --git a/drivers/md/dm-vdo/geometry.h b/drivers/md/dm-vdo/geometry.h deleted file mode 100644 index a2ecdb238cf2d..0000000000000 --- a/drivers/md/dm-vdo/geometry.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_INDEX_GEOMETRY_H -#define UDS_INDEX_GEOMETRY_H - -#include "indexer.h" - -/* - * The index_geometry records parameters that define the layout of a UDS index volume, and the size and - * shape of various index structures. It is created when the index is created, and is referenced by - * many index sub-components. - */ - -struct index_geometry { - /* Size of a chapter page, in bytes */ - size_t bytes_per_page; - /* Number of record pages in a chapter */ - u32 record_pages_per_chapter; - /* Total number of chapters in a volume */ - u32 chapters_per_volume; - /* Number of sparsely-indexed chapters in a volume */ - u32 sparse_chapters_per_volume; - /* Number of bits used to determine delta list numbers */ - u8 chapter_delta_list_bits; - /* Virtual chapter remapped from physical chapter 0 */ - u64 remapped_virtual; - /* New physical chapter where the remapped chapter can be found */ - u64 remapped_physical; - - /* - * The following properties are derived from the ones above, but they are computed and - * recorded as fields for convenience. - */ - /* Total number of pages in a volume, excluding the header */ - u32 pages_per_volume; - /* Total number of bytes in a volume, including the header */ - size_t bytes_per_volume; - /* Number of pages in a chapter */ - u32 pages_per_chapter; - /* Number of index pages in a chapter index */ - u32 index_pages_per_chapter; - /* Number of records that fit on a page */ - u32 records_per_page; - /* Number of records that fit in a chapter */ - u32 records_per_chapter; - /* Number of records that fit in a volume */ - u64 records_per_volume; - /* Number of delta lists per chapter index */ - u32 delta_lists_per_chapter; - /* Mean delta for chapter indexes */ - u32 chapter_mean_delta; - /* Number of bits needed for record page numbers */ - u8 chapter_payload_bits; - /* Number of bits used to compute addresses for chapter delta lists */ - u8 chapter_address_bits; - /* Number of densely-indexed chapters in a volume */ - u32 dense_chapters_per_volume; -}; - -enum { - /* The number of bytes in a record (name + metadata) */ - BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE), - - /* The default length of a page in a chapter, in bytes */ - DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, - - /* The default maximum number of records per page */ - DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, - - /* The default number of record pages in a chapter */ - DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, - - /* The default number of record pages in a chapter for a small index */ - SMALL_RECORD_PAGES_PER_CHAPTER = 64, - - /* The default number of chapters in a volume */ - DEFAULT_CHAPTERS_PER_VOLUME = 1024, - - /* The default number of sparsely-indexed chapters in a volume */ - DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, - - /* The log2 of the default mean delta */ - DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, - - /* The log2 of the number of delta lists in a large chapter */ - DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, - - /* The log2 of the number of delta lists in a small chapter */ - SMALL_CHAPTER_DELTA_LIST_BITS = 
10, - - /* The number of header pages per volume */ - HEADER_PAGES_PER_VOLUME = 1, -}; - -int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, - u32 chapters_per_volume, - u32 sparse_chapters_per_volume, u64 remapped_virtual, - u64 remapped_physical, - struct index_geometry **geometry_ptr); - -int __must_check uds_copy_index_geometry(struct index_geometry *source, - struct index_geometry **geometry_ptr); - -void uds_free_index_geometry(struct index_geometry *geometry); - -u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, - u64 virtual_chapter); - -/* - * Check whether this geometry is reduced by a chapter. This will only be true if the volume was - * converted from a non-lvm volume to an lvm volume. - */ -static inline bool __must_check -uds_is_reduced_index_geometry(const struct index_geometry *geometry) -{ - return !!(geometry->chapters_per_volume & 1); -} - -static inline bool __must_check -uds_is_sparse_index_geometry(const struct index_geometry *geometry) -{ - return geometry->sparse_chapters_per_volume > 0; -} - -bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry, - u64 oldest_virtual_chapter, - u64 newest_virtual_chapter); - -bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry, - u64 oldest_virtual_chapter, - u64 newest_virtual_chapter, - u64 virtual_chapter_number); - -u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry, - u64 newest_chapter); - -#endif /* UDS_INDEX_GEOMETRY_H */ diff --git a/drivers/md/dm-vdo/hash-utils.h b/drivers/md/dm-vdo/hash-utils.h deleted file mode 100644 index e3b865bbe9b2c..0000000000000 --- a/drivers/md/dm-vdo/hash-utils.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_HASH_UTILS_H -#define UDS_HASH_UTILS_H - -#include "geometry.h" -#include "indexer.h" -#include "numeric.h" - -/* Utilities for extracting portions of a request name for various uses. */ - -/* How various portions of a record name are apportioned. */ -enum { - VOLUME_INDEX_BYTES_OFFSET = 0, - VOLUME_INDEX_BYTES_COUNT = 8, - CHAPTER_INDEX_BYTES_OFFSET = 8, - CHAPTER_INDEX_BYTES_COUNT = 6, - SAMPLE_BYTES_OFFSET = 14, - SAMPLE_BYTES_COUNT = 2, -}; - -static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name) -{ - const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET]; - u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32; - - bytes |= get_unaligned_be32(chapter_bits + 2); - return bytes; -} - -static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name) -{ - return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]); -} - -static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name) -{ - return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]); -} - -/* Compute the chapter delta list for a given name. */ -static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name, - const struct index_geometry *geometry) -{ - return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) & - ((1 << geometry->chapter_delta_list_bits) - 1)); -} - -/* Compute the chapter delta address for a given name. 
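
The byte apportionment is mechanical; this sketch pulls the three portions out of a sample name, using plain shifts in place of the get_unaligned_be*() helpers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint8_t name[16] = {
                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* volume index */
                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,             /* chapter index */
                0x0e, 0x0f,                                     /* sampling */
        };
        uint64_t volume = 0, chapter = 0;
        uint32_t sample;
        int i;

        for (i = 0; i < 8; i++)
                volume = (volume << 8) | name[i];
        for (i = 8; i < 14; i++)
                chapter = (chapter << 8) | name[i];
        sample = ((uint32_t) name[14] << 8) | name[15];

        printf("volume index bytes:  %016llx\n", (unsigned long long) volume);
        printf("chapter index bytes: %012llx\n", (unsigned long long) chapter);
        printf("sampling bytes:      %04x\n", sample);
        return 0;
}
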
*/ -static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name, - const struct index_geometry *geometry) -{ - return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1); -} - -static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name, - unsigned int slot_count) -{ - return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count); -} - -#endif /* UDS_HASH_UTILS_H */ diff --git a/drivers/md/dm-vdo/index-layout.c b/drivers/md/dm-vdo/index-layout.c deleted file mode 100644 index 2da507b26fd5b..0000000000000 --- a/drivers/md/dm-vdo/index-layout.c +++ /dev/null @@ -1,1768 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "index-layout.h" - -#include - -#include "config.h" -#include "logger.h" -#include "memory-alloc.h" -#include "murmurhash3.h" -#include "numeric.h" -#include "open-chapter.h" -#include "time-utils.h" -#include "volume-index.h" - -/* - * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of - * which are computed when the index is created. Every header and region begins on 4K block - * boundary. Save regions are further sub-divided into regions of their own. - * - * Each region has a kind and an instance number. Some kinds only have one instance and therefore - * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to - * represent sub-indices; now, however there is only ever one sub-index and therefore one instance. - * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved. - * - * Every region header has a type and version. - * - * +-+-+---------+--------+--------+-+ - * | | | I N D E X 0 101, 0 | | - * |H|C+---------+--------+--------+S| - * |D|f| Volume | Save | Save |e| - * |R|g| Region | Region | Region |a| - * | | | 201, -1 | 202, 0 | 202, 1 |l| - * +-+-+--------+---------+--------+-+ - * - * The header contains the encoded region layout table as well as some index configuration data. - * The sub-index region and its subdivisions are maintained in the same table. - * - * There are two save regions to preserve the old state in case saving the new state is incomplete. - * They are used in alternation. Each save region is further divided into sub-regions. - * - * +-+-----+------+------+-----+-----+ - * |H| IPM | MI | MI | | OC | - * |D| | zone | zone | ... | | - * |R| 301 | 302 | 302 | | 303 | - * | | -1 | 0 | 1 | | -1 | - * +-+-----+------+------+-----+-----+ - * - * The header contains the encoded region layout table as well as index state data for that save. - * Each save also has a unique nonce. - */ - -enum { - MAGIC_SIZE = 32, - NONCE_INFO_SIZE = 32, - MAX_SAVES = 2, -}; - -enum region_kind { - RL_KIND_EMPTY = 0, - RL_KIND_HEADER = 1, - RL_KIND_CONFIG = 100, - RL_KIND_INDEX = 101, - RL_KIND_SEAL = 102, - RL_KIND_VOLUME = 201, - RL_KIND_SAVE = 202, - RL_KIND_INDEX_PAGE_MAP = 301, - RL_KIND_VOLUME_INDEX = 302, - RL_KIND_OPEN_CHAPTER = 303, -}; - -/* Some region types are historical and are no longer used. */ -enum region_type { - RH_TYPE_FREE = 0, /* unused */ - RH_TYPE_SUPER = 1, - RH_TYPE_SAVE = 2, - RH_TYPE_CHECKPOINT = 3, /* unused */ - RH_TYPE_UNSAVED = 4, -}; - -enum { - RL_SOLE_INSTANCE = 65535, -}; - -/* - * Super block version 2 is the first released version. - * - * Super block version 3 is the normal version used from RHEL 8.2 onwards. 
- * - * Super block versions 4 through 6 were incremental development versions and - * are not supported. - * - * Super block version 7 is used for volumes which have been reduced in size by one chapter in - * order to make room to prepend LVM metadata to a volume originally created without lvm. This - * allows the index to retain most its deduplication records. - */ -enum { - SUPER_VERSION_MINIMUM = 3, - SUPER_VERSION_CURRENT = 3, - SUPER_VERSION_MAXIMUM = 7, -}; - -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; -static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ - -struct region_header { - u64 magic; - u64 region_blocks; - u16 type; - /* Currently always version 1 */ - u16 version; - u16 region_count; - u16 payload; -}; - -struct layout_region { - u64 start_block; - u64 block_count; - u32 __unused; - u16 kind; - u16 instance; -}; - -struct region_table { - size_t encoded_size; - struct region_header header; - struct layout_region regions[]; -}; - -struct index_save_data { - u64 timestamp; - u64 nonce; - /* Currently always version 1 */ - u32 version; - u32 unused__; -}; - -struct index_state_version { - s32 signature; - s32 version_id; -}; - -static const struct index_state_version INDEX_STATE_VERSION_301 = { - .signature = -1, - .version_id = 301, -}; - -struct index_state_data301 { - struct index_state_version version; - u64 newest_chapter; - u64 oldest_chapter; - u64 last_save; - u32 unused; - u32 padding; -}; - -struct index_save_layout { - unsigned int zone_count; - struct layout_region index_save; - struct layout_region header; - struct layout_region index_page_map; - struct layout_region free_space; - struct layout_region volume_index_zones[MAX_ZONES]; - struct layout_region open_chapter; - struct index_save_data save_data; - struct index_state_data301 state_data; -}; - -struct sub_index_layout { - u64 nonce; - struct layout_region sub_index; - struct layout_region volume; - struct index_save_layout *saves; -}; - -struct super_block_data { - u8 magic_label[MAGIC_SIZE]; - u8 nonce_info[NONCE_INFO_SIZE]; - u64 nonce; - u32 version; - u32 block_size; - u16 index_count; - u16 max_saves; - /* Padding reflects a blank field on permanent storage */ - u8 padding[4]; - u64 open_chapter_blocks; - u64 page_map_blocks; - u64 volume_offset; - u64 start_offset; -}; - -struct index_layout { - struct io_factory *factory; - size_t factory_size; - off_t offset; - struct super_block_data super; - struct layout_region header; - struct layout_region config; - struct sub_index_layout index; - struct layout_region seal; - u64 total_blocks; -}; - -struct save_layout_sizes { - unsigned int save_count; - size_t block_size; - u64 volume_blocks; - u64 volume_index_blocks; - u64 page_map_blocks; - u64 open_chapter_blocks; - u64 save_blocks; - u64 sub_index_blocks; - u64 total_blocks; - size_t total_size; -}; - -static inline bool is_converted_super_block(struct super_block_data *super) -{ - return super->version == 7; -} - -static int __must_check compute_sizes(const struct uds_configuration *config, - struct save_layout_sizes *sls) -{ - int result; - struct index_geometry *geometry = config->geometry; - - memset(sls, 0, sizeof(*sls)); - sls->save_count = MAX_SAVES; - sls->block_size = UDS_BLOCK_SIZE; - sls->volume_blocks = geometry->bytes_per_volume / sls->block_size; - - result = uds_compute_volume_index_save_blocks(config, sls->block_size, - &sls->volume_index_blocks); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot compute 
index save size"); - - sls->page_map_blocks = - DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry), - sls->block_size); - sls->open_chapter_blocks = - DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry), - sls->block_size); - sls->save_blocks = - 1 + (sls->volume_index_blocks + sls->page_map_blocks + sls->open_chapter_blocks); - sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks); - sls->total_blocks = 3 + sls->sub_index_blocks; - sls->total_size = sls->total_blocks * sls->block_size; - - return UDS_SUCCESS; -} - -int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) -{ - int result; - struct uds_configuration *index_config; - struct save_layout_sizes sizes; - - if (index_size == NULL) { - uds_log_error("Missing output size pointer"); - return -EINVAL; - } - - result = uds_make_configuration(parameters, &index_config); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "cannot compute index size"); - return uds_status_to_errno(result); - } - - result = compute_sizes(index_config, &sizes); - uds_free_configuration(index_config); - if (result != UDS_SUCCESS) - return uds_status_to_errno(result); - - *index_size = sizes.total_size; - return UDS_SUCCESS; -} - -/* Create unique data using the current time and a pseudorandom number. */ -static void create_unique_nonce_data(u8 *buffer) -{ - ktime_t now = current_time_ns(CLOCK_REALTIME); - u32 rand; - size_t offset = 0; - - get_random_bytes(&rand, sizeof(u32)); - memcpy(buffer + offset, &now, sizeof(now)); - offset += sizeof(now); - memcpy(buffer + offset, &rand, sizeof(rand)); - offset += sizeof(rand); - while (offset < NONCE_INFO_SIZE) { - size_t len = min(NONCE_INFO_SIZE - offset, offset); - - memcpy(buffer + offset, buffer, len); - offset += len; - } -} - -static u64 hash_stuff(u64 start, const void *data, size_t len) -{ - u32 seed = start ^ (start >> 27); - u8 hash_buffer[16]; - - murmurhash3_128(data, len, seed, hash_buffer); - return get_unaligned_le64(hash_buffer + 4); -} - -/* Generate a primary nonce from the provided data. */ -static u64 generate_primary_nonce(const void *data, size_t len) -{ - return hash_stuff(0xa1b1e0fc, data, len); -} - -/* - * Deterministically generate a secondary nonce from an existing nonce and some arbitrary data by - * hashing the original nonce and the data to produce a new nonce. 
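
A toy demonstration of that derivation chain (a stand-in 64-bit mix replaces murmurhash3_128, and the input values are hypothetical; only the primary-then-secondary structure is the point):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t toy_hash(uint64_t start, const void *data, size_t len)
{
        const uint8_t *bytes = data;
        uint64_t h = start ^ (start >> 27);
        size_t i;

        /* FNV-style mixing step; the real code uses murmurhash3_128. */
        for (i = 0; i < len; i++)
                h = (h ^ bytes[i]) * 0x100000001b3ULL;
        return h;
}

int main(void)
{
        uint64_t primary = toy_hash(0xa1b1e0fc, "nonce info", 10);
        uint64_t start_block = 4;
        uint8_t buf[10] = { 0 };

        /* Each region derives a distinct but reproducible nonce from the
         * primary nonce plus its own location data. */
        memcpy(buf, &start_block, sizeof(start_block));
        printf("primary nonce:   %016llx\n", (unsigned long long) primary);
        printf("sub-index nonce: %016llx\n",
               (unsigned long long) toy_hash(primary + 1, buf, sizeof(buf)));
        return 0;
}
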
- */ -static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len) -{ - return hash_stuff(nonce + 1, data, len); -} - -static int __must_check open_layout_reader(struct index_layout *layout, - struct layout_region *lr, off_t offset, - struct buffered_reader **reader_ptr) -{ - return uds_make_buffered_reader(layout->factory, lr->start_block + offset, - lr->block_count, reader_ptr); -} - -static int open_region_reader(struct index_layout *layout, struct layout_region *region, - struct buffered_reader **reader_ptr) -{ - return open_layout_reader(layout, region, -layout->super.start_offset, - reader_ptr); -} - -static int __must_check open_layout_writer(struct index_layout *layout, - struct layout_region *lr, off_t offset, - struct buffered_writer **writer_ptr) -{ - return uds_make_buffered_writer(layout->factory, lr->start_block + offset, - lr->block_count, writer_ptr); -} - -static int open_region_writer(struct index_layout *layout, struct layout_region *region, - struct buffered_writer **writer_ptr) -{ - return open_layout_writer(layout, region, -layout->super.start_offset, - writer_ptr); -} - -static void generate_super_block_data(struct save_layout_sizes *sls, - struct super_block_data *super) -{ - memset(super, 0, sizeof(*super)); - memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE); - create_unique_nonce_data(super->nonce_info); - - super->nonce = generate_primary_nonce(super->nonce_info, - sizeof(super->nonce_info)); - super->version = SUPER_VERSION_CURRENT; - super->block_size = sls->block_size; - super->index_count = 1; - super->max_saves = sls->save_count; - super->open_chapter_blocks = sls->open_chapter_blocks; - super->page_map_blocks = sls->page_map_blocks; - super->volume_offset = 0; - super->start_offset = 0; -} - -static void define_sub_index_nonce(struct index_layout *layout) -{ - struct sub_index_nonce_data { - u64 offset; - u16 index_id; - }; - struct sub_index_layout *sil = &layout->index; - u64 primary_nonce = layout->super.nonce; - u8 buffer[sizeof(struct sub_index_nonce_data)] = { 0 }; - size_t offset = 0; - - encode_u64_le(buffer, &offset, sil->sub_index.start_block); - encode_u16_le(buffer, &offset, 0); - sil->nonce = generate_secondary_nonce(primary_nonce, buffer, sizeof(buffer)); - if (sil->nonce == 0) { - sil->nonce = generate_secondary_nonce(~primary_nonce + 1, buffer, - sizeof(buffer)); - } -} - -static void setup_sub_index(struct index_layout *layout, u64 start_block, - struct save_layout_sizes *sls) -{ - struct sub_index_layout *sil = &layout->index; - u64 next_block = start_block; - unsigned int i; - - sil->sub_index = (struct layout_region) { - .start_block = start_block, - .block_count = sls->sub_index_blocks, - .kind = RL_KIND_INDEX, - .instance = 0, - }; - - sil->volume = (struct layout_region) { - .start_block = next_block, - .block_count = sls->volume_blocks, - .kind = RL_KIND_VOLUME, - .instance = RL_SOLE_INSTANCE, - }; - - next_block += sls->volume_blocks; - - for (i = 0; i < sls->save_count; i++) { - sil->saves[i].index_save = (struct layout_region) { - .start_block = next_block, - .block_count = sls->save_blocks, - .kind = RL_KIND_SAVE, - .instance = i, - }; - - next_block += sls->save_blocks; - } - - define_sub_index_nonce(layout); -} - -static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls) -{ - u64 next_block = layout->offset / sls->block_size; - - layout->total_blocks = sls->total_blocks; - generate_super_block_data(sls, &layout->super); - layout->header = (struct layout_region) { - 
.start_block = next_block++, - .block_count = 1, - .kind = RL_KIND_HEADER, - .instance = RL_SOLE_INSTANCE, - }; - - layout->config = (struct layout_region) { - .start_block = next_block++, - .block_count = 1, - .kind = RL_KIND_CONFIG, - .instance = RL_SOLE_INSTANCE, - }; - - setup_sub_index(layout, next_block, sls); - next_block += sls->sub_index_blocks; - - layout->seal = (struct layout_region) { - .start_block = next_block, - .block_count = 1, - .kind = RL_KIND_SEAL, - .instance = RL_SOLE_INSTANCE, - }; -} - -static int __must_check make_index_save_region_table(struct index_save_layout *isl, - struct region_table **table_ptr) -{ - int result; - unsigned int z; - struct region_table *table; - struct layout_region *lr; - u16 region_count; - size_t payload; - size_t type; - - if (isl->zone_count > 0) { - /* - * Normal save regions: header, page map, volume index zones, - * open chapter, and possibly free space. - */ - region_count = 3 + isl->zone_count; - if (isl->free_space.block_count > 0) - region_count++; - - payload = sizeof(isl->save_data) + sizeof(isl->state_data); - type = RH_TYPE_SAVE; - } else { - /* Empty save regions: header, page map, free space. */ - region_count = 3; - payload = sizeof(isl->save_data); - type = RH_TYPE_UNSAVED; - } - - result = uds_allocate_extended(struct region_table, region_count, - struct layout_region, - "layout region table for ISL", &table); - if (result != UDS_SUCCESS) - return result; - - lr = &table->regions[0]; - *lr++ = isl->header; - *lr++ = isl->index_page_map; - for (z = 0; z < isl->zone_count; z++) - *lr++ = isl->volume_index_zones[z]; - - if (isl->zone_count > 0) - *lr++ = isl->open_chapter; - - if (isl->free_space.block_count > 0) - *lr++ = isl->free_space; - - table->header = (struct region_header) { - .magic = REGION_MAGIC, - .region_blocks = isl->index_save.block_count, - .type = type, - .version = 1, - .region_count = region_count, - .payload = payload, - }; - - table->encoded_size = (sizeof(struct region_header) + payload + - region_count * sizeof(struct layout_region)); - *table_ptr = table; - return UDS_SUCCESS; -} - -static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table) -{ - unsigned int i; - - encode_u64_le(buffer, offset, REGION_MAGIC); - encode_u64_le(buffer, offset, table->header.region_blocks); - encode_u16_le(buffer, offset, table->header.type); - encode_u16_le(buffer, offset, table->header.version); - encode_u16_le(buffer, offset, table->header.region_count); - encode_u16_le(buffer, offset, table->header.payload); - - for (i = 0; i < table->header.region_count; i++) { - encode_u64_le(buffer, offset, table->regions[i].start_block); - encode_u64_le(buffer, offset, table->regions[i].block_count); - encode_u32_le(buffer, offset, 0); - encode_u16_le(buffer, offset, table->regions[i].kind); - encode_u16_le(buffer, offset, table->regions[i].instance); - } -} - -static int __must_check write_index_save_header(struct index_save_layout *isl, - struct region_table *table, - struct buffered_writer *writer) -{ - int result; - u8 *buffer; - size_t offset = 0; - - result = uds_allocate(table->encoded_size, u8, "index save data", &buffer); - if (result != UDS_SUCCESS) - return result; - - encode_region_table(buffer, &offset, table); - encode_u64_le(buffer, &offset, isl->save_data.timestamp); - encode_u64_le(buffer, &offset, isl->save_data.nonce); - encode_u32_le(buffer, &offset, isl->save_data.version); - encode_u32_le(buffer, &offset, 0); - if (isl->zone_count > 0) { - encode_u32_le(buffer, &offset, 
INDEX_STATE_VERSION_301.signature); - encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id); - encode_u64_le(buffer, &offset, isl->state_data.newest_chapter); - encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter); - encode_u64_le(buffer, &offset, isl->state_data.last_save); - encode_u64_le(buffer, &offset, 0); - } - - result = uds_write_to_buffered_writer(writer, buffer, offset); - uds_free(buffer); - if (result != UDS_SUCCESS) - return result; - - return uds_flush_buffered_writer(writer); -} - -static int write_index_save_layout(struct index_layout *layout, - struct index_save_layout *isl) -{ - int result; - struct region_table *table; - struct buffered_writer *writer; - - result = make_index_save_region_table(isl, &table); - if (result != UDS_SUCCESS) - return result; - - result = open_region_writer(layout, &isl->header, &writer); - if (result != UDS_SUCCESS) { - uds_free(table); - return result; - } - - result = write_index_save_header(isl, table, writer); - uds_free(table); - uds_free_buffered_writer(writer); - - return result; -} - -static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks) -{ - u64 free_blocks; - u64 next_block = isl->index_save.start_block; - - isl->zone_count = 0; - memset(&isl->save_data, 0, sizeof(isl->save_data)); - - isl->header = (struct layout_region) { - .start_block = next_block++, - .block_count = 1, - .kind = RL_KIND_HEADER, - .instance = RL_SOLE_INSTANCE, - }; - - isl->index_page_map = (struct layout_region) { - .start_block = next_block, - .block_count = page_map_blocks, - .kind = RL_KIND_INDEX_PAGE_MAP, - .instance = RL_SOLE_INSTANCE, - }; - - next_block += page_map_blocks; - - free_blocks = isl->index_save.block_count - page_map_blocks - 1; - isl->free_space = (struct layout_region) { - .start_block = next_block, - .block_count = free_blocks, - .kind = RL_KIND_EMPTY, - .instance = RL_SOLE_INSTANCE, - }; -} - -static int __must_check invalidate_old_save(struct index_layout *layout, - struct index_save_layout *isl) -{ - reset_index_save_layout(isl, layout->super.page_map_blocks); - return write_index_save_layout(layout, isl); -} - -static int discard_index_state_data(struct index_layout *layout) -{ - int result; - int saved_result = UDS_SUCCESS; - unsigned int i; - - for (i = 0; i < layout->super.max_saves; i++) { - result = invalidate_old_save(layout, &layout->index.saves[i]); - if (result != UDS_SUCCESS) - saved_result = result; - } - - if (saved_result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "%s: cannot destroy all index saves", - __func__); - } - - return UDS_SUCCESS; -} - -static int __must_check make_layout_region_table(struct index_layout *layout, - struct region_table **table_ptr) -{ - int result; - unsigned int i; - /* Regions: header, config, index, volume, saves, seal */ - u16 region_count = 5 + layout->super.max_saves; - u16 payload; - struct region_table *table; - struct layout_region *lr; - - result = uds_allocate_extended(struct region_table, region_count, - struct layout_region, "layout region table", - &table); - if (result != UDS_SUCCESS) - return result; - - lr = &table->regions[0]; - *lr++ = layout->header; - *lr++ = layout->config; - *lr++ = layout->index.sub_index; - *lr++ = layout->index.volume; - - for (i = 0; i < layout->super.max_saves; i++) - *lr++ = layout->index.saves[i].index_save; - - *lr++ = layout->seal; - - if (is_converted_super_block(&layout->super)) { - payload = sizeof(struct super_block_data); - } else { - payload = (sizeof(struct 
super_block_data) - - sizeof(layout->super.volume_offset) - - sizeof(layout->super.start_offset)); - } - - table->header = (struct region_header) { - .magic = REGION_MAGIC, - .region_blocks = layout->total_blocks, - .type = RH_TYPE_SUPER, - .version = 1, - .region_count = region_count, - .payload = payload, - }; - - table->encoded_size = (sizeof(struct region_header) + payload + - region_count * sizeof(struct layout_region)); - *table_ptr = table; - return UDS_SUCCESS; -} - -static int __must_check write_layout_header(struct index_layout *layout, - struct region_table *table, - struct buffered_writer *writer) -{ - int result; - u8 *buffer; - size_t offset = 0; - - result = uds_allocate(table->encoded_size, u8, "layout data", &buffer); - if (result != UDS_SUCCESS) - return result; - - encode_region_table(buffer, &offset, table); - memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE); - offset += MAGIC_SIZE; - memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE); - offset += NONCE_INFO_SIZE; - encode_u64_le(buffer, &offset, layout->super.nonce); - encode_u32_le(buffer, &offset, layout->super.version); - encode_u32_le(buffer, &offset, layout->super.block_size); - encode_u16_le(buffer, &offset, layout->super.index_count); - encode_u16_le(buffer, &offset, layout->super.max_saves); - encode_u32_le(buffer, &offset, 0); - encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks); - encode_u64_le(buffer, &offset, layout->super.page_map_blocks); - - if (is_converted_super_block(&layout->super)) { - encode_u64_le(buffer, &offset, layout->super.volume_offset); - encode_u64_le(buffer, &offset, layout->super.start_offset); - } - - result = uds_write_to_buffered_writer(writer, buffer, offset); - uds_free(buffer); - if (result != UDS_SUCCESS) - return result; - - return uds_flush_buffered_writer(writer); -} - -static int __must_check write_uds_index_config(struct index_layout *layout, - struct uds_configuration *config, - off_t offset) -{ - int result; - struct buffered_writer *writer = NULL; - - result = open_layout_writer(layout, &layout->config, offset, &writer); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "failed to open config region"); - - result = uds_write_config_contents(writer, config, layout->super.version); - if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); - return uds_log_error_strerror(result, "failed to write config region"); - } - - result = uds_flush_buffered_writer(writer); - if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); - return uds_log_error_strerror(result, "cannot flush config writer"); - } - - uds_free_buffered_writer(writer); - return UDS_SUCCESS; -} - -static int __must_check save_layout(struct index_layout *layout, off_t offset) -{ - int result; - struct buffered_writer *writer = NULL; - struct region_table *table; - - result = make_layout_region_table(layout, &table); - if (result != UDS_SUCCESS) - return result; - - result = open_layout_writer(layout, &layout->header, offset, &writer); - if (result != UDS_SUCCESS) { - uds_free(table); - return result; - } - - result = write_layout_header(layout, table, writer); - uds_free(table); - uds_free_buffered_writer(writer); - - return result; -} - -static int create_index_layout(struct index_layout *layout, struct uds_configuration *config) -{ - int result; - struct save_layout_sizes sizes; - - result = compute_sizes(config, &sizes); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(sizes.save_count, struct 
index_save_layout, __func__, - &layout->index.saves); - if (result != UDS_SUCCESS) - return result; - - initialize_layout(layout, &sizes); - - result = discard_index_state_data(layout); - if (result != UDS_SUCCESS) - return result; - - result = write_uds_index_config(layout, config, 0); - if (result != UDS_SUCCESS) - return result; - - return save_layout(layout, 0); -} - -static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl) -{ - struct save_nonce_data { - struct index_save_data data; - u64 offset; - } nonce_data; - u8 buffer[sizeof(nonce_data)]; - size_t offset = 0; - - encode_u64_le(buffer, &offset, isl->save_data.timestamp); - encode_u64_le(buffer, &offset, 0); - encode_u32_le(buffer, &offset, isl->save_data.version); - encode_u32_le(buffer, &offset, 0U); - encode_u64_le(buffer, &offset, isl->index_save.start_block); - ASSERT_LOG_ONLY(offset == sizeof(nonce_data), - "%zu bytes encoded of %zu expected", offset, sizeof(nonce_data)); - return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer)); -} - -static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce) -{ - if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0)) - return 0; - - if (isl->save_data.nonce != generate_index_save_nonce(volume_nonce, isl)) - return 0; - - return isl->save_data.timestamp; -} - -static int find_latest_uds_index_save_slot(struct index_layout *layout, - struct index_save_layout **isl_ptr) -{ - struct index_save_layout *latest = NULL; - struct index_save_layout *isl; - unsigned int i; - u64 save_time = 0; - u64 latest_time = 0; - - for (i = 0; i < layout->super.max_saves; i++) { - isl = &layout->index.saves[i]; - save_time = validate_index_save_layout(isl, layout->index.nonce); - if (save_time > latest_time) { - latest = isl; - latest_time = save_time; - } - } - - if (latest == NULL) { - uds_log_error("No valid index save found"); - return UDS_INDEX_NOT_SAVED_CLEANLY; - } - - *isl_ptr = latest; - return UDS_SUCCESS; -} - -int uds_discard_open_chapter(struct index_layout *layout) -{ - int result; - struct index_save_layout *isl; - struct buffered_writer *writer; - - result = find_latest_uds_index_save_slot(layout, &isl); - if (result != UDS_SUCCESS) - return result; - - result = open_region_writer(layout, &isl->open_chapter, &writer); - if (result != UDS_SUCCESS) - return result; - - result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - uds_free_buffered_writer(writer); - return result; - } - - result = uds_flush_buffered_writer(writer); - uds_free_buffered_writer(writer); - return result; -} - -int uds_load_index_state(struct index_layout *layout, struct uds_index *index) -{ - int result; - unsigned int zone; - struct index_save_layout *isl; - struct buffered_reader *readers[MAX_ZONES]; - - result = find_latest_uds_index_save_slot(layout, &isl); - if (result != UDS_SUCCESS) - return result; - - index->newest_virtual_chapter = isl->state_data.newest_chapter; - index->oldest_virtual_chapter = isl->state_data.oldest_chapter; - index->last_save = isl->state_data.last_save; - - result = open_region_reader(layout, &isl->open_chapter, &readers[0]); - if (result != UDS_SUCCESS) - return result; - - result = uds_load_open_chapter(index, readers[0]); - uds_free_buffered_reader(readers[0]); - if (result != UDS_SUCCESS) - return result; - - for (zone = 0; zone < isl->zone_count; zone++) { - result = open_region_reader(layout, &isl->volume_index_zones[zone], - &readers[zone]); - if (result != 
UDS_SUCCESS) { - for (; zone > 0; zone--) - uds_free_buffered_reader(readers[zone - 1]); - - return result; - } - } - - result = uds_load_volume_index(index->volume_index, readers, isl->zone_count); - for (zone = 0; zone < isl->zone_count; zone++) - uds_free_buffered_reader(readers[zone]); - if (result != UDS_SUCCESS) - return result; - - result = open_region_reader(layout, &isl->index_page_map, &readers[0]); - if (result != UDS_SUCCESS) - return result; - - result = uds_read_index_page_map(index->volume->index_page_map, readers[0]); - uds_free_buffered_reader(readers[0]); - - return result; -} - -static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout) -{ - struct index_save_layout *oldest = NULL; - struct index_save_layout *isl; - unsigned int i; - u64 save_time = 0; - u64 oldest_time = 0; - - for (i = 0; i < layout->super.max_saves; i++) { - isl = &layout->index.saves[i]; - save_time = validate_index_save_layout(isl, layout->index.nonce); - if (oldest == NULL || save_time < oldest_time) { - oldest = isl; - oldest_time = save_time; - } - } - - return oldest; -} - -static void instantiate_index_save_layout(struct index_save_layout *isl, - struct super_block_data *super, - u64 volume_nonce, unsigned int zone_count) -{ - unsigned int z; - u64 next_block; - u64 free_blocks; - u64 volume_index_blocks; - - isl->zone_count = zone_count; - memset(&isl->save_data, 0, sizeof(isl->save_data)); - isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME)); - isl->save_data.version = 1; - isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl); - - next_block = isl->index_save.start_block; - isl->header = (struct layout_region) { - .start_block = next_block++, - .block_count = 1, - .kind = RL_KIND_HEADER, - .instance = RL_SOLE_INSTANCE, - }; - - isl->index_page_map = (struct layout_region) { - .start_block = next_block, - .block_count = super->page_map_blocks, - .kind = RL_KIND_INDEX_PAGE_MAP, - .instance = RL_SOLE_INSTANCE, - }; - next_block += super->page_map_blocks; - - free_blocks = (isl->index_save.block_count - 1 - - super->page_map_blocks - - super->open_chapter_blocks); - volume_index_blocks = free_blocks / isl->zone_count; - for (z = 0; z < isl->zone_count; z++) { - isl->volume_index_zones[z] = (struct layout_region) { - .start_block = next_block, - .block_count = volume_index_blocks, - .kind = RL_KIND_VOLUME_INDEX, - .instance = z, - }; - - next_block += volume_index_blocks; - free_blocks -= volume_index_blocks; - } - - isl->open_chapter = (struct layout_region) { - .start_block = next_block, - .block_count = super->open_chapter_blocks, - .kind = RL_KIND_OPEN_CHAPTER, - .instance = RL_SOLE_INSTANCE, - }; - - next_block += super->open_chapter_blocks; - - isl->free_space = (struct layout_region) { - .start_block = next_block, - .block_count = free_blocks, - .kind = RL_KIND_EMPTY, - .instance = RL_SOLE_INSTANCE, - }; -} - -static int setup_uds_index_save_slot(struct index_layout *layout, - unsigned int zone_count, - struct index_save_layout **isl_ptr) -{ - int result; - struct index_save_layout *isl; - - isl = select_oldest_index_save_layout(layout); - result = invalidate_old_save(layout, isl); - if (result != UDS_SUCCESS) - return result; - - instantiate_index_save_layout(isl, &layout->super, layout->index.nonce, - zone_count); - - *isl_ptr = isl; - return UDS_SUCCESS; -} - -static void cancel_uds_index_save(struct index_save_layout *isl) -{ - memset(&isl->save_data, 0, sizeof(isl->save_data)); - memset(&isl->state_data, 0, 
sizeof(isl->state_data)); - isl->zone_count = 0; -} - -int uds_save_index_state(struct index_layout *layout, struct uds_index *index) -{ - int result; - unsigned int zone; - struct index_save_layout *isl; - struct buffered_writer *writers[MAX_ZONES]; - - result = setup_uds_index_save_slot(layout, index->zone_count, &isl); - if (result != UDS_SUCCESS) - return result; - - isl->state_data = (struct index_state_data301) { - .newest_chapter = index->newest_virtual_chapter, - .oldest_chapter = index->oldest_virtual_chapter, - .last_save = index->last_save, - }; - - result = open_region_writer(layout, &isl->open_chapter, &writers[0]); - if (result != UDS_SUCCESS) { - cancel_uds_index_save(isl); - return result; - } - - result = uds_save_open_chapter(index, writers[0]); - uds_free_buffered_writer(writers[0]); - if (result != UDS_SUCCESS) { - cancel_uds_index_save(isl); - return result; - } - - for (zone = 0; zone < index->zone_count; zone++) { - result = open_region_writer(layout, &isl->volume_index_zones[zone], - &writers[zone]); - if (result != UDS_SUCCESS) { - for (; zone > 0; zone--) - uds_free_buffered_writer(writers[zone - 1]); - - cancel_uds_index_save(isl); - return result; - } - } - - result = uds_save_volume_index(index->volume_index, writers, index->zone_count); - for (zone = 0; zone < index->zone_count; zone++) - uds_free_buffered_writer(writers[zone]); - if (result != UDS_SUCCESS) { - cancel_uds_index_save(isl); - return result; - } - - result = open_region_writer(layout, &isl->index_page_map, &writers[0]); - if (result != UDS_SUCCESS) { - cancel_uds_index_save(isl); - return result; - } - - result = uds_write_index_page_map(index->volume->index_page_map, writers[0]); - uds_free_buffered_writer(writers[0]); - if (result != UDS_SUCCESS) { - cancel_uds_index_save(isl); - return result; - } - - return write_index_save_layout(layout, isl); -} - -static int __must_check load_region_table(struct buffered_reader *reader, - struct region_table **table_ptr) -{ - int result; - unsigned int i; - struct region_header header; - struct region_table *table; - u8 buffer[sizeof(struct region_header)]; - size_t offset = 0; - - result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read region table header"); - - decode_u64_le(buffer, &offset, &header.magic); - decode_u64_le(buffer, &offset, &header.region_blocks); - decode_u16_le(buffer, &offset, &header.type); - decode_u16_le(buffer, &offset, &header.version); - decode_u16_le(buffer, &offset, &header.region_count); - decode_u16_le(buffer, &offset, &header.payload); - - if (header.magic != REGION_MAGIC) - return UDS_NO_INDEX; - - if (header.version != 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown region table version %hu", - header.version); - } - - result = uds_allocate_extended(struct region_table, header.region_count, - struct layout_region, - "single file layout region table", &table); - if (result != UDS_SUCCESS) - return result; - - table->header = header; - for (i = 0; i < header.region_count; i++) { - u8 region_buffer[sizeof(struct layout_region)]; - - offset = 0; - result = uds_read_from_buffered_reader(reader, region_buffer, - sizeof(region_buffer)); - if (result != UDS_SUCCESS) { - uds_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "cannot read region table layouts"); - } - - decode_u64_le(region_buffer, &offset, &table->regions[i].start_block); - decode_u64_le(region_buffer, &offset, 
&table->regions[i].block_count); - offset += sizeof(u32); - decode_u16_le(region_buffer, &offset, &table->regions[i].kind); - decode_u16_le(region_buffer, &offset, &table->regions[i].instance); - } - - *table_ptr = table; - return UDS_SUCCESS; -} - -static int __must_check read_super_block_data(struct buffered_reader *reader, - struct index_layout *layout, - size_t saved_size) -{ - int result; - struct super_block_data *super = &layout->super; - u8 *buffer; - size_t offset = 0; - - result = uds_allocate(saved_size, u8, "super block data", &buffer); - if (result != UDS_SUCCESS) - return result; - - result = uds_read_from_buffered_reader(reader, buffer, saved_size); - if (result != UDS_SUCCESS) { - uds_free(buffer); - return uds_log_error_strerror(result, "cannot read region table header"); - } - - memcpy(&super->magic_label, buffer, MAGIC_SIZE); - offset += MAGIC_SIZE; - memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE); - offset += NONCE_INFO_SIZE; - decode_u64_le(buffer, &offset, &super->nonce); - decode_u32_le(buffer, &offset, &super->version); - decode_u32_le(buffer, &offset, &super->block_size); - decode_u16_le(buffer, &offset, &super->index_count); - decode_u16_le(buffer, &offset, &super->max_saves); - offset += sizeof(u32); - decode_u64_le(buffer, &offset, &super->open_chapter_blocks); - decode_u64_le(buffer, &offset, &super->page_map_blocks); - - if (is_converted_super_block(super)) { - decode_u64_le(buffer, &offset, &super->volume_offset); - decode_u64_le(buffer, &offset, &super->start_offset); - } else { - super->volume_offset = 0; - super->start_offset = 0; - } - - uds_free(buffer); - - if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "unknown superblock magic label"); - - if ((super->version < SUPER_VERSION_MINIMUM) || - (super->version == 4) || (super->version == 5) || (super->version == 6) || - (super->version > SUPER_VERSION_MAXIMUM)) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown superblock version number %u", - super->version); - } - - if (super->volume_offset < super->start_offset) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "inconsistent offsets (start %llu, volume %llu)", - (unsigned long long) super->start_offset, - (unsigned long long) super->volume_offset); - } - - /* Sub-indexes are no longer used but the layout retains this field. 
*/ - if (super->index_count != 1) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "invalid subindex count %u", - super->index_count); - } - - if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "inconsistent superblock nonce"); - } - - return UDS_SUCCESS; -} - -static int __must_check verify_region(struct layout_region *lr, u64 start_block, - enum region_kind kind, unsigned int instance) -{ - if (lr->start_block != start_block) - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "incorrect layout region offset"); - - if (lr->kind != kind) - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "incorrect layout region kind"); - - if (lr->instance != instance) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "incorrect layout region instance"); - } - - return UDS_SUCCESS; -} - -static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block, - struct region_table *table) -{ - int result; - unsigned int i; - struct sub_index_layout *sil = &layout->index; - u64 next_block = start_block; - - sil->sub_index = table->regions[2]; - result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0); - if (result != UDS_SUCCESS) - return result; - - define_sub_index_nonce(layout); - - sil->volume = table->regions[3]; - result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - next_block += sil->volume.block_count + layout->super.volume_offset; - - for (i = 0; i < layout->super.max_saves; i++) { - sil->saves[i].index_save = table->regions[i + 4]; - result = verify_region(&sil->saves[i].index_save, next_block, - RL_KIND_SAVE, i); - if (result != UDS_SUCCESS) - return result; - - next_block += sil->saves[i].index_save.block_count; - } - - next_block -= layout->super.volume_offset; - if (next_block != start_block + sil->sub_index.block_count) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "sub index region does not span all saves"); - } - - return UDS_SUCCESS; -} - -static int __must_check reconstitute_layout(struct index_layout *layout, - struct region_table *table, u64 first_block) -{ - int result; - u64 next_block = first_block; - - result = uds_allocate(layout->super.max_saves, struct index_save_layout, - __func__, &layout->index.saves); - if (result != UDS_SUCCESS) - return result; - - layout->total_blocks = table->header.region_blocks; - - layout->header = table->regions[0]; - result = verify_region(&layout->header, next_block++, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - layout->config = table->regions[1]; - result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - result = verify_sub_index(layout, next_block, table); - if (result != UDS_SUCCESS) - return result; - - next_block += layout->index.sub_index.block_count; - - layout->seal = table->regions[table->header.region_count - 1]; - result = verify_region(&layout->seal, next_block + layout->super.volume_offset, - RL_KIND_SEAL, RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - if (++next_block != (first_block + layout->total_blocks)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "layout table does not span total blocks"); - } - - return UDS_SUCCESS; -} - -static int __must_check load_super_block(struct index_layout *layout, size_t block_size, - u64 first_block, struct buffered_reader 
*reader) -{ - int result; - struct region_table *table = NULL; - struct super_block_data *super = &layout->super; - - result = load_region_table(reader, &table); - if (result != UDS_SUCCESS) - return result; - - if (table->header.type != RH_TYPE_SUPER) { - uds_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "not a superblock region table"); - } - - result = read_super_block_data(reader, layout, table->header.payload); - if (result != UDS_SUCCESS) { - uds_free(table); - return uds_log_error_strerror(result, "unknown superblock format"); - } - - if (super->block_size != block_size) { - uds_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "superblock saved block_size %u differs from supplied block_size %zu", - super->block_size, block_size); - } - - first_block -= (super->volume_offset - super->start_offset); - result = reconstitute_layout(layout, table, first_block); - uds_free(table); - return result; -} - -static int __must_check read_index_save_data(struct buffered_reader *reader, - struct index_save_layout *isl, - size_t saved_size) -{ - int result; - struct index_state_version file_version; - u8 buffer[sizeof(struct index_save_data) + sizeof(struct index_state_data301)]; - size_t offset = 0; - - if (saved_size != sizeof(buffer)) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "unexpected index save data size %zu", - saved_size); - } - - result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "cannot read index save data"); - - decode_u64_le(buffer, &offset, &isl->save_data.timestamp); - decode_u64_le(buffer, &offset, &isl->save_data.nonce); - decode_u32_le(buffer, &offset, &isl->save_data.version); - offset += sizeof(u32); - - if (isl->save_data.version > 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown index save version number %u", - isl->save_data.version); - } - - decode_s32_le(buffer, &offset, &file_version.signature); - decode_s32_le(buffer, &offset, &file_version.version_id); - - if ((file_version.signature != INDEX_STATE_VERSION_301.signature) || - (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "index state version %d,%d is unsupported", - file_version.signature, - file_version.version_id); - } - - decode_u64_le(buffer, &offset, &isl->state_data.newest_chapter); - decode_u64_le(buffer, &offset, &isl->state_data.oldest_chapter); - decode_u64_le(buffer, &offset, &isl->state_data.last_save); - /* Skip past some historical fields that are now unused */ - offset += sizeof(u32) + sizeof(u32); - return UDS_SUCCESS; -} - -static int __must_check reconstruct_index_save(struct index_save_layout *isl, - struct region_table *table) -{ - int result; - unsigned int z; - struct layout_region *last_region; - u64 next_block = isl->index_save.start_block; - u64 last_block = next_block + isl->index_save.block_count; - - isl->zone_count = table->header.region_count - 3; - - last_region = &table->regions[table->header.region_count - 1]; - if (last_region->kind == RL_KIND_EMPTY) { - isl->free_space = *last_region; - isl->zone_count--; - } else { - isl->free_space = (struct layout_region) { - .start_block = last_block, - .block_count = 0, - .kind = RL_KIND_EMPTY, - .instance = RL_SOLE_INSTANCE, - }; - } - - isl->header = table->regions[0]; - result = verify_region(&isl->header, next_block++, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - 
isl->index_page_map = table->regions[1]; - result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - next_block += isl->index_page_map.block_count; - - for (z = 0; z < isl->zone_count; z++) { - isl->volume_index_zones[z] = table->regions[z + 2]; - result = verify_region(&isl->volume_index_zones[z], next_block, - RL_KIND_VOLUME_INDEX, z); - if (result != UDS_SUCCESS) - return result; - - next_block += isl->volume_index_zones[z].block_count; - } - - isl->open_chapter = table->regions[isl->zone_count + 2]; - result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - next_block += isl->open_chapter.block_count; - - result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY, - RL_SOLE_INSTANCE); - if (result != UDS_SUCCESS) - return result; - - next_block += isl->free_space.block_count; - if (next_block != last_block) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "index save layout table incomplete"); - } - - return UDS_SUCCESS; -} - -static int __must_check load_index_save(struct index_save_layout *isl, - struct buffered_reader *reader, - unsigned int instance) -{ - int result; - struct region_table *table = NULL; - - result = load_region_table(reader, &table); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, "cannot read index save %u header", - instance); - } - - if (table->header.region_blocks != isl->index_save.block_count) { - u64 region_blocks = table->header.region_blocks; - - uds_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "unexpected index save %u region block count %llu", - instance, - (unsigned long long) region_blocks); - } - - if (table->header.type == RH_TYPE_UNSAVED) { - uds_free(table); - reset_index_save_layout(isl, 0); - return UDS_SUCCESS; - } - - - if (table->header.type != RH_TYPE_SAVE) { - uds_free(table); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "unexpected index save %u header type %u", - instance, table->header.type); - } - - result = read_index_save_data(reader, isl, table->header.payload); - if (result != UDS_SUCCESS) { - uds_free(table); - return uds_log_error_strerror(result, - "unknown index save %u data format", - instance); - } - - result = reconstruct_index_save(isl, table); - uds_free(table); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, "cannot reconstruct index save %u", - instance); - } - - return UDS_SUCCESS; -} - -static int __must_check load_sub_index_regions(struct index_layout *layout) -{ - int result; - unsigned int j; - struct index_save_layout *isl; - struct buffered_reader *reader; - - for (j = 0; j < layout->super.max_saves; j++) { - isl = &layout->index.saves[j]; - result = open_region_reader(layout, &isl->index_save, &reader); - - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "cannot get reader for index 0 save %u", - j); - return result; - } - - result = load_index_save(isl, reader, j); - uds_free_buffered_reader(reader); - if (result != UDS_SUCCESS) { - /* Another save slot might be valid. 
*/ - reset_index_save_layout(isl, 0); - continue; - } - } - - return UDS_SUCCESS; -} - -static int __must_check verify_uds_index_config(struct index_layout *layout, - struct uds_configuration *config) -{ - int result; - struct buffered_reader *reader = NULL; - u64 offset; - - offset = layout->super.volume_offset - layout->super.start_offset; - result = open_layout_reader(layout, &layout->config, offset, &reader); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "failed to open config reader"); - - result = uds_validate_config_contents(reader, config); - if (result != UDS_SUCCESS) { - uds_free_buffered_reader(reader); - return uds_log_error_strerror(result, "failed to read config region"); - } - - uds_free_buffered_reader(reader); - return UDS_SUCCESS; -} - -static int load_index_layout(struct index_layout *layout, struct uds_configuration *config) -{ - int result; - struct buffered_reader *reader; - - result = uds_make_buffered_reader(layout->factory, - layout->offset / UDS_BLOCK_SIZE, 1, &reader); - if (result != UDS_SUCCESS) - return uds_log_error_strerror(result, "unable to read superblock"); - - result = load_super_block(layout, UDS_BLOCK_SIZE, - layout->offset / UDS_BLOCK_SIZE, reader); - uds_free_buffered_reader(reader); - if (result != UDS_SUCCESS) - return result; - - result = verify_uds_index_config(layout, config); - if (result != UDS_SUCCESS) - return result; - - return load_sub_index_regions(layout); -} - -static int create_layout_factory(struct index_layout *layout, - const struct uds_configuration *config) -{ - int result; - size_t writable_size; - struct io_factory *factory = NULL; - - result = uds_make_io_factory(config->bdev, &factory); - if (result != UDS_SUCCESS) - return result; - - writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE; - if (writable_size < config->size + config->offset) { - uds_put_io_factory(factory); - uds_log_error("index storage (%zu) is smaller than the requested size %zu", - writable_size, config->size + config->offset); - return -ENOSPC; - } - - layout->factory = factory; - layout->factory_size = (config->size > 0) ? 
config->size : writable_size;
-	layout->offset = config->offset;
-	return UDS_SUCCESS;
-}
-
-int uds_make_index_layout(struct uds_configuration *config, bool new_layout,
-			  struct index_layout **layout_ptr)
-{
-	int result;
-	struct index_layout *layout = NULL;
-	struct save_layout_sizes sizes;
-
-	result = compute_sizes(config, &sizes);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	result = uds_allocate(1, struct index_layout, __func__, &layout);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	result = create_layout_factory(layout, config);
-	if (result != UDS_SUCCESS) {
-		uds_free_index_layout(layout);
-		return result;
-	}
-
-	if (layout->factory_size < sizes.total_size) {
-		uds_log_error("index storage (%zu) is smaller than the required size %llu",
-			      layout->factory_size,
-			      (unsigned long long) sizes.total_size);
-		uds_free_index_layout(layout);
-		return -ENOSPC;
-	}
-
-	if (new_layout)
-		result = create_index_layout(layout, config);
-	else
-		result = load_index_layout(layout, config);
-	if (result != UDS_SUCCESS) {
-		uds_free_index_layout(layout);
-		return result;
-	}
-
-	*layout_ptr = layout;
-	return UDS_SUCCESS;
-}
-
-void uds_free_index_layout(struct index_layout *layout)
-{
-	if (layout == NULL)
-		return;
-
-	uds_free(layout->index.saves);
-	if (layout->factory != NULL)
-		uds_put_io_factory(layout->factory);
-
-	uds_free(layout);
-}
-
-int uds_replace_index_layout_storage(struct index_layout *layout,
-				     struct block_device *bdev)
-{
-	return uds_replace_storage(layout->factory, bdev);
-}
-
-/* Obtain a dm_bufio_client for the volume region. */
-int uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
-			  unsigned int reserved_buffers,
-			  struct dm_bufio_client **client_ptr)
-{
-	off_t offset = (layout->index.volume.start_block +
-			layout->super.volume_offset -
-			layout->super.start_offset);
-
-	return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers,
-			      client_ptr);
-}
-
-u64 uds_get_volume_nonce(struct index_layout *layout)
-{
-	return layout->index.nonce;
-}
diff --git a/drivers/md/dm-vdo/index-layout.h b/drivers/md/dm-vdo/index-layout.h
deleted file mode 100644
index e9ac6f4302d63..0000000000000
--- a/drivers/md/dm-vdo/index-layout.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_LAYOUT_H
-#define UDS_INDEX_LAYOUT_H
-
-#include "config.h"
-#include "indexer.h"
-#include "io-factory.h"
-
-/*
- * The index layout describes the format of the index on the underlying storage, and is responsible
- * for creating those structures when the index is first created. It also validates the index data
- * when loading a saved index, and updates it when saving the index.
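- *
- * For illustration, a minimal usage sketch (assuming a populated struct
- * uds_configuration *config; error handling elided):
- *
- *	struct index_layout *layout = NULL;
- *	int result = uds_make_index_layout(config, true, &layout);
- *
- * Passing true for new_layout formats storage for a new index; passing false
- * loads and validates an existing one. The layout is released with
- * uds_free_index_layout().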
- */ - -struct index_layout; - -int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout, - struct index_layout **layout_ptr); - -void uds_free_index_layout(struct index_layout *layout); - -int __must_check uds_replace_index_layout_storage(struct index_layout *layout, - struct block_device *bdev); - -int __must_check uds_load_index_state(struct index_layout *layout, - struct uds_index *index); - -int __must_check uds_save_index_state(struct index_layout *layout, - struct uds_index *index); - -int __must_check uds_discard_open_chapter(struct index_layout *layout); - -u64 __must_check uds_get_volume_nonce(struct index_layout *layout); - -int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size, - unsigned int reserved_buffers, - struct dm_bufio_client **client_ptr); - -#endif /* UDS_INDEX_LAYOUT_H */ diff --git a/drivers/md/dm-vdo/index-page-map.c b/drivers/md/dm-vdo/index-page-map.c deleted file mode 100644 index 1bb12066ad1a2..0000000000000 --- a/drivers/md/dm-vdo/index-page-map.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "index-page-map.h" - -#include "errors.h" -#include "hash-utils.h" -#include "indexer.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" -#include "string-utils.h" -#include "thread-utils.h" - -/* - * The index page map is conceptually a two-dimensional array indexed by chapter number and index - * page number within the chapter. Each entry contains the number of the last delta list on that - * index page. In order to save memory, the information for the last page in each chapter is not - * recorded, as it is known from the geometry. - */ - -static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02"; - -enum { - PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1, -}; - -static inline u32 get_entry_count(const struct index_geometry *geometry) -{ - return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1); -} - -int uds_make_index_page_map(const struct index_geometry *geometry, - struct index_page_map **map_ptr) -{ - int result; - struct index_page_map *map; - - result = uds_allocate(1, struct index_page_map, "page map", &map); - if (result != UDS_SUCCESS) - return result; - - map->geometry = geometry; - map->entries_per_chapter = geometry->index_pages_per_chapter - 1; - result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", - &map->entries); - if (result != UDS_SUCCESS) { - uds_free_index_page_map(map); - return result; - } - - *map_ptr = map; - return UDS_SUCCESS; -} - -void uds_free_index_page_map(struct index_page_map *map) -{ - if (map != NULL) { - uds_free(map->entries); - uds_free(map); - } -} - -void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, - u32 chapter_number, u32 index_page_number, - u32 delta_list_number) -{ - size_t slot; - - map->last_update = virtual_chapter_number; - if (index_page_number == map->entries_per_chapter) - return; - - slot = (chapter_number * map->entries_per_chapter) + index_page_number; - map->entries[slot] = delta_list_number; -} - -u32 uds_find_index_page_number(const struct index_page_map *map, - const struct uds_record_name *name, u32 chapter_number) -{ - u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry); - u32 slot = chapter_number * map->entries_per_chapter; - u32 page; - - for (page = 0; page < map->entries_per_chapter; page++) { - if (delta_list_number 
<= map->entries[slot + page])
-			break;
-	}
-
-	return page;
-}
-
-void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
-				u32 index_page_number, u32 *lowest_list,
-				u32 *highest_list)
-{
-	u32 slot = chapter_number * map->entries_per_chapter;
-
-	*lowest_list = ((index_page_number == 0) ?
-			0 : map->entries[slot + index_page_number - 1] + 1);
-	*highest_list = ((index_page_number < map->entries_per_chapter) ?
-			 map->entries[slot + index_page_number] :
-			 map->geometry->delta_lists_per_chapter - 1);
-}
-
-u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry)
-{
-	return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry);
-}
-
-int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer)
-{
-	int result;
-	u8 *buffer;
-	size_t offset = 0;
-	u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
-	u32 i;
-
-	result = uds_allocate(saved_size, u8, "page map data", &buffer);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH);
-	offset += PAGE_MAP_MAGIC_LENGTH;
-	encode_u64_le(buffer, &offset, map->last_update);
-	for (i = 0; i < get_entry_count(map->geometry); i++)
-		encode_u16_le(buffer, &offset, map->entries[i]);
-
-	result = uds_write_to_buffered_writer(writer, buffer, offset);
-	uds_free(buffer);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	return uds_flush_buffered_writer(writer);
-}
-
-int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader)
-{
-	int result;
-	u8 magic[PAGE_MAP_MAGIC_LENGTH];
-	u8 *buffer;
-	size_t offset = 0;
-	u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
-	u32 i;
-
-	result = uds_allocate(saved_size, u8, "page map data", &buffer);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	result = uds_read_from_buffered_reader(reader, buffer, saved_size);
-	if (result != UDS_SUCCESS) {
-		uds_free(buffer);
-		return result;
-	}
-
-	memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH);
-	offset += PAGE_MAP_MAGIC_LENGTH;
-	if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) {
-		uds_free(buffer);
-		return UDS_CORRUPT_DATA;
-	}
-
-	decode_u64_le(buffer, &offset, &map->last_update);
-	for (i = 0; i < get_entry_count(map->geometry); i++)
-		decode_u16_le(buffer, &offset, &map->entries[i]);
-
-	uds_free(buffer);
-	uds_log_debug("read index page map, last update %llu",
-		      (unsigned long long) map->last_update);
-	return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/index-page-map.h b/drivers/md/dm-vdo/index-page-map.h
deleted file mode 100644
index b327c0bb96562..0000000000000
--- a/drivers/md/dm-vdo/index-page-map.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_PAGE_MAP_H
-#define UDS_INDEX_PAGE_MAP_H
-
-#include "geometry.h"
-#include "io-factory.h"
-
-/*
- * The index maintains a page map which records how the chapter delta lists are distributed among
- * the index pages for each chapter, allowing the volume to be efficient about reading only pages
- * that it knows it will need.
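- *
- * For example, a minimal lookup sketch (map, name, and chapter_number are assumed
- * to be in hand):
- *
- *	u32 page = uds_find_index_page_number(map, name, chapter_number);
- *
- * uds_get_list_number_bounds() then reports the range of delta lists covered by
- * that index page.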
- */ - -struct index_page_map { - const struct index_geometry *geometry; - u64 last_update; - u32 entries_per_chapter; - u16 *entries; -}; - -int __must_check uds_make_index_page_map(const struct index_geometry *geometry, - struct index_page_map **map_ptr); - -void uds_free_index_page_map(struct index_page_map *map); - -int __must_check uds_read_index_page_map(struct index_page_map *map, - struct buffered_reader *reader); - -int __must_check uds_write_index_page_map(struct index_page_map *map, - struct buffered_writer *writer); - -void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, - u32 chapter_number, u32 index_page_number, - u32 delta_list_number); - -u32 __must_check uds_find_index_page_number(const struct index_page_map *map, - const struct uds_record_name *name, - u32 chapter_number); - -void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number, - u32 index_page_number, u32 *lowest_list, - u32 *highest_list); - -u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry); - -#endif /* UDS_INDEX_PAGE_MAP_H */ diff --git a/drivers/md/dm-vdo/index-session.c b/drivers/md/dm-vdo/index-session.c deleted file mode 100644 index a482ccd3981ef..0000000000000 --- a/drivers/md/dm-vdo/index-session.c +++ /dev/null @@ -1,738 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "index-session.h" - -#include - -#include "funnel-requestqueue.h" -#include "index.h" -#include "index-layout.h" -#include "logger.h" -#include "memory-alloc.h" -#include "time-utils.h" - -/* - * The index session contains a lock (the request_mutex) which ensures that only one thread can - * change the state of its index at a time. The state field indicates the current state of the - * index through a set of descriptive flags. The request_mutex must be notified whenever a - * non-transient state flag is cleared. The request_mutex is also used to count the number of - * requests currently in progress so that they can be drained when suspending or closing the index. - * - * If the index session is suspended shortly after opening an index, it may have to suspend during - * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time, - * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When - * the index session is resumed, the rebuild can continue from where it left off. If the index - * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild - * will start from the beginning the next time the index is loaded. The mutex and status fields in - * the index_load_context are used to record the state of any interrupted rebuild. - */ - -enum index_session_flag_bit { - IS_FLAG_BIT_START = 8, - /* The session has started loading an index but not completed it. */ - IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, - /* The session has loaded an index, which can handle requests. */ - IS_FLAG_BIT_LOADED, - /* The session's index has been permanently disabled. */ - IS_FLAG_BIT_DISABLED, - /* The session's index is suspended. */ - IS_FLAG_BIT_SUSPENDED, - /* The session is handling some index state change. */ - IS_FLAG_BIT_WAITING, - /* The session's index is closing and draining requests. */ - IS_FLAG_BIT_CLOSING, - /* The session is being destroyed and is draining requests. 
*/ - IS_FLAG_BIT_DESTROYING, -}; - -enum index_session_flag { - IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), - IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), - IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), - IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), - IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), - IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), - IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), -}; - -/* Release a reference to an index session. */ -static void release_index_session(struct uds_index_session *index_session) -{ - mutex_lock(&index_session->request_mutex); - if (--index_session->request_count == 0) - uds_broadcast_cond(&index_session->request_cond); - mutex_unlock(&index_session->request_mutex); -} - -/* - * Acquire a reference to the index session for an asynchronous index request. The reference must - * eventually be released with a corresponding call to release_index_session(). - */ -static int get_index_session(struct uds_index_session *index_session) -{ - unsigned int state; - int result = UDS_SUCCESS; - - mutex_lock(&index_session->request_mutex); - index_session->request_count++; - state = index_session->state; - mutex_unlock(&index_session->request_mutex); - - if (state == IS_FLAG_LOADED) { - return UDS_SUCCESS; - } else if (state & IS_FLAG_DISABLED) { - result = UDS_DISABLED; - } else if ((state & IS_FLAG_LOADING) || - (state & IS_FLAG_SUSPENDED) || - (state & IS_FLAG_WAITING)) { - result = -EBUSY; - } else { - result = UDS_NO_INDEX; - } - - release_index_session(index_session); - return result; -} - -int uds_launch_request(struct uds_request *request) -{ - size_t internal_size; - int result; - - if (request->callback == NULL) { - uds_log_error("missing required callback"); - return -EINVAL; - } - - switch (request->type) { - case UDS_DELETE: - case UDS_POST: - case UDS_QUERY: - case UDS_QUERY_NO_UPDATE: - case UDS_UPDATE: - break; - default: - uds_log_error("received invalid callback type"); - return -EINVAL; - } - - /* Reset all internal fields before processing. 
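- * Everything from the zone_number field onward is zeroed by the memset below.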
*/ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); - - result = get_index_session(request->session); - if (result != UDS_SUCCESS) - return result; - - request->found = false; - request->unbatched = false; - request->index = request->session->index; - - uds_enqueue_request(request, STAGE_TRIAGE); - return UDS_SUCCESS; -} - -static void enter_callback_stage(struct uds_request *request) -{ - if (request->status != UDS_SUCCESS) { - /* All request errors are considered unrecoverable */ - mutex_lock(&request->session->request_mutex); - request->session->state |= IS_FLAG_DISABLED; - mutex_unlock(&request->session->request_mutex); - } - - uds_request_queue_enqueue(request->session->callback_queue, request); -} - -static inline void count_once(u64 *count_ptr) -{ - WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1); -} - -static void update_session_stats(struct uds_request *request) -{ - struct session_stats *session_stats = &request->session->stats; - - count_once(&session_stats->requests); - - switch (request->type) { - case UDS_POST: - if (request->found) - count_once(&session_stats->posts_found); - else - count_once(&session_stats->posts_not_found); - - if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) - count_once(&session_stats->posts_found_open_chapter); - else if (request->location == UDS_LOCATION_IN_DENSE) - count_once(&session_stats->posts_found_dense); - else if (request->location == UDS_LOCATION_IN_SPARSE) - count_once(&session_stats->posts_found_sparse); - break; - - case UDS_UPDATE: - if (request->found) - count_once(&session_stats->updates_found); - else - count_once(&session_stats->updates_not_found); - break; - - case UDS_DELETE: - if (request->found) - count_once(&session_stats->deletions_found); - else - count_once(&session_stats->deletions_not_found); - break; - - case UDS_QUERY: - case UDS_QUERY_NO_UPDATE: - if (request->found) - count_once(&session_stats->queries_found); - else - count_once(&session_stats->queries_not_found); - break; - - default: - request->status = ASSERT(false, "unknown request type: %d", - request->type); - } -} - -static void handle_callbacks(struct uds_request *request) -{ - struct uds_index_session *index_session = request->session; - - if (request->status == UDS_SUCCESS) - update_session_stats(request); - - request->status = uds_status_to_errno(request->status); - request->callback(request); - release_index_session(index_session); -} - -static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr) -{ - int result; - struct uds_index_session *session; - - result = uds_allocate(1, struct uds_index_session, __func__, &session); - if (result != UDS_SUCCESS) - return result; - - mutex_init(&session->request_mutex); - uds_init_cond(&session->request_cond); - mutex_init(&session->load_context.mutex); - uds_init_cond(&session->load_context.cond); - - result = uds_make_request_queue("callbackW", &handle_callbacks, - &session->callback_queue); - if (result != UDS_SUCCESS) { - uds_free(session); - return result; - } - - *index_session_ptr = session; - return UDS_SUCCESS; -} - -int uds_create_index_session(struct uds_index_session **session) -{ - if (session == NULL) { - uds_log_error("missing session pointer"); - return -EINVAL; - } - - return uds_status_to_errno(make_empty_index_session(session)); -} - -static int __must_check 
start_loading_index_session(struct uds_index_session *index_session)
-{
-	int result;
-
-	mutex_lock(&index_session->request_mutex);
-	if (index_session->state & IS_FLAG_SUSPENDED) {
-		uds_log_info("Index session is suspended");
-		result = -EBUSY;
-	} else if (index_session->state != 0) {
-		uds_log_info("Index is already loaded");
-		result = -EBUSY;
-	} else {
-		index_session->state |= IS_FLAG_LOADING;
-		result = UDS_SUCCESS;
-	}
-	mutex_unlock(&index_session->request_mutex);
-	return result;
-}
-
-static void finish_loading_index_session(struct uds_index_session *index_session,
-					 int result)
-{
-	mutex_lock(&index_session->request_mutex);
-	index_session->state &= ~IS_FLAG_LOADING;
-	if (result == UDS_SUCCESS)
-		index_session->state |= IS_FLAG_LOADED;
-
-	uds_broadcast_cond(&index_session->request_cond);
-	mutex_unlock(&index_session->request_mutex);
}
-
-static int initialize_index_session(struct uds_index_session *index_session,
-				    enum uds_open_index_type open_type)
-{
-	int result;
-	struct uds_configuration *config;
-
-	result = uds_make_configuration(&index_session->parameters, &config);
-	if (result != UDS_SUCCESS) {
-		uds_log_error_strerror(result, "Failed to allocate config");
-		return result;
-	}
-
-	memset(&index_session->stats, 0, sizeof(index_session->stats));
-	result = uds_make_index(config, open_type, &index_session->load_context,
-				enter_callback_stage, &index_session->index);
-	if (result != UDS_SUCCESS)
-		uds_log_error_strerror(result, "Failed to make index");
-	else
-		uds_log_configuration(config);
-
-	uds_free_configuration(config);
-	return result;
-}
-
-static const char *get_open_type_string(enum uds_open_index_type open_type)
-{
-	switch (open_type) {
-	case UDS_CREATE:
-		return "creating index";
-	case UDS_LOAD:
-		return "loading or rebuilding index";
-	case UDS_NO_REBUILD:
-		return "loading index";
-	default:
-		return "unknown open method";
-	}
-}
-
-/*
- * Open an index under the given session. This operation will fail if the
- * index session is suspended, or if there is already an open index.
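- *
- * A minimal calling sketch (assuming params.bdev already names the backing block
- * device and session came from uds_create_index_session()):
- *
- *	int result = uds_open_index(UDS_CREATE, &params, session);
- *
- * A zero result means the index is open; UDS_LOAD additionally attempts a rebuild
- * if a clean load is not possible, while UDS_NO_REBUILD does not.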
- */ -int uds_open_index(enum uds_open_index_type open_type, - const struct uds_parameters *parameters, - struct uds_index_session *session) -{ - int result; - char name[BDEVNAME_SIZE]; - - if (parameters == NULL) { - uds_log_error("missing required parameters"); - return -EINVAL; - } - if (parameters->bdev == NULL) { - uds_log_error("missing required block device"); - return -EINVAL; - } - if (session == NULL) { - uds_log_error("missing required session pointer"); - return -EINVAL; - } - - result = start_loading_index_session(session); - if (result != UDS_SUCCESS) - return uds_status_to_errno(result); - - session->parameters = *parameters; - format_dev_t(name, parameters->bdev->bd_dev); - uds_log_info("%s: %s", get_open_type_string(open_type), name); - - result = initialize_index_session(session, open_type); - if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "Failed %s", - get_open_type_string(open_type)); - - finish_loading_index_session(session, result); - return uds_status_to_errno(result); -} - -static void wait_for_no_requests_in_progress(struct uds_index_session *index_session) -{ - mutex_lock(&index_session->request_mutex); - while (index_session->request_count > 0) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - mutex_unlock(&index_session->request_mutex); -} - -static int __must_check save_index(struct uds_index_session *index_session) -{ - wait_for_no_requests_in_progress(index_session); - return uds_save_index(index_session->index); -} - -static void suspend_rebuild(struct uds_index_session *session) -{ - mutex_lock(&session->load_context.mutex); - switch (session->load_context.status) { - case INDEX_OPENING: - session->load_context.status = INDEX_SUSPENDING; - - /* Wait until the index indicates that it is not replaying. */ - while ((session->load_context.status != INDEX_SUSPENDED) && - (session->load_context.status != INDEX_READY)) { - uds_wait_cond(&session->load_context.cond, - &session->load_context.mutex); - } - - break; - - case INDEX_READY: - /* Index load does not need to be suspended. */ - break; - - case INDEX_SUSPENDED: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - /* These cases should not happen. */ - ASSERT_LOG_ONLY(false, "Bad load context state %u", - session->load_context.status); - break; - } - mutex_unlock(&session->load_context.mutex); -} - -/* - * Suspend index operation, draining all current index requests and preventing new index requests - * from starting. Optionally saves all index data before returning. - */ -int uds_suspend_index_session(struct uds_index_session *session, bool save) -{ - int result = UDS_SUCCESS; - bool no_work = false; - bool rebuilding = false; - - /* Wait for any current index state change to complete. 
*/ - mutex_lock(&session->request_mutex); - while (session->state & IS_FLAG_CLOSING) - uds_wait_cond(&session->request_cond, &session->request_mutex); - - if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) { - no_work = true; - uds_log_info("Index session is already changing state"); - result = -EBUSY; - } else if (session->state & IS_FLAG_SUSPENDED) { - no_work = true; - } else if (session->state & IS_FLAG_LOADING) { - session->state |= IS_FLAG_WAITING; - rebuilding = true; - } else if (session->state & IS_FLAG_LOADED) { - session->state |= IS_FLAG_WAITING; - } else { - no_work = true; - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - } - mutex_unlock(&session->request_mutex); - - if (no_work) - return uds_status_to_errno(result); - - if (rebuilding) - suspend_rebuild(session); - else if (save) - result = save_index(session); - else - result = uds_flush_index_session(session); - - mutex_lock(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - mutex_unlock(&session->request_mutex); - return uds_status_to_errno(result); -} - -static int replace_device(struct uds_index_session *session, struct block_device *bdev) -{ - int result; - - result = uds_replace_index_storage(session->index, bdev); - if (result != UDS_SUCCESS) - return result; - - session->parameters.bdev = bdev; - return UDS_SUCCESS; -} - -/* - * Resume index operation after being suspended. If the index is suspended and the supplied block - * device differs from the current backing store, the index will start using the new backing store. - */ -int uds_resume_index_session(struct uds_index_session *session, - struct block_device *bdev) -{ - int result = UDS_SUCCESS; - bool no_work = false; - bool resume_replay = false; - - mutex_lock(&session->request_mutex); - if (session->state & IS_FLAG_WAITING) { - uds_log_info("Index session is already changing state"); - no_work = true; - result = -EBUSY; - } else if (!(session->state & IS_FLAG_SUSPENDED)) { - /* If not suspended, just succeed. */ - no_work = true; - result = UDS_SUCCESS; - } else { - session->state |= IS_FLAG_WAITING; - if (session->state & IS_FLAG_LOADING) - resume_replay = true; - } - mutex_unlock(&session->request_mutex); - - if (no_work) - return result; - - if ((session->index != NULL) && (bdev != session->parameters.bdev)) { - result = replace_device(session, bdev); - if (result != UDS_SUCCESS) { - mutex_lock(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - uds_broadcast_cond(&session->request_cond); - mutex_unlock(&session->request_mutex); - return uds_status_to_errno(result); - } - } - - if (resume_replay) { - mutex_lock(&session->load_context.mutex); - switch (session->load_context.status) { - case INDEX_SUSPENDED: - session->load_context.status = INDEX_OPENING; - /* Notify the index to start replaying again. */ - uds_broadcast_cond(&session->load_context.cond); - break; - - case INDEX_READY: - /* There is no index rebuild to resume. */ - break; - - case INDEX_OPENING: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - /* These cases should not happen; do nothing. 
*/ - ASSERT_LOG_ONLY(false, "Bad load context state %u", - session->load_context.status); - break; - } - mutex_unlock(&session->load_context.mutex); - } - - mutex_lock(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state &= ~IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - mutex_unlock(&session->request_mutex); - return UDS_SUCCESS; -} - -static int save_and_free_index(struct uds_index_session *index_session) -{ - int result = UDS_SUCCESS; - bool suspended; - struct uds_index *index = index_session->index; - - if (index == NULL) - return UDS_SUCCESS; - - mutex_lock(&index_session->request_mutex); - suspended = (index_session->state & IS_FLAG_SUSPENDED); - mutex_unlock(&index_session->request_mutex); - - if (!suspended) { - result = uds_save_index(index); - if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, - "ignoring error from save_index"); - } - uds_free_index(index); - index_session->index = NULL; - - /* - * Reset all index state that happens to be in the index - * session, so it doesn't affect any future index. - */ - mutex_lock(&index_session->load_context.mutex); - index_session->load_context.status = INDEX_OPENING; - mutex_unlock(&index_session->load_context.mutex); - - mutex_lock(&index_session->request_mutex); - /* Only the suspend bit will remain relevant. */ - index_session->state &= IS_FLAG_SUSPENDED; - mutex_unlock(&index_session->request_mutex); - - return result; -} - -/* Save and close the current index. */ -int uds_close_index(struct uds_index_session *index_session) -{ - int result = UDS_SUCCESS; - - /* Wait for any current index state change to complete. */ - mutex_lock(&index_session->request_mutex); - while ((index_session->state & IS_FLAG_WAITING) || - (index_session->state & IS_FLAG_CLOSING)) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - - if (index_session->state & IS_FLAG_SUSPENDED) { - uds_log_info("Index session is suspended"); - result = -EBUSY; - } else if ((index_session->state & IS_FLAG_DESTROYING) || - !(index_session->state & IS_FLAG_LOADED)) { - /* The index doesn't exist, hasn't finished loading, or is being destroyed. */ - result = UDS_NO_INDEX; - } else { - index_session->state |= IS_FLAG_CLOSING; - } - mutex_unlock(&index_session->request_mutex); - if (result != UDS_SUCCESS) - return uds_status_to_errno(result); - - uds_log_debug("Closing index"); - wait_for_no_requests_in_progress(index_session); - result = save_and_free_index(index_session); - uds_log_debug("Closed index"); - - mutex_lock(&index_session->request_mutex); - index_session->state &= ~IS_FLAG_CLOSING; - uds_broadcast_cond(&index_session->request_cond); - mutex_unlock(&index_session->request_mutex); - return uds_status_to_errno(result); -} - -/* This will save and close an open index before destroying the session. */ -int uds_destroy_index_session(struct uds_index_session *index_session) -{ - int result; - bool load_pending = false; - - uds_log_debug("Destroying index session"); - - /* Wait for any current index state change to complete. 
*/ - mutex_lock(&index_session->request_mutex); - while ((index_session->state & IS_FLAG_WAITING) || - (index_session->state & IS_FLAG_CLOSING)) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - - if (index_session->state & IS_FLAG_DESTROYING) { - mutex_unlock(&index_session->request_mutex); - uds_log_info("Index session is already closing"); - return -EBUSY; - } - - index_session->state |= IS_FLAG_DESTROYING; - load_pending = ((index_session->state & IS_FLAG_LOADING) && - (index_session->state & IS_FLAG_SUSPENDED)); - mutex_unlock(&index_session->request_mutex); - - if (load_pending) { - /* Tell the index to terminate the rebuild. */ - mutex_lock(&index_session->load_context.mutex); - if (index_session->load_context.status == INDEX_SUSPENDED) { - index_session->load_context.status = INDEX_FREEING; - uds_broadcast_cond(&index_session->load_context.cond); - } - mutex_unlock(&index_session->load_context.mutex); - - /* Wait until the load exits before proceeding. */ - mutex_lock(&index_session->request_mutex); - while (index_session->state & IS_FLAG_LOADING) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - mutex_unlock(&index_session->request_mutex); - } - - wait_for_no_requests_in_progress(index_session); - result = save_and_free_index(index_session); - uds_request_queue_finish(index_session->callback_queue); - index_session->callback_queue = NULL; - uds_log_debug("Destroyed index session"); - uds_free(index_session); - return uds_status_to_errno(result); -} - -/* Wait until all callbacks for index operations are complete. */ -int uds_flush_index_session(struct uds_index_session *index_session) -{ - wait_for_no_requests_in_progress(index_session); - uds_wait_for_idle_index(index_session->index); - return UDS_SUCCESS; -} - -/* Statistics collection is intended to be thread-safe. 
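- * Each counter is written with WRITE_ONCE() on the update side (see count_once())
- * and sampled here with READ_ONCE(), so the reader sees a torn-free, if slightly
- * stale, value for each individual counter.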
*/
-static void collect_stats(const struct uds_index_session *index_session,
-			  struct uds_index_stats *stats)
-{
-	const struct session_stats *session_stats = &index_session->stats;
-
-	stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));
-	stats->posts_found = READ_ONCE(session_stats->posts_found);
-	stats->in_memory_posts_found = READ_ONCE(session_stats->posts_found_open_chapter);
-	stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense);
-	stats->sparse_posts_found = READ_ONCE(session_stats->posts_found_sparse);
-	stats->posts_not_found = READ_ONCE(session_stats->posts_not_found);
-	stats->updates_found = READ_ONCE(session_stats->updates_found);
-	stats->updates_not_found = READ_ONCE(session_stats->updates_not_found);
-	stats->deletions_found = READ_ONCE(session_stats->deletions_found);
-	stats->deletions_not_found = READ_ONCE(session_stats->deletions_not_found);
-	stats->queries_found = READ_ONCE(session_stats->queries_found);
-	stats->queries_not_found = READ_ONCE(session_stats->queries_not_found);
-	stats->requests = READ_ONCE(session_stats->requests);
-}
-
-int uds_get_index_session_stats(struct uds_index_session *index_session,
-				struct uds_index_stats *stats)
-{
-	if (stats == NULL) {
-		uds_log_error("received a NULL index stats pointer");
-		return -EINVAL;
-	}
-
-	collect_stats(index_session, stats);
-	if (index_session->index != NULL) {
-		uds_get_index_stats(index_session->index, stats);
-	} else {
-		stats->entries_indexed = 0;
-		stats->memory_used = 0;
-		stats->collisions = 0;
-		stats->entries_discarded = 0;
-	}
-
-	return UDS_SUCCESS;
-}
-
-void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
-{
-	DEFINE_WAIT(__wait);
-
-	prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE);
-	mutex_unlock(mutex);
-	schedule();
-	finish_wait(&cv->wait_queue, &__wait);
-	mutex_lock(mutex);
-}
diff --git a/drivers/md/dm-vdo/index-session.h b/drivers/md/dm-vdo/index-session.h
deleted file mode 100644
index 733d10f8a56cd..0000000000000
--- a/drivers/md/dm-vdo/index-session.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_SESSION_H
-#define UDS_INDEX_SESSION_H
-
-#include
-#include
-
-#include "config.h"
-#include "indexer.h"
-#include "thread-utils.h"
-
-/*
- * The index session mediates all interactions with a UDS index. Once the index session is created,
- * it can be used to open, close, suspend, or recreate an index. It implements the majority of the
- * functions in the top-level UDS API.
- *
- * If any deduplication request fails due to an internal error, the index is marked disabled. It
- * will not accept any further requests and can only be closed. Closing the index will clear the
- * disabled flag, and the index can then be reopened and recovered using the same index session.
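- *
- * A typical lifecycle, sketched minimally (error handling elided; params must
- * identify a backing block device):
- *
- *	struct uds_index_session *session;
- *
- *	uds_create_index_session(&session);
- *	uds_open_index(UDS_LOAD, &params, session);
- *
- * Requests are then issued with uds_launch_request(), and the index is shut down
- * with uds_close_index() followed by uds_destroy_index_session().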
- */
-
-struct __aligned(L1_CACHE_BYTES) session_stats {
-	/* Post requests that found an entry */
-	u64 posts_found;
-	/* Post requests found in the open chapter */
-	u64 posts_found_open_chapter;
-	/* Post requests found in the dense index */
-	u64 posts_found_dense;
-	/* Post requests found in the sparse index */
-	u64 posts_found_sparse;
-	/* Post requests that did not find an entry */
-	u64 posts_not_found;
-	/* Update requests that found an entry */
-	u64 updates_found;
-	/* Update requests that did not find an entry */
-	u64 updates_not_found;
-	/* Delete requests that found an entry */
-	u64 deletions_found;
-	/* Delete requests that did not find an entry */
-	u64 deletions_not_found;
-	/* Query requests that found an entry */
-	u64 queries_found;
-	/* Query requests that did not find an entry */
-	u64 queries_not_found;
-	/* Total number of requests */
-	u64 requests;
-};
-
-enum index_suspend_status {
-	/* An index load has started but the index is not ready for use. */
-	INDEX_OPENING = 0,
-	/* The index is able to handle requests. */
-	INDEX_READY,
-	/* The index is attempting to suspend a rebuild. */
-	INDEX_SUSPENDING,
-	/* An index rebuild has been suspended. */
-	INDEX_SUSPENDED,
-	/* An index rebuild is being stopped in order to shut down. */
-	INDEX_FREEING,
-};
-
-struct index_load_context {
-	struct mutex mutex;
-	struct cond_var cond;
-	enum index_suspend_status status;
-};
-
-struct uds_index_session {
-	unsigned int state;
-	struct uds_index *index;
-	struct uds_request_queue *callback_queue;
-	struct uds_parameters parameters;
-	struct index_load_context load_context;
-	struct mutex request_mutex;
-	struct cond_var request_cond;
-	int request_count;
-	struct session_stats stats;
-};
-
-#endif /* UDS_INDEX_SESSION_H */
diff --git a/drivers/md/dm-vdo/index.c b/drivers/md/dm-vdo/index.c
deleted file mode 100644
index 9d4a8e5cbaadb..0000000000000
--- a/drivers/md/dm-vdo/index.c
+++ /dev/null
@@ -1,1387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-
-#include "index.h"
-
-#include "funnel-requestqueue.h"
-#include "hash-utils.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "sparse-cache.h"
-
-static const u64 NO_LAST_SAVE = U64_MAX;
-
-/*
- * When searching for deduplication records, the index first searches the volume index, and then
- * searches the chapter index for the relevant chapter. If the chapter has been fully committed to
- * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been
- * committed (either the open chapter or a recently closed one), the index searches the in-memory
- * representation of the chapter. Finally, if the volume index does not find a record and the index
- * is sparse, the index will search the sparse cache.
- *
- * The index sends two kinds of messages to coordinate between zones: chapter close messages for the
- * chapter writer, and sparse cache barrier messages for the sparse cache.
- *
- * The chapter writer is responsible for committing chapters of records to storage. Since zones can
- * get different numbers of records, some zones may fall behind others. Each time a zone fills up
- * its available space in a chapter, it informs the chapter writer that the chapter is complete,
- * and also informs all other zones that it has closed the chapter. Each other zone will then close
- * the chapter immediately, regardless of how full it is, in order to minimize skew between zones.
- * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage. - * - * The last zone to close the chapter also removes the oldest chapter from the volume index. - * Although that chapter is invalid for zones that have moved on, the existence of the open chapter - * means that those zones will never ask the volume index about it. No zone is allowed to get more - * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another - * chapter before the previous one has been closed by all zones, it is forced to wait. - * - * The sparse cache relies on having the same set of chapter indexes available to all zones. When a - * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone - * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and - * paused its operations, the cache membership is changed and each zone is then informed that it - * can proceed. More details can be found in the sparse cache documentation. - * - * If a sparse cache has only one zone, it will not create a triage queue, but it still needs the - * barrier message to change the sparse cache membership, so the index simulates the message by - * invoking the handler directly. - */ - -struct chapter_writer { - /* The index to which we belong */ - struct uds_index *index; - /* The thread to do the writing */ - struct thread *thread; - /* The lock protecting the following fields */ - struct mutex mutex; - /* The condition signalled on state changes */ - struct cond_var cond; - /* Set to true to stop the thread */ - bool stop; - /* The result from the most recent write */ - int result; - /* The number of bytes allocated by the chapter writer */ - size_t memory_size; - /* The number of zones which have submitted a chapter for writing */ - unsigned int zones_to_write; - /* Open chapter index used by uds_close_open_chapter() */ - struct open_chapter_index *open_chapter_index; - /* Collated records used by uds_close_open_chapter() */ - struct uds_volume_record *collated_records; - /* The chapters to write (one per zone) */ - struct open_chapter_zone *chapters[]; -}; - -static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter) -{ - return uds_is_chapter_sparse(zone->index->volume->geometry, - zone->oldest_virtual_chapter, - zone->newest_virtual_chapter, virtual_chapter); -} - -static int launch_zone_message(struct uds_zone_message message, unsigned int zone, - struct uds_index *index) -{ - int result; - struct uds_request *request; - - result = uds_allocate(1, struct uds_request, __func__, &request); - if (result != UDS_SUCCESS) - return result; - - request->index = index; - request->unbatched = true; - request->zone_number = zone; - request->zone_message = message; - - uds_enqueue_request(request, STAGE_MESSAGE); - return UDS_SUCCESS; -} - -static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter) -{ - struct uds_zone_message message = { - .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER, - .virtual_chapter = virtual_chapter, - }; - unsigned int zone; - - for (zone = 0; zone < index->zone_count; zone++) { - int result = launch_zone_message(message, zone, index); - - ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); - } -} - -/* - * Determine whether this request should trigger a sparse cache barrier message to change the - * membership of the sparse cache. 
If a change in membership is desired, the function returns the
- * chapter number to add.
- */
-static u64 triage_index_request(struct uds_index *index, struct uds_request *request)
-{
-	u64 virtual_chapter;
-	struct index_zone *zone;
-
-	virtual_chapter = uds_lookup_volume_index_name(index->volume_index,
-						       &request->record_name);
-	if (virtual_chapter == NO_CHAPTER)
-		return NO_CHAPTER;
-
-	zone = index->zones[request->zone_number];
-	if (!is_zone_chapter_sparse(zone, virtual_chapter))
-		return NO_CHAPTER;
-
-	/*
-	 * FIXME: Optimize for a common case by remembering the chapter from the most recent
-	 * barrier message and skipping this chapter if it is the same.
-	 */
-
-	return virtual_chapter;
-}
-
-/*
- * Simulate a message to change the sparse cache membership for a single-zone sparse index. This
- * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind
- * of index does nothing here.
- */
-static int simulate_index_zone_barrier_message(struct index_zone *zone,
-					       struct uds_request *request)
-{
-	u64 sparse_virtual_chapter;
-
-	if ((zone->index->zone_count > 1) ||
-	    !uds_is_sparse_index_geometry(zone->index->volume->geometry))
-		return UDS_SUCCESS;
-
-	sparse_virtual_chapter = triage_index_request(zone->index, request);
-	if (sparse_virtual_chapter == NO_CHAPTER)
-		return UDS_SUCCESS;
-
-	return uds_update_sparse_cache(zone, sparse_virtual_chapter);
-}
-
-/* This is the request processing function for the triage queue. */
-static void triage_request(struct uds_request *request)
-{
-	struct uds_index *index = request->index;
-	u64 sparse_virtual_chapter = triage_index_request(index, request);
-
-	if (sparse_virtual_chapter != NO_CHAPTER)
-		enqueue_barrier_messages(index, sparse_virtual_chapter);
-
-	uds_enqueue_request(request, STAGE_INDEX);
-}
-
-static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number)
-{
-	int result;
-	struct chapter_writer *writer = index->chapter_writer;
-
-	mutex_lock(&writer->mutex);
-	while (index->newest_virtual_chapter < current_chapter_number)
-		uds_wait_cond(&writer->cond, &writer->mutex);
-	result = writer->result;
-	mutex_unlock(&writer->mutex);
-
-	if (result != UDS_SUCCESS)
-		return uds_log_error_strerror(result,
-					      "Writing of previous open chapter failed");
-
-	return UDS_SUCCESS;
-}
-
-static int swap_open_chapter(struct index_zone *zone)
-{
-	int result;
-	struct open_chapter_zone *temporary_chapter;
-
-	result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	temporary_chapter = zone->open_chapter;
-	zone->open_chapter = zone->writing_chapter;
-	zone->writing_chapter = temporary_chapter;
-	return UDS_SUCCESS;
-}
-
-/*
- * Inform the chapter writer that this zone is done with this chapter. The chapter won't start
- * writing until all zones have closed it.
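- * Each zone increments writer->zones_to_write under writer->mutex and broadcasts
- * writer->cond; the writer thread commits the chapter only once that count
- * reaches index->zone_count.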
- */ -static unsigned int start_closing_chapter(struct uds_index *index, - unsigned int zone_number, - struct open_chapter_zone *chapter) -{ - unsigned int finished_zones; - struct chapter_writer *writer = index->chapter_writer; - - mutex_lock(&writer->mutex); - finished_zones = ++writer->zones_to_write; - writer->chapters[zone_number] = chapter; - uds_broadcast_cond(&writer->cond); - mutex_unlock(&writer->mutex); - - return finished_zones; -} - -static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter) -{ - int result; - unsigned int i; - struct uds_zone_message zone_message = { - .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, - .virtual_chapter = closed_chapter, - }; - - for (i = 0; i < zone->index->zone_count; i++) { - if (zone->id == i) - continue; - - result = launch_zone_message(zone_message, i, zone->index); - if (result != UDS_SUCCESS) - return result; - } - - return UDS_SUCCESS; -} - -static int open_next_chapter(struct index_zone *zone) -{ - int result; - u64 closed_chapter; - u64 expiring; - unsigned int finished_zones; - u32 expire_chapters; - - uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", - (unsigned long long) zone->newest_virtual_chapter, zone->id, - zone->open_chapter->size, - zone->open_chapter->capacity - zone->open_chapter->size); - - result = swap_open_chapter(zone); - if (result != UDS_SUCCESS) - return result; - - closed_chapter = zone->newest_virtual_chapter++; - uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id, - zone->newest_virtual_chapter); - uds_reset_open_chapter(zone->open_chapter); - - finished_zones = start_closing_chapter(zone->index, zone->id, - zone->writing_chapter); - if ((finished_zones == 1) && (zone->index->zone_count > 1)) { - result = announce_chapter_closed(zone, closed_chapter); - if (result != UDS_SUCCESS) - return result; - } - - expiring = zone->oldest_virtual_chapter; - expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry, - zone->newest_virtual_chapter); - zone->oldest_virtual_chapter += expire_chapters; - - if (finished_zones < zone->index->zone_count) - return UDS_SUCCESS; - - while (expire_chapters-- > 0) - uds_forget_chapter(zone->index->volume, expiring++); - - return UDS_SUCCESS; -} - -static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter) -{ - if (zone->newest_virtual_chapter == virtual_chapter) - return open_next_chapter(zone); - - return UDS_SUCCESS; -} - -static int dispatch_index_zone_control_request(struct uds_request *request) -{ - struct uds_zone_message *message = &request->zone_message; - struct index_zone *zone = request->index->zones[request->zone_number]; - - switch (message->type) { - case UDS_MESSAGE_SPARSE_CACHE_BARRIER: - return uds_update_sparse_cache(zone, message->virtual_chapter); - - case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: - return handle_chapter_closed(zone, message->virtual_chapter); - - default: - uds_log_error("invalid message type: %d", message->type); - return UDS_INVALID_ARGUMENT; - } -} - -static void set_request_location(struct uds_request *request, - enum uds_index_region new_location) -{ - request->location = new_location; - request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) || - (new_location == UDS_LOCATION_IN_DENSE) || - (new_location == UDS_LOCATION_IN_SPARSE)); -} - -static void set_chapter_location(struct uds_request *request, - const struct index_zone *zone, u64 virtual_chapter) -{ - request->found = true; - if (virtual_chapter == zone->newest_virtual_chapter) - 
request->location = UDS_LOCATION_IN_OPEN_CHAPTER; - else if (is_zone_chapter_sparse(zone, virtual_chapter)) - request->location = UDS_LOCATION_IN_SPARSE; - else - request->location = UDS_LOCATION_IN_DENSE; -} - -static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request, - u64 virtual_chapter, bool *found) -{ - int result; - struct volume *volume; - u16 record_page_number; - u32 chapter; - - result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter, - &record_page_number); - if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER)) - return result; - - request->virtual_chapter = virtual_chapter; - volume = zone->index->volume; - chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); - return uds_search_cached_record_page(volume, request, chapter, - record_page_number, found); -} - -static int get_record_from_zone(struct index_zone *zone, struct uds_request *request, - bool *found) -{ - struct volume *volume; - - if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { - *found = true; - return UDS_SUCCESS; - } else if (request->location == UDS_LOCATION_UNAVAILABLE) { - *found = false; - return UDS_SUCCESS; - } - - if (request->virtual_chapter == zone->newest_virtual_chapter) { - uds_search_open_chapter(zone->open_chapter, &request->record_name, - &request->old_metadata, found); - return UDS_SUCCESS; - } - - if ((zone->newest_virtual_chapter > 0) && - (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) && - (zone->writing_chapter->size > 0)) { - uds_search_open_chapter(zone->writing_chapter, &request->record_name, - &request->old_metadata, found); - return UDS_SUCCESS; - } - - volume = zone->index->volume; - if (is_zone_chapter_sparse(zone, request->virtual_chapter) && - uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter, - request->zone_number)) - return search_sparse_cache_in_zone(zone, request, - request->virtual_chapter, found); - - return uds_search_volume_page_cache(volume, request, found); -} - -static int put_record_in_zone(struct index_zone *zone, struct uds_request *request, - const struct uds_record_data *metadata) -{ - unsigned int remaining; - - remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name, - metadata); - if (remaining == 0) - return open_next_chapter(zone); - - return UDS_SUCCESS; -} - -static int search_index_zone(struct index_zone *zone, struct uds_request *request) -{ - int result; - struct volume_index_record record; - bool overflow_record, found = false; - struct uds_record_data *metadata; - u64 chapter; - - result = uds_get_volume_index_record(zone->index->volume_index, - &request->record_name, &record); - if (result != UDS_SUCCESS) - return result; - - if (record.is_found) { - if (request->requeued && request->virtual_chapter != record.virtual_chapter) - set_request_location(request, UDS_LOCATION_UNKNOWN); - - request->virtual_chapter = record.virtual_chapter; - result = get_record_from_zone(zone, request, &found); - if (result != UDS_SUCCESS) - return result; - } - - if (found) - set_chapter_location(request, zone, record.virtual_chapter); - - /* - * If a record has overflowed a chapter index in more than one chapter (or overflowed in - * one chapter and collided with an existing record), it will exist as a collision record - * in the volume index, but we won't find it in the volume. This case needs special - * handling. 
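- * The overflow_record flag below captures this case: the entry is treated like a
- * found record when deciding whether to update the volume index, even though no
- * metadata could be located for it.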
- */ - overflow_record = (record.is_found && record.is_collision && !found); - chapter = zone->newest_virtual_chapter; - if (found || overflow_record) { - if ((request->type == UDS_QUERY_NO_UPDATE) || - ((request->type == UDS_QUERY) && overflow_record)) { - /* There is nothing left to do. */ - return UDS_SUCCESS; - } - - if (record.virtual_chapter != chapter) { - /* - * Update the volume index to reference the new chapter for the block. If - * the record had been deleted or dropped from the chapter index, it will - * be back. - */ - result = uds_set_volume_index_record_chapter(&record, chapter); - } else if (request->type != UDS_UPDATE) { - /* The record is already in the open chapter. */ - return UDS_SUCCESS; - } - } else { - /* - * The record wasn't in the volume index, so check whether the - * name is in a cached sparse chapter. If we found the name on - * a previous search, use that result instead. - */ - if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { - found = true; - } else if (request->location == UDS_LOCATION_UNAVAILABLE) { - found = false; - } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) && - !uds_is_volume_index_sample(zone->index->volume_index, - &request->record_name)) { - result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER, - &found); - if (result != UDS_SUCCESS) - return result; - } - - if (found) - set_request_location(request, UDS_LOCATION_IN_SPARSE); - - if ((request->type == UDS_QUERY_NO_UPDATE) || - ((request->type == UDS_QUERY) && !found)) { - /* There is nothing left to do. */ - return UDS_SUCCESS; - } - - /* - * Add a new entry to the volume index referencing the open chapter. This needs to - * be done both for new records, and for records from cached sparse chapters. - */ - result = uds_put_volume_index_record(&record, chapter); - } - - if (result == UDS_OVERFLOW) { - /* - * The volume index encountered a delta list overflow. The condition was already - * logged. We will go on without adding the record to the open chapter. - */ - return UDS_SUCCESS; - } - - if (result != UDS_SUCCESS) - return result; - - if (!found || (request->type == UDS_UPDATE)) { - /* This is a new record or we're updating an existing record. */ - metadata = &request->new_metadata; - } else { - /* Move the existing record to the open chapter. */ - metadata = &request->old_metadata; - } - - return put_record_in_zone(zone, request, metadata); -} - -static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request) -{ - int result; - struct volume_index_record record; - - result = uds_get_volume_index_record(zone->index->volume_index, - &request->record_name, &record); - if (result != UDS_SUCCESS) - return result; - - if (!record.is_found) - return UDS_SUCCESS; - - /* If the request was requeued, check whether the saved state is still valid. */ - - if (record.is_collision) { - set_chapter_location(request, zone, record.virtual_chapter); - } else { - /* Non-collision records are hints, so resolve the name in the chapter. */ - bool found; - - if (request->requeued && request->virtual_chapter != record.virtual_chapter) - set_request_location(request, UDS_LOCATION_UNKNOWN); - - request->virtual_chapter = record.virtual_chapter; - result = get_record_from_zone(zone, request, &found); - if (result != UDS_SUCCESS) - return result; - - if (!found) { - /* There is no record to remove. 
*/
-			return UDS_SUCCESS;
-		}
-	}
-
-	set_chapter_location(request, zone, record.virtual_chapter);
-
-	/*
-	 * Delete the volume index entry for the named record only. Note that a later search might
-	 * return stale advice if there is a colliding name in the same chapter, but it's a
-	 * very rare case (1 in 2^21).
-	 */
-	result = uds_remove_volume_index_record(&record);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	/*
-	 * If the record is in the open chapter, we must remove it or mark it deleted to avoid
-	 * trouble if the record is added again later.
-	 */
-	if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
-		uds_remove_from_open_chapter(zone->open_chapter, &request->record_name);
-
-	return UDS_SUCCESS;
-}
-
-static int dispatch_index_request(struct uds_index *index, struct uds_request *request)
-{
-	int result;
-	struct index_zone *zone = index->zones[request->zone_number];
-
-	if (!request->requeued) {
-		result = simulate_index_zone_barrier_message(zone, request);
-		if (result != UDS_SUCCESS)
-			return result;
-	}
-
-	switch (request->type) {
-	case UDS_POST:
-	case UDS_UPDATE:
-	case UDS_QUERY:
-	case UDS_QUERY_NO_UPDATE:
-		result = search_index_zone(zone, request);
-		break;
-
-	case UDS_DELETE:
-		result = remove_from_index_zone(zone, request);
-		break;
-
-	default:
-		result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-						  "invalid request type: %d",
-						  request->type);
-		break;
-	}
-
-	return result;
-}
-
-/* This is the request processing function invoked by each zone's thread. */
-static void execute_zone_request(struct uds_request *request)
-{
-	int result;
-	struct uds_index *index = request->index;
-
-	if (request->zone_message.type != UDS_MESSAGE_NONE) {
-		result = dispatch_index_zone_control_request(request);
-		if (result != UDS_SUCCESS) {
-			uds_log_error_strerror(result, "error executing message: %d",
-					       request->zone_message.type);
-		}
-
-		/* Once the message is processed it can be freed. */
-		uds_free(uds_forget(request));
-		return;
-	}
-
-	index->need_to_save = true;
-	if (request->requeued && (request->status != UDS_SUCCESS)) {
-		set_request_location(request, UDS_LOCATION_UNAVAILABLE);
-		index->callback(request);
-		return;
-	}
-
-	result = dispatch_index_request(index, request);
-	if (result == UDS_QUEUED) {
-		/* The request has been requeued so don't let it complete. */
-		return;
-	}
-
-	if (!request->found)
-		set_request_location(request, UDS_LOCATION_UNAVAILABLE);
-
-	request->status = result;
-	index->callback(request);
-}
-
-static int initialize_index_queues(struct uds_index *index,
-				   const struct index_geometry *geometry)
-{
-	int result;
-	unsigned int i;
-
-	for (i = 0; i < index->zone_count; i++) {
-		result = uds_make_request_queue("indexW", &execute_zone_request,
-						&index->zone_queues[i]);
-		if (result != UDS_SUCCESS)
-			return result;
-	}
-
-	/* The triage queue is only needed for sparse multi-zone indexes. */
-	if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) {
-		result = uds_make_request_queue("triageW", &triage_request,
-						&index->triage_queue);
-		if (result != UDS_SUCCESS)
-			return result;
-	}
-
-	return UDS_SUCCESS;
-}
-
-/* This is the driver function for the chapter writer thread.
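- * It sleeps until every zone has contributed its portion of the open chapter
- * (zones_to_write reaches the zone count), writes and closes that chapter,
- * advances the chapter boundaries, and then broadcasts on the condition
- * variable so the zones can begin filling the next chapter.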
*/ -static void close_chapters(void *arg) -{ - int result; - struct chapter_writer *writer = arg; - struct uds_index *index = writer->index; - - uds_log_debug("chapter writer starting"); - mutex_lock(&writer->mutex); - for (;;) { - while (writer->zones_to_write < index->zone_count) { - if (writer->stop && (writer->zones_to_write == 0)) { - /* - * We've been told to stop, and all of the zones are in the same - * open chapter, so we can exit now. - */ - mutex_unlock(&writer->mutex); - uds_log_debug("chapter writer stopping"); - return; - } - uds_wait_cond(&writer->cond, &writer->mutex); - } - - /* - * Release the lock while closing a chapter. We probably don't need to do this, but - * it seems safer in principle. It's OK to access the chapter and chapter_number - * fields without the lock since those aren't allowed to change until we're done. - */ - mutex_unlock(&writer->mutex); - - if (index->has_saved_open_chapter) { - /* - * Remove the saved open chapter the first time we close an open chapter - * after loading from a clean shutdown, or after doing a clean save. The - * lack of the saved open chapter will indicate that a recovery is - * necessary. - */ - index->has_saved_open_chapter = false; - result = uds_discard_open_chapter(index->layout); - if (result == UDS_SUCCESS) - uds_log_debug("Discarding saved open chapter"); - } - - result = uds_close_open_chapter(writer->chapters, index->zone_count, - index->volume, - writer->open_chapter_index, - writer->collated_records, - index->newest_virtual_chapter); - - mutex_lock(&writer->mutex); - index->newest_virtual_chapter++; - index->oldest_virtual_chapter += - uds_chapters_to_expire(index->volume->geometry, - index->newest_virtual_chapter); - writer->result = result; - writer->zones_to_write = 0; - uds_broadcast_cond(&writer->cond); - } -} - -static void stop_chapter_writer(struct chapter_writer *writer) -{ - struct thread *writer_thread = NULL; - - mutex_lock(&writer->mutex); - if (writer->thread != NULL) { - writer_thread = writer->thread; - writer->thread = NULL; - writer->stop = true; - uds_broadcast_cond(&writer->cond); - } - mutex_unlock(&writer->mutex); - - if (writer_thread != NULL) - vdo_join_threads(writer_thread); -} - -static void free_chapter_writer(struct chapter_writer *writer) -{ - if (writer == NULL) - return; - - stop_chapter_writer(writer); - uds_free_open_chapter_index(writer->open_chapter_index); - uds_free(writer->collated_records); - uds_free(writer); -} - -static int make_chapter_writer(struct uds_index *index, - struct chapter_writer **writer_ptr) -{ - int result; - struct chapter_writer *writer; - size_t collated_records_size = - (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); - - result = uds_allocate_extended(struct chapter_writer, index->zone_count, - struct open_chapter_zone *, "Chapter Writer", - &writer); - if (result != UDS_SUCCESS) - return result; - - writer->index = index; - mutex_init(&writer->mutex); - uds_init_cond(&writer->cond); - - result = uds_allocate_cache_aligned(collated_records_size, "collated records", - &writer->collated_records); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - - result = uds_make_open_chapter_index(&writer->open_chapter_index, - index->volume->geometry, - index->volume->nonce); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - - writer->memory_size = (sizeof(struct chapter_writer) + - index->zone_count * sizeof(struct open_chapter_zone *) + - collated_records_size + - 
writer->open_chapter_index->memory_size); - - result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - - *writer_ptr = writer; - return UDS_SUCCESS; -} - -static int load_index(struct uds_index *index) -{ - int result; - u64 last_save_chapter; - - result = uds_load_index_state(index->layout, index); - if (result != UDS_SUCCESS) - return UDS_INDEX_NOT_SAVED_CLEANLY; - - last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0); - - uds_log_info("loaded index from chapter %llu through chapter %llu", - (unsigned long long) index->oldest_virtual_chapter, - (unsigned long long) last_save_chapter); - - return UDS_SUCCESS; -} - -static int rebuild_index_page_map(struct uds_index *index, u64 vcn) -{ - int result; - struct delta_index_page *chapter_index_page; - struct index_geometry *geometry = index->volume->geometry; - u32 chapter = uds_map_to_physical_chapter(geometry, vcn); - u32 expected_list_number = 0; - u32 index_page_number; - u32 lowest_delta_list; - u32 highest_delta_list; - - for (index_page_number = 0; - index_page_number < geometry->index_pages_per_chapter; - index_page_number++) { - result = uds_get_volume_index_page(index->volume, chapter, - index_page_number, - &chapter_index_page); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to read index page %u in chapter %u", - index_page_number, chapter); - } - - lowest_delta_list = chapter_index_page->lowest_list_number; - highest_delta_list = chapter_index_page->highest_list_number; - if (lowest_delta_list != expected_list_number) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "chapter %u index page %u is corrupt", - chapter, index_page_number); - } - - uds_update_index_page_map(index->volume->index_page_map, vcn, chapter, - index_page_number, highest_delta_list); - expected_list_number = highest_delta_list + 1; - } - - return UDS_SUCCESS; -} - -static int replay_record(struct uds_index *index, const struct uds_record_name *name, - u64 virtual_chapter, bool will_be_sparse_chapter) -{ - int result; - struct volume_index_record record; - bool update_record; - - if (will_be_sparse_chapter && - !uds_is_volume_index_sample(index->volume_index, name)) { - /* - * This entry will be in a sparse chapter after the rebuild completes, and it is - * not a sample, so just skip over it. - */ - return UDS_SUCCESS; - } - - result = uds_get_volume_index_record(index->volume_index, name, &record); - if (result != UDS_SUCCESS) - return result; - - if (record.is_found) { - if (record.is_collision) { - if (record.virtual_chapter == virtual_chapter) { - /* The record is already correct. */ - return UDS_SUCCESS; - } - - update_record = true; - } else if (record.virtual_chapter == virtual_chapter) { - /* - * There is a volume index entry pointing to the current chapter, but we - * don't know if it is for the same name as the one we are currently - * working on or not. For now, we're just going to assume that it isn't. - * This will create one extra collision record if there was a deleted - * record in the current chapter. - */ - update_record = false; - } else { - /* - * If we're rebuilding, we don't normally want to go to disk to see if the - * record exists, since we will likely have just read the record from disk - * (i.e. we know it's there). The exception to this is when we find an - * entry in the volume index that has a different chapter. 
In this case, we - * need to search that chapter to determine if the volume index entry was - * for the same record or a different one. - */ - result = uds_search_volume_page_cache_for_rebuild(index->volume, - name, - record.virtual_chapter, - &update_record); - if (result != UDS_SUCCESS) - return result; - } - } else { - update_record = false; - } - - if (update_record) { - /* - * Update the volume index to reference the new chapter for the block. If the - * record had been deleted or dropped from the chapter index, it will be back. - */ - result = uds_set_volume_index_record_chapter(&record, virtual_chapter); - } else { - /* - * Add a new entry to the volume index referencing the open chapter. This should be - * done regardless of whether we are a brand new record or a sparse record, i.e. - * one that doesn't exist in the index but does on disk, since for a sparse record, - * we would want to un-sparsify if it did exist. - */ - result = uds_put_volume_index_record(&record, virtual_chapter); - } - - if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { - /* The rebuilt index will lose these records. */ - return UDS_SUCCESS; - } - - return result; -} - -static bool check_for_suspend(struct uds_index *index) -{ - bool closing; - - if (index->load_context == NULL) - return false; - - mutex_lock(&index->load_context->mutex); - if (index->load_context->status != INDEX_SUSPENDING) { - mutex_unlock(&index->load_context->mutex); - return false; - } - - /* Notify that we are suspended and wait for the resume. */ - index->load_context->status = INDEX_SUSPENDED; - uds_broadcast_cond(&index->load_context->cond); - - while ((index->load_context->status != INDEX_OPENING) && - (index->load_context->status != INDEX_FREEING)) - uds_wait_cond(&index->load_context->cond, &index->load_context->mutex); - - closing = (index->load_context->status == INDEX_FREEING); - mutex_unlock(&index->load_context->mutex); - return closing; -} - -static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse) -{ - int result; - u32 i; - u32 j; - const struct index_geometry *geometry; - u32 physical_chapter; - - if (check_for_suspend(index)) { - uds_log_info("Replay interrupted by index shutdown at chapter %llu", - (unsigned long long) virtual); - return -EBUSY; - } - - geometry = index->volume->geometry; - physical_chapter = uds_map_to_physical_chapter(geometry, virtual); - uds_prefetch_volume_chapter(index->volume, physical_chapter); - uds_set_volume_index_open_chapter(index->volume_index, virtual); - - result = rebuild_index_page_map(index, virtual); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "could not rebuild index page map for chapter %u", - physical_chapter); - } - - for (i = 0; i < geometry->record_pages_per_chapter; i++) { - u8 *record_page; - u32 record_page_number; - - record_page_number = geometry->index_pages_per_chapter + i; - result = uds_get_volume_record_page(index->volume, physical_chapter, - record_page_number, &record_page); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, "could not get page %d", - record_page_number); - } - - for (j = 0; j < geometry->records_per_page; j++) { - const u8 *name_bytes; - struct uds_record_name name; - - name_bytes = record_page + (j * BYTES_PER_RECORD); - memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE); - result = replay_record(index, &name, virtual, sparse); - if (result != UDS_SUCCESS) - return result; - } - } - - return UDS_SUCCESS; -} - -static int replay_volume(struct uds_index *index) -{ 
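-	/*
-	 * For example, with oldest_virtual_chapter 100 and newest_virtual_chapter
-	 * 356, chapters 100 through 355 are replayed in order; chapter 356 is
-	 * skipped because it corresponds to the open chapter.
-	 */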
- int result; - u64 old_map_update; - u64 new_map_update; - u64 virtual; - u64 from_virtual = index->oldest_virtual_chapter; - u64 upto_virtual = index->newest_virtual_chapter; - bool will_be_sparse; - - uds_log_info("Replaying volume from chapter %llu through chapter %llu", - (unsigned long long) from_virtual, - (unsigned long long) upto_virtual); - - /* - * The index failed to load, so the volume index is empty. Add records to the volume index - * in order, skipping non-hooks in chapters which will be sparse to save time. - * - * Go through each record page of each chapter and add the records back to the volume - * index. This should not cause anything to be written to either the open chapter or the - * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this - * would have already been purged from the volume index when the chapter was opened. - * - * Also, go through each index page for each chapter and rebuild the index page map. - */ - old_map_update = index->volume->index_page_map->last_update; - for (virtual = from_virtual; virtual < upto_virtual; virtual++) { - will_be_sparse = uds_is_chapter_sparse(index->volume->geometry, - from_virtual, upto_virtual, - virtual); - result = replay_chapter(index, virtual, will_be_sparse); - if (result != UDS_SUCCESS) - return result; - } - - /* Also reap the chapter being replaced by the open chapter. */ - uds_set_volume_index_open_chapter(index->volume_index, upto_virtual); - - new_map_update = index->volume->index_page_map->last_update; - if (new_map_update != old_map_update) { - uds_log_info("replay changed index page map update from %llu to %llu", - (unsigned long long) old_map_update, - (unsigned long long) new_map_update); - } - - return UDS_SUCCESS; -} - -static int rebuild_index(struct uds_index *index) -{ - int result; - u64 lowest; - u64 highest; - bool is_empty = false; - u32 chapters_per_volume = index->volume->geometry->chapters_per_volume; - - index->volume->lookup_mode = LOOKUP_FOR_REBUILD; - result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest, - &is_empty); - if (result != UDS_SUCCESS) { - return uds_log_fatal_strerror(result, - "cannot rebuild index: unknown volume chapter boundaries"); - } - - if (is_empty) { - index->newest_virtual_chapter = 0; - index->oldest_virtual_chapter = 0; - index->volume->lookup_mode = LOOKUP_NORMAL; - return UDS_SUCCESS; - } - - index->newest_virtual_chapter = highest + 1; - index->oldest_virtual_chapter = lowest; - if (index->newest_virtual_chapter == - (index->oldest_virtual_chapter + chapters_per_volume)) { - /* Skip the chapter shadowed by the open chapter. 
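-		 *
-		 * (When the volume is completely full, the open chapter will
-		 * overwrite the physical slot holding the oldest chapter, so
-		 * that chapter's records must not be replayed.)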
*/ - index->oldest_virtual_chapter++; - } - - result = replay_volume(index); - if (result != UDS_SUCCESS) - return result; - - index->volume->lookup_mode = LOOKUP_NORMAL; - return UDS_SUCCESS; -} - -static void free_index_zone(struct index_zone *zone) -{ - if (zone == NULL) - return; - - uds_free_open_chapter(zone->open_chapter); - uds_free_open_chapter(zone->writing_chapter); - uds_free(zone); -} - -static int make_index_zone(struct uds_index *index, unsigned int zone_number) -{ - int result; - struct index_zone *zone; - - result = uds_allocate(1, struct index_zone, "index zone", &zone); - if (result != UDS_SUCCESS) - return result; - - result = uds_make_open_chapter(index->volume->geometry, index->zone_count, - &zone->open_chapter); - if (result != UDS_SUCCESS) { - free_index_zone(zone); - return result; - } - - result = uds_make_open_chapter(index->volume->geometry, index->zone_count, - &zone->writing_chapter); - if (result != UDS_SUCCESS) { - free_index_zone(zone); - return result; - } - - zone->index = index; - zone->id = zone_number; - index->zones[zone_number] = zone; - - return UDS_SUCCESS; -} - -int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type, - struct index_load_context *load_context, index_callback_fn callback, - struct uds_index **new_index) -{ - int result; - bool loaded = false; - bool new = (open_type == UDS_CREATE); - struct uds_index *index = NULL; - struct index_zone *zone; - u64 nonce; - unsigned int z; - - result = uds_allocate_extended(struct uds_index, config->zone_count, - struct uds_request_queue *, "index", &index); - if (result != UDS_SUCCESS) - return result; - - index->zone_count = config->zone_count; - - result = uds_make_index_layout(config, new, &index->layout); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return result; - } - - result = uds_allocate(index->zone_count, struct index_zone *, "zones", - &index->zones); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return result; - } - - result = uds_make_volume(config, index->layout, &index->volume); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return result; - } - - index->volume->lookup_mode = LOOKUP_NORMAL; - for (z = 0; z < index->zone_count; z++) { - result = make_index_zone(index, z); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return uds_log_error_strerror(result, - "Could not create index zone"); - } - } - - nonce = uds_get_volume_nonce(index->layout); - result = uds_make_volume_index(config, nonce, &index->volume_index); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return uds_log_error_strerror(result, "could not make volume index"); - } - - index->load_context = load_context; - index->callback = callback; - - result = initialize_index_queues(index, config->geometry); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return result; - } - - result = make_chapter_writer(index, &index->chapter_writer); - if (result != UDS_SUCCESS) { - uds_free_index(index); - return result; - } - - if (!new) { - result = load_index(index); - switch (result) { - case UDS_SUCCESS: - loaded = true; - break; - case -ENOMEM: - /* We should not try a rebuild for this error. 
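-			 *
-			 * (A rebuild would need at least as much memory as the
-			 * failed load, so it would almost certainly fail the
-			 * same way.)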
*/ - uds_log_error_strerror(result, "index could not be loaded"); - break; - default: - uds_log_error_strerror(result, "index could not be loaded"); - if (open_type == UDS_LOAD) { - result = rebuild_index(index); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "index could not be rebuilt"); - } - } - break; - } - } - - if (result != UDS_SUCCESS) { - uds_free_index(index); - return uds_log_error_strerror(result, "fatal error in %s()", __func__); - } - - for (z = 0; z < index->zone_count; z++) { - zone = index->zones[z]; - zone->oldest_virtual_chapter = index->oldest_virtual_chapter; - zone->newest_virtual_chapter = index->newest_virtual_chapter; - } - - if (index->load_context != NULL) { - mutex_lock(&index->load_context->mutex); - index->load_context->status = INDEX_READY; - /* - * If we get here, suspend is meaningless, but notify any thread trying to suspend - * us so it doesn't hang. - */ - uds_broadcast_cond(&index->load_context->cond); - mutex_unlock(&index->load_context->mutex); - } - - index->has_saved_open_chapter = loaded; - index->need_to_save = !loaded; - *new_index = index; - return UDS_SUCCESS; -} - -void uds_free_index(struct uds_index *index) -{ - unsigned int i; - - if (index == NULL) - return; - - uds_request_queue_finish(index->triage_queue); - for (i = 0; i < index->zone_count; i++) - uds_request_queue_finish(index->zone_queues[i]); - - free_chapter_writer(index->chapter_writer); - - uds_free_volume_index(index->volume_index); - if (index->zones != NULL) { - for (i = 0; i < index->zone_count; i++) - free_index_zone(index->zones[i]); - uds_free(index->zones); - } - - uds_free_volume(index->volume); - uds_free_index_layout(uds_forget(index->layout)); - uds_free(index); -} - -/* Wait for the chapter writer to complete any outstanding writes. */ -void uds_wait_for_idle_index(struct uds_index *index) -{ - struct chapter_writer *writer = index->chapter_writer; - - mutex_lock(&writer->mutex); - while (writer->zones_to_write > 0) - uds_wait_cond(&writer->cond, &writer->mutex); - mutex_unlock(&writer->mutex); -} - -/* This function assumes that all requests have been drained. */ -int uds_save_index(struct uds_index *index) -{ - int result; - - if (!index->need_to_save) - return UDS_SUCCESS; - - uds_wait_for_idle_index(index); - index->prev_save = index->last_save; - index->last_save = ((index->newest_virtual_chapter == 0) ? - NO_LAST_SAVE : index->newest_virtual_chapter - 1); - uds_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save); - - result = uds_save_index_state(index->layout, index); - if (result != UDS_SUCCESS) { - uds_log_info("save index failed"); - index->last_save = index->prev_save; - } else { - index->has_saved_open_chapter = true; - index->need_to_save = false; - uds_log_info("finished save (vcn %llu)", - (unsigned long long) index->last_save); - } - - return result; -} - -int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev) -{ - return uds_replace_volume_storage(index->volume, index->layout, bdev); -} - -/* Accessing statistics should be safe from any thread. 
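- * The counters are gathered without taking any index locks, so a snapshot may
- * mix values from concurrent updates, but collecting it never blocks request
- * processing.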
*/ -void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters) -{ - struct volume_index_stats stats; - - uds_get_volume_index_stats(index->volume_index, &stats); - counters->entries_indexed = stats.record_count; - counters->collisions = stats.collision_count; - counters->entries_discarded = stats.discard_count; - - counters->memory_used = (index->volume_index->memory_size + - index->volume->cache_size + - index->chapter_writer->memory_size); -} - -void uds_enqueue_request(struct uds_request *request, enum request_stage stage) -{ - struct uds_index *index = request->index; - struct uds_request_queue *queue; - - switch (stage) { - case STAGE_TRIAGE: - if (index->triage_queue != NULL) { - queue = index->triage_queue; - break; - } - - fallthrough; - - case STAGE_INDEX: - request->zone_number = - uds_get_volume_index_zone(index->volume_index, &request->record_name); - fallthrough; - - case STAGE_MESSAGE: - queue = index->zone_queues[request->zone_number]; - break; - - default: - ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage); - return; - } - - uds_request_queue_enqueue(queue, request); -} diff --git a/drivers/md/dm-vdo/index.h b/drivers/md/dm-vdo/index.h deleted file mode 100644 index edabb239548ec..0000000000000 --- a/drivers/md/dm-vdo/index.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_INDEX_H -#define UDS_INDEX_H - -#include "index-layout.h" -#include "index-session.h" -#include "open-chapter.h" -#include "volume.h" -#include "volume-index.h" - -/* - * The index is a high-level structure which represents the totality of the UDS index. It manages - * the queues for incoming requests and dispatches them to the appropriate sub-components like the - * volume or the volume index. It also manages administrative tasks such as saving and loading the - * index. - * - * The index is divided into a number of independent zones and assigns each request to a zone based - * on its name. Most sub-components are similarly divided into zones as well so that requests in - * each zone usually operate without interference or coordination between zones. 
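- *
- * For example, with four zones a request whose name maps to zone 2 is always
- * placed on zone_queues[2], so only that zone's thread ever touches the
- * corresponding open chapter.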
- */
-
-typedef void (*index_callback_fn)(struct uds_request *request);
-
-struct index_zone {
-	struct uds_index *index;
-	struct open_chapter_zone *open_chapter;
-	struct open_chapter_zone *writing_chapter;
-	u64 oldest_virtual_chapter;
-	u64 newest_virtual_chapter;
-	unsigned int id;
-};
-
-struct uds_index {
-	bool has_saved_open_chapter;
-	bool need_to_save;
-	struct index_load_context *load_context;
-	struct index_layout *layout;
-	struct volume_index *volume_index;
-	struct volume *volume;
-	unsigned int zone_count;
-	struct index_zone **zones;
-
-	u64 oldest_virtual_chapter;
-	u64 newest_virtual_chapter;
-
-	u64 last_save;
-	u64 prev_save;
-	struct chapter_writer *chapter_writer;
-
-	index_callback_fn callback;
-	struct uds_request_queue *triage_queue;
-	struct uds_request_queue *zone_queues[];
-};
-
-enum request_stage {
-	STAGE_TRIAGE,
-	STAGE_INDEX,
-	STAGE_MESSAGE,
-};
-
-int __must_check uds_make_index(struct uds_configuration *config,
-				enum uds_open_index_type open_type,
-				struct index_load_context *load_context,
-				index_callback_fn callback, struct uds_index **new_index);
-
-int __must_check uds_save_index(struct uds_index *index);
-
-void uds_free_index(struct uds_index *index);
-
-int __must_check uds_replace_index_storage(struct uds_index *index,
-					   struct block_device *bdev);
-
-void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters);
-
-void uds_enqueue_request(struct uds_request *request, enum request_stage stage);
-
-void uds_wait_for_idle_index(struct uds_index *index);
-
-#endif /* UDS_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer.h b/drivers/md/dm-vdo/indexer.h
deleted file mode 100644
index 3744aaf625b05..0000000000000
--- a/drivers/md/dm-vdo/indexer.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef INDEXER_H
-#define INDEXER_H
-
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/wait.h>
-
-#include "funnel-queue.h"
-
-/*
- * UDS public API
- *
- * The Universal Deduplication System (UDS) is an efficient name-value store. When used for
- * deduplicating storage, the names are generally hashes of data blocks and the associated data is
- * where that block is located on the underlying storage medium. The stored names are expected to
- * be randomly distributed among the space of possible names. If this assumption is violated, the
- * UDS index will store fewer names than normal but will otherwise continue to work. The data
- * associated with each name can be any 16-byte value.
- *
- * A client must first create an index session to interact with an index. Once created, the session
- * can be shared among multiple threads or users. When a session is destroyed, it will also close
- * and save any associated index.
- *
- * To make a request, a client must allocate a uds_request structure and set the required fields
- * before launching it. UDS will invoke the provided callback to complete the request. After the
- * callback has been called, the uds_request structure can be freed or reused for a new request.
- * There are five types of requests:
- *
- * A UDS_UPDATE request will associate the provided name with the provided data. Any previous data
- * associated with that name will be discarded.
- *
- * A UDS_QUERY request will return the data associated with the provided name, if any. The entry
- * for the name will also be marked as most recent, as if the data had been updated.
- *
- * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE.
If there is already data - * associated with the provided name, that data is returned. If there is no existing association, - * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY - * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient. - * - * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will - * not change the recency of the entry for the name. This request is primarily useful for testing, - * to determine whether an entry exists without changing the internal state of the index. - * - * A UDS_DELETE request removes any data associated with the provided name. This operation is - * generally not necessary, because the index will automatically discard its oldest entries once it - * becomes full. - */ - -/* General UDS constants and structures */ - -enum uds_request_type { - /* Create or update the mapping for a name, and make the name most recent. */ - UDS_UPDATE, - - /* Return any mapped data for a name, and make the name most recent. */ - UDS_QUERY, - - /* - * Return any mapped data for a name, or map the provided data to the name if there is no - * current data, and make the name most recent. - */ - UDS_POST, - - /* Return any mapped data for a name without updating its recency. */ - UDS_QUERY_NO_UPDATE, - - /* Remove any mapping for a name. */ - UDS_DELETE, - -}; - -enum uds_open_index_type { - /* Create a new index. */ - UDS_CREATE, - - /* Load an existing index and try to recover if necessary. */ - UDS_LOAD, - - /* Load an existing index, but only if it was saved cleanly. */ - UDS_NO_REBUILD, -}; - -enum { - /* The record name size in bytes */ - UDS_RECORD_NAME_SIZE = 16, - /* The maximum record data size in bytes */ - UDS_RECORD_DATA_SIZE = 16, -}; - -/* - * A type representing a UDS memory configuration which is either a positive integer number of - * gigabytes or one of the six special constants for configurations smaller than one gigabyte. 
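- *
- * For example, memory_size = 2 requests a two-gigabyte index, while
- * UDS_MEMORY_CONFIG_256MB requests a 256 MB one; the REDUCED variants below
- * denote the same sizes with one less chapter.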
- */
-typedef int uds_memory_config_size_t;
-
-enum {
-	/* The maximum configurable amount of memory */
-	UDS_MEMORY_CONFIG_MAX = 1024,
-	/* Flag indicating that the index has one less chapter than usual */
-	UDS_MEMORY_CONFIG_REDUCED = 0x1000,
-	UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED,
-	/* Special values indicating sizes less than 1 GB */
-	UDS_MEMORY_CONFIG_256MB = -256,
-	UDS_MEMORY_CONFIG_512MB = -512,
-	UDS_MEMORY_CONFIG_768MB = -768,
-	UDS_MEMORY_CONFIG_REDUCED_256MB = -1280,
-	UDS_MEMORY_CONFIG_REDUCED_512MB = -1536,
-	UDS_MEMORY_CONFIG_REDUCED_768MB = -1792,
-};
-
-struct uds_record_name {
-	unsigned char name[UDS_RECORD_NAME_SIZE];
-};
-
-struct uds_record_data {
-	unsigned char data[UDS_RECORD_DATA_SIZE];
-};
-
-struct uds_volume_record {
-	struct uds_record_name name;
-	struct uds_record_data data;
-};
-
-struct uds_parameters {
-	/* The block_device used for storage */
-	struct block_device *bdev;
-	/* The maximum allowable size of the index on storage */
-	size_t size;
-	/* The offset where the index should start */
-	off_t offset;
-	/* The maximum memory allocation, in GB */
-	uds_memory_config_size_t memory_size;
-	/* Whether the index should include sparse chapters */
-	bool sparse;
-	/* A 64-bit nonce to validate the index */
-	u64 nonce;
-	/* The number of threads used to process index requests */
-	unsigned int zone_count;
-	/* The number of threads used to read volume pages */
-	unsigned int read_threads;
-};
-
-/*
- * These statistics capture characteristics of the current index, including resource usage and
- * requests processed since the index was opened.
- */
-struct uds_index_stats {
-	/* The total number of records stored in the index */
-	u64 entries_indexed;
-	/* An estimate of the index's memory usage, in bytes */
-	u64 memory_used;
-	/* The number of collisions recorded in the volume index */
-	u64 collisions;
-	/* The number of entries discarded from the index since startup */
-	u64 entries_discarded;
-	/* The time at which these statistics were fetched */
-	s64 current_time;
-	/* The number of post calls that found an existing entry */
-	u64 posts_found;
-	/* The number of post calls that added an entry */
-	u64 posts_not_found;
-	/*
-	 * The number of post calls that found an existing entry that is current enough to only
-	 * exist in memory and not have been committed to disk yet
-	 */
-	u64 in_memory_posts_found;
-	/*
-	 * The number of post calls that found an existing entry in the dense portion of the index
-	 */
-	u64 dense_posts_found;
-	/*
-	 * The number of post calls that found an existing entry in the sparse portion of the index
-	 */
-	u64 sparse_posts_found;
-	/* The number of update calls that updated an existing entry */
-	u64 updates_found;
-	/* The number of update calls that added a new entry */
-	u64 updates_not_found;
-	/* The number of delete requests that deleted an existing entry */
-	u64 deletions_found;
-	/* The number of delete requests that did nothing */
-	u64 deletions_not_found;
-	/* The number of query calls that found an existing entry */
-	u64 queries_found;
-	/* The number of query calls that did not find an entry */
-	u64 queries_not_found;
-	/* The total number of requests processed */
-	u64 requests;
-};
-
-enum uds_index_region {
-	/* No location information has been determined */
-	UDS_LOCATION_UNKNOWN = 0,
-	/* The index page entry has been found */
-	UDS_LOCATION_INDEX_PAGE_LOOKUP,
-	/* The record page entry has been found */
-	UDS_LOCATION_RECORD_PAGE_LOOKUP,
-	/* The record is not in the index */
-	UDS_LOCATION_UNAVAILABLE,
-	/* The record was found in the open chapter */
-	UDS_LOCATION_IN_OPEN_CHAPTER,
-	/* The record was found in the dense part of the index */
-	UDS_LOCATION_IN_DENSE,
-	/* The record was found in the sparse part of the index */
-	UDS_LOCATION_IN_SPARSE,
-} __packed;
-
-/* Zone message requests are used to communicate between index zones. */
-enum uds_zone_message_type {
-	/* A standard request with no message */
-	UDS_MESSAGE_NONE = 0,
-	/* Add a chapter to the sparse chapter index cache */
-	UDS_MESSAGE_SPARSE_CACHE_BARRIER,
-	/* Close a chapter to keep the zone from falling behind */
-	UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
-} __packed;
-
-struct uds_zone_message {
-	/* The type of message, determining how it will be processed */
-	enum uds_zone_message_type type;
-	/* The virtual chapter number to which the message applies */
-	u64 virtual_chapter;
-};
-
-struct uds_index_session;
-struct uds_index;
-struct uds_request;
-
-/* Once this callback has been invoked, the uds_request structure can be reused or freed. */
-typedef void (*uds_request_callback_fn)(struct uds_request *request);
-
-struct uds_request {
-	/* These input fields must be set before launching a request. */
-
-	/* The name of the record to look up or create */
-	struct uds_record_name record_name;
-	/* New data to associate with the record name, if applicable */
-	struct uds_record_data new_metadata;
-	/* A callback to invoke when the request is complete */
-	uds_request_callback_fn callback;
-	/* The index session that will manage this request */
-	struct uds_index_session *session;
-	/* The type of operation to perform, as described above */
-	enum uds_request_type type;
-
-	/* These output fields are set when a request is complete. */
-
-	/* The existing data associated with the request name, if any */
-	struct uds_record_data old_metadata;
-	/* Either UDS_SUCCESS or an error code for the request */
-	int status;
-	/* True if the record name had an existing entry in the index */
-	bool found;
-
-	/*
-	 * The remaining fields are used internally and should not be altered by clients. The index
-	 * relies on zone_number being the first field in this section.
-	 */
-
-	/* The number of the zone which will process this request */
-	unsigned int zone_number;
-	/* A link for adding a request to a lock-free queue */
-	struct funnel_queue_entry queue_link;
-	/* A link for adding a request to a standard linked list */
-	struct uds_request *next_request;
-	/* A pointer to the index processing this request */
-	struct uds_index *index;
-	/* Control message for coordinating between zones */
-	struct uds_zone_message zone_message;
-	/* If true, process request immediately by waking the worker thread */
-	bool unbatched;
-	/* If true, continue this request before processing newer requests */
-	bool requeued;
-	/* The virtual chapter containing the record name, if known */
-	u64 virtual_chapter;
-	/* The region of the index containing the record name */
-	enum uds_index_region location;
-};
-
-/* Compute the number of bytes needed to store an index. */
-int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
-					u64 *index_size);
-
-/* A session is required for most index operations. */
-int __must_check uds_create_index_session(struct uds_index_session **session);
-
-/* Destroying an index session also closes and saves the associated index. */
-int uds_destroy_index_session(struct uds_index_session *session);
-
-/*
- * Create or open an index with an existing session.
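- *
- * A minimal sketch of the usual call sequence (error handling omitted;
- * "bdev", "name", "data", and "my_callback" are hypothetical caller-supplied
- * values, and the request must remain allocated until its callback has run):
- *
- *	struct uds_parameters params = { .bdev = bdev, .memory_size = 1 };
- *	struct uds_index_session *session;
- *	static struct uds_request request;
- *
- *	uds_create_index_session(&session);
- *	uds_open_index(UDS_CREATE, &params, session);
- *	request.record_name = name;
- *	request.new_metadata = data;
- *	request.type = UDS_POST;
- *	request.session = session;
- *	request.callback = my_callback;
- *	uds_launch_request(&request);
- *
- * For uds_open_index() specifically: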
This operation fails if the index session is - * suspended, or if there is already an open index. - */ -int __must_check uds_open_index(enum uds_open_index_type open_type, - const struct uds_parameters *parameters, - struct uds_index_session *session); - -/* - * Wait until all callbacks for index operations are complete, and prevent new index operations - * from starting. New index operations will fail with EBUSY until the session is resumed. Also - * optionally saves the index. - */ -int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save); - -/* - * Allow new index operations for an index, whether it was suspended or not. If the index is - * suspended and the supplied block device differs from the current backing store, the index will - * start using the new backing store instead. - */ -int __must_check uds_resume_index_session(struct uds_index_session *session, - struct block_device *bdev); - -/* Wait until all outstanding index operations are complete. */ -int __must_check uds_flush_index_session(struct uds_index_session *session); - -/* Close an index. This operation fails if the index session is suspended. */ -int __must_check uds_close_index(struct uds_index_session *session); - -/* Get index statistics since the last time the index was opened. */ -int __must_check uds_get_index_session_stats(struct uds_index_session *session, - struct uds_index_stats *stats); - -/* This function will fail if any required field of the request is not set. */ -int __must_check uds_launch_request(struct uds_request *request); - -struct cond_var { - wait_queue_head_t wait_queue; -}; - -static inline void uds_init_cond(struct cond_var *cv) -{ - init_waitqueue_head(&cv->wait_queue); -} - -static inline void uds_signal_cond(struct cond_var *cv) -{ - wake_up(&cv->wait_queue); -} - -static inline void uds_broadcast_cond(struct cond_var *cv) -{ - wake_up_all(&cv->wait_queue); -} - -void uds_wait_cond(struct cond_var *cv, struct mutex *mutex); - -#endif /* INDEXER_H */ diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c new file mode 100644 index 0000000000000..6487825ada906 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "chapter-index.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "hash-utils.h" +#include "indexer.h" + +int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, + const struct index_geometry *geometry, u64 volume_nonce) +{ + int result; + size_t memory_size; + struct open_chapter_index *index; + + result = uds_allocate(1, struct open_chapter_index, "open chapter index", &index); + if (result != UDS_SUCCESS) + return result; + + /* + * The delta index will rebalance delta lists when memory gets tight, + * so give the chapter index one extra page. 
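+	 *
+	 * For example, with a hypothetical geometry of six 4096-byte index
+	 * pages per chapter, this reserves (6 + 1) * 4096 = 28672 bytes
+	 * rather than the 24576 strictly required.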
+ */ + memory_size = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page); + index->geometry = geometry; + index->volume_nonce = volume_nonce; + result = uds_initialize_delta_index(&index->delta_index, 1, + geometry->delta_lists_per_chapter, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + memory_size, 'm'); + if (result != UDS_SUCCESS) { + uds_free(index); + return result; + } + + index->memory_size = index->delta_index.memory_size + sizeof(struct open_chapter_index); + *chapter_index = index; + return UDS_SUCCESS; +} + +void uds_free_open_chapter_index(struct open_chapter_index *chapter_index) +{ + if (chapter_index == NULL) + return; + + uds_uninitialize_delta_index(&chapter_index->delta_index); + uds_free(chapter_index); +} + +/* Re-initialize an open chapter index for a new chapter. */ +void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, + u64 virtual_chapter_number) +{ + uds_reset_delta_index(&chapter_index->delta_index); + chapter_index->virtual_chapter_number = virtual_chapter_number; +} + +static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address) +{ + return (!entry->at_end) && (entry->key == address); +} + +/* Associate a record name with the record page containing its metadata. */ +int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, + const struct uds_record_name *name, + u32 page_number) +{ + int result; + struct delta_index_entry entry; + u32 address; + u32 list_number; + const u8 *found_name; + bool found; + const struct index_geometry *geometry = chapter_index->geometry; + u64 chapter_number = chapter_index->virtual_chapter_number; + u32 record_pages = geometry->record_pages_per_chapter; + + result = ASSERT(page_number < record_pages, + "Page number within chapter (%u) exceeds the maximum value %u", + page_number, record_pages); + if (result != UDS_SUCCESS) + return UDS_INVALID_ARGUMENT; + + address = uds_hash_to_chapter_delta_address(name, geometry); + list_number = uds_hash_to_chapter_delta_list(name, geometry); + result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number, + address, name->name, &entry); + if (result != UDS_SUCCESS) + return result; + + found = was_entry_found(&entry, address); + result = ASSERT(!(found && entry.is_collision), + "Chunk appears more than once in chapter %llu", + (unsigned long long) chapter_number); + if (result != UDS_SUCCESS) + return UDS_BAD_STATE; + + found_name = (found ? name->name : NULL); + return uds_put_delta_index_entry(&entry, address, page_number, found_name); +} + +/* + * Pack a section of an open chapter index into a chapter index page. A range of delta lists + * (starting with a specified list index) is copied from the open chapter index into a memory page. + * The number of lists copied onto the page is returned to the caller on success. 
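+ *
+ * If even a single list will not fit, entries are discarded from candidate
+ * lists (see the removal loop below) until the page can be packed, at the
+ * cost of index coverage for the affected names.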
+ *
+ * @chapter_index: The open chapter index
+ * @memory: The memory page to use
+ * @first_list: The first delta list number to be copied
+ * @last_page: If true, this is the last page of the chapter index and all the remaining lists must
+ *             be packed onto this page
+ * @lists_packed: The number of delta lists that were packed onto this page
+ */
+int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+				     u8 *memory, u32 first_list, bool last_page,
+				     u32 *lists_packed)
+{
+	int result;
+	struct delta_index *delta_index = &chapter_index->delta_index;
+	struct delta_index_stats stats;
+	u64 nonce = chapter_index->volume_nonce;
+	u64 chapter_number = chapter_index->virtual_chapter_number;
+	const struct index_geometry *geometry = chapter_index->geometry;
+	u32 list_count = geometry->delta_lists_per_chapter;
+	unsigned int removals = 0;
+	struct delta_index_entry entry;
+	u32 next_list;
+	s32 list_number;
+
+	for (;;) {
+		result = uds_pack_delta_index_page(delta_index, nonce, memory,
+						   geometry->bytes_per_page,
+						   chapter_number, first_list,
+						   lists_packed);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if ((first_list + *lists_packed) == list_count) {
+			/* All lists are packed. */
+			break;
+		} else if (*lists_packed == 0) {
+			/*
+			 * The next delta list does not fit on a page. This delta list will be
+			 * removed.
+			 */
+		} else if (last_page) {
+			/*
+			 * This is the last page and there are lists left unpacked, but all of the
+			 * remaining lists must fit on the page. Find a list that contains entries
+			 * and remove the entire list. Try the first list that does not fit. If it
+			 * is empty, we will select the last list that already fits and has any
+			 * entries.
+			 */
+		} else {
+			/* This page is done. */
+			break;
+		}
+
+		if (removals == 0) {
+			uds_get_delta_index_stats(delta_index, &stats);
+			uds_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions",
+					(unsigned long long) chapter_number,
+					(unsigned long long) stats.record_count,
+					(unsigned long long) stats.collision_count);
+		}
+
+		list_number = *lists_packed;
+		do {
+			if (list_number < 0)
+				return UDS_OVERFLOW;
+
+			next_list = first_list + list_number--;
+			result = uds_start_delta_index_search(delta_index, next_list, 0,
+							      &entry);
+			if (result != UDS_SUCCESS)
+				return result;
+
+			result = uds_next_delta_index_entry(&entry);
+			if (result != UDS_SUCCESS)
+				return result;
+		} while (entry.at_end);
+
+		do {
+			result = uds_remove_delta_index_entry(&entry);
+			if (result != UDS_SUCCESS)
+				return result;
+
+			removals++;
+		} while (!entry.at_end);
+	}
+
+	if (removals > 0) {
+		uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index",
+				(unsigned long long) chapter_number, removals);
+	}
+
+	return UDS_SUCCESS;
+}
+
+/* Make a new chapter index page, initializing it with the data from a given index_page buffer. */
+int uds_initialize_chapter_index_page(struct delta_index_page *index_page,
+				      const struct index_geometry *geometry,
+				      u8 *page_buffer, u64 volume_nonce)
+{
+	return uds_initialize_delta_index_page(index_page, volume_nonce,
+					       geometry->chapter_mean_delta,
+					       geometry->chapter_payload_bits,
+					       page_buffer, geometry->bytes_per_page);
+}
+
+/*
+ * Validate a chapter index page read during rebuild.
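+ * Walks every delta list on the page; a list that runs past its end, or an
+ * entry that references an impossible record page, indicates a torn or
+ * never-written page, which is expected when rebuilding from a volume that
+ * has not yet been written through even once.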
+ */
+int uds_validate_chapter_index_page(const struct delta_index_page *index_page,
+				    const struct index_geometry *geometry)
+{
+	int result;
+	const struct delta_index *delta_index = &index_page->delta_index;
+	u32 first = index_page->lowest_list_number;
+	u32 last = index_page->highest_list_number;
+	u32 list_number;
+
+	/* We walk every delta list from start to finish. */
+	for (list_number = first; list_number <= last; list_number++) {
+		struct delta_index_entry entry;
+
+		result = uds_start_delta_index_search(delta_index, list_number - first,
+						      0, &entry);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		for (;;) {
+			result = uds_next_delta_index_entry(&entry);
+			if (result != UDS_SUCCESS) {
+				/*
+				 * A random bit stream is highly likely to arrive here when we go
+				 * past the end of the delta list.
+				 */
+				return result;
+			}
+
+			if (entry.at_end)
+				break;
+
+			/* Also make sure that the record page field contains a plausible value. */
+			if (uds_get_delta_entry_value(&entry) >=
+			    geometry->record_pages_per_chapter) {
+				/*
+				 * Do not log this as an error. It happens in normal operation when
+				 * we are doing a rebuild but haven't written the entire volume
+				 * once.
+				 */
+				return UDS_CORRUPT_DATA;
+			}
+		}
+	}
+	return UDS_SUCCESS;
+}
+
+/*
+ * Search a chapter index page for a record name, returning the record page number that may contain
+ * the name.
+ */
+int uds_search_chapter_index_page(struct delta_index_page *index_page,
+				  const struct index_geometry *geometry,
+				  const struct uds_record_name *name,
+				  u16 *record_page_ptr)
+{
+	int result;
+	struct delta_index *delta_index = &index_page->delta_index;
+	u32 address = uds_hash_to_chapter_delta_address(name, geometry);
+	u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry);
+	u32 sub_list_number = delta_list_number - index_page->lowest_list_number;
+	struct delta_index_entry entry;
+
+	result = uds_get_delta_index_entry(delta_index, sub_list_number, address,
+					   name->name, &entry);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (was_entry_found(&entry, address))
+		*record_page_ptr = uds_get_delta_entry_value(&entry);
+	else
+		*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
+
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h
new file mode 100644
index 0000000000000..be8bf2b675b1c
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/chapter-index.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CHAPTER_INDEX_H
+#define UDS_CHAPTER_INDEX_H
+
+#include <linux/limits.h>
+
+#include "delta-index.h"
+#include "geometry.h"
+
+/*
+ * A chapter index for an open chapter is a mutable structure that tracks all the records that have
+ * been added to the chapter. A chapter index for a closed chapter is similar except that it is
+ * immutable because the contents of a closed chapter can never change, and the immutable structure
+ * is more efficient. Both types of chapter index are implemented with a delta index.
+ */
+
+/*
+ * The value returned when no entry is found in the chapter index.
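+ * U16_MAX cannot collide with a real record page number, since
+ * record_pages_per_chapter is far smaller in any supported geometry.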
*/ +#define NO_CHAPTER_INDEX_ENTRY U16_MAX + +struct open_chapter_index { + const struct index_geometry *geometry; + struct delta_index delta_index; + u64 virtual_chapter_number; + u64 volume_nonce; + size_t memory_size; +}; + +int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index, + const struct index_geometry *geometry, + u64 volume_nonce); + +void uds_free_open_chapter_index(struct open_chapter_index *chapter_index); + +void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, + u64 virtual_chapter_number); + +int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, + const struct uds_record_name *name, + u32 page_number); + +int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, + u8 *memory, u32 first_list, + bool last_page, u32 *lists_packed); + +int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + u8 *page_buffer, u64 volume_nonce); + +int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page, + const struct index_geometry *geometry); + +int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + const struct uds_record_name *name, + u16 *record_page_ptr); + +#endif /* UDS_CHAPTER_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c new file mode 100644 index 0000000000000..0bf315e7b5d13 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/config.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "config.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "string-utils.h" +#include "thread-utils.h" + +static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02"; +static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02"; + +enum { + DEFAULT_VOLUME_READ_THREADS = 2, + MAX_VOLUME_READ_THREADS = 16, + INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, + INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1, +}; + +static bool is_version(const u8 *version, u8 *buffer) +{ + return memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0; +} + +static bool are_matching_configurations(struct uds_configuration *saved_config, + struct index_geometry *saved_geometry, + struct uds_configuration *user) +{ + struct index_geometry *geometry = user->geometry; + bool result = true; + + if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) { + uds_log_error("Record pages per chapter (%u) does not match (%u)", + saved_geometry->record_pages_per_chapter, + geometry->record_pages_per_chapter); + result = false; + } + + if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) { + uds_log_error("Chapter count (%u) does not match (%u)", + saved_geometry->chapters_per_volume, + geometry->chapters_per_volume); + result = false; + } + + if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) { + uds_log_error("Sparse chapter count (%u) does not match (%u)", + saved_geometry->sparse_chapters_per_volume, + geometry->sparse_chapters_per_volume); + result = false; + } + + if (saved_config->cache_chapters != user->cache_chapters) { + uds_log_error("Cache size (%u) does not match (%u)", + saved_config->cache_chapters, user->cache_chapters); + result = 
false; + } + + if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) { + uds_log_error("Volume index mean delta (%u) does not match (%u)", + saved_config->volume_index_mean_delta, + user->volume_index_mean_delta); + result = false; + } + + if (saved_geometry->bytes_per_page != geometry->bytes_per_page) { + uds_log_error("Bytes per page value (%zu) does not match (%zu)", + saved_geometry->bytes_per_page, geometry->bytes_per_page); + result = false; + } + + if (saved_config->sparse_sample_rate != user->sparse_sample_rate) { + uds_log_error("Sparse sample rate (%u) does not match (%u)", + saved_config->sparse_sample_rate, + user->sparse_sample_rate); + result = false; + } + + if (saved_config->nonce != user->nonce) { + uds_log_error("Nonce (%llu) does not match (%llu)", + (unsigned long long) saved_config->nonce, + (unsigned long long) user->nonce); + result = false; + } + + return result; +} + +/* Read the configuration and validate it against the provided one. */ +int uds_validate_config_contents(struct buffered_reader *reader, + struct uds_configuration *user_config) +{ + int result; + struct uds_configuration config; + struct index_geometry geometry; + u8 version_buffer[INDEX_CONFIG_VERSION_LENGTH]; + u32 bytes_per_page; + u8 buffer[sizeof(struct uds_configuration_6_02)]; + size_t offset = 0; + + result = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, version_buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot read index config version"); + + if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) && + !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unsupported configuration version: '%.*s'", + INDEX_CONFIG_VERSION_LENGTH, + version_buffer); + } + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot read config data"); + + decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter); + decode_u32_le(buffer, &offset, &geometry.chapters_per_volume); + decode_u32_le(buffer, &offset, &geometry.sparse_chapters_per_volume); + decode_u32_le(buffer, &offset, &config.cache_chapters); + offset += sizeof(u32); + decode_u32_le(buffer, &offset, &config.volume_index_mean_delta); + decode_u32_le(buffer, &offset, &bytes_per_page); + geometry.bytes_per_page = bytes_per_page; + decode_u32_le(buffer, &offset, &config.sparse_sample_rate); + decode_u64_le(buffer, &offset, &config.nonce); + + result = ASSERT(offset == sizeof(struct uds_configuration_6_02), + "%zu bytes read but not decoded", + sizeof(struct uds_configuration_6_02) - offset); + if (result != UDS_SUCCESS) + return UDS_CORRUPT_DATA; + + if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) { + user_config->geometry->remapped_virtual = 0; + user_config->geometry->remapped_physical = 0; + } else { + u8 remapping[sizeof(u64) + sizeof(u64)]; + + result = uds_read_from_buffered_reader(reader, remapping, + sizeof(remapping)); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot read converted config"); + + offset = 0; + decode_u64_le(remapping, &offset, + &user_config->geometry->remapped_virtual); + decode_u64_le(remapping, &offset, + &user_config->geometry->remapped_physical); + } + + if (!are_matching_configurations(&config, &geometry, 
user_config)) { + uds_log_warning("Supplied configuration does not match save"); + return UDS_NO_INDEX; + } + + return UDS_SUCCESS; +} + +/* + * Write the configuration to stable storage. If the superblock version is < 4, write the 6.02 + * version; otherwise write the 8.02 version, indicating the configuration is for an index that has + * been reduced by one chapter. + */ +int uds_write_config_contents(struct buffered_writer *writer, + struct uds_configuration *config, u32 version) +{ + int result; + struct index_geometry *geometry = config->geometry; + u8 buffer[sizeof(struct uds_configuration_8_02)]; + size_t offset = 0; + + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + /* + * If version is < 4, the index has not been reduced by a chapter so it must be written out + * as version 6.02 so that it is still compatible with older versions of UDS. + */ + if (version >= 4) { + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_8_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + } else { + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_6_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + } + + encode_u32_le(buffer, &offset, geometry->record_pages_per_chapter); + encode_u32_le(buffer, &offset, geometry->chapters_per_volume); + encode_u32_le(buffer, &offset, geometry->sparse_chapters_per_volume); + encode_u32_le(buffer, &offset, config->cache_chapters); + encode_u32_le(buffer, &offset, 0); + encode_u32_le(buffer, &offset, config->volume_index_mean_delta); + encode_u32_le(buffer, &offset, geometry->bytes_per_page); + encode_u32_le(buffer, &offset, config->sparse_sample_rate); + encode_u64_le(buffer, &offset, config->nonce); + + result = ASSERT(offset == sizeof(struct uds_configuration_6_02), + "%zu bytes encoded, of %zu expected", offset, + sizeof(struct uds_configuration_6_02)); + if (result != UDS_SUCCESS) + return result; + + if (version >= 4) { + encode_u64_le(buffer, &offset, geometry->remapped_virtual); + encode_u64_le(buffer, &offset, geometry->remapped_physical); + } + + return uds_write_to_buffered_writer(writer, buffer, offset); +} + +/* Compute configuration parameters that depend on memory size. 
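+ * For example, requesting a sparse index below multiplies the dense chapter count by 10 and marks
+ * (19 * base_chapters) / 2 of the resulting chapters sparse, i.e. 95% of them, so the same memory
+ * budget covers roughly 10x as many records.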
*/ +static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse, + u32 *chapters_per_volume, u32 *record_pages_per_chapter, + u32 *sparse_chapters_per_volume) +{ + u32 reduced_chapters = 0; + u32 base_chapters; + + if (mem_gb == UDS_MEMORY_CONFIG_256MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) { + base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) && + (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) { + reduced_chapters = 1; + base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) * + DEFAULT_CHAPTERS_PER_VOLUME); + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + uds_log_error("received invalid memory size"); + return -EINVAL; + } + + if (sparse) { + /* Make 95% of chapters sparse, allowing 10x more records. */ + *sparse_chapters_per_volume = (19 * base_chapters) / 2; + base_chapters *= 10; + } else { + *sparse_chapters_per_volume = 0; + } + + *chapters_per_volume = base_chapters - reduced_chapters; + return UDS_SUCCESS; +} + +static unsigned int __must_check normalize_zone_count(unsigned int requested) +{ + unsigned int zone_count = requested; + + if (zone_count == 0) + zone_count = num_online_cpus() / 2; + + if (zone_count < 1) + zone_count = 1; + + if (zone_count > MAX_ZONES) + zone_count = MAX_ZONES; + + uds_log_info("Using %u indexing zone%s for concurrency.", + zone_count, zone_count == 1 ? 
"" : "s"); + return zone_count; +} + +static unsigned int __must_check normalize_read_threads(unsigned int requested) +{ + unsigned int read_threads = requested; + + if (read_threads < 1) + read_threads = DEFAULT_VOLUME_READ_THREADS; + + if (read_threads > MAX_VOLUME_READ_THREADS) + read_threads = MAX_VOLUME_READ_THREADS; + + return read_threads; +} + +int uds_make_configuration(const struct uds_parameters *params, + struct uds_configuration **config_ptr) +{ + struct uds_configuration *config; + u32 chapters_per_volume = 0; + u32 record_pages_per_chapter = 0; + u32 sparse_chapters_per_volume = 0; + int result; + + result = compute_memory_sizes(params->memory_size, params->sparse, + &chapters_per_volume, &record_pages_per_chapter, + &sparse_chapters_per_volume); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(1, struct uds_configuration, __func__, &config); + if (result != UDS_SUCCESS) + return result; + + result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter, + chapters_per_volume, sparse_chapters_per_volume, + 0, 0, &config->geometry); + if (result != UDS_SUCCESS) { + uds_free_configuration(config); + return result; + } + + config->zone_count = normalize_zone_count(params->zone_count); + config->read_threads = normalize_read_threads(params->read_threads); + + config->cache_chapters = DEFAULT_CACHE_CHAPTERS; + config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA; + config->sparse_sample_rate = (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0); + config->nonce = params->nonce; + config->bdev = params->bdev; + config->offset = params->offset; + config->size = params->size; + + *config_ptr = config; + return UDS_SUCCESS; +} + +void uds_free_configuration(struct uds_configuration *config) +{ + if (config != NULL) { + uds_free_index_geometry(config->geometry); + uds_free(config); + } +} + +void uds_log_configuration(struct uds_configuration *config) +{ + struct index_geometry *geometry = config->geometry; + + uds_log_debug("Configuration:"); + uds_log_debug(" Record pages per chapter: %10u", geometry->record_pages_per_chapter); + uds_log_debug(" Chapters per volume: %10u", geometry->chapters_per_volume); + uds_log_debug(" Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume); + uds_log_debug(" Cache size (chapters): %10u", config->cache_chapters); + uds_log_debug(" Volume index mean delta: %10u", config->volume_index_mean_delta); + uds_log_debug(" Bytes per page: %10zu", geometry->bytes_per_page); + uds_log_debug(" Sparse sample rate: %10u", config->sparse_sample_rate); + uds_log_debug(" Nonce: %llu", (unsigned long long) config->nonce); +} diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h new file mode 100644 index 0000000000000..08507dc2f7a14 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/config.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_CONFIG_H +#define UDS_CONFIG_H + +#include "geometry.h" +#include "indexer.h" +#include "io-factory.h" + +/* + * The uds_configuration records a variety of parameters used to configure a new UDS index. Some + * parameters are provided by the client, while others are fixed or derived from user-supplied + * values. It is created when an index is created, and it is recorded in the index metadata. 
+ */ + +enum { + DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 32, + MAX_ZONES = 16, +}; + +/* A set of configuration parameters for the indexer. */ +struct uds_configuration { + /* Storage device for the index */ + struct block_device *bdev; + + /* The maximum allowable size of the index */ + size_t size; + + /* The offset where the index should start */ + off_t offset; + + /* Parameters for the volume */ + + /* The volume layout */ + struct index_geometry *geometry; + + /* Index owner's nonce */ + u64 nonce; + + /* The number of threads used to process index requests */ + unsigned int zone_count; + + /* The number of threads used to read volume pages */ + unsigned int read_threads; + + /* Size of the page cache and sparse chapter index cache in chapters */ + u32 cache_chapters; + + /* Parameters for the volume index */ + + /* The mean delta for the volume index */ + u32 volume_index_mean_delta; + + /* Sampling rate for sparse indexing */ + u32 sparse_sample_rate; +}; + +/* On-disk structure of data for a version 8.02 index. */ +struct uds_configuration_8_02 { + /* Smaller (16), Small (64) or large (256) indices */ + u32 record_pages_per_chapter; + /* Total number of chapters per volume */ + u32 chapters_per_volume; + /* Number of sparse chapters per volume */ + u32 sparse_chapters_per_volume; + /* Size of the page cache, in chapters */ + u32 cache_chapters; + /* Unused field */ + u32 unused; + /* The volume index mean delta to use */ + u32 volume_index_mean_delta; + /* Size of a page, used for both record pages and index pages */ + u32 bytes_per_page; + /* Sampling rate for sparse indexing */ + u32 sparse_sample_rate; + /* Index owner's nonce */ + u64 nonce; + /* Virtual chapter remapped from physical chapter 0 */ + u64 remapped_virtual; + /* New physical chapter which remapped chapter was moved to */ + u64 remapped_physical; +} __packed; + +/* On-disk structure of data for a version 6.02 index. 
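+ * Unlike the runtime uds_configuration above, this and uds_configuration_8_02 are exact on-disk
+ * layouts; uds_validate_config_contents() and uds_write_config_contents() translate between the
+ * runtime and on-disk forms. The 6.02 layout is identical to uds_configuration_8_02 except that
+ * it lacks the two trailing remapping fields.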
*/
+struct uds_configuration_6_02 {
+	/* Smaller (16), Small (64) or large (256) indices */
+	u32 record_pages_per_chapter;
+	/* Total number of chapters per volume */
+	u32 chapters_per_volume;
+	/* Number of sparse chapters per volume */
+	u32 sparse_chapters_per_volume;
+	/* Size of the page cache, in chapters */
+	u32 cache_chapters;
+	/* Unused field */
+	u32 unused;
+	/* The volume index mean delta to use */
+	u32 volume_index_mean_delta;
+	/* Size of a page, used for both record pages and index pages */
+	u32 bytes_per_page;
+	/* Sampling rate for sparse indexing */
+	u32 sparse_sample_rate;
+	/* Index owner's nonce */
+	u64 nonce;
+} __packed;
+
+int __must_check uds_make_configuration(const struct uds_parameters *params,
+					struct uds_configuration **config_ptr);
+
+void uds_free_configuration(struct uds_configuration *config);
+
+int __must_check uds_validate_config_contents(struct buffered_reader *reader,
+					      struct uds_configuration *config);
+
+int __must_check uds_write_config_contents(struct buffered_writer *writer,
+					   struct uds_configuration *config, u32 version);
+
+void uds_log_configuration(struct uds_configuration *config);
+
+#endif /* UDS_CONFIG_H */
diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c
new file mode 100644
index 0000000000000..4aace707545a2
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/delta-index.c
@@ -0,0 +1,1988 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+#include "delta-index.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cpu.h"
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The entries in a delta index could be stored in a single delta list, but to reduce search times
+ * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
+ * memory managed by the delta_zone structure. The delta_zone can move the data around within its
+ * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
+ * the volume index can contain over a million delta lists, we want to be efficient with the size
+ * of the delta list header information. This information is encoded into 16 bytes per list. The
+ * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
+ * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
+ * sufficient to store the size of a delta list.
+ *
+ * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
+ * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
+ * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
+ * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
+ * number corresponds to its index in the array.
+ *
+ * A standard delta list entry is stored as a fixed length payload (the value) followed by a
+ * variable length key (the delta). A collision entry is used when two block names have the same
+ * delta list address. A collision entry always follows a standard entry for the hash with which it
+ * collides, and is encoded with DELTA == 0 with an additional 256-bit field at the end,
+ * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
+ * indicates a normal entry.
+ *
+ * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
+ * used by small deltas. The Huffman code is specified by three parameters, which can be computed
+ * from the desired mean delta when the index is full. (See compute_coding_constants() for
+ * details.)
+ *
+ * The bit field utilities used to read and write delta entries assume that it is possible to read
+ * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
+ * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
+ * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
+ * stream includes a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
+ * delta list could cause this step to run off the end of the delta_zone memory, so as extra
+ * protection against this happening, the tail guard list is set to all ones.
+ *
+ * The delta_index supports two different forms. The mutable form is created by
+ * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
+ * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
+ * cached) chapter index pages. The immutable form does not allocate delta list headers or
+ * temporary offsets, and thus is somewhat more memory efficient.
+ */
+
+/*
+ * This is the largest field size supported by get_field() and set_field(). Any field that is
+ * larger is not guaranteed to fit in a single byte-aligned u32.
+ */
+enum {
+	MAX_FIELD_BITS = (sizeof(u32) - 1) * BITS_PER_BYTE + 1,
+};
+
+/*
+ * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
+ * is larger is not guaranteed to fit in a single byte-aligned u64.
+ */
+enum {
+	MAX_BIG_FIELD_BITS = (sizeof(u64) - 1) * BITS_PER_BYTE + 1,
+};
+
+/*
+ * This is the number of guard bytes needed at the end of the memory byte array when using the bit
+ * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
+ * bytes beyond the end of the desired field. The definition is written to make it clear how this
+ * value is derived.
+ */
+enum {
+	POST_FIELD_GUARD_BYTES = sizeof(u64) - 1,
+};
+
+/* The number of guard bits that are needed in the tail guard list */
+enum {
+	GUARD_BITS = POST_FIELD_GUARD_BYTES * BITS_PER_BYTE
+};
+
+/*
+ * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
+ * buffer of this size can be used with move_bits().
+ */
+enum {
+	DELTA_LIST_MAX_BYTE_COUNT =
+		((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
+};
+
+/* The number of extra bytes and bits needed to store a collision entry */
+enum {
+	COLLISION_BYTES = UDS_RECORD_NAME_SIZE,
+	COLLISION_BITS = COLLISION_BYTES * BITS_PER_BYTE
+};
+
+/*
+ * Immutable delta lists are packed into pages containing a header that encodes the delta list
+ * information into 19 bits per list (64KB bit offset).
+ */
+
+enum { IMMUTABLE_HEADER_SIZE = 19 };
+
+/*
+ * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
+ * number to increment when the format of the data changes.
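+ *
+ * On disk, each zone is saved as a delta_index_header, followed by a u16 size for every delta
+ * list in the zone, followed by the data of each non-empty list preceded by its
+ * delta_list_save_info. A final save_info record tagged 'z' acts as a guard; see
+ * uds_write_guard_delta_list().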
+ */ + +enum { + MAGIC_SIZE = 8, +}; + +static const char DELTA_INDEX_MAGIC[] = "DI-00002"; + +struct delta_index_header { + char magic[MAGIC_SIZE]; + u32 zone_number; + u32 zone_count; + u32 first_list; + u32 list_count; + u64 record_count; + u64 collision_count; +}; + +/* + * Header data used for immutable delta index pages. This data is followed by the delta list offset + * table. + */ +struct delta_page_header { + /* Externally-defined nonce */ + u64 nonce; + /* The virtual chapter number */ + u64 virtual_chapter_number; + /* Index of the first delta list on the page */ + u16 first_list; + /* Number of delta lists on the page */ + u16 list_count; +} __packed; + +static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list) +{ + return delta_list->start / BITS_PER_BYTE; +} + +static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list) +{ + unsigned int bit_offset = delta_list->start % BITS_PER_BYTE; + + return BITS_TO_BYTES(bit_offset + delta_list->size); +} + +static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first, + u32 last) +{ + struct delta_list *delta_list; + u64 new_start; + + if (first == last) { + /* Only one list is moving, and we know there is space. */ + delta_list = &delta_zone->delta_lists[first]; + new_start = delta_zone->new_offsets[first]; + if (delta_list->start != new_start) { + u64 source; + u64 destination; + + source = get_delta_list_byte_start(delta_list); + delta_list->start = new_start; + destination = get_delta_list_byte_start(delta_list); + memmove(delta_zone->memory + destination, + delta_zone->memory + source, + get_delta_list_byte_size(delta_list)); + } + } else { + /* + * There is more than one list. Divide the problem in half, and use recursive calls + * to process each half. Note that after this computation, first <= middle, and + * middle < last. + */ + u32 middle = (first + last) / 2; + + delta_list = &delta_zone->delta_lists[middle]; + new_start = delta_zone->new_offsets[middle]; + + /* + * The direction that our middle list is moving determines which half of the + * problem must be processed first. + */ + if (new_start > delta_list->start) { + rebalance_delta_zone(delta_zone, middle + 1, last); + rebalance_delta_zone(delta_zone, first, middle); + } else { + rebalance_delta_zone(delta_zone, first, middle); + rebalance_delta_zone(delta_zone, middle + 1, last); + } + } +} + +static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size) +{ + /* Round up so that each zone is a multiple of 64K in size. */ + enum { + ALLOC_BOUNDARY = 64 * 1024, + }; + + return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +void uds_reset_delta_index(const struct delta_index *delta_index) +{ + unsigned int z; + + /* + * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one + * before the first real entry and one after so that we don't need to bounds check the + * array access when calculating preceding and following gap sizes. + */ + for (z = 0; z < delta_index->zone_count; z++) { + u64 list_bits; + u64 spacing; + u64 offset; + unsigned int i; + struct delta_zone *zone = &delta_index->delta_zones[z]; + struct delta_list *delta_lists = zone->delta_lists; + + /* Zeroing the delta list headers initializes the head guard list correctly. */ + memset(delta_lists, 0, + (zone->list_count + 2) * sizeof(struct delta_list)); + + /* Set all the bits in the end guard list. 
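+ * An all-ones tail guard stops the decoder's skip-zeros scan from running off the end of the
+ * zone memory; see the comment at the top of this file.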
*/ + list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS; + delta_lists[zone->list_count + 1].start = list_bits; + delta_lists[zone->list_count + 1].size = GUARD_BITS; + memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0, + POST_FIELD_GUARD_BYTES); + + /* Evenly space out the real delta lists by setting regular offsets. */ + spacing = list_bits / zone->list_count; + offset = spacing / 2; + for (i = 1; i <= zone->list_count; i++) { + delta_lists[i].start = offset; + offset += spacing; + } + + /* Update the statistics. */ + zone->discard_count += zone->record_count; + zone->record_count = 0; + zone->collision_count = 0; + } +} + +/* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by + * three parameters: + * + * MINBITS The number of bits in the smallest code + * BASE The number of values coded using a code of length MINBITS + * INCR The number of values coded by using one additional bit + * + * These parameters are related by this equation: + * + * BASE + INCR == 1 << MINBITS + * + * The math for the Huffman code of an exponential distribution says that + * + * INCR = log(2) * MEAN_DELTA + * + * Then use the smallest MINBITS value so that + * + * (1 << MINBITS) > INCR + * + * And then + * + * BASE = (1 << MINBITS) - INCR + * + * Now the index can generate a code such that + * - The first BASE values code using MINBITS bits. + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - (and so on). + */ +static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys) +{ + /* + * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use + * floating point, use a really good integer approximation. + */ + *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL; + *min_bits = bits_per(*incr_keys + 1); + *min_keys = (1 << *min_bits) - *incr_keys; +} + +void uds_uninitialize_delta_index(struct delta_index *delta_index) +{ + unsigned int z; + + if (delta_index->delta_zones == NULL) + return; + + for (z = 0; z < delta_index->zone_count; z++) { + uds_free(uds_forget(delta_index->delta_zones[z].new_offsets)); + uds_free(uds_forget(delta_index->delta_zones[z].delta_lists)); + uds_free(uds_forget(delta_index->delta_zones[z].memory)); + } + + uds_free(delta_index->delta_zones); + memset(delta_index, 0, sizeof(struct delta_index)); +} + +static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, + u32 first_list, u32 list_count, u32 mean_delta, + u32 payload_bits, u8 tag) +{ + int result; + + result = uds_allocate(size, u8, "delta list", &delta_zone->memory); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(list_count + 2, u64, "delta list temp", + &delta_zone->new_offsets); + if (result != UDS_SUCCESS) + return result; + + /* Allocate the delta lists. 
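+ * The two extra lists are the head and tail guard lists described at the top of this file;
+ * only lists 1 to list_count hold real entries.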
*/ + result = uds_allocate(list_count + 2, struct delta_list, "delta lists", + &delta_zone->delta_lists); + if (result != UDS_SUCCESS) + return result; + + compute_coding_constants(mean_delta, &delta_zone->min_bits, + &delta_zone->min_keys, &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->buffered_writer = NULL; + delta_zone->size = size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = first_list; + delta_zone->list_count = list_count; + delta_zone->tag = tag; + + return UDS_SUCCESS; +} + +int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count, + u32 list_count, u32 mean_delta, u32 payload_bits, + size_t memory_size, u8 tag) +{ + int result; + unsigned int z; + size_t zone_memory; + + result = uds_allocate(zone_count, struct delta_zone, "Delta Index Zones", + &delta_index->delta_zones); + if (result != UDS_SUCCESS) + return result; + + delta_index->zone_count = zone_count; + delta_index->list_count = list_count; + delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count); + delta_index->memory_size = 0; + delta_index->mutable = true; + delta_index->tag = tag; + + for (z = 0; z < zone_count; z++) { + u32 lists_in_zone = delta_index->lists_per_zone; + u32 first_list_in_zone = z * lists_in_zone; + + if (z == zone_count - 1) { + /* + * The last zone gets fewer lists if zone_count doesn't evenly divide + * list_count. We'll have an underflow if the assertion below doesn't hold. + */ + if (delta_index->list_count <= first_list_in_zone) { + uds_uninitialize_delta_index(delta_index); + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "%u delta lists not enough for %u zones", + list_count, zone_count); + } + lists_in_zone = delta_index->list_count - first_list_in_zone; + } + + zone_memory = get_zone_memory_size(zone_count, memory_size); + result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory, + first_list_in_zone, lists_in_zone, + mean_delta, payload_bits, tag); + if (result != UDS_SUCCESS) { + uds_uninitialize_delta_index(delta_index); + return result; + } + + delta_index->memory_size += + (sizeof(struct delta_zone) + zone_memory + + (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64))); + } + + uds_reset_delta_index(delta_index); + return UDS_SUCCESS; +} + +/* Read a bit field from an arbitrary bit boundary. */ +static inline u32 get_field(const u8 *memory, u64 offset, u8 size) +{ + const void *addr = memory + offset / BITS_PER_BYTE; + + return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1); +} + +/* Write a bit field to an arbitrary bit boundary. */ +static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size) +{ + void *addr = memory + offset / BITS_PER_BYTE; + int shift = offset % BITS_PER_BYTE; + u32 data = get_unaligned_le32(addr); + + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + put_unaligned_le32(data, addr); +} + +/* Get the bit offset to the immutable delta list header. */ +static inline u32 get_immutable_header_offset(u32 list_number) +{ + return sizeof(struct delta_page_header) * BITS_PER_BYTE + + list_number * IMMUTABLE_HEADER_SIZE; +} + +/* Get the bit offset to the start of the immutable delta list bit stream. 
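+ * An immutable page is laid out as a delta_page_header, then list_count + 1 offsets of
+ * IMMUTABLE_HEADER_SIZE bits each, then the delta list bit streams; offset i locates the start of
+ * list i, and the final offset marks the end of the last list.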
*/ +static inline u32 get_immutable_start(const u8 *memory, u32 list_number) +{ + return get_field(memory, get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +/* Set the bit offset to the start of the immutable delta list bit stream. */ +static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start) +{ + set_field(start, memory, get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce, + u8 *memory, size_t memory_size) +{ + unsigned int i; + + /* + * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the + * entire volume at least once. + */ + if (nonce != expected_nonce) + return false; + + /* Verify that the number of delta lists can fit in the page. */ + if (list_count > ((memory_size - sizeof(struct delta_page_header)) * + BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE)) + return false; + + /* + * Verify that the first delta list is immediately after the last delta + * list header. + */ + if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1)) + return false; + + /* Verify that the lists are in the correct order. */ + for (i = 0; i < list_count; i++) { + if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1)) + return false; + } + + /* + * Verify that the last list ends on the page, and that there is room + * for the post-field guard bits. + */ + if (get_immutable_start(memory, list_count) > + (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE) + return false; + + /* Verify that the guard bytes are correctly set to all ones. */ + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0) + return false; + } + + /* All verifications passed. */ + return true; +} + +/* Initialize a delta index page to refer to a supplied page. */ +int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page, + u64 expected_nonce, u32 mean_delta, u32 payload_bits, + u8 *memory, size_t memory_size) +{ + u64 nonce; + u64 vcn; + u64 first_list; + u64 list_count; + struct delta_page_header *header = (struct delta_page_header *) memory; + struct delta_zone *delta_zone = &delta_index_page->delta_zone; + const u8 *nonce_addr = (const u8 *) &header->nonce; + const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number; + const u8 *first_list_addr = (const u8 *) &header->first_list; + const u8 *list_count_addr = (const u8 *) &header->list_count; + + /* First assume that the header is little endian. */ + nonce = get_unaligned_le64(nonce_addr); + vcn = get_unaligned_le64(vcn_addr); + first_list = get_unaligned_le16(first_list_addr); + list_count = get_unaligned_le16(list_count_addr); + if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, + memory_size)) { + /* If that fails, try big endian. */ + nonce = get_unaligned_be64(nonce_addr); + vcn = get_unaligned_be64(vcn_addr); + first_list = get_unaligned_be16(first_list_addr); + list_count = get_unaligned_be16(list_count_addr); + if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, + memory_size)) { + /* + * Both attempts failed. Do not log this as an error, because it can happen + * during a rebuild if we haven't written the entire volume at least once. 
+ */ + return UDS_CORRUPT_DATA; + } + } + + delta_index_page->delta_index.delta_zones = delta_zone; + delta_index_page->delta_index.zone_count = 1; + delta_index_page->delta_index.list_count = list_count; + delta_index_page->delta_index.lists_per_zone = list_count; + delta_index_page->delta_index.mutable = false; + delta_index_page->delta_index.tag = 'p'; + delta_index_page->virtual_chapter_number = vcn; + delta_index_page->lowest_list_number = first_list; + delta_index_page->highest_list_number = first_list + list_count - 1; + + compute_coding_constants(mean_delta, &delta_zone->min_bits, + &delta_zone->min_keys, &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->memory = memory; + delta_zone->delta_lists = NULL; + delta_zone->new_offsets = NULL; + delta_zone->buffered_writer = NULL; + delta_zone->size = memory_size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = 0; + delta_zone->list_count = list_count; + delta_zone->tag = 'p'; + + return UDS_SUCCESS; +} + +/* Read a large bit field from an arbitrary bit boundary. */ +static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size) +{ + const void *addr = memory + offset / BITS_PER_BYTE; + + return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1); +} + +/* Write a large bit field to an arbitrary bit boundary. */ +static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size) +{ + void *addr = memory + offset / BITS_PER_BYTE; + u8 shift = offset % BITS_PER_BYTE; + u64 data = get_unaligned_le64(addr); + + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + put_unaligned_le64(data, addr); +} + +/* Set a sequence of bits to all zeros. */ +static inline void set_zero(u8 *memory, u64 offset, u32 size) +{ + if (size > 0) { + u8 *addr = memory + offset / BITS_PER_BYTE; + u8 shift = offset % BITS_PER_BYTE; + u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size; + + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE) + *addr++ = 0; + + if (size > 0) + *addr &= 0xFF << size; + } +} + +/* + * Move several bits from a higher to a lower address, moving the lower addressed bits first. The + * size and memory offsets are measured in bits. + */ +static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + const u8 *source; + u8 *destination; + u8 offset; + u8 count; + u64 field; + + /* Start by moving one field that ends on a to int boundary. */ + count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32))); + field = get_big_field(from, from_offset, count); + set_big_field(field, to, to_offset, count); + from_offset += count; + to_offset += count; + size -= count; + + /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ + offset = from_offset % BITS_PER_TYPE(u32); + source = from + (from_offset - offset) / BITS_PER_BYTE; + destination = to + to_offset / BITS_PER_BYTE; + while (size > MAX_BIG_FIELD_BITS) { + put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); + source += sizeof(u32); + destination += sizeof(u32); + from_offset += BITS_PER_TYPE(u32); + to_offset += BITS_PER_TYPE(u32); + size -= BITS_PER_TYPE(u32); + } + + /* Finish up by moving any remaining bits. 
*/ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move several bits from a lower to a higher address, moving the higher addressed bits first. The + * size and memory offsets are measured in bits. + */ +static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + const u8 *source; + u8 *destination; + u8 offset; + u8 count; + u64 field; + + /* Start by moving one field that begins on a destination int boundary. */ + count = (to_offset + size) % BITS_PER_TYPE(u32); + if (count > 0) { + size -= count; + field = get_big_field(from, from_offset + size, count); + set_big_field(field, to, to_offset + size, count); + } + + /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ + offset = (from_offset + size) % BITS_PER_TYPE(u32); + source = from + (from_offset + size - offset) / BITS_PER_BYTE; + destination = to + (to_offset + size) / BITS_PER_BYTE; + while (size > MAX_BIG_FIELD_BITS) { + source -= sizeof(u32); + destination -= sizeof(u32); + size -= BITS_PER_TYPE(u32); + put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); + } + + /* Finish up by moving any remaining bits. */ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move bits from one field to another. When the fields overlap, behave as if we first move all the + * bits from the source to a temporary value, and then move all the bits from the temporary value + * to the destination. The size and memory offsets are measured in bits. + */ +static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + u64 field; + + /* A small move doesn't require special handling. */ + if (size <= MAX_BIG_FIELD_BITS) { + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } + + return; + } + + if (from_offset > to_offset) + move_bits_down(from, from_offset, to, to_offset, size); + else + move_bits_up(from, from_offset, to, to_offset, size); +} + +/* + * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta + * lists (starting with a specified list index) is copied from the mutable delta index into a + * memory page used in the immutable index. The number of lists copied onto the page is returned in + * list_count. + */ +int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce, + u8 *memory, size_t memory_size, u64 virtual_chapter_number, + u32 first_list, u32 *list_count) +{ + const struct delta_zone *delta_zone; + struct delta_list *delta_lists; + u32 max_lists; + u32 n_lists = 0; + u32 offset; + u32 i; + int free_bits; + int bits; + struct delta_page_header *header; + + delta_zone = &delta_index->delta_zones[0]; + delta_lists = &delta_zone->delta_lists[first_list + 1]; + max_lists = delta_index->list_count - first_list; + + /* + * Compute how many lists will fit on the page. Subtract the size of the fixed header, one + * delta list offset, and the guard bytes from the page size to determine how much space is + * available for delta lists. + */ + free_bits = memory_size * BITS_PER_BYTE; + free_bits -= get_immutable_header_offset(1); + free_bits -= GUARD_BITS; + if (free_bits < IMMUTABLE_HEADER_SIZE) { + /* This page is too small to store any delta lists. 
*/ + return uds_log_error_strerror(UDS_OVERFLOW, + "Chapter Index Page of %zu bytes is too small", + memory_size); + } + + while (n_lists < max_lists) { + /* Each list requires a delta list offset and the list data. */ + bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size; + if (bits > free_bits) + break; + + n_lists++; + free_bits -= bits; + } + + *list_count = n_lists; + + header = (struct delta_page_header *) memory; + put_unaligned_le64(header_nonce, (u8 *) &header->nonce); + put_unaligned_le64(virtual_chapter_number, + (u8 *) &header->virtual_chapter_number); + put_unaligned_le16(first_list, (u8 *) &header->first_list); + put_unaligned_le16(n_lists, (u8 *) &header->list_count); + + /* Construct the delta list offset table. */ + offset = get_immutable_header_offset(n_lists + 1); + set_immutable_start(memory, 0, offset); + for (i = 0; i < n_lists; i++) { + offset += delta_lists[i].size; + set_immutable_start(memory, i + 1, offset); + } + + /* Copy the delta list data onto the memory page. */ + for (i = 0; i < n_lists; i++) { + move_bits(delta_zone->memory, delta_lists[i].start, memory, + get_immutable_start(memory, i), delta_lists[i].size); + } + + /* Set all the bits in the guard bytes. */ + memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + +/* Compute the new offsets of the delta lists. */ +static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index, + size_t growing_size, size_t used_space) +{ + size_t spacing; + u32 i; + struct delta_list *delta_lists = delta_zone->delta_lists; + u32 tail_guard_index = delta_zone->list_count + 1; + + spacing = (delta_zone->size - used_space) / delta_zone->list_count; + delta_zone->new_offsets[0] = 0; + for (i = 0; i <= delta_zone->list_count; i++) { + delta_zone->new_offsets[i + 1] = + (delta_zone->new_offsets[i] + + get_delta_list_byte_size(&delta_lists[i]) + spacing); + delta_zone->new_offsets[i] *= BITS_PER_BYTE; + delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE; + if (i == 0) + delta_zone->new_offsets[i + 1] -= spacing / 2; + if (i + 1 == growing_index) + delta_zone->new_offsets[i + 1] += growing_size; + } + + delta_zone->new_offsets[tail_guard_index] = + (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size); +} + +static void rebalance_lists(struct delta_zone *delta_zone) +{ + struct delta_list *delta_lists; + u32 i; + size_t used_space = 0; + + /* Extend and balance memory to receive the delta lists */ + delta_lists = delta_zone->delta_lists; + for (i = 0; i <= delta_zone->list_count + 1; i++) + used_space += get_delta_list_byte_size(&delta_lists[i]); + + compute_new_list_offsets(delta_zone, 0, 0, used_space); + for (i = 1; i <= delta_zone->list_count + 1; i++) + delta_lists[i].start = delta_zone->new_offsets[i]; +} + +/* Start restoring a delta index from multiple input streams. */ +int uds_start_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int zone_count = reader_count; + u64 record_count = 0; + u64 collision_count = 0; + u32 first_list[MAX_ZONES]; + u32 list_count[MAX_ZONES]; + unsigned int z; + u32 list_next = 0; + const struct delta_zone *delta_zone; + + /* Read and validate each header. 
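+ * Each reader begins with its own delta_index_header; the zones must appear in slot order and
+ * their list ranges must tile the index's lists with no gaps.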
*/ + for (z = 0; z < zone_count; z++) { + struct delta_index_header header; + u8 buffer[sizeof(struct delta_index_header)]; + size_t offset = 0; + + result = uds_read_from_buffered_reader(buffered_readers[z], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta index header"); + } + + memcpy(&header.magic, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + decode_u32_le(buffer, &offset, &header.zone_number); + decode_u32_le(buffer, &offset, &header.zone_count); + decode_u32_le(buffer, &offset, &header.first_list); + decode_u32_le(buffer, &offset, &header.list_count); + decode_u64_le(buffer, &offset, &header.record_count); + decode_u64_le(buffer, &offset, &header.collision_count); + + result = ASSERT(offset == sizeof(struct delta_index_header), + "%zu bytes decoded of %zu expected", offset, + sizeof(struct delta_index_header)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta index header"); + } + + if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file has bad magic number"); + } + + if (zone_count != header.zone_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain mismatched zone counts (%u,%u)", + zone_count, header.zone_count); + } + + if (header.zone_number != z) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index zone %u found in slot %u", + header.zone_number, z); + } + + first_list[z] = header.first_list; + list_count[z] = header.list_count; + record_count += header.record_count; + collision_count += header.collision_count; + + if (first_list[z] != list_next) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file for zone %u starts with list %u instead of list %u", + z, first_list[z], list_next); + } + + list_next += list_count[z]; + } + + if (list_next != delta_index->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %u delta lists instead of %u delta lists", + list_next, delta_index->list_count); + } + + if (collision_count > record_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %llu collisions and %llu records", + (unsigned long long) collision_count, + (unsigned long long) record_count); + } + + uds_reset_delta_index(delta_index); + delta_index->delta_zones[0].record_count = record_count; + delta_index->delta_zones[0].collision_count = collision_count; + + /* Read the delta lists and distribute them to the proper zones. */ + for (z = 0; z < zone_count; z++) { + u32 i; + + delta_index->load_lists[z] = 0; + for (i = 0; i < list_count[z]; i++) { + u16 delta_list_size; + u32 list_number; + unsigned int zone_number; + u8 size_data[sizeof(u16)]; + + result = uds_read_from_buffered_reader(buffered_readers[z], + size_data, + sizeof(size_data)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta index size"); + } + + delta_list_size = get_unaligned_le16(size_data); + if (delta_list_size > 0) + delta_index->load_lists[z] += 1; + + list_number = first_list[z] + i; + zone_number = list_number / delta_index->lists_per_zone; + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + delta_zone->delta_lists[list_number + 1].size = delta_list_size; + } + } + + /* Prepare each zone to start receiving the delta list data. 
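+ * rebalance_lists() spreads the restored list sizes across each zone's memory so that every list
+ * has room for its data before that data arrives.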
*/ + for (z = 0; z < delta_index->zone_count; z++) + rebalance_lists(&delta_index->delta_zones[z]); + + return UDS_SUCCESS; +} + +static int restore_delta_list_to_zone(struct delta_zone *delta_zone, + const struct delta_list_save_info *save_info, + const u8 *data) +{ + struct delta_list *delta_list; + u16 bit_count; + u16 byte_count; + u32 list_number = save_info->index - delta_zone->first_list; + + if (list_number >= delta_zone->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u not in range [%u,%u)", + save_info->index, delta_zone->first_list, + delta_zone->first_list + delta_zone->list_count); + } + + delta_list = &delta_zone->delta_lists[list_number + 1]; + if (delta_list->size == 0) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list number %u", + save_info->index); + } + + bit_count = delta_list->size + save_info->bit_offset; + byte_count = BITS_TO_BYTES(bit_count); + if (save_info->byte_count != byte_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list size %u != %u", + save_info->byte_count, byte_count); + } + + move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start, + delta_list->size); + return UDS_SUCCESS; +} + +static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone, + struct buffered_reader *buffered_reader, u8 *data) +{ + int result; + struct delta_list_save_info save_info; + u8 buffer[sizeof(struct delta_list_save_info)]; + unsigned int new_zone; + + result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta list data"); + } + + save_info = (struct delta_list_save_info) { + .tag = buffer[0], + .bit_offset = buffer[1], + .byte_count = get_unaligned_le16(&buffer[2]), + .index = get_unaligned_le32(&buffer[4]), + }; + + if ((save_info.bit_offset >= BITS_PER_BYTE) || + (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "corrupt delta list data"); + } + + /* Make sure the data is intended for this delta index. */ + if (save_info.tag != delta_index->tag) + return UDS_CORRUPT_DATA; + + if (save_info.index >= delta_index->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u of %u", + save_info.index, + delta_index->list_count); + } + + result = uds_read_from_buffered_reader(buffered_reader, data, + save_info.byte_count); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta list data"); + } + + delta_index->load_lists[load_zone] -= 1; + new_zone = save_info.index / delta_index->lists_per_zone; + return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone], + &save_info, data); +} + +/* Restore delta lists from saved data. 
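+ * Each reader is drained until the count of non-empty lists recorded for it by
+ * uds_start_restoring_delta_index() reaches zero.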
*/ +int uds_finish_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + int saved_result = UDS_SUCCESS; + unsigned int z; + u8 *data; + + result = uds_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + if (result != UDS_SUCCESS) + return result; + + for (z = 0; z < reader_count; z++) { + while (delta_index->load_lists[z] > 0) { + result = restore_delta_list_data(delta_index, z, + buffered_readers[z], data); + if (result != UDS_SUCCESS) { + saved_result = result; + break; + } + } + } + + uds_free(data); + return saved_result; +} + +int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int z; + u8 buffer[sizeof(struct delta_list_save_info)]; + + for (z = 0; z < reader_count; z++) { + result = uds_read_from_buffered_reader(buffered_readers[z], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) + return result; + + if (buffer[0] != 'z') + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +static int flush_delta_list(struct delta_zone *zone, u32 flush_index) +{ + struct delta_list *delta_list; + u8 buffer[sizeof(struct delta_list_save_info)]; + int result; + + delta_list = &zone->delta_lists[flush_index + 1]; + + buffer[0] = zone->tag; + buffer[1] = delta_list->start % BITS_PER_BYTE; + put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]); + put_unaligned_le32(zone->first_list + flush_index, &buffer[4]); + + result = uds_write_to_buffered_writer(zone->buffered_writer, buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, "failed to write delta list memory"); + return result; + } + + result = uds_write_to_buffered_writer(zone->buffered_writer, + zone->memory + get_delta_list_byte_start(delta_list), + get_delta_list_byte_size(delta_list)); + if (result != UDS_SUCCESS) + uds_log_warning_strerror(result, "failed to write delta list memory"); + + return result; +} + +/* Start saving a delta index zone to a buffered output stream. 
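+ * This writes only the zone header and the per-list size table; the list data itself is flushed
+ * later by uds_finish_saving_delta_index().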
*/ +int uds_start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer) +{ + int result; + u32 i; + struct delta_zone *delta_zone; + u8 buffer[sizeof(struct delta_index_header)]; + size_t offset = 0; + + delta_zone = &delta_index->delta_zones[zone_number]; + memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE); + offset += MAGIC_SIZE; + encode_u32_le(buffer, &offset, zone_number); + encode_u32_le(buffer, &offset, delta_index->zone_count); + encode_u32_le(buffer, &offset, delta_zone->first_list); + encode_u32_le(buffer, &offset, delta_zone->list_count); + encode_u64_le(buffer, &offset, delta_zone->record_count); + encode_u64_le(buffer, &offset, delta_zone->collision_count); + + result = ASSERT(offset == sizeof(struct delta_index_header), + "%zu bytes encoded of %zu expected", offset, + sizeof(struct delta_index_header)); + if (result != UDS_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); + if (result != UDS_SUCCESS) + return uds_log_warning_strerror(result, + "failed to write delta index header"); + + for (i = 0; i < delta_zone->list_count; i++) { + u8 data[sizeof(u16)]; + struct delta_list *delta_list; + + delta_list = &delta_zone->delta_lists[i + 1]; + put_unaligned_le16(delta_list->size, data); + result = uds_write_to_buffered_writer(buffered_writer, data, + sizeof(data)); + if (result != UDS_SUCCESS) + return uds_log_warning_strerror(result, + "failed to write delta list size"); + } + + delta_zone->buffered_writer = buffered_writer; + return UDS_SUCCESS; +} + +int uds_finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number) +{ + int result; + int first_error = UDS_SUCCESS; + u32 i; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + delta_zone = &delta_index->delta_zones[zone_number]; + for (i = 0; i < delta_zone->list_count; i++) { + delta_list = &delta_zone->delta_lists[i + 1]; + if (delta_list->size > 0) { + result = flush_delta_list(delta_zone, i); + if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS)) + first_error = result; + } + } + + delta_zone->buffered_writer = NULL; + return first_error; +} + +int uds_write_guard_delta_list(struct buffered_writer *buffered_writer) +{ + int result; + u8 buffer[sizeof(struct delta_list_save_info)]; + + memset(buffer, 0, sizeof(struct delta_list_save_info)); + buffer[0] = 'z'; + + result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + uds_log_warning_strerror(result, "failed to write guard delta list"); + + return UDS_SUCCESS; +} + +size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size) +{ + /* One zone will use at least as much memory as other zone counts. */ + return (sizeof(struct delta_index_header) + + list_count * (sizeof(struct delta_list_save_info) + 1) + + get_zone_memory_size(1, memory_size)); +} + +static int assert_not_at_end(const struct delta_index_entry *delta_entry) +{ + int result = ASSERT(!delta_entry->at_end, + "operation is invalid because the list entry is at the end of the delta list"); + if (result != UDS_SUCCESS) + result = UDS_BAD_STATE; + + return result; +} + +/* + * Prepare to search for an entry in the specified delta list. + * + * This is always the first function to be called when dealing with delta index entries. It is + * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. 
The + * fields of the delta_index_entry argument will be set up for iteration, but will not contain an + * entry from the list. + */ +int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number, + u32 key, struct delta_index_entry *delta_entry) +{ + int result; + unsigned int zone_number; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = ASSERT((list_number < delta_index->list_count), + "Delta list number (%u) is out of range (%u)", list_number, + delta_index->list_count); + if (result != UDS_SUCCESS) + return UDS_CORRUPT_DATA; + + zone_number = list_number / delta_index->lists_per_zone; + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + result = ASSERT((list_number < delta_zone->list_count), + "Delta list number (%u) is out of range (%u) for zone (%u)", + list_number, delta_zone->list_count, zone_number); + if (result != UDS_SUCCESS) + return UDS_CORRUPT_DATA; + + if (delta_index->mutable) { + delta_list = &delta_zone->delta_lists[list_number + 1]; + } else { + u32 end_offset; + + /* + * Translate the immutable delta list header into a temporary + * full delta list header. + */ + delta_list = &delta_entry->temp_delta_list; + delta_list->start = get_immutable_start(delta_zone->memory, list_number); + end_offset = get_immutable_start(delta_zone->memory, list_number + 1); + delta_list->size = end_offset - delta_list->start; + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + if (key > delta_list->save_key) { + delta_entry->key = delta_list->save_key; + delta_entry->offset = delta_list->save_offset; + } else { + delta_entry->key = 0; + delta_entry->offset = 0; + if (key == 0) { + /* + * This usually means we're about to walk the entire delta list, so get all + * of it into the CPU cache. + */ + uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE], + delta_list->size / BITS_PER_BYTE, false); + } + } + + delta_entry->at_end = false; + delta_entry->delta_zone = delta_zone; + delta_entry->delta_list = delta_list; + delta_entry->entry_bits = 0; + delta_entry->is_collision = false; + delta_entry->list_number = list_number; + delta_entry->list_overflow = false; + delta_entry->value_bits = delta_zone->value_bits; + return UDS_SUCCESS; +} + +static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry) +{ + return delta_entry->delta_list->start + delta_entry->offset; +} + +/* + * Decode a delta index entry delta value. The delta_index_entry basically describes the previous + * list entry, and has had its offset field changed to point to the subsequent entry. We decode the + * bit stream and update the delta_list_entry to describe the entry. 
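+ *
+ * For example, feeding the default volume index mean delta of 4096 to compute_coding_constants()
+ * yields min_bits = 12, min_keys = 1257 and incr_keys = 2839, so deltas 0..1256 occupy 12 bits,
+ * deltas 1257..4095 occupy 13 bits, and each additional bit covers another 2839 delta values.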
+ */ +static inline void decode_delta(struct delta_index_entry *delta_entry) +{ + int key_bits; + u32 delta; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + const u8 *memory = delta_zone->memory; + u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + const u8 *addr = memory + delta_offset / BITS_PER_BYTE; + int offset = delta_offset % BITS_PER_BYTE; + u32 data = get_unaligned_le32(addr) >> offset; + + addr += sizeof(u32); + key_bits = delta_zone->min_bits; + delta = data & ((1 << key_bits) - 1); + if (delta >= delta_zone->min_keys) { + data >>= key_bits; + if (data == 0) { + key_bits = sizeof(u32) * BITS_PER_BYTE - offset; + while ((data = get_unaligned_le32(addr)) == 0) { + addr += sizeof(u32); + key_bits += sizeof(u32) * BITS_PER_BYTE; + } + } + key_bits += ffs(data); + delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys); + } + delta_entry->delta = delta; + delta_entry->key += delta; + + /* Check for a collision, a delta of zero after the start. */ + if (unlikely((delta == 0) && (delta_entry->offset > 0))) { + delta_entry->is_collision = true; + delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS; + } else { + delta_entry->is_collision = false; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; + } +} + +noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + const struct delta_list *delta_list; + u32 next_offset; + u16 size; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + delta_list = delta_entry->delta_list; + delta_entry->offset += delta_entry->entry_bits; + size = delta_list->size; + if (unlikely(delta_entry->offset >= size)) { + delta_entry->at_end = true; + delta_entry->delta = 0; + delta_entry->is_collision = false; + result = ASSERT((delta_entry->offset == size), + "next offset past end of delta list"); + if (result != UDS_SUCCESS) + result = UDS_CORRUPT_DATA; + + return result; + } + + decode_delta(delta_entry); + + next_offset = delta_entry->offset + delta_entry->entry_bits; + if (next_offset > size) { + /* + * This is not an assertion because uds_validate_chapter_index_page() wants to + * handle this error. 
+ */ + uds_log_warning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry) +{ + int result; + struct delta_list *delta_list = delta_entry->delta_list; + + result = ASSERT(!delta_entry->is_collision, "entry is not a collision"); + if (result != UDS_SUCCESS) + return result; + + delta_list->save_key = delta_entry->key - delta_entry->delta; + delta_list->save_offset = delta_entry->offset; + return UDS_SUCCESS; +} + +static void set_delta(struct delta_index_entry *delta_entry, u32 delta) +{ + const struct delta_zone *delta_zone = delta_entry->delta_zone; + u32 key_bits = (delta_zone->min_bits + + ((delta_zone->incr_keys - delta_zone->min_keys + delta) / + delta_zone->incr_keys)); + + delta_entry->delta = delta; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; +} + +static void get_collision_name(const struct delta_index_entry *entry, u8 *name) +{ + u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; + const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; + int size = COLLISION_BYTES; + int shift = offset % BITS_PER_BYTE; + + while (--size >= 0) + *name++ = get_unaligned_le16(addr++) >> shift; +} + +static void set_collision_name(const struct delta_index_entry *entry, const u8 *name) +{ + u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; + u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; + int size = COLLISION_BYTES; + int shift = offset % BITS_PER_BYTE; + u16 mask = ~((u16) 0xFF << shift); + u16 data; + + while (--size >= 0) { + data = (get_unaligned_le16(addr) & mask) | (*name++ << shift); + put_unaligned_le16(data, addr++); + } +} + +int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number, + u32 key, const u8 *name, + struct delta_index_entry *delta_entry) +{ + int result; + + result = uds_start_delta_index_search(delta_index, list_number, key, + delta_entry); + if (result != UDS_SUCCESS) + return result; + + do { + result = uds_next_delta_index_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + } while (!delta_entry->at_end && (key > delta_entry->key)); + + result = uds_remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) + return result; + + if (!delta_entry->at_end && (key == delta_entry->key)) { + struct delta_index_entry collision_entry = *delta_entry; + + for (;;) { + u8 full_name[COLLISION_BYTES]; + + result = uds_next_delta_index_entry(&collision_entry); + if (result != UDS_SUCCESS) + return result; + + if (collision_entry.at_end || !collision_entry.is_collision) + break; + + get_collision_name(&collision_entry, full_name); + if (memcmp(full_name, name, COLLISION_BYTES) == 0) { + *delta_entry = collision_entry; + break; + } + } + } + + return UDS_SUCCESS; +} + +int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name) +{ + int result; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(delta_entry->is_collision, + "Cannot get full block name from a non-collision delta index entry"); + if (result != UDS_SUCCESS) + return UDS_BAD_STATE; + + get_collision_name(delta_entry, name); + return UDS_SUCCESS; +} + +u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry) +{ + return get_field(delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), delta_entry->value_bits); +} + 
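+/*
+ * Illustrative sketch (editorial addition, not part of the moved code): walk
+ * one delta list with the search API above and count its collision entries,
+ * using the same error-handling pattern as uds_get_delta_index_entry() above.
+ */
+static int __maybe_unused count_list_collisions(const struct delta_index *delta_index,
+						u32 list_number, u32 *collisions)
+{
+	struct delta_index_entry entry;
+	int result;
+
+	*collisions = 0;
+	/* A key of 0 positions the iterator before the first entry. */
+	result = uds_start_delta_index_search(delta_index, list_number, 0, &entry);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	for (;;) {
+		result = uds_next_delta_index_entry(&entry);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (entry.at_end)
+			return UDS_SUCCESS;
+
+		if (entry.is_collision)
+			(*collisions)++;
+	}
+}
+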
+static int assert_mutable_entry(const struct delta_index_entry *delta_entry) +{ + int result = ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list), + "delta index is mutable"); + if (result != UDS_SUCCESS) + result = UDS_BAD_STATE; + + return result; +} + +int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value) +{ + int result; + u32 value_mask = (1 << delta_entry->value_bits) - 1; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT((value & value_mask) == value, + "Value (%u) being set in a delta index is too large (must fit in %u bits)", + value, delta_entry->value_bits); + if (result != UDS_SUCCESS) + return UDS_INVALID_ARGUMENT; + + set_field(value, delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), delta_entry->value_bits); + return UDS_SUCCESS; +} + +/* + * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated + * by growing_index, then rebalancing the lists in the new chunk. + */ +static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index, + size_t growing_size) +{ + ktime_t start_time; + ktime_t end_time; + struct delta_list *delta_lists; + u32 i; + size_t used_space; + + + /* Calculate the amount of space that is or will be in use. */ + start_time = current_time_ns(CLOCK_MONOTONIC); + delta_lists = delta_zone->delta_lists; + used_space = growing_size; + for (i = 0; i <= delta_zone->list_count + 1; i++) + used_space += get_delta_list_byte_size(&delta_lists[i]); + + if (delta_zone->size < used_space) + return UDS_OVERFLOW; + + /* Compute the new offsets of the delta lists. */ + compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space); + + /* + * When we rebalance the delta list, we will include the end guard list in the rebalancing. + * It contains the end guard data, which must be copied. + */ + rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1); + end_time = current_time_ns(CLOCK_MONOTONIC); + delta_zone->rebalance_count++; + delta_zone->rebalance_time += ktime_sub(end_time, start_time); + return UDS_SUCCESS; +} + +static int insert_bits(struct delta_index_entry *delta_entry, u16 size) +{ + u64 free_before; + u64 free_after; + u64 source; + u64 destination; + u32 count; + bool before_flag; + u8 *memory; + struct delta_zone *delta_zone = delta_entry->delta_zone; + struct delta_list *delta_list = delta_entry->delta_list; + /* Compute bits in use before and after the inserted bits. */ + u32 total_size = delta_list->size; + u32 before_size = delta_entry->offset; + u32 after_size = total_size - delta_entry->offset; + + if (total_size + size > U16_MAX) { + delta_entry->list_overflow = true; + delta_zone->overflow_count++; + return UDS_OVERFLOW; + } + + /* Compute bits available before and after the delta list. */ + free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); + free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); + + if ((size <= free_before) && (size <= free_after)) { + /* + * We have enough space to use either before or after the list. Select the smaller + * amount of data. If it is exactly the same, try to take from the larger amount of + * free space. 
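+		 * For example, inserting near the front of a list moves only the
+		 * few bits before the insertion point into the gap preceding the
+		 * list, instead of shifting the bulk of the list into the gap
+		 * that follows it.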
+ */ + if (before_size < after_size) + before_flag = true; + else if (after_size < before_size) + before_flag = false; + else + before_flag = free_before > free_after; + } else if (size <= free_before) { + /* There is space before but not after. */ + before_flag = true; + } else if (size <= free_after) { + /* There is space after but not before. */ + before_flag = false; + } else { + /* + * Neither of the surrounding spaces is large enough for this request. Extend + * and/or rebalance the delta list memory choosing to move the least amount of + * data. + */ + int result; + u32 growing_index = delta_entry->list_number + 1; + + before_flag = before_size < after_size; + if (!before_flag) + growing_index++; + result = extend_delta_zone(delta_zone, growing_index, + BITS_TO_BYTES(size)); + if (result != UDS_SUCCESS) + return result; + } + + delta_list->size += size; + if (before_flag) { + source = delta_list->start; + destination = source - size; + delta_list->start -= size; + count = before_size; + } else { + source = delta_list->start + delta_entry->offset; + destination = source + size; + count = after_size; + } + + memory = delta_zone->memory; + move_bits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +static void encode_delta(const struct delta_index_entry *delta_entry) +{ + u32 temp; + u32 t1; + u32 t2; + u64 offset; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + u8 *memory = delta_zone->memory; + + offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + if (delta_entry->delta < delta_zone->min_keys) { + set_field(delta_entry->delta, memory, offset, delta_zone->min_bits); + return; + } + + temp = delta_entry->delta - delta_zone->min_keys; + t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys; + t2 = temp / delta_zone->incr_keys; + set_field(t1, memory, offset, delta_zone->min_bits); + set_zero(memory, offset + delta_zone->min_bits, t2); + set_field(1, memory, offset + delta_zone->min_bits + t2, 1); +} + +static void encode_entry(const struct delta_index_entry *delta_entry, u32 value, + const u8 *name) +{ + u8 *memory = delta_entry->delta_zone->memory; + u64 offset = get_delta_entry_offset(delta_entry); + + set_field(value, memory, offset, delta_entry->value_bits); + encode_delta(delta_entry); + if (name != NULL) + set_collision_name(delta_entry, name); +} + +/* + * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must + * be provided. + */ +int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value, + const u8 *name) +{ + int result; + struct delta_zone *delta_zone; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + if (delta_entry->is_collision) { + /* + * The caller wants us to insert a collision entry onto a collision entry. This + * happens when we find a collision and attempt to add the name again to the index. + * This is normally a fatal error unless we are replaying a closed chapter while we + * are rebuilding a volume index. + */ + return UDS_DUPLICATE_NAME; + } + + if (delta_entry->offset < delta_entry->delta_list->save_offset) { + /* + * The saved entry offset is after the new entry and will no longer be valid, so + * replace it with the insertion point. + */ + result = uds_remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) + return result; + } + + if (name != NULL) { + /* Insert a collision entry which is placed after this entry. 
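+		 * The full record name is stored after the entry so that a later
+		 * search can tell colliding records apart.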
*/ + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT((key == delta_entry->key), + "incorrect key for collision entry"); + if (result != UDS_SUCCESS) + return result; + + delta_entry->offset += delta_entry->entry_bits; + set_delta(delta_entry, 0); + delta_entry->is_collision = true; + delta_entry->entry_bits += COLLISION_BITS; + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else if (delta_entry->at_end) { + /* Insert a new entry at the end of the delta list. */ + result = ASSERT((key >= delta_entry->key), "key past end of list"); + if (result != UDS_SUCCESS) + return result; + + set_delta(delta_entry, key - delta_entry->key); + delta_entry->key = key; + delta_entry->at_end = false; + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else { + u16 old_entry_size; + u16 additional_size; + struct delta_index_entry next_entry; + u32 next_value; + + /* + * Insert a new entry which requires the delta in the following entry to be + * updated. + */ + result = ASSERT((key < delta_entry->key), + "key precedes following entry"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT((key >= delta_entry->key - delta_entry->delta), + "key effects following entry's delta"); + if (result != UDS_SUCCESS) + return result; + + old_entry_size = delta_entry->entry_bits; + next_entry = *delta_entry; + next_value = uds_get_delta_entry_value(&next_entry); + set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta)); + delta_entry->key = key; + set_delta(&next_entry, next_entry.key - key); + next_entry.offset += delta_entry->entry_bits; + /* The two new entries are always bigger than the single entry being replaced. */ + additional_size = (delta_entry->entry_bits + + next_entry.entry_bits - old_entry_size); + result = insert_bits(delta_entry, additional_size); + if (result != UDS_SUCCESS) + return result; + + encode_entry(&next_entry, next_value, NULL); + } + + if (result != UDS_SUCCESS) + return result; + + encode_entry(delta_entry, value, name); + delta_zone = delta_entry->delta_zone; + delta_zone->record_count++; + delta_zone->collision_count += delta_entry->is_collision ? 1 : 0; + return UDS_SUCCESS; +} + +static void delete_bits(const struct delta_index_entry *delta_entry, int size) +{ + u64 source; + u64 destination; + u32 count; + bool before_flag; + struct delta_list *delta_list = delta_entry->delta_list; + u8 *memory = delta_entry->delta_zone->memory; + /* Compute bits retained before and after the deleted bits. */ + u32 total_size = delta_list->size; + u32 before_size = delta_entry->offset; + u32 after_size = total_size - delta_entry->offset - size; + + /* + * Determine whether to add to the available space either before or after the delta list. + * We prefer to move the least amount of data. If it is exactly the same, try to add to the + * smaller amount of free space. 
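+	 * For example, deleting near the end of a list moves only the few
+	 * trailing bits, returning the freed space to the gap that follows
+	 * the list.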
+ */ + if (before_size < after_size) { + before_flag = true; + } else if (after_size < before_size) { + before_flag = false; + } else { + u64 free_before = + (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); + u64 free_after = + (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); + + before_flag = (free_before < free_after); + } + + delta_list->size -= size; + if (before_flag) { + source = delta_list->start; + destination = source + size; + delta_list->start += size; + count = before_size; + } else { + destination = delta_list->start + delta_entry->offset; + source = destination + size; + count = after_size; + } + + move_bits(memory, source, memory, destination, count); +} + +int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + struct delta_index_entry next_entry; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + next_entry = *delta_entry; + result = uds_next_delta_index_entry(&next_entry); + if (result != UDS_SUCCESS) + return result; + + delta_zone = delta_entry->delta_zone; + + if (delta_entry->is_collision) { + /* This is a collision entry, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.offset = delta_entry->offset; + delta_zone->collision_count -= 1; + } else if (next_entry.at_end) { + /* This entry is at the end of the list, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.key -= delta_entry->delta; + next_entry.offset = delta_entry->offset; + } else { + /* The delta in the next entry needs to be updated. */ + u32 next_value = uds_get_delta_entry_value(&next_entry); + u16 old_size = delta_entry->entry_bits + next_entry.entry_bits; + + if (next_entry.is_collision) { + next_entry.is_collision = false; + delta_zone->collision_count -= 1; + } + + set_delta(&next_entry, delta_entry->delta + next_entry.delta); + next_entry.offset = delta_entry->offset; + /* The one new entry is always smaller than the two entries being replaced. */ + delete_bits(delta_entry, old_size - next_entry.entry_bits); + encode_entry(&next_entry, next_value, NULL); + } + + delta_zone->record_count--; + delta_zone->discard_count++; + *delta_entry = next_entry; + + delta_list = delta_entry->delta_list; + if (delta_entry->offset < delta_list->save_offset) { + /* The saved entry offset is no longer valid. 
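+		 * Reset the cached search position to the start of the list.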
*/ + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + return UDS_SUCCESS; +} + +void uds_get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats) +{ + unsigned int z; + const struct delta_zone *delta_zone; + + memset(stats, 0, sizeof(struct delta_index_stats)); + for (z = 0; z < delta_index->zone_count; z++) { + delta_zone = &delta_index->delta_zones[z]; + stats->rebalance_time += delta_zone->rebalance_time; + stats->rebalance_count += delta_zone->rebalance_count; + stats->record_count += delta_zone->record_count; + stats->collision_count += delta_zone->collision_count; + stats->discard_count += delta_zone->discard_count; + stats->overflow_count += delta_zone->overflow_count; + stats->list_count += delta_zone->list_count; + } +} + +size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits) +{ + u16 min_bits; + u32 incr_keys; + u32 min_keys; + + compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys); + /* On average, each delta is encoded into about min_bits + 1.5 bits. */ + return entry_count * (payload_bits + min_bits + 1) + entry_count / 2; +} + +u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, + u32 payload_bits, size_t bytes_per_page) +{ + unsigned int bits_per_delta_list; + unsigned int bits_per_page; + size_t bits_per_index; + + /* Compute the expected number of bits needed for all the entries. */ + bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta, + payload_bits); + bits_per_delta_list = bits_per_index / list_count; + + /* Add in the immutable delta list headers. */ + bits_per_index += list_count * IMMUTABLE_HEADER_SIZE; + /* Compute the number of usable bits on an immutable index page. */ + bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE); + /* + * Reduce the bits per page by one immutable delta list header and one delta list to + * account for internal fragmentation. + */ + bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list; + /* Now compute the number of pages needed. */ + return DIV_ROUND_UP(bits_per_index, bits_per_page); +} + +void uds_log_delta_index_entry(struct delta_index_entry *delta_entry) +{ + uds_log_ratelimit(uds_log_info, + "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", + delta_entry->list_number, delta_entry->key, + delta_entry->offset, delta_entry->at_end ? " end" : "", + delta_entry->is_collision ? " collision" : "", + delta_entry->delta_list->size, + delta_entry->list_overflow ? " overflow" : ""); + delta_entry->list_overflow = false; +} diff --git a/drivers/md/dm-vdo/indexer/delta-index.h b/drivers/md/dm-vdo/indexer/delta-index.h new file mode 100644 index 0000000000000..3d2ea19aef616 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/delta-index.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_DELTA_INDEX_H +#define UDS_DELTA_INDEX_H + +#include + +#include "numeric.h" +#include "time-utils.h" + +#include "config.h" +#include "io-factory.h" + +/* + * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the + * value). The entries are sorted by address, and only the delta between successive addresses is + * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are + * therefore exponentially distributed. + * + * A delta_index can either be mutable or immutable depending on its expected use. 
The immutable + * form of a delta index is used for the indexes of closed chapters committed to the volume. The + * mutable form of a delta index is used by the volume index, and also by the chapter index in an + * open chapter. Like the index as a whole, each mutable delta index is divided into a number of + * independent zones. + */ + +struct delta_list { + /* The offset of the delta list start, in bits */ + u64 start; + /* The number of bits in the delta list */ + u16 size; + /* Where the last search "found" the key, in bits */ + u16 save_offset; + /* The key for the record just before save_offset */ + u32 save_key; +}; + +struct delta_zone { + /* The delta list memory */ + u8 *memory; + /* The delta list headers */ + struct delta_list *delta_lists; + /* Temporary starts of delta lists */ + u64 *new_offsets; + /* Buffered writer for saving an index */ + struct buffered_writer *buffered_writer; + /* The size of delta list memory */ + size_t size; + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + u32 rebalance_count; + /* The number of bits in a stored value */ + u8 value_bits; + /* The number of bits in the minimal key code */ + u16 min_bits; + /* The number of keys used in a minimal code */ + u32 min_keys; + /* The number of keys used for another code bit */ + u32 incr_keys; + /* The number of records in the index */ + u64 record_count; + /* The number of collision records */ + u64 collision_count; + /* The number of records removed */ + u64 discard_count; + /* The number of UDS_OVERFLOW errors detected */ + u64 overflow_count; + /* The index of the first delta list */ + u32 first_list; + /* The number of delta lists */ + u32 list_count; + /* Tag belonging to this delta index */ + u8 tag; +} __aligned(L1_CACHE_BYTES); + +struct delta_list_save_info { + /* Tag identifying which delta index this list is in */ + u8 tag; + /* Bit offset of the start of the list data */ + u8 bit_offset; + /* Number of bytes of list data */ + u16 byte_count; + /* The delta list number within the delta index */ + u32 index; +} __packed; + +struct delta_index { + /* The zones */ + struct delta_zone *delta_zones; + /* The number of zones */ + unsigned int zone_count; + /* The number of delta lists */ + u32 list_count; + /* Maximum lists per zone */ + u32 lists_per_zone; + /* Total memory allocated to this index */ + size_t memory_size; + /* The number of non-empty lists at load time per zone */ + u32 load_lists[MAX_ZONES]; + /* True if this index is mutable */ + bool mutable; + /* Tag belonging to this delta index */ + u8 tag; +}; + +/* + * A delta_index_page describes a single page of a chapter index. The delta_index field allows the + * page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter + * index page as a single zone index, and without the need to do an additional memory allocation. + */ +struct delta_index_page { + struct delta_index delta_index; + /* These values are loaded from the delta_page_header */ + u32 lowest_list_number; + u32 highest_list_number; + u64 virtual_chapter_number; + /* This structure describes the single zone of a delta index page. */ + struct delta_zone delta_zone; +}; + +/* + * Notes on the delta_index_entries: + * + * The fields documented as "public" can be read by any code that uses a delta_index. The fields + * documented as "private" carry information between delta_index method calls and should not be + * used outside the delta_index module. 
+ *
+ * (1) The delta_index_entry is used like an iterator when searching a delta list.
+ *
+ * (2) It is also the result of a successful search and can be used to refer to the element found
+ *     by the search.
+ *
+ * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion
+ *     point for a new record.
+ *
+ * (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new
+ *     record at the end of the list.
+ *
+ * (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a
+ *     collision entry in the list, and the delta_list entry can be used as a reference to this
+ *     entry.
+ *
+ * (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a
+ *     non-collision entry in the list. Such delta_list entries can be used as a reference to a
+ *     found entry, or an insertion point for a non-collision entry before this entry, or an
+ *     insertion point for a collision entry that collides with this entry.
+ */
+struct delta_index_entry {
+	/* Public fields */
+	/* The key for this entry */
+	u32 key;
+	/* We are after the last list entry */
+	bool at_end;
+	/* This record is a collision */
+	bool is_collision;
+
+	/* Private fields */
+	/* This delta list overflowed */
+	bool list_overflow;
+	/* The number of bits used for the value */
+	u8 value_bits;
+	/* The number of bits used for the entire entry */
+	u16 entry_bits;
+	/* The delta index zone */
+	struct delta_zone *delta_zone;
+	/* The delta list containing the entry */
+	struct delta_list *delta_list;
+	/* The delta list number */
+	u32 list_number;
+	/* Bit offset of this entry within the list */
+	u16 offset;
+	/* The delta between this and previous entry */
+	u32 delta;
+	/* Temporary delta list for immutable indices */
+	struct delta_list temp_delta_list;
+};
+
+struct delta_index_stats {
+	/* Number of bytes allocated */
+	size_t memory_allocated;
+	/* Nanoseconds spent rebalancing */
+	ktime_t rebalance_time;
+	/* Number of memory rebalances */
+	u32 rebalance_count;
+	/* The number of records in the index */
+	u64 record_count;
+	/* The number of collision records */
+	u64 collision_count;
+	/* The number of records removed */
+	u64 discard_count;
+	/* The number of UDS_OVERFLOW errors detected */
+	u64 overflow_count;
+	/* The number of delta lists */
+	u32 list_count;
+};
+
+int __must_check uds_initialize_delta_index(struct delta_index *delta_index,
+					    unsigned int zone_count, u32 list_count,
+					    u32 mean_delta, u32 payload_bits,
+					    size_t memory_size, u8 tag);
+
+int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
+						 u64 expected_nonce, u32 mean_delta,
+						 u32 payload_bits, u8 *memory,
+						 size_t memory_size);
+
+void uds_uninitialize_delta_index(struct delta_index *delta_index);
+
+void uds_reset_delta_index(const struct delta_index *delta_index);
+
+int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index,
+					   u64 header_nonce, u8 *memory,
+					   size_t memory_size,
+					   u64 virtual_chapter_number, u32 first_list,
+					   u32 *list_count);
+
+int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index,
+						 struct buffered_reader **buffered_readers,
+						 unsigned int reader_count);
+
+int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index,
+						  struct buffered_reader **buffered_readers,
+						  unsigned int reader_count);
+
+int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
+					     unsigned int reader_count);
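+
+/*
+ * Illustrative sketch (editorial addition, not part of the moved code): the
+ * basic mutable-index lifecycle using the declarations above. The list count,
+ * list number, key, and value are arbitrary example numbers.
+ */
+static inline int uds_delta_index_example(void)
+{
+	struct delta_index index;
+	struct delta_index_entry entry;
+	u32 mean_delta = 1 << 16;
+	u32 payload_bits = 6;
+	/*
+	 * A fixed 4 KB arena is ample for this toy example;
+	 * uds_compute_delta_index_size() below gives a principled estimate
+	 * (in bits, not bytes).
+	 */
+	size_t memory_size = 4096;
+	int result;
+
+	result = uds_initialize_delta_index(&index, 1, 64, mean_delta, payload_bits,
+					    memory_size, 'x');
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/*
+	 * A miss leaves the entry positioned at the insertion point; the name
+	 * argument is only needed to distinguish collision chains.
+	 */
+	result = uds_get_delta_index_entry(&index, 7, 12345, NULL, &entry);
+	if (result == UDS_SUCCESS)
+		result = uds_put_delta_index_entry(&entry, 12345, 42, NULL);
+	if (result == UDS_SUCCESS)
+		result = uds_get_delta_index_entry(&index, 7, 12345, NULL, &entry);
+
+	uds_uninitialize_delta_index(&index);
+	return result;
+}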
+ +int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer); + +int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number); + +int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer); + +size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count, + size_t memory_size); + +int __must_check uds_start_delta_index_search(const struct delta_index *delta_index, + u32 list_number, u32 key, + struct delta_index_entry *iterator); + +int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry); + +int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry); + +int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index, + u32 list_number, u32 key, const u8 *name, + struct delta_index_entry *delta_entry); + +int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, + u8 *name); + +u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry); + +int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value); + +int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, + u32 value, const u8 *name); + +int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry); + +void uds_get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats); + +size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, + u32 payload_bits); + +u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, + u32 payload_bits, size_t bytes_per_page); + +void uds_log_delta_index_entry(struct delta_index_entry *delta_entry); + +#endif /* UDS_DELTA_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c new file mode 100644 index 0000000000000..d2b49e39550c9 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "funnel-requestqueue.h" + +#include +#include +#include + +#include "funnel-queue.h" +#include "logger.h" +#include "memory-alloc.h" +#include "thread-utils.h" + +/* + * This queue will attempt to handle requests in reasonably sized batches instead of reacting + * immediately to each new request. The wait time between batches is dynamically adjusted up or + * down to try to balance responsiveness against wasted thread run time. + * + * If the wait time becomes long enough, the queue will become dormant and must be explicitly + * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel + * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a + * wakeup of the worker thread. + * + * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to + * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before + * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or + * more memory barriers have been issued. (Preparing to sleep uses spin locks.) 
Even if the funnel + * queue's "next" field update isn't visible yet to make the entry accessible, its existence will + * kick the worker thread out of dormant mode and back into timer-based mode. + * + * Unbatched requests are used to communicate between different zone threads and will also cause + * the queue to awaken immediately. + */ + +enum { + NANOSECOND = 1, + MICROSECOND = 1000 * NANOSECOND, + MILLISECOND = 1000 * MICROSECOND, + DEFAULT_WAIT_TIME = 20 * MICROSECOND, + MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2, + MAXIMUM_WAIT_TIME = MILLISECOND, + MINIMUM_BATCH = 32, + MAXIMUM_BATCH = 64, +}; + +struct uds_request_queue { + /* Wait queue for synchronizing producers and consumer */ + struct wait_queue_head wait_head; + /* Function to process a request */ + uds_request_queue_processor_fn processor; + /* Queue of new incoming requests */ + struct funnel_queue *main_queue; + /* Queue of old requests to retry */ + struct funnel_queue *retry_queue; + /* The thread id of the worker thread */ + struct thread *thread; + /* True if the worker was started */ + bool started; + /* When true, requests can be enqueued */ + bool running; + /* A flag set when the worker is waiting without a timeout */ + atomic_t dormant; +}; + +static inline struct uds_request *poll_queues(struct uds_request_queue *queue) +{ + struct funnel_queue_entry *entry; + + entry = uds_funnel_queue_poll(queue->retry_queue); + if (entry != NULL) + return container_of(entry, struct uds_request, queue_link); + + entry = uds_funnel_queue_poll(queue->main_queue); + if (entry != NULL) + return container_of(entry, struct uds_request, queue_link); + + return NULL; +} + +static inline bool are_queues_idle(struct uds_request_queue *queue) +{ + return uds_is_funnel_queue_idle(queue->retry_queue) && + uds_is_funnel_queue_idle(queue->main_queue); +} + +/* + * Determine if there is a next request to process, and return it if there is. Also return flags + * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether + * the thread did sleep before returning a new request. + */ +static inline bool dequeue_request(struct uds_request_queue *queue, + struct uds_request **request_ptr, bool *waited_ptr) +{ + struct uds_request *request = poll_queues(queue); + + if (request != NULL) { + *request_ptr = request; + return true; + } + + if (!READ_ONCE(queue->running)) { + /* Wake the worker thread so it can exit. */ + *request_ptr = NULL; + return true; + } + + *request_ptr = NULL; + *waited_ptr = true; + return false; +} + +static void wait_for_request(struct uds_request_queue *queue, bool dormant, + unsigned long timeout, struct uds_request **request, + bool *waited) +{ + if (dormant) { + wait_event_interruptible(queue->wait_head, + (dequeue_request(queue, request, waited) || + !are_queues_idle(queue))); + return; + } + + wait_event_interruptible_hrtimeout(queue->wait_head, + dequeue_request(queue, request, waited), + ns_to_ktime(timeout)); +} + +static void request_queue_worker(void *arg) +{ + struct uds_request_queue *queue = arg; + struct uds_request *request = NULL; + unsigned long time_batch = DEFAULT_WAIT_TIME; + bool dormant = atomic_read(&queue->dormant); + bool waited = false; + long current_batch = 0; + + for (;;) { + wait_for_request(queue, dormant, time_batch, &request, &waited); + if (likely(request != NULL)) { + current_batch++; + queue->processor(request); + } else if (!READ_ONCE(queue->running)) { + break; + } + + if (dormant) { + /* + * The queue has been roused from dormancy. 
Clear the flag so enqueuers can + * stop broadcasting. No fence is needed for this transition. + */ + atomic_set(&queue->dormant, false); + dormant = false; + time_batch = DEFAULT_WAIT_TIME; + } else if (waited) { + /* + * We waited for this request to show up. Adjust the wait time to smooth + * out the batch size. + */ + if (current_batch < MINIMUM_BATCH) { + /* + * If the last batch of requests was too small, increase the wait + * time. + */ + time_batch += time_batch / 4; + if (time_batch >= MAXIMUM_WAIT_TIME) { + atomic_set(&queue->dormant, true); + dormant = true; + } + } else if (current_batch > MAXIMUM_BATCH) { + /* + * If the last batch of requests was too large, decrease the wait + * time. + */ + time_batch -= time_batch / 4; + if (time_batch < MINIMUM_WAIT_TIME) + time_batch = MINIMUM_WAIT_TIME; + } + current_batch = 0; + } + } + + /* + * Ensure that we process any remaining requests that were enqueued before trying to shut + * down. The corresponding write barrier is in uds_request_queue_finish(). + */ + smp_rmb(); + while ((request = poll_queues(queue)) != NULL) + queue->processor(request); +} + +int uds_make_request_queue(const char *queue_name, + uds_request_queue_processor_fn processor, + struct uds_request_queue **queue_ptr) +{ + int result; + struct uds_request_queue *queue; + + result = uds_allocate(1, struct uds_request_queue, __func__, &queue); + if (result != UDS_SUCCESS) + return result; + + queue->processor = processor; + queue->running = true; + atomic_set(&queue->dormant, false); + init_waitqueue_head(&queue->wait_head); + + result = uds_make_funnel_queue(&queue->main_queue); + if (result != UDS_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + result = uds_make_funnel_queue(&queue->retry_queue); + if (result != UDS_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + result = vdo_create_thread(request_queue_worker, queue, queue_name, + &queue->thread); + if (result != UDS_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + queue->started = true; + *queue_ptr = queue; + return UDS_SUCCESS; +} + +static inline void wake_up_worker(struct uds_request_queue *queue) +{ + if (wq_has_sleeper(&queue->wait_head)) + wake_up(&queue->wait_head); +} + +void uds_request_queue_enqueue(struct uds_request_queue *queue, + struct uds_request *request) +{ + struct funnel_queue *sub_queue; + bool unbatched = request->unbatched; + + sub_queue = request->requeued ? queue->retry_queue : queue->main_queue; + uds_funnel_queue_put(sub_queue, &request->queue_link); + + /* + * We must wake the worker thread when it is dormant. A read fence isn't needed here since + * we know the queue operation acts as one. + */ + if (atomic_read(&queue->dormant) || unbatched) + wake_up_worker(queue); +} + +void uds_request_queue_finish(struct uds_request_queue *queue) +{ + if (queue == NULL) + return; + + /* + * This memory barrier ensures that any requests we queued will be seen. The point is that + * when dequeue_request() sees the following update to the running flag, it will also be + * able to see any change we made to a next field in the funnel queue entry. The + * corresponding read barrier is in request_queue_worker(). 
+ */ + smp_wmb(); + WRITE_ONCE(queue->running, false); + + if (queue->started) { + wake_up_worker(queue); + vdo_join_threads(queue->thread); + } + + uds_free_funnel_queue(queue->main_queue); + uds_free_funnel_queue(queue->retry_queue); + uds_free(queue); +} diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.h b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h new file mode 100644 index 0000000000000..9b0f53939b4dd --- /dev/null +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_REQUEST_QUEUE_H +#define UDS_REQUEST_QUEUE_H + +#include "indexer.h" + +/* + * A simple request queue which will handle new requests in the order in which they are received, + * and will attempt to handle requeued requests before new ones. However, the nature of the + * implementation means that it cannot guarantee this ordering; the prioritization is merely a + * hint. + */ + +struct uds_request_queue; + +typedef void (*uds_request_queue_processor_fn)(struct uds_request *); + +int __must_check uds_make_request_queue(const char *queue_name, + uds_request_queue_processor_fn processor, + struct uds_request_queue **queue_ptr); + +void uds_request_queue_enqueue(struct uds_request_queue *queue, + struct uds_request *request); + +void uds_request_queue_finish(struct uds_request_queue *queue); + +#endif /* UDS_REQUEST_QUEUE_H */ diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c new file mode 100644 index 0000000000000..38c18283cdde8 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "geometry.h" + +#include +#include + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "delta-index.h" +#include "indexer.h" + +/* + * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a + * fixed number of fixed-size pages. The volume layout is defined by two constants and four + * parameters. The constants are that index records are 32 bytes long (16-byte block name plus + * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters + * are the number of bytes in a page, the number of record pages in a chapter, the number of + * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can + * derive the rest of the layout and other index properties. + * + * The index volume is sized by its maximum memory footprint. For a dense index, the persistent + * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent + * storage is about 100 times the size of the memory footprint. + * + * For a small index with a memory footprint less than 1GB, there are three possible memory + * configurations: 0.25GB, 0.5GB and 0.75GB. The default geometry for each is 1024 index records + * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter + * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For + * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5 + * GB for the persistent storage and 256 MB of RAM. 
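+ *
+ * (Worked example for that default: 64 record pages of 1024 records each is
+ * 2^16 records per chapter, and 1024 chapters is 2^26 records per volume; at
+ * VDO's 4 KB block size that is 2^26 * 4 KB = 256 GB of deduplication window.)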
+ * + * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024 + * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024 + * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window + * of 1 TB using about 9GB of persistent storage and 1 GB of RAM. + * + * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times + * as many chapters as the corresponding non-sparse volume, which provides 10 times the + * deduplication window while using 10 times as much persistent storage as the equivalent + * non-sparse volume with the same memory footprint. + * + * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters + * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual + * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter. + * This remapping is expressed by storing which virtual chapter was remapped, and which physical + * chapter it was moved to. + */ + +int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, + u32 chapters_per_volume, u32 sparse_chapters_per_volume, + u64 remapped_virtual, u64 remapped_physical, + struct index_geometry **geometry_ptr) +{ + int result; + struct index_geometry *geometry; + + result = uds_allocate(1, struct index_geometry, "geometry", &geometry); + if (result != UDS_SUCCESS) + return result; + + geometry->bytes_per_page = bytes_per_page; + geometry->record_pages_per_chapter = record_pages_per_chapter; + geometry->chapters_per_volume = chapters_per_volume; + geometry->sparse_chapters_per_volume = sparse_chapters_per_volume; + geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume; + geometry->remapped_virtual = remapped_virtual; + geometry->remapped_physical = remapped_physical; + + geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD; + geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter; + geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume; + + geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; + geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1); + /* + * We want 1 delta list for every 64 records in the chapter. + * The "| 077" ensures that the chapter_delta_list_bits computation + * does not underflow. + */ + geometry->chapter_delta_list_bits = + bits_per((geometry->records_per_chapter - 1) | 077) - 6; + geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits; + /* We need enough address bits to achieve the desired mean delta. 
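+	 * Each delta list holds about records_per_chapter / 2^chapter_delta_list_bits
+	 * records, so the mean delta is 2^chapter_address_bits divided by that count,
+	 * which rearranges to the formula below.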
*/ + geometry->chapter_address_bits = + (DEFAULT_CHAPTER_MEAN_DELTA_BITS - + geometry->chapter_delta_list_bits + + bits_per(geometry->records_per_chapter - 1)); + geometry->index_pages_per_chapter = + uds_get_delta_index_page_count(geometry->records_per_chapter, + geometry->delta_lists_per_chapter, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + bytes_per_page); + + geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter; + geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume; + geometry->bytes_per_volume = + bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME); + + *geometry_ptr = geometry; + return UDS_SUCCESS; +} + +int uds_copy_index_geometry(struct index_geometry *source, + struct index_geometry **geometry_ptr) +{ + return uds_make_index_geometry(source->bytes_per_page, + source->record_pages_per_chapter, + source->chapters_per_volume, + source->sparse_chapters_per_volume, + source->remapped_virtual, source->remapped_physical, + geometry_ptr); +} + +void uds_free_index_geometry(struct index_geometry *geometry) +{ + uds_free(geometry); +} + +u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, + u64 virtual_chapter) +{ + u64 delta; + + if (!uds_is_reduced_index_geometry(geometry)) + return virtual_chapter % geometry->chapters_per_volume; + + if (likely(virtual_chapter > geometry->remapped_virtual)) { + delta = virtual_chapter - geometry->remapped_virtual; + if (likely(delta > geometry->remapped_physical)) + return delta % geometry->chapters_per_volume; + else + return delta - 1; + } + + if (virtual_chapter == geometry->remapped_virtual) + return geometry->remapped_physical; + + delta = geometry->remapped_virtual - virtual_chapter; + if (delta < geometry->chapters_per_volume) + return geometry->chapters_per_volume - delta; + + /* This chapter is so old the answer doesn't matter. */ + return 0; +} + +/* Check whether any sparse chapters are in use. */ +bool uds_has_sparse_chapters(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, u64 newest_virtual_chapter) +{ + return uds_is_sparse_index_geometry(geometry) && + ((newest_virtual_chapter - oldest_virtual_chapter + 1) > + geometry->dense_chapters_per_volume); +} + +bool uds_is_chapter_sparse(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, u64 newest_virtual_chapter, + u64 virtual_chapter_number) +{ + return uds_has_sparse_chapters(geometry, oldest_virtual_chapter, + newest_virtual_chapter) && + ((virtual_chapter_number + geometry->dense_chapters_per_volume) <= + newest_virtual_chapter); +} + +/* Calculate how many chapters to expire after opening the newest chapter. */ +u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter) +{ + /* If the index isn't full yet, don't expire anything. */ + if (newest_chapter < geometry->chapters_per_volume) + return 0; + + /* If a chapter is out of order... */ + if (geometry->remapped_physical > 0) { + u64 oldest_chapter = newest_chapter - geometry->chapters_per_volume; + + /* + * ... expire an extra chapter when expiring the moved chapter to free physical + * space for the new chapter ... + */ + if (oldest_chapter == geometry->remapped_virtual) + return 2; + + /* + * ... but don't expire anything when the new chapter will use the physical chapter + * freed by expiring the moved chapter. 
+ */ + if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical)) + return 0; + } + + /* Normally, just expire one. */ + return 1; +} diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h new file mode 100644 index 0000000000000..a2ecdb238cf2d --- /dev/null +++ b/drivers/md/dm-vdo/indexer/geometry.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_GEOMETRY_H +#define UDS_INDEX_GEOMETRY_H + +#include "indexer.h" + +/* + * The index_geometry records parameters that define the layout of a UDS index volume, and the size and + * shape of various index structures. It is created when the index is created, and is referenced by + * many index sub-components. + */ + +struct index_geometry { + /* Size of a chapter page, in bytes */ + size_t bytes_per_page; + /* Number of record pages in a chapter */ + u32 record_pages_per_chapter; + /* Total number of chapters in a volume */ + u32 chapters_per_volume; + /* Number of sparsely-indexed chapters in a volume */ + u32 sparse_chapters_per_volume; + /* Number of bits used to determine delta list numbers */ + u8 chapter_delta_list_bits; + /* Virtual chapter remapped from physical chapter 0 */ + u64 remapped_virtual; + /* New physical chapter where the remapped chapter can be found */ + u64 remapped_physical; + + /* + * The following properties are derived from the ones above, but they are computed and + * recorded as fields for convenience. + */ + /* Total number of pages in a volume, excluding the header */ + u32 pages_per_volume; + /* Total number of bytes in a volume, including the header */ + size_t bytes_per_volume; + /* Number of pages in a chapter */ + u32 pages_per_chapter; + /* Number of index pages in a chapter index */ + u32 index_pages_per_chapter; + /* Number of records that fit on a page */ + u32 records_per_page; + /* Number of records that fit in a chapter */ + u32 records_per_chapter; + /* Number of records that fit in a volume */ + u64 records_per_volume; + /* Number of delta lists per chapter index */ + u32 delta_lists_per_chapter; + /* Mean delta for chapter indexes */ + u32 chapter_mean_delta; + /* Number of bits needed for record page numbers */ + u8 chapter_payload_bits; + /* Number of bits used to compute addresses for chapter delta lists */ + u8 chapter_address_bits; + /* Number of densely-indexed chapters in a volume */ + u32 dense_chapters_per_volume; +}; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /* The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /* The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /* The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /* The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /* The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /* The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /* The log2 of the number of delta lists in a small chapter */ + 
SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /* The number of header pages per volume */ + HEADER_PAGES_PER_VOLUME = 1, +}; + +int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, + u32 chapters_per_volume, + u32 sparse_chapters_per_volume, u64 remapped_virtual, + u64 remapped_physical, + struct index_geometry **geometry_ptr); + +int __must_check uds_copy_index_geometry(struct index_geometry *source, + struct index_geometry **geometry_ptr); + +void uds_free_index_geometry(struct index_geometry *geometry); + +u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, + u64 virtual_chapter); + +/* + * Check whether this geometry is reduced by a chapter. This will only be true if the volume was + * converted from a non-lvm volume to an lvm volume. + */ +static inline bool __must_check +uds_is_reduced_index_geometry(const struct index_geometry *geometry) +{ + return !!(geometry->chapters_per_volume & 1); +} + +static inline bool __must_check +uds_is_sparse_index_geometry(const struct index_geometry *geometry) +{ + return geometry->sparse_chapters_per_volume > 0; +} + +bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, + u64 newest_virtual_chapter); + +bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, + u64 newest_virtual_chapter, + u64 virtual_chapter_number); + +u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry, + u64 newest_chapter); + +#endif /* UDS_INDEX_GEOMETRY_H */ diff --git a/drivers/md/dm-vdo/indexer/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h new file mode 100644 index 0000000000000..6a8dd8ffea6ce --- /dev/null +++ b/drivers/md/dm-vdo/indexer/hash-utils.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_HASH_UTILS_H +#define UDS_HASH_UTILS_H + +#include "numeric.h" + +#include "geometry.h" +#include "indexer.h" + +/* Utilities for extracting portions of a request name for various uses. */ + +/* How various portions of a record name are apportioned. */ +enum { + VOLUME_INDEX_BYTES_OFFSET = 0, + VOLUME_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_OFFSET = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_OFFSET = 14, + SAMPLE_BYTES_COUNT = 2, +}; + +static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name) +{ + const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET]; + u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32; + + bytes |= get_unaligned_be32(chapter_bits + 2); + return bytes; +} + +static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name) +{ + return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]); +} + +static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name) +{ + return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/* Compute the chapter delta list for a given name. */ +static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name, + const struct index_geometry *geometry) +{ + return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) & + ((1 << geometry->chapter_delta_list_bits) - 1)); +} + +/* Compute the chapter delta address for a given name. 
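+ * The address is the low chapter_address_bits of the chapter index bytes;
+ * uds_hash_to_chapter_delta_list() above selects the bits just above them.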
*/ +static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name, + const struct index_geometry *geometry) +{ + return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1); +} + +static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name, + unsigned int slot_count) +{ + return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count); +} + +#endif /* UDS_HASH_UTILS_H */ diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c new file mode 100644 index 0000000000000..af533aa270a86 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -0,0 +1,1769 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-layout.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "murmurhash3.h" +#include "numeric.h" +#include "time-utils.h" + +#include "config.h" +#include "open-chapter.h" +#include "volume-index.h" + +/* + * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of + * which are computed when the index is created. Every header and region begins on 4K block + * boundary. Save regions are further sub-divided into regions of their own. + * + * Each region has a kind and an instance number. Some kinds only have one instance and therefore + * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to + * represent sub-indices; now, however there is only ever one sub-index and therefore one instance. + * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved. + * + * Every region header has a type and version. + * + * +-+-+---------+--------+--------+-+ + * | | | I N D E X 0 101, 0 | | + * |H|C+---------+--------+--------+S| + * |D|f| Volume | Save | Save |e| + * |R|g| Region | Region | Region |a| + * | | | 201, -1 | 202, 0 | 202, 1 |l| + * +-+-+--------+---------+--------+-+ + * + * The header contains the encoded region layout table as well as some index configuration data. + * The sub-index region and its subdivisions are maintained in the same table. + * + * There are two save regions to preserve the old state in case saving the new state is incomplete. + * They are used in alternation. Each save region is further divided into sub-regions. + * + * +-+-----+------+------+-----+-----+ + * |H| IPM | MI | MI | | OC | + * |D| | zone | zone | ... | | + * |R| 301 | 302 | 302 | | 303 | + * | | -1 | 0 | 1 | | -1 | + * +-+-----+------+------+-----+-----+ + * + * The header contains the encoded region layout table as well as index state data for that save. + * Each save also has a unique nonce. + */ + +enum { + MAGIC_SIZE = 32, + NONCE_INFO_SIZE = 32, + MAX_SAVES = 2, +}; + +enum region_kind { + RL_KIND_EMPTY = 0, + RL_KIND_HEADER = 1, + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_VOLUME_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, +}; + +/* Some region types are historical and are no longer used. */ +enum region_type { + RH_TYPE_FREE = 0, /* unused */ + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, /* unused */ + RH_TYPE_UNSAVED = 4, +}; + +enum { + RL_SOLE_INSTANCE = 65535, +}; + +/* + * Super block version 2 is the first released version. + * + * Super block version 3 is the normal version used from RHEL 8.2 onwards. 
+ * + * Super block versions 4 through 6 were incremental development versions and + * are not supported. + * + * Super block version 7 is used for volumes which have been reduced in size by one chapter in + * order to make room to prepend LVM metadata to a volume originally created without lvm. This + * allows the index to retain most its deduplication records. + */ +enum { + SUPER_VERSION_MINIMUM = 3, + SUPER_VERSION_CURRENT = 3, + SUPER_VERSION_MAXIMUM = 7, +}; + +static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ + +struct region_header { + u64 magic; + u64 region_blocks; + u16 type; + /* Currently always version 1 */ + u16 version; + u16 region_count; + u16 payload; +}; + +struct layout_region { + u64 start_block; + u64 block_count; + u32 __unused; + u16 kind; + u16 instance; +}; + +struct region_table { + size_t encoded_size; + struct region_header header; + struct layout_region regions[]; +}; + +struct index_save_data { + u64 timestamp; + u64 nonce; + /* Currently always version 1 */ + u32 version; + u32 unused__; +}; + +struct index_state_version { + s32 signature; + s32 version_id; +}; + +static const struct index_state_version INDEX_STATE_VERSION_301 = { + .signature = -1, + .version_id = 301, +}; + +struct index_state_data301 { + struct index_state_version version; + u64 newest_chapter; + u64 oldest_chapter; + u64 last_save; + u32 unused; + u32 padding; +}; + +struct index_save_layout { + unsigned int zone_count; + struct layout_region index_save; + struct layout_region header; + struct layout_region index_page_map; + struct layout_region free_space; + struct layout_region volume_index_zones[MAX_ZONES]; + struct layout_region open_chapter; + struct index_save_data save_data; + struct index_state_data301 state_data; +}; + +struct sub_index_layout { + u64 nonce; + struct layout_region sub_index; + struct layout_region volume; + struct index_save_layout *saves; +}; + +struct super_block_data { + u8 magic_label[MAGIC_SIZE]; + u8 nonce_info[NONCE_INFO_SIZE]; + u64 nonce; + u32 version; + u32 block_size; + u16 index_count; + u16 max_saves; + /* Padding reflects a blank field on permanent storage */ + u8 padding[4]; + u64 open_chapter_blocks; + u64 page_map_blocks; + u64 volume_offset; + u64 start_offset; +}; + +struct index_layout { + struct io_factory *factory; + size_t factory_size; + off_t offset; + struct super_block_data super; + struct layout_region header; + struct layout_region config; + struct sub_index_layout index; + struct layout_region seal; + u64 total_blocks; +}; + +struct save_layout_sizes { + unsigned int save_count; + size_t block_size; + u64 volume_blocks; + u64 volume_index_blocks; + u64 page_map_blocks; + u64 open_chapter_blocks; + u64 save_blocks; + u64 sub_index_blocks; + u64 total_blocks; + size_t total_size; +}; + +static inline bool is_converted_super_block(struct super_block_data *super) +{ + return super->version == 7; +} + +static int __must_check compute_sizes(const struct uds_configuration *config, + struct save_layout_sizes *sls) +{ + int result; + struct index_geometry *geometry = config->geometry; + + memset(sls, 0, sizeof(*sls)); + sls->save_count = MAX_SAVES; + sls->block_size = UDS_BLOCK_SIZE; + sls->volume_blocks = geometry->bytes_per_volume / sls->block_size; + + result = uds_compute_volume_index_save_blocks(config, sls->block_size, + &sls->volume_index_blocks); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot compute 
index save size"); + + sls->page_map_blocks = + DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry), + sls->block_size); + sls->open_chapter_blocks = + DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry), + sls->block_size); + sls->save_blocks = + 1 + (sls->volume_index_blocks + sls->page_map_blocks + sls->open_chapter_blocks); + sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks); + sls->total_blocks = 3 + sls->sub_index_blocks; + sls->total_size = sls->total_blocks * sls->block_size; + + return UDS_SUCCESS; +} + +int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) +{ + int result; + struct uds_configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + uds_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = uds_make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "cannot compute index size"); + return uds_status_to_errno(result); + } + + result = compute_sizes(index_config, &sizes); + uds_free_configuration(index_config); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + +/* Create unique data using the current time and a pseudorandom number. */ +static void create_unique_nonce_data(u8 *buffer) +{ + ktime_t now = current_time_ns(CLOCK_REALTIME); + u32 rand; + size_t offset = 0; + + get_random_bytes(&rand, sizeof(u32)); + memcpy(buffer + offset, &now, sizeof(now)); + offset += sizeof(now); + memcpy(buffer + offset, &rand, sizeof(rand)); + offset += sizeof(rand); + while (offset < NONCE_INFO_SIZE) { + size_t len = min(NONCE_INFO_SIZE - offset, offset); + + memcpy(buffer + offset, buffer, len); + offset += len; + } +} + +static u64 hash_stuff(u64 start, const void *data, size_t len) +{ + u32 seed = start ^ (start >> 27); + u8 hash_buffer[16]; + + murmurhash3_128(data, len, seed, hash_buffer); + return get_unaligned_le64(hash_buffer + 4); +} + +/* Generate a primary nonce from the provided data. */ +static u64 generate_primary_nonce(const void *data, size_t len) +{ + return hash_stuff(0xa1b1e0fc, data, len); +} + +/* + * Deterministically generate a secondary nonce from an existing nonce and some arbitrary data by + * hashing the original nonce and the data to produce a new nonce. 
+ */ +static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len) +{ + return hash_stuff(nonce + 1, data, len); +} + +static int __must_check open_layout_reader(struct index_layout *layout, + struct layout_region *lr, off_t offset, + struct buffered_reader **reader_ptr) +{ + return uds_make_buffered_reader(layout->factory, lr->start_block + offset, + lr->block_count, reader_ptr); +} + +static int open_region_reader(struct index_layout *layout, struct layout_region *region, + struct buffered_reader **reader_ptr) +{ + return open_layout_reader(layout, region, -layout->super.start_offset, + reader_ptr); +} + +static int __must_check open_layout_writer(struct index_layout *layout, + struct layout_region *lr, off_t offset, + struct buffered_writer **writer_ptr) +{ + return uds_make_buffered_writer(layout->factory, lr->start_block + offset, + lr->block_count, writer_ptr); +} + +static int open_region_writer(struct index_layout *layout, struct layout_region *region, + struct buffered_writer **writer_ptr) +{ + return open_layout_writer(layout, region, -layout->super.start_offset, + writer_ptr); +} + +static void generate_super_block_data(struct save_layout_sizes *sls, + struct super_block_data *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE); + create_unique_nonce_data(super->nonce_info); + + super->nonce = generate_primary_nonce(super->nonce_info, + sizeof(super->nonce_info)); + super->version = SUPER_VERSION_CURRENT; + super->block_size = sls->block_size; + super->index_count = 1; + super->max_saves = sls->save_count; + super->open_chapter_blocks = sls->open_chapter_blocks; + super->page_map_blocks = sls->page_map_blocks; + super->volume_offset = 0; + super->start_offset = 0; +} + +static void define_sub_index_nonce(struct index_layout *layout) +{ + struct sub_index_nonce_data { + u64 offset; + u16 index_id; + }; + struct sub_index_layout *sil = &layout->index; + u64 primary_nonce = layout->super.nonce; + u8 buffer[sizeof(struct sub_index_nonce_data)] = { 0 }; + size_t offset = 0; + + encode_u64_le(buffer, &offset, sil->sub_index.start_block); + encode_u16_le(buffer, &offset, 0); + sil->nonce = generate_secondary_nonce(primary_nonce, buffer, sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generate_secondary_nonce(~primary_nonce + 1, buffer, + sizeof(buffer)); + } +} + +static void setup_sub_index(struct index_layout *layout, u64 start_block, + struct save_layout_sizes *sls) +{ + struct sub_index_layout *sil = &layout->index; + u64 next_block = start_block; + unsigned int i; + + sil->sub_index = (struct layout_region) { + .start_block = start_block, + .block_count = sls->sub_index_blocks, + .kind = RL_KIND_INDEX, + .instance = 0, + }; + + sil->volume = (struct layout_region) { + .start_block = next_block, + .block_count = sls->volume_blocks, + .kind = RL_KIND_VOLUME, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += sls->volume_blocks; + + for (i = 0; i < sls->save_count; i++) { + sil->saves[i].index_save = (struct layout_region) { + .start_block = next_block, + .block_count = sls->save_blocks, + .kind = RL_KIND_SAVE, + .instance = i, + }; + + next_block += sls->save_blocks; + } + + define_sub_index_nonce(layout); +} + +static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls) +{ + u64 next_block = layout->offset / sls->block_size; + + layout->total_blocks = sls->total_blocks; + generate_super_block_data(sls, &layout->super); + layout->header = (struct layout_region) { + 
.start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + layout->config = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_CONFIG, + .instance = RL_SOLE_INSTANCE, + }; + + setup_sub_index(layout, next_block, sls); + next_block += sls->sub_index_blocks; + + layout->seal = (struct layout_region) { + .start_block = next_block, + .block_count = 1, + .kind = RL_KIND_SEAL, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check make_index_save_region_table(struct index_save_layout *isl, + struct region_table **table_ptr) +{ + int result; + unsigned int z; + struct region_table *table; + struct layout_region *lr; + u16 region_count; + size_t payload; + size_t type; + + if (isl->zone_count > 0) { + /* + * Normal save regions: header, page map, volume index zones, + * open chapter, and possibly free space. + */ + region_count = 3 + isl->zone_count; + if (isl->free_space.block_count > 0) + region_count++; + + payload = sizeof(isl->save_data) + sizeof(isl->state_data); + type = RH_TYPE_SAVE; + } else { + /* Empty save regions: header, page map, free space. */ + region_count = 3; + payload = sizeof(isl->save_data); + type = RH_TYPE_UNSAVED; + } + + result = uds_allocate_extended(struct region_table, region_count, + struct layout_region, + "layout region table for ISL", &table); + if (result != UDS_SUCCESS) + return result; + + lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->index_page_map; + for (z = 0; z < isl->zone_count; z++) + *lr++ = isl->volume_index_zones[z]; + + if (isl->zone_count > 0) + *lr++ = isl->open_chapter; + + if (isl->free_space.block_count > 0) + *lr++ = isl->free_space; + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = isl->index_save.block_count, + .type = type, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + table->encoded_size = (sizeof(struct region_header) + payload + + region_count * sizeof(struct layout_region)); + *table_ptr = table; + return UDS_SUCCESS; +} + +static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table) +{ + unsigned int i; + + encode_u64_le(buffer, offset, REGION_MAGIC); + encode_u64_le(buffer, offset, table->header.region_blocks); + encode_u16_le(buffer, offset, table->header.type); + encode_u16_le(buffer, offset, table->header.version); + encode_u16_le(buffer, offset, table->header.region_count); + encode_u16_le(buffer, offset, table->header.payload); + + for (i = 0; i < table->header.region_count; i++) { + encode_u64_le(buffer, offset, table->regions[i].start_block); + encode_u64_le(buffer, offset, table->regions[i].block_count); + encode_u32_le(buffer, offset, 0); + encode_u16_le(buffer, offset, table->regions[i].kind); + encode_u16_le(buffer, offset, table->regions[i].instance); + } +} + +static int __must_check write_index_save_header(struct index_save_layout *isl, + struct region_table *table, + struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + + result = uds_allocate(table->encoded_size, u8, "index save data", &buffer); + if (result != UDS_SUCCESS) + return result; + + encode_region_table(buffer, &offset, table); + encode_u64_le(buffer, &offset, isl->save_data.timestamp); + encode_u64_le(buffer, &offset, isl->save_data.nonce); + encode_u32_le(buffer, &offset, isl->save_data.version); + encode_u32_le(buffer, &offset, 0); + if (isl->zone_count > 0) { + encode_u32_le(buffer, &offset, 
INDEX_STATE_VERSION_301.signature); + encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id); + encode_u64_le(buffer, &offset, isl->state_data.newest_chapter); + encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter); + encode_u64_le(buffer, &offset, isl->state_data.last_save); + encode_u64_le(buffer, &offset, 0); + } + + result = uds_write_to_buffered_writer(writer, buffer, offset); + uds_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +static int write_index_save_layout(struct index_layout *layout, + struct index_save_layout *isl) +{ + int result; + struct region_table *table; + struct buffered_writer *writer; + + result = make_index_save_region_table(isl, &table); + if (result != UDS_SUCCESS) + return result; + + result = open_region_writer(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + uds_free(table); + return result; + } + + result = write_index_save_header(isl, table, writer); + uds_free(table); + uds_free_buffered_writer(writer); + + return result; +} + +static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks) +{ + u64 free_blocks; + u64 next_block = isl->index_save.start_block; + + isl->zone_count = 0; + memset(&isl->save_data, 0, sizeof(isl->save_data)); + + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += page_map_blocks; + + free_blocks = isl->index_save.block_count - page_map_blocks - 1; + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check invalidate_old_save(struct index_layout *layout, + struct index_save_layout *isl) +{ + reset_index_save_layout(isl, layout->super.page_map_blocks); + return write_index_save_layout(layout, isl); +} + +static int discard_index_state_data(struct index_layout *layout) +{ + int result; + int saved_result = UDS_SUCCESS; + unsigned int i; + + for (i = 0; i < layout->super.max_saves; i++) { + result = invalidate_old_save(layout, &layout->index.saves[i]); + if (result != UDS_SUCCESS) + saved_result = result; + } + + if (saved_result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "%s: cannot destroy all index saves", + __func__); + } + + return UDS_SUCCESS; +} + +static int __must_check make_layout_region_table(struct index_layout *layout, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + /* Regions: header, config, index, volume, saves, seal */ + u16 region_count = 5 + layout->super.max_saves; + u16 payload; + struct region_table *table; + struct layout_region *lr; + + result = uds_allocate_extended(struct region_table, region_count, + struct layout_region, "layout region table", + &table); + if (result != UDS_SUCCESS) + return result; + + lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + *lr++ = layout->index.sub_index; + *lr++ = layout->index.volume; + + for (i = 0; i < layout->super.max_saves; i++) + *lr++ = layout->index.saves[i].index_save; + + *lr++ = layout->seal; + + if (is_converted_super_block(&layout->super)) { + payload = sizeof(struct super_block_data); + } else { + payload = (sizeof(struct 
super_block_data) - + sizeof(layout->super.volume_offset) - + sizeof(layout->super.start_offset)); + } + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = layout->total_blocks, + .type = RH_TYPE_SUPER, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + table->encoded_size = (sizeof(struct region_header) + payload + + region_count * sizeof(struct layout_region)); + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check write_layout_header(struct index_layout *layout, + struct region_table *table, + struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + + result = uds_allocate(table->encoded_size, u8, "layout data", &buffer); + if (result != UDS_SUCCESS) + return result; + + encode_region_table(buffer, &offset, table); + memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE); + offset += MAGIC_SIZE; + memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE); + offset += NONCE_INFO_SIZE; + encode_u64_le(buffer, &offset, layout->super.nonce); + encode_u32_le(buffer, &offset, layout->super.version); + encode_u32_le(buffer, &offset, layout->super.block_size); + encode_u16_le(buffer, &offset, layout->super.index_count); + encode_u16_le(buffer, &offset, layout->super.max_saves); + encode_u32_le(buffer, &offset, 0); + encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks); + encode_u64_le(buffer, &offset, layout->super.page_map_blocks); + + if (is_converted_super_block(&layout->super)) { + encode_u64_le(buffer, &offset, layout->super.volume_offset); + encode_u64_le(buffer, &offset, layout->super.start_offset); + } + + result = uds_write_to_buffered_writer(writer, buffer, offset); + uds_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +static int __must_check write_uds_index_config(struct index_layout *layout, + struct uds_configuration *config, + off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + + result = open_layout_writer(layout, &layout->config, offset, &writer); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "failed to open config region"); + + result = uds_write_config_contents(writer, config, layout->super.version); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return uds_log_error_strerror(result, "failed to write config region"); + } + + result = uds_flush_buffered_writer(writer); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return uds_log_error_strerror(result, "cannot flush config writer"); + } + + uds_free_buffered_writer(writer); + return UDS_SUCCESS; +} + +static int __must_check save_layout(struct index_layout *layout, off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + struct region_table *table; + + result = make_layout_region_table(layout, &table); + if (result != UDS_SUCCESS) + return result; + + result = open_layout_writer(layout, &layout->header, offset, &writer); + if (result != UDS_SUCCESS) { + uds_free(table); + return result; + } + + result = write_layout_header(layout, table, writer); + uds_free(table); + uds_free_buffered_writer(writer); + + return result; +} + +static int create_index_layout(struct index_layout *layout, struct uds_configuration *config) +{ + int result; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(sizes.save_count, struct 
index_save_layout, __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) + return result; + + initialize_layout(layout, &sizes); + + result = discard_index_state_data(layout); + if (result != UDS_SUCCESS) + return result; + + result = write_uds_index_config(layout, config, 0); + if (result != UDS_SUCCESS) + return result; + + return save_layout(layout, 0); +} + +static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl) +{ + struct save_nonce_data { + struct index_save_data data; + u64 offset; + } nonce_data; + u8 buffer[sizeof(nonce_data)]; + size_t offset = 0; + + encode_u64_le(buffer, &offset, isl->save_data.timestamp); + encode_u64_le(buffer, &offset, 0); + encode_u32_le(buffer, &offset, isl->save_data.version); + encode_u32_le(buffer, &offset, 0U); + encode_u64_le(buffer, &offset, isl->index_save.start_block); + ASSERT_LOG_ONLY(offset == sizeof(nonce_data), + "%zu bytes encoded of %zu expected", offset, sizeof(nonce_data)); + return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer)); +} + +static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce) +{ + if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0)) + return 0; + + if (isl->save_data.nonce != generate_index_save_nonce(volume_nonce, isl)) + return 0; + + return isl->save_data.timestamp; +} + +static int find_latest_uds_index_save_slot(struct index_layout *layout, + struct index_save_layout **isl_ptr) +{ + struct index_save_layout *latest = NULL; + struct index_save_layout *isl; + unsigned int i; + u64 save_time = 0; + u64 latest_time = 0; + + for (i = 0; i < layout->super.max_saves; i++) { + isl = &layout->index.saves[i]; + save_time = validate_index_save_layout(isl, layout->index.nonce); + if (save_time > latest_time) { + latest = isl; + latest_time = save_time; + } + } + + if (latest == NULL) { + uds_log_error("No valid index save found"); + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + + *isl_ptr = latest; + return UDS_SUCCESS; +} + +int uds_discard_open_chapter(struct index_layout *layout) +{ + int result; + struct index_save_layout *isl; + struct buffered_writer *writer; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) + return result; + + result = open_region_writer(layout, &isl->open_chapter, &writer); + if (result != UDS_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return result; + } + + result = uds_flush_buffered_writer(writer); + uds_free_buffered_writer(writer); + return result; +} + +int uds_load_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_reader *readers[MAX_ZONES]; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) + return result; + + index->newest_virtual_chapter = isl->state_data.newest_chapter; + index->oldest_virtual_chapter = isl->state_data.oldest_chapter; + index->last_save = isl->state_data.last_save; + + result = open_region_reader(layout, &isl->open_chapter, &readers[0]); + if (result != UDS_SUCCESS) + return result; + + result = uds_load_open_chapter(index, readers[0]); + uds_free_buffered_reader(readers[0]); + if (result != UDS_SUCCESS) + return result; + + for (zone = 0; zone < isl->zone_count; zone++) { + result = open_region_reader(layout, &isl->volume_index_zones[zone], + &readers[zone]); + if (result != 
UDS_SUCCESS) { + for (; zone > 0; zone--) + uds_free_buffered_reader(readers[zone - 1]); + + return result; + } + } + + result = uds_load_volume_index(index->volume_index, readers, isl->zone_count); + for (zone = 0; zone < isl->zone_count; zone++) + uds_free_buffered_reader(readers[zone]); + if (result != UDS_SUCCESS) + return result; + + result = open_region_reader(layout, &isl->index_page_map, &readers[0]); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_index_page_map(index->volume->index_page_map, readers[0]); + uds_free_buffered_reader(readers[0]); + + return result; +} + +static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout) +{ + struct index_save_layout *oldest = NULL; + struct index_save_layout *isl; + unsigned int i; + u64 save_time = 0; + u64 oldest_time = 0; + + for (i = 0; i < layout->super.max_saves; i++) { + isl = &layout->index.saves[i]; + save_time = validate_index_save_layout(isl, layout->index.nonce); + if (oldest == NULL || save_time < oldest_time) { + oldest = isl; + oldest_time = save_time; + } + } + + return oldest; +} + +static void instantiate_index_save_layout(struct index_save_layout *isl, + struct super_block_data *super, + u64 volume_nonce, unsigned int zone_count) +{ + unsigned int z; + u64 next_block; + u64 free_blocks; + u64 volume_index_blocks; + + isl->zone_count = zone_count; + memset(&isl->save_data, 0, sizeof(isl->save_data)); + isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME)); + isl->save_data.version = 1; + isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl); + + next_block = isl->index_save.start_block; + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = super->page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + next_block += super->page_map_blocks; + + free_blocks = (isl->index_save.block_count - 1 - + super->page_map_blocks - + super->open_chapter_blocks); + volume_index_blocks = free_blocks / isl->zone_count; + for (z = 0; z < isl->zone_count; z++) { + isl->volume_index_zones[z] = (struct layout_region) { + .start_block = next_block, + .block_count = volume_index_blocks, + .kind = RL_KIND_VOLUME_INDEX, + .instance = z, + }; + + next_block += volume_index_blocks; + free_blocks -= volume_index_blocks; + } + + isl->open_chapter = (struct layout_region) { + .start_block = next_block, + .block_count = super->open_chapter_blocks, + .kind = RL_KIND_OPEN_CHAPTER, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += super->open_chapter_blocks; + + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int setup_uds_index_save_slot(struct index_layout *layout, + unsigned int zone_count, + struct index_save_layout **isl_ptr) +{ + int result; + struct index_save_layout *isl; + + isl = select_oldest_index_save_layout(layout); + result = invalidate_old_save(layout, isl); + if (result != UDS_SUCCESS) + return result; + + instantiate_index_save_layout(isl, &layout->super, layout->index.nonce, + zone_count); + + *isl_ptr = isl; + return UDS_SUCCESS; +} + +static void cancel_uds_index_save(struct index_save_layout *isl) +{ + memset(&isl->save_data, 0, sizeof(isl->save_data)); + memset(&isl->state_data, 0, 
sizeof(isl->state_data)); + isl->zone_count = 0; +} + +int uds_save_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_writer *writers[MAX_ZONES]; + + result = setup_uds_index_save_slot(layout, index->zone_count, &isl); + if (result != UDS_SUCCESS) + return result; + + isl->state_data = (struct index_state_data301) { + .newest_chapter = index->newest_virtual_chapter, + .oldest_chapter = index->oldest_virtual_chapter, + .last_save = index->last_save, + }; + + result = open_region_writer(layout, &isl->open_chapter, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = uds_save_open_chapter(index, writers[0]); + uds_free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + for (zone = 0; zone < index->zone_count; zone++) { + result = open_region_writer(layout, &isl->volume_index_zones[zone], + &writers[zone]); + if (result != UDS_SUCCESS) { + for (; zone > 0; zone--) + uds_free_buffered_writer(writers[zone - 1]); + + cancel_uds_index_save(isl); + return result; + } + } + + result = uds_save_volume_index(index->volume_index, writers, index->zone_count); + for (zone = 0; zone < index->zone_count; zone++) + uds_free_buffered_writer(writers[zone]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = open_region_writer(layout, &isl->index_page_map, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = uds_write_index_page_map(index->volume->index_page_map, writers[0]); + uds_free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + return write_index_save_layout(layout, isl); +} + +static int __must_check load_region_table(struct buffered_reader *reader, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + struct region_header header; + struct region_table *table; + u8 buffer[sizeof(struct region_header)]; + size_t offset = 0; + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot read region table header"); + + decode_u64_le(buffer, &offset, &header.magic); + decode_u64_le(buffer, &offset, &header.region_blocks); + decode_u16_le(buffer, &offset, &header.type); + decode_u16_le(buffer, &offset, &header.version); + decode_u16_le(buffer, &offset, &header.region_count); + decode_u16_le(buffer, &offset, &header.payload); + + if (header.magic != REGION_MAGIC) + return UDS_NO_INDEX; + + if (header.version != 1) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown region table version %hu", + header.version); + } + + result = uds_allocate_extended(struct region_table, header.region_count, + struct layout_region, + "single file layout region table", &table); + if (result != UDS_SUCCESS) + return result; + + table->header = header; + for (i = 0; i < header.region_count; i++) { + u8 region_buffer[sizeof(struct layout_region)]; + + offset = 0; + result = uds_read_from_buffered_reader(reader, region_buffer, + sizeof(region_buffer)); + if (result != UDS_SUCCESS) { + uds_free(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "cannot read region table layouts"); + } + + decode_u64_le(region_buffer, &offset, &table->regions[i].start_block); + decode_u64_le(region_buffer, &offset, 
&table->regions[i].block_count); + offset += sizeof(u32); + decode_u16_le(region_buffer, &offset, &table->regions[i].kind); + decode_u16_le(region_buffer, &offset, &table->regions[i].instance); + } + + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check read_super_block_data(struct buffered_reader *reader, + struct index_layout *layout, + size_t saved_size) +{ + int result; + struct super_block_data *super = &layout->super; + u8 *buffer; + size_t offset = 0; + + result = uds_allocate(saved_size, u8, "super block data", &buffer); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, buffer, saved_size); + if (result != UDS_SUCCESS) { + uds_free(buffer); + return uds_log_error_strerror(result, "cannot read region table header"); + } + + memcpy(&super->magic_label, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE); + offset += NONCE_INFO_SIZE; + decode_u64_le(buffer, &offset, &super->nonce); + decode_u32_le(buffer, &offset, &super->version); + decode_u32_le(buffer, &offset, &super->block_size); + decode_u16_le(buffer, &offset, &super->index_count); + decode_u16_le(buffer, &offset, &super->max_saves); + offset += sizeof(u32); + decode_u64_le(buffer, &offset, &super->open_chapter_blocks); + decode_u64_le(buffer, &offset, &super->page_map_blocks); + + if (is_converted_super_block(super)) { + decode_u64_le(buffer, &offset, &super->volume_offset); + decode_u64_le(buffer, &offset, &super->start_offset); + } else { + super->volume_offset = 0; + super->start_offset = 0; + } + + uds_free(buffer); + + if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unknown superblock magic label"); + + if ((super->version < SUPER_VERSION_MINIMUM) || + (super->version == 4) || (super->version == 5) || (super->version == 6) || + (super->version > SUPER_VERSION_MAXIMUM)) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %u", + super->version); + } + + if (super->volume_offset < super->start_offset) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent offsets (start %llu, volume %llu)", + (unsigned long long) super->start_offset, + (unsigned long long) super->volume_offset); + } + + /* Sub-indexes are no longer used but the layout retains this field. 
*/ + if (super->index_count != 1) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "invalid subindex count %u", + super->index_count); + } + + if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +static int __must_check verify_region(struct layout_region *lr, u64 start_block, + enum region_kind kind, unsigned int instance) +{ + if (lr->start_block != start_block) + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region offset"); + + if (lr->kind != kind) + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region kind"); + + if (lr->instance != instance) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region instance"); + } + + return UDS_SUCCESS; +} + +static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block, + struct region_table *table) +{ + int result; + unsigned int i; + struct sub_index_layout *sil = &layout->index; + u64 next_block = start_block; + + sil->sub_index = table->regions[2]; + result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0); + if (result != UDS_SUCCESS) + return result; + + define_sub_index_nonce(layout); + + sil->volume = table->regions[3]; + result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += sil->volume.block_count + layout->super.volume_offset; + + for (i = 0; i < layout->super.max_saves; i++) { + sil->saves[i].index_save = table->regions[i + 4]; + result = verify_region(&sil->saves[i].index_save, next_block, + RL_KIND_SAVE, i); + if (result != UDS_SUCCESS) + return result; + + next_block += sil->saves[i].index_save.block_count; + } + + next_block -= layout->super.volume_offset; + if (next_block != start_block + sil->sub_index.block_count) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "sub index region does not span all saves"); + } + + return UDS_SUCCESS; +} + +static int __must_check reconstitute_layout(struct index_layout *layout, + struct region_table *table, u64 first_block) +{ + int result; + u64 next_block = first_block; + + result = uds_allocate(layout->super.max_saves, struct index_save_layout, + __func__, &layout->index.saves); + if (result != UDS_SUCCESS) + return result; + + layout->total_blocks = table->header.region_blocks; + + layout->header = table->regions[0]; + result = verify_region(&layout->header, next_block++, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + layout->config = table->regions[1]; + result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + result = verify_sub_index(layout, next_block, table); + if (result != UDS_SUCCESS) + return result; + + next_block += layout->index.sub_index.block_count; + + layout->seal = table->regions[table->header.region_count - 1]; + result = verify_region(&layout->seal, next_block + layout->super.volume_offset, + RL_KIND_SEAL, RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + if (++next_block != (first_block + layout->total_blocks)) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "layout table does not span total blocks"); + } + + return UDS_SUCCESS; +} + +static int __must_check load_super_block(struct index_layout *layout, size_t block_size, + u64 first_block, struct buffered_reader 
*reader) +{ + int result; + struct region_table *table = NULL; + struct super_block_data *super = &layout->super; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) + return result; + + if (table->header.type != RH_TYPE_SUPER) { + uds_free(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "not a superblock region table"); + } + + result = read_super_block_data(reader, layout, table->header.payload); + if (result != UDS_SUCCESS) { + uds_free(table); + return uds_log_error_strerror(result, "unknown superblock format"); + } + + if (super->block_size != block_size) { + uds_free(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "superblock saved block_size %u differs from supplied block_size %zu", + super->block_size, block_size); + } + + first_block -= (super->volume_offset - super->start_offset); + result = reconstitute_layout(layout, table, first_block); + uds_free(table); + return result; +} + +static int __must_check read_index_save_data(struct buffered_reader *reader, + struct index_save_layout *isl, + size_t saved_size) +{ + int result; + struct index_state_version file_version; + u8 buffer[sizeof(struct index_save_data) + sizeof(struct index_state_data301)]; + size_t offset = 0; + + if (saved_size != sizeof(buffer)) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save data size %zu", + saved_size); + } + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "cannot read index save data"); + + decode_u64_le(buffer, &offset, &isl->save_data.timestamp); + decode_u64_le(buffer, &offset, &isl->save_data.nonce); + decode_u32_le(buffer, &offset, &isl->save_data.version); + offset += sizeof(u32); + + if (isl->save_data.version > 1) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown index save version number %u", + isl->save_data.version); + } + + decode_s32_le(buffer, &offset, &file_version.signature); + decode_s32_le(buffer, &offset, &file_version.version_id); + + if ((file_version.signature != INDEX_STATE_VERSION_301.signature) || + (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "index state version %d,%d is unsupported", + file_version.signature, + file_version.version_id); + } + + decode_u64_le(buffer, &offset, &isl->state_data.newest_chapter); + decode_u64_le(buffer, &offset, &isl->state_data.oldest_chapter); + decode_u64_le(buffer, &offset, &isl->state_data.last_save); + /* Skip past some historical fields that are now unused */ + offset += sizeof(u32) + sizeof(u32); + return UDS_SUCCESS; +} + +static int __must_check reconstruct_index_save(struct index_save_layout *isl, + struct region_table *table) +{ + int result; + unsigned int z; + struct layout_region *last_region; + u64 next_block = isl->index_save.start_block; + u64 last_block = next_block + isl->index_save.block_count; + + isl->zone_count = table->header.region_count - 3; + + last_region = &table->regions[table->header.region_count - 1]; + if (last_region->kind == RL_KIND_EMPTY) { + isl->free_space = *last_region; + isl->zone_count--; + } else { + isl->free_space = (struct layout_region) { + .start_block = last_block, + .block_count = 0, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; + } + + isl->header = table->regions[0]; + result = verify_region(&isl->header, next_block++, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + 
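	/*
	 * Worked example (illustrative): a save written with two zones and a
	 * non-empty free-space region carries the regions header, index page
	 * map, volume index zone 0, volume index zone 1, open chapter, and
	 * empty, so region_count is 6 and the zone_count computed above works
	 * out to 6 - 3 - 1 = 2.
	 */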
isl->index_page_map = table->regions[1]; + result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->index_page_map.block_count; + + for (z = 0; z < isl->zone_count; z++) { + isl->volume_index_zones[z] = table->regions[z + 2]; + result = verify_region(&isl->volume_index_zones[z], next_block, + RL_KIND_VOLUME_INDEX, z); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->volume_index_zones[z].block_count; + } + + isl->open_chapter = table->regions[isl->zone_count + 2]; + result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->open_chapter.block_count; + + result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->free_space.block_count; + if (next_block != last_block) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +static int __must_check load_index_save(struct index_save_layout *isl, + struct buffered_reader *reader, + unsigned int instance) +{ + int result; + struct region_table *table = NULL; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, "cannot read index save %u header", + instance); + } + + if (table->header.region_blocks != isl->index_save.block_count) { + u64 region_blocks = table->header.region_blocks; + + uds_free(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u region block count %llu", + instance, + (unsigned long long) region_blocks); + } + + if (table->header.type == RH_TYPE_UNSAVED) { + uds_free(table); + reset_index_save_layout(isl, 0); + return UDS_SUCCESS; + } + + + if (table->header.type != RH_TYPE_SAVE) { + uds_free(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u header type %u", + instance, table->header.type); + } + + result = read_index_save_data(reader, isl, table->header.payload); + if (result != UDS_SUCCESS) { + uds_free(table); + return uds_log_error_strerror(result, + "unknown index save %u data format", + instance); + } + + result = reconstruct_index_save(isl, table); + uds_free(table); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, "cannot reconstruct index save %u", + instance); + } + + return UDS_SUCCESS; +} + +static int __must_check load_sub_index_regions(struct index_layout *layout) +{ + int result; + unsigned int j; + struct index_save_layout *isl; + struct buffered_reader *reader; + + for (j = 0; j < layout->super.max_saves; j++) { + isl = &layout->index.saves[j]; + result = open_region_reader(layout, &isl->index_save, &reader); + + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "cannot get reader for index 0 save %u", + j); + return result; + } + + result = load_index_save(isl, reader, j); + uds_free_buffered_reader(reader); + if (result != UDS_SUCCESS) { + /* Another save slot might be valid. 
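 * For example, with MAX_SAVES == 2: if save slot 0 fails to load, it is
 * reset to an empty layout here and the loop continues, so a clean save
 * in slot 1 can still be found later by find_latest_uds_index_save_slot().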
*/ + reset_index_save_layout(isl, 0); + continue; + } + } + + return UDS_SUCCESS; +} + +static int __must_check verify_uds_index_config(struct index_layout *layout, + struct uds_configuration *config) +{ + int result; + struct buffered_reader *reader = NULL; + u64 offset; + + offset = layout->super.volume_offset - layout->super.start_offset; + result = open_layout_reader(layout, &layout->config, offset, &reader); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "failed to open config reader"); + + result = uds_validate_config_contents(reader, config); + if (result != UDS_SUCCESS) { + uds_free_buffered_reader(reader); + return uds_log_error_strerror(result, "failed to read config region"); + } + + uds_free_buffered_reader(reader); + return UDS_SUCCESS; +} + +static int load_index_layout(struct index_layout *layout, struct uds_configuration *config) +{ + int result; + struct buffered_reader *reader; + + result = uds_make_buffered_reader(layout->factory, + layout->offset / UDS_BLOCK_SIZE, 1, &reader); + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, "unable to read superblock"); + + result = load_super_block(layout, UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, reader); + uds_free_buffered_reader(reader); + if (result != UDS_SUCCESS) + return result; + + result = verify_uds_index_config(layout, config); + if (result != UDS_SUCCESS) + return result; + + return load_sub_index_regions(layout); +} + +static int create_layout_factory(struct index_layout *layout, + const struct uds_configuration *config) +{ + int result; + size_t writable_size; + struct io_factory *factory = NULL; + + result = uds_make_io_factory(config->bdev, &factory); + if (result != UDS_SUCCESS) + return result; + + writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE; + if (writable_size < config->size + config->offset) { + uds_put_io_factory(factory); + uds_log_error("index storage (%zu) is smaller than the requested size %zu", + writable_size, config->size + config->offset); + return -ENOSPC; + } + + layout->factory = factory; + layout->factory_size = (config->size > 0) ? 
config->size : writable_size; + layout->offset = config->offset; + return UDS_SUCCESS; +} + +int uds_make_index_layout(struct uds_configuration *config, bool new_layout, + struct index_layout **layout_ptr) +{ + int result; + struct index_layout *layout = NULL; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(1, struct index_layout, __func__, &layout); + if (result != UDS_SUCCESS) + return result; + + result = create_layout_factory(layout, config); + if (result != UDS_SUCCESS) { + uds_free_index_layout(layout); + return result; + } + + if (layout->factory_size < sizes.total_size) { + uds_log_error("index storage (%zu) is smaller than the required size %llu", + layout->factory_size, + (unsigned long long) sizes.total_size); + uds_free_index_layout(layout); + return -ENOSPC; + } + + if (new_layout) + result = create_index_layout(layout, config); + else + result = load_index_layout(layout, config); + if (result != UDS_SUCCESS) { + uds_free_index_layout(layout); + return result; + } + + *layout_ptr = layout; + return UDS_SUCCESS; +} + +void uds_free_index_layout(struct index_layout *layout) +{ + if (layout == NULL) + return; + + uds_free(layout->index.saves); + if (layout->factory != NULL) + uds_put_io_factory(layout->factory); + + uds_free(layout); +} + +int uds_replace_index_layout_storage(struct index_layout *layout, + struct block_device *bdev) +{ + return uds_replace_storage(layout->factory, bdev); +} + +/* Obtain a dm_bufio_client for the volume region. */ +int uds_open_volume_bufio(struct index_layout *layout, size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr) +{ + off_t offset = (layout->index.volume.start_block + + layout->super.volume_offset - + layout->super.start_offset); + + return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers, + client_ptr); +} + +u64 uds_get_volume_nonce(struct index_layout *layout) +{ + return layout->index.nonce; +} diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h new file mode 100644 index 0000000000000..e9ac6f4302d63 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-layout.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_LAYOUT_H +#define UDS_INDEX_LAYOUT_H + +#include "config.h" +#include "indexer.h" +#include "io-factory.h" + +/* + * The index layout describes the format of the index on the underlying storage, and is responsible + * for creating those structures when the index is first created. It also validates the index data + * when loading a saved index, and updates it when saving the index. 
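 *
 * A minimal lifecycle sketch (illustrative, error handling omitted):
 *
 *   struct index_layout *layout;
 *
 *   uds_make_index_layout(config, true, &layout);   // create on storage
 *   uds_save_index_state(layout, index);            // write a clean save
 *   uds_free_index_layout(layout);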
+ */ + +struct index_layout; + +int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout, + struct index_layout **layout_ptr); + +void uds_free_index_layout(struct index_layout *layout); + +int __must_check uds_replace_index_layout_storage(struct index_layout *layout, + struct block_device *bdev); + +int __must_check uds_load_index_state(struct index_layout *layout, + struct uds_index *index); + +int __must_check uds_save_index_state(struct index_layout *layout, + struct uds_index *index); + +int __must_check uds_discard_open_chapter(struct index_layout *layout); + +u64 __must_check uds_get_volume_nonce(struct index_layout *layout); + +int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr); + +#endif /* UDS_INDEX_LAYOUT_H */ diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c new file mode 100644 index 0000000000000..90d97c33a9c32 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-page-map.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" +#include "thread-utils.h" + +#include "hash-utils.h" +#include "indexer.h" + +/* + * The index page map is conceptually a two-dimensional array indexed by chapter number and index + * page number within the chapter. Each entry contains the number of the last delta list on that + * index page. In order to save memory, the information for the last page in each chapter is not + * recorded, as it is known from the geometry. 
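 *
 * Worked example (illustrative): with 4 index pages per chapter, only the
 * last delta list numbers of pages 0..2 are stored. A lookup for a name
 * whose chapter delta list is L returns the first page whose stored value
 * is at least L, and falls through to the final page when no stored value
 * qualifies; see uds_find_index_page_number() below.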
+ */ + +static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02"; + +enum { + PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1, +}; + +static inline u32 get_entry_count(const struct index_geometry *geometry) +{ + return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1); +} + +int uds_make_index_page_map(const struct index_geometry *geometry, + struct index_page_map **map_ptr) +{ + int result; + struct index_page_map *map; + + result = uds_allocate(1, struct index_page_map, "page map", &map); + if (result != UDS_SUCCESS) + return result; + + map->geometry = geometry; + map->entries_per_chapter = geometry->index_pages_per_chapter - 1; + result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + uds_free_index_page_map(map); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +void uds_free_index_page_map(struct index_page_map *map) +{ + if (map != NULL) { + uds_free(map->entries); + uds_free(map); + } +} + +void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, + u32 chapter_number, u32 index_page_number, + u32 delta_list_number) +{ + size_t slot; + + map->last_update = virtual_chapter_number; + if (index_page_number == map->entries_per_chapter) + return; + + slot = (chapter_number * map->entries_per_chapter) + index_page_number; + map->entries[slot] = delta_list_number; +} + +u32 uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, u32 chapter_number) +{ + u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry); + u32 slot = chapter_number * map->entries_per_chapter; + u32 page; + + for (page = 0; page < map->entries_per_chapter; page++) { + if (delta_list_number <= map->entries[slot + page]) + break; + } + + return page; +} + +void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number, + u32 index_page_number, u32 *lowest_list, + u32 *highest_list) +{ + u32 slot = chapter_number * map->entries_per_chapter; + + *lowest_list = ((index_page_number == 0) ? + 0 : map->entries[slot + index_page_number - 1] + 1); + *highest_list = ((index_page_number < map->entries_per_chapter) ? 
+ map->entries[slot + index_page_number] : + map->geometry->delta_lists_per_chapter - 1); +} + +u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry) +{ + return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry); +} + +int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = uds_allocate(saved_size, u8, "page map data", &buffer); + if (result != UDS_SUCCESS) + return result; + + memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + encode_u64_le(buffer, &offset, map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + encode_u16_le(buffer, &offset, map->entries[i]); + + result = uds_write_to_buffered_writer(writer, buffer, offset); + uds_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader) +{ + int result; + u8 magic[PAGE_MAP_MAGIC_LENGTH]; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = uds_allocate(saved_size, u8, "page map data", &buffer); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, buffer, saved_size); + if (result != UDS_SUCCESS) { + uds_free(buffer); + return result; + } + + memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) { + uds_free(buffer); + return UDS_CORRUPT_DATA; + } + + decode_u64_le(buffer, &offset, &map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + decode_u16_le(buffer, &offset, &map->entries[i]); + + uds_free(buffer); + uds_log_debug("read index page map, last update %llu", + (unsigned long long) map->last_update); + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h new file mode 100644 index 0000000000000..b327c0bb96562 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-page-map.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_PAGE_MAP_H +#define UDS_INDEX_PAGE_MAP_H + +#include "geometry.h" +#include "io-factory.h" + +/* + * The index maintains a page map which records how the chapter delta lists are distributed among + * the index pages for each chapter, allowing the volume to be efficient about reading only pages + * that it knows it will need. 
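 *
 * Sizing sketch (illustrative; the geometry numbers are assumptions): for
 * a geometry with 1024 chapters and 6 index pages per chapter, the saved
 * map occupies 8 (magic) + 8 (last_update) + 2 * 1024 * 5 (entries)
 * = 10256 bytes; see uds_compute_index_page_map_save_size().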
+ */ + +struct index_page_map { + const struct index_geometry *geometry; + u64 last_update; + u32 entries_per_chapter; + u16 *entries; +}; + +int __must_check uds_make_index_page_map(const struct index_geometry *geometry, + struct index_page_map **map_ptr); + +void uds_free_index_page_map(struct index_page_map *map); + +int __must_check uds_read_index_page_map(struct index_page_map *map, + struct buffered_reader *reader); + +int __must_check uds_write_index_page_map(struct index_page_map *map, + struct buffered_writer *writer); + +void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, + u32 chapter_number, u32 index_page_number, + u32 delta_list_number); + +u32 __must_check uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, + u32 chapter_number); + +void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number, + u32 index_page_number, u32 *lowest_list, + u32 *highest_list); + +u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry); + +#endif /* UDS_INDEX_PAGE_MAP_H */ diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c new file mode 100644 index 0000000000000..07b478f57c681 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -0,0 +1,739 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-session.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "time-utils.h" + +#include "funnel-requestqueue.h" +#include "index.h" +#include "index-layout.h" + +/* + * The index session contains a lock (the request_mutex) which ensures that only one thread can + * change the state of its index at a time. The state field indicates the current state of the + * index through a set of descriptive flags. The request_mutex must be notified whenever a + * non-transient state flag is cleared. The request_mutex is also used to count the number of + * requests currently in progress so that they can be drained when suspending or closing the index. + * + * If the index session is suspended shortly after opening an index, it may have to suspend during + * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time, + * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When + * the index session is resumed, the rebuild can continue from where it left off. If the index + * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild + * will start from the beginning the next time the index is loaded. The mutex and status fields in + * the index_load_context are used to record the state of any interrupted rebuild. + */ + +enum index_session_flag_bit { + IS_FLAG_BIT_START = 8, + /* The session has started loading an index but not completed it. */ + IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, + /* The session has loaded an index, which can handle requests. */ + IS_FLAG_BIT_LOADED, + /* The session's index has been permanently disabled. */ + IS_FLAG_BIT_DISABLED, + /* The session's index is suspended. */ + IS_FLAG_BIT_SUSPENDED, + /* The session is handling some index state change. */ + IS_FLAG_BIT_WAITING, + /* The session's index is closing and draining requests. */ + IS_FLAG_BIT_CLOSING, + /* The session is being destroyed and is draining requests. 
*/ + IS_FLAG_BIT_DESTROYING, +}; + +enum index_session_flag { + IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), + IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), + IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), + IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), + IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), + IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), + IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), +}; + +/* Release a reference to an index session. */ +static void release_index_session(struct uds_index_session *index_session) +{ + mutex_lock(&index_session->request_mutex); + if (--index_session->request_count == 0) + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); +} + +/* + * Acquire a reference to the index session for an asynchronous index request. The reference must + * eventually be released with a corresponding call to release_index_session(). + */ +static int get_index_session(struct uds_index_session *index_session) +{ + unsigned int state; + int result = UDS_SUCCESS; + + mutex_lock(&index_session->request_mutex); + index_session->request_count++; + state = index_session->state; + mutex_unlock(&index_session->request_mutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + result = UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) || + (state & IS_FLAG_SUSPENDED) || + (state & IS_FLAG_WAITING)) { + result = -EBUSY; + } else { + result = UDS_NO_INDEX; + } + + release_index_session(index_session); + return result; +} + +int uds_launch_request(struct uds_request *request) +{ + size_t internal_size; + int result; + + if (request->callback == NULL) { + uds_log_error("missing required callback"); + return -EINVAL; + } + + switch (request->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + case UDS_UPDATE: + break; + default: + uds_log_error("received invalid callback type"); + return -EINVAL; + } + + /* Reset all internal fields before processing. 
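 * The FIXME below refers to the kernel's struct_group() helper: if the
 * internal members of struct uds_request were wrapped in a group named,
 * say, "internal" (a hypothetical name), this reset would reduce to
 *
 *   memset(&request->internal, 0, sizeof(request->internal));
 *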
*/ + internal_size = + sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); + // FIXME should be using struct_group for this instead + memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + + result = get_index_session(request->session); + if (result != UDS_SUCCESS) + return result; + + request->found = false; + request->unbatched = false; + request->index = request->session->index; + + uds_enqueue_request(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +static void enter_callback_stage(struct uds_request *request) +{ + if (request->status != UDS_SUCCESS) { + /* All request errors are considered unrecoverable */ + mutex_lock(&request->session->request_mutex); + request->session->state |= IS_FLAG_DISABLED; + mutex_unlock(&request->session->request_mutex); + } + + uds_request_queue_enqueue(request->session->callback_queue, request); +} + +static inline void count_once(u64 *count_ptr) +{ + WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1); +} + +static void update_session_stats(struct uds_request *request) +{ + struct session_stats *session_stats = &request->session->stats; + + count_once(&session_stats->requests); + + switch (request->type) { + case UDS_POST: + if (request->found) + count_once(&session_stats->posts_found); + else + count_once(&session_stats->posts_not_found); + + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) + count_once(&session_stats->posts_found_open_chapter); + else if (request->location == UDS_LOCATION_IN_DENSE) + count_once(&session_stats->posts_found_dense); + else if (request->location == UDS_LOCATION_IN_SPARSE) + count_once(&session_stats->posts_found_sparse); + break; + + case UDS_UPDATE: + if (request->found) + count_once(&session_stats->updates_found); + else + count_once(&session_stats->updates_not_found); + break; + + case UDS_DELETE: + if (request->found) + count_once(&session_stats->deletions_found); + else + count_once(&session_stats->deletions_not_found); + break; + + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + if (request->found) + count_once(&session_stats->queries_found); + else + count_once(&session_stats->queries_not_found); + break; + + default: + request->status = ASSERT(false, "unknown request type: %d", + request->type); + } +} + +static void handle_callbacks(struct uds_request *request) +{ + struct uds_index_session *index_session = request->session; + + if (request->status == UDS_SUCCESS) + update_session_stats(request); + + request->status = uds_status_to_errno(request->status); + request->callback(request); + release_index_session(index_session); +} + +static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr) +{ + int result; + struct uds_index_session *session; + + result = uds_allocate(1, struct uds_index_session, __func__, &session); + if (result != UDS_SUCCESS) + return result; + + mutex_init(&session->request_mutex); + uds_init_cond(&session->request_cond); + mutex_init(&session->load_context.mutex); + uds_init_cond(&session->load_context.cond); + + result = uds_make_request_queue("callbackW", &handle_callbacks, + &session->callback_queue); + if (result != UDS_SUCCESS) { + uds_free(session); + return result; + } + + *index_session_ptr = session; + return UDS_SUCCESS; +} + +int uds_create_index_session(struct uds_index_session **session) +{ + if (session == NULL) { + uds_log_error("missing session pointer"); + return -EINVAL; + } + + return uds_status_to_errno(make_empty_index_session(session)); +} + +static int __must_check 
start_loading_index_session(struct uds_index_session *index_session) +{ + int result; + + mutex_lock(&index_session->request_mutex); + if (index_session->state & IS_FLAG_SUSPENDED) { + uds_log_info("Index session is suspended"); + result = -EBUSY; + } else if (index_session->state != 0) { + uds_log_info("Index is already loaded"); + result = -EBUSY; + } else { + index_session->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + mutex_unlock(&index_session->request_mutex); + return result; +} + +static void finish_loading_index_session(struct uds_index_session *index_session, + int result) +{ + mutex_lock(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) + index_session->state |= IS_FLAG_LOADED; + + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); +} + +static int initialize_index_session(struct uds_index_session *index_session, + enum uds_open_index_type open_type) +{ + int result; + struct uds_configuration *config; + + result = uds_make_configuration(&index_session->parameters, &config); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "Failed to allocate config"); + return result; + } + + memset(&index_session->stats, 0, sizeof(index_session->stats)); + result = uds_make_index(config, open_type, &index_session->load_context, + enter_callback_stage, &index_session->index); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Failed to make index"); + else + uds_log_configuration(config); + + uds_free_configuration(config); + return result; +} + +static const char *get_open_type_string(enum uds_open_index_type open_type) +{ + switch (open_type) { + case UDS_CREATE: + return "creating index"; + case UDS_LOAD: + return "loading or rebuilding index"; + case UDS_NO_REBUILD: + return "loading index"; + default: + return "unknown open method"; + } +} + +/* + * Open an index under the given session. This operation will fail if the + * index session is suspended, or if there is already an open index. 
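+ *
+ * A minimal calling sequence, shown for illustration only (error checking
+ * elided; the parameters must at least name a backing block device):
+ *
+ *   struct uds_index_session *session;
+ *   struct uds_parameters params = { .bdev = bdev };
+ *
+ *   uds_create_index_session(&session);
+ *   uds_open_index(UDS_CREATE, &params, session);
+ *   ...issue requests with uds_launch_request()...
+ *   uds_close_index(session);
+ *   uds_destroy_index_session(session);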
+ */ +int uds_open_index(enum uds_open_index_type open_type, + const struct uds_parameters *parameters, + struct uds_index_session *session) +{ + int result; + char name[BDEVNAME_SIZE]; + + if (parameters == NULL) { + uds_log_error("missing required parameters"); + return -EINVAL; + } + if (parameters->bdev == NULL) { + uds_log_error("missing required block device"); + return -EINVAL; + } + if (session == NULL) { + uds_log_error("missing required session pointer"); + return -EINVAL; + } + + result = start_loading_index_session(session); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + session->parameters = *parameters; + format_dev_t(name, parameters->bdev->bd_dev); + uds_log_info("%s: %s", get_open_type_string(open_type), name); + + result = initialize_index_session(session, open_type); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Failed %s", + get_open_type_string(open_type)); + + finish_loading_index_session(session, result); + return uds_status_to_errno(result); +} + +static void wait_for_no_requests_in_progress(struct uds_index_session *index_session) +{ + mutex_lock(&index_session->request_mutex); + while (index_session->request_count > 0) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + mutex_unlock(&index_session->request_mutex); +} + +static int __must_check save_index(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + return uds_save_index(index_session->index); +} + +static void suspend_rebuild(struct uds_index_session *session) +{ + mutex_lock(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_OPENING: + session->load_context.status = INDEX_SUSPENDING; + + /* Wait until the index indicates that it is not replaying. */ + while ((session->load_context.status != INDEX_SUSPENDED) && + (session->load_context.status != INDEX_READY)) { + uds_wait_cond(&session->load_context.cond, + &session->load_context.mutex); + } + + break; + + case INDEX_READY: + /* Index load does not need to be suspended. */ + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen. */ + ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->load_context.status); + break; + } + mutex_unlock(&session->load_context.mutex); +} + +/* + * Suspend index operation, draining all current index requests and preventing new index requests + * from starting. Optionally saves all index data before returning. + */ +int uds_suspend_index_session(struct uds_index_session *session, bool save) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool rebuilding = false; + + /* Wait for any current index state change to complete. 
*/ + mutex_lock(&session->request_mutex); + while (session->state & IS_FLAG_CLOSING) + uds_wait_cond(&session->request_cond, &session->request_mutex); + + if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) { + no_work = true; + uds_log_info("Index session is already changing state"); + result = -EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + no_work = true; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + rebuilding = true; + } else if (session->state & IS_FLAG_LOADED) { + session->state |= IS_FLAG_WAITING; + } else { + no_work = true; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + } + mutex_unlock(&session->request_mutex); + + if (no_work) + return uds_status_to_errno(result); + + if (rebuilding) + suspend_rebuild(session); + else if (save) + result = save_index(session); + else + result = uds_flush_index_session(session); + + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return uds_status_to_errno(result); +} + +static int replace_device(struct uds_index_session *session, struct block_device *bdev) +{ + int result; + + result = uds_replace_index_storage(session->index, bdev); + if (result != UDS_SUCCESS) + return result; + + session->parameters.bdev = bdev; + return UDS_SUCCESS; +} + +/* + * Resume index operation after being suspended. If the index is suspended and the supplied block + * device differs from the current backing store, the index will start using the new backing store. + */ +int uds_resume_index_session(struct uds_index_session *session, + struct block_device *bdev) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool resume_replay = false; + + mutex_lock(&session->request_mutex); + if (session->state & IS_FLAG_WAITING) { + uds_log_info("Index session is already changing state"); + no_work = true; + result = -EBUSY; + } else if (!(session->state & IS_FLAG_SUSPENDED)) { + /* If not suspended, just succeed. */ + no_work = true; + result = UDS_SUCCESS; + } else { + session->state |= IS_FLAG_WAITING; + if (session->state & IS_FLAG_LOADING) + resume_replay = true; + } + mutex_unlock(&session->request_mutex); + + if (no_work) + return result; + + if ((session->index != NULL) && (bdev != session->parameters.bdev)) { + result = replace_device(session, bdev); + if (result != UDS_SUCCESS) { + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return uds_status_to_errno(result); + } + } + + if (resume_replay) { + mutex_lock(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_SUSPENDED: + session->load_context.status = INDEX_OPENING; + /* Notify the index to start replaying again. */ + uds_broadcast_cond(&session->load_context.cond); + break; + + case INDEX_READY: + /* There is no index rebuild to resume. */ + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen; do nothing. 
*/ + ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->load_context.status); + break; + } + mutex_unlock(&session->load_context.mutex); + } + + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return UDS_SUCCESS; +} + +static int save_and_free_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + bool suspended; + struct uds_index *index = index_session->index; + + if (index == NULL) + return UDS_SUCCESS; + + mutex_lock(&index_session->request_mutex); + suspended = (index_session->state & IS_FLAG_SUSPENDED); + mutex_unlock(&index_session->request_mutex); + + if (!suspended) { + result = uds_save_index(index); + if (result != UDS_SUCCESS) + uds_log_warning_strerror(result, + "ignoring error from save_index"); + } + uds_free_index(index); + index_session->index = NULL; + + /* + * Reset all index state that happens to be in the index + * session, so it doesn't affect any future index. + */ + mutex_lock(&index_session->load_context.mutex); + index_session->load_context.status = INDEX_OPENING; + mutex_unlock(&index_session->load_context.mutex); + + mutex_lock(&index_session->request_mutex); + /* Only the suspend bit will remain relevant. */ + index_session->state &= IS_FLAG_SUSPENDED; + mutex_unlock(&index_session->request_mutex); + + return result; +} + +/* Save and close the current index. */ +int uds_close_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + + /* Wait for any current index state change to complete. */ + mutex_lock(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_SUSPENDED) { + uds_log_info("Index session is suspended"); + result = -EBUSY; + } else if ((index_session->state & IS_FLAG_DESTROYING) || + !(index_session->state & IS_FLAG_LOADED)) { + /* The index doesn't exist, hasn't finished loading, or is being destroyed. */ + result = UDS_NO_INDEX; + } else { + index_session->state |= IS_FLAG_CLOSING; + } + mutex_unlock(&index_session->request_mutex); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + uds_log_debug("Closing index"); + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + uds_log_debug("Closed index"); + + mutex_lock(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_CLOSING; + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); + return uds_status_to_errno(result); +} + +/* This will save and close an open index before destroying the session. */ +int uds_destroy_index_session(struct uds_index_session *index_session) +{ + int result; + bool load_pending = false; + + uds_log_debug("Destroying index session"); + + /* Wait for any current index state change to complete. 
*/ + mutex_lock(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_DESTROYING) { + mutex_unlock(&index_session->request_mutex); + uds_log_info("Index session is already closing"); + return -EBUSY; + } + + index_session->state |= IS_FLAG_DESTROYING; + load_pending = ((index_session->state & IS_FLAG_LOADING) && + (index_session->state & IS_FLAG_SUSPENDED)); + mutex_unlock(&index_session->request_mutex); + + if (load_pending) { + /* Tell the index to terminate the rebuild. */ + mutex_lock(&index_session->load_context.mutex); + if (index_session->load_context.status == INDEX_SUSPENDED) { + index_session->load_context.status = INDEX_FREEING; + uds_broadcast_cond(&index_session->load_context.cond); + } + mutex_unlock(&index_session->load_context.mutex); + + /* Wait until the load exits before proceeding. */ + mutex_lock(&index_session->request_mutex); + while (index_session->state & IS_FLAG_LOADING) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + mutex_unlock(&index_session->request_mutex); + } + + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + uds_request_queue_finish(index_session->callback_queue); + index_session->callback_queue = NULL; + uds_log_debug("Destroyed index session"); + uds_free(index_session); + return uds_status_to_errno(result); +} + +/* Wait until all callbacks for index operations are complete. */ +int uds_flush_index_session(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + uds_wait_for_idle_index(index_session->index); + return UDS_SUCCESS; +} + +/* Statistics collection is intended to be thread-safe. 
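+ * Writers bump the counters through count_once() and readers use READ_ONCE
+ * below, so the statistics can be sampled from any thread without taking a
+ * lock; a reader may simply observe slightly stale values.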
*/
+static void collect_stats(const struct uds_index_session *index_session,
+			  struct uds_index_stats *stats)
+{
+	const struct session_stats *session_stats = &index_session->stats;
+
+	stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));
+	stats->posts_found = READ_ONCE(session_stats->posts_found);
+	stats->in_memory_posts_found = READ_ONCE(session_stats->posts_found_open_chapter);
+	stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense);
+	stats->sparse_posts_found = READ_ONCE(session_stats->posts_found_sparse);
+	stats->posts_not_found = READ_ONCE(session_stats->posts_not_found);
+	stats->updates_found = READ_ONCE(session_stats->updates_found);
+	stats->updates_not_found = READ_ONCE(session_stats->updates_not_found);
+	stats->deletions_found = READ_ONCE(session_stats->deletions_found);
+	stats->deletions_not_found = READ_ONCE(session_stats->deletions_not_found);
+	stats->queries_found = READ_ONCE(session_stats->queries_found);
+	stats->queries_not_found = READ_ONCE(session_stats->queries_not_found);
+	stats->requests = READ_ONCE(session_stats->requests);
+}
+
+int uds_get_index_session_stats(struct uds_index_session *index_session,
+				struct uds_index_stats *stats)
+{
+	if (stats == NULL) {
+		uds_log_error("received a NULL index stats pointer");
+		return -EINVAL;
+	}
+
+	collect_stats(index_session, stats);
+	if (index_session->index != NULL) {
+		uds_get_index_stats(index_session->index, stats);
+	} else {
+		stats->entries_indexed = 0;
+		stats->memory_used = 0;
+		stats->collisions = 0;
+		stats->entries_discarded = 0;
+	}
+
+	return UDS_SUCCESS;
+}
+
+void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
+{
+	DEFINE_WAIT(__wait);
+
+	prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE);
+	mutex_unlock(mutex);
+	schedule();
+	finish_wait(&cv->wait_queue, &__wait);
+	mutex_lock(mutex);
+}
diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h
new file mode 100644
index 0000000000000..066648f6e0626
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index-session.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_SESSION_H
+#define UDS_INDEX_SESSION_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The index session mediates all interactions with a UDS index. Once the index session is created,
+ * it can be used to open, close, suspend, or recreate an index. It implements the majority of the
+ * functions in the top-level UDS API.
+ *
+ * If any deduplication request fails due to an internal error, the index is marked disabled. It
+ * will not accept any further requests and can only be closed. Closing the index will clear the
+ * disabled flag, and the index can then be reopened and recovered using the same index session.
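+ *
+ * A suspend/resume cycle, shown purely as an illustration of the API
+ * (new_bdev may be the current backing device or a replacement):
+ *
+ *   uds_suspend_index_session(session, true);
+ *   ...
+ *   uds_resume_index_session(session, new_bdev);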
+ */
+
+struct __aligned(L1_CACHE_BYTES) session_stats {
+	/* Post requests that found an entry */
+	u64 posts_found;
+	/* Post requests found in the open chapter */
+	u64 posts_found_open_chapter;
+	/* Post requests found in the dense index */
+	u64 posts_found_dense;
+	/* Post requests found in the sparse index */
+	u64 posts_found_sparse;
+	/* Post requests that did not find an entry */
+	u64 posts_not_found;
+	/* Update requests that found an entry */
+	u64 updates_found;
+	/* Update requests that did not find an entry */
+	u64 updates_not_found;
+	/* Delete requests that found an entry */
+	u64 deletions_found;
+	/* Delete requests that did not find an entry */
+	u64 deletions_not_found;
+	/* Query requests that found an entry */
+	u64 queries_found;
+	/* Query requests that did not find an entry */
+	u64 queries_not_found;
+	/* Total number of requests */
+	u64 requests;
+};
+
+enum index_suspend_status {
+	/* An index load has started but the index is not ready for use. */
+	INDEX_OPENING = 0,
+	/* The index is able to handle requests. */
+	INDEX_READY,
+	/* The index is attempting to suspend a rebuild. */
+	INDEX_SUSPENDING,
+	/* An index rebuild has been suspended. */
+	INDEX_SUSPENDED,
+	/* An index rebuild is being stopped in order to shut down. */
+	INDEX_FREEING,
+};
+
+struct index_load_context {
+	struct mutex mutex;
+	struct cond_var cond;
+	enum index_suspend_status status;
+};
+
+struct uds_index_session {
+	unsigned int state;
+	struct uds_index *index;
+	struct uds_request_queue *callback_queue;
+	struct uds_parameters parameters;
+	struct index_load_context load_context;
+	struct mutex request_mutex;
+	struct cond_var request_cond;
+	int request_count;
+	struct session_stats stats;
+};
+
+#endif /* UDS_INDEX_SESSION_H */
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c
new file mode 100644
index 0000000000000..35e3b45cdb713
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index.c
@@ -0,0 +1,1388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+
+#include "index.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+
+#include "funnel-requestqueue.h"
+#include "hash-utils.h"
+#include "sparse-cache.h"
+
+static const u64 NO_LAST_SAVE = U64_MAX;
+
+/*
+ * When searching for deduplication records, the index first searches the volume index, and then
+ * searches the chapter index for the relevant chapter. If the chapter has been fully committed to
+ * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been
+ * committed (either the open chapter or a recently closed one), the index searches the in-memory
+ * representation of the chapter. Finally, if the volume index does not find a record and the index
+ * is sparse, the index will search the sparse cache.
+ *
+ * The index sends two kinds of messages to coordinate between zones: chapter close messages for
+ * the chapter writer, and sparse cache barrier messages for the sparse cache.
+ *
+ * The chapter writer is responsible for committing chapters of records to storage. Since zones can
+ * get different numbers of records, some zones may fall behind others. Each time a zone fills up
+ * its available space in a chapter, it informs the chapter writer that the chapter is complete,
+ * and also informs all other zones that it has closed the chapter. Each other zone will then close
+ * the chapter immediately, regardless of how full it is, in order to minimize skew between zones.
+ * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage.
+ *
+ * The last zone to close the chapter also removes the oldest chapter from the volume index.
+ * Although that chapter is invalid for zones that have moved on, the existence of the open chapter
+ * means that those zones will never ask the volume index about it. No zone is allowed to get more
+ * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another
+ * chapter before the previous one has been closed by all zones, it is forced to wait.
+ *
+ * The sparse cache relies on having the same set of chapter indexes available to all zones. When a
+ * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone
+ * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and
+ * paused its operations, the cache membership is changed and each zone is then informed that it
+ * can proceed. More details can be found in the sparse cache documentation.
+ *
+ * If a sparse index has only one zone, it will not create a triage queue, but it still needs the
+ * barrier message to change the sparse cache membership, so the index simulates the message by
+ * invoking the handler directly.
+ */
+
+struct chapter_writer {
+	/* The index to which we belong */
+	struct uds_index *index;
+	/* The thread to do the writing */
+	struct thread *thread;
+	/* The lock protecting the following fields */
+	struct mutex mutex;
+	/* The condition signalled on state changes */
+	struct cond_var cond;
+	/* Set to true to stop the thread */
+	bool stop;
+	/* The result from the most recent write */
+	int result;
+	/* The number of bytes allocated by the chapter writer */
+	size_t memory_size;
+	/* The number of zones which have submitted a chapter for writing */
+	unsigned int zones_to_write;
+	/* Open chapter index used by uds_close_open_chapter() */
+	struct open_chapter_index *open_chapter_index;
+	/* Collated records used by uds_close_open_chapter() */
+	struct uds_volume_record *collated_records;
+	/* The chapters to write (one per zone) */
+	struct open_chapter_zone *chapters[];
+};
+
+static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter)
+{
+	return uds_is_chapter_sparse(zone->index->volume->geometry,
+				     zone->oldest_virtual_chapter,
+				     zone->newest_virtual_chapter, virtual_chapter);
+}
+
+static int launch_zone_message(struct uds_zone_message message, unsigned int zone,
+			       struct uds_index *index)
+{
+	int result;
+	struct uds_request *request;
+
+	result = uds_allocate(1, struct uds_request, __func__, &request);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	request->index = index;
+	request->unbatched = true;
+	request->zone_number = zone;
+	request->zone_message = message;
+
+	uds_enqueue_request(request, STAGE_MESSAGE);
+	return UDS_SUCCESS;
+}
+
+static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter)
+{
+	struct uds_zone_message message = {
+		.type = UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+		.virtual_chapter = virtual_chapter,
+	};
+	unsigned int zone;
+
+	for (zone = 0; zone < index->zone_count; zone++) {
+		int result = launch_zone_message(message, zone, index);
+
+		ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation");
+	}
+}
+
+/*
+ * Determine whether this request should trigger a sparse cache barrier message to change the
+ * membership of the sparse cache.
If a change in membership is desired, the function returns the
+ * chapter number to add.
+ */
+static u64 triage_index_request(struct uds_index *index, struct uds_request *request)
+{
+	u64 virtual_chapter;
+	struct index_zone *zone;
+
+	virtual_chapter = uds_lookup_volume_index_name(index->volume_index,
+						       &request->record_name);
+	if (virtual_chapter == NO_CHAPTER)
+		return NO_CHAPTER;
+
+	zone = index->zones[request->zone_number];
+	if (!is_zone_chapter_sparse(zone, virtual_chapter))
+		return NO_CHAPTER;
+
+	/*
+	 * FIXME: Optimize for a common case by remembering the chapter from the most recent
+	 * barrier message and skipping this chapter if it is the same.
+	 */
+
+	return virtual_chapter;
+}
+
+/*
+ * Simulate a message to change the sparse cache membership for a single-zone sparse index. This
+ * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind
+ * of index does nothing here.
+ */
+static int simulate_index_zone_barrier_message(struct index_zone *zone,
+					       struct uds_request *request)
+{
+	u64 sparse_virtual_chapter;
+
+	if ((zone->index->zone_count > 1) ||
+	    !uds_is_sparse_index_geometry(zone->index->volume->geometry))
+		return UDS_SUCCESS;
+
+	sparse_virtual_chapter = triage_index_request(zone->index, request);
+	if (sparse_virtual_chapter == NO_CHAPTER)
+		return UDS_SUCCESS;
+
+	return uds_update_sparse_cache(zone, sparse_virtual_chapter);
+}
+
+/* This is the request processing function for the triage queue. */
+static void triage_request(struct uds_request *request)
+{
+	struct uds_index *index = request->index;
+	u64 sparse_virtual_chapter = triage_index_request(index, request);
+
+	if (sparse_virtual_chapter != NO_CHAPTER)
+		enqueue_barrier_messages(index, sparse_virtual_chapter);
+
+	uds_enqueue_request(request, STAGE_INDEX);
+}
+
+static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number)
+{
+	int result;
+	struct chapter_writer *writer = index->chapter_writer;
+
+	mutex_lock(&writer->mutex);
+	while (index->newest_virtual_chapter < current_chapter_number)
+		uds_wait_cond(&writer->cond, &writer->mutex);
+	result = writer->result;
+	mutex_unlock(&writer->mutex);
+
+	if (result != UDS_SUCCESS)
+		return uds_log_error_strerror(result,
+					      "Writing of previous open chapter failed");
+
+	return UDS_SUCCESS;
+}
+
+static int swap_open_chapter(struct index_zone *zone)
+{
+	int result;
+	struct open_chapter_zone *temporary_chapter;
+
+	result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	temporary_chapter = zone->open_chapter;
+	zone->open_chapter = zone->writing_chapter;
+	zone->writing_chapter = temporary_chapter;
+	return UDS_SUCCESS;
+}
+
+/*
+ * Inform the chapter writer that this zone is done with this chapter. The chapter won't start
+ * writing until all zones have closed it.
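+ *
+ * For example, with three zones: zones 0 and 1 may have already handed in
+ * their chapters (zones_to_write == 2); only when zone 2 arrives does
+ * zones_to_write reach index->zone_count, waking close_chapters() to commit
+ * the chapter. (An illustrative walk-through, not additional behavior.)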
+ */ +static unsigned int start_closing_chapter(struct uds_index *index, + unsigned int zone_number, + struct open_chapter_zone *chapter) +{ + unsigned int finished_zones; + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + finished_zones = ++writer->zones_to_write; + writer->chapters[zone_number] = chapter; + uds_broadcast_cond(&writer->cond); + mutex_unlock(&writer->mutex); + + return finished_zones; +} + +static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter) +{ + int result; + unsigned int i; + struct uds_zone_message zone_message = { + .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, + .virtual_chapter = closed_chapter, + }; + + for (i = 0; i < zone->index->zone_count; i++) { + if (zone->id == i) + continue; + + result = launch_zone_message(zone_message, i, zone->index); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +static int open_next_chapter(struct index_zone *zone) +{ + int result; + u64 closed_chapter; + u64 expiring; + unsigned int finished_zones; + u32 expire_chapters; + + uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", + (unsigned long long) zone->newest_virtual_chapter, zone->id, + zone->open_chapter->size, + zone->open_chapter->capacity - zone->open_chapter->size); + + result = swap_open_chapter(zone); + if (result != UDS_SUCCESS) + return result; + + closed_chapter = zone->newest_virtual_chapter++; + uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id, + zone->newest_virtual_chapter); + uds_reset_open_chapter(zone->open_chapter); + + finished_zones = start_closing_chapter(zone->index, zone->id, + zone->writing_chapter); + if ((finished_zones == 1) && (zone->index->zone_count > 1)) { + result = announce_chapter_closed(zone, closed_chapter); + if (result != UDS_SUCCESS) + return result; + } + + expiring = zone->oldest_virtual_chapter; + expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry, + zone->newest_virtual_chapter); + zone->oldest_virtual_chapter += expire_chapters; + + if (finished_zones < zone->index->zone_count) + return UDS_SUCCESS; + + while (expire_chapters-- > 0) + uds_forget_chapter(zone->index->volume, expiring++); + + return UDS_SUCCESS; +} + +static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter) +{ + if (zone->newest_virtual_chapter == virtual_chapter) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int dispatch_index_zone_control_request(struct uds_request *request) +{ + struct uds_zone_message *message = &request->zone_message; + struct index_zone *zone = request->index->zones[request->zone_number]; + + switch (message->type) { + case UDS_MESSAGE_SPARSE_CACHE_BARRIER: + return uds_update_sparse_cache(zone, message->virtual_chapter); + + case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: + return handle_chapter_closed(zone, message->virtual_chapter); + + default: + uds_log_error("invalid message type: %d", message->type); + return UDS_INVALID_ARGUMENT; + } +} + +static void set_request_location(struct uds_request *request, + enum uds_index_region new_location) +{ + request->location = new_location; + request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) || + (new_location == UDS_LOCATION_IN_DENSE) || + (new_location == UDS_LOCATION_IN_SPARSE)); +} + +static void set_chapter_location(struct uds_request *request, + const struct index_zone *zone, u64 virtual_chapter) +{ + request->found = true; + if (virtual_chapter == zone->newest_virtual_chapter) + 
request->location = UDS_LOCATION_IN_OPEN_CHAPTER; + else if (is_zone_chapter_sparse(zone, virtual_chapter)) + request->location = UDS_LOCATION_IN_SPARSE; + else + request->location = UDS_LOCATION_IN_DENSE; +} + +static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request, + u64 virtual_chapter, bool *found) +{ + int result; + struct volume *volume; + u16 record_page_number; + u32 chapter; + + result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter, + &record_page_number); + if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER)) + return result; + + request->virtual_chapter = virtual_chapter; + volume = zone->index->volume; + chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); + return uds_search_cached_record_page(volume, request, chapter, + record_page_number, found); +} + +static int get_record_from_zone(struct index_zone *zone, struct uds_request *request, + bool *found) +{ + struct volume *volume; + + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + *found = true; + return UDS_SUCCESS; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + *found = false; + return UDS_SUCCESS; + } + + if (request->virtual_chapter == zone->newest_virtual_chapter) { + uds_search_open_chapter(zone->open_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + if ((zone->newest_virtual_chapter > 0) && + (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) && + (zone->writing_chapter->size > 0)) { + uds_search_open_chapter(zone->writing_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + volume = zone->index->volume; + if (is_zone_chapter_sparse(zone, request->virtual_chapter) && + uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter, + request->zone_number)) + return search_sparse_cache_in_zone(zone, request, + request->virtual_chapter, found); + + return uds_search_volume_page_cache(volume, request, found); +} + +static int put_record_in_zone(struct index_zone *zone, struct uds_request *request, + const struct uds_record_data *metadata) +{ + unsigned int remaining; + + remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name, + metadata); + if (remaining == 0) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int search_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + bool overflow_record, found = false; + struct uds_record_data *metadata; + u64 chapter; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * If a record has overflowed a chapter index in more than one chapter (or overflowed in + * one chapter and collided with an existing record), it will exist as a collision record + * in the volume index, but we won't find it in the volume. This case needs special + * handling. 
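+	 * (Concretely: the lookup above reports record.is_found &&
+	 * record.is_collision, yet the chapter search leaves found false.)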
+ */ + overflow_record = (record.is_found && record.is_collision && !found); + chapter = zone->newest_virtual_chapter; + if (found || overflow_record) { + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && overflow_record)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + if (record.virtual_chapter != chapter) { + /* + * Update the volume index to reference the new chapter for the block. If + * the record had been deleted or dropped from the chapter index, it will + * be back. + */ + result = uds_set_volume_index_record_chapter(&record, chapter); + } else if (request->type != UDS_UPDATE) { + /* The record is already in the open chapter. */ + return UDS_SUCCESS; + } + } else { + /* + * The record wasn't in the volume index, so check whether the + * name is in a cached sparse chapter. If we found the name on + * a previous search, use that result instead. + */ + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + found = true; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + found = false; + } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) && + !uds_is_volume_index_sample(zone->index->volume_index, + &request->record_name)) { + result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER, + &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_request_location(request, UDS_LOCATION_IN_SPARSE); + + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && !found)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + /* + * Add a new entry to the volume index referencing the open chapter. This needs to + * be done both for new records, and for records from cached sparse chapters. + */ + result = uds_put_volume_index_record(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The volume index encountered a delta list overflow. The condition was already + * logged. We will go on without adding the record to the open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) + return result; + + if (!found || (request->type == UDS_UPDATE)) { + /* This is a new record or we're updating an existing record. */ + metadata = &request->new_metadata; + } else { + /* Move the existing record to the open chapter. */ + metadata = &request->old_metadata; + } + + return put_record_in_zone(zone, request, metadata); +} + +static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (!record.is_found) + return UDS_SUCCESS; + + /* If the request was requeued, check whether the saved state is still valid. */ + + if (record.is_collision) { + set_chapter_location(request, zone, record.virtual_chapter); + } else { + /* Non-collision records are hints, so resolve the name in the chapter. */ + bool found; + + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + + if (!found) { + /* There is no record to remove. 
*/
+			return UDS_SUCCESS;
+		}
+	}
+
+	set_chapter_location(request, zone, record.virtual_chapter);
+
+	/*
+	 * Delete the volume index entry for the named record only. Note that a later search might
+	 * return stale advice if there is a colliding name in the same chapter, but it's a very
+	 * rare case (1 in 2^21).
+	 */
+	result = uds_remove_volume_index_record(&record);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/*
+	 * If the record is in the open chapter, we must remove it or mark it deleted to avoid
+	 * trouble if the record is added again later.
+	 */
+	if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
+		uds_remove_from_open_chapter(zone->open_chapter, &request->record_name);
+
+	return UDS_SUCCESS;
+}
+
+static int dispatch_index_request(struct uds_index *index, struct uds_request *request)
+{
+	int result;
+	struct index_zone *zone = index->zones[request->zone_number];
+
+	if (!request->requeued) {
+		result = simulate_index_zone_barrier_message(zone, request);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	switch (request->type) {
+	case UDS_POST:
+	case UDS_UPDATE:
+	case UDS_QUERY:
+	case UDS_QUERY_NO_UPDATE:
+		result = search_index_zone(zone, request);
+		break;
+
+	case UDS_DELETE:
+		result = remove_from_index_zone(zone, request);
+		break;
+
+	default:
+		result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+						  "invalid request type: %d",
+						  request->type);
+		break;
+	}
+
+	return result;
+}
+
+/* This is the request processing function invoked by each zone's thread. */
+static void execute_zone_request(struct uds_request *request)
+{
+	int result;
+	struct uds_index *index = request->index;
+
+	if (request->zone_message.type != UDS_MESSAGE_NONE) {
+		result = dispatch_index_zone_control_request(request);
+		if (result != UDS_SUCCESS) {
+			uds_log_error_strerror(result, "error executing message: %d",
+					       request->zone_message.type);
+		}
+
+		/* Once the message is processed it can be freed. */
+		uds_free(uds_forget(request));
+		return;
+	}
+
+	index->need_to_save = true;
+	if (request->requeued && (request->status != UDS_SUCCESS)) {
+		set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+		index->callback(request);
+		return;
+	}
+
+	result = dispatch_index_request(index, request);
+	if (result == UDS_QUEUED) {
+		/* The request has been requeued so don't let it complete. */
+		return;
+	}
+
+	if (!request->found)
+		set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+
+	request->status = result;
+	index->callback(request);
+}
+
+static int initialize_index_queues(struct uds_index *index,
+				   const struct index_geometry *geometry)
+{
+	int result;
+	unsigned int i;
+
+	for (i = 0; i < index->zone_count; i++) {
+		result = uds_make_request_queue("indexW", &execute_zone_request,
+						&index->zone_queues[i]);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	/* The triage queue is only needed for sparse multi-zone indexes. */
+	if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) {
+		result = uds_make_request_queue("triageW", &triage_request,
+						&index->triage_queue);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return UDS_SUCCESS;
+}
+
+/* This is the driver function for the chapter writer thread.
*/ +static void close_chapters(void *arg) +{ + int result; + struct chapter_writer *writer = arg; + struct uds_index *index = writer->index; + + uds_log_debug("chapter writer starting"); + mutex_lock(&writer->mutex); + for (;;) { + while (writer->zones_to_write < index->zone_count) { + if (writer->stop && (writer->zones_to_write == 0)) { + /* + * We've been told to stop, and all of the zones are in the same + * open chapter, so we can exit now. + */ + mutex_unlock(&writer->mutex); + uds_log_debug("chapter writer stopping"); + return; + } + uds_wait_cond(&writer->cond, &writer->mutex); + } + + /* + * Release the lock while closing a chapter. We probably don't need to do this, but + * it seems safer in principle. It's OK to access the chapter and chapter_number + * fields without the lock since those aren't allowed to change until we're done. + */ + mutex_unlock(&writer->mutex); + + if (index->has_saved_open_chapter) { + /* + * Remove the saved open chapter the first time we close an open chapter + * after loading from a clean shutdown, or after doing a clean save. The + * lack of the saved open chapter will indicate that a recovery is + * necessary. + */ + index->has_saved_open_chapter = false; + result = uds_discard_open_chapter(index->layout); + if (result == UDS_SUCCESS) + uds_log_debug("Discarding saved open chapter"); + } + + result = uds_close_open_chapter(writer->chapters, index->zone_count, + index->volume, + writer->open_chapter_index, + writer->collated_records, + index->newest_virtual_chapter); + + mutex_lock(&writer->mutex); + index->newest_virtual_chapter++; + index->oldest_virtual_chapter += + uds_chapters_to_expire(index->volume->geometry, + index->newest_virtual_chapter); + writer->result = result; + writer->zones_to_write = 0; + uds_broadcast_cond(&writer->cond); + } +} + +static void stop_chapter_writer(struct chapter_writer *writer) +{ + struct thread *writer_thread = NULL; + + mutex_lock(&writer->mutex); + if (writer->thread != NULL) { + writer_thread = writer->thread; + writer->thread = NULL; + writer->stop = true; + uds_broadcast_cond(&writer->cond); + } + mutex_unlock(&writer->mutex); + + if (writer_thread != NULL) + vdo_join_threads(writer_thread); +} + +static void free_chapter_writer(struct chapter_writer *writer) +{ + if (writer == NULL) + return; + + stop_chapter_writer(writer); + uds_free_open_chapter_index(writer->open_chapter_index); + uds_free(writer->collated_records); + uds_free(writer); +} + +static int make_chapter_writer(struct uds_index *index, + struct chapter_writer **writer_ptr) +{ + int result; + struct chapter_writer *writer; + size_t collated_records_size = + (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); + + result = uds_allocate_extended(struct chapter_writer, index->zone_count, + struct open_chapter_zone *, "Chapter Writer", + &writer); + if (result != UDS_SUCCESS) + return result; + + writer->index = index; + mutex_init(&writer->mutex); + uds_init_cond(&writer->cond); + + result = uds_allocate_cache_aligned(collated_records_size, "collated records", + &writer->collated_records); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + result = uds_make_open_chapter_index(&writer->open_chapter_index, + index->volume->geometry, + index->volume->nonce); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + writer->memory_size = (sizeof(struct chapter_writer) + + index->zone_count * sizeof(struct open_chapter_zone *) + + collated_records_size + + 
writer->open_chapter_index->memory_size); + + result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static int load_index(struct uds_index *index) +{ + int result; + u64 last_save_chapter; + + result = uds_load_index_state(index->layout, index); + if (result != UDS_SUCCESS) + return UDS_INDEX_NOT_SAVED_CLEANLY; + + last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0); + + uds_log_info("loaded index from chapter %llu through chapter %llu", + (unsigned long long) index->oldest_virtual_chapter, + (unsigned long long) last_save_chapter); + + return UDS_SUCCESS; +} + +static int rebuild_index_page_map(struct uds_index *index, u64 vcn) +{ + int result; + struct delta_index_page *chapter_index_page; + struct index_geometry *geometry = index->volume->geometry; + u32 chapter = uds_map_to_physical_chapter(geometry, vcn); + u32 expected_list_number = 0; + u32 index_page_number; + u32 lowest_delta_list; + u32 highest_delta_list; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + result = uds_get_volume_index_page(index->volume, chapter, + index_page_number, + &chapter_index_page); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "failed to read index page %u in chapter %u", + index_page_number, chapter); + } + + lowest_delta_list = chapter_index_page->lowest_list_number; + highest_delta_list = chapter_index_page->highest_list_number; + if (lowest_delta_list != expected_list_number) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, index_page_number); + } + + uds_update_index_page_map(index->volume->index_page_map, vcn, chapter, + index_page_number, highest_delta_list); + expected_list_number = highest_delta_list + 1; + } + + return UDS_SUCCESS; +} + +static int replay_record(struct uds_index *index, const struct uds_record_name *name, + u64 virtual_chapter, bool will_be_sparse_chapter) +{ + int result; + struct volume_index_record record; + bool update_record; + + if (will_be_sparse_chapter && + !uds_is_volume_index_sample(index->volume_index, name)) { + /* + * This entry will be in a sparse chapter after the rebuild completes, and it is + * not a sample, so just skip over it. + */ + return UDS_SUCCESS; + } + + result = uds_get_volume_index_record(index->volume_index, name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (record.is_collision) { + if (record.virtual_chapter == virtual_chapter) { + /* The record is already correct. */ + return UDS_SUCCESS; + } + + update_record = true; + } else if (record.virtual_chapter == virtual_chapter) { + /* + * There is a volume index entry pointing to the current chapter, but we + * don't know if it is for the same name as the one we are currently + * working on or not. For now, we're just going to assume that it isn't. + * This will create one extra collision record if there was a deleted + * record in the current chapter. + */ + update_record = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if the + * record exists, since we will likely have just read the record from disk + * (i.e. we know it's there). The exception to this is when we find an + * entry in the volume index that has a different chapter. 
In this case, we + * need to search that chapter to determine if the volume index entry was + * for the same record or a different one. + */ + result = uds_search_volume_page_cache_for_rebuild(index->volume, + name, + record.virtual_chapter, + &update_record); + if (result != UDS_SUCCESS) + return result; + } + } else { + update_record = false; + } + + if (update_record) { + /* + * Update the volume index to reference the new chapter for the block. If the + * record had been deleted or dropped from the chapter index, it will be back. + */ + result = uds_set_volume_index_record_chapter(&record, virtual_chapter); + } else { + /* + * Add a new entry to the volume index referencing the open chapter. This should be + * done regardless of whether we are a brand new record or a sparse record, i.e. + * one that doesn't exist in the index but does on disk, since for a sparse record, + * we would want to un-sparsify if it did exist. + */ + result = uds_put_volume_index_record(&record, virtual_chapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* The rebuilt index will lose these records. */ + return UDS_SUCCESS; + } + + return result; +} + +static bool check_for_suspend(struct uds_index *index) +{ + bool closing; + + if (index->load_context == NULL) + return false; + + mutex_lock(&index->load_context->mutex); + if (index->load_context->status != INDEX_SUSPENDING) { + mutex_unlock(&index->load_context->mutex); + return false; + } + + /* Notify that we are suspended and wait for the resume. */ + index->load_context->status = INDEX_SUSPENDED; + uds_broadcast_cond(&index->load_context->cond); + + while ((index->load_context->status != INDEX_OPENING) && + (index->load_context->status != INDEX_FREEING)) + uds_wait_cond(&index->load_context->cond, &index->load_context->mutex); + + closing = (index->load_context->status == INDEX_FREEING); + mutex_unlock(&index->load_context->mutex); + return closing; +} + +static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse) +{ + int result; + u32 i; + u32 j; + const struct index_geometry *geometry; + u32 physical_chapter; + + if (check_for_suspend(index)) { + uds_log_info("Replay interrupted by index shutdown at chapter %llu", + (unsigned long long) virtual); + return -EBUSY; + } + + geometry = index->volume->geometry; + physical_chapter = uds_map_to_physical_chapter(geometry, virtual); + uds_prefetch_volume_chapter(index->volume, physical_chapter); + uds_set_volume_index_open_chapter(index->volume_index, virtual); + + result = rebuild_index_page_map(index, virtual); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "could not rebuild index page map for chapter %u", + physical_chapter); + } + + for (i = 0; i < geometry->record_pages_per_chapter; i++) { + u8 *record_page; + u32 record_page_number; + + record_page_number = geometry->index_pages_per_chapter + i; + result = uds_get_volume_record_page(index->volume, physical_chapter, + record_page_number, &record_page); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, "could not get page %d", + record_page_number); + } + + for (j = 0; j < geometry->records_per_page; j++) { + const u8 *name_bytes; + struct uds_record_name name; + + name_bytes = record_page + (j * BYTES_PER_RECORD); + memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE); + result = replay_record(index, &name, virtual, sparse); + if (result != UDS_SUCCESS) + return result; + } + } + + return UDS_SUCCESS; +} + +static int replay_volume(struct uds_index *index) +{ 
+ int result; + u64 old_map_update; + u64 new_map_update; + u64 virtual; + u64 from_virtual = index->oldest_virtual_chapter; + u64 upto_virtual = index->newest_virtual_chapter; + bool will_be_sparse; + + uds_log_info("Replaying volume from chapter %llu through chapter %llu", + (unsigned long long) from_virtual, + (unsigned long long) upto_virtual); + + /* + * The index failed to load, so the volume index is empty. Add records to the volume index + * in order, skipping non-hooks in chapters which will be sparse to save time. + * + * Go through each record page of each chapter and add the records back to the volume + * index. This should not cause anything to be written to either the open chapter or the + * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this + * would have already been purged from the volume index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the index page map. + */ + old_map_update = index->volume->index_page_map->last_update; + for (virtual = from_virtual; virtual < upto_virtual; virtual++) { + will_be_sparse = uds_is_chapter_sparse(index->volume->geometry, + from_virtual, upto_virtual, + virtual); + result = replay_chapter(index, virtual, will_be_sparse); + if (result != UDS_SUCCESS) + return result; + } + + /* Also reap the chapter being replaced by the open chapter. */ + uds_set_volume_index_open_chapter(index->volume_index, upto_virtual); + + new_map_update = index->volume->index_page_map->last_update; + if (new_map_update != old_map_update) { + uds_log_info("replay changed index page map update from %llu to %llu", + (unsigned long long) old_map_update, + (unsigned long long) new_map_update); + } + + return UDS_SUCCESS; +} + +static int rebuild_index(struct uds_index *index) +{ + int result; + u64 lowest; + u64 highest; + bool is_empty = false; + u32 chapters_per_volume = index->volume->geometry->chapters_per_volume; + + index->volume->lookup_mode = LOOKUP_FOR_REBUILD; + result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest, + &is_empty); + if (result != UDS_SUCCESS) { + return uds_log_fatal_strerror(result, + "cannot rebuild index: unknown volume chapter boundaries"); + } + + if (is_empty) { + index->newest_virtual_chapter = 0; + index->oldest_virtual_chapter = 0; + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; + } + + index->newest_virtual_chapter = highest + 1; + index->oldest_virtual_chapter = lowest; + if (index->newest_virtual_chapter == + (index->oldest_virtual_chapter + chapters_per_volume)) { + /* Skip the chapter shadowed by the open chapter. 
*/ + index->oldest_virtual_chapter++; + } + + result = replay_volume(index); + if (result != UDS_SUCCESS) + return result; + + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; +} + +static void free_index_zone(struct index_zone *zone) +{ + if (zone == NULL) + return; + + uds_free_open_chapter(zone->open_chapter); + uds_free_open_chapter(zone->writing_chapter); + uds_free(zone); +} + +static int make_index_zone(struct uds_index *index, unsigned int zone_number) +{ + int result; + struct index_zone *zone; + + result = uds_allocate(1, struct index_zone, "index zone", &zone); + if (result != UDS_SUCCESS) + return result; + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->open_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->writing_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + zone->index = index; + zone->id = zone_number; + index->zones[zone_number] = zone; + + return UDS_SUCCESS; +} + +int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type, + struct index_load_context *load_context, index_callback_fn callback, + struct uds_index **new_index) +{ + int result; + bool loaded = false; + bool new = (open_type == UDS_CREATE); + struct uds_index *index = NULL; + struct index_zone *zone; + u64 nonce; + unsigned int z; + + result = uds_allocate_extended(struct uds_index, config->zone_count, + struct uds_request_queue *, "index", &index); + if (result != UDS_SUCCESS) + return result; + + index->zone_count = config->zone_count; + + result = uds_make_index_layout(config, new, &index->layout); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = uds_allocate(index->zone_count, struct index_zone *, "zones", + &index->zones); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = uds_make_volume(config, index->layout, &index->volume); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + index->volume->lookup_mode = LOOKUP_NORMAL; + for (z = 0; z < index->zone_count; z++) { + result = make_index_zone(index, z); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return uds_log_error_strerror(result, + "Could not create index zone"); + } + } + + nonce = uds_get_volume_nonce(index->layout); + result = uds_make_volume_index(config, nonce, &index->volume_index); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return uds_log_error_strerror(result, "could not make volume index"); + } + + index->load_context = load_context; + index->callback = callback; + + result = initialize_index_queues(index, config->geometry); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = make_chapter_writer(index, &index->chapter_writer); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + if (!new) { + result = load_index(index); + switch (result) { + case UDS_SUCCESS: + loaded = true; + break; + case -ENOMEM: + /* We should not try a rebuild for this error. 
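+			 * (A rebuild would presumably run into the same
+			 * allocation failure.)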
*/ + uds_log_error_strerror(result, "index could not be loaded"); + break; + default: + uds_log_error_strerror(result, "index could not be loaded"); + if (open_type == UDS_LOAD) { + result = rebuild_index(index); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "index could not be rebuilt"); + } + } + break; + } + } + + if (result != UDS_SUCCESS) { + uds_free_index(index); + return uds_log_error_strerror(result, "fatal error in %s()", __func__); + } + + for (z = 0; z < index->zone_count; z++) { + zone = index->zones[z]; + zone->oldest_virtual_chapter = index->oldest_virtual_chapter; + zone->newest_virtual_chapter = index->newest_virtual_chapter; + } + + if (index->load_context != NULL) { + mutex_lock(&index->load_context->mutex); + index->load_context->status = INDEX_READY; + /* + * If we get here, suspend is meaningless, but notify any thread trying to suspend + * us so it doesn't hang. + */ + uds_broadcast_cond(&index->load_context->cond); + mutex_unlock(&index->load_context->mutex); + } + + index->has_saved_open_chapter = loaded; + index->need_to_save = !loaded; + *new_index = index; + return UDS_SUCCESS; +} + +void uds_free_index(struct uds_index *index) +{ + unsigned int i; + + if (index == NULL) + return; + + uds_request_queue_finish(index->triage_queue); + for (i = 0; i < index->zone_count; i++) + uds_request_queue_finish(index->zone_queues[i]); + + free_chapter_writer(index->chapter_writer); + + uds_free_volume_index(index->volume_index); + if (index->zones != NULL) { + for (i = 0; i < index->zone_count; i++) + free_index_zone(index->zones[i]); + uds_free(index->zones); + } + + uds_free_volume(index->volume); + uds_free_index_layout(uds_forget(index->layout)); + uds_free(index); +} + +/* Wait for the chapter writer to complete any outstanding writes. */ +void uds_wait_for_idle_index(struct uds_index *index) +{ + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + while (writer->zones_to_write > 0) + uds_wait_cond(&writer->cond, &writer->mutex); + mutex_unlock(&writer->mutex); +} + +/* This function assumes that all requests have been drained. */ +int uds_save_index(struct uds_index *index) +{ + int result; + + if (!index->need_to_save) + return UDS_SUCCESS; + + uds_wait_for_idle_index(index); + index->prev_save = index->last_save; + index->last_save = ((index->newest_virtual_chapter == 0) ? + NO_LAST_SAVE : index->newest_virtual_chapter - 1); + uds_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save); + + result = uds_save_index_state(index->layout, index); + if (result != UDS_SUCCESS) { + uds_log_info("save index failed"); + index->last_save = index->prev_save; + } else { + index->has_saved_open_chapter = true; + index->need_to_save = false; + uds_log_info("finished save (vcn %llu)", + (unsigned long long) index->last_save); + } + + return result; +} + +int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev) +{ + return uds_replace_volume_storage(index->volume, index->layout, bdev); +} + +/* Accessing statistics should be safe from any thread. 
*/ +void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters) +{ + struct volume_index_stats stats; + + uds_get_volume_index_stats(index->volume_index, &stats); + counters->entries_indexed = stats.record_count; + counters->collisions = stats.collision_count; + counters->entries_discarded = stats.discard_count; + + counters->memory_used = (index->volume_index->memory_size + + index->volume->cache_size + + index->chapter_writer->memory_size); +} + +void uds_enqueue_request(struct uds_request *request, enum request_stage stage) +{ + struct uds_index *index = request->index; + struct uds_request_queue *queue; + + switch (stage) { + case STAGE_TRIAGE: + if (index->triage_queue != NULL) { + queue = index->triage_queue; + break; + } + + fallthrough; + + case STAGE_INDEX: + request->zone_number = + uds_get_volume_index_zone(index->volume_index, &request->record_name); + fallthrough; + + case STAGE_MESSAGE: + queue = index->zone_queues[request->zone_number]; + break; + + default: + ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage); + return; + } + + uds_request_queue_enqueue(queue, request); +} diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h new file mode 100644 index 0000000000000..edabb239548ec --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_H +#define UDS_INDEX_H + +#include "index-layout.h" +#include "index-session.h" +#include "open-chapter.h" +#include "volume.h" +#include "volume-index.h" + +/* + * The index is a high-level structure which represents the totality of the UDS index. It manages + * the queues for incoming requests and dispatches them to the appropriate sub-components like the + * volume or the volume index. It also manages administrative tasks such as saving and loading the + * index. + * + * The index is divided into a number of independent zones and assigns each request to a zone based + * on its name. Most sub-components are similarly divided into zones as well so that requests in + * each zone usually operate without interference or coordination between zones. 
+ */
+
+typedef void (*index_callback_fn)(struct uds_request *request);
+
+struct index_zone {
+	struct uds_index *index;
+	struct open_chapter_zone *open_chapter;
+	struct open_chapter_zone *writing_chapter;
+	u64 oldest_virtual_chapter;
+	u64 newest_virtual_chapter;
+	unsigned int id;
+};
+
+struct uds_index {
+	bool has_saved_open_chapter;
+	bool need_to_save;
+	struct index_load_context *load_context;
+	struct index_layout *layout;
+	struct volume_index *volume_index;
+	struct volume *volume;
+	unsigned int zone_count;
+	struct index_zone **zones;
+
+	u64 oldest_virtual_chapter;
+	u64 newest_virtual_chapter;
+
+	u64 last_save;
+	u64 prev_save;
+	struct chapter_writer *chapter_writer;
+
+	index_callback_fn callback;
+	struct uds_request_queue *triage_queue;
+	struct uds_request_queue *zone_queues[];
+};
+
+enum request_stage {
+	STAGE_TRIAGE,
+	STAGE_INDEX,
+	STAGE_MESSAGE,
+};
+
+int __must_check uds_make_index(struct uds_configuration *config,
+				enum uds_open_index_type open_type,
+				struct index_load_context *load_context,
+				index_callback_fn callback, struct uds_index **new_index);
+
+int __must_check uds_save_index(struct uds_index *index);
+
+void uds_free_index(struct uds_index *index);
+
+int __must_check uds_replace_index_storage(struct uds_index *index,
+					   struct block_device *bdev);
+
+void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters);
+
+void uds_enqueue_request(struct uds_request *request, enum request_stage stage);
+
+void uds_wait_for_idle_index(struct uds_index *index);
+
+#endif /* UDS_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
new file mode 100644
index 0000000000000..3744aaf625b05
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -0,0 +1,353 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef INDEXER_H
+#define INDEXER_H
+
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "funnel-queue.h"
+
+/*
+ * UDS public API
+ *
+ * The Universal Deduplication System (UDS) is an efficient name-value store. When used for
+ * deduplicating storage, the names are generally hashes of data blocks and the associated data
+ * is where that block is located on the underlying storage medium. The stored names are expected
+ * to be randomly distributed among the space of possible names. If this assumption is violated,
+ * the UDS index will store fewer names than normal but will otherwise continue to work. The data
+ * associated with each name can be any 16-byte value.
+ *
+ * A client must first create an index session to interact with an index. Once created, the
+ * session can be shared among multiple threads or users. When a session is destroyed, it will
+ * also close and save any associated index.
+ *
+ * To make a request, a client must allocate a uds_request structure and set the required fields
+ * before launching it. UDS will invoke the provided callback to complete the request. After the
+ * callback has been called, the uds_request structure can be freed or reused for a new request.
+ * There are five types of requests:
+ *
+ * A UDS_UPDATE request will associate the provided name with the provided data. Any previous
+ * data associated with that name will be discarded.
+ *
+ * A UDS_QUERY request will return the data associated with the provided name, if any. The entry
+ * for the name will also be marked as most recent, as if the data had been updated.
+ *
+ * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE.
If there is already data + * associated with the provided name, that data is returned. If there is no existing association, + * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY + * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient. + * + * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will + * not change the recency of the entry for the name. This request is primarily useful for testing, + * to determine whether an entry exists without changing the internal state of the index. + * + * A UDS_DELETE request removes any data associated with the provided name. This operation is + * generally not necessary, because the index will automatically discard its oldest entries once it + * becomes full. + */ + +/* General UDS constants and structures */ + +enum uds_request_type { + /* Create or update the mapping for a name, and make the name most recent. */ + UDS_UPDATE, + + /* Return any mapped data for a name, and make the name most recent. */ + UDS_QUERY, + + /* + * Return any mapped data for a name, or map the provided data to the name if there is no + * current data, and make the name most recent. + */ + UDS_POST, + + /* Return any mapped data for a name without updating its recency. */ + UDS_QUERY_NO_UPDATE, + + /* Remove any mapping for a name. */ + UDS_DELETE, + +}; + +enum uds_open_index_type { + /* Create a new index. */ + UDS_CREATE, + + /* Load an existing index and try to recover if necessary. */ + UDS_LOAD, + + /* Load an existing index, but only if it was saved cleanly. */ + UDS_NO_REBUILD, +}; + +enum { + /* The record name size in bytes */ + UDS_RECORD_NAME_SIZE = 16, + /* The maximum record data size in bytes */ + UDS_RECORD_DATA_SIZE = 16, +}; + +/* + * A type representing a UDS memory configuration which is either a positive integer number of + * gigabytes or one of the six special constants for configurations smaller than one gigabyte. 
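+ *
+ * For example (a sketch only, ignoring the UDS_MEMORY_CONFIG_REDUCED flag; the helper below is
+ * hypothetical and not part of this interface), the value can be interpreted as follows:
+ *
+ *	static u64 sketch_config_to_bytes(uds_memory_config_size_t size)
+ *	{
+ *		if (size < 0)
+ *			return (u64) (-size) << 20;	/* e.g. -256 means 256 MB */
+ *		return (u64) size << 30;	/* a positive value is a GB count */
+ *	}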
+ */
+typedef int uds_memory_config_size_t;
+
+enum {
+	/* The maximum configurable amount of memory */
+	UDS_MEMORY_CONFIG_MAX = 1024,
+	/* Flag indicating that the index has one less chapter than usual */
+	UDS_MEMORY_CONFIG_REDUCED = 0x1000,
+	UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED,
+	/* Special values indicating sizes less than 1 GB */
+	UDS_MEMORY_CONFIG_256MB = -256,
+	UDS_MEMORY_CONFIG_512MB = -512,
+	UDS_MEMORY_CONFIG_768MB = -768,
+	UDS_MEMORY_CONFIG_REDUCED_256MB = -1280,
+	UDS_MEMORY_CONFIG_REDUCED_512MB = -1536,
+	UDS_MEMORY_CONFIG_REDUCED_768MB = -1792,
+};
+
+struct uds_record_name {
+	unsigned char name[UDS_RECORD_NAME_SIZE];
+};
+
+struct uds_record_data {
+	unsigned char data[UDS_RECORD_DATA_SIZE];
+};
+
+struct uds_volume_record {
+	struct uds_record_name name;
+	struct uds_record_data data;
+};
+
+struct uds_parameters {
+	/* The block_device used for storage */
+	struct block_device *bdev;
+	/* The maximum allowable size of the index on storage */
+	size_t size;
+	/* The offset where the index should start */
+	off_t offset;
+	/* The maximum memory allocation, in GB */
+	uds_memory_config_size_t memory_size;
+	/* Whether the index should include sparse chapters */
+	bool sparse;
+	/* A 64-bit nonce to validate the index */
+	u64 nonce;
+	/* The number of threads used to process index requests */
+	unsigned int zone_count;
+	/* The number of threads used to read volume pages */
+	unsigned int read_threads;
+};
+
+/*
+ * These statistics capture characteristics of the current index, including resource usage and
+ * requests processed since the index was opened.
+ */
+struct uds_index_stats {
+	/* The total number of records stored in the index */
+	u64 entries_indexed;
+	/* An estimate of the index's memory usage, in bytes */
+	u64 memory_used;
+	/* The number of collisions recorded in the volume index */
+	u64 collisions;
+	/* The number of entries discarded from the index since startup */
+	u64 entries_discarded;
+	/* The time at which these statistics were fetched */
+	s64 current_time;
+	/* The number of post calls that found an existing entry */
+	u64 posts_found;
+	/* The number of post calls that added an entry */
+	u64 posts_not_found;
+	/*
+	 * The number of post calls that found an existing entry that is current enough to only
+	 * exist in memory and not have been committed to disk yet
+	 */
+	u64 in_memory_posts_found;
+	/*
+	 * The number of post calls that found an existing entry in the dense portion of the
+	 * index
+	 */
+	u64 dense_posts_found;
+	/*
+	 * The number of post calls that found an existing entry in the sparse portion of the
+	 * index
+	 */
+	u64 sparse_posts_found;
+	/* The number of update calls that updated an existing entry */
+	u64 updates_found;
+	/* The number of update calls that added a new entry */
+	u64 updates_not_found;
+	/* The number of delete requests that deleted an existing entry */
+	u64 deletions_found;
+	/* The number of delete requests that did nothing */
+	u64 deletions_not_found;
+	/* The number of query calls that found an existing entry */
+	u64 queries_found;
+	/* The number of query calls that did not find an entry */
+	u64 queries_not_found;
+	/* The total number of requests processed */
+	u64 requests;
+};
+
+enum uds_index_region {
+	/* No location information has been determined */
+	UDS_LOCATION_UNKNOWN = 0,
+	/* The index page entry has been found */
+	UDS_LOCATION_INDEX_PAGE_LOOKUP,
+	/* The record page entry has been found */
+	UDS_LOCATION_RECORD_PAGE_LOOKUP,
+	/* The record is not in the index */
+	UDS_LOCATION_UNAVAILABLE,
+	/* The record was found in the open chapter */
+	UDS_LOCATION_IN_OPEN_CHAPTER,
+	/* The record was found in the dense part of the index */
+	UDS_LOCATION_IN_DENSE,
+	/* The record was found in the sparse part of the index */
+	UDS_LOCATION_IN_SPARSE,
+} __packed;
+
+/* Zone message requests are used to communicate between index zones. */
+enum uds_zone_message_type {
+	/* A standard request with no message */
+	UDS_MESSAGE_NONE = 0,
+	/* Add a chapter to the sparse chapter index cache */
+	UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+	/* Close a chapter to keep the zone from falling behind */
+	UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
+} __packed;
+
+struct uds_zone_message {
+	/* The type of message, determining how it will be processed */
+	enum uds_zone_message_type type;
+	/* The virtual chapter number to which the message applies */
+	u64 virtual_chapter;
+};
+
+struct uds_index_session;
+struct uds_index;
+struct uds_request;
+
+/* Once this callback has been invoked, the uds_request structure can be reused or freed. */
+typedef void (*uds_request_callback_fn)(struct uds_request *request);
+
+struct uds_request {
+	/* These input fields must be set before launching a request. */
+
+	/* The name of the record to look up or create */
+	struct uds_record_name record_name;
+	/* New data to associate with the record name, if applicable */
+	struct uds_record_data new_metadata;
+	/* A callback to invoke when the request is complete */
+	uds_request_callback_fn callback;
+	/* The index session that will manage this request */
+	struct uds_index_session *session;
+	/* The type of operation to perform, as described above */
+	enum uds_request_type type;
+
+	/* These output fields are set when a request is complete. */
+
+	/* The existing data associated with the request name, if any */
+	struct uds_record_data old_metadata;
+	/* Either UDS_SUCCESS or an error code for the request */
+	int status;
+	/* True if the record name had an existing entry in the index */
+	bool found;
+
+	/*
+	 * The remaining fields are used internally and should not be altered by clients. The
+	 * index relies on zone_number being the first field in this section.
+	 */
+
+	/* The number of the zone which will process this request */
+	unsigned int zone_number;
+	/* A link for adding a request to a lock-free queue */
+	struct funnel_queue_entry queue_link;
+	/* A link for adding a request to a standard linked list */
+	struct uds_request *next_request;
+	/* A pointer to the index processing this request */
+	struct uds_index *index;
+	/* Control message for coordinating between zones */
+	struct uds_zone_message zone_message;
+	/* If true, process request immediately by waking the worker thread */
+	bool unbatched;
+	/* If true, continue this request before processing newer requests */
+	bool requeued;
+	/* The virtual chapter containing the record name, if known */
+	u64 virtual_chapter;
+	/* The region of the index containing the record name */
+	enum uds_index_region location;
+};
+
+/* Compute the number of bytes needed to store an index. */
+int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
+					u64 *index_size);
+
+/* A session is required for most index operations. */
+int __must_check uds_create_index_session(struct uds_index_session **session);
+
+/* Destroying an index session also closes and saves the associated index. */
+int uds_destroy_index_session(struct uds_index_session *session);
+
+/*
+ * Create or open an index with an existing session.
This operation fails if the index session is + * suspended, or if there is already an open index. + */ +int __must_check uds_open_index(enum uds_open_index_type open_type, + const struct uds_parameters *parameters, + struct uds_index_session *session); + +/* + * Wait until all callbacks for index operations are complete, and prevent new index operations + * from starting. New index operations will fail with EBUSY until the session is resumed. Also + * optionally saves the index. + */ +int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save); + +/* + * Allow new index operations for an index, whether it was suspended or not. If the index is + * suspended and the supplied block device differs from the current backing store, the index will + * start using the new backing store instead. + */ +int __must_check uds_resume_index_session(struct uds_index_session *session, + struct block_device *bdev); + +/* Wait until all outstanding index operations are complete. */ +int __must_check uds_flush_index_session(struct uds_index_session *session); + +/* Close an index. This operation fails if the index session is suspended. */ +int __must_check uds_close_index(struct uds_index_session *session); + +/* Get index statistics since the last time the index was opened. */ +int __must_check uds_get_index_session_stats(struct uds_index_session *session, + struct uds_index_stats *stats); + +/* This function will fail if any required field of the request is not set. */ +int __must_check uds_launch_request(struct uds_request *request); + +struct cond_var { + wait_queue_head_t wait_queue; +}; + +static inline void uds_init_cond(struct cond_var *cv) +{ + init_waitqueue_head(&cv->wait_queue); +} + +static inline void uds_signal_cond(struct cond_var *cv) +{ + wake_up(&cv->wait_queue); +} + +static inline void uds_broadcast_cond(struct cond_var *cv) +{ + wake_up_all(&cv->wait_queue); +} + +void uds_wait_cond(struct cond_var *cv, struct mutex *mutex); + +#endif /* INDEXER_H */ diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c new file mode 100644 index 0000000000000..02242df94e373 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "io-factory.h" + +#include +#include +#include +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" + +/* + * The I/O factory object manages access to index storage, which is a contiguous range of blocks on + * a block device. + * + * The factory holds the open device and is responsible for closing it. The factory has methods to + * make helper structures that can be used to access sections of the index. + */ +struct io_factory { + struct block_device *bdev; + atomic_t ref_count; +}; + +/* The buffered reader allows efficient I/O by reading page-sized segments into a buffer. */ +struct buffered_reader { + struct io_factory *factory; + struct dm_bufio_client *client; + struct dm_buffer *buffer; + sector_t limit; + sector_t block_number; + u8 *start; + u8 *end; +}; + +enum { MAX_READ_AHEAD_BLOCKS = 4 }; + +/* + * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments + * to storage. 
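+ *
+ * A usage sketch, assuming an existing io_factory 'factory' and a region of 'block_count'
+ * blocks (error handling elided):
+ *
+ *	struct buffered_writer *writer;
+ *	u8 data[128] = { 0 };
+ *
+ *	uds_make_buffered_writer(factory, 0, block_count, &writer);
+ *	uds_write_to_buffered_writer(writer, data, sizeof(data));
+ *	uds_flush_buffered_writer(writer);
+ *	uds_free_buffered_writer(writer);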
+ */ +struct buffered_writer { + struct io_factory *factory; + struct dm_bufio_client *client; + struct dm_buffer *buffer; + sector_t limit; + sector_t block_number; + u8 *start; + u8 *end; + int error; +}; + +static void uds_get_io_factory(struct io_factory *factory) +{ + atomic_inc(&factory->ref_count); +} + +int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr) +{ + int result; + struct io_factory *factory; + + result = uds_allocate(1, struct io_factory, __func__, &factory); + if (result != UDS_SUCCESS) + return result; + + factory->bdev = bdev; + atomic_set_release(&factory->ref_count, 1); + + *factory_ptr = factory; + return UDS_SUCCESS; +} + +int uds_replace_storage(struct io_factory *factory, struct block_device *bdev) +{ + factory->bdev = bdev; + return UDS_SUCCESS; +} + +/* Free an I/O factory once all references have been released. */ +void uds_put_io_factory(struct io_factory *factory) +{ + if (atomic_add_return(-1, &factory->ref_count) <= 0) + uds_free(factory); +} + +size_t uds_get_writable_size(struct io_factory *factory) +{ + return i_size_read(factory->bdev->bd_inode); +} + +/* Create a struct dm_bufio_client for an index region starting at offset. */ +int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size, + unsigned int reserved_buffers, struct dm_bufio_client **client_ptr) +{ + struct dm_bufio_client *client; + + client = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0, + NULL, NULL, 0); + if (IS_ERR(client)) + return -PTR_ERR(client); + + dm_bufio_set_sector_offset(client, block_offset * SECTORS_PER_BLOCK); + *client_ptr = client; + return UDS_SUCCESS; +} + +static void read_ahead(struct buffered_reader *reader, sector_t block_number) +{ + if (block_number < reader->limit) { + sector_t read_ahead = min((sector_t) MAX_READ_AHEAD_BLOCKS, + reader->limit - block_number); + + dm_bufio_prefetch(reader->client, block_number, read_ahead); + } +} + +void uds_free_buffered_reader(struct buffered_reader *reader) +{ + if (reader == NULL) + return; + + if (reader->buffer != NULL) + dm_bufio_release(reader->buffer); + + dm_bufio_client_destroy(reader->client); + uds_put_io_factory(reader->factory); + uds_free(reader); +} + +/* Create a buffered reader for an index region starting at offset. 
*/ +int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count, + struct buffered_reader **reader_ptr) +{ + int result; + struct dm_bufio_client *client = NULL; + struct buffered_reader *reader = NULL; + + result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(1, struct buffered_reader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + return result; + } + + *reader = (struct buffered_reader) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_count, + .block_number = 0, + .start = NULL, + .end = NULL, + }; + + read_ahead(reader, 0); + uds_get_io_factory(factory); + *reader_ptr = reader; + return UDS_SUCCESS; +} + +static int position_reader(struct buffered_reader *reader, sector_t block_number, + off_t offset) +{ + struct dm_buffer *buffer = NULL; + void *data; + + if ((reader->end == NULL) || (block_number != reader->block_number)) { + if (block_number >= reader->limit) + return UDS_OUT_OF_RANGE; + + if (reader->buffer != NULL) + dm_bufio_release(uds_forget(reader->buffer)); + + data = dm_bufio_read(reader->client, block_number, &buffer); + if (IS_ERR(data)) + return -PTR_ERR(data); + + reader->buffer = buffer; + reader->start = data; + if (block_number == reader->block_number + 1) + read_ahead(reader, block_number + 1); + } + + reader->block_number = block_number; + reader->end = reader->start + offset; + return UDS_SUCCESS; +} + +static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader) +{ + return (reader->end == NULL) ? 0 : reader->start + UDS_BLOCK_SIZE - reader->end; +} + +static int reset_reader(struct buffered_reader *reader) +{ + sector_t block_number; + + if (bytes_remaining_in_read_buffer(reader) > 0) + return UDS_SUCCESS; + + block_number = reader->block_number; + if (reader->end != NULL) + block_number++; + + return position_reader(reader, block_number, 0); +} + +int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, + size_t length) +{ + int result = UDS_SUCCESS; + size_t chunk_size; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) + return result; + + chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); + memcpy(data, reader->end, chunk_size); + length -= chunk_size; + data += chunk_size; + reader->end += chunk_size; + } + + return UDS_SUCCESS; +} + +/* + * Verify that the next data on the reader matches the required value. If the value matches, the + * matching contents are consumed. If the value does not match, the reader state is unchanged. + */ +int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, + size_t length) +{ + int result = UDS_SUCCESS; + size_t chunk_size; + sector_t start_block_number = reader->block_number; + int start_offset = reader->end - reader->start; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_DATA; + break; + } + + chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); + if (memcmp(value, reader->end, chunk_size) != 0) { + result = UDS_CORRUPT_DATA; + break; + } + + length -= chunk_size; + value += chunk_size; + reader->end += chunk_size; + } + + if (result != UDS_SUCCESS) + position_reader(reader, start_block_number, start_offset); + + return result; +} + +/* Create a buffered writer for an index region starting at offset. 
*/ +int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count, + struct buffered_writer **writer_ptr) +{ + int result; + struct dm_bufio_client *client = NULL; + struct buffered_writer *writer; + + result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(1, struct buffered_writer, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + return result; + } + + *writer = (struct buffered_writer) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_count, + .start = NULL, + .end = NULL, + .block_number = 0, + .error = UDS_SUCCESS, + }; + + uds_get_io_factory(factory); + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static size_t get_remaining_write_space(struct buffered_writer *writer) +{ + return writer->start + UDS_BLOCK_SIZE - writer->end; +} + +static int __must_check prepare_next_buffer(struct buffered_writer *writer) +{ + struct dm_buffer *buffer = NULL; + void *data; + + if (writer->block_number >= writer->limit) { + writer->error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + data = dm_bufio_new(writer->client, writer->block_number, &buffer); + if (IS_ERR(data)) { + writer->error = -PTR_ERR(data); + return writer->error; + } + + writer->buffer = buffer; + writer->start = data; + writer->end = data; + return UDS_SUCCESS; +} + +static int flush_previous_buffer(struct buffered_writer *writer) +{ + size_t available; + + if (writer->buffer == NULL) + return writer->error; + + if (writer->error == UDS_SUCCESS) { + available = get_remaining_write_space(writer); + + if (available > 0) + memset(writer->end, 0, available); + + dm_bufio_mark_buffer_dirty(writer->buffer); + } + + dm_bufio_release(writer->buffer); + writer->buffer = NULL; + writer->start = NULL; + writer->end = NULL; + writer->block_number++; + return writer->error; +} + +void uds_free_buffered_writer(struct buffered_writer *writer) +{ + int result; + + if (writer == NULL) + return; + + flush_previous_buffer(writer); + result = -dm_bufio_write_dirty_buffers(writer->client); + if (result != UDS_SUCCESS) + uds_log_warning_strerror(result, "%s: failed to sync storage", __func__); + + dm_bufio_client_destroy(writer->client); + uds_put_io_factory(writer->factory); + uds_free(writer); +} + +/* + * Append data to the buffer, writing as needed. If no data is provided, zeros are written instead. + * If a write error occurs, it is recorded and returned on every subsequent write attempt. 
+ */ +int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data, + size_t length) +{ + int result = writer->error; + size_t chunk_size; + + while ((length > 0) && (result == UDS_SUCCESS)) { + if (writer->buffer == NULL) { + result = prepare_next_buffer(writer); + continue; + } + + chunk_size = min(length, get_remaining_write_space(writer)); + if (data == NULL) { + memset(writer->end, 0, chunk_size); + } else { + memcpy(writer->end, data, chunk_size); + data += chunk_size; + } + + length -= chunk_size; + writer->end += chunk_size; + + if (get_remaining_write_space(writer) == 0) + result = uds_flush_buffered_writer(writer); + } + + return result; +} + +int uds_flush_buffered_writer(struct buffered_writer *writer) +{ + if (writer->error != UDS_SUCCESS) + return writer->error; + + return flush_previous_buffer(writer); +} diff --git a/drivers/md/dm-vdo/indexer/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h new file mode 100644 index 0000000000000..7fb5a0616a791 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/io-factory.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_IO_FACTORY_H +#define UDS_IO_FACTORY_H + +#include + +/* + * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main + * clients are the index layout and the volume. The buffered reader and buffered writer interfaces + * are helpers for accessing data in a contiguous range of storage blocks. + */ + +struct buffered_reader; +struct buffered_writer; + +struct io_factory; + +enum { + UDS_BLOCK_SIZE = 4096, + SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT, +}; + +int __must_check uds_make_io_factory(struct block_device *bdev, + struct io_factory **factory_ptr); + +int __must_check uds_replace_storage(struct io_factory *factory, + struct block_device *bdev); + +void uds_put_io_factory(struct io_factory *factory); + +size_t __must_check uds_get_writable_size(struct io_factory *factory); + +int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset, + size_t block_size, unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr); + +int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset, + u64 block_count, + struct buffered_reader **reader_ptr); + +void uds_free_buffered_reader(struct buffered_reader *reader); + +int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, + size_t length); + +int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, + size_t length); + +int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset, + u64 block_count, + struct buffered_writer **writer_ptr); + +void uds_free_buffered_writer(struct buffered_writer *buffer); + +int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer, + const u8 *data, size_t length); + +int __must_check uds_flush_buffered_writer(struct buffered_writer *writer); + +#endif /* UDS_IO_FACTORY_H */ diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c new file mode 100644 index 0000000000000..da16afaec07f2 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "open-chapter.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "config.h" +#include "hash-utils.h" + 
+/* + * Each index zone has a dedicated open chapter zone structure which gets an equal share of the + * open chapter space. Records are assigned to zones based on their record name. Within each zone, + * records are stored in an array in the order they arrive. Additionally, a reference to each + * record is stored in a hash table to help determine if a new record duplicates an existing one. + * If new metadata for an existing name arrives, the record is altered in place. The array of + * records is 1-based so that record number 0 can be used to indicate an unused hash slot. + * + * Deleted records are marked with a flag rather than actually removed to simplify hash table + * management. The array of deleted flags overlays the array of hash slots, but the flags are + * indexed by record number instead of by record name. The number of hash slots will always be a + * power of two that is greater than the number of records to be indexed, guaranteeing that hash + * insertion cannot fail, and that there are sufficient flags for all records. + * + * Once any open chapter zone fills its available space, the chapter is closed. The records from + * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages. + * Empty or deleted records are replaced by copies of a valid record so that the record pages only + * contain valid records. The chapter then constructs a delta index which maps each record name to + * the record page on which that record can be found, which is split into index pages. These + * structures are then passed to the volume to be recorded on storage. + * + * When the index is saved, the open chapter records are saved in a single array, once again + * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a + * different number of zones than previously, so the records must be parcelled out to their new + * zones. In addition, depending on the distribution of record names, a new zone may have more + * records than it has space. In this case, the latest records for that zone will be discarded. 
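+ *
+ * As an illustration of the layout described above, a lookup follows a hash slot to a 1-based
+ * record number, with the deleted flag indexed by that record number (a sketch of what
+ * probe_chapter_slots() and uds_search_open_chapter() do below):
+ *
+ *	slot = uds_name_to_hash_slot(name, open_chapter->slot_count);
+ *	record_number = open_chapter->slots[slot].record_number;
+ *	if ((record_number != 0) && !open_chapter->slots[record_number].deleted)
+ *		record = &open_chapter->records[record_number];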
+ */ + +static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const u8 OPEN_CHAPTER_VERSION[] = "02.00"; + +enum { + OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, + OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1, + LOAD_RATIO = 2, +}; + +static inline size_t records_size(const struct open_chapter_zone *open_chapter) +{ + return sizeof(struct uds_volume_record) * (1 + open_chapter->capacity); +} + +static inline size_t slots_size(size_t slot_count) +{ + return sizeof(struct open_chapter_zone_slot) * slot_count; +} + +int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count, + struct open_chapter_zone **open_chapter_ptr) +{ + int result; + struct open_chapter_zone *open_chapter; + size_t capacity = geometry->records_per_chapter / zone_count; + size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); + + result = uds_allocate_extended(struct open_chapter_zone, slot_count, + struct open_chapter_zone_slot, "open chapter", + &open_chapter); + if (result != UDS_SUCCESS) + return result; + + open_chapter->slot_count = slot_count; + open_chapter->capacity = capacity; + result = uds_allocate_cache_aligned(records_size(open_chapter), "record pages", + &open_chapter->records); + if (result != UDS_SUCCESS) { + uds_free_open_chapter(open_chapter); + return result; + } + + *open_chapter_ptr = open_chapter; + return UDS_SUCCESS; +} + +void uds_reset_open_chapter(struct open_chapter_zone *open_chapter) +{ + open_chapter->size = 0; + open_chapter->deletions = 0; + + memset(open_chapter->records, 0, records_size(open_chapter)); + memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count)); +} + +static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name) +{ + struct uds_volume_record *record; + unsigned int slot_count = open_chapter->slot_count; + unsigned int slot = uds_name_to_hash_slot(name, slot_count); + unsigned int record_number; + unsigned int attempts = 1; + + while (true) { + record_number = open_chapter->slots[slot].record_number; + + /* + * If the hash slot is empty, we've reached the end of a chain without finding the + * record and should terminate the search. + */ + if (record_number == 0) + return slot; + + /* + * If the name of the record referenced by the slot matches and has not been + * deleted, then we've found the requested name. + */ + record = &open_chapter->records[record_number]; + if ((memcmp(&record->name, name, UDS_RECORD_NAME_SIZE) == 0) && + !open_chapter->slots[record_number].deleted) + return slot; + + /* + * Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. This + * performs better than linear probing and works best for 2^N slots. + */ + slot = (slot + attempts++) % slot_count; + } +} + +void uds_search_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + struct uds_record_data *metadata, bool *found) +{ + unsigned int slot; + unsigned int record_number; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + if (record_number == 0) { + *found = false; + } else { + *found = true; + *metadata = open_chapter->records[record_number].data; + } +} + +/* Add a record to the open chapter zone and return the remaining space. 
*/ +int uds_put_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + const struct uds_record_data *metadata) +{ + unsigned int slot; + unsigned int record_number; + struct uds_volume_record *record; + + if (open_chapter->size >= open_chapter->capacity) + return 0; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + + if (record_number == 0) { + record_number = ++open_chapter->size; + open_chapter->slots[slot].record_number = record_number; + } + + record = &open_chapter->records[record_number]; + record->name = *name; + record->data = *metadata; + + return open_chapter->capacity - open_chapter->size; +} + +void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name) +{ + unsigned int slot; + unsigned int record_number; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + + if (record_number > 0) { + open_chapter->slots[record_number].deleted = true; + open_chapter->deletions += 1; + } +} + +void uds_free_open_chapter(struct open_chapter_zone *open_chapter) +{ + if (open_chapter != NULL) { + uds_free(open_chapter->records); + uds_free(open_chapter); + } +} + +/* Map each record name to its record page number in the delta chapter index. */ +static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, + struct open_chapter_index *index, + struct uds_volume_record *collated_records) +{ + int result; + unsigned int records_per_chapter; + unsigned int records_per_page; + unsigned int record_index; + unsigned int records = 0; + u32 page_number; + unsigned int z; + int overflow_count = 0; + struct uds_volume_record *fill_record = NULL; + + /* + * The record pages should not have any empty space, so find a record with which to fill + * the chapter zone if it was closed early, and also to replace any deleted records. The + * last record in any filled zone is guaranteed to not have been deleted, so use one of + * those. + */ + for (z = 0; z < zone_count; z++) { + struct open_chapter_zone *zone = chapter_zones[z]; + + if (zone->size == zone->capacity) { + fill_record = &zone->records[zone->size]; + break; + } + } + + records_per_chapter = index->geometry->records_per_chapter; + records_per_page = index->geometry->records_per_page; + + for (records = 0; records < records_per_chapter; records++) { + struct uds_volume_record *record = &collated_records[records]; + struct open_chapter_zone *open_chapter; + + /* The record arrays in the zones are 1-based. */ + record_index = 1 + (records / zone_count); + page_number = records / records_per_page; + open_chapter = chapter_zones[records % zone_count]; + + /* Use the fill record in place of an unused record. 
*/ + if (record_index > open_chapter->size || + open_chapter->slots[record_index].deleted) { + *record = *fill_record; + continue; + } + + *record = open_chapter->records[record_index]; + result = uds_put_open_chapter_index_record(index, &record->name, + page_number); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflow_count++; + break; + default: + uds_log_error_strerror(result, + "failed to build open chapter index"); + return result; + } + } + + if (overflow_count > 0) + uds_log_warning("Failed to add %d entries to chapter index", + overflow_count); + + return UDS_SUCCESS; +} + +int uds_close_open_chapter(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, struct volume *volume, + struct open_chapter_index *chapter_index, + struct uds_volume_record *collated_records, + u64 virtual_chapter_number) +{ + int result; + + uds_empty_open_chapter_index(chapter_index, virtual_chapter_number); + result = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index, + collated_records); + if (result != UDS_SUCCESS) + return result; + + return uds_write_chapter(volume, chapter_index, collated_records); +} + +int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer) +{ + int result; + struct open_chapter_zone *open_chapter; + struct uds_volume_record *record; + u8 record_count_data[sizeof(u32)]; + u32 record_count = 0; + unsigned int record_index; + unsigned int z; + + result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + + for (z = 0; z < index->zone_count; z++) { + open_chapter = index->zones[z]->open_chapter; + record_count += open_chapter->size - open_chapter->deletions; + } + + put_unaligned_le32(record_count, record_count_data); + result = uds_write_to_buffered_writer(writer, record_count_data, + sizeof(record_count_data)); + if (result != UDS_SUCCESS) + return result; + + record_index = 1; + while (record_count > 0) { + for (z = 0; z < index->zone_count; z++) { + open_chapter = index->zones[z]->open_chapter; + if (record_index > open_chapter->size) + continue; + + if (open_chapter->slots[record_index].deleted) + continue; + + record = &open_chapter->records[record_index]; + result = uds_write_to_buffered_writer(writer, (u8 *) record, + sizeof(*record)); + if (result != UDS_SUCCESS) + return result; + + record_count--; + } + + record_index++; + } + + return uds_flush_buffered_writer(writer); +} + +u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry) +{ + unsigned int records_per_chapter = geometry->records_per_chapter; + + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + sizeof(u32) + + records_per_chapter * sizeof(struct uds_volume_record); +} + +static int load_version20(struct uds_index *index, struct buffered_reader *reader) +{ + int result; + u32 record_count; + u8 record_count_data[sizeof(u32)]; + struct uds_volume_record record; + + /* + * Track which zones cannot accept any more records. If the open chapter had a different + * number of zones previously, some new zones may have more records than they have space + * for. These overflow records will be discarded. 
+ */ + bool full_flags[MAX_ZONES] = { + false, + }; + + result = uds_read_from_buffered_reader(reader, (u8 *) &record_count_data, + sizeof(record_count_data)); + if (result != UDS_SUCCESS) + return result; + + record_count = get_unaligned_le32(record_count_data); + while (record_count-- > 0) { + unsigned int zone = 0; + + result = uds_read_from_buffered_reader(reader, (u8 *) &record, + sizeof(record)); + if (result != UDS_SUCCESS) + return result; + + if (index->zone_count > 1) + zone = uds_get_volume_index_zone(index->volume_index, + &record.name); + + if (!full_flags[zone]) { + struct open_chapter_zone *open_chapter; + unsigned int remaining; + + open_chapter = index->zones[zone]->open_chapter; + remaining = uds_put_open_chapter(open_chapter, &record.name, + &record.data); + /* Do not allow any zone to fill completely. */ + full_flags[zone] = (remaining <= 1); + } + } + + return UDS_SUCCESS; +} + +int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader) +{ + u8 version[OPEN_CHAPTER_VERSION_LENGTH]; + int result; + + result = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, version, sizeof(version)); + if (result != UDS_SUCCESS) + return result; + + if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "Invalid open chapter version: %.*s", + (int) sizeof(version), version); + } + + return load_version20(index, reader); +} diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h new file mode 100644 index 0000000000000..a4250bb19525e --- /dev/null +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_OPEN_CHAPTER_H +#define UDS_OPEN_CHAPTER_H + +#include "chapter-index.h" +#include "geometry.h" +#include "index.h" +#include "volume.h" + +/* + * The open chapter tracks the newest records in memory. Like the index as a whole, each open + * chapter is divided into a number of independent zones which are interleaved when the chapter is + * committed to the volume. 
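+ *
+ * For example, with the interleaving used when a chapter is closed (see
+ * fill_delta_chapter_index() in open-chapter.c), collated record i is drawn from zone
+ * (i % zone_count), record slot 1 + (i / zone_count).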
+ */
+
+enum {
+	OPEN_CHAPTER_RECORD_NUMBER_BITS = 23,
+};
+
+struct open_chapter_zone_slot {
+	/* If non-zero, the record number addressed by this hash slot */
+	unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS;
+	/* If true, the record at the index of this hash slot was deleted */
+	bool deleted : 1;
+} __packed;
+
+struct open_chapter_zone {
+	/* The maximum number of records that can be stored */
+	unsigned int capacity;
+	/* The number of records stored */
+	unsigned int size;
+	/* The number of deleted records */
+	unsigned int deletions;
+	/* Array of chunk records, 1-based */
+	struct uds_volume_record *records;
+	/* The number of slots in the hash table */
+	unsigned int slot_count;
+	/* The hash table slots, referencing virtual record numbers */
+	struct open_chapter_zone_slot slots[];
+};
+
+int __must_check uds_make_open_chapter(const struct index_geometry *geometry,
+				       unsigned int zone_count,
+				       struct open_chapter_zone **open_chapter_ptr);
+
+void uds_reset_open_chapter(struct open_chapter_zone *open_chapter);
+
+void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
+			     const struct uds_record_name *name,
+			     struct uds_record_data *metadata, bool *found);
+
+int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter,
+				      const struct uds_record_name *name,
+				      const struct uds_record_data *metadata);
+
+void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
+				  const struct uds_record_name *name);
+
+void uds_free_open_chapter(struct open_chapter_zone *open_chapter);
+
+int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
+					unsigned int zone_count, struct volume *volume,
+					struct open_chapter_index *chapter_index,
+					struct uds_volume_record *collated_records,
+					u64 virtual_chapter_number);
+
+int __must_check uds_save_open_chapter(struct uds_index *index,
+				       struct buffered_writer *writer);
+
+int __must_check uds_load_open_chapter(struct uds_index *index,
+				       struct buffered_reader *reader);
+
+u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry);
+
+#endif /* UDS_OPEN_CHAPTER_H */
diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c
new file mode 100644
index 0000000000000..1f17c708a6526
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/radix-sort.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "radix-sort.h"
+
+#include <linux/limits.h>
+#include <linux/types.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
+/*
+ * This implementation allocates one large object to do the sorting, which can be reused as many
+ * times as desired. The amount of memory required is logarithmically proportional to the number
+ * of keys to be sorted.
+ */
+
+enum {
+	/* Piles smaller than this are handled with a simple insertion sort. */
+	INSERTION_SORT_THRESHOLD = 12,
+};
+
+/* Sort keys are pointers to immutable fixed-length arrays of bytes. */
+typedef const u8 *sort_key_t;
+
+/*
+ * The keys are separated into piles based on the byte in each key at the current offset, so the
+ * number of keys with each byte must be counted.
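+ *
+ * For example, tallying the keys "apple", "avocado", and "banana" at offset 0 yields
+ * size['a'] == 2 and size['b'] == 1, with first == 'a' and last == 'b'.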
+ */ +struct histogram { + /* The number of non-empty bins */ + u16 used; + /* The index (key byte) of the first non-empty bin */ + u16 first; + /* The index (key byte) of the last non-empty bin */ + u16 last; + /* The number of occurrences of each specific byte */ + u32 size[256]; +}; + +/* + * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound + * on the stack space needed. + */ +struct task { + /* Pointer to the first key to sort. */ + sort_key_t *first_key; + /* Pointer to the last key to sort. */ + sort_key_t *last_key; + /* The offset into the key at which to continue sorting. */ + u16 offset; + /* The number of bytes remaining in the sort keys. */ + u16 length; +}; + +struct radix_sorter { + unsigned int count; + struct histogram bins; + sort_key_t *pile[256]; + struct task *end_of_stack; + struct task insertion_list[256]; + struct task stack[]; +}; + +/* Compare a segment of two fixed-length keys starting at an offset. */ +static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length) +{ + return memcmp(&key1[offset], &key2[offset], length); +} + +/* Insert the next unsorted key into an array of sorted keys. */ +static inline void insert_key(const struct task task, sort_key_t *next) +{ + /* Pull the unsorted key out, freeing up the array slot. */ + sort_key_t unsorted = *next; + + /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */ + while ((--next >= task.first_key) && + (compare(unsorted, next[0], task.offset, task.length) < 0)) + next[1] = next[0]; + + /* Insert the key into the last slot that was cleared, sorting it. */ + next[1] = unsorted; +} + +/* + * Sort a range of key segments using an insertion sort. This simple sort is faster than the + * 256-way radix sort when the number of keys to sort is small. + */ +static inline void insertion_sort(const struct task task) +{ + sort_key_t *next; + + for (next = task.first_key + 1; next <= task.last_key; next++) + insert_key(task, next); +} + +/* Push a sorting task onto a task stack. */ +static inline void push_task(struct task **stack_pointer, sort_key_t *first_key, + u32 count, u16 offset, u16 length) +{ + struct task *task = (*stack_pointer)++; + + task->first_key = first_key; + task->last_key = &first_key[count - 1]; + task->offset = offset; + task->length = length; +} + +static inline void swap_keys(sort_key_t *a, sort_key_t *b) +{ + sort_key_t c = *a; + *a = *b; + *b = c; +} + +/* + * Count the number of times each byte value appears in the arrays of keys to sort at the current + * offset, keeping track of the number of non-empty bins, and the index of the first and last + * non-empty bin. + */ +static inline void measure_bins(const struct task task, struct histogram *bins) +{ + sort_key_t *key_ptr; + + /* + * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears + * it all out as it goes. Even though this structure is re-used, we don't need to pay to + * zero it before starting a new tally. + */ + bins->first = U8_MAX; + bins->last = 0; + + for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) { + /* Increment the count for the byte in the key at the current offset. */ + u8 bin = (*key_ptr)[task.offset]; + u32 size = ++bins->size[bin]; + + /* Track non-empty bins. */ + if (size == 1) { + bins->used += 1; + if (bin < bins->first) + bins->first = bin; + + if (bin > bins->last) + bins->last = bin; + } + } +} + +/* + * Convert the bin sizes to pointers to where each pile goes. 
+ *
+ * pile[0] = first_key + bin->size[0],
+ * pile[1] = pile[0] + bin->size[1], etc.
+ *
+ * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the
+ * next radix position. A new task is put on the stack for each pile containing lots of keys, or
+ * a new task is put on the list for each pile containing few keys.
+ *
+ * @stack: pointer to the top of the stack
+ * @end_of_stack: the end of the stack
+ * @list: pointer to the head of the list
+ * @pile: array for pointers to the end of each pile
+ * @bins: the histogram of the sizes of each pile
+ * @first_key: the first key of the stack
+ * @offset: the next radix position to sort by
+ * @length: the number of bytes remaining in the sort keys
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static inline int push_bins(struct task **stack, struct task *end_of_stack,
+			    struct task **list, sort_key_t *pile[],
+			    struct histogram *bins, sort_key_t *first_key,
+			    u16 offset, u16 length)
+{
+	sort_key_t *pile_start = first_key;
+	int bin;
+
+	for (bin = bins->first; ; bin++) {
+		u32 size = bins->size[bin];
+
+		/* Skip empty piles. */
+		if (size == 0)
+			continue;
+
+		/* There's no need to sort empty keys. */
+		if (length > 0) {
+			if (size > INSERTION_SORT_THRESHOLD) {
+				if (*stack >= end_of_stack)
+					return UDS_BAD_STATE;
+
+				push_task(stack, pile_start, size, offset, length);
+			} else if (size > 1) {
+				push_task(list, pile_start, size, offset, length);
+			}
+		}
+
+		pile_start += size;
+		pile[bin] = pile_start;
+		if (--bins->used == 0)
+			break;
+	}
+
+	return UDS_SUCCESS;
+}
+
+int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter)
+{
+	int result;
+	unsigned int stack_size = count / INSERTION_SORT_THRESHOLD;
+	struct radix_sorter *radix_sorter;
+
+	result = uds_allocate_extended(struct radix_sorter, stack_size, struct task,
+				       __func__, &radix_sorter);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	radix_sorter->count = count;
+	radix_sorter->end_of_stack = radix_sorter->stack + stack_size;
+	*sorter = radix_sorter;
+	return UDS_SUCCESS;
+}
+
+void uds_free_radix_sorter(struct radix_sorter *sorter)
+{
+	uds_free(sorter);
+}
+
+/*
+ * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort
+ * implementation is unstable, so the relative ordering of equal keys is not preserved.
+ */
+int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
+		   unsigned int count, unsigned short length)
+{
+	struct task start;
+	struct histogram *bins = &sorter->bins;
+	sort_key_t **pile = sorter->pile;
+	struct task *task_stack = sorter->stack;
+
+	/* All zero-length keys are identical and therefore already sorted. */
+	if ((count == 0) || (length == 0))
+		return UDS_SUCCESS;
+
+	/* The initial task is to sort the entire length of all the keys. */
+	start = (struct task) {
+		.first_key = keys,
+		.last_key = &keys[count - 1],
+		.offset = 0,
+		.length = length,
+	};
+
+	if (count <= INSERTION_SORT_THRESHOLD) {
+		insertion_sort(start);
+		return UDS_SUCCESS;
+	}
+
+	if (count > sorter->count)
+		return UDS_INVALID_ARGUMENT;
+
+	/*
+	 * Repeatedly consume a sorting task from the stack and process it, pushing new
+	 * sub-tasks onto the stack for each radix-sorted pile. When all tasks and sub-tasks
+	 * have been processed, the stack will be empty and all the keys in the starting task
+	 * will be fully sorted.
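+	 *
+	 * A caller-side sketch ('count' and 'length' are assumed values; error handling
+	 * elided): to sort 'count' keys of 'length' bytes each,
+	 *
+	 *	struct radix_sorter *sorter;
+	 *	const unsigned char *keys[count];	/* each points at 'length' bytes */
+	 *
+	 *	uds_make_radix_sorter(count, &sorter);
+	 *	uds_radix_sort(sorter, keys, count, length);
+	 *	uds_free_radix_sorter(sorter);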
+ */ + for (*task_stack = start; task_stack >= sorter->stack; task_stack--) { + const struct task task = *task_stack; + struct task *insertion_task_list; + int result; + sort_key_t *fence; + sort_key_t *end; + + measure_bins(task, bins); + + /* + * Now that we know how large each bin is, generate pointers for each of the piles + * and push a new task to sort each pile by the next radix byte. + */ + insertion_task_list = sorter->insertion_list; + result = push_bins(&task_stack, sorter->end_of_stack, + &insertion_task_list, pile, bins, task.first_key, + task.offset + 1, task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + + /* Now bins->used is zero again. */ + + /* + * Don't bother processing the last pile: when piles 0..N-1 are all in place, then + * pile N must also be in place. + */ + end = task.last_key - bins->size[bins->last]; + bins->size[bins->last] = 0; + + for (fence = task.first_key; fence <= end; ) { + u8 bin; + sort_key_t key = *fence; + + /* + * The radix byte of the key tells us which pile it belongs in. Swap it for + * an unprocessed item just below that pile, and repeat. + */ + while (--pile[bin = key[task.offset]] > fence) + swap_keys(pile[bin], &key); + + /* + * The pile reached the fence. Put the key at the bottom of that pile, + * completing it, and advance the fence to the next pile. + */ + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + + /* Now bins->size[] is all zero again. */ + + /* + * When the number of keys in a task gets small enough, it is faster to use an + * insertion sort than to keep subdividing into tiny piles. + */ + while (--insertion_task_list >= sorter->insertion_list) + insertion_sort(*insertion_task_list); + } + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h new file mode 100644 index 0000000000000..812949bc2cee9 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/radix-sort.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_RADIX_SORT_H +#define UDS_RADIX_SORT_H + +/* + * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix + * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith + * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort". + * + * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf + */ + +struct radix_sorter; + +int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter); + +void uds_free_radix_sorter(struct radix_sorter *sorter); + +int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], + unsigned int count, unsigned short length); + +#endif /* UDS_RADIX_SORT_H */ diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c new file mode 100644 index 0000000000000..f2141de6ed00e --- /dev/null +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "sparse-cache.h" + +#include <linux/cache.h> +#include <linux/delay.h> +#include <linux/dm-bufio.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "chapter-index.h" +#include "config.h" +#include "index.h" + +/* + * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a + * specific virtual chapter is implemented as a linear search.
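+ * (Editor's aside: at this scale a linear scan is cheaper than any hashing scheme would be.)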
The cache replacement policy is + * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be + * maintained by shifting entries in an array list. + * + * Changing the contents of the cache requires the coordinated participation of all zone threads + * via the careful use of barrier messages sent to all the index zones by the triage queue worker + * thread. The critical invariant for coordination is that the cache membership must not change + * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must + * receive the same results for every virtual chapter number. To ensure that critical invariant, + * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented separately from the cache + * membership information (the virtual chapter number). + * + * As a result of this invariant, we have the guarantee that every zone thread will call + * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache, + * and the serialization of the barrier requests from the triage queue ensures they will all + * request the same chapter number. This means the only synchronization we need can be provided by + * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical + * section where a single zone thread can drive the cache update while all the other zone threads + * are known to be blocked, waiting in the second barrier. Outside that critical section, all the + * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an + * exclusive lock. No other threads may access or modify the cache entries. + * + * Chapter statistics must only be modified by a single thread, which is also the zone zero thread. + * All fields that might be frequently updated by that thread are kept in separate cache-aligned + * structures so they will not cause cache contention via "false sharing" with the fields that are + * frequently accessed by all of the zone threads. + * + * The LRU order is managed independently by each zone thread, and each zone uses its own list for + * searching and cache membership queries. The zone zero list is used to decide which chapter to + * evict when the cache is updated, and its search list is copied to the other threads at that + * time. + * + * The virtual chapter number field of the cache entry is the single field indicating whether a + * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or + * undefined chapter number. When present in the virtual chapter number field of a + * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of + * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any + * cache entry that is not marked as dead is fully defined and a member of the cache, and + * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears + * in any of the cache entries. + * + * A chapter index that is a member of the cache may be excluded from searches between calls to + * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the + * volume, its virtual chapter number will be less than the oldest virtual chapter number.
Since + * that chapter is no longer part of the volume, there's no point in continuing to search that + * chapter index. Once invalidated, that virtual chapter will still be considered a member of the + * cache, but it will no longer be searched for matching names. + * + * The second mechanism is a heuristic based on keeping track of the number of consecutive search + * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will + * be set to true, causing the chapter to be skipped when searching the entire cache, but still + * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will + * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry. + * Again, regardless of the state of the skip_search flag, the virtual chapter must still be + * considered a member of the cache for uds_sparse_cache_contains(). + */ + +enum { + SKIP_SEARCH_THRESHOLD = 20000, + ZONE_ZERO = 0, +}; + +/* + * These counters are essentially fields of the struct cached_chapter_index, but are segregated + * into this structure because they are frequently modified. They are grouped and aligned to keep + * them on different cache lines from the chapter fields that are accessed far more often than they + * are updated. + */ +struct __aligned(L1_CACHE_BYTES) cached_index_counters { + u64 consecutive_misses; +}; + +struct __aligned(L1_CACHE_BYTES) cached_chapter_index { + /* + * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache + * entry is unused. This field must only be modified in the critical section in + * uds_update_sparse_cache(). + */ + u64 virtual_chapter; + + u32 index_pages_count; + + /* + * These pointers are immutable during the life of the cache. The contents of the arrays + * change when the cache entry is replaced. + */ + struct delta_index_page *index_pages; + struct dm_buffer **page_buffers; + + /* + * If set, skip the chapter when searching the entire cache. This flag is just a + * performance optimization. This flag is mutable between cache updates, but it rarely + * changes and is frequently accessed, so it groups with the immutable fields. + */ + bool skip_search; + + /* + * The cache-aligned counters change often and are placed at the end of the structure to + * prevent false sharing with the more stable fields above. + */ + struct cached_index_counters counters; +}; + +/* + * A search_list represents an ordering of the sparse chapter index cache entry array, from most + * recently accessed to least recently accessed, which is the order in which the indexes should be + * searched and the reverse order in which they should be evicted from the cache. + * + * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even + * iterate over them to search, and ensuring that dead entries are replaced before any live entries + * are evicted. + * + * The search list is instantiated for each zone thread, avoiding any need for synchronization. The + * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between + * zone threads.
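+ * + * (Illustrative example, editor-supplied: with capacity 4, a list might read [C, A, B, D] with + * first_dead_entry == 3, meaning C is the most recently used live entry, B the least recently + * used, and D a dead slot waiting to be reused.)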
+ */ +struct search_list { + u8 capacity; + u8 first_dead_entry; + struct cached_chapter_index *entries[]; +}; + +struct threads_barrier { + /* Lock for this barrier object */ + struct semaphore lock; + /* Semaphore for threads waiting at this barrier */ + struct semaphore wait; + /* Number of threads which have arrived */ + int arrived; + /* Total number of threads using this barrier */ + int thread_count; +}; + +struct sparse_cache { + const struct index_geometry *geometry; + unsigned int capacity; + unsigned int zone_count; + + unsigned int skip_threshold; + struct search_list *search_lists[MAX_ZONES]; + struct cached_chapter_index **scratch_entries; + + struct threads_barrier begin_update_barrier; + struct threads_barrier end_update_barrier; + + struct cached_chapter_index chapters[]; +}; + +static void initialize_threads_barrier(struct threads_barrier *barrier, + unsigned int thread_count) +{ + sema_init(&barrier->lock, 1); + barrier->arrived = 0; + barrier->thread_count = thread_count; + sema_init(&barrier->wait, 0); +} + +static inline void __down(struct semaphore *semaphore) +{ + /* + * Do not use down(semaphore). Instead use down_interruptible so that + * we do not get 120 second stall messages in kern.log. + */ + while (down_interruptible(semaphore) != 0) { + /* + * If we're called from a user-mode process (e.g., "dmsetup + * remove") while waiting for an operation that may take a + * while (e.g., UDS index save), and a signal is sent (SIGINT, + * SIGUSR2), then down_interruptible will not block. If that + * happens, sleep briefly to avoid keeping the CPU locked up in + * this loop. We could just call cond_resched, but then we'd + * still keep consuming CPU time slices and swamp other threads + * trying to do computational work. + */ + fsleep(1000); + } +} + +static void enter_threads_barrier(struct threads_barrier *barrier) +{ + __down(&barrier->lock); + if (++barrier->arrived == barrier->thread_count) { + /* last thread */ + int i; + + for (i = 1; i < barrier->thread_count; i++) + up(&barrier->wait); + + barrier->arrived = 0; + up(&barrier->lock); + } else { + up(&barrier->lock); + __down(&barrier->wait); + } +} + +static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter, + const struct index_geometry *geometry) +{ + int result; + + chapter->virtual_chapter = NO_CHAPTER; + chapter->index_pages_count = geometry->index_pages_per_chapter; + + result = uds_allocate(chapter->index_pages_count, struct delta_index_page, + __func__, &chapter->index_pages); + if (result != UDS_SUCCESS) + return result; + + return uds_allocate(chapter->index_pages_count, struct dm_buffer *, + "sparse index volume pages", &chapter->page_buffers); +} + +static int __must_check make_search_list(struct sparse_cache *cache, + struct search_list **list_ptr) +{ + struct search_list *list; + unsigned int bytes; + u8 i; + int result; + + bytes = (sizeof(struct search_list) + + (cache->capacity * sizeof(struct cached_chapter_index *))); + result = uds_allocate_cache_aligned(bytes, "search list", &list); + if (result != UDS_SUCCESS) + return result; + + list->capacity = cache->capacity; + list->first_dead_entry = 0; + + for (i = 0; i < list->capacity; i++) + list->entries[i] = &cache->chapters[i]; + + *list_ptr = list; + return UDS_SUCCESS; +} + +int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity, + unsigned int zone_count, struct sparse_cache **cache_ptr) +{ + int result; + unsigned int i; + struct sparse_cache *cache; + unsigned int bytes; + 
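+ /* Editor's note: this allocation covers the header plus the chapters[] flexible array; + * the per-zone search lists and the scratch entries are allocated separately below. */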
+ bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index))); + result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache); + if (result != UDS_SUCCESS) + return result; + + cache->geometry = geometry; + cache->capacity = capacity; + cache->zone_count = zone_count; + + /* + * Scale down the skip threshold since the cache only counts cache misses in zone zero, but + * requests are being handled in all zones. + */ + cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count); + + initialize_threads_barrier(&cache->begin_update_barrier, zone_count); + initialize_threads_barrier(&cache->end_update_barrier, zone_count); + + for (i = 0; i < capacity; i++) { + result = initialize_cached_chapter_index(&cache->chapters[i], geometry); + if (result != UDS_SUCCESS) + goto out; + } + + for (i = 0; i < zone_count; i++) { + result = make_search_list(cache, &cache->search_lists[i]); + if (result != UDS_SUCCESS) + goto out; + } + + /* purge_search_list() needs some temporary lists for sorting. */ + result = uds_allocate(capacity * 2, struct cached_chapter_index *, + "scratch entries", &cache->scratch_entries); + if (result != UDS_SUCCESS) + goto out; + + *cache_ptr = cache; + return UDS_SUCCESS; +out: + uds_free_sparse_cache(cache); + return result; +} + +static inline void set_skip_search(struct cached_chapter_index *chapter, + bool skip_search) +{ + /* Check before setting to reduce cache line contention. */ + if (READ_ONCE(chapter->skip_search) != skip_search) + WRITE_ONCE(chapter->skip_search, skip_search); +} + +static void score_search_hit(struct cached_chapter_index *chapter) +{ + chapter->counters.consecutive_misses = 0; + set_skip_search(chapter, false); +} + +static void score_search_miss(struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + chapter->counters.consecutive_misses++; + if (chapter->counters.consecutive_misses > cache->skip_threshold) + set_skip_search(chapter, true); +} + +static void release_cached_chapter_index(struct cached_chapter_index *chapter) +{ + unsigned int i; + + chapter->virtual_chapter = NO_CHAPTER; + if (chapter->page_buffers == NULL) + return; + + for (i = 0; i < chapter->index_pages_count; i++) { + if (chapter->page_buffers[i] != NULL) + dm_bufio_release(uds_forget(chapter->page_buffers[i])); + } +} + +void uds_free_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + if (cache == NULL) + return; + + uds_free(cache->scratch_entries); + + for (i = 0; i < cache->zone_count; i++) + uds_free(cache->search_lists[i]); + + for (i = 0; i < cache->capacity; i++) { + release_cached_chapter_index(&cache->chapters[i]); + uds_free(cache->chapters[i].index_pages); + uds_free(cache->chapters[i].page_buffers); + } + + uds_free(cache); +} + +/* + * Take the indicated element of the search list and move it to the start, pushing the pointers + * previously before it back down the list. + */ +static inline void set_newest_entry(struct search_list *search_list, u8 index) +{ + struct cached_chapter_index *newest; + + if (index > 0) { + newest = search_list->entries[index]; + memmove(&search_list->entries[1], &search_list->entries[0], + index * sizeof(struct cached_chapter_index *)); + search_list->entries[0] = newest; + } + + /* + * This function may have moved a dead chapter to the front of the list for reuse, in which + * case the set of dead chapters becomes smaller. 
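+ * + * (Worked example, editor-supplied: promoting index 2 of [A, B, C, D] yields [C, A, B, D]; + * if first_dead_entry was 2, the promoted entry was dead, and the counter advances to 3.)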
+ */ + if (search_list->first_dead_entry <= index) + search_list->first_dead_entry++; +} + +bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, + unsigned int zone_number) +{ + struct search_list *search_list; + struct cached_chapter_index *chapter; + u8 i; + + /* + * The correctness of the barriers depends on the invariant that between calls to + * uds_update_sparse_cache(), the answers this function returns must never vary: the result + * for a given chapter must be identical across zones. That invariant must be maintained + * even if the chapter falls off the end of the volume, or if searching it is disabled + * because of too many search misses. + */ + search_list = cache->search_lists[zone_number]; + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + + if (virtual_chapter == chapter->virtual_chapter) { + if (zone_number == ZONE_ZERO) + score_search_hit(chapter); + + set_newest_entry(search_list, i); + return true; + } + } + + return false; +} + +/* + * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU + * ordering that already existed. This operation must only be called during the critical section in + * uds_update_sparse_cache(). + */ +static void purge_search_list(struct search_list *search_list, + struct sparse_cache *cache, u64 oldest_virtual_chapter) +{ + struct cached_chapter_index **entries; + struct cached_chapter_index **skipped; + struct cached_chapter_index **dead; + struct cached_chapter_index *chapter; + unsigned int next_alive = 0; + unsigned int next_skipped = 0; + unsigned int next_dead = 0; + unsigned int i; + + entries = &search_list->entries[0]; + skipped = &cache->scratch_entries[0]; + dead = &cache->scratch_entries[search_list->capacity]; + + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + if ((chapter->virtual_chapter < oldest_virtual_chapter) || + (chapter->virtual_chapter == NO_CHAPTER)) + dead[next_dead++] = chapter; + else if (chapter->skip_search) + skipped[next_skipped++] = chapter; + else + entries[next_alive++] = chapter; + } + + memcpy(&entries[next_alive], skipped, + next_skipped * sizeof(struct cached_chapter_index *)); + memcpy(&entries[next_alive + next_skipped], dead, + next_dead * sizeof(struct cached_chapter_index *)); + search_list->first_dead_entry = next_alive + next_skipped; +} + +static int __must_check cache_chapter_index(struct cached_chapter_index *chapter, + u64 virtual_chapter, + const struct volume *volume) +{ + int result; + + release_cached_chapter_index(chapter); + + result = uds_read_chapter_index_from_volume(volume, virtual_chapter, + chapter->page_buffers, + chapter->index_pages); + if (result != UDS_SUCCESS) + return result; + + chapter->counters.consecutive_misses = 0; + chapter->virtual_chapter = virtual_chapter; + chapter->skip_search = false; + + return UDS_SUCCESS; +} + +static inline void copy_search_list(const struct search_list *source, + struct search_list *target) +{ + *target = *source; + memcpy(target->entries, source->entries, + source->capacity * sizeof(struct cached_chapter_index *)); +} + +/* + * Update the sparse cache to contain a chapter index. This function must be called by all the zone + * threads with the same chapter number to correctly enter the thread barriers used to synchronize + * the cache updates. 
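+ * + * (Editor's sketch of the protocol, using the helpers defined above: every zone thread calls + * enter_threads_barrier(&cache->begin_update_barrier); zone zero alone purges the search list and + * reads in the new chapter index; then every thread calls + * enter_threads_barrier(&cache->end_update_barrier) before resuming searches.)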
+ */ +int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter) +{ + int result = UDS_SUCCESS; + const struct uds_index *index = zone->index; + struct sparse_cache *cache = index->volume->sparse_cache; + + if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id)) + return UDS_SUCCESS; + + /* + * Wait for every zone thread to reach its corresponding barrier request and invoke this + * function before starting to modify the cache. + */ + enter_threads_barrier(&cache->begin_update_barrier); + + /* + * This is the start of the critical section: the zone zero thread is captain, effectively + * holding an exclusive lock on the sparse cache. All the other zone threads must do + * nothing between the two barriers. They will wait at the end_update_barrier again for the + * captain to finish the update. + */ + + if (zone->id == ZONE_ZERO) { + unsigned int z; + struct search_list *list = cache->search_lists[ZONE_ZERO]; + + purge_search_list(list, cache, zone->oldest_virtual_chapter); + + if (virtual_chapter >= index->oldest_virtual_chapter) { + set_newest_entry(list, list->capacity - 1); + result = cache_chapter_index(list->entries[0], virtual_chapter, + index->volume); + } + + for (z = 1; z < cache->zone_count; z++) + copy_search_list(list, cache->search_lists[z]); + } + + /* + * This is the end of the critical section. All cache invariants must have been restored. + */ + enter_threads_barrier(&cache->end_update_barrier); + return result; +} + +void uds_invalidate_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + for (i = 0; i < cache->capacity; i++) + release_cached_chapter_index(&cache->chapters[i]); +} + +static inline bool should_skip_chapter(struct cached_chapter_index *chapter, + u64 oldest_chapter, u64 requested_chapter) +{ + if ((chapter->virtual_chapter == NO_CHAPTER) || + (chapter->virtual_chapter < oldest_chapter)) + return true; + + if (requested_chapter != NO_CHAPTER) + return requested_chapter != chapter->virtual_chapter; + else + return READ_ONCE(chapter->skip_search); +} + +static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter, + const struct index_geometry *geometry, + const struct index_page_map *index_page_map, + const struct uds_record_name *name, + u16 *record_page_ptr) +{ + u32 physical_chapter = + uds_map_to_physical_chapter(geometry, chapter->virtual_chapter); + u32 index_page_number = + uds_find_index_page_number(index_page_map, name, physical_chapter); + struct delta_index_page *index_page = + &chapter->index_pages[index_page_number]; + + return uds_search_chapter_index_page(index_page, geometry, name, + record_page_ptr); +} + +int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name, + u64 *virtual_chapter_ptr, u16 *record_page_ptr) +{ + int result; + struct volume *volume = zone->index->volume; + struct sparse_cache *cache = volume->sparse_cache; + struct cached_chapter_index *chapter; + struct search_list *search_list; + u8 i; + /* Search the entire cache unless a specific chapter was requested. 
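(Editor's note: a caller requests one chapter by passing *virtual_chapter_ptr != NO_CHAPTER; that is the hook-resolution path, which may search even a chapter whose skip_search flag is set.)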
*/ + bool search_one = (*virtual_chapter_ptr != NO_CHAPTER); + + *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; + search_list = cache->search_lists[zone->id]; + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + + if (should_skip_chapter(chapter, zone->oldest_virtual_chapter, + *virtual_chapter_ptr)) + continue; + + result = search_cached_chapter_index(chapter, cache->geometry, + volume->index_page_map, name, + record_page_ptr); + if (result != UDS_SUCCESS) + return result; + + if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) { + /* + * In theory, this might be a false match while a true match exists in + * another chapter, but that's a very rare case and not worth the extra + * search complexity. + */ + set_newest_entry(search_list, i); + if (zone->id == ZONE_ZERO) + score_search_hit(chapter); + + *virtual_chapter_ptr = chapter->virtual_chapter; + return UDS_SUCCESS; + } + + if (zone->id == ZONE_ZERO) + score_search_miss(cache, chapter); + + if (search_one) + break; + } + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h new file mode 100644 index 0000000000000..45e2dcf165b51 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/sparse-cache.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_SPARSE_CACHE_H +#define UDS_SPARSE_CACHE_H + +#include "geometry.h" +#include "indexer.h" + +/* + * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching + * for names after all other search paths have failed. It contains only complete chapter indexes; + * record pages from sparse chapters and single index pages used for resolving hooks are kept in + * the regular page cache in the volume. + * + * The most important property of this cache is the absence of synchronization for read operations. + * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and + * the barrier requests it issues to the zone queues. The set of cached chapters does not and must + * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone + * threads. Outside of updates, every zone will get the same result when calling + * uds_sparse_cache_contains() as every other zone. 
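+ * + * (Editor's sketch of the lifecycle implied by these declarations: uds_make_sparse_cache() at + * startup; uds_sparse_cache_contains() and uds_search_sparse_cache() on the lookup path; + * uds_update_sparse_cache() from every zone thread when a barrier request arrives; and + * uds_free_sparse_cache() at shutdown.)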
+ */ + +struct index_zone; +struct sparse_cache; + +int __must_check uds_make_sparse_cache(const struct index_geometry *geometry, + unsigned int capacity, unsigned int zone_count, + struct sparse_cache **cache_ptr); + +void uds_free_sparse_cache(struct sparse_cache *cache); + +bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, + unsigned int zone_number); + +int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter); + +void uds_invalidate_sparse_cache(struct sparse_cache *cache); + +int __must_check uds_search_sparse_cache(struct index_zone *zone, + const struct uds_record_name *name, + u64 *virtual_chapter_ptr, u16 *record_page_ptr); + +#endif /* UDS_SPARSE_CACHE_H */ diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c new file mode 100644 index 0000000000000..8cbd9280c4bd0 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -0,0 +1,1281 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ +#include "volume-index.h" + +#include <linux/bitops.h> +#include <linux/bits.h> +#include <linux/cache.h> +#include <linux/log2.h> +#include <linux/numa.h> + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "thread-utils.h" + +#include "config.h" +#include "geometry.h" +#include "hash-utils.h" +#include "indexer.h" + +/* + * The volume index is a combination of two separate subindexes, one containing sparse hook entries + * (retained for all chapters), and one containing the remaining entries (retained only for the + * dense chapters). If there are no sparse chapters, only the non-hook subindex is used, and it + * will contain all records for all chapters. + * + * The volume index is also divided into zones, with one thread operating on each zone. Each + * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex. + * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to + * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta + * lists must be the square of the maximum zone count for both subindexes. + * + * Each subindex zone is a delta index where the payload is a chapter number. The volume index can + * compute the delta list number, address, and zone number from the record name in order to + * dispatch record handling to the correct structures. + * + * Most operations that use all the zones take place either before request processing is allowed, + * or after all requests have been flushed in order to shut down. The only multi-threaded operation + * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine + * whether a new chapter should be loaded into the sparse index cache. This operation only uses the + * sparse hook subindex, and the zone mutexes are used to make this operation safe. + * + * There are three ways of expressing chapter numbers in the volume index: virtual, index, and + * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long. + * Internally the subindex stores only the minimal number of bits necessary by masking away the + * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when + * flushing entries from older chapters, it rolls the index chapter number around so that the + * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries() + * for an example of this technique.
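+ * + * (Worked example, editor-supplied: with 1024 chapters the chapter mask is 0x3ff, so virtual + * chapter 5130 is stored as 5130 & 0x3ff = 10. If a zone's virtual_chapter_low is 4500, + * convert_index_to_virtual() computes (10 - 4500) & 0x3ff = 630 and returns 4500 + 630 = 5130.)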
+ * + * For efficiency, when older chapter numbers become invalid, the index does not immediately remove + * the invalidated entries. Instead it lazily removes them from a given delta list the next time it + * walks that list during normal operation. Because of this, the index size must be increased + * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard + * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in + * the index. + */ + +struct sub_index_parameters { + /* The number of bits in address mask */ + u8 address_bits; + /* The number of bits in chapter number */ + u8 chapter_bits; + /* The mean delta */ + u32 mean_delta; + /* The number of delta lists */ + u64 list_count; + /* The number of chapters used */ + u32 chapter_count; + /* The number of bits per chapter */ + size_t chapter_size_in_bits; + /* The number of bytes of delta list memory */ + size_t memory_size; + /* The number of bytes the index should keep free at all times */ + size_t target_free_bytes; +}; + +struct split_config { + /* The hook subindex configuration */ + struct uds_configuration hook_config; + struct index_geometry hook_geometry; + + /* The non-hook subindex configuration */ + struct uds_configuration non_hook_config; + struct index_geometry non_hook_geometry; +}; + +struct chapter_range { + u32 chapter_start; + u32 chapter_count; +}; + +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_START_5[] = "MI5-0005"; + +struct sub_index_data { + char magic[MAGIC_SIZE]; /* MAGIC_START_5 */ + u64 volume_nonce; + u64 virtual_chapter_low; + u64 virtual_chapter_high; + u32 first_list; + u32 list_count; +}; + +static const char MAGIC_START_6[] = "MI6-0001"; + +struct volume_index_data { + char magic[MAGIC_SIZE]; /* MAGIC_START_6 */ + u32 sparse_sample_rate; +}; + +static inline u32 extract_address(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + return uds_extract_volume_index_bytes(name) & sub_index->address_mask; +} + +static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + u64 bits = uds_extract_volume_index_bytes(name); + + return (bits >> sub_index->address_bits) % sub_index->list_count; +} + +static inline const struct volume_sub_index_zone * +get_zone_for_record(const struct volume_index_record *record) +{ + return &record->sub_index->zones[record->zone_number]; +} + +static inline u64 convert_index_to_virtual(const struct volume_index_record *record, + u32 index_chapter) +{ + const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); + u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) & + record->sub_index->chapter_mask); + + return volume_index_zone->virtual_chapter_low + rolling_chapter; +} + +static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index, + u64 virtual_chapter) +{ + return virtual_chapter & sub_index->chapter_mask; +} + +static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record, + u64 virtual_chapter) +{ + const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); + + return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) && + (virtual_chapter <= volume_index_zone->virtual_chapter_high)); +} + +static inline bool has_sparse(const struct volume_index *volume_index) +{ + return volume_index->sparse_sample_rate > 0; +} + +bool uds_is_volume_index_sample(const struct 
volume_index *volume_index, + const struct uds_record_name *name) +{ + if (!has_sparse(volume_index)) + return false; + + return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0; +} + +static inline const struct volume_sub_index * +get_volume_sub_index(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + return (uds_is_volume_index_sample(volume_index, name) ? + &volume_index->vi_hook : + &volume_index->vi_non_hook); +} + +static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone; +} + +unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name); +} + +static int compute_volume_sub_index_parameters(const struct uds_configuration *config, + struct sub_index_parameters *params) +{ + enum { DELTA_LIST_SIZE = 256 }; + u64 entries_in_volume_index, address_span; + u32 chapters_in_volume_index, invalid_chapters; + u32 rounded_chapters; + u64 delta_list_records; + u32 address_count; + u64 index_size_in_bits; + size_t expected_index_size; + u64 min_delta_lists = MAX_ZONES * MAX_ZONES; + struct index_geometry *geometry = config->geometry; + u64 records_per_chapter = geometry->records_per_chapter; + + params->chapter_count = geometry->chapters_per_volume; + /* + * Make sure that the number of delta list records in the volume index does not change when + * the volume is reduced by one chapter. This preserves the mapping from name to volume + * index delta list. + */ + rounded_chapters = params->chapter_count; + if (uds_is_reduced_index_geometry(geometry)) + rounded_chapters += 1; + delta_list_records = records_per_chapter * rounded_chapters; + address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE; + params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists); + params->address_bits = bits_per(address_count - 1); + params->chapter_bits = bits_per(rounded_chapters - 1); + if ((u32) params->list_count != params->list_count) { + return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot initialize volume index with %llu delta lists", + (unsigned long long) params->list_count); + } + + if (params->address_bits > 31) { + return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot initialize volume index with %u address bits", + params->address_bits); + } + + /* + * The probability that a given delta list is not touched during the writing of an entire + * chapter is: + * + * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count, + * records_per_chapter); + * + * For the standard index sizes, about 78% of the delta lists are not touched, and + * therefore contain old index entries that have not been eliminated by the lazy LRU + * processing. Then the number of old index entries that accumulate over the entire index, + * in terms of full chapters worth of entries, is: + * + * double invalid_chapters = p_not_touched / (1.0 - p_not_touched); + * + * For the standard index sizes, the index needs about 3.5 chapters of space for the old + * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in + * the index. 
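+ * + * (Editor's note: that rounding is the rounded_chapters / 256 term below, e.g. 1024 / 256 = 4; + * the max() keeps a floor of two chapters of slack for very small indexes.)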
+ */ + invalid_chapters = max(rounded_chapters / 256, 2U); + chapters_in_volume_index = rounded_chapters + invalid_chapters; + entries_in_volume_index = records_per_chapter * chapters_in_volume_index; + + address_span = params->list_count << params->address_bits; + params->mean_delta = address_span / entries_in_volume_index; + + /* + * Compute the expected size of a full index, then set the total memory to be 6% larger + * than that expected size. This number should be large enough that there are not many + * rebalances when the index is full. + */ + params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter, + params->mean_delta, + params->chapter_bits); + index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index; + expected_index_size = index_size_in_bits / BITS_PER_BYTE; + params->memory_size = expected_index_size * 106 / 100; + + params->target_free_bytes = expected_index_size / 20; + return UDS_SUCCESS; +} + +static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index) +{ + uds_free(uds_forget(sub_index->flush_chapters)); + uds_free(uds_forget(sub_index->zones)); + uds_uninitialize_delta_index(&sub_index->delta_index); +} + +void uds_free_volume_index(struct volume_index *volume_index) +{ + if (volume_index == NULL) + return; + + if (volume_index->zones != NULL) + uds_free(uds_forget(volume_index->zones)); + + uninitialize_volume_sub_index(&volume_index->vi_non_hook); + uninitialize_volume_sub_index(&volume_index->vi_hook); + uds_free(volume_index); +} + + +static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config, + size_t *bytes) +{ + struct sub_index_parameters params = { .address_bits = 0 }; + int result; + + result = compute_volume_sub_index_parameters(config, ¶ms); + if (result != UDS_SUCCESS) + return result; + + *bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) + + uds_compute_delta_index_save_bytes(params.list_count, + params.memory_size)); + return UDS_SUCCESS; +} + +/* This function is only useful if the configuration includes sparse chapters. */ +static void split_configuration(const struct uds_configuration *config, + struct split_config *split) +{ + u64 sample_rate, sample_records; + u64 dense_chapters, sparse_chapters; + + /* Start with copies of the base configuration. */ + split->hook_config = *config; + split->hook_geometry = *config->geometry; + split->hook_config.geometry = &split->hook_geometry; + split->non_hook_config = *config; + split->non_hook_geometry = *config->geometry; + split->non_hook_config.geometry = &split->non_hook_geometry; + + sample_rate = config->sparse_sample_rate; + sparse_chapters = config->geometry->sparse_chapters_per_volume; + dense_chapters = config->geometry->chapters_per_volume - sparse_chapters; + sample_records = config->geometry->records_per_chapter / sample_rate; + + /* Adjust the number of records indexed for each chapter. */ + split->hook_geometry.records_per_chapter = sample_records; + split->non_hook_geometry.records_per_chapter -= sample_records; + + /* Adjust the number of chapters indexed. 
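(Editor's note: the hook subindex keeps sample records for every chapter, so only the non-hook geometry is reduced to the dense chapter count.)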
*/ + split->hook_geometry.sparse_chapters_per_volume = 0; + split->non_hook_geometry.sparse_chapters_per_volume = 0; + split->non_hook_geometry.chapters_per_volume = dense_chapters; +} + +static int compute_volume_index_save_bytes(const struct uds_configuration *config, + size_t *bytes) +{ + size_t hook_bytes, non_hook_bytes; + struct split_config split; + int result; + + if (!uds_is_sparse_index_geometry(config->geometry)) + return compute_volume_sub_index_save_bytes(config, bytes); + + split_configuration(config, &split); + result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes); + if (result != UDS_SUCCESS) + return result; + + result = compute_volume_sub_index_save_bytes(&split.non_hook_config, + &non_hook_bytes); + if (result != UDS_SUCCESS) + return result; + + *bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes; + return UDS_SUCCESS; +} + +int uds_compute_volume_index_save_blocks(const struct uds_configuration *config, + size_t block_size, u64 *block_count) +{ + size_t bytes; + int result; + + result = compute_volume_index_save_bytes(config, &bytes); + if (result != UDS_SUCCESS) + return result; + + bytes += sizeof(struct delta_list_save_info); + *block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES; + return UDS_SUCCESS; +} + +/* Flush invalid entries while walking the delta list. */ +static inline int flush_invalid_entries(struct volume_index_record *record, + struct chapter_range *flush_range, + u32 *next_chapter_to_invalidate) +{ + int result; + + result = uds_next_delta_index_entry(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + while (!record->delta_entry.at_end) { + u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); + u32 relative_chapter = ((index_chapter - flush_range->chapter_start) & + record->sub_index->chapter_mask); + + if (likely(relative_chapter >= flush_range->chapter_count)) { + if (relative_chapter < *next_chapter_to_invalidate) + *next_chapter_to_invalidate = relative_chapter; + break; + } + + result = uds_remove_delta_index_entry(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +/* Find the matching record, or the list offset where the record would go. */ +static int get_volume_index_entry(struct volume_index_record *record, u32 list_number, + u32 key, struct chapter_range *flush_range) +{ + struct volume_index_record other_record; + const struct volume_sub_index *sub_index = record->sub_index; + u32 next_chapter_to_invalidate = sub_index->chapter_mask; + int result; + + result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0, + &record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + do { + result = flush_invalid_entries(record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + } while (!record->delta_entry.at_end && (key > record->delta_entry.key)); + + result = uds_remember_delta_index_offset(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + /* Check any collision records for a more precise match. 
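(Editor's note: collision entries carry the full record name, so walking them with uds_get_delta_entry_collision() can upgrade an address match to an exact-name match.)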
*/ + other_record = *record; + if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) { + for (;;) { + u8 collision_name[UDS_RECORD_NAME_SIZE]; + + result = flush_invalid_entries(&other_record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + + if (other_record.delta_entry.at_end || + !other_record.delta_entry.is_collision) + break; + + result = uds_get_delta_entry_collision(&other_record.delta_entry, + collision_name); + if (result != UDS_SUCCESS) + return result; + + if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) { + *record = other_record; + break; + } + } + } + while (!other_record.delta_entry.at_end) { + result = flush_invalid_entries(&other_record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + } + next_chapter_to_invalidate += flush_range->chapter_start; + next_chapter_to_invalidate &= sub_index->chapter_mask; + flush_range->chapter_start = next_chapter_to_invalidate; + flush_range->chapter_count = 0; + return UDS_SUCCESS; +} + +static int get_volume_sub_index_record(struct volume_sub_index *sub_index, + const struct uds_record_name *name, + struct volume_index_record *record) +{ + int result; + const struct volume_sub_index_zone *volume_index_zone; + u32 address = extract_address(sub_index, name); + u32 delta_list_number = extract_dlist_num(sub_index, name); + u64 flush_chapter = sub_index->flush_chapters[delta_list_number]; + + record->sub_index = sub_index; + record->mutex = NULL; + record->name = name; + record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone; + volume_index_zone = get_zone_for_record(record); + + if (flush_chapter < volume_index_zone->virtual_chapter_low) { + struct chapter_range range; + u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter; + + range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter); + range.chapter_count = (flush_count > sub_index->chapter_mask ? + sub_index->chapter_mask + 1 : + flush_count); + result = get_volume_index_entry(record, delta_list_number, address, + &range); + flush_chapter = convert_index_to_virtual(record, range.chapter_start); + if (flush_chapter > volume_index_zone->virtual_chapter_high) + flush_chapter = volume_index_zone->virtual_chapter_high; + sub_index->flush_chapters[delta_list_number] = flush_chapter; + } else { + result = uds_get_delta_index_entry(&sub_index->delta_index, + delta_list_number, address, + name->name, &record->delta_entry); + } + + if (result != UDS_SUCCESS) + return result; + + record->is_found = + (!record->delta_entry.at_end && (record->delta_entry.key == address)); + if (record->is_found) { + u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); + + record->virtual_chapter = convert_index_to_virtual(record, index_chapter); + } + + record->is_collision = record->delta_entry.is_collision; + return UDS_SUCCESS; +} + +int uds_get_volume_index_record(struct volume_index *volume_index, + const struct uds_record_name *name, + struct volume_index_record *record) +{ + int result; + + if (uds_is_volume_index_sample(volume_index, name)) { + /* + * Other threads cannot be allowed to call uds_lookup_volume_index_name() while + * this thread is finding the volume index record. Due to the lazy LRU flushing of + * the volume index, uds_get_volume_index_record() is not a read-only operation. 
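+ * (Editor's note: the lazy flush can delete stale entries from the shared delta lists, which + * is why the per-zone hook mutex taken below also guards this read path.)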
+ */ + unsigned int zone = + get_volume_sub_index_zone(&volume_index->vi_hook, name); + struct mutex *mutex = &volume_index->zones[zone].hook_mutex; + + mutex_lock(mutex); + result = get_volume_sub_index_record(&volume_index->vi_hook, name, + record); + mutex_unlock(mutex); + /* Remember the mutex so that other operations on the index record can use it. */ + record->mutex = mutex; + } else { + result = get_volume_sub_index_record(&volume_index->vi_non_hook, name, + record); + } + + return result; +} + +int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter) +{ + int result; + u32 address; + const struct volume_sub_index *sub_index = record->sub_index; + + if (!is_virtual_chapter_indexed(record, virtual_chapter)) { + u64 low = get_zone_for_record(record)->virtual_chapter_low; + u64 high = get_zone_for_record(record)->virtual_chapter_high; + + return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot put record into chapter number %llu that is out of the valid range %llu to %llu", + (unsigned long long) virtual_chapter, + (unsigned long long) low, + (unsigned long long) high); + } + address = extract_address(sub_index, record->name); + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_put_delta_index_entry(&record->delta_entry, address, + convert_virtual_to_index(sub_index, + virtual_chapter), + record->is_found ? record->name->name : NULL); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + switch (result) { + case UDS_SUCCESS: + record->virtual_chapter = virtual_chapter; + record->is_collision = record->delta_entry.is_collision; + record->is_found = true; + break; + case UDS_OVERFLOW: + uds_log_ratelimit(uds_log_warning_strerror, UDS_OVERFLOW, + "Volume index entry dropped due to overflow condition"); + uds_log_delta_index_entry(&record->delta_entry); + break; + default: + break; + } + + return result; +} + +int uds_remove_volume_index_record(struct volume_index_record *record) +{ + int result; + + if (!record->is_found) + return uds_log_warning_strerror(UDS_BAD_STATE, + "illegal operation on new record"); + + /* Mark the record so that it cannot be used again */ + record->is_found = false; + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_remove_delta_index_entry(&record->delta_entry); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + return result; +} + +static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index, + unsigned int zone_number, + u64 virtual_chapter) +{ + u64 used_bits = 0; + struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; + struct delta_zone *delta_zone; + u32 i; + + zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ? + virtual_chapter - sub_index->chapter_count + 1 : + 0); + zone->virtual_chapter_high = virtual_chapter; + + /* Check to see if the new zone data is too large. */ + delta_zone = &sub_index->delta_index.delta_zones[zone_number]; + for (i = 1; i <= delta_zone->list_count; i++) + used_bits += delta_zone->delta_lists[i].size; + + if (used_bits > sub_index->max_zone_bits) { + /* Expire enough chapters to free the desired space. 
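(Editor's arithmetic: expire_count is one chapter per whole chapter_zone_bits of overrun, plus one, so an overrun of 2.5 chapters' worth of bits expires 3 chapters early.)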
*/ + u64 expire_count = + 1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits; + + if (expire_count == 1) { + uds_log_ratelimit(uds_log_info, + "zone %u: At chapter %llu, expiring chapter %llu early", + zone_number, + (unsigned long long) virtual_chapter, + (unsigned long long) zone->virtual_chapter_low); + zone->early_flushes++; + zone->virtual_chapter_low++; + } else { + u64 first_expired = zone->virtual_chapter_low; + + if (first_expired + expire_count < zone->virtual_chapter_high) { + zone->early_flushes += expire_count; + zone->virtual_chapter_low += expire_count; + } else { + zone->early_flushes += + zone->virtual_chapter_high - zone->virtual_chapter_low; + zone->virtual_chapter_low = zone->virtual_chapter_high; + } + uds_log_ratelimit(uds_log_info, + "zone %u: At chapter %llu, expiring chapters %llu to %llu early", + zone_number, + (unsigned long long) virtual_chapter, + (unsigned long long) first_expired, + (unsigned long long) zone->virtual_chapter_low - 1); + } + } +} + +void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, + unsigned int zone_number, + u64 virtual_chapter) +{ + struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex; + + set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number, + virtual_chapter); + + /* + * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open + * chapter number is changing. + */ + if (has_sparse(volume_index)) { + mutex_lock(mutex); + set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook, + zone_number, virtual_chapter); + mutex_unlock(mutex); + } +} + +/* + * Set the newest open chapter number for the index, while also advancing the oldest valid chapter + * number. + */ +void uds_set_volume_index_open_chapter(struct volume_index *volume_index, + u64 virtual_chapter) +{ + unsigned int zone; + + for (zone = 0; zone < volume_index->zone_count; zone++) + uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter); +} + +int uds_set_volume_index_record_chapter(struct volume_index_record *record, + u64 virtual_chapter) +{ + const struct volume_sub_index *sub_index = record->sub_index; + int result; + + if (!record->is_found) + return uds_log_warning_strerror(UDS_BAD_STATE, + "illegal operation on new record"); + + if (!is_virtual_chapter_indexed(record, virtual_chapter)) { + u64 low = get_zone_for_record(record)->virtual_chapter_low; + u64 high = get_zone_for_record(record)->virtual_chapter_high; + + return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot set chapter number %llu that is out of the valid range %llu to %llu", + (unsigned long long) virtual_chapter, + (unsigned long long) low, + (unsigned long long) high); + } + + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_set_delta_entry_value(&record->delta_entry, + convert_virtual_to_index(sub_index, + virtual_chapter)); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + if (result != UDS_SUCCESS) + return result; + + record->virtual_chapter = virtual_chapter; + return UDS_SUCCESS; +} + +static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + int result; + u32 address = extract_address(sub_index, name); + u32 delta_list_number = extract_dlist_num(sub_index, name); + unsigned int zone_number = get_volume_sub_index_zone(sub_index, name); + const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; + u64 
virtual_chapter;
+	u32 index_chapter;
+	u32 rolling_chapter;
+	struct delta_index_entry delta_entry;
+
+	result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number,
+					   address, name->name, &delta_entry);
+	if (result != UDS_SUCCESS)
+		return NO_CHAPTER;
+
+	if (delta_entry.at_end || (delta_entry.key != address))
+		return NO_CHAPTER;
+
+	index_chapter = uds_get_delta_entry_value(&delta_entry);
+	rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask;
+
+	virtual_chapter = zone->virtual_chapter_low + rolling_chapter;
+	if (virtual_chapter > zone->virtual_chapter_high)
+		return NO_CHAPTER;
+
+	return virtual_chapter;
+}
+
+/* Do a read-only lookup of the record name for sparse cache management. */
+u64 uds_lookup_volume_index_name(const struct volume_index *volume_index,
+				 const struct uds_record_name *name)
+{
+	unsigned int zone_number = uds_get_volume_index_zone(volume_index, name);
+	struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
+	u64 virtual_chapter;
+
+	if (!uds_is_volume_index_sample(volume_index, name))
+		return NO_CHAPTER;
+
+	mutex_lock(mutex);
+	virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name);
+	mutex_unlock(mutex);
+
+	return virtual_chapter;
+}
+
+static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index)
+{
+	uds_reset_delta_index(&sub_index->delta_index);
+}
+
+static void abort_restoring_volume_index(struct volume_index *volume_index)
+{
+	abort_restoring_volume_sub_index(&volume_index->vi_non_hook);
+	if (has_sparse(volume_index))
+		abort_restoring_volume_sub_index(&volume_index->vi_hook);
+}
+
+static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+					    struct buffered_reader **readers,
+					    unsigned int reader_count)
+{
+	unsigned int z;
+	int result;
+	u64 virtual_chapter_low = 0, virtual_chapter_high = 0;
+	unsigned int i;
+
+	for (i = 0; i < reader_count; i++) {
+		struct sub_index_data header;
+		u8 buffer[sizeof(struct sub_index_data)];
+		size_t offset = 0;
+		u32 j;
+
+		result = uds_read_from_buffered_reader(readers[i], buffer,
+						       sizeof(buffer));
+		if (result != UDS_SUCCESS) {
+			return uds_log_warning_strerror(result,
+							"failed to read volume index header");
+		}
+
+		memcpy(&header.magic, buffer, MAGIC_SIZE);
+		offset += MAGIC_SIZE;
+		decode_u64_le(buffer, &offset, &header.volume_nonce);
+		decode_u64_le(buffer, &offset, &header.virtual_chapter_low);
+		decode_u64_le(buffer, &offset, &header.virtual_chapter_high);
+		decode_u32_le(buffer, &offset, &header.first_list);
+		decode_u32_le(buffer, &offset, &header.list_count);
+
+		result = ASSERT(offset == sizeof(buffer),
+				"%zu bytes decoded of %zu expected", offset,
+				sizeof(buffer));
+		if (result != UDS_SUCCESS)
+			result = UDS_CORRUPT_DATA;
+
+		if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
+			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index file had bad magic number");
+		}
+
+		if (sub_index->volume_nonce == 0) {
+			sub_index->volume_nonce = header.volume_nonce;
+		} else if (header.volume_nonce != sub_index->volume_nonce) {
+			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index volume nonce incorrect");
+		}
+
+		if (i == 0) {
+			virtual_chapter_low = header.virtual_chapter_low;
+			virtual_chapter_high = header.virtual_chapter_high;
+		} else if (virtual_chapter_high != header.virtual_chapter_high) {
+			u64 low = header.virtual_chapter_low;
+			u64 high = header.virtual_chapter_high;
+
+			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+							"Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
+							(unsigned long long) virtual_chapter_low,
+							(unsigned long long) virtual_chapter_high,
+							i, (unsigned long long) low,
+							(unsigned long long) high);
+		} else if (virtual_chapter_low < header.virtual_chapter_low) {
+			virtual_chapter_low = header.virtual_chapter_low;
+		}
+
+		for (j = 0; j < header.list_count; j++) {
+			u8 decoded[sizeof(u64)];
+
+			result = uds_read_from_buffered_reader(readers[i], decoded,
+							       sizeof(u64));
+			if (result != UDS_SUCCESS) {
+				return uds_log_warning_strerror(result,
+								"failed to read volume index flush ranges");
+			}
+
+			sub_index->flush_chapters[header.first_list + j] =
+				get_unaligned_le64(decoded);
+		}
+	}
+
+	for (z = 0; z < sub_index->zone_count; z++) {
+		memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone));
+		sub_index->zones[z].virtual_chapter_low = virtual_chapter_low;
+		sub_index->zones[z].virtual_chapter_high = virtual_chapter_high;
+	}
+
+	result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
+						 reader_count);
+	if (result != UDS_SUCCESS)
+		return uds_log_warning_strerror(result, "restoring delta index failed");
+
+	return UDS_SUCCESS;
+}
+
+static int start_restoring_volume_index(struct volume_index *volume_index,
+					struct buffered_reader **buffered_readers,
+					unsigned int reader_count)
+{
+	unsigned int i;
+	int result;
+
+	if (!has_sparse(volume_index)) {
+		return start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+							buffered_readers, reader_count);
+	}
+
+	for (i = 0; i < reader_count; i++) {
+		struct volume_index_data header;
+		u8 buffer[sizeof(struct volume_index_data)];
+		size_t offset = 0;
+
+		result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
+						       sizeof(buffer));
+		if (result != UDS_SUCCESS) {
+			return uds_log_warning_strerror(result,
+							"failed to read volume index header");
+		}
+
+		memcpy(&header.magic, buffer, MAGIC_SIZE);
+		offset += MAGIC_SIZE;
+		decode_u32_le(buffer, &offset, &header.sparse_sample_rate);
+
+		result = ASSERT(offset == sizeof(buffer),
+				"%zu bytes decoded of %zu expected", offset,
+				sizeof(buffer));
+		if (result != UDS_SUCCESS)
+			result = UDS_CORRUPT_DATA;
+
+		if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
+			return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+							"volume index file had bad magic number");
+
+		if (i == 0) {
+			volume_index->sparse_sample_rate = header.sparse_sample_rate;
+		} else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
+			uds_log_warning_strerror(UDS_CORRUPT_DATA,
+						 "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
+						 volume_index->sparse_sample_rate,
+						 header.sparse_sample_rate);
+			return UDS_CORRUPT_DATA;
+		}
+	}
+
+	result = start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+						  buffered_readers, reader_count);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers,
+						reader_count);
+}
+
+static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+					     struct buffered_reader **buffered_readers,
+					     unsigned int reader_count)
+{
+	return uds_finish_restoring_delta_index(&sub_index->delta_index,
+						buffered_readers, reader_count);
+}
+
+static int finish_restoring_volume_index(struct volume_index *volume_index,
+					 struct buffered_reader **buffered_readers,
+					 unsigned int reader_count)
+{
+	int result;
+
+	result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook,
+						   buffered_readers, reader_count);
+	if ((result == UDS_SUCCESS) && has_sparse(volume_index)) {
+		result = finish_restoring_volume_sub_index(&volume_index->vi_hook,
+							   buffered_readers,
+							   reader_count);
+	}
+
+	return result;
+}
+
+int uds_load_volume_index(struct volume_index *volume_index,
+			  struct buffered_reader **readers, unsigned int reader_count)
+{
+	int result;
+
+	/* Start by reading the header section of the stream. */
+	result = start_restoring_volume_index(volume_index, readers, reader_count);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = finish_restoring_volume_index(volume_index, readers, reader_count);
+	if (result != UDS_SUCCESS) {
+		abort_restoring_volume_index(volume_index);
+		return result;
+	}
+
+	/* Check the final guard lists to make sure there is no extra data. */
+	result = uds_check_guard_delta_lists(readers, reader_count);
+	if (result != UDS_SUCCESS)
+		abort_restoring_volume_index(volume_index);
+
+	return result;
+}
+
+static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+					 unsigned int zone_number,
+					 struct buffered_writer *buffered_writer)
+{
+	int result;
+	struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number];
+	u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
+	u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
+	u8 buffer[sizeof(struct sub_index_data)];
+	size_t offset = 0;
+	u32 i;
+
+	memcpy(buffer, MAGIC_START_5, MAGIC_SIZE);
+	offset += MAGIC_SIZE;
+	encode_u64_le(buffer, &offset, sub_index->volume_nonce);
+	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low);
+	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high);
+	encode_u32_le(buffer, &offset, first_list);
+	encode_u32_le(buffer, &offset, list_count);
+
+	result = ASSERT(offset == sizeof(struct sub_index_data),
+			"%zu bytes of config written, of %zu expected", offset,
+			sizeof(struct sub_index_data));
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
+	if (result != UDS_SUCCESS)
+		return uds_log_warning_strerror(result,
+						"failed to write volume index header");
+
+	for (i = 0; i < list_count; i++) {
+		u8 encoded[sizeof(u64)];
+
+		put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded);
+		result = uds_write_to_buffered_writer(buffered_writer, encoded,
+						      sizeof(u64));
+		if (result != UDS_SUCCESS) {
+			return uds_log_warning_strerror(result,
+							"failed to write volume index flush ranges");
+		}
+	}
+
+	return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
+					    buffered_writer);
+}
+
+static int start_saving_volume_index(const struct volume_index *volume_index,
+				     unsigned int zone_number,
+				     struct buffered_writer *writer)
+{
+	u8 buffer[sizeof(struct volume_index_data)];
+	size_t offset = 0;
+	int result;
+
+	if (!has_sparse(volume_index)) {
+		return start_saving_volume_sub_index(&volume_index->vi_non_hook,
+						     zone_number, writer);
+	}
+
+	memcpy(buffer, MAGIC_START_6, MAGIC_SIZE);
+	offset += MAGIC_SIZE;
+	encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate);
+	result = ASSERT(offset == sizeof(struct volume_index_data),
+			"%zu bytes of header written, of %zu expected", offset,
+			sizeof(struct volume_index_data));
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_write_to_buffered_writer(writer, buffer, offset);
+	if (result != UDS_SUCCESS) {
+		uds_log_warning_strerror(result, "failed to write volume index header");
+		return result;
+	}
+
+	result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
+					       writer);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
+					     writer);
+}
+
+static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+					  unsigned int zone_number)
+{
+	return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number);
+}
+
+static int finish_saving_volume_index(const struct volume_index *volume_index,
+				      unsigned int zone_number)
+{
+	int result;
+
+	result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number);
+	if ((result == UDS_SUCCESS) && has_sparse(volume_index))
+		result = finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number);
+	return result;
+}
+
+int uds_save_volume_index(struct volume_index *volume_index,
+			  struct buffered_writer **writers, unsigned int writer_count)
+{
+	int result = UDS_SUCCESS;
+	unsigned int zone;
+
+	for (zone = 0; zone < writer_count; zone++) {
+		result = start_saving_volume_index(volume_index, zone, writers[zone]);
+		if (result != UDS_SUCCESS)
+			break;
+
+		result = finish_saving_volume_index(volume_index, zone);
+		if (result != UDS_SUCCESS)
+			break;
+
+		result = uds_write_guard_delta_list(writers[zone]);
+		if (result != UDS_SUCCESS)
+			break;
+
+		result = uds_flush_buffered_writer(writers[zone]);
+		if (result != UDS_SUCCESS)
+			break;
+	}
+
+	return result;
+}
+
+static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index,
+				       struct volume_index_stats *stats)
+{
+	struct delta_index_stats dis;
+	unsigned int z;
+
+	uds_get_delta_index_stats(&sub_index->delta_index, &dis);
+	stats->rebalance_time = dis.rebalance_time;
+	stats->rebalance_count = dis.rebalance_count;
+	stats->record_count = dis.record_count;
+	stats->collision_count = dis.collision_count;
+	stats->discard_count = dis.discard_count;
+	stats->overflow_count = dis.overflow_count;
+	stats->delta_lists = dis.list_count;
+	stats->early_flushes = 0;
+	for (z = 0; z < sub_index->zone_count; z++)
+		stats->early_flushes += sub_index->zones[z].early_flushes;
+}
+
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+				struct volume_index_stats *stats)
+{
+	struct volume_index_stats sparse_stats;
+
+	get_volume_sub_index_stats(&volume_index->vi_non_hook, stats);
+	if (!has_sparse(volume_index))
+		return;
+
+	get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats);
+	stats->rebalance_time += sparse_stats.rebalance_time;
+	stats->rebalance_count += sparse_stats.rebalance_count;
+	stats->record_count += sparse_stats.record_count;
+	stats->collision_count += sparse_stats.collision_count;
+	stats->discard_count += sparse_stats.discard_count;
+	stats->overflow_count += sparse_stats.overflow_count;
+	stats->delta_lists += sparse_stats.delta_lists;
+	stats->early_flushes += sparse_stats.early_flushes;
+}
+
+static int initialize_volume_sub_index(const struct uds_configuration *config,
+				       u64 volume_nonce, u8 tag,
+				       struct volume_sub_index *sub_index)
+{
+	struct sub_index_parameters params = { .address_bits = 0 };
+	unsigned int zone_count = config->zone_count;
+	u64 available_bytes = 0;
+	unsigned int z;
+	int result;
+
+	result = compute_volume_sub_index_parameters(config, &params);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	sub_index->address_bits = params.address_bits;
+	sub_index->address_mask = (1u << params.address_bits) - 1;
+	sub_index->chapter_bits = params.chapter_bits;
+	sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
+	sub_index->chapter_count = params.chapter_count;
+	sub_index->list_count = params.list_count;
+	sub_index->zone_count = zone_count;
+	sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
+	sub_index->volume_nonce = volume_nonce;
+
+	result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
+					    params.list_count, params.mean_delta,
+					    params.chapter_bits, params.memory_size,
+					    tag);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	for (z = 0; z < sub_index->delta_index.zone_count; z++)
+		available_bytes += sub_index->delta_index.delta_zones[z].size;
+	available_bytes -= params.target_free_bytes;
+	sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
+	sub_index->memory_size = (sub_index->delta_index.memory_size +
+				  sizeof(struct volume_sub_index) +
+				  (params.list_count * sizeof(u64)) +
+				  (zone_count * sizeof(struct volume_sub_index_zone)));
+
+	/* The following arrays are initialized to all zeros. */
+	result = uds_allocate(params.list_count, u64, "first chapter to flush",
+			      &sub_index->flush_chapters);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return uds_allocate(zone_count, struct volume_sub_index_zone,
+			    "volume index zones", &sub_index->zones);
+}
+
+int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce,
+			  struct volume_index **volume_index_ptr)
+{
+	struct split_config split;
+	unsigned int zone;
+	struct volume_index *volume_index;
+	int result;
+
+	result = uds_allocate(1, struct volume_index, "volume index", &volume_index);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	volume_index->zone_count = config->zone_count;
+
+	if (!uds_is_sparse_index_geometry(config->geometry)) {
+		result = initialize_volume_sub_index(config, volume_nonce, 'm',
+						     &volume_index->vi_non_hook);
+		if (result != UDS_SUCCESS) {
+			uds_free_volume_index(volume_index);
+			return result;
+		}
+
+		volume_index->memory_size = volume_index->vi_non_hook.memory_size;
+		*volume_index_ptr = volume_index;
+		return UDS_SUCCESS;
+	}
+
+	volume_index->sparse_sample_rate = config->sparse_sample_rate;
+
+	result = uds_allocate(config->zone_count, struct volume_index_zone,
+			      "volume index zones", &volume_index->zones);
+	if (result != UDS_SUCCESS) {
+		uds_free_volume_index(volume_index);
+		return result;
+	}
+
+	for (zone = 0; zone < config->zone_count; zone++)
+		mutex_init(&volume_index->zones[zone].hook_mutex);
+
+	split_configuration(config, &split);
+	result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd',
+					     &volume_index->vi_non_hook);
+	if (result != UDS_SUCCESS) {
+		uds_free_volume_index(volume_index);
+		return uds_log_error_strerror(result,
+					      "Error creating non hook volume index");
+	}
+
+	result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's',
+					     &volume_index->vi_hook);
+	if (result != UDS_SUCCESS) {
+		uds_free_volume_index(volume_index);
+		return uds_log_error_strerror(result,
+					      "Error creating hook volume index");
+	}
+
+	volume_index->memory_size =
+		volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size;
+	*volume_index_ptr = volume_index;
+	return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h
new file mode 100644
index 0000000000000..583998c547b7b
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume-index.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_VOLUME_INDEX_H
+#define UDS_VOLUME_INDEX_H
+
+#include <linux/limits.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "delta-index.h"
+#include "indexer.h"
+
+/*
+ * The volume index is the primary top-level index for UDS. It contains records which map a record
+ * name to the chapter where a record with that name is stored. This mapping can definitively say
+ * when no record exists. However, because we only use a subset of the name for this index, it
+ * cannot definitively say that a record for the entry does exist. It can only say that if a record
+ * exists, it will be in a particular chapter. The request can then be dispatched to that chapter
+ * for further processing.
+ *
+ * If the volume_index_record does not actually match the record name, the index can store a more
+ * specific collision record to disambiguate the new entry from the existing one. Index entries are
+ * managed with volume_index_record structures.
+ */
+
+#define NO_CHAPTER U64_MAX
+
+struct volume_index_stats {
+	/* Nanoseconds spent rebalancing */
+	ktime_t rebalance_time;
+	/* Number of memory rebalances */
+	u32 rebalance_count;
+	/* The number of records in the index */
+	u64 record_count;
+	/* The number of collision records */
+	u64 collision_count;
+	/* The number of records removed */
+	u64 discard_count;
+	/* The number of UDS_OVERFLOWs detected */
+	u64 overflow_count;
+	/* The number of delta lists */
+	u32 delta_lists;
+	/* Number of early flushes */
+	u64 early_flushes;
+};
+
+struct volume_sub_index_zone {
+	u64 virtual_chapter_low;
+	u64 virtual_chapter_high;
+	u64 early_flushes;
+} __aligned(L1_CACHE_BYTES);
+
+struct volume_sub_index {
+	/* The delta index */
+	struct delta_index delta_index;
+	/* The first chapter to be flushed in each zone */
+	u64 *flush_chapters;
+	/* The zones */
+	struct volume_sub_index_zone *zones;
+	/* The volume nonce */
+	u64 volume_nonce;
+	/* Expected size of a chapter (per zone) */
+	u64 chapter_zone_bits;
+	/* Maximum size of the index (per zone) */
+	u64 max_zone_bits;
+	/* The number of bits in address mask */
+	u8 address_bits;
+	/* Mask to get address within delta list */
+	u32 address_mask;
+	/* The number of bits in chapter number */
+	u8 chapter_bits;
+	/* The largest storable chapter number */
+	u32 chapter_mask;
+	/* The number of chapters used */
+	u32 chapter_count;
+	/* The number of delta lists */
+	u32 list_count;
+	/* The number of zones */
+	unsigned int zone_count;
+	/* The amount of memory allocated */
+	u64 memory_size;
+};
+
+struct volume_index_zone {
+	/* Protects the sampled index in this zone */
+	struct mutex hook_mutex;
+} __aligned(L1_CACHE_BYTES);
+
+struct volume_index {
+	u32 sparse_sample_rate;
+	unsigned int zone_count;
+	u64 memory_size;
+	struct volume_sub_index vi_non_hook;
+	struct volume_sub_index vi_hook;
+	struct volume_index_zone *zones;
+};
+
+/*
+ * The volume_index_record structure is used to facilitate processing of a record name. A client
+ * first calls uds_get_volume_index_record() to find the volume index record for a record name. The
+ * fields of the record can then be examined to determine the state of the record.
+ *
+ * If is_found is false, then the index did not find an entry for the record name. Calling
+ * uds_put_volume_index_record() will insert a new entry for that name at the proper place.
+ *
+ * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and
+ * is_collision fields reflect the entry found. Subsequently, a call to
+ * uds_remove_volume_index_record() will remove the entry, a call to
+ * uds_set_volume_index_record_chapter() will update the existing entry, and a call to
+ * uds_put_volume_index_record() will insert a new collision record after the existing entry.
+ */
+struct volume_index_record {
+	/* Public fields */
+
+	/* Chapter where the record info is found */
+	u64 virtual_chapter;
+	/* This record is a collision */
+	bool is_collision;
+	/* This record is the requested record */
+	bool is_found;
+
+	/* Private fields */
+
+	/* Zone that contains this name */
+	unsigned int zone_number;
+	/* The volume index */
+	struct volume_sub_index *sub_index;
+	/* Mutex for accessing this delta index entry in the hook index */
+	struct mutex *mutex;
+	/* The record name to which this record refers */
+	const struct uds_record_name *name;
+	/* The delta index entry for this record */
+	struct delta_index_entry delta_entry;
+};
+
+int __must_check uds_make_volume_index(const struct uds_configuration *config,
+				       u64 volume_nonce,
+				       struct volume_index **volume_index);
+
+void uds_free_volume_index(struct volume_index *volume_index);
+
+int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
+						      size_t block_size,
+						      u64 *block_count);
+
+unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index,
+						    const struct uds_record_name *name);
+
+bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index,
+					     const struct uds_record_name *name);
+
+/*
+ * This function is only used to manage sparse cache membership. Most requests should use
+ * uds_get_volume_index_record() to look up index records instead.
+ */
+u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index,
+					      const struct uds_record_name *name);
+
+int __must_check uds_get_volume_index_record(struct volume_index *volume_index,
+					     const struct uds_record_name *name,
+					     struct volume_index_record *record);
+
+int __must_check uds_put_volume_index_record(struct volume_index_record *record,
+					     u64 virtual_chapter);
+
+int __must_check uds_remove_volume_index_record(struct volume_index_record *record);
+
+int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record,
+						     u64 virtual_chapter);
+
+void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
+				       u64 virtual_chapter);
+
+void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
+					    unsigned int zone_number,
+					    u64 virtual_chapter);
+
+int __must_check uds_load_volume_index(struct volume_index *volume_index,
+				       struct buffered_reader **readers,
+				       unsigned int reader_count);
+
+int __must_check uds_save_volume_index(struct volume_index *volume_index,
+				       struct buffered_writer **writers,
+				       unsigned int writer_count);
+
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+				struct volume_index_stats *stats);
+
+#endif /* UDS_VOLUME_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
new file mode 100644
index 0000000000000..eca83b6cab356
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -0,0 +1,1695 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "volume.h"
+
+#include <linux/atomic.h>
+#include <linux/dm-bufio.h>
+#include <linux/err.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "geometry.h"
+#include "hash-utils.h"
+#include "index.h"
+#include "sparse-cache.h"
+
+/*
+ * The first block of the volume layout is reserved for the volume header, which is no longer used.
+ * The remainder of the volume is divided into chapters consisting of several pages of records, and
+ * several pages of static index to use to find those records. The index pages are recorded first,
+ * followed by the record pages. The chapters are written in order as they are filled, so the
+ * volume storage acts as a circular log of the most recent chapters, with each new chapter
+ * overwriting the oldest saved one.
+ *
+ * When a new chapter is filled and closed, the records from that chapter are sorted and
+ * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
+ * index is generated to store which record page contains each record. The in-memory index page map
+ * is also updated to indicate which delta lists fall on each chapter index page. This means that
+ * when a record is read, the volume only has to load a single index page and a single record page,
+ * rather than search the entire chapter. These index and record pages are written to storage, and
+ * the index pages are transferred to the page cache under the theory that the most recently
+ * written chapter is likely to be accessed again soon.
+ *
+ * When reading a record, the volume index will indicate which chapter should contain it. The
+ * volume uses the index page map to determine which chapter index page needs to be loaded, and
+ * then reads the relevant record page number from the chapter index. Both index and record pages
+ * are stored in a page cache when read for the common case that subsequent records need the same
+ * pages. The page cache evicts the least recently accessed entries when caching new pages. In
+ * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
+ * additional caching depending on available system resources.
+ *
+ * Record requests are handled from cached pages when possible. If a page needs to be read, it is
+ * placed on a queue along with the request that wants to read it. Any requests for the same page
+ * that arrive while the read is pending are added to the queue entry. A separate reader thread
+ * handles the queued reads, adding the page to the cache and updating any requests queued with it
+ * so they can continue processing. This allows the index zone threads to continue processing new
+ * requests rather than wait for the storage reads.
+ *
+ * When an index rebuild is necessary, the volume reads each stored chapter to determine which
+ * range of chapters contain valid records, so that those records can be used to reconstruct the
+ * in-memory volume index.
+ */
+
+enum {
+	/* The maximum allowable number of contiguous bad chapters */
+	MAX_BAD_CHAPTERS = 100,
+	VOLUME_CACHE_MAX_ENTRIES = (U16_MAX >> 1),
+	VOLUME_CACHE_QUEUED_FLAG = (1 << 15),
+	VOLUME_CACHE_MAX_QUEUED_READS = 4096,
+};
+
+static const u64 BAD_CHAPTER = U64_MAX;
+
+/*
+ * The invalidate counter is two 32-bit fields stored together atomically. The low order 32 bits
+ * are the physical page number of the cached page being read. The high order 32 bits are a
+ * sequence number. This value is written when the zone that owns it begins or completes a cache
+ * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
+ * to update the cache contents.
+ */
+union invalidate_counter {
+	u64 value;
+	struct {
+		u32 page;
+		u32 counter;
+	};
+};
+
+static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page)
+{
+	return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter;
+}
+
+static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page)
+{
+	return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter;
+}
+
+static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page)
+{
+	return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter;
+}
+
+static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page)
+{
+	/* Page zero is the header page, so the first chapter index page is page one. */
+	return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page;
+}
+
+static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache,
+							      unsigned int zone_number)
+{
+	return (union invalidate_counter) {
+		.value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value),
+	};
+}
+
+static inline void set_invalidate_counter(struct page_cache *cache,
+					  unsigned int zone_number,
+					  union invalidate_counter invalidate_counter)
+{
+	WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value,
+		   invalidate_counter.value);
+}
+
+static inline bool search_pending(union invalidate_counter invalidate_counter)
+{
+	return (invalidate_counter.counter & 1) != 0;
+}
+
+/* Lock the cache for a zone in order to search for a page. */
+static void begin_pending_search(struct page_cache *cache, u32 physical_page,
+				 unsigned int zone_number)
+{
+	union invalidate_counter invalidate_counter =
+		get_invalidate_counter(cache, zone_number);
+
+	invalidate_counter.page = physical_page;
+	invalidate_counter.counter++;
+	set_invalidate_counter(cache, zone_number, invalidate_counter);
+	ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+			"Search is pending for zone %u", zone_number);
+	/*
+	 * This memory barrier ensures that the write to the invalidate counter is seen by other
+	 * threads before this thread accesses the cached page. The corresponding read memory
+	 * barrier is in wait_for_pending_searches().
+	 */
+	smp_mb();
+}
+
+/* Unlock the cache for a zone by clearing its invalidate counter. */
+static void end_pending_search(struct page_cache *cache, unsigned int zone_number)
+{
+	union invalidate_counter invalidate_counter;
+
+	/*
+	 * This memory barrier ensures that this thread completes reads of the
+	 * cached page before other threads see the write to the invalidate
+	 * counter.
+	 */
+	smp_mb();
+
+	invalidate_counter = get_invalidate_counter(cache, zone_number);
+	ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+			"Search is pending for zone %u", zone_number);
+	invalidate_counter.counter++;
+	set_invalidate_counter(cache, zone_number, invalidate_counter);
+}
+
+static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page)
+{
+	union invalidate_counter initial_counters[MAX_ZONES];
+	unsigned int i;
+
+	/*
+	 * We hold the read_threads_mutex. We are waiting for threads that do not hold the
+	 * read_threads_mutex. Those threads have "locked" their targeted page by setting the
+	 * search_pending_counter. The corresponding write memory barrier is in
+	 * begin_pending_search().
+	 */
+	smp_mb();
+
+	for (i = 0; i < cache->zone_count; i++)
+		initial_counters[i] = get_invalidate_counter(cache, i);
+	for (i = 0; i < cache->zone_count; i++) {
+		if (search_pending(initial_counters[i]) &&
+		    (initial_counters[i].page == physical_page)) {
+			/*
+			 * There is an active search using the physical page. We need to wait for
+			 * the search to finish.
+			 *
+			 * FIXME: Investigate using wait_event() to wait for the search to finish.
+			 */
+			while (initial_counters[i].value ==
+			       get_invalidate_counter(cache, i).value)
+				cond_resched();
+		}
+	}
+}
+
+static void release_page_buffer(struct cached_page *page)
+{
+	if (page->buffer != NULL)
+		dm_bufio_release(uds_forget(page->buffer));
+}
+
+static void clear_cache_page(struct page_cache *cache, struct cached_page *page)
+{
+	/* Do not clear read_pending because the read queue relies on it. */
+	release_page_buffer(page);
+	page->physical_page = cache->indexable_pages;
+	WRITE_ONCE(page->last_used, 0);
+}
+
+static void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
+{
+	/*
+	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+	 * thread holding the read_threads_mutex.
+	 */
+	if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used))
+		WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock));
+}
+
+/* Select a page to remove from the cache to make space for a new entry. */
+static struct cached_page *select_victim_in_cache(struct page_cache *cache)
+{
+	struct cached_page *page;
+	int oldest_index = 0;
+	s64 oldest_time = S64_MAX;
+	s64 last_used;
+	u16 i;
+
+	/* Find the oldest unclaimed page. We hold the read_threads_mutex. */
+	for (i = 0; i < cache->cache_slots; i++) {
+		/* A page with a pending read must not be replaced. */
+		if (cache->cache[i].read_pending)
+			continue;
+
+		last_used = READ_ONCE(cache->cache[i].last_used);
+		if (last_used <= oldest_time) {
+			oldest_time = last_used;
+			oldest_index = i;
+		}
+	}
+
+	page = &cache->cache[oldest_index];
+	if (page->physical_page != cache->indexable_pages) {
+		WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
+		wait_for_pending_searches(cache, page->physical_page);
+	}
+
+	page->read_pending = true;
+	clear_cache_page(cache, page);
+	return page;
+}
+
+/* Make a newly filled cache entry available to other threads. */
+static int put_page_in_cache(struct page_cache *cache, u32 physical_page,
+			     struct cached_page *page)
+{
+	int result;
+
+	/* We hold the read_threads_mutex. */
+	result = ASSERT((page->read_pending), "page to install has a pending read");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	page->physical_page = physical_page;
+	make_page_most_recent(cache, page);
+	page->read_pending = false;
+
+	/*
+	 * We hold the read_threads_mutex, but we must have a write memory barrier before making
+	 * the cached_page available to the readers that do not hold the mutex. The corresponding
+	 * read memory barrier is in get_page_and_index().
+	 */
+	smp_wmb();
+
+	/* This assignment also clears the queued flag. */
+	WRITE_ONCE(cache->index[physical_page], page - cache->cache);
+	return UDS_SUCCESS;
+}
+
+static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page,
+				 struct cached_page *page)
+{
+	int result;
+
+	/* We hold the read_threads_mutex. */
+	result = ASSERT((page->read_pending), "page to install has a pending read");
+	if (result != UDS_SUCCESS)
+		return;
+
+	clear_cache_page(cache, page);
+	page->read_pending = false;
+
+	/* Clear the mapping and the queued flag for the new page. */
+	WRITE_ONCE(cache->index[physical_page], cache->cache_slots);
+}
+
+static inline u16 next_queue_position(u16 position)
+{
+	return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS;
+}
+
+static inline void advance_queue_position(u16 *position)
+{
+	*position = next_queue_position(*position);
+}
+
+static inline bool read_queue_is_full(struct page_cache *cache)
+{
+	return cache->read_queue_first == next_queue_position(cache->read_queue_last);
+}
+
+static bool enqueue_read(struct page_cache *cache, struct uds_request *request,
+			 u32 physical_page)
+{
+	struct queued_read *queue_entry;
+	u16 last = cache->read_queue_last;
+	u16 read_queue_index;
+
+	/* We hold the read_threads_mutex. */
+	if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) {
+		/* This page has no existing entry in the queue. */
+		if (read_queue_is_full(cache))
+			return false;
+
+		/* Fill in the read queue entry. */
+		cache->read_queue[last].physical_page = physical_page;
+		cache->read_queue[last].invalid = false;
+		cache->read_queue[last].first_request = NULL;
+		cache->read_queue[last].last_request = NULL;
+
+		/* Point the cache index to the read queue entry. */
+		read_queue_index = last;
+		WRITE_ONCE(cache->index[physical_page],
+			   read_queue_index | VOLUME_CACHE_QUEUED_FLAG);
+
+		advance_queue_position(&cache->read_queue_last);
+	} else {
+		/* It's already queued, so add this request to the existing entry. */
+		read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG;
+	}
+
+	request->next_request = NULL;
+	queue_entry = &cache->read_queue[read_queue_index];
+	if (queue_entry->first_request == NULL)
+		queue_entry->first_request = request;
+	else
+		queue_entry->last_request->next_request = request;
+	queue_entry->last_request = request;
+
+	return true;
+}
+
+static void enqueue_page_read(struct volume *volume, struct uds_request *request,
+			      u32 physical_page)
+{
+	/* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
+	while (!enqueue_read(&volume->page_cache, request, physical_page)) {
+		uds_log_debug("Read queue full, waiting for reads to finish");
+		uds_wait_cond(&volume->read_threads_read_done_cond,
+			      &volume->read_threads_mutex);
+	}
+
+	uds_signal_cond(&volume->read_threads_cond);
+}
+
+/*
+ * Reserve the next read queue entry for processing, but do not actually remove it from the queue.
+ * Must be followed by release_queued_requests().
+ */
+static struct queued_read *reserve_read_queue_entry(struct page_cache *cache)
+{
+	/* We hold the read_threads_mutex. */
+	struct queued_read *entry;
+	u16 index_value;
+	bool queued;
+
+	/* No items to dequeue */
+	if (cache->read_queue_next_read == cache->read_queue_last)
+		return NULL;
+
+	entry = &cache->read_queue[cache->read_queue_next_read];
+	index_value = cache->index[entry->physical_page];
+	queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
+	/* Check to see if it's still queued before resetting. */
+	if (entry->invalid && queued)
+		WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots);
+
+	/*
+	 * If a synchronous read has taken this page, set invalid to true so it doesn't get
+	 * overwritten. Requests will just be requeued.
+	 */
+	if (!queued)
+		entry->invalid = true;
+
+	entry->reserved = true;
+	advance_queue_position(&cache->read_queue_next_read);
+	return entry;
+}
+
+static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume)
+{
+	struct queued_read *queue_entry = NULL;
+
+	while (!volume->read_threads_exiting) {
+		queue_entry = reserve_read_queue_entry(&volume->page_cache);
+		if (queue_entry != NULL)
+			break;
+
+		uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex);
+	}
+
+	return queue_entry;
+}
+
+static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
+				   u32 chapter, u32 index_page_number,
+				   struct delta_index_page *chapter_index_page)
+{
+	u64 ci_virtual;
+	u32 ci_chapter;
+	u32 lowest_list;
+	u32 highest_list;
+	struct index_geometry *geometry = volume->geometry;
+	int result;
+
+	result = uds_initialize_chapter_index_page(chapter_index_page, geometry,
+						   index_page, volume->nonce);
+	if (volume->lookup_mode == LOOKUP_FOR_REBUILD)
+		return result;
+
+	if (result != UDS_SUCCESS) {
+		return uds_log_error_strerror(result,
+					      "Reading chapter index page for chapter %u page %u",
+					      chapter, index_page_number);
+	}
+
+	uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number,
+				   &lowest_list, &highest_list);
+	ci_virtual = chapter_index_page->virtual_chapter_number;
+	ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual);
+	if ((chapter == ci_chapter) &&
+	    (lowest_list == chapter_index_page->lowest_list_number) &&
+	    (highest_list == chapter_index_page->highest_list_number))
+		return UDS_SUCCESS;
+
+	uds_log_warning("Index page map updated to %llu",
+			(unsigned long long) volume->index_page_map->last_update);
+	uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
+			chapter, index_page_number, lowest_list, highest_list,
+			(unsigned long long) ci_virtual,
+			chapter_index_page->lowest_list_number,
+			chapter_index_page->highest_list_number);
+	return uds_log_error_strerror(UDS_CORRUPT_DATA,
+				      "index page map mismatch with chapter index");
+}
+
+static int initialize_index_page(const struct volume *volume, u32 physical_page,
+				 struct cached_page *page)
+{
+	u32 chapter = map_to_chapter_number(volume->geometry, physical_page);
+	u32 index_page_number = map_to_page_number(volume->geometry, physical_page);
+
+	return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer),
+				       chapter, index_page_number, &page->index_page);
+}
+
+static bool search_record_page(const u8 record_page[],
+			       const struct uds_record_name *name,
+			       const struct index_geometry *geometry,
+			       struct uds_record_data *metadata)
+{
+	/*
+	 * The array of records is sorted by name and stored as a binary tree in heap order, so the
+	 * root of the tree is the first array element.
+	 */
+	u32 node = 0;
+	const struct uds_volume_record *records = (const struct uds_volume_record *) record_page;
+
+	while (node < geometry->records_per_page) {
+		int result;
+		const struct uds_volume_record *record = &records[node];
+
+		result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE);
+		if (result == 0) {
+			if (metadata != NULL)
+				*metadata = record->data;
+			return true;
+		}
+
+		/* The children of node N are at indexes 2N+1 and 2N+2. */
+		node = ((2 * node) + ((result < 0) ? 1 : 2));
+	}
+
+	return false;
+}
+
+/*
+ * If we've read in a record page, we're going to do an immediate search, to speed up processing by
+ * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If
+ * we've read in an index page, we save the record page number so we don't have to resolve the
+ * index page again. We use the location, virtual_chapter, and old_metadata fields in the request
+ * to allow the index code to know where to begin processing the request again.
+ */
+static int search_page(struct cached_page *page, const struct volume *volume,
+		       struct uds_request *request, u32 physical_page)
+{
+	int result;
+	enum uds_index_region location;
+	u16 record_page_number;
+
+	if (is_record_page(volume->geometry, physical_page)) {
+		if (search_record_page(dm_bufio_get_block_data(page->buffer),
+				       &request->record_name, volume->geometry,
+				       &request->old_metadata))
+			location = UDS_LOCATION_RECORD_PAGE_LOOKUP;
+		else
+			location = UDS_LOCATION_UNAVAILABLE;
+	} else {
+		result = uds_search_chapter_index_page(&page->index_page,
+						       volume->geometry,
+						       &request->record_name,
+						       &record_page_number);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (record_page_number == NO_CHAPTER_INDEX_ENTRY) {
+			location = UDS_LOCATION_UNAVAILABLE;
+		} else {
+			location = UDS_LOCATION_INDEX_PAGE_LOOKUP;
+			*((u16 *) &request->old_metadata) = record_page_number;
+		}
+	}
+
+	request->location = location;
+	request->found = false;
+	return UDS_SUCCESS;
+}
+
+static int process_entry(struct volume *volume, struct queued_read *entry)
+{
+	u32 page_number = entry->physical_page;
+	struct uds_request *request;
+	struct cached_page *page = NULL;
+	u8 *page_data;
+	int result;
+
+	if (entry->invalid) {
+		uds_log_debug("Requeuing requests for invalid page");
+		return UDS_SUCCESS;
+	}
+
+	page = select_victim_in_cache(&volume->page_cache);
+
+	mutex_unlock(&volume->read_threads_mutex);
+	page_data = dm_bufio_read(volume->client, page_number, &page->buffer);
+	mutex_lock(&volume->read_threads_mutex);
+	if (IS_ERR(page_data)) {
+		result = -PTR_ERR(page_data);
+		uds_log_warning_strerror(result,
+					 "error reading physical page %u from volume",
+					 page_number);
+		cancel_page_in_cache(&volume->page_cache, page_number, page);
+		return result;
+	}
+
+	if (entry->invalid) {
+		uds_log_warning("Page %u invalidated after read", page_number);
+		cancel_page_in_cache(&volume->page_cache, page_number, page);
+		return UDS_SUCCESS;
+	}
+
+	if (!is_record_page(volume->geometry, page_number)) {
+		result = initialize_index_page(volume, page_number, page);
+		if (result != UDS_SUCCESS) {
+			uds_log_warning("Error initializing chapter index page");
+			cancel_page_in_cache(&volume->page_cache, page_number, page);
+			return result;
+		}
+	}
+
+	result = put_page_in_cache(&volume->page_cache, page_number, page);
+	if (result != UDS_SUCCESS) {
+		uds_log_warning("Error putting page %u in cache", page_number);
+		cancel_page_in_cache(&volume->page_cache, page_number, page);
+		return result;
+	}
+
+	request = entry->first_request;
+	while ((request != NULL) && (result == UDS_SUCCESS)) {
+		result = search_page(page, volume, request, page_number);
+		request = request->next_request;
+	}
+
+	return result;
+}
+
+static void release_queued_requests(struct volume *volume, struct queued_read *entry,
+				    int result)
+{
+	struct page_cache *cache = &volume->page_cache;
+	u16 next_read = cache->read_queue_next_read;
+	struct uds_request *request;
+	struct uds_request *next;
+
+	for (request = entry->first_request; request != NULL; request = next) {
+		next = request->next_request;
+		request->status = result;
+		request->requeued = true;
+		uds_enqueue_request(request, STAGE_INDEX);
+	}
+
+	entry->reserved = false;
+
+	/* Move the read_queue_first pointer as far as we can. */
+	while ((cache->read_queue_first != next_read) &&
+	       (!cache->read_queue[cache->read_queue_first].reserved))
+		advance_queue_position(&cache->read_queue_first);
+	uds_broadcast_cond(&volume->read_threads_read_done_cond);
+}
+
+static void read_thread_function(void *arg)
+{
+	struct volume *volume = arg;
+
+	uds_log_debug("reader starting");
+	mutex_lock(&volume->read_threads_mutex);
+	while (true) {
+		struct queued_read *queue_entry;
+		int result;
+
+		queue_entry = wait_to_reserve_read_queue_entry(volume);
+		if (volume->read_threads_exiting)
+			break;
+
+		result = process_entry(volume, queue_entry);
+		release_queued_requests(volume, queue_entry, result);
+	}
+	mutex_unlock(&volume->read_threads_mutex);
+	uds_log_debug("reader done");
+}
+
+static void get_page_and_index(struct page_cache *cache, u32 physical_page,
+			       int *queue_index, struct cached_page **page_ptr)
+{
+	u16 index_value;
+	u16 index;
+	bool queued;
+
+	/*
+	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+	 * thread holding the read_threads_mutex.
+	 *
+	 * Holding only a search_pending_counter is the most frequent case.
+	 */
+	/*
+	 * It would be unlikely for the compiler to turn the usage of index_value into two reads of
+	 * cache->index, but it would be possible and very bad if those reads did not return the
+	 * same bits.
+	 */
+	index_value = READ_ONCE(cache->index[physical_page]);
+	queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
+	index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
+
+	if (!queued && (index < cache->cache_slots)) {
+		*page_ptr = &cache->cache[index];
+		/*
+		 * We have acquired access to the cached page, but unless we hold the
+		 * read_threads_mutex, we need a read memory barrier now. The corresponding write
+		 * memory barrier is in put_page_in_cache().
+		 */
+		smp_rmb();
+	} else {
+		*page_ptr = NULL;
+	}
+
+	*queue_index = queued ? index : -1;
+}
+
+static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
+				struct cached_page **page)
+{
+	/*
+	 * ASSERTION: We are in a zone thread.
+	 * ASSERTION: We are holding a search_pending_counter or the read_threads_mutex.
+	 */
+	int queue_index = -1;
+
+	get_page_and_index(cache, physical_page, &queue_index, page);
+}
+
+static int read_page_locked(struct volume *volume, u32 physical_page,
+			    struct cached_page **page_ptr)
+{
+	int result = UDS_SUCCESS;
+	struct cached_page *page = NULL;
+	u8 *page_data;
+
+	page = select_victim_in_cache(&volume->page_cache);
+	page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
+	if (IS_ERR(page_data)) {
+		result = -PTR_ERR(page_data);
+		uds_log_warning_strerror(result,
+					 "error reading physical page %u from volume",
+					 physical_page);
+		cancel_page_in_cache(&volume->page_cache, physical_page, page);
+		return result;
+	}
+
+	if (!is_record_page(volume->geometry, physical_page)) {
+		result = initialize_index_page(volume, physical_page, page);
+		if (result != UDS_SUCCESS) {
+			if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
+				uds_log_warning("Corrupt index page %u", physical_page);
+			cancel_page_in_cache(&volume->page_cache, physical_page, page);
+			return result;
+		}
+	}
+
+	result = put_page_in_cache(&volume->page_cache, physical_page, page);
+	if (result != UDS_SUCCESS) {
+		uds_log_warning("Error putting page %u in cache", physical_page);
+		cancel_page_in_cache(&volume->page_cache, physical_page, page);
+		return result;
+	}
+
+	*page_ptr = page;
+	return UDS_SUCCESS;
+}
+
+/* Retrieve a page from the cache while holding the read threads mutex. */
+static int get_volume_page_locked(struct volume *volume, u32 physical_page,
+				  struct cached_page **page_ptr)
+{
+	int result;
+	struct cached_page *page = NULL;
+
+	get_page_from_cache(&volume->page_cache, physical_page, &page);
+	if (page == NULL) {
+		result = read_page_locked(volume, physical_page, &page);
+		if (result != UDS_SUCCESS)
+			return result;
+	} else {
+		make_page_most_recent(&volume->page_cache, page);
+	}
+
+	*page_ptr = page;
+	return UDS_SUCCESS;
+}
+
+/* Retrieve a page from the cache while holding a search_pending lock. */
+static int get_volume_page_protected(struct volume *volume, struct uds_request *request,
+				     u32 physical_page, struct cached_page **page_ptr)
+{
+	struct cached_page *page;
+
+	get_page_from_cache(&volume->page_cache, physical_page, &page);
+	if (page != NULL) {
+		if (request->zone_number == 0) {
+			/* Only one zone is allowed to update the LRU. */
+			make_page_most_recent(&volume->page_cache, page);
+		}
+
+		*page_ptr = page;
+		return UDS_SUCCESS;
+	}
+
+	/* Prepare to enqueue a read for the page. */
+	end_pending_search(&volume->page_cache, request->zone_number);
+	mutex_lock(&volume->read_threads_mutex);
+
+	/*
+	 * Do the lookup again while holding the read mutex (no longer the fast case so this should
+	 * be fine to repeat). We need to do this because a page may have been added to the cache
+	 * by a reader thread between the time we searched above and the time we went to actually
+	 * try to enqueue it below. This could result in us enqueuing another read for a page which
+	 * is already in the cache, which would mean we end up with two entries in the cache for
+	 * the same page.
+	 */
+	get_page_from_cache(&volume->page_cache, physical_page, &page);
+	if (page == NULL) {
+		enqueue_page_read(volume, request, physical_page);
+		/*
+		 * The performance gain from unlocking first, while "search pending" mode is off,
+		 * turns out to be significant in some cases. The page is not available yet so
+		 * the order does not matter for correctness as it does below.
+		 */
+		mutex_unlock(&volume->read_threads_mutex);
+		begin_pending_search(&volume->page_cache, physical_page,
+				     request->zone_number);
+		return UDS_QUEUED;
+	}
+
+	/*
+	 * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and
+	 * "search pending" state in careful order so no other thread can mess with the data before
+	 * the caller gets to look at it.
+	 */
+	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+	mutex_unlock(&volume->read_threads_mutex);
+	*page_ptr = page;
+	return UDS_SUCCESS;
+}
+
+static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number,
+			   struct cached_page **page_ptr)
+{
+	int result;
+	u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+	mutex_lock(&volume->read_threads_mutex);
+	result = get_volume_page_locked(volume, physical_page, page_ptr);
+	mutex_unlock(&volume->read_threads_mutex);
+	return result;
+}
+
+int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number,
+			       u8 **data_ptr)
+{
+	int result;
+	struct cached_page *page = NULL;
+
+	result = get_volume_page(volume, chapter, page_number, &page);
+	if (result == UDS_SUCCESS)
+		*data_ptr = dm_bufio_get_block_data(page->buffer);
+	return result;
+}
+
+int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number,
+			      struct delta_index_page **index_page_ptr)
+{
+	int result;
+	struct cached_page *page = NULL;
+
+	result = get_volume_page(volume, chapter, page_number, &page);
+	if (result == UDS_SUCCESS)
+		*index_page_ptr = &page->index_page;
+	return result;
+}
+
+/*
+ * Find the record page associated with a name in a given index page. This will return UDS_QUEUED
+ * if the page in question must be read from storage.
+ */
+static int search_cached_index_page(struct volume *volume, struct uds_request *request,
+				    u32 chapter, u32 index_page_number,
+				    u16 *record_page_number)
+{
+	int result;
+	struct cached_page *page = NULL;
+	u32 physical_page = map_to_physical_page(volume->geometry, chapter,
+						 index_page_number);
+
+	/*
+	 * Make sure the invalidate counter is updated before we try and read the mapping. This
+	 * prevents this thread from reading a page in the cache which has already been marked for
+	 * invalidation by the reader thread, before the reader thread has noticed that the
+	 * invalidate_counter has been incremented.
+	 */
+	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+
+	result = get_volume_page_protected(volume, request, physical_page, &page);
+	if (result != UDS_SUCCESS) {
+		end_pending_search(&volume->page_cache, request->zone_number);
+		return result;
+	}
+
+	result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
+					       &request->record_name,
+					       record_page_number);
+	end_pending_search(&volume->page_cache, request->zone_number);
+	return result;
+}
+
+/*
+ * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if
+ * the page in question must be read from storage.
+ */
+int uds_search_cached_record_page(struct volume *volume, struct uds_request *request,
+				  u32 chapter, u16 record_page_number, bool *found)
+{
+	struct cached_page *record_page;
+	struct index_geometry *geometry = volume->geometry;
+	int result;
+	u32 physical_page, page_number;
+
+	*found = false;
+	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
+		return UDS_SUCCESS;
+
+	result = ASSERT(record_page_number < geometry->record_pages_per_chapter,
+			"0 <= %d < %u", record_page_number,
+			geometry->record_pages_per_chapter);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	page_number = geometry->index_pages_per_chapter + record_page_number;
+
+	physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+	/*
+	 * Make sure the invalidate counter is updated before we try and read the mapping. This
+	 * prevents this thread from reading a page in the cache which has already been marked for
+	 * invalidation by the reader thread, before the reader thread has noticed that the
+	 * invalidate_counter has been incremented.
+	 */
+	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+
+	result = get_volume_page_protected(volume, request, physical_page, &record_page);
+	if (result != UDS_SUCCESS) {
+		end_pending_search(&volume->page_cache, request->zone_number);
+		return result;
+	}
+
+	if (search_record_page(dm_bufio_get_block_data(record_page->buffer),
+			       &request->record_name, geometry, &request->old_metadata))
+		*found = true;
+
+	end_pending_search(&volume->page_cache, request->zone_number);
+	return UDS_SUCCESS;
+}
+
+void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter)
+{
+	const struct index_geometry *geometry = volume->geometry;
+	u32 physical_page = map_to_physical_page(geometry, chapter, 0);
+
+	dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter);
+}
+
+int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter,
+				       struct dm_buffer *volume_buffers[],
+				       struct delta_index_page index_pages[])
+{
+	int result;
+	u32 i;
+	const struct index_geometry *geometry = volume->geometry;
+	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+	u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0);
+
+	dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter);
+	for (i = 0; i < geometry->index_pages_per_chapter; i++) {
+		u8 *index_page;
+
+		index_page = dm_bufio_read(volume->client, physical_page + i,
+					   &volume_buffers[i]);
+		if (IS_ERR(index_page)) {
+			result = -PTR_ERR(index_page);
+			uds_log_warning_strerror(result,
+						 "error reading physical page %u",
+						 physical_page);
+			return result;
+		}
+
+		result = init_chapter_index_page(volume, index_page, physical_chapter, i,
+						 &index_pages[i]);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return UDS_SUCCESS;
+}
+
+int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request,
+				 bool *found)
+{
+	int result;
+	u32 physical_chapter =
+		uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter);
+	u32 index_page_number;
+	u16 record_page_number;
+
+	index_page_number = uds_find_index_page_number(volume->index_page_map,
+						       &request->record_name,
+						       physical_chapter);
+
+	if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) {
+		record_page_number = *((u16 *) &request->old_metadata);
+	} else {
+		result = search_cached_index_page(volume, request, physical_chapter,
+						  index_page_number,
+						  &record_page_number);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return uds_search_cached_record_page(volume, request, physical_chapter,
+					     record_page_number, found);
+}
+
+int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
+					     const struct uds_record_name *name,
+					     u64 virtual_chapter, bool *found)
+{
+	int result;
+	struct index_geometry *geometry = volume->geometry;
+	struct cached_page *page;
+	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+	u32 index_page_number;
+	u16 record_page_number;
+	u32 page_number;
+
+	*found = false;
+	index_page_number =
+		uds_find_index_page_number(volume->index_page_map, name,
+					   physical_chapter);
+	result = get_volume_page(volume, physical_chapter, index_page_number, &page);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_search_chapter_index_page(&page->index_page, geometry, name,
+					       &record_page_number);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
+		return UDS_SUCCESS;
+
+	page_number = geometry->index_pages_per_chapter + record_page_number;
+	result = get_volume_page(volume, physical_chapter, page_number, &page);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
+				    geometry, NULL);
+	return UDS_SUCCESS;
+}
+
+static void invalidate_page(struct page_cache *cache, u32 physical_page)
+{
+	struct cached_page *page;
+	int queue_index = -1;
+
+	/* We hold the read_threads_mutex. */
+	get_page_and_index(cache, physical_page, &queue_index, &page);
+	if (page != NULL) {
+		WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
+		wait_for_pending_searches(cache, page->physical_page);
+		clear_cache_page(cache, page);
+	} else if (queue_index > -1) {
+		uds_log_debug("setting pending read to invalid");
+		cache->read_queue[queue_index].invalid = true;
+	}
+}
+
+void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
+{
+	u32 physical_chapter =
+		uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
+	u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
+	u32 i;
+
+	uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
+	mutex_lock(&volume->read_threads_mutex);
+	for (i = 0; i < volume->geometry->pages_per_chapter; i++)
+		invalidate_page(&volume->page_cache, first_page + i);
+	mutex_unlock(&volume->read_threads_mutex);
+}
+
+/*
+ * Donate an index page from a newly written chapter to the page cache since it is likely to be
+ * used again soon. The caller must already hold the reader thread mutex.
+ */
+static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
+				    u32 index_page_number, struct dm_buffer *page_buffer)
+{
+	int result;
+	struct cached_page *page = NULL;
+	u32 physical_page =
+		map_to_physical_page(volume->geometry, physical_chapter,
+				     index_page_number);
+
+	page = select_victim_in_cache(&volume->page_cache);
+	page->buffer = page_buffer;
+	result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
+					 physical_chapter, index_page_number,
+					 &page->index_page);
+	if (result != UDS_SUCCESS) {
+		uds_log_warning("Error initializing chapter index page");
+		cancel_page_in_cache(&volume->page_cache, physical_page, page);
+		return result;
+	}
+
+	result = put_page_in_cache(&volume->page_cache, physical_page, page);
+	if (result != UDS_SUCCESS) {
+		uds_log_warning("Error putting page %u in cache", physical_page);
+		cancel_page_in_cache(&volume->page_cache, physical_page, page);
+		return result;
+	}
+
+	return UDS_SUCCESS;
+}
+
+static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
+			     struct open_chapter_index *chapter_index)
+{
+	struct index_geometry *geometry = volume->geometry;
+	struct dm_buffer *page_buffer;
+	u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
+	u32 delta_list_number = 0;
+	u32 index_page_number;
+
+	for (index_page_number = 0;
+	     index_page_number < geometry->index_pages_per_chapter;
+	     index_page_number++) {
+		u8 *page_data;
+		u32 physical_page = first_index_page + index_page_number;
+		u32 lists_packed;
+		bool last_page;
+		int result;
+
+		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
+		if (IS_ERR(page_data)) {
+			return uds_log_warning_strerror(-PTR_ERR(page_data),
+							"failed to prepare index page");
+		}
+
+		last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
+		result = uds_pack_open_chapter_index_page(chapter_index, page_data,
+							  delta_list_number, last_page,
+							  &lists_packed);
+		if (result != UDS_SUCCESS) {
+			dm_bufio_release(page_buffer);
+			return uds_log_warning_strerror(result,
+							"failed to pack index page");
+		}
+
+		dm_bufio_mark_buffer_dirty(page_buffer);
+
+		if (lists_packed == 0) {
+			uds_log_debug("no delta lists packed on chapter %u page %u",
+				      physical_chapter_number, index_page_number);
+		} else {
+			delta_list_number += lists_packed;
+		}
+
+		uds_update_index_page_map(volume->index_page_map,
+					  chapter_index->virtual_chapter_number,
+					  physical_chapter_number, index_page_number,
+					  delta_list_number - 1);
+
+		mutex_lock(&volume->read_threads_mutex);
+		result = donate_index_page_locked(volume, physical_chapter_number,
+						  index_page_number, page_buffer);
+		mutex_unlock(&volume->read_threads_mutex);
+		if (result != UDS_SUCCESS) {
+			dm_bufio_release(page_buffer);
+			return result;
+		}
+	}
+
+	return UDS_SUCCESS;
+}
+
+static u32 encode_tree(u8 record_page[],
+		       const struct uds_volume_record *sorted_pointers[],
+		       u32 next_record, u32 node, u32 node_count)
+{
+	if (node < node_count) {
+		u32 child = (2 * node) + 1;
+
+		next_record = encode_tree(record_page, sorted_pointers, next_record,
+					  child, node_count);
+
+		/*
+		 * In-order traversal: copy the contents of the next record into the page at the
+		 * node offset.
+ */ + memcpy(&record_page[node * BYTES_PER_RECORD], + sorted_pointers[next_record++], BYTES_PER_RECORD); + + next_record = encode_tree(record_page, sorted_pointers, next_record, + child + 1, node_count); + } + + return next_record; +} + +static int encode_record_page(const struct volume *volume, + const struct uds_volume_record records[], u8 record_page[]) +{ + int result; + u32 i; + u32 records_per_page = volume->geometry->records_per_page; + const struct uds_volume_record **record_pointers = volume->record_pointers; + + for (i = 0; i < records_per_page; i++) + record_pointers[i] = &records[i]; + + /* + * Sort the record pointers by using just the names in the records, which is less work than + * sorting the entire record values. + */ + BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0); + result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers, + records_per_page, UDS_RECORD_NAME_SIZE); + if (result != UDS_SUCCESS) + return result; + + encode_tree(record_page, record_pointers, 0, 0, records_per_page); + return UDS_SUCCESS; +} + +static int write_record_pages(struct volume *volume, u32 physical_chapter_number, + const struct uds_volume_record *records) +{ + u32 record_page_number; + struct index_geometry *geometry = volume->geometry; + struct dm_buffer *page_buffer; + const struct uds_volume_record *next_record = records; + u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number, + geometry->index_pages_per_chapter); + + for (record_page_number = 0; + record_page_number < geometry->record_pages_per_chapter; + record_page_number++) { + u8 *page_data; + u32 physical_page = first_record_page + record_page_number; + int result; + + page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); + if (IS_ERR(page_data)) { + return uds_log_warning_strerror(-PTR_ERR(page_data), + "failed to prepare record page"); + } + + result = encode_record_page(volume, next_record, page_data); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return uds_log_warning_strerror(result, + "failed to encode record page %u", + record_page_number); + } + + next_record += geometry->records_per_page; + dm_bufio_mark_buffer_dirty(page_buffer); + dm_bufio_release(page_buffer); + } + + return UDS_SUCCESS; +} + +int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index, + const struct uds_volume_record *records) +{ + int result; + u32 physical_chapter_number = + uds_map_to_physical_chapter(volume->geometry, + chapter_index->virtual_chapter_number); + + result = write_index_pages(volume, physical_chapter_number, chapter_index); + if (result != UDS_SUCCESS) + return result; + + result = write_record_pages(volume, physical_chapter_number, records); + if (result != UDS_SUCCESS) + return result; + + result = -dm_bufio_write_dirty_buffers(volume->client); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "cannot sync chapter to volume"); + + return result; +} + +static void probe_chapter(struct volume *volume, u32 chapter_number, + u64 *virtual_chapter_number) +{ + const struct index_geometry *geometry = volume->geometry; + u32 expected_list_number = 0; + u32 i; + u64 vcn = BAD_CHAPTER; + + *virtual_chapter_number = BAD_CHAPTER; + dm_bufio_prefetch(volume->client, + map_to_physical_page(geometry, chapter_number, 0), + geometry->index_pages_per_chapter); + + for (i = 0; i < geometry->index_pages_per_chapter; i++) { + struct delta_index_page *page; + int result; + + result = uds_get_volume_index_page(volume, 
chapter_number, i, &page); + if (result != UDS_SUCCESS) + return; + + if (page->virtual_chapter_number == BAD_CHAPTER) { + uds_log_error("corrupt index page in chapter %u", + chapter_number); + return; + } + + if (vcn == BAD_CHAPTER) { + vcn = page->virtual_chapter_number; + } else if (page->virtual_chapter_number != vcn) { + uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu", + chapter_number, i, (unsigned long long) vcn, + (unsigned long long) page->virtual_chapter_number); + return; + } + + if (expected_list_number != page->lowest_list_number) { + uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u", + chapter_number, i, expected_list_number, + page->lowest_list_number); + return; + } + expected_list_number = page->highest_list_number + 1; + + result = uds_validate_chapter_index_page(page, geometry); + if (result != UDS_SUCCESS) + return; + } + + if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) { + uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number, + (unsigned long long) vcn, geometry->chapters_per_volume); + return; + } + + *virtual_chapter_number = vcn; +} + +/* Find the last valid physical chapter in the volume. */ +static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr) +{ + u32 span = 1; + u32 tries = 0; + + while (limit > 0) { + u32 chapter = (span > limit) ? 0 : limit - span; + u64 vcn = 0; + + probe_chapter(volume, chapter, &vcn); + if (vcn == BAD_CHAPTER) { + limit = chapter; + if (++tries > 1) + span *= 2; + } else { + if (span == 1) + break; + span /= 2; + tries = 0; + } + } + + *limit_ptr = limit; +} + +static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn, + u64 *highest_vcn) +{ + struct index_geometry *geometry = volume->geometry; + u64 zero_vcn; + u64 lowest = BAD_CHAPTER; + u64 highest = BAD_CHAPTER; + u64 moved_chapter = BAD_CHAPTER; + u32 left_chapter = 0; + u32 right_chapter = 0; + u32 bad_chapters = 0; + + /* + * This method assumes there is at most one run of contiguous bad chapters caused by + * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the + * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the + * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately + * precedes the lowest one. + */ + + /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */ + probe_chapter(volume, 0, &zero_vcn); + + /* + * Binary search for end of the discontinuity in the monotonically increasing virtual + * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're + * searching for the index of the smallest value less than zero_vcn. In the case we go off + * the end it means that chapter 0 has the lowest vcn. + * + * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always + * skip over the moved chapter when searching, adding it to the range at the end if + * necessary. 
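(Worked example, for illustration only: suppose chapter_limit == 8, the physical chapters hold VCNs 8, 9, 10, 3, 4, 5, 6, 7, and there is no moved chapter, so zero_vcn == 8. Chapters 0 through 2 satisfy zero_vcn <= probe_vcn and chapters 3 through 7 do not, so the search below converges on left_chapter == 3, the chapter holding the lowest VCN (3); the backwards scan from right_chapter then finds the highest VCN (10) in chapter 2.)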
+ */ + if (geometry->remapped_physical > 0) { + u64 remapped_vcn; + + probe_chapter(volume, geometry->remapped_physical, &remapped_vcn); + if (remapped_vcn == geometry->remapped_virtual) + moved_chapter = geometry->remapped_physical; + } + + left_chapter = 0; + right_chapter = chapter_limit; + + while (left_chapter < right_chapter) { + u64 probe_vcn; + u32 chapter = (left_chapter + right_chapter) / 2; + + if (chapter == moved_chapter) + chapter--; + + probe_chapter(volume, chapter, &probe_vcn); + if (zero_vcn <= probe_vcn) { + left_chapter = chapter + 1; + if (left_chapter == moved_chapter) + left_chapter++; + } else { + right_chapter = chapter; + } + } + + /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number.*/ + if (left_chapter >= chapter_limit) + left_chapter = 0; + + /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */ + probe_chapter(volume, left_chapter, &lowest); + + /* The moved chapter might be the lowest in the range. */ + if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1)) + lowest = geometry->remapped_virtual; + + /* + * Circularly scan backwards, moving over any bad chapters until encountering a good one, + * which is the chapter with the highest vcn. + */ + while (highest == BAD_CHAPTER) { + right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit; + if (right_chapter == moved_chapter) + continue; + + probe_chapter(volume, right_chapter, &highest); + if (bad_chapters++ >= MAX_BAD_CHAPTERS) { + uds_log_error("too many bad chapters in volume: %u", + bad_chapters); + return UDS_CORRUPT_DATA; + } + } + + *lowest_vcn = lowest; + *highest_vcn = highest; + return UDS_SUCCESS; +} + +/* + * Find the highest and lowest contiguous chapters present in the volume and determine their + * virtual chapter numbers. This is used by rebuild. 
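A minimal sketch of a rebuild-side caller of the function below; the wrapper, its name, and the logging are hypothetical, shown only to make the contract of the out-parameters concrete:

static int example_plan_rebuild(struct volume *volume)
{
	int result;
	u64 vcn;
	u64 lowest;
	u64 highest;
	bool is_empty;

	result = uds_find_volume_chapter_boundaries(volume, &lowest, &highest,
						    &is_empty);
	if (result != UDS_SUCCESS)
		return result;

	/* An empty volume reports lowest == highest == 0 and is_empty == true. */
	if (is_empty)
		return UDS_SUCCESS;

	/* Replay the surviving chapters from oldest to newest. */
	for (vcn = lowest; vcn <= highest; vcn++)
		uds_log_debug("would replay virtual chapter %llu",
			      (unsigned long long) vcn);

	return UDS_SUCCESS;
}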
+ */ +int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn, + u64 *highest_vcn, bool *is_empty) +{ + u32 chapter_limit = volume->geometry->chapters_per_volume; + + find_real_end_of_volume(volume, chapter_limit, &chapter_limit); + if (chapter_limit == 0) { + *lowest_vcn = 0; + *highest_vcn = 0; + *is_empty = true; + return UDS_SUCCESS; + } + + *is_empty = false; + return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn); +} + +int __must_check uds_replace_volume_storage(struct volume *volume, + struct index_layout *layout, + struct block_device *bdev) +{ + int result; + u32 i; + + result = uds_replace_index_layout_storage(layout, bdev); + if (result != UDS_SUCCESS) + return result; + + /* Release all outstanding dm_bufio objects */ + for (i = 0; i < volume->page_cache.indexable_pages; i++) + volume->page_cache.index[i] = volume->page_cache.cache_slots; + for (i = 0; i < volume->page_cache.cache_slots; i++) + clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]); + if (volume->sparse_cache != NULL) + uds_invalidate_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(uds_forget(volume->client)); + + return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page, + volume->reserved_buffers, &volume->client); +} + +static int __must_check initialize_page_cache(struct page_cache *cache, + const struct index_geometry *geometry, + u32 chapters_in_cache, + unsigned int zone_count) +{ + int result; + u32 i; + + cache->indexable_pages = geometry->pages_per_volume + 1; + cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter; + cache->zone_count = zone_count; + atomic64_set(&cache->clock, 1); + + result = ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, + "volume read queue", &cache->read_queue); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(cache->zone_count, struct search_pending_counter, + "Volume Cache Zones", &cache->search_pending_counters); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(cache->indexable_pages, u16, "page cache index", + &cache->index); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate(cache->cache_slots, struct cached_page, "page cache cache", + &cache->cache); + if (result != UDS_SUCCESS) + return result; + + /* Initialize index values to invalid values. 
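(Note, for illustration: cache->cache_slots serves as an out-of-band sentinel here, since valid slot numbers run from 0 to cache_slots - 1; an index entry equal to cache_slots therefore means "this physical page is not cached". invalidate_page() and uds_replace_volume_storage() above store the same sentinel when they evict or discard pages.)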
*/ + for (i = 0; i < cache->indexable_pages; i++) + cache->index[i] = cache->cache_slots; + + for (i = 0; i < cache->cache_slots; i++) + clear_cache_page(cache, &cache->cache[i]); + + return UDS_SUCCESS; +} + +int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout, + struct volume **new_volume) +{ + unsigned int i; + struct volume *volume = NULL; + struct index_geometry *geometry; + unsigned int reserved_buffers; + int result; + + result = uds_allocate(1, struct volume, "volume", &volume); + if (result != UDS_SUCCESS) + return result; + + volume->nonce = uds_get_volume_nonce(layout); + + result = uds_copy_index_geometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return uds_log_warning_strerror(result, + "failed to allocate geometry: error"); + } + geometry = volume->geometry; + + /* + * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one + * for each entry in the sparse cache. + */ + reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter; + reserved_buffers += 1; + if (uds_is_sparse_index_geometry(geometry)) + reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter); + volume->reserved_buffers = reserved_buffers; + result = uds_open_volume_bufio(layout, geometry->bytes_per_page, + volume->reserved_buffers, &volume->client); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_make_radix_sorter(geometry->records_per_page, + &volume->radix_sorter); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_allocate(geometry->records_per_page, + const struct uds_volume_record *, "record pointers", + &volume->record_pointers); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + if (uds_is_sparse_index_geometry(geometry)) { + size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page; + + result = uds_make_sparse_cache(geometry, config->cache_chapters, + config->zone_count, + &volume->sparse_cache); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size = + page_size * geometry->index_pages_per_chapter * config->cache_chapters; + } + + result = initialize_page_cache(&volume->page_cache, geometry, + config->cache_chapters, config->zone_count); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page); + result = uds_make_index_page_map(geometry, &volume->index_page_map); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + mutex_init(&volume->read_threads_mutex); + uds_init_cond(&volume->read_threads_read_done_cond); + uds_init_cond(&volume->read_threads_cond); + + result = uds_allocate(config->read_threads, struct thread *, "reader threads", + &volume->reader_threads); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + for (i = 0; i < config->read_threads; i++) { + result = vdo_create_thread(read_thread_function, (void *) volume, + "reader", &volume->reader_threads[i]); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->read_thread_count = i + 1; + } + + *new_volume = volume; + return UDS_SUCCESS; +} + +static void uninitialize_page_cache(struct page_cache *cache) +{ + u16 i; + + if (cache->cache != NULL) { + for (i = 0; i < 
cache->cache_slots; i++) + release_page_buffer(&cache->cache[i]); + } + uds_free(cache->index); + uds_free(cache->cache); + uds_free(cache->search_pending_counters); + uds_free(cache->read_queue); +} + +void uds_free_volume(struct volume *volume) +{ + if (volume == NULL) + return; + + if (volume->reader_threads != NULL) { + unsigned int i; + + /* This works even if some threads weren't started. */ + mutex_lock(&volume->read_threads_mutex); + volume->read_threads_exiting = true; + uds_broadcast_cond(&volume->read_threads_cond); + mutex_unlock(&volume->read_threads_mutex); + for (i = 0; i < volume->read_thread_count; i++) + vdo_join_threads(volume->reader_threads[i]); + uds_free(volume->reader_threads); + volume->reader_threads = NULL; + } + + /* Must destroy the client AFTER freeing the cached pages. */ + uninitialize_page_cache(&volume->page_cache); + uds_free_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(uds_forget(volume->client)); + + uds_free_index_page_map(volume->index_page_map); + uds_free_radix_sorter(volume->radix_sorter); + uds_free(volume->geometry); + uds_free(volume->record_pointers); + uds_free(volume); +} diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h new file mode 100644 index 0000000000000..8679a5e55347a --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_VOLUME_H +#define UDS_VOLUME_H + +#include +#include +#include +#include + +#include "permassert.h" +#include "thread-utils.h" + +#include "chapter-index.h" +#include "config.h" +#include "geometry.h" +#include "indexer.h" +#include "index-layout.h" +#include "index-page-map.h" +#include "radix-sort.h" +#include "sparse-cache.h" + +/* + * The volume manages deduplication records on permanent storage. The term "volume" can also refer + * to the region of permanent storage where the records (and the chapters containing them) are + * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages + * as necessary. 
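A minimal sketch of reading one chapter index page through the accessors declared below, mirroring the call shape probe_chapter() uses in volume.c; the helper itself is hypothetical:

static void example_dump_chapter_vcn(struct volume *volume, u32 chapter)
{
	struct delta_index_page *page;

	/* Fetch (and cache) index page 0 of the chapter. */
	if (uds_get_volume_index_page(volume, chapter, 0, &page) != UDS_SUCCESS)
		return;

	uds_log_debug("chapter %u holds vcn %llu", chapter,
		      (unsigned long long) page->virtual_chapter_number);
}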
+ */ + +enum index_lookup_mode { + /* Always do lookups in all chapters normally */ + LOOKUP_NORMAL, + /* Only do a subset of lookups needed when rebuilding an index */ + LOOKUP_FOR_REBUILD, +}; + +struct queued_read { + bool invalid; + bool reserved; + u32 physical_page; + struct uds_request *first_request; + struct uds_request *last_request; +}; + +struct __aligned(L1_CACHE_BYTES) search_pending_counter { + u64 atomic_value; +}; + +struct cached_page { + /* Whether this page is currently being read asynchronously */ + bool read_pending; + /* The physical page stored in this cache entry */ + u32 physical_page; + /* The value of the volume clock when this page was last used */ + s64 last_used; + /* The cached page buffer */ + struct dm_buffer *buffer; + /* The chapter index page, meaningless for record pages */ + struct delta_index_page index_page; +}; + +struct page_cache { + /* The number of zones */ + unsigned int zone_count; + /* The number of volume pages that can be cached */ + u32 indexable_pages; + /* The maximum number of simultaneously cached pages */ + u16 cache_slots; + /* An index for each physical page noting where it is in the cache */ + u16 *index; + /* The array of cached pages */ + struct cached_page *cache; + /* A counter for each zone tracking if a search is occurring there */ + struct search_pending_counter *search_pending_counters; + /* The read queue entries as a circular array */ + struct queued_read *read_queue; + + /* All entries above this point are constant after initialization. */ + + /* + * These values are all indexes into the array of read queue entries. New entries in the + * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the + * lock and then claims the entry pointed to by read_queue_next_read and increments that + * value. After the read is completed, the reader thread calls release_read_queue_entry(), + * which increments read_queue_first until it points to a pending read, or is equal to + * read_queue_next_read. This means that if multiple reads are outstanding, + * read_queue_first might not advance until the last of the reads finishes. 
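(Illustrative timeline: starting empty with all three indexes at 0, enqueuing three reads advances read_queue_last to 3; two reader threads then claim entries 0 and 1, advancing read_queue_next_read to 2. If entry 1 finishes first, read_queue_first must stay at 0 because entry 0 is still outstanding; once entry 0 completes and is released, read_queue_first skips past both finished entries to 2.)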
+ */ + u16 read_queue_first; + u16 read_queue_next_read; + u16 read_queue_last; + + atomic64_t clock; +}; + +struct volume { + struct index_geometry *geometry; + struct dm_bufio_client *client; + u64 nonce; + size_t cache_size; + + /* A single page worth of records, for sorting */ + const struct uds_volume_record **record_pointers; + /* Sorter for sorting records within each page */ + struct radix_sorter *radix_sorter; + + struct sparse_cache *sparse_cache; + struct page_cache page_cache; + struct index_page_map *index_page_map; + + struct mutex read_threads_mutex; + struct cond_var read_threads_cond; + struct cond_var read_threads_read_done_cond; + struct thread **reader_threads; + unsigned int read_thread_count; + bool read_threads_exiting; + + enum index_lookup_mode lookup_mode; + unsigned int reserved_buffers; +}; + +int __must_check uds_make_volume(const struct uds_configuration *config, + struct index_layout *layout, + struct volume **new_volume); + +void uds_free_volume(struct volume *volume); + +int __must_check uds_replace_volume_storage(struct volume *volume, + struct index_layout *layout, + struct block_device *bdev); + +int __must_check uds_find_volume_chapter_boundaries(struct volume *volume, + u64 *lowest_vcn, u64 *highest_vcn, + bool *is_empty); + +int __must_check uds_search_volume_page_cache(struct volume *volume, + struct uds_request *request, + bool *found); + +int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume, + const struct uds_record_name *name, + u64 virtual_chapter, + bool *found); + +int __must_check uds_search_cached_record_page(struct volume *volume, + struct uds_request *request, u32 chapter, + u16 record_page_number, bool *found); + +void uds_forget_chapter(struct volume *volume, u64 chapter); + +int __must_check uds_write_chapter(struct volume *volume, + struct open_chapter_index *chapter_index, + const struct uds_volume_record records[]); + +void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter); + +int __must_check uds_read_chapter_index_from_volume(const struct volume *volume, + u64 virtual_chapter, + struct dm_buffer *volume_buffers[], + struct delta_index_page index_pages[]); + +int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter, + u32 page_number, u8 **data_ptr); + +int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter, + u32 page_number, + struct delta_index_page **page_ptr); + +#endif /* UDS_VOLUME_H */ diff --git a/drivers/md/dm-vdo/io-factory.c b/drivers/md/dm-vdo/io-factory.c deleted file mode 100644 index 02242df94e373..0000000000000 --- a/drivers/md/dm-vdo/io-factory.c +++ /dev/null @@ -1,415 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "io-factory.h" - -#include -#include -#include -#include - -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" - -/* - * The I/O factory object manages access to index storage, which is a contiguous range of blocks on - * a block device. - * - * The factory holds the open device and is responsible for closing it. The factory has methods to - * make helper structures that can be used to access sections of the index. - */ -struct io_factory { - struct block_device *bdev; - atomic_t ref_count; -}; - -/* The buffered reader allows efficient I/O by reading page-sized segments into a buffer. 
*/ -struct buffered_reader { - struct io_factory *factory; - struct dm_bufio_client *client; - struct dm_buffer *buffer; - sector_t limit; - sector_t block_number; - u8 *start; - u8 *end; -}; - -enum { MAX_READ_AHEAD_BLOCKS = 4 }; - -/* - * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments - * to storage. - */ -struct buffered_writer { - struct io_factory *factory; - struct dm_bufio_client *client; - struct dm_buffer *buffer; - sector_t limit; - sector_t block_number; - u8 *start; - u8 *end; - int error; -}; - -static void uds_get_io_factory(struct io_factory *factory) -{ - atomic_inc(&factory->ref_count); -} - -int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr) -{ - int result; - struct io_factory *factory; - - result = uds_allocate(1, struct io_factory, __func__, &factory); - if (result != UDS_SUCCESS) - return result; - - factory->bdev = bdev; - atomic_set_release(&factory->ref_count, 1); - - *factory_ptr = factory; - return UDS_SUCCESS; -} - -int uds_replace_storage(struct io_factory *factory, struct block_device *bdev) -{ - factory->bdev = bdev; - return UDS_SUCCESS; -} - -/* Free an I/O factory once all references have been released. */ -void uds_put_io_factory(struct io_factory *factory) -{ - if (atomic_add_return(-1, &factory->ref_count) <= 0) - uds_free(factory); -} - -size_t uds_get_writable_size(struct io_factory *factory) -{ - return i_size_read(factory->bdev->bd_inode); -} - -/* Create a struct dm_bufio_client for an index region starting at offset. */ -int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size, - unsigned int reserved_buffers, struct dm_bufio_client **client_ptr) -{ - struct dm_bufio_client *client; - - client = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0, - NULL, NULL, 0); - if (IS_ERR(client)) - return -PTR_ERR(client); - - dm_bufio_set_sector_offset(client, block_offset * SECTORS_PER_BLOCK); - *client_ptr = client; - return UDS_SUCCESS; -} - -static void read_ahead(struct buffered_reader *reader, sector_t block_number) -{ - if (block_number < reader->limit) { - sector_t read_ahead = min((sector_t) MAX_READ_AHEAD_BLOCKS, - reader->limit - block_number); - - dm_bufio_prefetch(reader->client, block_number, read_ahead); - } -} - -void uds_free_buffered_reader(struct buffered_reader *reader) -{ - if (reader == NULL) - return; - - if (reader->buffer != NULL) - dm_bufio_release(reader->buffer); - - dm_bufio_client_destroy(reader->client); - uds_put_io_factory(reader->factory); - uds_free(reader); -} - -/* Create a buffered reader for an index region starting at offset. 
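A minimal sketch of the buffered reader API being removed from this file (and re-added under indexer/): the helper and its fixed sizes are hypothetical, but the calls are the ones defined here:

static int example_read_prefix(struct block_device *bdev, u8 buf[64])
{
	int result;
	struct io_factory *factory;
	struct buffered_reader *reader;

	result = uds_make_io_factory(bdev, &factory);
	if (result != UDS_SUCCESS)
		return result;

	/* A reader over a one-block region starting at block 0. */
	result = uds_make_buffered_reader(factory, 0, 1, &reader);
	if (result == UDS_SUCCESS) {
		result = uds_read_from_buffered_reader(reader, buf, 64);
		/* Releases the reader's reference on the factory. */
		uds_free_buffered_reader(reader);
	}

	uds_put_io_factory(factory);
	return result;
}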
*/ -int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count, - struct buffered_reader **reader_ptr) -{ - int result; - struct dm_bufio_client *client = NULL; - struct buffered_reader *reader = NULL; - - result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(1, struct buffered_reader, "buffered reader", &reader); - if (result != UDS_SUCCESS) { - dm_bufio_client_destroy(client); - return result; - } - - *reader = (struct buffered_reader) { - .factory = factory, - .client = client, - .buffer = NULL, - .limit = block_count, - .block_number = 0, - .start = NULL, - .end = NULL, - }; - - read_ahead(reader, 0); - uds_get_io_factory(factory); - *reader_ptr = reader; - return UDS_SUCCESS; -} - -static int position_reader(struct buffered_reader *reader, sector_t block_number, - off_t offset) -{ - struct dm_buffer *buffer = NULL; - void *data; - - if ((reader->end == NULL) || (block_number != reader->block_number)) { - if (block_number >= reader->limit) - return UDS_OUT_OF_RANGE; - - if (reader->buffer != NULL) - dm_bufio_release(uds_forget(reader->buffer)); - - data = dm_bufio_read(reader->client, block_number, &buffer); - if (IS_ERR(data)) - return -PTR_ERR(data); - - reader->buffer = buffer; - reader->start = data; - if (block_number == reader->block_number + 1) - read_ahead(reader, block_number + 1); - } - - reader->block_number = block_number; - reader->end = reader->start + offset; - return UDS_SUCCESS; -} - -static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader) -{ - return (reader->end == NULL) ? 0 : reader->start + UDS_BLOCK_SIZE - reader->end; -} - -static int reset_reader(struct buffered_reader *reader) -{ - sector_t block_number; - - if (bytes_remaining_in_read_buffer(reader) > 0) - return UDS_SUCCESS; - - block_number = reader->block_number; - if (reader->end != NULL) - block_number++; - - return position_reader(reader, block_number, 0); -} - -int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, - size_t length) -{ - int result = UDS_SUCCESS; - size_t chunk_size; - - while (length > 0) { - result = reset_reader(reader); - if (result != UDS_SUCCESS) - return result; - - chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); - memcpy(data, reader->end, chunk_size); - length -= chunk_size; - data += chunk_size; - reader->end += chunk_size; - } - - return UDS_SUCCESS; -} - -/* - * Verify that the next data on the reader matches the required value. If the value matches, the - * matching contents are consumed. If the value does not match, the reader state is unchanged. - */ -int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, - size_t length) -{ - int result = UDS_SUCCESS; - size_t chunk_size; - sector_t start_block_number = reader->block_number; - int start_offset = reader->end - reader->start; - - while (length > 0) { - result = reset_reader(reader); - if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_DATA; - break; - } - - chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); - if (memcmp(value, reader->end, chunk_size) != 0) { - result = UDS_CORRUPT_DATA; - break; - } - - length -= chunk_size; - value += chunk_size; - reader->end += chunk_size; - } - - if (result != UDS_SUCCESS) - position_reader(reader, start_block_number, start_offset); - - return result; -} - -/* Create a buffered writer for an index region starting at offset. 
*/ -int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count, - struct buffered_writer **writer_ptr) -{ - int result; - struct dm_bufio_client *client = NULL; - struct buffered_writer *writer; - - result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(1, struct buffered_writer, "buffered writer", &writer); - if (result != UDS_SUCCESS) { - dm_bufio_client_destroy(client); - return result; - } - - *writer = (struct buffered_writer) { - .factory = factory, - .client = client, - .buffer = NULL, - .limit = block_count, - .start = NULL, - .end = NULL, - .block_number = 0, - .error = UDS_SUCCESS, - }; - - uds_get_io_factory(factory); - *writer_ptr = writer; - return UDS_SUCCESS; -} - -static size_t get_remaining_write_space(struct buffered_writer *writer) -{ - return writer->start + UDS_BLOCK_SIZE - writer->end; -} - -static int __must_check prepare_next_buffer(struct buffered_writer *writer) -{ - struct dm_buffer *buffer = NULL; - void *data; - - if (writer->block_number >= writer->limit) { - writer->error = UDS_OUT_OF_RANGE; - return UDS_OUT_OF_RANGE; - } - - data = dm_bufio_new(writer->client, writer->block_number, &buffer); - if (IS_ERR(data)) { - writer->error = -PTR_ERR(data); - return writer->error; - } - - writer->buffer = buffer; - writer->start = data; - writer->end = data; - return UDS_SUCCESS; -} - -static int flush_previous_buffer(struct buffered_writer *writer) -{ - size_t available; - - if (writer->buffer == NULL) - return writer->error; - - if (writer->error == UDS_SUCCESS) { - available = get_remaining_write_space(writer); - - if (available > 0) - memset(writer->end, 0, available); - - dm_bufio_mark_buffer_dirty(writer->buffer); - } - - dm_bufio_release(writer->buffer); - writer->buffer = NULL; - writer->start = NULL; - writer->end = NULL; - writer->block_number++; - return writer->error; -} - -void uds_free_buffered_writer(struct buffered_writer *writer) -{ - int result; - - if (writer == NULL) - return; - - flush_previous_buffer(writer); - result = -dm_bufio_write_dirty_buffers(writer->client); - if (result != UDS_SUCCESS) - uds_log_warning_strerror(result, "%s: failed to sync storage", __func__); - - dm_bufio_client_destroy(writer->client); - uds_put_io_factory(writer->factory); - uds_free(writer); -} - -/* - * Append data to the buffer, writing as needed. If no data is provided, zeros are written instead. - * If a write error occurs, it is recorded and returned on every subsequent write attempt. 
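A minimal sketch of the matching writer-side usage, again hypothetical except for the API itself (and assuming header_size <= UDS_BLOCK_SIZE); note that passing NULL data writes zeros, which is how a partial block can be padded before flushing:

static int example_write_block(struct io_factory *factory, const u8 *header,
			       size_t header_size)
{
	int result;
	struct buffered_writer *writer;

	result = uds_make_buffered_writer(factory, 0, 1, &writer);
	if (result != UDS_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(writer, header, header_size);
	if (result == UDS_SUCCESS)
		result = uds_write_to_buffered_writer(writer, NULL,
						      UDS_BLOCK_SIZE - header_size);

	if (result == UDS_SUCCESS)
		result = uds_flush_buffered_writer(writer);

	/* Also syncs any remaining dirty buffers before destroying the client. */
	uds_free_buffered_writer(writer);
	return result;
}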
- */ -int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data, - size_t length) -{ - int result = writer->error; - size_t chunk_size; - - while ((length > 0) && (result == UDS_SUCCESS)) { - if (writer->buffer == NULL) { - result = prepare_next_buffer(writer); - continue; - } - - chunk_size = min(length, get_remaining_write_space(writer)); - if (data == NULL) { - memset(writer->end, 0, chunk_size); - } else { - memcpy(writer->end, data, chunk_size); - data += chunk_size; - } - - length -= chunk_size; - writer->end += chunk_size; - - if (get_remaining_write_space(writer) == 0) - result = uds_flush_buffered_writer(writer); - } - - return result; -} - -int uds_flush_buffered_writer(struct buffered_writer *writer) -{ - if (writer->error != UDS_SUCCESS) - return writer->error; - - return flush_previous_buffer(writer); -} diff --git a/drivers/md/dm-vdo/io-factory.h b/drivers/md/dm-vdo/io-factory.h deleted file mode 100644 index 7fb5a0616a791..0000000000000 --- a/drivers/md/dm-vdo/io-factory.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_IO_FACTORY_H -#define UDS_IO_FACTORY_H - -#include - -/* - * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main - * clients are the index layout and the volume. The buffered reader and buffered writer interfaces - * are helpers for accessing data in a contiguous range of storage blocks. - */ - -struct buffered_reader; -struct buffered_writer; - -struct io_factory; - -enum { - UDS_BLOCK_SIZE = 4096, - SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT, -}; - -int __must_check uds_make_io_factory(struct block_device *bdev, - struct io_factory **factory_ptr); - -int __must_check uds_replace_storage(struct io_factory *factory, - struct block_device *bdev); - -void uds_put_io_factory(struct io_factory *factory); - -size_t __must_check uds_get_writable_size(struct io_factory *factory); - -int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset, - size_t block_size, unsigned int reserved_buffers, - struct dm_bufio_client **client_ptr); - -int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset, - u64 block_count, - struct buffered_reader **reader_ptr); - -void uds_free_buffered_reader(struct buffered_reader *reader); - -int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, - size_t length); - -int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, - size_t length); - -int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset, - u64 block_count, - struct buffered_writer **writer_ptr); - -void uds_free_buffered_writer(struct buffered_writer *buffer); - -int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer, - const u8 *data, size_t length); - -int __must_check uds_flush_buffered_writer(struct buffered_writer *writer); - -#endif /* UDS_IO_FACTORY_H */ diff --git a/drivers/md/dm-vdo/open-chapter.c b/drivers/md/dm-vdo/open-chapter.c deleted file mode 100644 index d9d6e5d45bfbd..0000000000000 --- a/drivers/md/dm-vdo/open-chapter.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "open-chapter.h" - -#include - -#include "config.h" -#include "hash-utils.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" - -/* - * Each index zone has a dedicated 
open chapter zone structure which gets an equal share of the - * open chapter space. Records are assigned to zones based on their record name. Within each zone, - * records are stored in an array in the order they arrive. Additionally, a reference to each - * record is stored in a hash table to help determine if a new record duplicates an existing one. - * If new metadata for an existing name arrives, the record is altered in place. The array of - * records is 1-based so that record number 0 can be used to indicate an unused hash slot. - * - * Deleted records are marked with a flag rather than actually removed to simplify hash table - * management. The array of deleted flags overlays the array of hash slots, but the flags are - * indexed by record number instead of by record name. The number of hash slots will always be a - * power of two that is greater than the number of records to be indexed, guaranteeing that hash - * insertion cannot fail, and that there are sufficient flags for all records. - * - * Once any open chapter zone fills its available space, the chapter is closed. The records from - * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages. - * Empty or deleted records are replaced by copies of a valid record so that the record pages only - * contain valid records. The chapter then constructs a delta index which maps each record name to - * the record page on which that record can be found, which is split into index pages. These - * structures are then passed to the volume to be recorded on storage. - * - * When the index is saved, the open chapter records are saved in a single array, once again - * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a - * different number of zones than previously, so the records must be parcelled out to their new - * zones. In addition, depending on the distribution of record names, a new zone may have more - * records than it has space. In this case, the latest records for that zone will be discarded. 
- */ - -static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC"; -static const u8 OPEN_CHAPTER_VERSION[] = "02.00"; - -enum { - OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, - OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1, - LOAD_RATIO = 2, -}; - -static inline size_t records_size(const struct open_chapter_zone *open_chapter) -{ - return sizeof(struct uds_volume_record) * (1 + open_chapter->capacity); -} - -static inline size_t slots_size(size_t slot_count) -{ - return sizeof(struct open_chapter_zone_slot) * slot_count; -} - -int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count, - struct open_chapter_zone **open_chapter_ptr) -{ - int result; - struct open_chapter_zone *open_chapter; - size_t capacity = geometry->records_per_chapter / zone_count; - size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); - - result = uds_allocate_extended(struct open_chapter_zone, slot_count, - struct open_chapter_zone_slot, "open chapter", - &open_chapter); - if (result != UDS_SUCCESS) - return result; - - open_chapter->slot_count = slot_count; - open_chapter->capacity = capacity; - result = uds_allocate_cache_aligned(records_size(open_chapter), "record pages", - &open_chapter->records); - if (result != UDS_SUCCESS) { - uds_free_open_chapter(open_chapter); - return result; - } - - *open_chapter_ptr = open_chapter; - return UDS_SUCCESS; -} - -void uds_reset_open_chapter(struct open_chapter_zone *open_chapter) -{ - open_chapter->size = 0; - open_chapter->deletions = 0; - - memset(open_chapter->records, 0, records_size(open_chapter)); - memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count)); -} - -static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name) -{ - struct uds_volume_record *record; - unsigned int slot_count = open_chapter->slot_count; - unsigned int slot = uds_name_to_hash_slot(name, slot_count); - unsigned int record_number; - unsigned int attempts = 1; - - while (true) { - record_number = open_chapter->slots[slot].record_number; - - /* - * If the hash slot is empty, we've reached the end of a chain without finding the - * record and should terminate the search. - */ - if (record_number == 0) - return slot; - - /* - * If the name of the record referenced by the slot matches and has not been - * deleted, then we've found the requested name. - */ - record = &open_chapter->records[record_number]; - if ((memcmp(&record->name, name, UDS_RECORD_NAME_SIZE) == 0) && - !open_chapter->slots[record_number].deleted) - return slot; - - /* - * Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. This - * performs better than linear probing and works best for 2^N slots. - */ - slot = (slot + attempts++) % slot_count; - } -} - -void uds_search_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name, - struct uds_record_data *metadata, bool *found) -{ - unsigned int slot; - unsigned int record_number; - - slot = probe_chapter_slots(open_chapter, name); - record_number = open_chapter->slots[slot].record_number; - if (record_number == 0) { - *found = false; - } else { - *found = true; - *metadata = open_chapter->records[record_number].data; - } -} - -/* Add a record to the open chapter zone and return the remaining space. 
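A minimal sketch of how the search and put operations combine; the insert-if-absent policy and the helper name are hypothetical (the real callers, such as load_version20() below, apply their own policies):

static int example_add_if_absent(struct open_chapter_zone *zone,
				 const struct uds_record_name *name,
				 const struct uds_record_data *data)
{
	bool found;
	struct uds_record_data existing;

	uds_search_open_chapter(zone, name, &existing, &found);
	if (found)
		return 0;

	/*
	 * Returns the space left after the put; the callers in this file
	 * treat a small remainder as a signal that the zone is full.
	 */
	return uds_put_open_chapter(zone, name, data);
}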
*/ -int uds_put_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name, - const struct uds_record_data *metadata) -{ - unsigned int slot; - unsigned int record_number; - struct uds_volume_record *record; - - if (open_chapter->size >= open_chapter->capacity) - return 0; - - slot = probe_chapter_slots(open_chapter, name); - record_number = open_chapter->slots[slot].record_number; - - if (record_number == 0) { - record_number = ++open_chapter->size; - open_chapter->slots[slot].record_number = record_number; - } - - record = &open_chapter->records[record_number]; - record->name = *name; - record->data = *metadata; - - return open_chapter->capacity - open_chapter->size; -} - -void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name) -{ - unsigned int slot; - unsigned int record_number; - - slot = probe_chapter_slots(open_chapter, name); - record_number = open_chapter->slots[slot].record_number; - - if (record_number > 0) { - open_chapter->slots[record_number].deleted = true; - open_chapter->deletions += 1; - } -} - -void uds_free_open_chapter(struct open_chapter_zone *open_chapter) -{ - if (open_chapter != NULL) { - uds_free(open_chapter->records); - uds_free(open_chapter); - } -} - -/* Map each record name to its record page number in the delta chapter index. */ -static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, - struct open_chapter_index *index, - struct uds_volume_record *collated_records) -{ - int result; - unsigned int records_per_chapter; - unsigned int records_per_page; - unsigned int record_index; - unsigned int records = 0; - u32 page_number; - unsigned int z; - int overflow_count = 0; - struct uds_volume_record *fill_record = NULL; - - /* - * The record pages should not have any empty space, so find a record with which to fill - * the chapter zone if it was closed early, and also to replace any deleted records. The - * last record in any filled zone is guaranteed to not have been deleted, so use one of - * those. - */ - for (z = 0; z < zone_count; z++) { - struct open_chapter_zone *zone = chapter_zones[z]; - - if (zone->size == zone->capacity) { - fill_record = &zone->records[zone->size]; - break; - } - } - - records_per_chapter = index->geometry->records_per_chapter; - records_per_page = index->geometry->records_per_page; - - for (records = 0; records < records_per_chapter; records++) { - struct uds_volume_record *record = &collated_records[records]; - struct open_chapter_zone *open_chapter; - - /* The record arrays in the zones are 1-based. */ - record_index = 1 + (records / zone_count); - page_number = records / records_per_page; - open_chapter = chapter_zones[records % zone_count]; - - /* Use the fill record in place of an unused record. 
*/ - if (record_index > open_chapter->size || - open_chapter->slots[record_index].deleted) { - *record = *fill_record; - continue; - } - - *record = open_chapter->records[record_index]; - result = uds_put_open_chapter_index_record(index, &record->name, - page_number); - switch (result) { - case UDS_SUCCESS: - break; - case UDS_OVERFLOW: - overflow_count++; - break; - default: - uds_log_error_strerror(result, - "failed to build open chapter index"); - return result; - } - } - - if (overflow_count > 0) - uds_log_warning("Failed to add %d entries to chapter index", - overflow_count); - - return UDS_SUCCESS; -} - -int uds_close_open_chapter(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, struct volume *volume, - struct open_chapter_index *chapter_index, - struct uds_volume_record *collated_records, - u64 virtual_chapter_number) -{ - int result; - - uds_empty_open_chapter_index(chapter_index, virtual_chapter_number); - result = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index, - collated_records); - if (result != UDS_SUCCESS) - return result; - - return uds_write_chapter(volume, chapter_index, collated_records); -} - -int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer) -{ - int result; - struct open_chapter_zone *open_chapter; - struct uds_volume_record *record; - u8 record_count_data[sizeof(u32)]; - u32 record_count = 0; - unsigned int record_index; - unsigned int z; - - result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC, - OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) - return result; - - result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION, - OPEN_CHAPTER_VERSION_LENGTH); - if (result != UDS_SUCCESS) - return result; - - for (z = 0; z < index->zone_count; z++) { - open_chapter = index->zones[z]->open_chapter; - record_count += open_chapter->size - open_chapter->deletions; - } - - put_unaligned_le32(record_count, record_count_data); - result = uds_write_to_buffered_writer(writer, record_count_data, - sizeof(record_count_data)); - if (result != UDS_SUCCESS) - return result; - - record_index = 1; - while (record_count > 0) { - for (z = 0; z < index->zone_count; z++) { - open_chapter = index->zones[z]->open_chapter; - if (record_index > open_chapter->size) - continue; - - if (open_chapter->slots[record_index].deleted) - continue; - - record = &open_chapter->records[record_index]; - result = uds_write_to_buffered_writer(writer, (u8 *) record, - sizeof(*record)); - if (result != UDS_SUCCESS) - return result; - - record_count--; - } - - record_index++; - } - - return uds_flush_buffered_writer(writer); -} - -u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry) -{ - unsigned int records_per_chapter = geometry->records_per_chapter; - - return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + sizeof(u32) + - records_per_chapter * sizeof(struct uds_volume_record); -} - -static int load_version20(struct uds_index *index, struct buffered_reader *reader) -{ - int result; - u32 record_count; - u8 record_count_data[sizeof(u32)]; - struct uds_volume_record record; - - /* - * Track which zones cannot accept any more records. If the open chapter had a different - * number of zones previously, some new zones may have more records than they have space - * for. These overflow records will be discarded. 
- */ - bool full_flags[MAX_ZONES] = { - false, - }; - - result = uds_read_from_buffered_reader(reader, (u8 *) &record_count_data, - sizeof(record_count_data)); - if (result != UDS_SUCCESS) - return result; - - record_count = get_unaligned_le32(record_count_data); - while (record_count-- > 0) { - unsigned int zone = 0; - - result = uds_read_from_buffered_reader(reader, (u8 *) &record, - sizeof(record)); - if (result != UDS_SUCCESS) - return result; - - if (index->zone_count > 1) - zone = uds_get_volume_index_zone(index->volume_index, - &record.name); - - if (!full_flags[zone]) { - struct open_chapter_zone *open_chapter; - unsigned int remaining; - - open_chapter = index->zones[zone]->open_chapter; - remaining = uds_put_open_chapter(open_chapter, &record.name, - &record.data); - /* Do not allow any zone to fill completely. */ - full_flags[zone] = (remaining <= 1); - } - } - - return UDS_SUCCESS; -} - -int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader) -{ - u8 version[OPEN_CHAPTER_VERSION_LENGTH]; - int result; - - result = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC, - OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) - return result; - - result = uds_read_from_buffered_reader(reader, version, sizeof(version)); - if (result != UDS_SUCCESS) - return result; - - if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "Invalid open chapter version: %.*s", - (int) sizeof(version), version); - } - - return load_version20(index, reader); -} diff --git a/drivers/md/dm-vdo/open-chapter.h b/drivers/md/dm-vdo/open-chapter.h deleted file mode 100644 index a4250bb19525e..0000000000000 --- a/drivers/md/dm-vdo/open-chapter.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_OPEN_CHAPTER_H -#define UDS_OPEN_CHAPTER_H - -#include "chapter-index.h" -#include "geometry.h" -#include "index.h" -#include "volume.h" - -/* - * The open chapter tracks the newest records in memory. Like the index as a whole, each open - * chapter is divided into a number of independent zones which are interleaved when the chapter is - * committed to the volume. 
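(Illustrative example of the interleaving: with two zones, the records are saved in the order zone 0 record 1, zone 1 record 1, zone 0 record 2, zone 1 record 2, and so on, matching the record_index loop in uds_save_open_chapter() above; deleted slots are simply skipped, which is why the surviving records are counted first.)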
- */ - -enum { - OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, -}; - -struct open_chapter_zone_slot { - /* If non-zero, the record number addressed by this hash slot */ - unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS; - /* If true, the record at the index of this hash slot was deleted */ - bool deleted : 1; -} __packed; - -struct open_chapter_zone { - /* The maximum number of records that can be stored */ - unsigned int capacity; - /* The number of records stored */ - unsigned int size; - /* The number of deleted records */ - unsigned int deletions; - /* Array of chunk records, 1-based */ - struct uds_volume_record *records; - /* The number of slots in the hash table */ - unsigned int slot_count; - /* The hash table slots, referencing virtual record numbers */ - struct open_chapter_zone_slot slots[]; -}; - -int __must_check uds_make_open_chapter(const struct index_geometry *geometry, - unsigned int zone_count, - struct open_chapter_zone **open_chapter_ptr); - -void uds_reset_open_chapter(struct open_chapter_zone *open_chapter); - -void uds_search_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name, - struct uds_record_data *metadata, bool *found); - -int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name, - const struct uds_record_data *metadata); - -void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_record_name *name); - -void uds_free_open_chapter(struct open_chapter_zone *open_chapter); - -int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, struct volume *volume, - struct open_chapter_index *chapter_index, - struct uds_volume_record *collated_records, - u64 virtual_chapter_number); - -int __must_check uds_save_open_chapter(struct uds_index *index, - struct buffered_writer *writer); - -int __must_check uds_load_open_chapter(struct uds_index *index, - struct buffered_reader *reader); - -u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry); - -#endif /* UDS_OPEN_CHAPTER_H */ diff --git a/drivers/md/dm-vdo/radix-sort.c b/drivers/md/dm-vdo/radix-sort.c deleted file mode 100644 index 1f17c708a6526..0000000000000 --- a/drivers/md/dm-vdo/radix-sort.c +++ /dev/null @@ -1,332 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "radix-sort.h" - -#include -#include - -#include "memory-alloc.h" -#include "string-utils.h" - -/* - * This implementation allocates one large object to do the sorting, which can be reused as many - * times as desired. The amount of memory required is logarithmically proportional to the number of - * keys to be sorted. - */ - -enum { - /* Piles smaller than this are handled with a simple insertion sort. */ - INSERTION_SORT_THRESHOLD = 12, -}; - -/* Sort keys are pointers to immutable fixed-length arrays of bytes. */ -typedef const u8 *sort_key_t; - -/* - * The keys are separated into piles based on the byte in each key at the current offset, so the - * number of keys with each byte must be counted.
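(Worked example, for illustration only: given the three keys "bab", "abc", and "aaa" at offset 0, the tally is size['a'] == 2 and size['b'] == 1, so used == 2, first == 'a', and last == 'b'. push_bins() below turns those counts into the pile boundaries pile['a'] and pile['b'].)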
- */ -struct histogram { - /* The number of non-empty bins */ - u16 used; - /* The index (key byte) of the first non-empty bin */ - u16 first; - /* The index (key byte) of the last non-empty bin */ - u16 last; - /* The number of occurrences of each specific byte */ - u32 size[256]; -}; - -/* - * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound - * on the stack space needed. - */ -struct task { - /* Pointer to the first key to sort. */ - sort_key_t *first_key; - /* Pointer to the last key to sort. */ - sort_key_t *last_key; - /* The offset into the key at which to continue sorting. */ - u16 offset; - /* The number of bytes remaining in the sort keys. */ - u16 length; -}; - -struct radix_sorter { - unsigned int count; - struct histogram bins; - sort_key_t *pile[256]; - struct task *end_of_stack; - struct task insertion_list[256]; - struct task stack[]; -}; - -/* Compare a segment of two fixed-length keys starting at an offset. */ -static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length) -{ - return memcmp(&key1[offset], &key2[offset], length); -} - -/* Insert the next unsorted key into an array of sorted keys. */ -static inline void insert_key(const struct task task, sort_key_t *next) -{ - /* Pull the unsorted key out, freeing up the array slot. */ - sort_key_t unsorted = *next; - - /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */ - while ((--next >= task.first_key) && - (compare(unsorted, next[0], task.offset, task.length) < 0)) - next[1] = next[0]; - - /* Insert the key into the last slot that was cleared, sorting it. */ - next[1] = unsorted; -} - -/* - * Sort a range of key segments using an insertion sort. This simple sort is faster than the - * 256-way radix sort when the number of keys to sort is small. - */ -static inline void insertion_sort(const struct task task) -{ - sort_key_t *next; - - for (next = task.first_key + 1; next <= task.last_key; next++) - insert_key(task, next); -} - -/* Push a sorting task onto a task stack. */ -static inline void push_task(struct task **stack_pointer, sort_key_t *first_key, - u32 count, u16 offset, u16 length) -{ - struct task *task = (*stack_pointer)++; - - task->first_key = first_key; - task->last_key = &first_key[count - 1]; - task->offset = offset; - task->length = length; -} - -static inline void swap_keys(sort_key_t *a, sort_key_t *b) -{ - sort_key_t c = *a; - *a = *b; - *b = c; -} - -/* - * Count the number of times each byte value appears in the arrays of keys to sort at the current - * offset, keeping track of the number of non-empty bins, and the index of the first and last - * non-empty bin. - */ -static inline void measure_bins(const struct task task, struct histogram *bins) -{ - sort_key_t *key_ptr; - - /* - * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears - * it all out as it goes. Even though this structure is re-used, we don't need to pay to - * zero it before starting a new tally. - */ - bins->first = U8_MAX; - bins->last = 0; - - for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) { - /* Increment the count for the byte in the key at the current offset. */ - u8 bin = (*key_ptr)[task.offset]; - u32 size = ++bins->size[bin]; - - /* Track non-empty bins. */ - if (size == 1) { - bins->used += 1; - if (bin < bins->first) - bins->first = bin; - - if (bin > bins->last) - bins->last = bin; - } - } -} - -/* - * Convert the bin sizes to pointers to where each pile goes. 
- * - * pile[0] = first_key + bin->size[0], - * pile[1] = pile[0] + bin->size[1], etc. - * - * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the - * next radix position. A new task is put on the stack for each pile containing lots of keys, or a - * new task is put on the list for each pile containing few keys. - * - * @stack: pointer to the top of the stack - * @end_of_stack: the end of the stack - * @list: pointer to the head of the list - * @pile: array for pointers to the end of each pile - * @bins: the histogram of the sizes of each pile - * @first_key: the first key of the stack - * @offset: the next radix position to sort by - * @length: the number of bytes remaining in the sort keys - * - * Return: UDS_SUCCESS or an error code - */ -static inline int push_bins(struct task **stack, struct task *end_of_stack, - struct task **list, sort_key_t *pile[], - struct histogram *bins, sort_key_t *first_key, - u16 offset, u16 length) -{ - sort_key_t *pile_start = first_key; - int bin; - - for (bin = bins->first; ; bin++) { - u32 size = bins->size[bin]; - - /* Skip empty piles. */ - if (size == 0) - continue; - - /* There's no need to sort empty keys. */ - if (length > 0) { - if (size > INSERTION_SORT_THRESHOLD) { - if (*stack >= end_of_stack) - return UDS_BAD_STATE; - - push_task(stack, pile_start, size, offset, length); - } else if (size > 1) { - push_task(list, pile_start, size, offset, length); - } - } - - pile_start += size; - pile[bin] = pile_start; - if (--bins->used == 0) - break; - } - - return UDS_SUCCESS; -} - -int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) -{ - int result; - unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; - struct radix_sorter *radix_sorter; - - result = uds_allocate_extended(struct radix_sorter, stack_size, struct task, - __func__, &radix_sorter); - if (result != UDS_SUCCESS) - return result; - - radix_sorter->count = count; - radix_sorter->end_of_stack = radix_sorter->stack + stack_size; - *sorter = radix_sorter; - return UDS_SUCCESS; -} - -void uds_free_radix_sorter(struct radix_sorter *sorter) -{ - uds_free(sorter); -} - -/* - * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation - * is unstable, so the relative ordering of equal keys is not preserved. - */ -int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], - unsigned int count, unsigned short length) -{ - struct task start; - struct histogram *bins = &sorter->bins; - sort_key_t **pile = sorter->pile; - struct task *task_stack = sorter->stack; - - /* All zero-length keys are identical and therefore already sorted. */ - if ((count == 0) || (length == 0)) - return UDS_SUCCESS; - - /* The initial task is to sort the entire length of all the keys. */ - start = (struct task) { - .first_key = keys, - .last_key = &keys[count - 1], - .offset = 0, - .length = length, - }; - - if (count <= INSERTION_SORT_THRESHOLD) { - insertion_sort(start); - return UDS_SUCCESS; - } - - if (count > sorter->count) - return UDS_INVALID_ARGUMENT; - - /* - * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks - * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been - * processed, the stack will be empty and all the keys in the starting task will be fully - * sorted.
-	for (*task_stack = start; task_stack >= sorter->stack; task_stack--) {
-		const struct task task = *task_stack;
-		struct task *insertion_task_list;
-		int result;
-		sort_key_t *fence;
-		sort_key_t *end;
-
-		measure_bins(task, bins);
-
-		/*
-		 * Now that we know how large each bin is, generate pointers for each of the piles
-		 * and push a new task to sort each pile by the next radix byte.
-		 */
-		insertion_task_list = sorter->insertion_list;
-		result = push_bins(&task_stack, sorter->end_of_stack,
-				   &insertion_task_list, pile, bins, task.first_key,
-				   task.offset + 1, task.length - 1);
-		if (result != UDS_SUCCESS) {
-			memset(bins, 0, sizeof(*bins));
-			return result;
-		}
-
-		/* Now bins->used is zero again. */
-
-		/*
-		 * Don't bother processing the last pile: when piles 0..N-1 are all in place, then
-		 * pile N must also be in place.
-		 */
-		end = task.last_key - bins->size[bins->last];
-		bins->size[bins->last] = 0;
-
-		for (fence = task.first_key; fence <= end; ) {
-			u8 bin;
-			sort_key_t key = *fence;
-
-			/*
-			 * The radix byte of the key tells us which pile it belongs in. Swap it for
-			 * an unprocessed item just below that pile, and repeat.
-			 */
-			while (--pile[bin = key[task.offset]] > fence)
-				swap_keys(pile[bin], &key);
-
-			/*
-			 * The pile reached the fence. Put the key at the bottom of that pile,
-			 * completing it, and advance the fence to the next pile.
-			 */
-			*fence = key;
-			fence += bins->size[bin];
-			bins->size[bin] = 0;
-		}
-
-		/* Now bins->size[] is all zero again. */
-
-		/*
-		 * When the number of keys in a task gets small enough, it is faster to use an
-		 * insertion sort than to keep subdividing into tiny piles.
-		 */
-		while (--insertion_task_list >= sorter->insertion_list)
-			insertion_sort(*insertion_task_list);
-	}
-
-	return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/radix-sort.h b/drivers/md/dm-vdo/radix-sort.h
deleted file mode 100644
index 812949bc2cee9..0000000000000
--- a/drivers/md/dm-vdo/radix-sort.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_RADIX_SORT_H
-#define UDS_RADIX_SORT_H
-
-/*
- * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix
- * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith
- * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort".
- *
- * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
- */
-
-struct radix_sorter;
-
-int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter);
-
-void uds_free_radix_sorter(struct radix_sorter *sorter);
-
-int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
-				unsigned int count, unsigned short length);
-
-#endif /* UDS_RADIX_SORT_H */
diff --git a/drivers/md/dm-vdo/sparse-cache.c b/drivers/md/dm-vdo/sparse-cache.c
deleted file mode 100644
index b43a626a42dec..0000000000000
--- a/drivers/md/dm-vdo/sparse-cache.c
+++ /dev/null
@@ -1,625 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "sparse-cache.h"
-
-#include <linux/cache.h>
-#include <linux/delay.h>
-#include <linux/dm-bufio.h>
-
-#include "chapter-index.h"
-#include "config.h"
-#include "index.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "permassert.h"
-
-/*
- * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
- * specific virtual chapter is implemented as a linear search. The cache replacement policy is
- * least-recently-used (LRU).
- * Again, the small size of the cache allows the LRU order to be maintained by shifting entries
- * in an array list.
- *
- * Changing the contents of the cache requires the coordinated participation of all zone threads
- * via the careful use of barrier messages sent to all the index zones by the triage queue worker
- * thread. The critical invariant for coordination is that the cache membership must not change
- * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads
- * receive the same results for every virtual chapter number. To ensure that critical invariant,
- * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
- * chapter because it has had too many cache misses" are represented separately from the cache
- * membership information (the virtual chapter number).
- *
- * As a result of this invariant, we have the guarantee that every zone thread will call
- * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
- * and the serialization of the barrier requests from the triage queue ensures they will all
- * request the same chapter number. This means the only synchronization we need can be provided by
- * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
- * section where a single zone thread can drive the cache update while all the other zone threads
- * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
- * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
- * exclusive lock. No other threads may access or modify the cache entries.
- *
- * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
- * All fields that might be frequently updated by that thread are kept in separate cache-aligned
- * structures so they will not cause cache contention via "false sharing" with the fields that are
- * frequently accessed by all of the zone threads.
- *
- * The LRU order is managed independently by each zone thread, and each zone uses its own list for
- * searching and cache membership queries. The zone zero list is used to decide which chapter to
- * evict when the cache is updated, and its search list is copied to the other threads at that
- * time.
- *
- * The virtual chapter number field of the cache entry is the single field indicating whether a
- * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
- * undefined chapter number. When present in the virtual chapter number field of a
- * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
- * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
- * cache entry that is not marked as dead is fully defined and a member of the cache, and
- * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
- * in any of the cache entries.
- *
- * A chapter index that is a member of the cache may be excluded from searches between calls to
- * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
- * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
- * that chapter is no longer part of the volume, there's no point in continuing to search that
- * chapter index.
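A minimal illustration of the membership invariant described above (chapter number invented):

  /*
   * Between two coordinated updates, if
   *     uds_sparse_cache_contains(cache, 37, 0)
   * returns true, then uds_sparse_cache_contains(cache, 37, z) must
   * return true for every zone z, even after chapter 37 falls off
   * the volume or its skip_search flag is set; only the next
   * uds_update_sparse_cache() may change the membership answer.
   */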
- * Once invalidated, that virtual chapter will still be considered a member of the cache, but it
- * will no longer be searched for matching names.
- *
- * The second mechanism is a heuristic based on keeping track of the number of consecutive search
- * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
- * be set to true, causing the chapter to be skipped when searching the entire cache, but still
- * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
- * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
- * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
- * considered a member of the cache for uds_sparse_cache_contains().
- */
-
-enum {
-	SKIP_SEARCH_THRESHOLD = 20000,
-	ZONE_ZERO = 0,
-};
-
-/*
- * These counters are essentially fields of the struct cached_chapter_index, but are segregated
- * into this structure because they are frequently modified. They are grouped and aligned to keep
- * them on different cache lines from the chapter fields that are accessed far more often than they
- * are updated.
- */
-struct __aligned(L1_CACHE_BYTES) cached_index_counters {
-	u64 consecutive_misses;
-};
-
-struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
-	/*
-	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
-	 * entry is unused. This field must only be modified in the critical section in
-	 * uds_update_sparse_cache().
-	 */
-	u64 virtual_chapter;
-
-	u32 index_pages_count;
-
-	/*
-	 * These pointers are immutable during the life of the cache. The contents of the arrays
-	 * change when the cache entry is replaced.
-	 */
-	struct delta_index_page *index_pages;
-	struct dm_buffer **page_buffers;
-
-	/*
-	 * If set, skip the chapter when searching the entire cache. This flag is just a
-	 * performance optimization. This flag is mutable between cache updates, but it rarely
-	 * changes and is frequently accessed, so it groups with the immutable fields.
-	 */
-	bool skip_search;
-
-	/*
-	 * The cache-aligned counters change often and are placed at the end of the structure to
-	 * prevent false sharing with the more stable fields above.
-	 */
-	struct cached_index_counters counters;
-};
-
-/*
- * A search_list represents an ordering of the sparse chapter index cache entry array, from most
- * recently accessed to least recently accessed, which is the order in which the indexes should be
- * searched and the reverse order in which they should be evicted from the cache.
- *
- * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
- * iterate over them to search, and ensuring that dead entries are replaced before any live entries
- * are evicted.
- *
- * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
- * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
- * zone threads.
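For example (entry contents invented), one zone's five-entry search list might look like this:

  /*
   * entries[]:        [ C42, C17, C9, dead, dead ]
   * first_dead_entry:   3
   *
   * Searches scan only entries[0..2], most recently used first, and
   * the next cache update will reuse a dead slot at the tail before
   * evicting the least recently used live entry (C9).
   */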
- */ -struct search_list { - u8 capacity; - u8 first_dead_entry; - struct cached_chapter_index *entries[]; -}; - -struct threads_barrier { - /* Lock for this barrier object */ - struct semaphore lock; - /* Semaphore for threads waiting at this barrier */ - struct semaphore wait; - /* Number of threads which have arrived */ - int arrived; - /* Total number of threads using this barrier */ - int thread_count; -}; - -struct sparse_cache { - const struct index_geometry *geometry; - unsigned int capacity; - unsigned int zone_count; - - unsigned int skip_threshold; - struct search_list *search_lists[MAX_ZONES]; - struct cached_chapter_index **scratch_entries; - - struct threads_barrier begin_update_barrier; - struct threads_barrier end_update_barrier; - - struct cached_chapter_index chapters[]; -}; - -static void initialize_threads_barrier(struct threads_barrier *barrier, - unsigned int thread_count) -{ - sema_init(&barrier->lock, 1); - barrier->arrived = 0; - barrier->thread_count = thread_count; - sema_init(&barrier->wait, 0); -} - -static inline void __down(struct semaphore *semaphore) -{ - /* - * Do not use down(semaphore). Instead use down_interruptible so that - * we do not get 120 second stall messages in kern.log. - */ - while (down_interruptible(semaphore) != 0) { - /* - * If we're called from a user-mode process (e.g., "dmsetup - * remove") while waiting for an operation that may take a - * while (e.g., UDS index save), and a signal is sent (SIGINT, - * SIGUSR2), then down_interruptible will not block. If that - * happens, sleep briefly to avoid keeping the CPU locked up in - * this loop. We could just call cond_resched, but then we'd - * still keep consuming CPU time slices and swamp other threads - * trying to do computational work. - */ - fsleep(1000); - } -} - -static void enter_threads_barrier(struct threads_barrier *barrier) -{ - __down(&barrier->lock); - if (++barrier->arrived == barrier->thread_count) { - /* last thread */ - int i; - - for (i = 1; i < barrier->thread_count; i++) - up(&barrier->wait); - - barrier->arrived = 0; - up(&barrier->lock); - } else { - up(&barrier->lock); - __down(&barrier->wait); - } -} - -static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter, - const struct index_geometry *geometry) -{ - int result; - - chapter->virtual_chapter = NO_CHAPTER; - chapter->index_pages_count = geometry->index_pages_per_chapter; - - result = uds_allocate(chapter->index_pages_count, struct delta_index_page, - __func__, &chapter->index_pages); - if (result != UDS_SUCCESS) - return result; - - return uds_allocate(chapter->index_pages_count, struct dm_buffer *, - "sparse index volume pages", &chapter->page_buffers); -} - -static int __must_check make_search_list(struct sparse_cache *cache, - struct search_list **list_ptr) -{ - struct search_list *list; - unsigned int bytes; - u8 i; - int result; - - bytes = (sizeof(struct search_list) + - (cache->capacity * sizeof(struct cached_chapter_index *))); - result = uds_allocate_cache_aligned(bytes, "search list", &list); - if (result != UDS_SUCCESS) - return result; - - list->capacity = cache->capacity; - list->first_dead_entry = 0; - - for (i = 0; i < list->capacity; i++) - list->entries[i] = &cache->chapters[i]; - - *list_ptr = list; - return UDS_SUCCESS; -} - -int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity, - unsigned int zone_count, struct sparse_cache **cache_ptr) -{ - int result; - unsigned int i; - struct sparse_cache *cache; - unsigned int bytes; - 
- bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index))); - result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache); - if (result != UDS_SUCCESS) - return result; - - cache->geometry = geometry; - cache->capacity = capacity; - cache->zone_count = zone_count; - - /* - * Scale down the skip threshold since the cache only counts cache misses in zone zero, but - * requests are being handled in all zones. - */ - cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count); - - initialize_threads_barrier(&cache->begin_update_barrier, zone_count); - initialize_threads_barrier(&cache->end_update_barrier, zone_count); - - for (i = 0; i < capacity; i++) { - result = initialize_cached_chapter_index(&cache->chapters[i], geometry); - if (result != UDS_SUCCESS) - goto out; - } - - for (i = 0; i < zone_count; i++) { - result = make_search_list(cache, &cache->search_lists[i]); - if (result != UDS_SUCCESS) - goto out; - } - - /* purge_search_list() needs some temporary lists for sorting. */ - result = uds_allocate(capacity * 2, struct cached_chapter_index *, - "scratch entries", &cache->scratch_entries); - if (result != UDS_SUCCESS) - goto out; - - *cache_ptr = cache; - return UDS_SUCCESS; -out: - uds_free_sparse_cache(cache); - return result; -} - -static inline void set_skip_search(struct cached_chapter_index *chapter, - bool skip_search) -{ - /* Check before setting to reduce cache line contention. */ - if (READ_ONCE(chapter->skip_search) != skip_search) - WRITE_ONCE(chapter->skip_search, skip_search); -} - -static void score_search_hit(struct cached_chapter_index *chapter) -{ - chapter->counters.consecutive_misses = 0; - set_skip_search(chapter, false); -} - -static void score_search_miss(struct sparse_cache *cache, - struct cached_chapter_index *chapter) -{ - chapter->counters.consecutive_misses++; - if (chapter->counters.consecutive_misses > cache->skip_threshold) - set_skip_search(chapter, true); -} - -static void release_cached_chapter_index(struct cached_chapter_index *chapter) -{ - unsigned int i; - - chapter->virtual_chapter = NO_CHAPTER; - if (chapter->page_buffers == NULL) - return; - - for (i = 0; i < chapter->index_pages_count; i++) { - if (chapter->page_buffers[i] != NULL) - dm_bufio_release(uds_forget(chapter->page_buffers[i])); - } -} - -void uds_free_sparse_cache(struct sparse_cache *cache) -{ - unsigned int i; - - if (cache == NULL) - return; - - uds_free(cache->scratch_entries); - - for (i = 0; i < cache->zone_count; i++) - uds_free(cache->search_lists[i]); - - for (i = 0; i < cache->capacity; i++) { - release_cached_chapter_index(&cache->chapters[i]); - uds_free(cache->chapters[i].index_pages); - uds_free(cache->chapters[i].page_buffers); - } - - uds_free(cache); -} - -/* - * Take the indicated element of the search list and move it to the start, pushing the pointers - * previously before it back down the list. - */ -static inline void set_newest_entry(struct search_list *search_list, u8 index) -{ - struct cached_chapter_index *newest; - - if (index > 0) { - newest = search_list->entries[index]; - memmove(&search_list->entries[1], &search_list->entries[0], - index * sizeof(struct cached_chapter_index *)); - search_list->entries[0] = newest; - } - - /* - * This function may have moved a dead chapter to the front of the list for reuse, in which - * case the set of dead chapters becomes smaller. 
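Both cases in one quick illustration (entries invented):

  /*
   * [ C2, C7, C5, dead ], first_dead_entry = 3:
   *     set_newest_entry(list, 2) -> [ C5, C2, C7, dead ], still 3.
   * [ C2, C7, dead, dead ], first_dead_entry = 2:
   *     set_newest_entry(list, 2) -> [ dead, C2, C7, dead ], now 3,
   *     with the front slot about to be refilled by the update.
   */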
- */ - if (search_list->first_dead_entry <= index) - search_list->first_dead_entry++; -} - -bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, - unsigned int zone_number) -{ - struct search_list *search_list; - struct cached_chapter_index *chapter; - u8 i; - - /* - * The correctness of the barriers depends on the invariant that between calls to - * uds_update_sparse_cache(), the answers this function returns must never vary: the result - * for a given chapter must be identical across zones. That invariant must be maintained - * even if the chapter falls off the end of the volume, or if searching it is disabled - * because of too many search misses. - */ - search_list = cache->search_lists[zone_number]; - for (i = 0; i < search_list->first_dead_entry; i++) { - chapter = search_list->entries[i]; - - if (virtual_chapter == chapter->virtual_chapter) { - if (zone_number == ZONE_ZERO) - score_search_hit(chapter); - - set_newest_entry(search_list, i); - return true; - } - } - - return false; -} - -/* - * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU - * ordering that already existed. This operation must only be called during the critical section in - * uds_update_sparse_cache(). - */ -static void purge_search_list(struct search_list *search_list, - struct sparse_cache *cache, u64 oldest_virtual_chapter) -{ - struct cached_chapter_index **entries; - struct cached_chapter_index **skipped; - struct cached_chapter_index **dead; - struct cached_chapter_index *chapter; - unsigned int next_alive = 0; - unsigned int next_skipped = 0; - unsigned int next_dead = 0; - unsigned int i; - - entries = &search_list->entries[0]; - skipped = &cache->scratch_entries[0]; - dead = &cache->scratch_entries[search_list->capacity]; - - for (i = 0; i < search_list->first_dead_entry; i++) { - chapter = search_list->entries[i]; - if ((chapter->virtual_chapter < oldest_virtual_chapter) || - (chapter->virtual_chapter == NO_CHAPTER)) - dead[next_dead++] = chapter; - else if (chapter->skip_search) - skipped[next_skipped++] = chapter; - else - entries[next_alive++] = chapter; - } - - memcpy(&entries[next_alive], skipped, - next_skipped * sizeof(struct cached_chapter_index *)); - memcpy(&entries[next_alive + next_skipped], dead, - next_dead * sizeof(struct cached_chapter_index *)); - search_list->first_dead_entry = next_alive + next_skipped; -} - -static int __must_check cache_chapter_index(struct cached_chapter_index *chapter, - u64 virtual_chapter, - const struct volume *volume) -{ - int result; - - release_cached_chapter_index(chapter); - - result = uds_read_chapter_index_from_volume(volume, virtual_chapter, - chapter->page_buffers, - chapter->index_pages); - if (result != UDS_SUCCESS) - return result; - - chapter->counters.consecutive_misses = 0; - chapter->virtual_chapter = virtual_chapter; - chapter->skip_search = false; - - return UDS_SUCCESS; -} - -static inline void copy_search_list(const struct search_list *source, - struct search_list *target) -{ - *target = *source; - memcpy(target->entries, source->entries, - source->capacity * sizeof(struct cached_chapter_index *)); -} - -/* - * Update the sparse cache to contain a chapter index. This function must be called by all the zone - * threads with the same chapter number to correctly enter the thread barriers used to synchronize - * the cache updates. 
- */ -int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter) -{ - int result = UDS_SUCCESS; - const struct uds_index *index = zone->index; - struct sparse_cache *cache = index->volume->sparse_cache; - - if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id)) - return UDS_SUCCESS; - - /* - * Wait for every zone thread to reach its corresponding barrier request and invoke this - * function before starting to modify the cache. - */ - enter_threads_barrier(&cache->begin_update_barrier); - - /* - * This is the start of the critical section: the zone zero thread is captain, effectively - * holding an exclusive lock on the sparse cache. All the other zone threads must do - * nothing between the two barriers. They will wait at the end_update_barrier again for the - * captain to finish the update. - */ - - if (zone->id == ZONE_ZERO) { - unsigned int z; - struct search_list *list = cache->search_lists[ZONE_ZERO]; - - purge_search_list(list, cache, zone->oldest_virtual_chapter); - - if (virtual_chapter >= index->oldest_virtual_chapter) { - set_newest_entry(list, list->capacity - 1); - result = cache_chapter_index(list->entries[0], virtual_chapter, - index->volume); - } - - for (z = 1; z < cache->zone_count; z++) - copy_search_list(list, cache->search_lists[z]); - } - - /* - * This is the end of the critical section. All cache invariants must have been restored. - */ - enter_threads_barrier(&cache->end_update_barrier); - return result; -} - -void uds_invalidate_sparse_cache(struct sparse_cache *cache) -{ - unsigned int i; - - for (i = 0; i < cache->capacity; i++) - release_cached_chapter_index(&cache->chapters[i]); -} - -static inline bool should_skip_chapter(struct cached_chapter_index *chapter, - u64 oldest_chapter, u64 requested_chapter) -{ - if ((chapter->virtual_chapter == NO_CHAPTER) || - (chapter->virtual_chapter < oldest_chapter)) - return true; - - if (requested_chapter != NO_CHAPTER) - return requested_chapter != chapter->virtual_chapter; - else - return READ_ONCE(chapter->skip_search); -} - -static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter, - const struct index_geometry *geometry, - const struct index_page_map *index_page_map, - const struct uds_record_name *name, - u16 *record_page_ptr) -{ - u32 physical_chapter = - uds_map_to_physical_chapter(geometry, chapter->virtual_chapter); - u32 index_page_number = - uds_find_index_page_number(index_page_map, name, physical_chapter); - struct delta_index_page *index_page = - &chapter->index_pages[index_page_number]; - - return uds_search_chapter_index_page(index_page, geometry, name, - record_page_ptr); -} - -int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name, - u64 *virtual_chapter_ptr, u16 *record_page_ptr) -{ - int result; - struct volume *volume = zone->index->volume; - struct sparse_cache *cache = volume->sparse_cache; - struct cached_chapter_index *chapter; - struct search_list *search_list; - u8 i; - /* Search the entire cache unless a specific chapter was requested. 
*/ - bool search_one = (*virtual_chapter_ptr != NO_CHAPTER); - - *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; - search_list = cache->search_lists[zone->id]; - for (i = 0; i < search_list->first_dead_entry; i++) { - chapter = search_list->entries[i]; - - if (should_skip_chapter(chapter, zone->oldest_virtual_chapter, - *virtual_chapter_ptr)) - continue; - - result = search_cached_chapter_index(chapter, cache->geometry, - volume->index_page_map, name, - record_page_ptr); - if (result != UDS_SUCCESS) - return result; - - if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) { - /* - * In theory, this might be a false match while a true match exists in - * another chapter, but that's a very rare case and not worth the extra - * search complexity. - */ - set_newest_entry(search_list, i); - if (zone->id == ZONE_ZERO) - score_search_hit(chapter); - - *virtual_chapter_ptr = chapter->virtual_chapter; - return UDS_SUCCESS; - } - - if (zone->id == ZONE_ZERO) - score_search_miss(cache, chapter); - - if (search_one) - break; - } - - return UDS_SUCCESS; -} diff --git a/drivers/md/dm-vdo/sparse-cache.h b/drivers/md/dm-vdo/sparse-cache.h deleted file mode 100644 index 45e2dcf165b51..0000000000000 --- a/drivers/md/dm-vdo/sparse-cache.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_SPARSE_CACHE_H -#define UDS_SPARSE_CACHE_H - -#include "geometry.h" -#include "indexer.h" - -/* - * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching - * for names after all other search paths have failed. It contains only complete chapter indexes; - * record pages from sparse chapters and single index pages used for resolving hooks are kept in - * the regular page cache in the volume. - * - * The most important property of this cache is the absence of synchronization for read operations. - * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and - * the barrier requests it issues to the zone queues. The set of cached chapters does not and must - * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone - * threads. Outside of updates, every zone will get the same result when calling - * uds_sparse_cache_contains() as every other zone. 
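Sketched from a zone thread's point of view (hypothetical flow; triage-queue dispatch and error handling elided):

  /*
   * 1. uds_sparse_cache_contains(cache, chapter, zone): cheap
   *    membership test returning the same answer in every zone.
   * 2. If the chapter is absent, every zone calls
   *    uds_update_sparse_cache(zone, chapter); the zone zero thread
   *    loads the chapter index and evicts the LRU entry.
   * 3. uds_search_sparse_cache(zone, name, &chapter, &record_page)
   *    then resolves the record name to a record page number.
   */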
- */ - -struct index_zone; -struct sparse_cache; - -int __must_check uds_make_sparse_cache(const struct index_geometry *geometry, - unsigned int capacity, unsigned int zone_count, - struct sparse_cache **cache_ptr); - -void uds_free_sparse_cache(struct sparse_cache *cache); - -bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, - unsigned int zone_number); - -int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter); - -void uds_invalidate_sparse_cache(struct sparse_cache *cache); - -int __must_check uds_search_sparse_cache(struct index_zone *zone, - const struct uds_record_name *name, - u64 *virtual_chapter_ptr, u16 *record_page_ptr); - -#endif /* UDS_SPARSE_CACHE_H */ diff --git a/drivers/md/dm-vdo/uds-sysfs.c b/drivers/md/dm-vdo/uds-sysfs.c index 1548092e7de1d..2c4fb277ba388 100644 --- a/drivers/md/dm-vdo/uds-sysfs.c +++ b/drivers/md/dm-vdo/uds-sysfs.c @@ -9,11 +9,12 @@ #include #include -#include "indexer.h" #include "logger.h" #include "memory-alloc.h" #include "string-utils.h" +#include "indexer.h" + #define UDS_SYSFS_NAME "uds" static struct { diff --git a/drivers/md/dm-vdo/volume-index.c b/drivers/md/dm-vdo/volume-index.c deleted file mode 100644 index 36e3c2e3d799a..0000000000000 --- a/drivers/md/dm-vdo/volume-index.c +++ /dev/null @@ -1,1280 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ -#include "volume-index.h" - -#include -#include -#include -#include -#include - -#include "config.h" -#include "errors.h" -#include "geometry.h" -#include "hash-utils.h" -#include "indexer.h" -#include "logger.h" -#include "memory-alloc.h" -#include "numeric.h" -#include "permassert.h" -#include "thread-utils.h" - -/* - * The volume index is a combination of two separate subindexes, one containing sparse hook entries - * (retained for all chapters), and one containing the remaining entries (retained only for the - * dense chapters). If there are no sparse chapters, only the non-hook sub index is used, and it - * will contain all records for all chapters. - * - * The volume index is also divided into zones, with one thread operating on each zone. Each - * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex. - * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to - * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta - * lists must be the square of the maximum zone count for both subindexes. - * - * Each subindex zone is a delta index where the payload is a chapter number. The volume index can - * compute the delta list number, address, and zone number from the record name in order to - * dispatch record handling to the correct structures. - * - * Most operations that use all the zones take place either before request processing is allowed, - * or after all requests have been flushed in order to shut down. The only multi-threaded operation - * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine - * whether a new chapter should be loaded into the sparse index cache. This operation only uses the - * sparse hook subindex, and the zone mutexes are used to make this operation safe. - * - * There are three ways of expressing chapter numbers in the volume index: virtual, index, and - * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long. 
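A worked example of the masking and rolling described in the rest of this comment (numbers invented):

  /*
   * With chapter_mask = 0x3ff (10 chapter bits), virtual chapter
   * 5000 is stored as 5000 & 0x3ff = 904. If the zone's
   * virtual_chapter_low is 4500 (equivalently 404 after masking),
   * convert_index_to_virtual() recovers
   *     4500 + ((904 - 404) & 0x3ff) = 4500 + 500 = 5000.
   */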
- * Internally the subindex stores only the minimal number of bits necessary by masking away the - * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when - * flushing entries from older chapters, it rolls the index chapter number around so that the - * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries() - * for an example of this technique. - * - * For efficiency, when older chapter numbers become invalid, the index does not immediately remove - * the invalidated entries. Instead it lazily removes them from a given delta list the next time it - * walks that list during normal operation. Because of this, the index size must be increased - * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard - * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in - * the index. - */ - -struct sub_index_parameters { - /* The number of bits in address mask */ - u8 address_bits; - /* The number of bits in chapter number */ - u8 chapter_bits; - /* The mean delta */ - u32 mean_delta; - /* The number of delta lists */ - u64 list_count; - /* The number of chapters used */ - u32 chapter_count; - /* The number of bits per chapter */ - size_t chapter_size_in_bits; - /* The number of bytes of delta list memory */ - size_t memory_size; - /* The number of bytes the index should keep free at all times */ - size_t target_free_bytes; -}; - -struct split_config { - /* The hook subindex configuration */ - struct uds_configuration hook_config; - struct index_geometry hook_geometry; - - /* The non-hook subindex configuration */ - struct uds_configuration non_hook_config; - struct index_geometry non_hook_geometry; -}; - -struct chapter_range { - u32 chapter_start; - u32 chapter_count; -}; - -enum { MAGIC_SIZE = 8 }; -static const char MAGIC_START_5[] = "MI5-0005"; - -struct sub_index_data { - char magic[MAGIC_SIZE]; /* MAGIC_START_5 */ - u64 volume_nonce; - u64 virtual_chapter_low; - u64 virtual_chapter_high; - u32 first_list; - u32 list_count; -}; - -static const char MAGIC_START_6[] = "MI6-0001"; - -struct volume_index_data { - char magic[MAGIC_SIZE]; /* MAGIC_START_6 */ - u32 sparse_sample_rate; -}; - -static inline u32 extract_address(const struct volume_sub_index *sub_index, - const struct uds_record_name *name) -{ - return uds_extract_volume_index_bytes(name) & sub_index->address_mask; -} - -static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index, - const struct uds_record_name *name) -{ - u64 bits = uds_extract_volume_index_bytes(name); - - return (bits >> sub_index->address_bits) % sub_index->list_count; -} - -static inline const struct volume_sub_index_zone * -get_zone_for_record(const struct volume_index_record *record) -{ - return &record->sub_index->zones[record->zone_number]; -} - -static inline u64 convert_index_to_virtual(const struct volume_index_record *record, - u32 index_chapter) -{ - const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); - u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) & - record->sub_index->chapter_mask); - - return volume_index_zone->virtual_chapter_low + rolling_chapter; -} - -static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index, - u64 virtual_chapter) -{ - return virtual_chapter & sub_index->chapter_mask; -} - -static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record, - u64 
virtual_chapter) -{ - const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); - - return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) && - (virtual_chapter <= volume_index_zone->virtual_chapter_high)); -} - -static inline bool has_sparse(const struct volume_index *volume_index) -{ - return volume_index->sparse_sample_rate > 0; -} - -bool uds_is_volume_index_sample(const struct volume_index *volume_index, - const struct uds_record_name *name) -{ - if (!has_sparse(volume_index)) - return false; - - return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0; -} - -static inline const struct volume_sub_index * -get_volume_sub_index(const struct volume_index *volume_index, - const struct uds_record_name *name) -{ - return (uds_is_volume_index_sample(volume_index, name) ? - &volume_index->vi_hook : - &volume_index->vi_non_hook); -} - -static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index, - const struct uds_record_name *name) -{ - return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone; -} - -unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index, - const struct uds_record_name *name) -{ - return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name); -} - -static int compute_volume_sub_index_parameters(const struct uds_configuration *config, - struct sub_index_parameters *params) -{ - enum { DELTA_LIST_SIZE = 256 }; - u64 entries_in_volume_index, address_span; - u32 chapters_in_volume_index, invalid_chapters; - u32 rounded_chapters; - u64 delta_list_records; - u32 address_count; - u64 index_size_in_bits; - size_t expected_index_size; - u64 min_delta_lists = MAX_ZONES * MAX_ZONES; - struct index_geometry *geometry = config->geometry; - u64 records_per_chapter = geometry->records_per_chapter; - - params->chapter_count = geometry->chapters_per_volume; - /* - * Make sure that the number of delta list records in the volume index does not change when - * the volume is reduced by one chapter. This preserves the mapping from name to volume - * index delta list. - */ - rounded_chapters = params->chapter_count; - if (uds_is_reduced_index_geometry(geometry)) - rounded_chapters += 1; - delta_list_records = records_per_chapter * rounded_chapters; - address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE; - params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists); - params->address_bits = bits_per(address_count - 1); - params->chapter_bits = bits_per(rounded_chapters - 1); - if ((u32) params->list_count != params->list_count) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "cannot initialize volume index with %llu delta lists", - (unsigned long long) params->list_count); - } - - if (params->address_bits > 31) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "cannot initialize volume index with %u address bits", - params->address_bits); - } - - /* - * The probability that a given delta list is not touched during the writing of an entire - * chapter is: - * - * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count, - * records_per_chapter); - * - * For the standard index sizes, about 78% of the delta lists are not touched, and - * therefore contain old index entries that have not been eliminated by the lazy LRU - * processing. 
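To make the 78% figure concrete (taking the 1024-chapter volume implied by the "4 per 1024" rule below): records_per_chapter / list_count = DELTA_LIST_SIZE / chapters = 256 / 1024, so

  p_not_touched -> e^(-256/1024) = e^(-1/4) ~= 0.779

which is the roughly 78% quoted above; the formula below then gives 0.779 / 0.221 ~= 3.5 chapters of old entries, rounded up to 4 by max(rounded_chapters / 256, 2U).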
Then the number of old index entries that accumulate over the entire index, - * in terms of full chapters worth of entries, is: - * - * double invalid_chapters = p_not_touched / (1.0 - p_not_touched); - * - * For the standard index sizes, the index needs about 3.5 chapters of space for the old - * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in - * the index. - */ - invalid_chapters = max(rounded_chapters / 256, 2U); - chapters_in_volume_index = rounded_chapters + invalid_chapters; - entries_in_volume_index = records_per_chapter * chapters_in_volume_index; - - address_span = params->list_count << params->address_bits; - params->mean_delta = address_span / entries_in_volume_index; - - /* - * Compute the expected size of a full index, then set the total memory to be 6% larger - * than that expected size. This number should be large enough that there are not many - * rebalances when the index is full. - */ - params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter, - params->mean_delta, - params->chapter_bits); - index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index; - expected_index_size = index_size_in_bits / BITS_PER_BYTE; - params->memory_size = expected_index_size * 106 / 100; - - params->target_free_bytes = expected_index_size / 20; - return UDS_SUCCESS; -} - -static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index) -{ - uds_free(uds_forget(sub_index->flush_chapters)); - uds_free(uds_forget(sub_index->zones)); - uds_uninitialize_delta_index(&sub_index->delta_index); -} - -void uds_free_volume_index(struct volume_index *volume_index) -{ - if (volume_index == NULL) - return; - - if (volume_index->zones != NULL) - uds_free(uds_forget(volume_index->zones)); - - uninitialize_volume_sub_index(&volume_index->vi_non_hook); - uninitialize_volume_sub_index(&volume_index->vi_hook); - uds_free(volume_index); -} - - -static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config, - size_t *bytes) -{ - struct sub_index_parameters params = { .address_bits = 0 }; - int result; - - result = compute_volume_sub_index_parameters(config, ¶ms); - if (result != UDS_SUCCESS) - return result; - - *bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) + - uds_compute_delta_index_save_bytes(params.list_count, - params.memory_size)); - return UDS_SUCCESS; -} - -/* This function is only useful if the configuration includes sparse chapters. */ -static void split_configuration(const struct uds_configuration *config, - struct split_config *split) -{ - u64 sample_rate, sample_records; - u64 dense_chapters, sparse_chapters; - - /* Start with copies of the base configuration. */ - split->hook_config = *config; - split->hook_geometry = *config->geometry; - split->hook_config.geometry = &split->hook_geometry; - split->non_hook_config = *config; - split->non_hook_geometry = *config->geometry; - split->non_hook_config.geometry = &split->non_hook_geometry; - - sample_rate = config->sparse_sample_rate; - sparse_chapters = config->geometry->sparse_chapters_per_volume; - dense_chapters = config->geometry->chapters_per_volume - sparse_chapters; - sample_records = config->geometry->records_per_chapter / sample_rate; - - /* Adjust the number of records indexed for each chapter. */ - split->hook_geometry.records_per_chapter = sample_records; - split->non_hook_geometry.records_per_chapter -= sample_records; - - /* Adjust the number of chapters indexed. 
*/ - split->hook_geometry.sparse_chapters_per_volume = 0; - split->non_hook_geometry.sparse_chapters_per_volume = 0; - split->non_hook_geometry.chapters_per_volume = dense_chapters; -} - -static int compute_volume_index_save_bytes(const struct uds_configuration *config, - size_t *bytes) -{ - size_t hook_bytes, non_hook_bytes; - struct split_config split; - int result; - - if (!uds_is_sparse_index_geometry(config->geometry)) - return compute_volume_sub_index_save_bytes(config, bytes); - - split_configuration(config, &split); - result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes); - if (result != UDS_SUCCESS) - return result; - - result = compute_volume_sub_index_save_bytes(&split.non_hook_config, - &non_hook_bytes); - if (result != UDS_SUCCESS) - return result; - - *bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes; - return UDS_SUCCESS; -} - -int uds_compute_volume_index_save_blocks(const struct uds_configuration *config, - size_t block_size, u64 *block_count) -{ - size_t bytes; - int result; - - result = compute_volume_index_save_bytes(config, &bytes); - if (result != UDS_SUCCESS) - return result; - - bytes += sizeof(struct delta_list_save_info); - *block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES; - return UDS_SUCCESS; -} - -/* Flush invalid entries while walking the delta list. */ -static inline int flush_invalid_entries(struct volume_index_record *record, - struct chapter_range *flush_range, - u32 *next_chapter_to_invalidate) -{ - int result; - - result = uds_next_delta_index_entry(&record->delta_entry); - if (result != UDS_SUCCESS) - return result; - - while (!record->delta_entry.at_end) { - u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); - u32 relative_chapter = ((index_chapter - flush_range->chapter_start) & - record->sub_index->chapter_mask); - - if (likely(relative_chapter >= flush_range->chapter_count)) { - if (relative_chapter < *next_chapter_to_invalidate) - *next_chapter_to_invalidate = relative_chapter; - break; - } - - result = uds_remove_delta_index_entry(&record->delta_entry); - if (result != UDS_SUCCESS) - return result; - } - - return UDS_SUCCESS; -} - -/* Find the matching record, or the list offset where the record would go. */ -static int get_volume_index_entry(struct volume_index_record *record, u32 list_number, - u32 key, struct chapter_range *flush_range) -{ - struct volume_index_record other_record; - const struct volume_sub_index *sub_index = record->sub_index; - u32 next_chapter_to_invalidate = sub_index->chapter_mask; - int result; - - result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0, - &record->delta_entry); - if (result != UDS_SUCCESS) - return result; - - do { - result = flush_invalid_entries(record, flush_range, - &next_chapter_to_invalidate); - if (result != UDS_SUCCESS) - return result; - } while (!record->delta_entry.at_end && (key > record->delta_entry.key)); - - result = uds_remember_delta_index_offset(&record->delta_entry); - if (result != UDS_SUCCESS) - return result; - - /* Check any collision records for a more precise match. 
*/ - other_record = *record; - if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) { - for (;;) { - u8 collision_name[UDS_RECORD_NAME_SIZE]; - - result = flush_invalid_entries(&other_record, flush_range, - &next_chapter_to_invalidate); - if (result != UDS_SUCCESS) - return result; - - if (other_record.delta_entry.at_end || - !other_record.delta_entry.is_collision) - break; - - result = uds_get_delta_entry_collision(&other_record.delta_entry, - collision_name); - if (result != UDS_SUCCESS) - return result; - - if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) { - *record = other_record; - break; - } - } - } - while (!other_record.delta_entry.at_end) { - result = flush_invalid_entries(&other_record, flush_range, - &next_chapter_to_invalidate); - if (result != UDS_SUCCESS) - return result; - } - next_chapter_to_invalidate += flush_range->chapter_start; - next_chapter_to_invalidate &= sub_index->chapter_mask; - flush_range->chapter_start = next_chapter_to_invalidate; - flush_range->chapter_count = 0; - return UDS_SUCCESS; -} - -static int get_volume_sub_index_record(struct volume_sub_index *sub_index, - const struct uds_record_name *name, - struct volume_index_record *record) -{ - int result; - const struct volume_sub_index_zone *volume_index_zone; - u32 address = extract_address(sub_index, name); - u32 delta_list_number = extract_dlist_num(sub_index, name); - u64 flush_chapter = sub_index->flush_chapters[delta_list_number]; - - record->sub_index = sub_index; - record->mutex = NULL; - record->name = name; - record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone; - volume_index_zone = get_zone_for_record(record); - - if (flush_chapter < volume_index_zone->virtual_chapter_low) { - struct chapter_range range; - u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter; - - range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter); - range.chapter_count = (flush_count > sub_index->chapter_mask ? - sub_index->chapter_mask + 1 : - flush_count); - result = get_volume_index_entry(record, delta_list_number, address, - &range); - flush_chapter = convert_index_to_virtual(record, range.chapter_start); - if (flush_chapter > volume_index_zone->virtual_chapter_high) - flush_chapter = volume_index_zone->virtual_chapter_high; - sub_index->flush_chapters[delta_list_number] = flush_chapter; - } else { - result = uds_get_delta_index_entry(&sub_index->delta_index, - delta_list_number, address, - name->name, &record->delta_entry); - } - - if (result != UDS_SUCCESS) - return result; - - record->is_found = - (!record->delta_entry.at_end && (record->delta_entry.key == address)); - if (record->is_found) { - u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); - - record->virtual_chapter = convert_index_to_virtual(record, index_chapter); - } - - record->is_collision = record->delta_entry.is_collision; - return UDS_SUCCESS; -} - -int uds_get_volume_index_record(struct volume_index *volume_index, - const struct uds_record_name *name, - struct volume_index_record *record) -{ - int result; - - if (uds_is_volume_index_sample(volume_index, name)) { - /* - * Other threads cannot be allowed to call uds_lookup_volume_index_name() while - * this thread is finding the volume index record. Due to the lazy LRU flushing of - * the volume index, uds_get_volume_index_record() is not a read-only operation. 
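For scale (sample rate invented for illustration): with sparse_sample_rate = 32, uds_is_volume_index_sample() selects a name whenever

  uds_extract_sampling_bytes(name) % 32 == 0

so roughly one record name in 32 is a hook and takes this mutex; all other names use the unlocked non-hook path below.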
- */ - unsigned int zone = - get_volume_sub_index_zone(&volume_index->vi_hook, name); - struct mutex *mutex = &volume_index->zones[zone].hook_mutex; - - mutex_lock(mutex); - result = get_volume_sub_index_record(&volume_index->vi_hook, name, - record); - mutex_unlock(mutex); - /* Remember the mutex so that other operations on the index record can use it. */ - record->mutex = mutex; - } else { - result = get_volume_sub_index_record(&volume_index->vi_non_hook, name, - record); - } - - return result; -} - -int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter) -{ - int result; - u32 address; - const struct volume_sub_index *sub_index = record->sub_index; - - if (!is_virtual_chapter_indexed(record, virtual_chapter)) { - u64 low = get_zone_for_record(record)->virtual_chapter_low; - u64 high = get_zone_for_record(record)->virtual_chapter_high; - - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "cannot put record into chapter number %llu that is out of the valid range %llu to %llu", - (unsigned long long) virtual_chapter, - (unsigned long long) low, - (unsigned long long) high); - } - address = extract_address(sub_index, record->name); - if (unlikely(record->mutex != NULL)) - mutex_lock(record->mutex); - result = uds_put_delta_index_entry(&record->delta_entry, address, - convert_virtual_to_index(sub_index, - virtual_chapter), - record->is_found ? record->name->name : NULL); - if (unlikely(record->mutex != NULL)) - mutex_unlock(record->mutex); - switch (result) { - case UDS_SUCCESS: - record->virtual_chapter = virtual_chapter; - record->is_collision = record->delta_entry.is_collision; - record->is_found = true; - break; - case UDS_OVERFLOW: - uds_log_ratelimit(uds_log_warning_strerror, UDS_OVERFLOW, - "Volume index entry dropped due to overflow condition"); - uds_log_delta_index_entry(&record->delta_entry); - break; - default: - break; - } - - return result; -} - -int uds_remove_volume_index_record(struct volume_index_record *record) -{ - int result; - - if (!record->is_found) - return uds_log_warning_strerror(UDS_BAD_STATE, - "illegal operation on new record"); - - /* Mark the record so that it cannot be used again */ - record->is_found = false; - if (unlikely(record->mutex != NULL)) - mutex_lock(record->mutex); - result = uds_remove_delta_index_entry(&record->delta_entry); - if (unlikely(record->mutex != NULL)) - mutex_unlock(record->mutex); - return result; -} - -static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index, - unsigned int zone_number, - u64 virtual_chapter) -{ - u64 used_bits = 0; - struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; - struct delta_zone *delta_zone; - u32 i; - - zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ? - virtual_chapter - sub_index->chapter_count + 1 : - 0); - zone->virtual_chapter_high = virtual_chapter; - - /* Check to see if the new zone data is too large. */ - delta_zone = &sub_index->delta_index.delta_zones[zone_number]; - for (i = 1; i <= delta_zone->list_count; i++) - used_bits += delta_zone->delta_lists[i].size; - - if (used_bits > sub_index->max_zone_bits) { - /* Expire enough chapters to free the desired space. 
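For instance (sizes invented): if used_bits exceeds max_zone_bits by two and a half chapters' worth of bits, the computation below yields

  expire_count = 1 + (used_bits - max_zone_bits) / chapter_zone_bits
               = 1 + 2 = 3

so virtual_chapter_low advances by three chapters and the multi-chapter message is logged.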
*/ - u64 expire_count = - 1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits; - - if (expire_count == 1) { - uds_log_ratelimit(uds_log_info, - "zone %u: At chapter %llu, expiring chapter %llu early", - zone_number, - (unsigned long long) virtual_chapter, - (unsigned long long) zone->virtual_chapter_low); - zone->early_flushes++; - zone->virtual_chapter_low++; - } else { - u64 first_expired = zone->virtual_chapter_low; - - if (first_expired + expire_count < zone->virtual_chapter_high) { - zone->early_flushes += expire_count; - zone->virtual_chapter_low += expire_count; - } else { - zone->early_flushes += - zone->virtual_chapter_high - zone->virtual_chapter_low; - zone->virtual_chapter_low = zone->virtual_chapter_high; - } - uds_log_ratelimit(uds_log_info, - "zone %u: At chapter %llu, expiring chapters %llu to %llu early", - zone_number, - (unsigned long long) virtual_chapter, - (unsigned long long) first_expired, - (unsigned long long) zone->virtual_chapter_low - 1); - } - } -} - -void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, - unsigned int zone_number, - u64 virtual_chapter) -{ - struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex; - - set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number, - virtual_chapter); - - /* - * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open - * chapter number is changing. - */ - if (has_sparse(volume_index)) { - mutex_lock(mutex); - set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook, - zone_number, virtual_chapter); - mutex_unlock(mutex); - } -} - -/* - * Set the newest open chapter number for the index, while also advancing the oldest valid chapter - * number. - */ -void uds_set_volume_index_open_chapter(struct volume_index *volume_index, - u64 virtual_chapter) -{ - unsigned int zone; - - for (zone = 0; zone < volume_index->zone_count; zone++) - uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter); -} - -int uds_set_volume_index_record_chapter(struct volume_index_record *record, - u64 virtual_chapter) -{ - const struct volume_sub_index *sub_index = record->sub_index; - int result; - - if (!record->is_found) - return uds_log_warning_strerror(UDS_BAD_STATE, - "illegal operation on new record"); - - if (!is_virtual_chapter_indexed(record, virtual_chapter)) { - u64 low = get_zone_for_record(record)->virtual_chapter_low; - u64 high = get_zone_for_record(record)->virtual_chapter_high; - - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "cannot set chapter number %llu that is out of the valid range %llu to %llu", - (unsigned long long) virtual_chapter, - (unsigned long long) low, - (unsigned long long) high); - } - - if (unlikely(record->mutex != NULL)) - mutex_lock(record->mutex); - result = uds_set_delta_entry_value(&record->delta_entry, - convert_virtual_to_index(sub_index, - virtual_chapter)); - if (unlikely(record->mutex != NULL)) - mutex_unlock(record->mutex); - if (result != UDS_SUCCESS) - return result; - - record->virtual_chapter = virtual_chapter; - return UDS_SUCCESS; -} - -static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index, - const struct uds_record_name *name) -{ - int result; - u32 address = extract_address(sub_index, name); - u32 delta_list_number = extract_dlist_num(sub_index, name); - unsigned int zone_number = get_volume_sub_index_zone(sub_index, name); - const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; - u64 
virtual_chapter; - u32 index_chapter; - u32 rolling_chapter; - struct delta_index_entry delta_entry; - - result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number, - address, name->name, &delta_entry); - if (result != UDS_SUCCESS) - return NO_CHAPTER; - - if (delta_entry.at_end || (delta_entry.key != address)) - return NO_CHAPTER; - - index_chapter = uds_get_delta_entry_value(&delta_entry); - rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask; - - virtual_chapter = zone->virtual_chapter_low + rolling_chapter; - if (virtual_chapter > zone->virtual_chapter_high) - return NO_CHAPTER; - - return virtual_chapter; -} - -/* Do a read-only lookup of the record name for sparse cache management. */ -u64 uds_lookup_volume_index_name(const struct volume_index *volume_index, - const struct uds_record_name *name) -{ - unsigned int zone_number = uds_get_volume_index_zone(volume_index, name); - struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex; - u64 virtual_chapter; - - if (!uds_is_volume_index_sample(volume_index, name)) - return NO_CHAPTER; - - mutex_lock(mutex); - virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name); - mutex_unlock(mutex); - - return virtual_chapter; -} - -static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index) -{ - uds_reset_delta_index(&sub_index->delta_index); -} - -static void abort_restoring_volume_index(struct volume_index *volume_index) -{ - abort_restoring_volume_sub_index(&volume_index->vi_non_hook); - if (has_sparse(volume_index)) - abort_restoring_volume_sub_index(&volume_index->vi_hook); -} - -static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, - struct buffered_reader **readers, - unsigned int reader_count) -{ - unsigned int z; - int result; - u64 virtual_chapter_low = 0, virtual_chapter_high = 0; - unsigned int i; - - for (i = 0; i < reader_count; i++) { - struct sub_index_data header; - u8 buffer[sizeof(struct sub_index_data)]; - size_t offset = 0; - u32 j; - - result = uds_read_from_buffered_reader(readers[i], buffer, - sizeof(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read volume index header"); - } - - memcpy(&header.magic, buffer, MAGIC_SIZE); - offset += MAGIC_SIZE; - decode_u64_le(buffer, &offset, &header.volume_nonce); - decode_u64_le(buffer, &offset, &header.virtual_chapter_low); - decode_u64_le(buffer, &offset, &header.virtual_chapter_high); - decode_u32_le(buffer, &offset, &header.first_list); - decode_u32_le(buffer, &offset, &header.list_count); - - result = ASSERT(offset == sizeof(buffer), - "%zu bytes decoded of %zu expected", offset, - sizeof(buffer)); - if (result != UDS_SUCCESS) - result = UDS_CORRUPT_DATA; - - if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "volume index file had bad magic number"); - } - - if (sub_index->volume_nonce == 0) { - sub_index->volume_nonce = header.volume_nonce; - } else if (header.volume_nonce != sub_index->volume_nonce) { - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "volume index volume nonce incorrect"); - } - - if (i == 0) { - virtual_chapter_low = header.virtual_chapter_low; - virtual_chapter_high = header.virtual_chapter_high; - } else if (virtual_chapter_high != header.virtual_chapter_high) { - u64 low = header.virtual_chapter_low; - u64 high = header.virtual_chapter_high; - - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "Inconsistent 
volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]", - (unsigned long long) virtual_chapter_low, - (unsigned long long) virtual_chapter_high, - i, (unsigned long long) low, - (unsigned long long) high); - } else if (virtual_chapter_low < header.virtual_chapter_low) { - virtual_chapter_low = header.virtual_chapter_low; - } - - for (j = 0; j < header.list_count; j++) { - u8 decoded[sizeof(u64)]; - - result = uds_read_from_buffered_reader(readers[i], decoded, - sizeof(u64)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read volume index flush ranges"); - } - - sub_index->flush_chapters[header.first_list + j] = - get_unaligned_le64(decoded); - } - } - - for (z = 0; z < sub_index->zone_count; z++) { - memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone)); - sub_index->zones[z].virtual_chapter_low = virtual_chapter_low; - sub_index->zones[z].virtual_chapter_high = virtual_chapter_high; - } - - result = uds_start_restoring_delta_index(&sub_index->delta_index, readers, - reader_count); - if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, "restoring delta index failed"); - - return UDS_SUCCESS; -} - -static int start_restoring_volume_index(struct volume_index *volume_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - unsigned int i; - int result; - - if (!has_sparse(volume_index)) { - return start_restoring_volume_sub_index(&volume_index->vi_non_hook, - buffered_readers, reader_count); - } - - for (i = 0; i < reader_count; i++) { - struct volume_index_data header; - u8 buffer[sizeof(struct volume_index_data)]; - size_t offset = 0; - - result = uds_read_from_buffered_reader(buffered_readers[i], buffer, - sizeof(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read volume index header"); - } - - memcpy(&header.magic, buffer, MAGIC_SIZE); - offset += MAGIC_SIZE; - decode_u32_le(buffer, &offset, &header.sparse_sample_rate); - - result = ASSERT(offset == sizeof(buffer), - "%zu bytes decoded of %zu expected", offset, - sizeof(buffer)); - if (result != UDS_SUCCESS) - result = UDS_CORRUPT_DATA; - - if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) - return uds_log_warning_strerror(UDS_CORRUPT_DATA, - "volume index file had bad magic number"); - - if (i == 0) { - volume_index->sparse_sample_rate = header.sparse_sample_rate; - } else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) { - uds_log_warning_strerror(UDS_CORRUPT_DATA, - "Inconsistent sparse sample rate in delta index zone files: %u vs. 
%u", - volume_index->sparse_sample_rate, - header.sparse_sample_rate); - return UDS_CORRUPT_DATA; - } - } - - result = start_restoring_volume_sub_index(&volume_index->vi_non_hook, - buffered_readers, reader_count); - if (result != UDS_SUCCESS) - return result; - - return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers, - reader_count); -} - -static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - return uds_finish_restoring_delta_index(&sub_index->delta_index, - buffered_readers, reader_count); -} - -static int finish_restoring_volume_index(struct volume_index *volume_index, - struct buffered_reader **buffered_readers, - unsigned int reader_count) -{ - int result; - - result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook, - buffered_readers, reader_count); - if ((result == UDS_SUCCESS) && has_sparse(volume_index)) { - result = finish_restoring_volume_sub_index(&volume_index->vi_hook, - buffered_readers, - reader_count); - } - - return result; -} - -int uds_load_volume_index(struct volume_index *volume_index, - struct buffered_reader **readers, unsigned int reader_count) -{ - int result; - - /* Start by reading the header section of the stream. */ - result = start_restoring_volume_index(volume_index, readers, reader_count); - if (result != UDS_SUCCESS) - return result; - - result = finish_restoring_volume_index(volume_index, readers, reader_count); - if (result != UDS_SUCCESS) { - abort_restoring_volume_index(volume_index); - return result; - } - - /* Check the final guard lists to make sure there is no extra data. */ - result = uds_check_guard_delta_lists(readers, reader_count); - if (result != UDS_SUCCESS) - abort_restoring_volume_index(volume_index); - - return result; -} - -static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index, - unsigned int zone_number, - struct buffered_writer *buffered_writer) -{ - int result; - struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number]; - u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list; - u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count; - u8 buffer[sizeof(struct sub_index_data)]; - size_t offset = 0; - u32 i; - - memcpy(buffer, MAGIC_START_5, MAGIC_SIZE); - offset += MAGIC_SIZE; - encode_u64_le(buffer, &offset, sub_index->volume_nonce); - encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low); - encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high); - encode_u32_le(buffer, &offset, first_list); - encode_u32_le(buffer, &offset, list_count); - - result = ASSERT(offset == sizeof(struct sub_index_data), - "%zu bytes of config written, of %zu expected", offset, - sizeof(struct sub_index_data)); - if (result != UDS_SUCCESS) - return result; - - result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); - if (result != UDS_SUCCESS) - return uds_log_warning_strerror(result, - "failed to write volume index header"); - - for (i = 0; i < list_count; i++) { - u8 encoded[sizeof(u64)]; - - put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded); - result = uds_write_to_buffered_writer(buffered_writer, encoded, - sizeof(u64)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to write volume index flush ranges"); - } - } - - return uds_start_saving_delta_index(&sub_index->delta_index, zone_number, - buffered_writer); -} - 
-static int start_saving_volume_index(const struct volume_index *volume_index, - unsigned int zone_number, - struct buffered_writer *writer) -{ - u8 buffer[sizeof(struct volume_index_data)]; - size_t offset = 0; - int result; - - if (!has_sparse(volume_index)) { - return start_saving_volume_sub_index(&volume_index->vi_non_hook, - zone_number, writer); - } - - memcpy(buffer, MAGIC_START_6, MAGIC_SIZE); - offset += MAGIC_SIZE; - encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate); - result = ASSERT(offset == sizeof(struct volume_index_data), - "%zu bytes of header written, of %zu expected", offset, - sizeof(struct volume_index_data)); - if (result != UDS_SUCCESS) - return result; - - result = uds_write_to_buffered_writer(writer, buffer, offset); - if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, "failed to write volume index header"); - return result; - } - - result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number, - writer); - if (result != UDS_SUCCESS) - return result; - - return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number, - writer); -} - -static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index, - unsigned int zone_number) -{ - return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number); -} - -static int finish_saving_volume_index(const struct volume_index *volume_index, - unsigned int zone_number) -{ - int result; - - result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number); - if ((result == UDS_SUCCESS) && has_sparse(volume_index)) - result = finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number); - return result; -} - -int uds_save_volume_index(struct volume_index *volume_index, - struct buffered_writer **writers, unsigned int writer_count) -{ - int result = UDS_SUCCESS; - unsigned int zone; - - for (zone = 0; zone < writer_count; zone++) { - result = start_saving_volume_index(volume_index, zone, writers[zone]); - if (result != UDS_SUCCESS) - break; - - result = finish_saving_volume_index(volume_index, zone); - if (result != UDS_SUCCESS) - break; - - result = uds_write_guard_delta_list(writers[zone]); - if (result != UDS_SUCCESS) - break; - - result = uds_flush_buffered_writer(writers[zone]); - if (result != UDS_SUCCESS) - break; - } - - return result; -} - -static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index, - struct volume_index_stats *stats) -{ - struct delta_index_stats dis; - unsigned int z; - - uds_get_delta_index_stats(&sub_index->delta_index, &dis); - stats->rebalance_time = dis.rebalance_time; - stats->rebalance_count = dis.rebalance_count; - stats->record_count = dis.record_count; - stats->collision_count = dis.collision_count; - stats->discard_count = dis.discard_count; - stats->overflow_count = dis.overflow_count; - stats->delta_lists = dis.list_count; - stats->early_flushes = 0; - for (z = 0; z < sub_index->zone_count; z++) - stats->early_flushes += sub_index->zones[z].early_flushes; -} - -void uds_get_volume_index_stats(const struct volume_index *volume_index, - struct volume_index_stats *stats) -{ - struct volume_index_stats sparse_stats; - - get_volume_sub_index_stats(&volume_index->vi_non_hook, stats); - if (!has_sparse(volume_index)) - return; - - get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats); - stats->rebalance_time += sparse_stats.rebalance_time; - stats->rebalance_count += sparse_stats.rebalance_count; - stats->record_count += sparse_stats.record_count; - 
stats->collision_count += sparse_stats.collision_count; - stats->discard_count += sparse_stats.discard_count; - stats->overflow_count += sparse_stats.overflow_count; - stats->delta_lists += sparse_stats.delta_lists; - stats->early_flushes += sparse_stats.early_flushes; -} - -static int initialize_volume_sub_index(const struct uds_configuration *config, - u64 volume_nonce, u8 tag, - struct volume_sub_index *sub_index) -{ - struct sub_index_parameters params = { .address_bits = 0 }; - unsigned int zone_count = config->zone_count; - u64 available_bytes = 0; - unsigned int z; - int result; - - result = compute_volume_sub_index_parameters(config, ¶ms); - if (result != UDS_SUCCESS) - return result; - - sub_index->address_bits = params.address_bits; - sub_index->address_mask = (1u << params.address_bits) - 1; - sub_index->chapter_bits = params.chapter_bits; - sub_index->chapter_mask = (1u << params.chapter_bits) - 1; - sub_index->chapter_count = params.chapter_count; - sub_index->list_count = params.list_count; - sub_index->zone_count = zone_count; - sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count; - sub_index->volume_nonce = volume_nonce; - - result = uds_initialize_delta_index(&sub_index->delta_index, zone_count, - params.list_count, params.mean_delta, - params.chapter_bits, params.memory_size, - tag); - if (result != UDS_SUCCESS) - return result; - - for (z = 0; z < sub_index->delta_index.zone_count; z++) - available_bytes += sub_index->delta_index.delta_zones[z].size; - available_bytes -= params.target_free_bytes; - sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count; - sub_index->memory_size = (sub_index->delta_index.memory_size + - sizeof(struct volume_sub_index) + - (params.list_count * sizeof(u64)) + - (zone_count * sizeof(struct volume_sub_index_zone))); - - /* The following arrays are initialized to all zeros. 
*/ - result = uds_allocate(params.list_count, u64, "first chapter to flush", - &sub_index->flush_chapters); - if (result != UDS_SUCCESS) - return result; - - return uds_allocate(zone_count, struct volume_sub_index_zone, - "volume index zones", &sub_index->zones); -} - -int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, - struct volume_index **volume_index_ptr) -{ - struct split_config split; - unsigned int zone; - struct volume_index *volume_index; - int result; - - result = uds_allocate(1, struct volume_index, "volume index", &volume_index); - if (result != UDS_SUCCESS) - return result; - - volume_index->zone_count = config->zone_count; - - if (!uds_is_sparse_index_geometry(config->geometry)) { - result = initialize_volume_sub_index(config, volume_nonce, 'm', - &volume_index->vi_non_hook); - if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); - return result; - } - - volume_index->memory_size = volume_index->vi_non_hook.memory_size; - *volume_index_ptr = volume_index; - return UDS_SUCCESS; - } - - volume_index->sparse_sample_rate = config->sparse_sample_rate; - - result = uds_allocate(config->zone_count, struct volume_index_zone, - "volume index zones", &volume_index->zones); - if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); - return result; - } - - for (zone = 0; zone < config->zone_count; zone++) - mutex_init(&volume_index->zones[zone].hook_mutex); - - split_configuration(config, &split); - result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd', - &volume_index->vi_non_hook); - if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); - return uds_log_error_strerror(result, - "Error creating non hook volume index"); - } - - result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's', - &volume_index->vi_hook); - if (result != UDS_SUCCESS) { - uds_free_volume_index(volume_index); - return uds_log_error_strerror(result, - "Error creating hook volume index"); - } - - volume_index->memory_size = - volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size; - *volume_index_ptr = volume_index; - return UDS_SUCCESS; -} diff --git a/drivers/md/dm-vdo/volume-index.h b/drivers/md/dm-vdo/volume-index.h deleted file mode 100644 index 66bf14fddc906..0000000000000 --- a/drivers/md/dm-vdo/volume-index.h +++ /dev/null @@ -1,192 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2023 Red Hat - */ - -#ifndef UDS_VOLUME_INDEX_H -#define UDS_VOLUME_INDEX_H - -#include - -#include "config.h" -#include "delta-index.h" -#include "indexer.h" -#include "thread-utils.h" - -/* - * The volume index is the primary top-level index for UDS. It contains records which map a record - * name to the chapter where a record with that name is stored. This mapping can definitively say - * when no record exists. However, because we only use a subset of the name for this index, it - * cannot definitively say that a record for the entry does exist. It can only say that if a record - * exists, it will be in a particular chapter. The request can then be dispatched to that chapter - * for further processing. - * - * If the volume_index_record does not actually match the record name, the index can store a more - * specific collision record to disambiguate the new entry from the existing one. Index entries are - * managed with volume_index_record structures. 
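- *
- * (Illustration, with assumed numbers: if only, say, 20 bits of the name select
- * the delta list and address, two distinct names can map to the same entry. A
- * match therefore means "if this record exists, it is in chapter N"; the full
- * name is confirmed only when that chapter's record page is read.)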
- */ - -#define NO_CHAPTER U64_MAX - -struct volume_index_stats { - /* Nanoseconds spent rebalancing */ - ktime_t rebalance_time; - /* Number of memory rebalances */ - u32 rebalance_count; - /* The number of records in the index */ - u64 record_count; - /* The number of collision records */ - u64 collision_count; - /* The number of records removed */ - u64 discard_count; - /* The number of UDS_OVERFLOWs detected */ - u64 overflow_count; - /* The number of delta lists */ - u32 delta_lists; - /* Number of early flushes */ - u64 early_flushes; -}; - -struct volume_sub_index_zone { - u64 virtual_chapter_low; - u64 virtual_chapter_high; - u64 early_flushes; -} __aligned(L1_CACHE_BYTES); - -struct volume_sub_index { - /* The delta index */ - struct delta_index delta_index; - /* The first chapter to be flushed in each zone */ - u64 *flush_chapters; - /* The zones */ - struct volume_sub_index_zone *zones; - /* The volume nonce */ - u64 volume_nonce; - /* Expected size of a chapter (per zone) */ - u64 chapter_zone_bits; - /* Maximum size of the index (per zone) */ - u64 max_zone_bits; - /* The number of bits in address mask */ - u8 address_bits; - /* Mask to get address within delta list */ - u32 address_mask; - /* The number of bits in chapter number */ - u8 chapter_bits; - /* The largest storable chapter number */ - u32 chapter_mask; - /* The number of chapters used */ - u32 chapter_count; - /* The number of delta lists */ - u32 list_count; - /* The number of zones */ - unsigned int zone_count; - /* The amount of memory allocated */ - u64 memory_size; -}; - -struct volume_index_zone { - /* Protects the sampled index in this zone */ - struct mutex hook_mutex; -} __aligned(L1_CACHE_BYTES); - -struct volume_index { - u32 sparse_sample_rate; - unsigned int zone_count; - u64 memory_size; - struct volume_sub_index vi_non_hook; - struct volume_sub_index vi_hook; - struct volume_index_zone *zones; -}; - -/* - * The volume_index_record structure is used to facilitate processing of a record name. A client - * first calls uds_get_volume_index_record() to find the volume index record for a record name. The - * fields of the record can then be examined to determine the state of the record. - * - * If is_found is false, then the index did not find an entry for the record name. Calling - * uds_put_volume_index_record() will insert a new entry for that name at the proper place. - * - * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and - * is_collision fields reflect the entry found. Subsequently, a call to - * uds_remove_volume_index_record() will remove the entry, a call to - * uds_set_volume_index_record_chapter() will update the existing entry, and a call to - * uds_put_volume_index_record() will insert a new collision record after the existing entry. 
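- *
- * As an illustration (a sketch, not part of the original source), a caller
- * updating the chapter for a name might drive this API as follows:
- *
- *	struct volume_index_record record;
- *	int result;
- *
- *	result = uds_get_volume_index_record(volume_index, name, &record);
- *	if (result != UDS_SUCCESS)
- *		return result;
- *	if (!record.is_found)
- *		return uds_put_volume_index_record(&record, new_chapter);
- *	if (record.virtual_chapter != new_chapter)
- *		return uds_set_volume_index_record_chapter(&record, new_chapter);
- *	return UDS_SUCCESS;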
- */ -struct volume_index_record { - /* Public fields */ - - /* Chapter where the record info is found */ - u64 virtual_chapter; - /* This record is a collision */ - bool is_collision; - /* This record is the requested record */ - bool is_found; - - /* Private fields */ - - /* Zone that contains this name */ - unsigned int zone_number; - /* The volume index */ - struct volume_sub_index *sub_index; - /* Mutex for accessing this delta index entry in the hook index */ - struct mutex *mutex; - /* The record name to which this record refers */ - const struct uds_record_name *name; - /* The delta index entry for this record */ - struct delta_index_entry delta_entry; -}; - -int __must_check uds_make_volume_index(const struct uds_configuration *config, - u64 volume_nonce, - struct volume_index **volume_index); - -void uds_free_volume_index(struct volume_index *volume_index); - -int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config, - size_t block_size, - u64 *block_count); - -unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index, - const struct uds_record_name *name); - -bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index, - const struct uds_record_name *name); - -/* - * This function is only used to manage sparse cache membership. Most requests should use - * uds_get_volume_index_record() to look up index records instead. - */ -u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index, - const struct uds_record_name *name); - -int __must_check uds_get_volume_index_record(struct volume_index *volume_index, - const struct uds_record_name *name, - struct volume_index_record *record); - -int __must_check uds_put_volume_index_record(struct volume_index_record *record, - u64 virtual_chapter); - -int __must_check uds_remove_volume_index_record(struct volume_index_record *record); - -int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record, - u64 virtual_chapter); - -void uds_set_volume_index_open_chapter(struct volume_index *volume_index, - u64 virtual_chapter); - -void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, - unsigned int zone_number, - u64 virtual_chapter); - -int __must_check uds_load_volume_index(struct volume_index *volume_index, - struct buffered_reader **readers, - unsigned int reader_count); - -int __must_check uds_save_volume_index(struct volume_index *volume_index, - struct buffered_writer **writers, - unsigned int writer_count); - -void uds_get_volume_index_stats(const struct volume_index *volume_index, - struct volume_index_stats *stats); - -#endif /* UDS_VOLUME_INDEX_H */ diff --git a/drivers/md/dm-vdo/volume.c b/drivers/md/dm-vdo/volume.c deleted file mode 100644 index 60416dc8a9d70..0000000000000 --- a/drivers/md/dm-vdo/volume.c +++ /dev/null @@ -1,1694 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2023 Red Hat - */ - -#include "volume.h" - -#include -#include -#include - -#include "chapter-index.h" -#include "config.h" -#include "errors.h" -#include "geometry.h" -#include "hash-utils.h" -#include "index.h" -#include "logger.h" -#include "memory-alloc.h" -#include "permassert.h" -#include "sparse-cache.h" -#include "string-utils.h" -#include "thread-utils.h" - -/* - * The first block of the volume layout is reserved for the volume header, which is no longer used. 
- * The remainder of the volume is divided into chapters consisting of several pages of records, and
- * several pages of a static index used to find those records. The index pages are recorded first,
- * followed by the record pages. The chapters are written in order as they are filled, so the
- * volume storage acts as a circular log of the most recent chapters, with each new chapter
- * overwriting the oldest saved one.
- *
- * When a new chapter is filled and closed, the records from that chapter are sorted and
- * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
- * index is generated to store which record page contains each record. The in-memory index page map
- * is also updated to indicate which delta lists fall on each chapter index page. This means that
- * when a record is read, the volume only has to load a single index page and a single record page,
- * rather than search the entire chapter. These index and record pages are written to storage, and
- * the index pages are transferred to the page cache under the theory that the most recently
- * written chapter is likely to be accessed again soon.
- *
- * When reading a record, the volume index will indicate which chapter should contain it. The
- * volume uses the index page map to determine which chapter index page needs to be loaded, and
- * then reads the relevant record page number from the chapter index. Both index and record pages
- * are stored in a page cache when read for the common case that subsequent records need the same
- * pages. The page cache evicts the least recently accessed entries when caching new pages. In
- * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
- * additional caching depending on available system resources.
- *
- * Record requests are handled from cached pages when possible. If a page needs to be read, it is
- * placed on a queue along with the request that wants to read it. Any requests for the same page
- * that arrive while the read is pending are added to the queue entry. A separate reader thread
- * handles the queued reads, adding the page to the cache and updating any requests queued with it
- * so they can continue processing. This allows the index zone threads to continue processing new
- * requests rather than wait for the storage reads.
- *
- * When an index rebuild is necessary, the volume reads each stored chapter to determine which
- * range of chapters contain valid records, so that those records can be used to reconstruct the
- * in-memory volume index.
- */
-
-enum {
-	/* The maximum allowable number of contiguous bad chapters */
-	MAX_BAD_CHAPTERS = 100,
-	VOLUME_CACHE_MAX_ENTRIES = (U16_MAX >> 1),
-	VOLUME_CACHE_QUEUED_FLAG = (1 << 15),
-	VOLUME_CACHE_MAX_QUEUED_READS = 4096,
-};
-
-static const u64 BAD_CHAPTER = U64_MAX;
-
-/*
- * The invalidate counter is two 32-bit fields stored together atomically. The low-order 32 bits
- * are the physical page number of the cached page being read. The high-order 32 bits are a
- * sequence number. This value is written when the zone that owns it begins or completes a cache
- * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
- * to update the cache contents.
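- *
- * (For illustration, not from the original source: the scheme behaves like a
- * per-zone seqlock. begin_pending_search() publishes the page number and makes
- * the sequence number odd; end_pending_search() makes it even again. Per
- * search_pending() below, an invalidating thread snapshots each zone's counter
- * and spins only while that counter is odd and names the page being removed.)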
- */ -union invalidate_counter { - u64 value; - struct { - u32 page; - u32 counter; - }; -}; - -static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page) -{ - return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter; -} - -static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page) -{ - return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter; -} - -static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page) -{ - return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter; -} - -static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page) -{ - /* Page zero is the header page, so the first chapter index page is page one. */ - return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page; -} - -static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache, - unsigned int zone_number) -{ - return (union invalidate_counter) { - .value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value), - }; -} - -static inline void set_invalidate_counter(struct page_cache *cache, - unsigned int zone_number, - union invalidate_counter invalidate_counter) -{ - WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value, - invalidate_counter.value); -} - -static inline bool search_pending(union invalidate_counter invalidate_counter) -{ - return (invalidate_counter.counter & 1) != 0; -} - -/* Lock the cache for a zone in order to search for a page. */ -static void begin_pending_search(struct page_cache *cache, u32 physical_page, - unsigned int zone_number) -{ - union invalidate_counter invalidate_counter = - get_invalidate_counter(cache, zone_number); - - invalidate_counter.page = physical_page; - invalidate_counter.counter++; - set_invalidate_counter(cache, zone_number, invalidate_counter); - ASSERT_LOG_ONLY(search_pending(invalidate_counter), - "Search is pending for zone %u", zone_number); - /* - * This memory barrier ensures that the write to the invalidate counter is seen by other - * threads before this thread accesses the cached page. The corresponding read memory - * barrier is in wait_for_pending_searches(). - */ - smp_mb(); -} - -/* Unlock the cache for a zone by clearing its invalidate counter. */ -static void end_pending_search(struct page_cache *cache, unsigned int zone_number) -{ - union invalidate_counter invalidate_counter; - - /* - * This memory barrier ensures that this thread completes reads of the - * cached page before other threads see the write to the invalidate - * counter. - */ - smp_mb(); - - invalidate_counter = get_invalidate_counter(cache, zone_number); - ASSERT_LOG_ONLY(search_pending(invalidate_counter), - "Search is pending for zone %u", zone_number); - invalidate_counter.counter++; - set_invalidate_counter(cache, zone_number, invalidate_counter); -} - -static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page) -{ - union invalidate_counter initial_counters[MAX_ZONES]; - unsigned int i; - - /* - * We hold the read_threads_mutex. We are waiting for threads that do not hold the - * read_threads_mutex. Those threads have "locked" their targeted page by setting the - * search_pending_counter. The corresponding write memory barrier is in - * begin_pending_search(). 
- */ - smp_mb(); - - for (i = 0; i < cache->zone_count; i++) - initial_counters[i] = get_invalidate_counter(cache, i); - for (i = 0; i < cache->zone_count; i++) { - if (search_pending(initial_counters[i]) && - (initial_counters[i].page == physical_page)) { - /* - * There is an active search using the physical page. We need to wait for - * the search to finish. - * - * FIXME: Investigate using wait_event() to wait for the search to finish. - */ - while (initial_counters[i].value == - get_invalidate_counter(cache, i).value) - cond_resched(); - } - } -} - -static void release_page_buffer(struct cached_page *page) -{ - if (page->buffer != NULL) - dm_bufio_release(uds_forget(page->buffer)); -} - -static void clear_cache_page(struct page_cache *cache, struct cached_page *page) -{ - /* Do not clear read_pending because the read queue relies on it. */ - release_page_buffer(page); - page->physical_page = cache->indexable_pages; - WRITE_ONCE(page->last_used, 0); -} - -static void make_page_most_recent(struct page_cache *cache, struct cached_page *page) -{ - /* - * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any - * thread holding the read_threads_mutex. - */ - if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used)) - WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock)); -} - -/* Select a page to remove from the cache to make space for a new entry. */ -static struct cached_page *select_victim_in_cache(struct page_cache *cache) -{ - struct cached_page *page; - int oldest_index = 0; - s64 oldest_time = S64_MAX; - s64 last_used; - u16 i; - - /* Find the oldest unclaimed page. We hold the read_threads_mutex. */ - for (i = 0; i < cache->cache_slots; i++) { - /* A page with a pending read must not be replaced. */ - if (cache->cache[i].read_pending) - continue; - - last_used = READ_ONCE(cache->cache[i].last_used); - if (last_used <= oldest_time) { - oldest_time = last_used; - oldest_index = i; - } - } - - page = &cache->cache[oldest_index]; - if (page->physical_page != cache->indexable_pages) { - WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots); - wait_for_pending_searches(cache, page->physical_page); - } - - page->read_pending = true; - clear_cache_page(cache, page); - return page; -} - -/* Make a newly filled cache entry available to other threads. */ -static int put_page_in_cache(struct page_cache *cache, u32 physical_page, - struct cached_page *page) -{ - int result; - - /* We hold the read_threads_mutex. */ - result = ASSERT((page->read_pending), "page to install has a pending read"); - if (result != UDS_SUCCESS) - return result; - - page->physical_page = physical_page; - make_page_most_recent(cache, page); - page->read_pending = false; - - /* - * We hold the read_threads_mutex, but we must have a write memory barrier before making - * the cached_page available to the readers that do not hold the mutex. The corresponding - * read memory barrier is in get_page_and_index(). - */ - smp_wmb(); - - /* This assignment also clears the queued flag. */ - WRITE_ONCE(cache->index[physical_page], page - cache->cache); - return UDS_SUCCESS; -} - -static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page, - struct cached_page *page) -{ - int result; - - /* We hold the read_threads_mutex. 
*/ - result = ASSERT((page->read_pending), "page to install has a pending read"); - if (result != UDS_SUCCESS) - return; - - clear_cache_page(cache, page); - page->read_pending = false; - - /* Clear the mapping and the queued flag for the new page. */ - WRITE_ONCE(cache->index[physical_page], cache->cache_slots); -} - -static inline u16 next_queue_position(u16 position) -{ - return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS; -} - -static inline void advance_queue_position(u16 *position) -{ - *position = next_queue_position(*position); -} - -static inline bool read_queue_is_full(struct page_cache *cache) -{ - return cache->read_queue_first == next_queue_position(cache->read_queue_last); -} - -static bool enqueue_read(struct page_cache *cache, struct uds_request *request, - u32 physical_page) -{ - struct queued_read *queue_entry; - u16 last = cache->read_queue_last; - u16 read_queue_index; - - /* We hold the read_threads_mutex. */ - if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) { - /* This page has no existing entry in the queue. */ - if (read_queue_is_full(cache)) - return false; - - /* Fill in the read queue entry. */ - cache->read_queue[last].physical_page = physical_page; - cache->read_queue[last].invalid = false; - cache->read_queue[last].first_request = NULL; - cache->read_queue[last].last_request = NULL; - - /* Point the cache index to the read queue entry. */ - read_queue_index = last; - WRITE_ONCE(cache->index[physical_page], - read_queue_index | VOLUME_CACHE_QUEUED_FLAG); - - advance_queue_position(&cache->read_queue_last); - } else { - /* It's already queued, so add this request to the existing entry. */ - read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG; - } - - request->next_request = NULL; - queue_entry = &cache->read_queue[read_queue_index]; - if (queue_entry->first_request == NULL) - queue_entry->first_request = request; - else - queue_entry->last_request->next_request = request; - queue_entry->last_request = request; - - return true; -} - -static void enqueue_page_read(struct volume *volume, struct uds_request *request, - u32 physical_page) -{ - /* Mark the page as queued, so that chapter invalidation knows to cancel a read. */ - while (!enqueue_read(&volume->page_cache, request, physical_page)) { - uds_log_debug("Read queue full, waiting for reads to finish"); - uds_wait_cond(&volume->read_threads_read_done_cond, - &volume->read_threads_mutex); - } - - uds_signal_cond(&volume->read_threads_cond); -} - -/* - * Reserve the next read queue entry for processing, but do not actually remove it from the queue. - * Must be followed by release_queued_requests(). - */ -static struct queued_read *reserve_read_queue_entry(struct page_cache *cache) -{ - /* We hold the read_threads_mutex. */ - struct queued_read *entry; - u16 index_value; - bool queued; - - /* No items to dequeue */ - if (cache->read_queue_next_read == cache->read_queue_last) - return NULL; - - entry = &cache->read_queue[cache->read_queue_next_read]; - index_value = cache->index[entry->physical_page]; - queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; - /* Check to see if it's still queued before resetting. */ - if (entry->invalid && queued) - WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots); - - /* - * If a synchronous read has taken this page, set invalid to true so it doesn't get - * overwritten. Requests will just be requeued. 
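-	 *
-	 * (Summary for illustration: cache->index[] encodes three states for each
-	 * physical page. A value below cache_slots is the cache slot holding the
-	 * page; a value with VOLUME_CACHE_QUEUED_FLAG set is a read queue index
-	 * for a pending read; the value cache_slots means the page is absent.)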
- */ - if (!queued) - entry->invalid = true; - - entry->reserved = true; - advance_queue_position(&cache->read_queue_next_read); - return entry; -} - -static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume) -{ - struct queued_read *queue_entry = NULL; - - while (!volume->read_threads_exiting) { - queue_entry = reserve_read_queue_entry(&volume->page_cache); - if (queue_entry != NULL) - break; - - uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex); - } - - return queue_entry; -} - -static int init_chapter_index_page(const struct volume *volume, u8 *index_page, - u32 chapter, u32 index_page_number, - struct delta_index_page *chapter_index_page) -{ - u64 ci_virtual; - u32 ci_chapter; - u32 lowest_list; - u32 highest_list; - struct index_geometry *geometry = volume->geometry; - int result; - - result = uds_initialize_chapter_index_page(chapter_index_page, geometry, - index_page, volume->nonce); - if (volume->lookup_mode == LOOKUP_FOR_REBUILD) - return result; - - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "Reading chapter index page for chapter %u page %u", - chapter, index_page_number); - } - - uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number, - &lowest_list, &highest_list); - ci_virtual = chapter_index_page->virtual_chapter_number; - ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual); - if ((chapter == ci_chapter) && - (lowest_list == chapter_index_page->lowest_list_number) && - (highest_list == chapter_index_page->highest_list_number)) - return UDS_SUCCESS; - - uds_log_warning("Index page map updated to %llu", - (unsigned long long) volume->index_page_map->last_update); - uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u", - chapter, index_page_number, lowest_list, highest_list, - (unsigned long long) ci_virtual, - chapter_index_page->lowest_list_number, - chapter_index_page->highest_list_number); - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "index page map mismatch with chapter index"); -} - -static int initialize_index_page(const struct volume *volume, u32 physical_page, - struct cached_page *page) -{ - u32 chapter = map_to_chapter_number(volume->geometry, physical_page); - u32 index_page_number = map_to_page_number(volume->geometry, physical_page); - - return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer), - chapter, index_page_number, &page->index_page); -} - -static bool search_record_page(const u8 record_page[], - const struct uds_record_name *name, - const struct index_geometry *geometry, - struct uds_record_data *metadata) -{ - /* - * The array of records is sorted by name and stored as a binary tree in heap order, so the - * root of the tree is the first array element. - */ - u32 node = 0; - const struct uds_volume_record *records = (const struct uds_volume_record *) record_page; - - while (node < geometry->records_per_page) { - int result; - const struct uds_volume_record *record = &records[node]; - - result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE); - if (result == 0) { - if (metadata != NULL) - *metadata = record->data; - return true; - } - - /* The children of node N are at indexes 2N+1 and 2N+2. */ - node = ((2 * node) + ((result < 0) ? 
1 : 2)); - } - - return false; -} - -/* - * If we've read in a record page, we're going to do an immediate search, to speed up processing by - * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If - * we've read in an index page, we save the record page number so we don't have to resolve the - * index page again. We use the location, virtual_chapter, and old_metadata fields in the request - * to allow the index code to know where to begin processing the request again. - */ -static int search_page(struct cached_page *page, const struct volume *volume, - struct uds_request *request, u32 physical_page) -{ - int result; - enum uds_index_region location; - u16 record_page_number; - - if (is_record_page(volume->geometry, physical_page)) { - if (search_record_page(dm_bufio_get_block_data(page->buffer), - &request->record_name, volume->geometry, - &request->old_metadata)) - location = UDS_LOCATION_RECORD_PAGE_LOOKUP; - else - location = UDS_LOCATION_UNAVAILABLE; - } else { - result = uds_search_chapter_index_page(&page->index_page, - volume->geometry, - &request->record_name, - &record_page_number); - if (result != UDS_SUCCESS) - return result; - - if (record_page_number == NO_CHAPTER_INDEX_ENTRY) { - location = UDS_LOCATION_UNAVAILABLE; - } else { - location = UDS_LOCATION_INDEX_PAGE_LOOKUP; - *((u16 *) &request->old_metadata) = record_page_number; - } - } - - request->location = location; - request->found = false; - return UDS_SUCCESS; -} - -static int process_entry(struct volume *volume, struct queued_read *entry) -{ - u32 page_number = entry->physical_page; - struct uds_request *request; - struct cached_page *page = NULL; - u8 *page_data; - int result; - - if (entry->invalid) { - uds_log_debug("Requeuing requests for invalid page"); - return UDS_SUCCESS; - } - - page = select_victim_in_cache(&volume->page_cache); - - mutex_unlock(&volume->read_threads_mutex); - page_data = dm_bufio_read(volume->client, page_number, &page->buffer); - mutex_lock(&volume->read_threads_mutex); - if (IS_ERR(page_data)) { - result = -PTR_ERR(page_data); - uds_log_warning_strerror(result, - "error reading physical page %u from volume", - page_number); - cancel_page_in_cache(&volume->page_cache, page_number, page); - return result; - } - - if (entry->invalid) { - uds_log_warning("Page %u invalidated after read", page_number); - cancel_page_in_cache(&volume->page_cache, page_number, page); - return UDS_SUCCESS; - } - - if (!is_record_page(volume->geometry, page_number)) { - result = initialize_index_page(volume, page_number, page); - if (result != UDS_SUCCESS) { - uds_log_warning("Error initializing chapter index page"); - cancel_page_in_cache(&volume->page_cache, page_number, page); - return result; - } - } - - result = put_page_in_cache(&volume->page_cache, page_number, page); - if (result != UDS_SUCCESS) { - uds_log_warning("Error putting page %u in cache", page_number); - cancel_page_in_cache(&volume->page_cache, page_number, page); - return result; - } - - request = entry->first_request; - while ((request != NULL) && (result == UDS_SUCCESS)) { - result = search_page(page, volume, request, page_number); - request = request->next_request; - } - - return result; -} - -static void release_queued_requests(struct volume *volume, struct queued_read *entry, - int result) -{ - struct page_cache *cache = &volume->page_cache; - u16 next_read = cache->read_queue_next_read; - struct uds_request *request; - struct uds_request *next; - - for (request = entry->first_request; request != 
NULL; request = next) {
-		next = request->next_request;
-		request->status = result;
-		request->requeued = true;
-		uds_enqueue_request(request, STAGE_INDEX);
-	}
-
-	entry->reserved = false;
-
-	/* Move the read_queue_first pointer as far as we can. */
-	while ((cache->read_queue_first != next_read) &&
-	       (!cache->read_queue[cache->read_queue_first].reserved))
-		advance_queue_position(&cache->read_queue_first);
-	uds_broadcast_cond(&volume->read_threads_read_done_cond);
-}
-
-static void read_thread_function(void *arg)
-{
-	struct volume *volume = arg;
-
-	uds_log_debug("reader starting");
-	mutex_lock(&volume->read_threads_mutex);
-	while (true) {
-		struct queued_read *queue_entry;
-		int result;
-
-		queue_entry = wait_to_reserve_read_queue_entry(volume);
-		if (volume->read_threads_exiting)
-			break;
-
-		result = process_entry(volume, queue_entry);
-		release_queued_requests(volume, queue_entry, result);
-	}
-	mutex_unlock(&volume->read_threads_mutex);
-	uds_log_debug("reader done");
-}
-
-static void get_page_and_index(struct page_cache *cache, u32 physical_page,
-			       int *queue_index, struct cached_page **page_ptr)
-{
-	u16 index_value;
-	u16 index;
-	bool queued;
-
-	/*
-	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
-	 * thread holding the read_threads_mutex.
-	 *
-	 * Holding only a search_pending_counter is the most frequent case.
-	 */
-	/*
-	 * It would be unlikely for the compiler to turn the usage of index_value into two reads of
-	 * cache->index, but it would be possible and very bad if those reads did not return the
-	 * same bits.
-	 */
-	index_value = READ_ONCE(cache->index[physical_page]);
-	queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
-	index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
-
-	if (!queued && (index < cache->cache_slots)) {
-		*page_ptr = &cache->cache[index];
-		/*
-		 * We have acquired access to the cached page, but unless we hold the
-		 * read_threads_mutex, we need a read memory barrier now. The corresponding write
-		 * memory barrier is in put_page_in_cache().
-		 */
-		smp_rmb();
-	} else {
-		*page_ptr = NULL;
-	}
-
-	*queue_index = queued ? index : -1;
-}
-
-static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
-				struct cached_page **page)
-{
-	/*
-	 * ASSERTION: We are in a zone thread.
-	 * ASSERTION: We are holding a search_pending_counter or the read_threads_mutex.
- */ - int queue_index = -1; - - get_page_and_index(cache, physical_page, &queue_index, page); -} - -static int read_page_locked(struct volume *volume, u32 physical_page, - struct cached_page **page_ptr) -{ - int result = UDS_SUCCESS; - struct cached_page *page = NULL; - u8 *page_data; - - page = select_victim_in_cache(&volume->page_cache); - page_data = dm_bufio_read(volume->client, physical_page, &page->buffer); - if (IS_ERR(page_data)) { - result = -PTR_ERR(page_data); - uds_log_warning_strerror(result, - "error reading physical page %u from volume", - physical_page); - cancel_page_in_cache(&volume->page_cache, physical_page, page); - return result; - } - - if (!is_record_page(volume->geometry, physical_page)) { - result = initialize_index_page(volume, physical_page, page); - if (result != UDS_SUCCESS) { - if (volume->lookup_mode != LOOKUP_FOR_REBUILD) - uds_log_warning("Corrupt index page %u", physical_page); - cancel_page_in_cache(&volume->page_cache, physical_page, page); - return result; - } - } - - result = put_page_in_cache(&volume->page_cache, physical_page, page); - if (result != UDS_SUCCESS) { - uds_log_warning("Error putting page %u in cache", physical_page); - cancel_page_in_cache(&volume->page_cache, physical_page, page); - return result; - } - - *page_ptr = page; - return UDS_SUCCESS; -} - -/* Retrieve a page from the cache while holding the read threads mutex. */ -static int get_volume_page_locked(struct volume *volume, u32 physical_page, - struct cached_page **page_ptr) -{ - int result; - struct cached_page *page = NULL; - - get_page_from_cache(&volume->page_cache, physical_page, &page); - if (page == NULL) { - result = read_page_locked(volume, physical_page, &page); - if (result != UDS_SUCCESS) - return result; - } else { - make_page_most_recent(&volume->page_cache, page); - } - - *page_ptr = page; - return UDS_SUCCESS; -} - -/* Retrieve a page from the cache while holding a search_pending lock. */ -static int get_volume_page_protected(struct volume *volume, struct uds_request *request, - u32 physical_page, struct cached_page **page_ptr) -{ - struct cached_page *page; - - get_page_from_cache(&volume->page_cache, physical_page, &page); - if (page != NULL) { - if (request->zone_number == 0) { - /* Only one zone is allowed to update the LRU. */ - make_page_most_recent(&volume->page_cache, page); - } - - *page_ptr = page; - return UDS_SUCCESS; - } - - /* Prepare to enqueue a read for the page. */ - end_pending_search(&volume->page_cache, request->zone_number); - mutex_lock(&volume->read_threads_mutex); - - /* - * Do the lookup again while holding the read mutex (no longer the fast case so this should - * be fine to repeat). We need to do this because a page may have been added to the cache - * by a reader thread between the time we searched above and the time we went to actually - * try to enqueue it below. This could result in us enqueuing another read for a page which - * is already in the cache, which would mean we end up with two entries in the cache for - * the same page. - */ - get_page_from_cache(&volume->page_cache, physical_page, &page); - if (page == NULL) { - enqueue_page_read(volume, request, physical_page); - /* - * The performance gain from unlocking first, while "search pending" mode is off, - * turns out to be significant in some cases. The page is not available yet so - * the order does not matter for correctness as it does below. 
- */ - mutex_unlock(&volume->read_threads_mutex); - begin_pending_search(&volume->page_cache, physical_page, - request->zone_number); - return UDS_QUEUED; - } - - /* - * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and - * "search pending" state in careful order so no other thread can mess with the data before - * the caller gets to look at it. - */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); - mutex_unlock(&volume->read_threads_mutex); - *page_ptr = page; - return UDS_SUCCESS; -} - -static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number, - struct cached_page **page_ptr) -{ - int result; - u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number); - - mutex_lock(&volume->read_threads_mutex); - result = get_volume_page_locked(volume, physical_page, page_ptr); - mutex_unlock(&volume->read_threads_mutex); - return result; -} - -int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number, - u8 **data_ptr) -{ - int result; - struct cached_page *page = NULL; - - result = get_volume_page(volume, chapter, page_number, &page); - if (result == UDS_SUCCESS) - *data_ptr = dm_bufio_get_block_data(page->buffer); - return result; -} - -int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number, - struct delta_index_page **index_page_ptr) -{ - int result; - struct cached_page *page = NULL; - - result = get_volume_page(volume, chapter, page_number, &page); - if (result == UDS_SUCCESS) - *index_page_ptr = &page->index_page; - return result; -} - -/* - * Find the record page associated with a name in a given index page. This will return UDS_QUEUED - * if the page in question must be read from storage. - */ -static int search_cached_index_page(struct volume *volume, struct uds_request *request, - u32 chapter, u32 index_page_number, - u16 *record_page_number) -{ - int result; - struct cached_page *page = NULL; - u32 physical_page = map_to_physical_page(volume->geometry, chapter, - index_page_number); - - /* - * Make sure the invalidate counter is updated before we try and read the mapping. This - * prevents this thread from reading a page in the cache which has already been marked for - * invalidation by the reader thread, before the reader thread has noticed that the - * invalidate_counter has been incremented. - */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); - - result = get_volume_page_protected(volume, request, physical_page, &page); - if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); - return result; - } - - result = uds_search_chapter_index_page(&page->index_page, volume->geometry, - &request->record_name, - record_page_number); - end_pending_search(&volume->page_cache, request->zone_number); - return result; -} - -/* - * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if - * the page in question must be read from storage. 
- */ -int uds_search_cached_record_page(struct volume *volume, struct uds_request *request, - u32 chapter, u16 record_page_number, bool *found) -{ - struct cached_page *record_page; - struct index_geometry *geometry = volume->geometry; - int result; - u32 physical_page, page_number; - - *found = false; - if (record_page_number == NO_CHAPTER_INDEX_ENTRY) - return UDS_SUCCESS; - - result = ASSERT(record_page_number < geometry->record_pages_per_chapter, - "0 <= %d < %u", record_page_number, - geometry->record_pages_per_chapter); - if (result != UDS_SUCCESS) - return result; - - page_number = geometry->index_pages_per_chapter + record_page_number; - - physical_page = map_to_physical_page(volume->geometry, chapter, page_number); - - /* - * Make sure the invalidate counter is updated before we try and read the mapping. This - * prevents this thread from reading a page in the cache which has already been marked for - * invalidation by the reader thread, before the reader thread has noticed that the - * invalidate_counter has been incremented. - */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); - - result = get_volume_page_protected(volume, request, physical_page, &record_page); - if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); - return result; - } - - if (search_record_page(dm_bufio_get_block_data(record_page->buffer), - &request->record_name, geometry, &request->old_metadata)) - *found = true; - - end_pending_search(&volume->page_cache, request->zone_number); - return UDS_SUCCESS; -} - -void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter) -{ - const struct index_geometry *geometry = volume->geometry; - u32 physical_page = map_to_physical_page(geometry, chapter, 0); - - dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter); -} - -int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter, - struct dm_buffer *volume_buffers[], - struct delta_index_page index_pages[]) -{ - int result; - u32 i; - const struct index_geometry *geometry = volume->geometry; - u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter); - u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0); - - dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter); - for (i = 0; i < geometry->index_pages_per_chapter; i++) { - u8 *index_page; - - index_page = dm_bufio_read(volume->client, physical_page + i, - &volume_buffers[i]); - if (IS_ERR(index_page)) { - result = -PTR_ERR(index_page); - uds_log_warning_strerror(result, - "error reading physical page %u", - physical_page); - return result; - } - - result = init_chapter_index_page(volume, index_page, physical_chapter, i, - &index_pages[i]); - if (result != UDS_SUCCESS) - return result; - } - - return UDS_SUCCESS; -} - -int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request, - bool *found) -{ - int result; - u32 physical_chapter = - uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter); - u32 index_page_number; - u16 record_page_number; - - index_page_number = uds_find_index_page_number(volume->index_page_map, - &request->record_name, - physical_chapter); - - if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) { - record_page_number = *((u16 *) &request->old_metadata); - } else { - result = search_cached_index_page(volume, request, physical_chapter, - index_page_number, - &record_page_number); - if (result != 
UDS_SUCCESS)
-			return result;
-	}
-
-	return uds_search_cached_record_page(volume, request, physical_chapter,
-					     record_page_number, found);
-}
-
-int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
-					     const struct uds_record_name *name,
-					     u64 virtual_chapter, bool *found)
-{
-	int result;
-	struct index_geometry *geometry = volume->geometry;
-	struct cached_page *page;
-	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
-	u32 index_page_number;
-	u16 record_page_number;
-	u32 page_number;
-
-	*found = false;
-	index_page_number =
-		uds_find_index_page_number(volume->index_page_map, name,
-					   physical_chapter);
-	result = get_volume_page(volume, physical_chapter, index_page_number, &page);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	result = uds_search_chapter_index_page(&page->index_page, geometry, name,
-					       &record_page_number);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
-		return UDS_SUCCESS;
-
-	page_number = geometry->index_pages_per_chapter + record_page_number;
-	result = get_volume_page(volume, physical_chapter, page_number, &page);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	*found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
-				    geometry, NULL);
-	return UDS_SUCCESS;
-}
-
-static void invalidate_page(struct page_cache *cache, u32 physical_page)
-{
-	struct cached_page *page;
-	int queue_index = -1;
-
-	/* We hold the read_threads_mutex. */
-	get_page_and_index(cache, physical_page, &queue_index, &page);
-	if (page != NULL) {
-		WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
-		wait_for_pending_searches(cache, page->physical_page);
-		clear_cache_page(cache, page);
-	} else if (queue_index > -1) {
-		uds_log_debug("setting pending read to invalid");
-		cache->read_queue[queue_index].invalid = true;
-	}
-}
-
-void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
-{
-	u32 physical_chapter =
-		uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
-	u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
-	u32 i;
-
-	uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
-	mutex_lock(&volume->read_threads_mutex);
-	for (i = 0; i < volume->geometry->pages_per_chapter; i++)
-		invalidate_page(&volume->page_cache, first_page + i);
-	mutex_unlock(&volume->read_threads_mutex);
-}
-
-/*
- * Donate an index page from a newly written chapter to the page cache, since it is likely to be
- * used again soon. The caller must already hold the reader thread mutex.
- */
-static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
-				    u32 index_page_number, struct dm_buffer *page_buffer)
-{
-	int result;
-	struct cached_page *page = NULL;
-	u32 physical_page =
-		map_to_physical_page(volume->geometry, physical_chapter,
-				     index_page_number);
-
-	page = select_victim_in_cache(&volume->page_cache);
-	page->buffer = page_buffer;
-	result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
-					 physical_chapter, index_page_number,
-					 &page->index_page);
-	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error initializing chapter index page");
-		cancel_page_in_cache(&volume->page_cache, physical_page, page);
-		return result;
-	}
-
-	result = put_page_in_cache(&volume->page_cache, physical_page, page);
-	if (result != UDS_SUCCESS) {
-		uds_log_warning("Error putting page %u in cache", physical_page);
-		cancel_page_in_cache(&volume->page_cache, physical_page, page);
-		return result;
-	}
-
-	return UDS_SUCCESS;
-}
-
-static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
-			     struct open_chapter_index *chapter_index)
-{
-	struct index_geometry *geometry = volume->geometry;
-	struct dm_buffer *page_buffer;
-	u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
-	u32 delta_list_number = 0;
-	u32 index_page_number;
-
-	for (index_page_number = 0;
-	     index_page_number < geometry->index_pages_per_chapter;
-	     index_page_number++) {
-		u8 *page_data;
-		u32 physical_page = first_index_page + index_page_number;
-		u32 lists_packed;
-		bool last_page;
-		int result;
-
-		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
-		if (IS_ERR(page_data)) {
-			return uds_log_warning_strerror(-PTR_ERR(page_data),
-							"failed to prepare index page");
-		}
-
-		last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
-		result = uds_pack_open_chapter_index_page(chapter_index, page_data,
-							  delta_list_number, last_page,
-							  &lists_packed);
-		if (result != UDS_SUCCESS) {
-			dm_bufio_release(page_buffer);
-			return uds_log_warning_strerror(result,
-							"failed to pack index page");
-		}
-
-		dm_bufio_mark_buffer_dirty(page_buffer);
-
-		if (lists_packed == 0) {
-			uds_log_debug("no delta lists packed on chapter %u page %u",
-				      physical_chapter_number, index_page_number);
-		} else {
-			delta_list_number += lists_packed;
-		}
-
-		uds_update_index_page_map(volume->index_page_map,
-					  chapter_index->virtual_chapter_number,
-					  physical_chapter_number, index_page_number,
-					  delta_list_number - 1);
-
-		mutex_lock(&volume->read_threads_mutex);
-		result = donate_index_page_locked(volume, physical_chapter_number,
-						  index_page_number, page_buffer);
-		mutex_unlock(&volume->read_threads_mutex);
-		if (result != UDS_SUCCESS) {
-			dm_bufio_release(page_buffer);
-			return result;
-		}
-	}
-
-	return UDS_SUCCESS;
-}
-
-static u32 encode_tree(u8 record_page[],
-		       const struct uds_volume_record *sorted_pointers[],
-		       u32 next_record, u32 node, u32 node_count)
-{
-	if (node < node_count) {
-		u32 child = (2 * node) + 1;
-
-		next_record = encode_tree(record_page, sorted_pointers, next_record,
-					  child, node_count);
-
-		/*
-		 * In-order traversal: copy the contents of the next record into the page at the
-		 * node offset.
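-		 *
-		 * (Worked example: with node_count == 7, the in-order walk visits heap
-		 * slots 3, 1, 4, 0, 5, 2, 6, so the seven sorted records land in those
-		 * slots. Slot 0 then holds the median, and search_record_page() can
-		 * binary-search the page by following the 2N+1 and 2N+2 children.)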
- */ - memcpy(&record_page[node * BYTES_PER_RECORD], - sorted_pointers[next_record++], BYTES_PER_RECORD); - - next_record = encode_tree(record_page, sorted_pointers, next_record, - child + 1, node_count); - } - - return next_record; -} - -static int encode_record_page(const struct volume *volume, - const struct uds_volume_record records[], u8 record_page[]) -{ - int result; - u32 i; - u32 records_per_page = volume->geometry->records_per_page; - const struct uds_volume_record **record_pointers = volume->record_pointers; - - for (i = 0; i < records_per_page; i++) - record_pointers[i] = &records[i]; - - /* - * Sort the record pointers by using just the names in the records, which is less work than - * sorting the entire record values. - */ - BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0); - result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers, - records_per_page, UDS_RECORD_NAME_SIZE); - if (result != UDS_SUCCESS) - return result; - - encode_tree(record_page, record_pointers, 0, 0, records_per_page); - return UDS_SUCCESS; -} - -static int write_record_pages(struct volume *volume, u32 physical_chapter_number, - const struct uds_volume_record *records) -{ - u32 record_page_number; - struct index_geometry *geometry = volume->geometry; - struct dm_buffer *page_buffer; - const struct uds_volume_record *next_record = records; - u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number, - geometry->index_pages_per_chapter); - - for (record_page_number = 0; - record_page_number < geometry->record_pages_per_chapter; - record_page_number++) { - u8 *page_data; - u32 physical_page = first_record_page + record_page_number; - int result; - - page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); - if (IS_ERR(page_data)) { - return uds_log_warning_strerror(-PTR_ERR(page_data), - "failed to prepare record page"); - } - - result = encode_record_page(volume, next_record, page_data); - if (result != UDS_SUCCESS) { - dm_bufio_release(page_buffer); - return uds_log_warning_strerror(result, - "failed to encode record page %u", - record_page_number); - } - - next_record += geometry->records_per_page; - dm_bufio_mark_buffer_dirty(page_buffer); - dm_bufio_release(page_buffer); - } - - return UDS_SUCCESS; -} - -int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index, - const struct uds_volume_record *records) -{ - int result; - u32 physical_chapter_number = - uds_map_to_physical_chapter(volume->geometry, - chapter_index->virtual_chapter_number); - - result = write_index_pages(volume, physical_chapter_number, chapter_index); - if (result != UDS_SUCCESS) - return result; - - result = write_record_pages(volume, physical_chapter_number, records); - if (result != UDS_SUCCESS) - return result; - - result = -dm_bufio_write_dirty_buffers(volume->client); - if (result != UDS_SUCCESS) - uds_log_error_strerror(result, "cannot sync chapter to volume"); - - return result; -} - -static void probe_chapter(struct volume *volume, u32 chapter_number, - u64 *virtual_chapter_number) -{ - const struct index_geometry *geometry = volume->geometry; - u32 expected_list_number = 0; - u32 i; - u64 vcn = BAD_CHAPTER; - - *virtual_chapter_number = BAD_CHAPTER; - dm_bufio_prefetch(volume->client, - map_to_physical_page(geometry, chapter_number, 0), - geometry->index_pages_per_chapter); - - for (i = 0; i < geometry->index_pages_per_chapter; i++) { - struct delta_index_page *page; - int result; - - result = uds_get_volume_index_page(volume, 
chapter_number, i, &page); - if (result != UDS_SUCCESS) - return; - - if (page->virtual_chapter_number == BAD_CHAPTER) { - uds_log_error("corrupt index page in chapter %u", - chapter_number); - return; - } - - if (vcn == BAD_CHAPTER) { - vcn = page->virtual_chapter_number; - } else if (page->virtual_chapter_number != vcn) { - uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu", - chapter_number, i, (unsigned long long) vcn, - (unsigned long long) page->virtual_chapter_number); - return; - } - - if (expected_list_number != page->lowest_list_number) { - uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u", - chapter_number, i, expected_list_number, - page->lowest_list_number); - return; - } - expected_list_number = page->highest_list_number + 1; - - result = uds_validate_chapter_index_page(page, geometry); - if (result != UDS_SUCCESS) - return; - } - - if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) { - uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number, - (unsigned long long) vcn, geometry->chapters_per_volume); - return; - } - - *virtual_chapter_number = vcn; -} - -/* Find the last valid physical chapter in the volume. */ -static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr) -{ - u32 span = 1; - u32 tries = 0; - - while (limit > 0) { - u32 chapter = (span > limit) ? 0 : limit - span; - u64 vcn = 0; - - probe_chapter(volume, chapter, &vcn); - if (vcn == BAD_CHAPTER) { - limit = chapter; - if (++tries > 1) - span *= 2; - } else { - if (span == 1) - break; - span /= 2; - tries = 0; - } - } - - *limit_ptr = limit; -} - -static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn, - u64 *highest_vcn) -{ - struct index_geometry *geometry = volume->geometry; - u64 zero_vcn; - u64 lowest = BAD_CHAPTER; - u64 highest = BAD_CHAPTER; - u64 moved_chapter = BAD_CHAPTER; - u32 left_chapter = 0; - u32 right_chapter = 0; - u32 bad_chapters = 0; - - /* - * This method assumes there is at most one run of contiguous bad chapters caused by - * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the - * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the - * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately - * precedes the lowest one. - */ - - /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */ - probe_chapter(volume, 0, &zero_vcn); - - /* - * Binary search for end of the discontinuity in the monotonically increasing virtual - * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're - * searching for the index of the smallest value less than zero_vcn. In the case we go off - * the end it means that chapter 0 has the lowest vcn. - * - * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always - * skip over the moved chapter when searching, adding it to the range at the end if - * necessary. 
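- *
- * For example, if successive physical chapters hold vcns [8, 9, 3, 4, 5, 6, 7], then zero_vcn
- * is 8 and the search converges on physical chapter 2, which holds the lowest vcn, 3.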
- */ - if (geometry->remapped_physical > 0) { - u64 remapped_vcn; - - probe_chapter(volume, geometry->remapped_physical, &remapped_vcn); - if (remapped_vcn == geometry->remapped_virtual) - moved_chapter = geometry->remapped_physical; - } - - left_chapter = 0; - right_chapter = chapter_limit; - - while (left_chapter < right_chapter) { - u64 probe_vcn; - u32 chapter = (left_chapter + right_chapter) / 2; - - if (chapter == moved_chapter) - chapter--; - - probe_chapter(volume, chapter, &probe_vcn); - if (zero_vcn <= probe_vcn) { - left_chapter = chapter + 1; - if (left_chapter == moved_chapter) - left_chapter++; - } else { - right_chapter = chapter; - } - } - - /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number.*/ - if (left_chapter >= chapter_limit) - left_chapter = 0; - - /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */ - probe_chapter(volume, left_chapter, &lowest); - - /* The moved chapter might be the lowest in the range. */ - if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1)) - lowest = geometry->remapped_virtual; - - /* - * Circularly scan backwards, moving over any bad chapters until encountering a good one, - * which is the chapter with the highest vcn. - */ - while (highest == BAD_CHAPTER) { - right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit; - if (right_chapter == moved_chapter) - continue; - - probe_chapter(volume, right_chapter, &highest); - if (bad_chapters++ >= MAX_BAD_CHAPTERS) { - uds_log_error("too many bad chapters in volume: %u", - bad_chapters); - return UDS_CORRUPT_DATA; - } - } - - *lowest_vcn = lowest; - *highest_vcn = highest; - return UDS_SUCCESS; -} - -/* - * Find the highest and lowest contiguous chapters present in the volume and determine their - * virtual chapter numbers. This is used by rebuild. 
- */ -int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn, - u64 *highest_vcn, bool *is_empty) -{ - u32 chapter_limit = volume->geometry->chapters_per_volume; - - find_real_end_of_volume(volume, chapter_limit, &chapter_limit); - if (chapter_limit == 0) { - *lowest_vcn = 0; - *highest_vcn = 0; - *is_empty = true; - return UDS_SUCCESS; - } - - *is_empty = false; - return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn); -} - -int __must_check uds_replace_volume_storage(struct volume *volume, - struct index_layout *layout, - struct block_device *bdev) -{ - int result; - u32 i; - - result = uds_replace_index_layout_storage(layout, bdev); - if (result != UDS_SUCCESS) - return result; - - /* Release all outstanding dm_bufio objects */ - for (i = 0; i < volume->page_cache.indexable_pages; i++) - volume->page_cache.index[i] = volume->page_cache.cache_slots; - for (i = 0; i < volume->page_cache.cache_slots; i++) - clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]); - if (volume->sparse_cache != NULL) - uds_invalidate_sparse_cache(volume->sparse_cache); - if (volume->client != NULL) - dm_bufio_client_destroy(uds_forget(volume->client)); - - return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page, - volume->reserved_buffers, &volume->client); -} - -static int __must_check initialize_page_cache(struct page_cache *cache, - const struct index_geometry *geometry, - u32 chapters_in_cache, - unsigned int zone_count) -{ - int result; - u32 i; - - cache->indexable_pages = geometry->pages_per_volume + 1; - cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter; - cache->zone_count = zone_count; - atomic64_set(&cache->clock, 1); - - result = ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES), - "requested cache size, %u, within limit %u", - cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, - "volume read queue", &cache->read_queue); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(cache->zone_count, struct search_pending_counter, - "Volume Cache Zones", &cache->search_pending_counters); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(cache->indexable_pages, u16, "page cache index", - &cache->index); - if (result != UDS_SUCCESS) - return result; - - result = uds_allocate(cache->cache_slots, struct cached_page, "page cache cache", - &cache->cache); - if (result != UDS_SUCCESS) - return result; - - /* Initialize index values to invalid values. 
*/
-	for (i = 0; i < cache->indexable_pages; i++)
-		cache->index[i] = cache->cache_slots;
-
-	for (i = 0; i < cache->cache_slots; i++)
-		clear_cache_page(cache, &cache->cache[i]);
-
-	return UDS_SUCCESS;
-}
-
-int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout,
-		    struct volume **new_volume)
-{
-	unsigned int i;
-	struct volume *volume = NULL;
-	struct index_geometry *geometry;
-	unsigned int reserved_buffers;
-	int result;
-
-	result = uds_allocate(1, struct volume, "volume", &volume);
-	if (result != UDS_SUCCESS)
-		return result;
-
-	volume->nonce = uds_get_volume_nonce(layout);
-
-	result = uds_copy_index_geometry(config->geometry, &volume->geometry);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return uds_log_warning_strerror(result,
-						"failed to allocate geometry");
-	}
-	geometry = volume->geometry;
-
-	/*
-	 * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one
-	 * for each entry in the sparse cache.
-	 */
-	reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter;
-	reserved_buffers += 1;
-	if (uds_is_sparse_index_geometry(geometry))
-		reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter);
-	volume->reserved_buffers = reserved_buffers;
-	result = uds_open_volume_bufio(layout, geometry->bytes_per_page,
-				       volume->reserved_buffers, &volume->client);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	result = uds_make_radix_sorter(geometry->records_per_page,
-				       &volume->radix_sorter);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	result = uds_allocate(geometry->records_per_page,
-			      const struct uds_volume_record *, "record pointers",
-			      &volume->record_pointers);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	if (uds_is_sparse_index_geometry(geometry)) {
-		size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page;
-
-		result = uds_make_sparse_cache(geometry, config->cache_chapters,
-					       config->zone_count,
-					       &volume->sparse_cache);
-		if (result != UDS_SUCCESS) {
-			uds_free_volume(volume);
-			return result;
-		}
-
-		volume->cache_size =
-			page_size * geometry->index_pages_per_chapter * config->cache_chapters;
-	}
-
-	result = initialize_page_cache(&volume->page_cache, geometry,
-				       config->cache_chapters, config->zone_count);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page);
-	result = uds_make_index_page_map(geometry, &volume->index_page_map);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	mutex_init(&volume->read_threads_mutex);
-	uds_init_cond(&volume->read_threads_read_done_cond);
-	uds_init_cond(&volume->read_threads_cond);
-
-	result = uds_allocate(config->read_threads, struct thread *, "reader threads",
-			      &volume->reader_threads);
-	if (result != UDS_SUCCESS) {
-		uds_free_volume(volume);
-		return result;
-	}
-
-	for (i = 0; i < config->read_threads; i++) {
-		result = vdo_create_thread(read_thread_function, (void *) volume,
-					   "reader", &volume->reader_threads[i]);
-		if (result != UDS_SUCCESS) {
-			uds_free_volume(volume);
-			return result;
-		}
-
-		volume->read_thread_count = i + 1;
-	}
-
-	*new_volume = volume;
-	return UDS_SUCCESS;
-}
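/*
 * An illustrative worked example of the reservation arithmetic in
 * uds_make_volume() (the numbers are hypothetical, not a required geometry):
 * with cache_chapters = 10, record_pages_per_chapter = 256, and
 * index_pages_per_chapter = 6, a dense index reserves 10 * 256 + 1 = 2561
 * buffers (one per cached record page plus one for the chapter writer), and
 * a sparse index reserves 10 * 6 = 60 more for the sparse cache, 2621 in
 * total.
 */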
-
-static void uninitialize_page_cache(struct page_cache *cache)
-{
-	u16 i;
-
-	if (cache->cache != NULL) {
-		for (i = 0; i < cache->cache_slots; i++)
-			release_page_buffer(&cache->cache[i]);
-	}
-	uds_free(cache->index);
-	uds_free(cache->cache);
-	uds_free(cache->search_pending_counters);
-	uds_free(cache->read_queue);
-}
-
-void uds_free_volume(struct volume *volume)
-{
-	if (volume == NULL)
-		return;
-
-	if (volume->reader_threads != NULL) {
-		unsigned int i;
-
-		/* This works even if some threads weren't started. */
-		mutex_lock(&volume->read_threads_mutex);
-		volume->read_threads_exiting = true;
-		uds_broadcast_cond(&volume->read_threads_cond);
-		mutex_unlock(&volume->read_threads_mutex);
-		for (i = 0; i < volume->read_thread_count; i++)
-			vdo_join_threads(volume->reader_threads[i]);
-		uds_free(volume->reader_threads);
-		volume->reader_threads = NULL;
-	}
-
-	/* Must destroy the client AFTER freeing the cached pages. */
-	uninitialize_page_cache(&volume->page_cache);
-	uds_free_sparse_cache(volume->sparse_cache);
-	if (volume->client != NULL)
-		dm_bufio_client_destroy(uds_forget(volume->client));
-
-	uds_free_index_page_map(volume->index_page_map);
-	uds_free_radix_sorter(volume->radix_sorter);
-	uds_free(volume->geometry);
-	uds_free(volume->record_pointers);
-	uds_free(volume);
-}
diff --git a/drivers/md/dm-vdo/volume.h b/drivers/md/dm-vdo/volume.h
deleted file mode 100644
index 290de5cbf9ec8..0000000000000
--- a/drivers/md/dm-vdo/volume.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_VOLUME_H
-#define UDS_VOLUME_H
-
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/dm-bufio.h>
-#include <linux/limits.h>
-
-#include "chapter-index.h"
-#include "config.h"
-#include "geometry.h"
-#include "indexer.h"
-#include "index-layout.h"
-#include "index-page-map.h"
-#include "permassert.h"
-#include "radix-sort.h"
-#include "sparse-cache.h"
-#include "thread-utils.h"
-
-/*
- * The volume manages deduplication records on permanent storage. The term "volume" can also refer
- * to the region of permanent storage where the records (and the chapters containing them) are
- * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages
- * as necessary.
- */ - -enum index_lookup_mode { - /* Always do lookups in all chapters normally */ - LOOKUP_NORMAL, - /* Only do a subset of lookups needed when rebuilding an index */ - LOOKUP_FOR_REBUILD, -}; - -struct queued_read { - bool invalid; - bool reserved; - u32 physical_page; - struct uds_request *first_request; - struct uds_request *last_request; -}; - -struct __aligned(L1_CACHE_BYTES) search_pending_counter { - u64 atomic_value; -}; - -struct cached_page { - /* Whether this page is currently being read asynchronously */ - bool read_pending; - /* The physical page stored in this cache entry */ - u32 physical_page; - /* The value of the volume clock when this page was last used */ - s64 last_used; - /* The cached page buffer */ - struct dm_buffer *buffer; - /* The chapter index page, meaningless for record pages */ - struct delta_index_page index_page; -}; - -struct page_cache { - /* The number of zones */ - unsigned int zone_count; - /* The number of volume pages that can be cached */ - u32 indexable_pages; - /* The maximum number of simultaneously cached pages */ - u16 cache_slots; - /* An index for each physical page noting where it is in the cache */ - u16 *index; - /* The array of cached pages */ - struct cached_page *cache; - /* A counter for each zone tracking if a search is occurring there */ - struct search_pending_counter *search_pending_counters; - /* The read queue entries as a circular array */ - struct queued_read *read_queue; - - /* All entries above this point are constant after initialization. */ - - /* - * These values are all indexes into the array of read queue entries. New entries in the - * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the - * lock and then claims the entry pointed to by read_queue_next_read and increments that - * value. After the read is completed, the reader thread calls release_read_queue_entry(), - * which increments read_queue_first until it points to a pending read, or is equal to - * read_queue_next_read. This means that if multiple reads are outstanding, - * read_queue_first might not advance until the last of the reads finishes. 
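- * Viewed as positions on the circular array, the three cursors therefore maintain
- * read_queue_first <= read_queue_next_read <= read_queue_last.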
- */ - u16 read_queue_first; - u16 read_queue_next_read; - u16 read_queue_last; - - atomic64_t clock; -}; - -struct volume { - struct index_geometry *geometry; - struct dm_bufio_client *client; - u64 nonce; - size_t cache_size; - - /* A single page worth of records, for sorting */ - const struct uds_volume_record **record_pointers; - /* Sorter for sorting records within each page */ - struct radix_sorter *radix_sorter; - - struct sparse_cache *sparse_cache; - struct page_cache page_cache; - struct index_page_map *index_page_map; - - struct mutex read_threads_mutex; - struct cond_var read_threads_cond; - struct cond_var read_threads_read_done_cond; - struct thread **reader_threads; - unsigned int read_thread_count; - bool read_threads_exiting; - - enum index_lookup_mode lookup_mode; - unsigned int reserved_buffers; -}; - -int __must_check uds_make_volume(const struct uds_configuration *config, - struct index_layout *layout, - struct volume **new_volume); - -void uds_free_volume(struct volume *volume); - -int __must_check uds_replace_volume_storage(struct volume *volume, - struct index_layout *layout, - struct block_device *bdev); - -int __must_check uds_find_volume_chapter_boundaries(struct volume *volume, - u64 *lowest_vcn, u64 *highest_vcn, - bool *is_empty); - -int __must_check uds_search_volume_page_cache(struct volume *volume, - struct uds_request *request, - bool *found); - -int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume, - const struct uds_record_name *name, - u64 virtual_chapter, - bool *found); - -int __must_check uds_search_cached_record_page(struct volume *volume, - struct uds_request *request, u32 chapter, - u16 record_page_number, bool *found); - -void uds_forget_chapter(struct volume *volume, u64 chapter); - -int __must_check uds_write_chapter(struct volume *volume, - struct open_chapter_index *chapter_index, - const struct uds_volume_record records[]); - -void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter); - -int __must_check uds_read_chapter_index_from_volume(const struct volume *volume, - u64 virtual_chapter, - struct dm_buffer *volume_buffers[], - struct delta_index_page index_pages[]); - -int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter, - u32 page_number, u8 **data_ptr); - -int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter, - u32 page_number, - struct delta_index_page **page_ptr); - -#endif /* UDS_VOLUME_H */
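/*
 * A self-contained sketch of the record-page layout that encode_tree() in
 * volume.c builds; the toy_* names and the 8-byte key are illustrative, not
 * the driver's API. Sorted entries are written at the offsets an in-order
 * walk of an implicit binary tree visits (node n has children 2n+1 and
 * 2n+2), so a lookup can binary-search the page with no stored pointers.
 */
#include <stdint.h>
#include <string.h>

#define TOY_KEY_SIZE 8

/* Fill the subtree rooted at "node"; returns the next unconsumed sorted index. */
static uint32_t toy_encode_tree(uint8_t page[], const uint8_t *sorted[],
				uint32_t next, uint32_t node, uint32_t count)
{
	if (node < count) {
		uint32_t child = (2 * node) + 1;

		next = toy_encode_tree(page, sorted, next, child, count);
		memcpy(&page[node * TOY_KEY_SIZE], sorted[next++], TOY_KEY_SIZE);
		next = toy_encode_tree(page, sorted, next, child + 1, count);
	}

	return next;
}

/* Look a key up in the encoded page: an ordinary BST walk of the implicit tree. */
static const uint8_t *toy_search_tree(const uint8_t page[], uint32_t count,
				      const uint8_t key[TOY_KEY_SIZE])
{
	uint32_t node = 0;

	while (node < count) {
		const uint8_t *entry = &page[node * TOY_KEY_SIZE];
		int cmp = memcmp(key, entry, TOY_KEY_SIZE);

		if (cmp == 0)
			return entry;

		node = (2 * node) + 1 + (cmp > 0 ? 1 : 0);
	}

	return NULL;
}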