diff --git a/TODO b/TODO index c7313d95..4a2e9aec 100644 --- a/TODO +++ b/TODO @@ -37,7 +37,6 @@ LATER: * send progress information via sd_notify(), so that people can wrap casync nicely in UIs * maybe turn "recursive" mode into a numeric value specifying how far to descend? * make "casync stat" work on a directory with a subpath -* tweak chunker: shift cut to last "marker". * define sane errors we can show user messages about * introduce a --best-effort mode when replaying, which means we'll ignore what we can't apply * when building the cache, also build a seed @@ -49,6 +48,5 @@ LATER: * make sure "casync list /etc/fstab" does something useful * rework CaSeed logic to use CaCache as backend, and then add a new command "casync cache" or so, to explicitly generate a cache/seed * support blake2 as hashes -* parallelize image generation: when storing chunks in the store do so in a thread * in "casync stat" output show which flags enable what * save/restore xfs/ext4 projid diff --git a/doc/casync.rst b/doc/casync.rst index 6191f716..f5a432d6 100644 --- a/doc/casync.rst +++ b/doc/casync.rst @@ -157,6 +157,8 @@ General options: --store=PATH The primary chunk store to use --extra-store= Additional chunk store to look for chunks in --chunk-size=<[MIN:]AVG[:MAX]> The minimal/average/maximum number of bytes in a chunk +--cutmark=CUTMARK Specify a cutmark +--cutmark-delta-bytes=BYTES Maximum bytes to shift cut due to cutmark --digest= Pick digest algorithm (sha512-256 or sha256) --compression= Pick compression algorithm (zstd, xz or gzip) --seed= Additional file or directory to use as seed @@ -291,3 +293,79 @@ excluded: unconditionally take precedence over lines not marked like this. Moreover, lines prefixed with ``!`` also cancel the effect of patterns in ``.caexclude`` files placed in directories further up the tree. 
+ +Cutmarks +-------- + +``casync`` cuts the stream to serialize into chunks of an average size (as +specified with ``--chunk-size=``), determining cut points using the ``buzhash`` +rolling hash function and a modulo test. Frequently, cut points determined that +way are at slightly inconvenient locations: in the middle of objects serialized +in the stream rather than before or after them, thus needlessly exploding +changes to individual objects into more than one chunk. To optimize this, +**cutmarks** may be configured. These are byte sequences (up to 8 +bytes in length) that ``casync`` automatically detects in the data stream and that should be +considered particularly good cutpoints. When cutmarks are defined, the chunking +algorithm will slightly move the cut point between two chunks to match a +cutmark if one has recently been seen in the serialization stream. + +Cutmarks may be specified with the ``--cutmark=`` option. It takes a cutmark +specification in the format ``VALUE/MASK+OFFSET`` or ``VALUE/MASK-OFFSET``. The +first part, the value, indicates the byte sequence to detect in hexadecimal +digits, up to 8 bytes (thus 16 characters) in length. Following the slash a +bitmask (also in hexadecimal) may be specified of the same size. Every 8 byte +sequence at every 1 byte granularity stream position is tested against the +value. If all bits indicated in the mask match, a cutmark is found. The third +part of the specification indicates where to place the cutmark specifically +relative to the end of the 8 byte sequence. Specify ``-8`` to cut +immediately before the cutmark sequence, and ``+0`` right after. The offset +(along with its ``+`` or ``-`` character) may be omitted, in which case the +offset is assumed to be zero, i.e. the cut is done right after the +sequence. The mask (along with its ``/`` character) may also be omitted, in +which case it is assumed to be ``FFFFFFFFFFFFFFFF``, i.e. all +bits on, matching the full specified byte sequence.
In order to match shorter +byte sequences (for example to adapt the tool to some specific file format using +shorter object or section markers) simply specify a shorter mask value and +correct the offset value. + +Examples: + + --cutmark=123456789ABCDEF0 + + +This defines a cutmark to be the 8 byte sequence 0x12, 0x34, 0x56, 0x78, 0x9A, +0xBC, 0xDE, 0xF0, and the cut is placed right after the last byte, i.e. after the +0xF0. + + + --cutmark=C0FFEE/FFFFFF-5 + + +This defines a cutmark to be the 3 byte sequence 0xC0, 0xFF, 0xEE and the cut is +placed right after the last byte, i.e. after the 0xEE. + + --cutmark=C0DECAFE/FFFFFFFF-8 + + +This defines a cutmark to be the 4 byte sequence 0xC0, 0xDE, 0xCA, 0xFE and the +cut is placed right before the first byte, i.e. before the 0xC0. + +When operating on the file system layer (i.e. when creating `.caidx` files), +the implicit cutmark of ``--cutmark=51bb5beabcfa9613+8`` is used, to increase +the chance that cutmarks are placed right before each serialized file. + +Multiple cutmarks may be defined on the same operation, simply specify +``--cutmark=`` multiple times. The parameter also takes the special values +``yes`` and ``no``. If the latter is used, any implicit cutmarks are turned off, in +particular the implicit cutmark used when generating ``.caidx`` files above. + +``casync`` will honour cutmarks only within the immediate vicinity of the cut +point the modulo test suggested. By default this is a 16K window before the +calculated cut point. This value may be altered using the +``--cutmark-delta-max=`` setting. + +Any configured cutmark (and the selected ``--cutmark-delta-max=`` value) is +also stored in the ``.caidx`` or ``.caibx`` file to ensure that such an index +file contains sufficient data for an extracting client to properly use an +existing file system tree (or block device) as seed while applying the same +chunking logic as the original image.
diff --git a/meson.build b/meson.build index 49dcaf0e..78967a07 100644 --- a/meson.build +++ b/meson.build @@ -88,6 +88,7 @@ foreach ident : [ ['copy_file_range', '''#define _GNU_SOURCE #include #include '''], + ['reallocarray', '''#include '''], ] have = cc.has_function(ident[0], args : '-D_GNU_SOURCE', prefix : ident[1]) conf.set10('HAVE_' + ident[0].to_upper(), have) @@ -312,6 +313,14 @@ test_cache = find_program(test_cache_sh) test('test-cache.sh', test_cache, timeout : 30 * 60) +test_seed_sh = configure_file( + output : 'test-seed.sh', + input : 'test/test-seed.sh.in', + configuration : substs) +test_seed = find_program(test_seed_sh) +test('test-seed.sh', test_seed, + timeout : 30 * 60) + udev_rule = configure_file( output : '75-casync.rules', input : 'src/75-casync.rules.in', @@ -325,6 +334,7 @@ test_sources = ''' test-cachunk test-cachunker test-cachunker-histogram + test-cacutmark test-cadigest test-caencoder test-calocation diff --git a/src/affinity-count.c b/src/affinity-count.c new file mode 100644 index 00000000..29f749ea --- /dev/null +++ b/src/affinity-count.c @@ -0,0 +1,39 @@ +#include "affinity-count.h" + +#include +#include +#include + +int cpus_in_affinity_mask(void) { + size_t n = 16; + int r; + + for (;;) { + cpu_set_t *c; + + c = CPU_ALLOC(n); + if (!c) + return -ENOMEM; + + if (sched_getaffinity(0, CPU_ALLOC_SIZE(n), c) >= 0) { + int k; + + k = CPU_COUNT_S(CPU_ALLOC_SIZE(n), c); + CPU_FREE(c); + + if (k <= 0) + return -EINVAL; + + return k; + } + + r = -errno; + CPU_FREE(c); + + if (r != -EINVAL) + return r; + if (n*2 < n) + return -ENOMEM; + n *= 2; + } +} diff --git a/src/affinity-count.h b/src/affinity-count.h new file mode 100644 index 00000000..0a07d098 --- /dev/null +++ b/src/affinity-count.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef fooaffinitycounthfoo +#define fooaffinitycounthfoo + +int cpus_in_affinity_mask(void); + +#endif diff --git a/src/cachunker.c b/src/cachunker.c index 35fe8ca2..853343bc 100644 --- 
a/src/cachunker.c +++ b/src/cachunker.c @@ -197,35 +197,102 @@ static bool shall_break(CaChunker *c, uint32_t v) { return (v % c->discriminator) == (c->discriminator - 1); } -size_t ca_chunker_scan(CaChunker *c, const void* p, size_t n) { +static bool CA_CHUNKER_IS_FIXED_SIZE(CaChunker *c) { + return c->chunk_size_min == c->chunk_size_avg && + c->chunk_size_max == c->chunk_size_avg; +} + +static bool is_cutmark( + uint64_t qword, + const CaCutmark *cutmarks, + size_t n_cutmarks, + int64_t *ret_delta) { + + size_t i; + + for (i = 0; i < n_cutmarks; i++) { + const CaCutmark *m = cutmarks + i; + + if (((qword ^ m->value) & m->mask) == 0) { + *ret_delta = m->delta; + return true; + } + } + + *ret_delta = 0; + return false; +} + +static void find_cutmarks(CaChunker *c, const void *p, size_t n) { const uint8_t *q = p; + + /* Find "cutmarks", i.e. special magic values that may appear in the + * stream that make particularly good places for cutting up things into + * chunks. When our chunker finds a position to cut we'll check if a + * cutmark was recently seen, and if so the cut will be moved to that + * place. */ + + if (c->n_cutmarks == 0) /* Shortcut: no cutmark magic cutmark values defined */ + return; + + for (; n > 0; q++, n--) { + /* Let's put together a qword overlapping every byte of the file, in BE ordering. */ + c->qword_be = (c->qword_be << 8) | *q; + + if (c->last_cutmark >= (ssize_t) sizeof(c->qword_be)) { + int64_t delta; + + if (is_cutmark(be64toh(c->qword_be), c->cutmarks, c->n_cutmarks, &delta)) { + c->last_cutmark = -delta; + continue; + } + } + + c->last_cutmark++; + } +} + +static void chunker_cut(CaChunker *c) { + assert(c); + + c->h = 0; + c->window_size = 0; + c->chunk_size = 0; +} + +size_t ca_chunker_scan(CaChunker *c, bool test_break, const void* p, size_t n) { + const uint8_t *q = p; + size_t k, idx; uint32_t v; - size_t k = 0, idx; assert(c); assert(p); - /* fixed size chunker */ + /* Scans the specified bytes for chunk borders. 
Returns (size_t) -1 if + * no border was discovered, otherwise the chunk size. */ - if (c->chunk_size_min == c->chunk_size_avg && c->chunk_size_max == c->chunk_size_avg) { - size_t m; - size_t fixed_size = c->chunk_size_avg; + if (CA_CHUNKER_IS_FIXED_SIZE(c)) { + /* Special case: fixed size chunker */ + size_t m, fixed_size = c->chunk_size_avg; /* Append to window to make it full */ + assert(c->chunk_size < fixed_size); m = MIN(fixed_size - c->chunk_size, n); - c->chunk_size += m; + /* Note that we don't search for cutmarks if we are in fixed + * size mode, since there's no point, we'll not move the cuts + * anyway, since the chunks shall be fixed size. */ + if (c->chunk_size < fixed_size) return (size_t) -1; - k = m; - - goto now; + chunker_cut(c); + return m; } - /* Scans the specified bytes for chunk borders. Returns (size_t) -1 if no border was discovered, otherwise the - * chunk size. */ + if (c->window_size == 0) /* Reset cutmark location after each cut */ + c->last_cutmark = c->chunk_size; if (c->window_size < CA_CHUNKER_WINDOW_SIZE) { size_t m; @@ -240,16 +307,20 @@ size_t ca_chunker_scan(CaChunker *c, const void* p, size_t n) { c->chunk_size += m; /* If the window isn't full still, return early */ - if (c->window_size < CA_CHUNKER_WINDOW_SIZE) + if (c->window_size < CA_CHUNKER_WINDOW_SIZE) { + find_cutmarks(c, p, m); return (size_t) -1; + } /* Window is full, we are now ready to go. 
*/ v = ca_chunker_start(c, c->window, c->window_size); k = m; - if (shall_break(c, v)) + if (test_break && + shall_break(c, v)) goto now; - } + } else + k = 0; idx = c->chunk_size % CA_CHUNKER_WINDOW_SIZE; @@ -258,7 +329,8 @@ size_t ca_chunker_scan(CaChunker *c, const void* p, size_t n) { c->chunk_size++; k++; - if (shall_break(c, v)) + if (test_break && + shall_break(c, v)) goto now; c->window[idx++] = *q; @@ -268,12 +340,179 @@ size_t ca_chunker_scan(CaChunker *c, const void* p, size_t n) { q++, n--; } + find_cutmarks(c, p, k); return (size_t) -1; now: - c->h = 0; - c->chunk_size = 0; - c->window_size = 0; - + find_cutmarks(c, p, k); + chunker_cut(c); return k; } + +uint64_t ca_chunker_cutmark_delta_max(CaChunker *c) { + assert(c); + + if (c->cutmark_delta_max != UINT64_MAX) + return c->cutmark_delta_max; + + /* If not specified otherwise, use a quarter of the average chunk size. (Unless reconfigured this + * happens to match the default for chunk_size_min btw). */ + return c->chunk_size_avg / 4; +} + +int ca_chunker_extract_chunk( + CaChunker *c, + ReallocBuffer *buffer, + const void **pp, + size_t *ll, + const void **ret_chunk, + size_t *ret_chunk_size) { + + const void *chunk = NULL, *p; + size_t chunk_size = 0, k; + bool indirect = false; + size_t l; + + /* Takes some input data at *p of size *l and chunks it. If this supplied data is too short for a + * cut, adds the data to the specified buffer instead and returns CA_CHUNKER_NOT_YET. If a chunk is + * generated returns CA_CHUNKER_DIRECT or CA_CHUNKER_INDIRECT and returns a ptr to the new chunk in + * *ret_chunk, and its size in *ret_chunk_size. The value CA_CHUNKER_DIRECT is returned if these + * pointers point into the memory area passed in through *p and *l. The value CA_CHUNKER_INDIRECT is + * returned if the pointers point to a memory area in the specified 'buffer' object. In the latter + * case the caller should drop the chunk from the buffer after use with realloc_buffer_advance(). 
*/ + + assert(c); + assert(buffer); + assert(pp); + assert(ll); + + p = *pp; + l = *ll; + + if (c->cut_pending != (size_t) -1) { + + /* Is there a cut pending, if so process that first. */ + + if (c->cut_pending > l) { + /* Need to read more. Let's add what we have now to the buffer hence, and continue */ + if (!realloc_buffer_append(buffer, p, l)) + return -ENOMEM; + + c->cut_pending -= l; + goto not_yet; + } + + k = c->cut_pending; + c->cut_pending = (size_t) -1; + + } else { + k = ca_chunker_scan(c, true, p, l); + if (k == (size_t) -1) { + /* No cut yet, add the stuff to the buffer, and return */ + if (!realloc_buffer_append(buffer, p, l)) + return -ENOMEM; + + goto not_yet; + } + + /* Nice, we got told by the chunker to generate a new chunk. Let's see if we have any + * suitable cutmarks we can use, so that we slightly shift the actual cut. */ + + if (c->n_cutmarks > 0) { + + /* Is there a left-hand cutmark defined that is within the area we are looking? */ + if (c->last_cutmark > 0 && + (size_t) c->last_cutmark <= ca_chunker_cutmark_delta_max(c)) { + + size_t cs; + + /* Yay, found a cutmark, to the left of the calculated cut. */ + + cs = realloc_buffer_size(buffer) + k; + + /* Does this cutmark violate of the minimal chunk size? */ + if ((size_t) c->last_cutmark < cs && + cs - (size_t) c->last_cutmark >= c->chunk_size_min) { + + /* Yay, we found a cutmark we can apply. 
Let's add everything we have + * to the full buffer, and the take out just what we need from it.*/ + + if (!realloc_buffer_append(buffer, p, k)) + return -ENOMEM; + + chunk = realloc_buffer_data(buffer); + + chunk_size = realloc_buffer_size(buffer); + assert_se(chunk_size >= (size_t) c->last_cutmark); + chunk_size -= (size_t) c->last_cutmark; + + indirect = true; + + c->cutmark_delta_sum += c->last_cutmark; + c->n_cutmarks_applied ++; + + /* Make sure the rolling hash function processes the data that remains in the buffer */ + assert_se(ca_chunker_scan(c, false, (uint8_t*) chunk + chunk_size, c->last_cutmark) == (size_t) -1); + } + + } else if (c->last_cutmark < 0 && + (size_t) -c->last_cutmark <= ca_chunker_cutmark_delta_max(c)) { + + size_t cs; + + /* Yay, found a cutmark, to the right of the calculated cut */ + + cs = realloc_buffer_size(buffer) + k; + + if (cs + (size_t) -c->last_cutmark <= c->chunk_size_max) { + + /* Remember how many more bytes to process before the cut we + * determine shall take place. Note that we don't advance anything + * here, we'll call ourselves and then do that for. */ + c->cut_pending = k + (size_t) -c->last_cutmark; + + c->cutmark_delta_sum -= c->last_cutmark; + c->n_cutmarks_applied ++; + + return ca_chunker_extract_chunk(c, buffer, pp, ll, ret_chunk, ret_chunk_size); + } + } + } + } + + if (!chunk) { + if (realloc_buffer_size(buffer) == 0) { + chunk = p; + chunk_size = k; + } else { + if (!realloc_buffer_append(buffer, p, k)) + return -ENOMEM; + + chunk = realloc_buffer_data(buffer); + chunk_size = realloc_buffer_size(buffer); + + indirect = true; + } + } + + *pp = (uint8_t*) p + k; + *ll = l - k; + + if (ret_chunk) + *ret_chunk = chunk; + if (ret_chunk_size) + *ret_chunk_size = chunk_size; + + return indirect ? 
CA_CHUNKER_INDIRECT : CA_CHUNKER_DIRECT; + +not_yet: + *pp = (uint8_t*) p + l; + *ll = 0; + + if (ret_chunk) + *ret_chunk = NULL; + if (ret_chunk_size) + *ret_chunk_size = 0; + + return CA_CHUNKER_NOT_YET; +} diff --git a/src/cachunker.h b/src/cachunker.h index 8329b9b4..d3e5312d 100644 --- a/src/cachunker.h +++ b/src/cachunker.h @@ -7,6 +7,9 @@ #include #include +#include "cacutmark.h" +#include "realloc-buffer.h" + /* The default average chunk size */ #define CA_CHUNK_SIZE_AVG_DEFAULT ((size_t) (64U*1024U)) @@ -32,6 +35,22 @@ typedef struct CaChunker { size_t discriminator; uint8_t window[CA_CHUNKER_WINDOW_SIZE]; + + const CaCutmark *cutmarks; /* List of defined cutmarks to look for */ + size_t n_cutmarks; + + ssize_t last_cutmark; /* The byte offset we have seen the last cutmark at, relative to the current byte index */ + uint64_t qword_be; /* The last 8 byte we read, always shifted through and hence in BE format. */ + + /* How many bytes to go back to search for cutmarks at most */ + uint64_t cutmark_delta_max; + + /* A cutmark was previously found, pointing to a cut in the future. This specifies how many more + * bytes to process before the cut we already determined shall take place. */ + size_t cut_pending; + + uint64_t n_cutmarks_applied; + int64_t cutmark_delta_sum; } CaChunker; /* The default initializer for the chunker. We pick an average chunk size equivalent to 64K */ @@ -41,6 +60,8 @@ typedef struct CaChunker { .chunk_size_avg = CA_CHUNK_SIZE_AVG_DEFAULT, \ .chunk_size_max = CA_CHUNK_SIZE_AVG_DEFAULT*4, \ .discriminator = CA_CHUNKER_DISCRIMINATOR_FROM_AVG(CA_CHUNK_SIZE_AVG_DEFAULT), \ + .cutmark_delta_max = UINT64_MAX, \ + .cut_pending = (size_t) -1, \ } /* Set the min/avg/max chunk size. Each parameter may be 0, in which case a default is used. */ @@ -48,10 +69,20 @@ int ca_chunker_set_size(CaChunker *c, size_t min_size, size_t avg_size, size_t m /* Scans the specified data for a chunk border. 
Returns (size_t) -1 if none was found (and the function should be * called with more data later on), or another value indicating the position of a border. */ -size_t ca_chunker_scan(CaChunker *c, const void* p, size_t n); +size_t ca_chunker_scan(CaChunker *c, bool test_break, const void* p, size_t n); /* Low-level buzhash functions. Only exported for testing purposes. */ uint32_t ca_chunker_start(CaChunker *c, const void *p, size_t n); uint32_t ca_chunker_roll(CaChunker *c, uint8_t pop_byte, uint8_t push_byte); +uint64_t ca_chunker_cutmark_delta_max(CaChunker *c); + +enum { + CA_CHUNKER_NOT_YET, /* Not enough data for chunk */ + CA_CHUNKER_DIRECT, /* Found chunk, directly in the specified *pp and *ll buffer */ + CA_CHUNKER_INDIRECT, /* Found chunk, but inside of *buffer, need to advance it afterwards */ +}; + +int ca_chunker_extract_chunk(CaChunker *c, ReallocBuffer *buffer, const void **pp, size_t *ll, const void **ret_chunk, size_t *ret_chunk_size); + #endif diff --git a/src/cacutmark.c b/src/cacutmark.c new file mode 100644 index 00000000..e06e5fb9 --- /dev/null +++ b/src/cacutmark.c @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include + +#include "cacutmark.h" +#include "util.h" + +int ca_cutmark_parse(CaCutmark *c, const char *p) { + enum { + VALUE, + MASK, + DELTA_PLUS, + DELTA_MINUS, + } state = VALUE; + + uint64_t value = 0, mask = 0, udelta = 0; + int64_t delta = 0; + size_t digits = 0; + const char *q; + + /* Parsers a cutmark specification. Expects a value (in hex), + * optionally followed by a slash and a mask (in hex), optionally + * followed by +/- and a delta offset (in dec). 
*/ + + for (q = p;; q++) { + + switch (state) { + + case VALUE: + if (*q == 0) { + if (digits == 0) + return -EINVAL; + + goto done; + + } else if (*q == '/') { + if (digits == 0) + return -EINVAL; + + state = MASK; + mask = 0; + digits = 0; + + } else if (*q == '+') { + if (digits == 0) + return -EINVAL; + + state = DELTA_PLUS; + digits = 0; + + } else if (*q == '-') { + if (digits == 0) + return -EINVAL; + + state = DELTA_MINUS; + digits = 0; + + } else { + int k; + + if (digits >= 16) + return -EOVERFLOW; + + k = unhexchar(*q); + if (k < 0) + return k; + + value = (value << 4) | k; + mask = (mask << 4) | 0xFU; + digits++; + } + + break; + + case MASK: + if (*q == 0) { + if (digits == 0 || mask == 0) + return -EINVAL; + + goto done; + + } else if (*q == '+') { + if (digits == 0 || mask == 0) + return -EINVAL; + + state = DELTA_PLUS; + digits = 0; + } else if (*q == '-') { + if (digits == 0 || mask == 0) + return -EINVAL; + + state = DELTA_MINUS; + digits = 0; + } else { + int k; + + if (digits >= 16) + return -EOVERFLOW; + + k = unhexchar(*q); + if (k < 0) + return k; + + mask = (mask << 4) | k; + digits++; + } + + break; + + case DELTA_PLUS: + case DELTA_MINUS: + + if (*q == 0) { + if (digits == 0) + return -EINVAL; + + if (state == DELTA_MINUS) { + if (udelta > - (uint64_t) INT64_MIN) + return -EOVERFLOW; + + if (udelta == -(uint64_t) INT64_MIN) + delta = INT64_MIN; + else + delta = -(int64_t) udelta; + } else { + if (udelta > INT64_MAX) + return -EOVERFLOW; + + delta = (int64_t) udelta; + } + + goto done; + } else { + uint64_t d; + int k; + + k = undecchar(*q); + if (k < 0) + return k; + + d = udelta*10; + if (d < udelta) + return -EOVERFLOW; + d += k; + if (d < udelta*10) + return -EOVERFLOW; + + udelta = d; + digits ++; + } + + break; + } + } + +done: + *c = (CaCutmark) { + .value = value, + .mask = mask, + .delta = delta, + }; + + return 0; +} + +int ca_cutmark_cmp(const CaCutmark *a, const CaCutmark *b) { + int r; + + if (a == b) + return 0; + if (!a) + 
return -1; + if (!b) + return 1; + + r = CMP(a->value, b->value); + if (r != 0) + return r; + + r = CMP(a->mask, b->mask); + if (r != 0) + return r; + + return CMP(a->delta, b->delta); +} + +void ca_cutmark_sort(CaCutmark *c, size_t n) { + + if (n <= 1) + return; + + assert(c); + qsort(c, n, sizeof(CaCutmark), (__compar_fn_t) ca_cutmark_cmp); +} diff --git a/src/cacutmark.h b/src/cacutmark.h new file mode 100644 index 00000000..984cd073 --- /dev/null +++ b/src/cacutmark.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef foocutmarkhfoo +#define foocutmarkhfoo + +#include +#include +#include + +typedef struct CaCutmark { + uint64_t value; /* Numeric value of the cutmark */ + uint64_t mask; /* Mask to apply when matching the cutmark */ + int64_t delta; /* Where to cut, as an offset (possibly negative) relative to the position right after the 64bit value. */ +} CaCutmark; + +int ca_cutmark_parse(CaCutmark *c, const char *p); + +void ca_cutmark_sort(CaCutmark *c, size_t n); + +int ca_cutmark_cmp(const CaCutmark *a, const CaCutmark *b); + +#endif diff --git a/src/caencoder.c b/src/caencoder.c index 0262a36b..38eacd4e 100644 --- a/src/caencoder.c +++ b/src/caencoder.c @@ -3527,9 +3527,9 @@ int ca_encoder_current_location(CaEncoder *e, uint64_t add, CaLocation **ret) { name_table_node = node; - /* Here's a tweak: in CA_ENCODER_FILENAME state we actually encode the child's data, as we our current - * node might be the directory, but we need to serialize at which directory entry within it we - * currently are. */ + /* Here's a tweak: in CA_ENCODER_FILENAME state we actually encode the child's data, as our + * current node might be the directory, but we need to serialize at which directory entry + * within it we currently are. 
*/ node = ca_encoder_current_child_node(e); if (!node) return -EUNATCH; diff --git a/src/caformat.h b/src/caformat.h index 660f167c..acd5c474 100644 --- a/src/caformat.h +++ b/src/caformat.h @@ -72,6 +72,8 @@ enum { /* The index file format */ CA_FORMAT_INDEX = UINT64_C(0x96824d9c7b129ff9), + CA_FORMAT_CUTMARK_DELTA_MAX = UINT64_C(0x1bd67079a4939e36), + CA_FORMAT_CUTMARK = UINT64_C(0x5a218a2418b94da3), CA_FORMAT_TABLE = UINT64_C(0xe75b9e112f17417d), /* The end marker used in the TABLE object */ @@ -427,6 +429,18 @@ typedef struct CaFormatIndex { le64_t chunk_size_max; } CaFormatIndex; +typedef struct CaFormatCutmarkDeltaMax { + CaFormatHeader header; + le64_t cutmark_delta_max; +} CaFormatCutmarkDeltaMax; + +typedef struct CaFormatCutmark { + CaFormatHeader header; + le64_t value; + le64_t mask; + le64_t delta; +} CaFormatCutmark; + typedef struct CaFormatTableItem { le64_t offset; uint8_t chunk[CA_CHUNK_ID_SIZE]; diff --git a/src/caindex.c b/src/caindex.c index 1ee74b73..84e91ecc 100644 --- a/src/caindex.c +++ b/src/caindex.c @@ -18,9 +18,9 @@ /* #define EINVAL __LINE__ */ typedef enum CaIndexMode { - CA_INDEX_WRITE, /* only cooked writing */ + CA_INDEX_WRITE, /* only cooked (i.e. individual parsed fields) writing */ CA_INDEX_READ, /* only cooked reading */ - CA_INDEX_INCREMENTAL_WRITE, /* cooked writing + incremental raw reading back */ + CA_INDEX_INCREMENTAL_WRITE, /* cooked writing + incremental raw (i.e. 
raw byte-wise) reading back */ CA_INDEX_INCREMENTAL_READ, /* incremental raw writing + cooked reading back */ } CaIndexMode; @@ -48,6 +48,11 @@ struct CaIndex { uint64_t file_size; /* The size of the index file */ uint64_t blob_size; /* The size of the blob this index file describes */ + + CaCutmark *cutmarks; + size_t n_cutmarks; + + uint64_t cutmark_delta_max; }; static inline uint64_t CA_INDEX_METADATA_SIZE(CaIndex *i) { @@ -142,6 +147,8 @@ CaIndex *ca_index_unref(CaIndex *i) { if (i->fd >= 2) safe_close(i->fd); + free(i->cutmarks); + return mfree(i); } @@ -242,15 +249,8 @@ static int ca_index_open_fd(CaIndex *i) { static int ca_index_write_head(CaIndex *i) { - struct { - CaFormatIndex index; - CaFormatHeader table; - } head = { - .index.header.size = htole64(sizeof(CaFormatIndex)), - .index.header.type = htole64(CA_FORMAT_INDEX), - .table.size = htole64(UINT64_MAX), - .table.type = htole64(CA_FORMAT_TABLE), - }; + CaFormatIndex index; + CaFormatHeader table; int r; assert(i); @@ -272,19 +272,65 @@ static int ca_index_write_head(CaIndex *i) { i->chunk_size_avg <= i->chunk_size_max)) return -EINVAL; - head.index.feature_flags = htole64(i->feature_flags); + assert(i->cooked_offset == 0); - head.index.chunk_size_min = htole64(i->chunk_size_min); - head.index.chunk_size_avg = htole64(i->chunk_size_avg); - head.index.chunk_size_max = htole64(i->chunk_size_max); + index = (CaFormatIndex) { + .header.size = htole64(sizeof(CaFormatIndex)), + .header.type = htole64(CA_FORMAT_INDEX), + .feature_flags = htole64(i->feature_flags), + .chunk_size_min = htole64(i->chunk_size_min), + .chunk_size_avg = htole64(i->chunk_size_avg), + .chunk_size_max = htole64(i->chunk_size_max), + }; - assert(i->cooked_offset == 0); + r = loop_write(i->fd, &index, sizeof(index)); + if (r < 0) + return r; - r = loop_write(i->fd, &head, sizeof(head)); + i->cooked_offset = sizeof(index); + + if (i->n_cutmarks > 0) { + CaFormatCutmarkDeltaMax dm = { + .header.size = 
htole64(sizeof(CaFormatCutmarkDeltaMax)), + .header.type = htole64(CA_FORMAT_CUTMARK_DELTA_MAX), + .cutmark_delta_max = htole64(i->cutmark_delta_max), + }; + size_t j; + + r = loop_write(i->fd, &dm, sizeof(dm)); + if (r < 0) + return r; + + i->cooked_offset += sizeof(dm); + + for (j = 0; j < i->n_cutmarks; j++) { + CaFormatCutmark cm = { + .header.size = htole64(sizeof(CaFormatCutmark)), + .header.type = htole64(CA_FORMAT_CUTMARK), + .value = htole64(i->cutmarks[j].value), + .mask = htole64(i->cutmarks[j].mask), + .delta = htole64((uint64_t) i->cutmarks[j].delta), + }; + + r = loop_write(i->fd, &cm, sizeof(cm)); + if (r < 0) + return r; + + i->cooked_offset += sizeof(cm); + } + } + + table = (CaFormatHeader) { + .size = htole64(UINT64_MAX), + .type = htole64(CA_FORMAT_TABLE), + }; + + r = loop_write(i->fd, &table, sizeof(table)); if (r < 0) return r; - i->start_offset = i->cooked_offset = sizeof(head); + i->cooked_offset += sizeof(table); + i->start_offset = i->cooked_offset; return 0; } @@ -311,12 +357,29 @@ static int ca_index_enough_data(CaIndex *i, size_t n) { return 1; } +static int ca_index_progressive_read(CaIndex *i, void *p, size_t n) { + ssize_t m; + int r; + + assert(i); + assert(p || n == 0); + + r = ca_index_enough_data(i, n); + if (r < 0) + return r; + if (r == 0) + return -EAGAIN; + + m = loop_read(i->fd, p, n); + if (m < 0) + return (int) m; + if ((size_t) m != n) + return -EPIPE; + + return 0; +} + static int ca_index_read_head(CaIndex *i) { - struct { - CaFormatIndex index; - CaFormatHeader table; - } head; - ssize_t n; int r; assert(i); @@ -326,59 +389,152 @@ static int ca_index_read_head(CaIndex *i) { if (i->start_offset != 0) /* already past the head */ return 0; - assert(i->cooked_offset == 0); + if (i->cooked_offset == 0) { + CaFormatIndex index; - r = ca_index_enough_data(i, sizeof(head)); - if (r < 0) - return r; - if (r == 0) - return -EAGAIN; + r = ca_index_progressive_read(i, &index, sizeof(index)); + if (r < 0) + return r; - n = 
loop_read(i->fd, &head, sizeof(head)); - if (n < 0) - return (int) n; - if (n != sizeof(head)) - return -EPIPE; + if (le64toh(index.header.size) != sizeof(CaFormatIndex) || + le64toh(index.header.type) != CA_FORMAT_INDEX) + return -EBADMSG; - if (le64toh(head.index.header.size) != sizeof(CaFormatIndex) || - le64toh(head.index.header.type) != CA_FORMAT_INDEX) - return -EBADMSG; + r = ca_feature_flags_are_normalized(le64toh(index.feature_flags)); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; - r = ca_feature_flags_are_normalized(le64toh(head.index.feature_flags)); - if (r < 0) - return r; - if (r == 0) - return -EINVAL; + if (le64toh(index.chunk_size_min) < CA_CHUNK_SIZE_LIMIT_MIN || + le64toh(index.chunk_size_min) > CA_CHUNK_SIZE_LIMIT_MAX) + return -EBADMSG; - if (le64toh(head.index.chunk_size_min) < CA_CHUNK_SIZE_LIMIT_MIN || - le64toh(head.index.chunk_size_min) > CA_CHUNK_SIZE_LIMIT_MAX) - return -EBADMSG; + if (le64toh(index.chunk_size_avg) < CA_CHUNK_SIZE_LIMIT_MIN || + le64toh(index.chunk_size_avg) > CA_CHUNK_SIZE_LIMIT_MAX) + return -EBADMSG; - if (le64toh(head.index.chunk_size_avg) < CA_CHUNK_SIZE_LIMIT_MIN || - le64toh(head.index.chunk_size_avg) > CA_CHUNK_SIZE_LIMIT_MAX) - return -EBADMSG; + if (le64toh(index.chunk_size_max) < CA_CHUNK_SIZE_LIMIT_MIN || + le64toh(index.chunk_size_max) > CA_CHUNK_SIZE_LIMIT_MAX) + return -EBADMSG; - if (le64toh(head.index.chunk_size_max) < CA_CHUNK_SIZE_LIMIT_MIN || - le64toh(head.index.chunk_size_max) > CA_CHUNK_SIZE_LIMIT_MAX) - return -EBADMSG; + if (!(le64toh(index.chunk_size_min) <= le64toh(index.chunk_size_avg) && + le64toh(index.chunk_size_avg) <= le64toh(index.chunk_size_max))) + return -EBADMSG; - if (!(le64toh(head.index.chunk_size_min) <= le64toh(head.index.chunk_size_avg) && - le64toh(head.index.chunk_size_avg) <= le64toh(head.index.chunk_size_max))) - return -EBADMSG; + i->feature_flags = le64toh(index.feature_flags); - if (le64toh(head.table.size) != UINT64_MAX || - le64toh(head.table.type) != 
CA_FORMAT_TABLE) - return -EBADMSG; + i->chunk_size_min = le64toh(index.chunk_size_min); + i->chunk_size_avg = le64toh(index.chunk_size_avg); + i->chunk_size_max = le64toh(index.chunk_size_max); + + i->cooked_offset += sizeof(index); + } + + if (i->cooked_offset == sizeof(CaFormatIndex)) { + CaFormatHeader header; - i->start_offset = i->cooked_offset = sizeof(head); + r = ca_index_progressive_read(i, &header, sizeof(header)); + if (r < 0) + return r; - i->feature_flags = le64toh(head.index.feature_flags); + if (le64toh(header.type) == CA_FORMAT_TABLE) { + /* No cutmarks defined, the table follows immediately */ - i->chunk_size_min = le64toh(head.index.chunk_size_min); - i->chunk_size_avg = le64toh(head.index.chunk_size_avg); - i->chunk_size_max = le64toh(head.index.chunk_size_max); + if (le64toh(header.size) != UINT64_MAX) + return -EBADMSG; - return 0; + i->cooked_offset += sizeof(header); + i->start_offset = i->cooked_offset; + + return 0; + } + + /* Otherwise the cutmark delata max object has to follow immediately */ + if (le64toh(header.type) != CA_FORMAT_CUTMARK_DELTA_MAX || + le64toh(header.size) != sizeof(CaFormatCutmarkDeltaMax)) + return -EBADMSG; + + i->cooked_offset += sizeof(header); + } + + if (i->cooked_offset == sizeof(CaFormatIndex) + sizeof(CaFormatHeader)) { + CaFormatCutmarkDeltaMax dm = {}; + size_t need = sizeof(CaFormatCutmarkDeltaMax) - sizeof(CaFormatHeader); + + r = ca_index_progressive_read(i, (uint8_t*) &dm + sizeof(CaFormatHeader), need); + if (r < 0) + return r; + + i->cutmark_delta_max = le64toh(dm.cutmark_delta_max); + i->cooked_offset += need; + } + + if (i->cooked_offset >= sizeof(CaFormatIndex) + sizeof(CaFormatCutmarkDeltaMax)) { + + for (;;) { + CaFormatCutmark cm = {}; + uint64_t p; + + p = i->cooked_offset - sizeof(CaFormatIndex) - sizeof(CaFormatCutmarkDeltaMax); + if (p % sizeof(CaFormatCutmark) == 0) { + + r = ca_index_progressive_read(i, &cm.header, sizeof(cm.header)); + if (r < 0) + return r; + + if 
(le64toh(cm.header.type) == CA_FORMAT_TABLE) { + /* No further cutmarks defined, the table follows now */ + + if (le64toh(cm.header.size) != UINT64_MAX) + return -EBADMSG; + + i->cooked_offset += sizeof(cm.header); + i->start_offset = i->cooked_offset; + return 0; + } + + if (le64toh(cm.header.type) != CA_FORMAT_CUTMARK || + le64toh(cm.header.size) != sizeof(CaFormatCutmark)) + return -EBADMSG; + + i->cooked_offset += sizeof(cm.header); + } else { + size_t need = sizeof(CaFormatCutmark) - sizeof(CaFormatHeader); + CaCutmark e, *a; + + assert(p % sizeof(CaFormatCutmark) == sizeof(CaFormatHeader)); + + r = ca_index_progressive_read(i, (uint8_t*) &cm + sizeof(CaFormatHeader), need); + if (r < 0) + return r; + + if (le64toh(cm.mask) == 0) + return -EBADMSG; + + e = (CaCutmark) { + .value = le64toh(cm.value), + .mask = le64toh(cm.mask), + .delta = le64toh(cm.delta), + }; + + if (i->n_cutmarks > 0 && + ca_cutmark_cmp(i->cutmarks + i->n_cutmarks - 1, &e) >= 0) + return -EBADMSG; + + a = reallocarray(i->cutmarks, i->n_cutmarks+1, sizeof(CaCutmark)); + if (!a) + return -ENOMEM; + + i->cutmarks = a; + i->cutmarks[i->n_cutmarks++] = e; + + i->cooked_offset += need; + } + } + } + + assert_not_reached("Huh, read the wrong number of bytes so far, this can't be."); } int ca_index_open(CaIndex *i) { @@ -468,6 +624,13 @@ int ca_index_write_chunk(CaIndex *i, const CaChunkID *id, uint64_t size) { return 0; } +static uint64_t index_offset(CaIndex *i) { + assert(i); + assert(i->start_offset >= sizeof(CaFormatIndex) + offsetof(CaFormatTable, items)); + + return i->start_offset - offsetof(CaFormatTable, items); +} + int ca_index_write_eof(CaIndex *i) { CaFormatTableTail tail = {}; int r; @@ -485,7 +648,7 @@ int ca_index_write_eof(CaIndex *i) { if (r < 0) return r; - tail.index_offset = htole64(sizeof(CaFormatIndex)); + tail.index_offset = htole64(index_offset(i)); tail.size = htole64(offsetof(CaFormatTable, items) + (i->item_position * sizeof(CaFormatTableItem)) + sizeof(tail)); @@
-543,7 +706,7 @@ int ca_index_read_chunk(CaIndex *i, CaChunkID *ret_id, uint64_t *ret_offset_end, if (buffer.tail.marker == htole64(CA_FORMAT_TABLE_TAIL_MARKER) && buffer.tail._zero_fill1 == 0 && buffer.tail._zero_fill2 == 0 && - buffer.tail.index_offset == htole64(sizeof(CaFormatIndex)) && + buffer.tail.index_offset == htole64(index_offset(i)) && le64toh(buffer.tail.size) == (i->cooked_offset - i->start_offset + offsetof(CaFormatTable, items) + sizeof(CaFormatTableTail))) { uint8_t final_byte; @@ -872,6 +1035,75 @@ int ca_index_get_chunk_size_max(CaIndex *i, size_t *ret) { return 0; } +int ca_index_set_cutmarks( + CaIndex *i, + const CaCutmark *cutmarks, + size_t n_cutmarks) { + + CaCutmark *m; + + if (!i) + return -EINVAL; + if (n_cutmarks > 0 && !cutmarks) + return -EINVAL; + if (!IN_SET(i->mode, CA_INDEX_WRITE, CA_INDEX_INCREMENTAL_WRITE)) + return -EROFS; + + /* We don't validate the cutmark array here, the assumption is that it is already valid */ + + m = newdup(CaCutmark, cutmarks, n_cutmarks); + if (!m) + return -ENOMEM; + + free_and_replace(i->cutmarks, m); + i->n_cutmarks = n_cutmarks; + + return 0; +} + +int ca_index_set_cutmark_delta_max( + CaIndex *i, + uint64_t cutmark_delta_max) { + + if (!i) + return -EINVAL; + if (!IN_SET(i->mode, CA_INDEX_WRITE, CA_INDEX_INCREMENTAL_WRITE)) + return -EROFS; + + i->cutmark_delta_max = cutmark_delta_max; + return 0; +} + +int ca_index_get_cutmarks(CaIndex *i, const CaCutmark **ret_cutmarks, size_t *ret_n_cutmarks) { + if (!i) + return -EINVAL; + if (!ret_cutmarks && !ret_n_cutmarks) + return -EINVAL; + + if (IN_SET(i->mode, CA_INDEX_READ, CA_INDEX_INCREMENTAL_READ) && i->start_offset == 0) + return -ENODATA; + + if (ret_cutmarks) + *ret_cutmarks = i->cutmarks; + if (ret_n_cutmarks) + *ret_n_cutmarks = i->n_cutmarks; + + return 0; +} + +int ca_index_get_cutmark_delta_max(CaIndex *i, uint64_t *ret) { + if (!i) + return -EINVAL; + if (!ret) + return -EINVAL; + + if (IN_SET(i->mode, CA_INDEX_READ, 
CA_INDEX_INCREMENTAL_READ) && i->start_offset == 0) + return -ENODATA; + + *ret = i->cutmark_delta_max; + return 0; +} + int ca_index_get_index_size(CaIndex *i, uint64_t *ret) { uint64_t size, metadata_size; int r; @@ -976,9 +1208,9 @@ static int ca_index_read_tail(CaIndex *i) { if (le64toh(buffer.tail.marker) != CA_FORMAT_TABLE_TAIL_MARKER) return -EBADMSG; - if (le64toh(buffer.tail.index_offset) != sizeof(CaFormatIndex)) + if (le64toh(buffer.tail.index_offset) != index_offset(i)) return -EBADMSG; - if (le64toh(buffer.tail.size) + sizeof(CaFormatIndex) != size) + if (index_offset(i) + le64toh(buffer.tail.size) != size) return -EBADMSG; i->blob_size = le64toh(buffer.last_item.offset); diff --git a/src/caindex.h b/src/caindex.h index a50e776c..67e833b4 100644 --- a/src/caindex.h +++ b/src/caindex.h @@ -4,6 +4,7 @@ #define foocaindexhfoo #include "cachunkid.h" +#include "cacutmark.h" #include "realloc-buffer.h" typedef struct CaIndex CaIndex; @@ -51,6 +52,12 @@ int ca_index_get_chunk_size_min(CaIndex *i, size_t *ret); int ca_index_get_chunk_size_avg(CaIndex *i, size_t *ret); int ca_index_get_chunk_size_max(CaIndex *i, size_t *ret); +int ca_index_set_cutmarks(CaIndex *i, const CaCutmark *cutmarks, size_t n_cutmarks); +int ca_index_set_cutmark_delta_max(CaIndex *i, uint64_t cutmark_delta_max); + +int ca_index_get_cutmarks(CaIndex *i, const CaCutmark **ret_cutmarks, size_t *ret_n_cutmarks); +int ca_index_get_cutmark_delta_max(CaIndex *i, uint64_t *ret); + int ca_index_get_blob_size(CaIndex *i, uint64_t *ret); int ca_index_get_index_size(CaIndex *i, uint64_t *ret); int ca_index_get_total_chunks(CaIndex *i, uint64_t *ret); diff --git a/src/caseed.c b/src/caseed.c index ce88555e..b127b67a 100644 --- a/src/caseed.c +++ b/src/caseed.c @@ -41,7 +41,7 @@ struct CaSeed { bool cache_chunks:1; ReallocBuffer buffer; - CaLocation *buffer_location; + CaOrigin *buffer_origin; CaFileRoot *root; @@ -52,23 +52,25 @@ struct CaSeed { uint64_t first_step_nsec; uint64_t last_step_nsec; + + 
CaCutmark *cutmarks; + size_t n_cutmarks; }; CaSeed *ca_seed_new(void) { CaSeed *s; - s = new0(CaSeed, 1); + s = new(CaSeed, 1); if (!s) return NULL; - s->cache_fd = -1; - s->base_fd = -1; - - s->cache_chunks = true; - - s->chunker = (CaChunker) CA_CHUNKER_INIT; - - s->feature_flags = CA_FORMAT_DEFAULT & SUPPORTED_FEATURE_MASK; + *s = (CaSeed) { + .cache_fd = -1, + .base_fd = -1, + .cache_chunks = true, + .chunker = CA_CHUNKER_INIT, + .feature_flags = CA_FORMAT_DEFAULT & SUPPORTED_FEATURE_MASK, + }; return s; } @@ -106,10 +108,12 @@ CaSeed *ca_seed_unref(CaSeed *s) { ca_digest_free(s->chunk_digest); realloc_buffer_free(&s->buffer); - ca_location_unref(s->buffer_location); + ca_origin_unref(s->buffer_origin); ca_file_root_unref(s->root); + free(s->cutmarks); + return mfree(s); } @@ -247,7 +251,8 @@ static int ca_seed_make_chunk_id(CaSeed *s, const void *p, size_t l, CaChunkID * return ca_chunk_id_make(s->chunk_digest, p, l, ret); } -static int ca_seed_write_cache_entry(CaSeed *s, CaLocation *location, const void *data, size_t l) { +static int ca_seed_write_cache_entry(CaSeed *s, CaLocation *loc, const void *data, size_t l) { + _cleanup_(ca_location_unrefp) CaLocation *location = ca_location_ref(loc); char ids[CA_CHUNK_ID_FORMAT_MAX]; const char *t, *four, *combined; CaChunkID id; @@ -258,6 +263,7 @@ static int ca_seed_write_cache_entry(CaSeed *s, CaLocation *location, const void assert(data); assert(l > 0); + /* We took our own copy above, to make sure we don't write around in the object the caller passed to us */ r = ca_location_patch_size(&location, l); if (r < 0) return r; @@ -317,7 +323,7 @@ static int ca_seed_write_cache_entry(CaSeed *s, CaLocation *location, const void } static int ca_seed_cache_chunks(CaSeed *s) { - uint64_t offset = 0; + _cleanup_(ca_location_unrefp) CaLocation *location = NULL; const void *p; size_t l; int r; @@ -333,46 +339,48 @@ static int ca_seed_cache_chunks(CaSeed *s) { if (!s->cache_chunks) return 0; - while (l > 0) { - const void 
*chunk; - size_t chunk_size, k; + r = ca_encoder_current_location(s->encoder, 0, &location); + if (r < 0) + return r; - if (!s->buffer_location) { - r = ca_encoder_current_location(s->encoder, offset, &s->buffer_location); - if (r < 0) - return r; - } + r = ca_location_patch_size(&location, l); + if (r < 0) + return r; - k = ca_chunker_scan(&s->chunker, p, l); - if (k == (size_t) -1) { - if (!realloc_buffer_append(&s->buffer, p, l)) - return -ENOMEM; + if (!s->buffer_origin) { + r = ca_origin_new(&s->buffer_origin); + if (r < 0) + return r; + } - return 0; - } + r = ca_origin_put(s->buffer_origin, location); + if (r < 0) + return r; - if (realloc_buffer_size(&s->buffer) == 0) { - chunk = p; - chunk_size = k; - } else { - if (!realloc_buffer_append(&s->buffer, p, k)) - return -ENOMEM; + while (l > 0) { + const void *chunk; + size_t chunk_size; + int verdict; - chunk = realloc_buffer_data(&s->buffer); - chunk_size = realloc_buffer_size(&s->buffer); - } + verdict = ca_chunker_extract_chunk(&s->chunker, &s->buffer, &p, &l, &chunk, &chunk_size); + if (verdict < 0) + return verdict; + if (verdict == CA_CHUNKER_NOT_YET) + return 0; - r = ca_seed_write_cache_entry(s, s->buffer_location, chunk, chunk_size); + r = ca_seed_write_cache_entry(s, ca_origin_get(s->buffer_origin, 0), chunk, chunk_size); if (r < 0) return r; - realloc_buffer_empty(&s->buffer); - s->buffer_location = ca_location_unref(s->buffer_location); - - p = (const uint8_t*) p + k; - l -= k; + if (verdict == CA_CHUNKER_INDIRECT) { + r = realloc_buffer_advance(&s->buffer, chunk_size); + if (r < 0) + return r; + } - offset += k; + r = ca_origin_advance_bytes(s->buffer_origin, chunk_size); + if (r < 0) + return r; } return 0; @@ -389,15 +397,12 @@ static int ca_seed_cache_final_chunk(CaSeed *s) { if (realloc_buffer_size(&s->buffer) == 0) return 0; - if (!s->buffer_location) - return 0; - - r = ca_seed_write_cache_entry(s, s->buffer_location, realloc_buffer_data(&s->buffer), realloc_buffer_size(&s->buffer)); + r = 
ca_seed_write_cache_entry(s, ca_origin_get(s->buffer_origin, 0), realloc_buffer_data(&s->buffer), realloc_buffer_size(&s->buffer)); if (r < 0) return 0; realloc_buffer_empty(&s->buffer); - s->buffer_location = ca_location_unref(s->buffer_location); + s->buffer_origin = ca_origin_unref(s->buffer_origin); return 0; } @@ -852,6 +857,32 @@ int ca_seed_set_chunk_size(CaSeed *s, size_t cmin, size_t cavg, size_t cmax) { return 0; } +int ca_seed_set_cutmarks(CaSeed *s, const CaCutmark *cutmarks, size_t n_cutmarks) { + CaCutmark *copy; + + if (!s) + return -EINVAL; + if (!cutmarks && n_cutmarks > 0) + return -EINVAL; + + copy = newdup(CaCutmark, cutmarks, n_cutmarks); + if (!copy) + return -ENOMEM; + + s->chunker.cutmarks = s->cutmarks = copy; + s->chunker.n_cutmarks = s->n_cutmarks = n_cutmarks; + + return 0; +} + +int ca_seed_set_cutmark_delta_max(CaSeed *s, int64_t delta) { + if (!s) + return -EINVAL; + + s->chunker.cutmark_delta_max = delta; + return 0; +} + int ca_seed_get_file_root(CaSeed *s, CaFileRoot **ret) { int r; diff --git a/src/caseed.h b/src/caseed.h index bdaeb25e..903509b4 100644 --- a/src/caseed.h +++ b/src/caseed.h @@ -39,6 +39,8 @@ int ca_seed_current_mode(CaSeed *seed, mode_t *ret); int ca_seed_set_feature_flags(CaSeed *s, uint64_t flags); int ca_seed_set_chunk_size(CaSeed *s, size_t cmin, size_t cavg, size_t cmax); +int ca_seed_set_cutmarks(CaSeed *s, const CaCutmark *cutmarks, size_t n_cutmarks); +int ca_seed_set_cutmark_delta_max(CaSeed *s, int64_t delta); int ca_seed_set_hardlink(CaSeed *s, bool b); int ca_seed_set_chunks(CaSeed *s, bool b); diff --git a/src/castore.c b/src/castore.c index c290dfd9..612b17eb 100644 --- a/src/castore.c +++ b/src/castore.c @@ -3,9 +3,12 @@ #include #include #include +#include +#include #include #include +#include "affinity-count.h" #include "castore.h" #include "def.h" #include "dirent-util.h" @@ -19,6 +22,8 @@ /* #undef EBADMSG */ /* #define EBADMSG __LINE__ */ +#define WORKER_THREADS_MAX 64U + struct CaStore { char 
*root; bool is_cache:1; @@ -34,6 +39,10 @@ struct CaStore { uint64_t n_requests; uint64_t n_request_bytes; + + pthread_t worker_threads[WORKER_THREADS_MAX]; + size_t n_worker_threads, n_worker_threads_max; + int worker_thread_socket[2]; }; struct CaStoreIterator { @@ -47,14 +56,17 @@ struct CaStoreIterator { CaStore* ca_store_new(void) { CaStore *store; - store = new0(CaStore, 1); + store = new(CaStore, 1); if (!store) return NULL; - store->digest_type = _CA_DIGEST_TYPE_INVALID; - - store->compression = CA_CHUNK_COMPRESSED; - store->compression_type = CA_COMPRESSION_DEFAULT; + *store = (CaStore) { + .digest_type = _CA_DIGEST_TYPE_INVALID, + .compression = CA_CHUNK_COMPRESSED, + .compression_type = CA_COMPRESSION_DEFAULT, + .worker_thread_socket = { -1, -1}, + .n_worker_threads_max = (size_t) -1, + }; return store; } @@ -62,13 +74,18 @@ CaStore* ca_store_new(void) { CaStore *ca_store_new_cache(void) { CaStore *s; - s = new0(CaStore, 1); + s = new(CaStore, 1); if (!s) return NULL; - s->is_cache = true; - s->compression = CA_CHUNK_AS_IS; - s->compression_type = CA_COMPRESSION_DEFAULT; + *s = (CaStore) { + .is_cache = true, + .compression = CA_CHUNK_AS_IS, + .compression_type = CA_COMPRESSION_DEFAULT, + + .worker_thread_socket = { -1, -1 }, + .n_worker_threads_max = (size_t) -1, + }; return s; } @@ -77,6 +94,8 @@ CaStore* ca_store_unref(CaStore *store) { if (!store) return NULL; + (void) ca_store_finalize(store); + if (store->is_cache && store->root) (void) rm_rf(store->root, REMOVE_ROOT|REMOVE_PHYSICAL); @@ -240,6 +259,203 @@ int ca_store_has(CaStore *store, const CaChunkID *chunk_id) { return ca_chunk_file_test(AT_FDCWD, store->root, chunk_id); } +struct queue_entry { + CaChunkID chunk_id; + CaChunkCompression effective_compression; + void *data; + size_t size; +}; + +static void* worker_thread(void *p) { + CaStore *store = p; + int ret = 0, r; + + assert(store); + assert(store->worker_thread_socket[1] >= 0); + + (void) pthread_setname_np(pthread_self(), 
"worker-thread"); + + for (;;) { + struct queue_entry e; + ssize_t n; + + n = recv(store->worker_thread_socket[0], &e, sizeof(e), 0); + if (n < 0) { + if (errno == EINTR) + continue; + + log_debug_errno(errno, "Failed to read from thread pool socket: %m"); + return INT_TO_PTR(errno); + } + if (n == 0) /* Either EOF or zero-sized datagram (Linux doesn't really allow us to + * distinguish that), we take both as an indication to exit the worker thread. */ + break; + + assert(n == sizeof(e)); + + r = ca_chunk_file_save( + AT_FDCWD, store->root, + &e.chunk_id, + e.effective_compression, store->compression, + store->compression_type, + e.data, e.size); + free(e.data); + + if (r < 0) { + log_debug_errno(r, "Failed to store chunk in store: %m"); + + if (r != -EEXIST) + ret = r; + } + } + + return INT_TO_PTR(ret); +} + +static int determine_worker_threads_max(CaStore *store) { + const char *e; + int r; + + assert(store); + + if (store->n_worker_threads_max != (size_t) -1) + return 0; + + e = getenv("CASYNC_WORKER_THREADS"); + if (e) { + unsigned u; + + r = safe_atou(e, &u); + if (r < 0) + log_debug_errno(r, "Failed to parse $CASYNC_WORKER_THREADS, ignoring: %s", e); + else if (u > WORKER_THREADS_MAX) { + log_debug("$CASYNC_WORKER_THREADS out of range, clamping to %zu: %s", (size_t) WORKER_THREADS_MAX, e); + store->n_worker_threads_max = WORKER_THREADS_MAX; + } else { + store->n_worker_threads_max = u; + return 0; + } + } + + r = cpus_in_affinity_mask(); + if (r < 0) + return log_debug_errno(r, "Failed to determine CPUs in affinity mask: %m"); + + store->n_worker_threads_max = MIN((size_t) r, WORKER_THREADS_MAX); + return 0; +} + +static int start_worker_thread(CaStore *store) { + int r; + + assert(store); + + r = determine_worker_threads_max(store); + if (r < 0) + return r; + + if (store->n_worker_threads >= (size_t) store->n_worker_threads_max) + return 0; + + if (store->worker_thread_socket[0] < 0) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, 
store->worker_thread_socket) < 0) + return -errno; + + r = pthread_create(store->worker_threads + store->n_worker_threads, NULL, worker_thread, store); + if (r != 0) + return -r; + + store->n_worker_threads++; + + log_debug("Started store worker thread %zu.", store->n_worker_threads); + return 0; +} + +static int submit_to_worker_thread( + CaStore *store, + const CaChunkID *chunkid, + CaChunkCompression effective_compression, + const void *p, + uint64_t l) { + + struct queue_entry e; + void *copy = NULL; + ssize_t n; + int r; + + assert(store); + + /* If there's no need to compress/decompress, then let's do things client side, since the operation + * is likely IO bound, not CPU bound */ + if (store->compression == CA_CHUNK_AS_IS || + store->compression == effective_compression) + return -ENOANO; + + /* Before we submit the chunk for compression, let's see if it exists already. If so, let's return + * -EEXIST right away, so that the caller can count reused chunks. Note that this is a bit racy + * currently, as submitted but not yet processed chunks are not considered. 
*/ + r = ca_store_has(store, chunkid); + if (r < 0) + return r; + if (r > 0) + return -EEXIST; + + /* Let's start a new worker thread each time we have a new job to process, until we reached all + * worker threads we need */ + (void) start_worker_thread(store); + + /* If there are no worker threads, do things client side */ + if (store->n_worker_threads <= 0 || + store->worker_thread_socket[1] < 0) + return -ENETDOWN; + + copy = memdup(p, l); + if (!copy) + return -ENOMEM; + + e = (struct queue_entry) { + .chunk_id = *chunkid, + .effective_compression = effective_compression, + .data = copy, + .size = l, + }; + + n = send(store->worker_thread_socket[1], &e, sizeof(e), 0); + if (n < 0) { + free(copy); + return -errno; + } + + assert(n == sizeof(e)); + return 0; +} + +int ca_store_finalize(CaStore *store) { + int ret = 0, r; + size_t i; + + assert(store); + + /* Trigger EOF in all worker threads */ + store->worker_thread_socket[1] = safe_close(store->worker_thread_socket[1]); + + for (i = 0; i < store->n_worker_threads; i++) { + void *p; + r = pthread_join(store->worker_threads[i], &p); + if (r != 0) + ret = -r; + if (p != NULL) + ret = -PTR_TO_INT(p); + } + + store->n_worker_threads = 0; + store->worker_thread_socket[0] = safe_close(store->worker_thread_socket[0]); + + /* Propagate errors we ran into while processing store requests. This is useful for callers to + * determine whether the worker threads ran into any problems. 
*/ + return ret; +} + int ca_store_put( CaStore *store, const CaChunkID *chunk_id, @@ -273,6 +489,14 @@ int ca_store_put( store->mkdir_done = true; } + r = submit_to_worker_thread( + store, + chunk_id, + effective_compression, + data, size); + if (r >= 0) + return 0; + return ca_chunk_file_save( AT_FDCWD, store->root, chunk_id, diff --git a/src/castore.h b/src/castore.h index 003bb955..0d2ee5c3 100644 --- a/src/castore.h +++ b/src/castore.h @@ -25,6 +25,8 @@ int ca_store_get(CaStore *store, const CaChunkID *chunk_id, CaChunkCompression d int ca_store_has(CaStore *store, const CaChunkID *chunk_id); int ca_store_put(CaStore *store, const CaChunkID *chunk_id, CaChunkCompression effective_compression, const void *data, uint64_t size); +int ca_store_finalize(CaStore *store); + int ca_store_get_requests(CaStore *s, uint64_t *ret); int ca_store_get_request_bytes(CaStore *s, uint64_t *ret); diff --git a/src/casync-tool.c b/src/casync-tool.c index f9206886..cb7ff19b 100644 --- a/src/casync-tool.c +++ b/src/casync-tool.c @@ -73,6 +73,10 @@ static bool arg_uid_shift_apply = false; static bool arg_mkdir = true; static CaDigestType arg_digest = CA_DIGEST_DEFAULT; static CaCompressionType arg_compression = CA_COMPRESSION_DEFAULT; +static CaCutmark *arg_cutmarks = NULL; +static size_t arg_n_cutmarks = 0; +static bool arg_auto_cutmarks = true; +static uint64_t arg_cutmark_delta_max = 0; static void help(void) { printf("%1$s [OPTIONS...] 
make [ARCHIVE|ARCHIVE_INDEX|BLOB_INDEX] [PATH]\n" @@ -99,6 +103,9 @@ static void help(void) { " --chunk-size=[MIN:]AVG[:MAX]\n" " The minimal/average/maximum number of bytes in a\n" " chunk\n" + " --cutmark=CUTMARK Specify a cutmark\n" + " --cutmark-delta-max=BYTES\n" + " Maximum bytes to shift cut due to cutmark\n" " --digest=DIGEST Pick digest algorithm (sha512-256 or sha256)\n" " --compression=COMPRESSION\n" " Pick compression algorithm (zstd, xz or gzip)\n" @@ -347,6 +354,8 @@ static int parse_argv(int argc, char *argv[]) { ARG_DIGEST, ARG_COMPRESSION, ARG_VERSION, + ARG_CUTMARK, + ARG_CUTMARK_DELTA_MAX, }; static const struct option options[] = { @@ -380,6 +389,8 @@ static int parse_argv(int argc, char *argv[]) { { "mkdir", required_argument, NULL, ARG_MKDIR }, { "digest", required_argument, NULL, ARG_DIGEST }, { "compression", required_argument, NULL, ARG_COMPRESSION }, + { "cutmark", required_argument, NULL, ARG_CUTMARK }, + { "cutmark-delta-max", required_argument, NULL, ARG_CUTMARK_DELTA_MAX }, {} }; @@ -684,6 +695,47 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_CUTMARK: + r = parse_boolean(optarg); + if (r < 0) { + CaCutmark *n; + + n = reallocarray(arg_cutmarks, sizeof(CaCutmark), arg_n_cutmarks + 1); + if (!n) + return log_oom(); + + arg_cutmarks = n; + + r = ca_cutmark_parse(arg_cutmarks + arg_n_cutmarks, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse cutmark specification: %m"); + + arg_n_cutmarks++; + arg_auto_cutmarks = false; + } else { + arg_auto_cutmarks = r; + + arg_cutmarks = mfree(arg_cutmarks); + arg_n_cutmarks = 0; + } + + break; + + case ARG_CUTMARK_DELTA_MAX: { + uint64_t u; + + r = parse_size(optarg, &u); + if (r < 0) + return log_error_errno(r, "Failed to parse cutmark delta: %s", optarg); + if (u == 0) { + log_error("Cutmark delta cannot be zero."); + return -EINVAL; + } + + arg_cutmark_delta_max = u; + break; + } + case '?': return -EINVAL; @@ -810,8 +862,10 @@ static int 
load_feature_flags(CaSync *s, uint64_t default_with_flags) { return 0; } -static int load_chunk_size(CaSync *s) { +static int load_chunk_size(CaSync *s, bool is_catar) { uint64_t cavg, cmin, cmax; + const CaCutmark *cutmarks; + size_t n_cutmarks; int r; if (arg_chunk_size_avg != 0) { @@ -832,6 +886,22 @@ static int load_chunk_size(CaSync *s) { return log_error_errno(r, "Failed to set maximum chunk size to %zu: %m", arg_chunk_size_max); } + if (arg_auto_cutmarks && is_catar) { + r = ca_sync_set_cutmarks_catar(s); + if (r < 0) + return log_error_errno(r, "Failed to set automatic cutmarks: %m"); + } else { + r = ca_sync_set_cutmarks(s, arg_cutmarks, arg_n_cutmarks); + if (r < 0) + return log_error_errno(r, "Failed to set manual cutmarks: %m"); + } + + if (arg_cutmark_delta_max != 0) { + r = ca_sync_set_cutmark_delta_max(s, arg_cutmark_delta_max); + if (r < 0) + return log_error_errno(r, "Failed to set cutmark delta: %m"); + } + if (!arg_verbose) return 1; @@ -848,6 +918,31 @@ static int load_chunk_size(CaSync *s) { return log_error_errno(r, "Failed to read maximum chunk size: %m"); log_info("Selected chunk sizes: min=%" PRIu64 "..avg=%" PRIu64 "..max=%" PRIu64, cmin, cavg, cmax); + + r = ca_sync_get_cutmarks(s, &cutmarks, &n_cutmarks); + if (r < 0) + return log_error_errno(r, "Failed to acquire cutmarks: %m"); + + if (n_cutmarks == 0) + log_info("No cutmarks defined."); + else { + uint64_t delta; + size_t i; + + for (i = 0; i < n_cutmarks; i++) + log_info("Cutmark: %016" PRIx64 "/%016" PRIx64 "%c%"PRIu64, + cutmarks[i].value, + cutmarks[i].mask, + cutmarks[i].delta < 0 ? '-' : '+', + (uint64_t) (cutmarks[i].delta < 0 ? 
-cutmarks[i].delta : cutmarks[i].delta)); + + r = ca_sync_get_cutmark_delta_max(s, &delta); + if (r < 0) + return log_error_errno(r, "Failed to determine cutmark delta: %m"); + + log_info("Maximum cutmark delta: %" PRIu64, delta); + } + return 1; } @@ -904,7 +999,9 @@ static int verbose_print_path(CaSync *s, const char *verb) { static int verbose_print_done_make(CaSync *s) { uint64_t n_chunks = UINT64_MAX, size = UINT64_MAX, n_reused = UINT64_MAX, covering, - n_cache_hits = UINT64_MAX, n_cache_misses = UINT64_MAX, n_cache_invalidated = UINT64_MAX, n_cache_added = UINT64_MAX; + n_cache_hits = UINT64_MAX, n_cache_misses = UINT64_MAX, n_cache_invalidated = UINT64_MAX, n_cache_added = UINT64_MAX, + n_cutmarks_applied = UINT64_MAX; + int64_t cutmark_delta_sum = INT64_MAX; char buffer[FORMAT_BYTES_MAX]; int r; @@ -989,6 +1086,19 @@ static int verbose_print_done_make(CaSync *s) { if (n_cache_hits != UINT64_MAX && n_cache_misses != UINT64_MAX && n_cache_invalidated != UINT64_MAX && n_cache_added != UINT64_MAX) log_info("Cache hits: %" PRIu64 ", misses: %" PRIu64 ", invalidated: %" PRIu64 ", added: %" PRIu64, n_cache_hits, n_cache_misses, n_cache_invalidated, n_cache_added); + r = ca_sync_current_cutmarks_applied(s, &n_cutmarks_applied); + if (r < 0 && r != -ENODATA) + return log_error_errno(r, "Failed to read number of cutmarks: %m"); + if (n_cutmarks_applied != UINT64_MAX) + log_info("Cutmarks applied: %"PRIu64, n_cutmarks_applied); + + r = ca_sync_current_cutmark_delta_sum(s, &cutmark_delta_sum); + if (r < 0 && r != -ENODATA) + return log_error_errno(r, "Failed to read cutmark delta: %m"); + if (cutmark_delta_sum != INT64_MAX) + log_info("Average cutmark delta for all chunks: %0.1f", + n_chunks > 0 ? 
(double) cutmark_delta_sum / n_chunks : 0.0); + return 1; } @@ -1314,7 +1424,7 @@ static int verb_make(int argc, char *argv[]) { if (!s) return log_oom(); - r = load_chunk_size(s); + r = load_chunk_size(s, IN_SET(operation, MAKE_ARCHIVE, MAKE_ARCHIVE_INDEX)); if (r < 0) return r; @@ -2242,7 +2352,7 @@ static int verb_list(int argc, char *argv[]) { if (!s) return log_oom(); - r = load_chunk_size(s); + r = load_chunk_size(s, true); if (r < 0) return r; @@ -2549,7 +2659,7 @@ static int verb_digest(int argc, char *argv[]) { if (!s) return log_oom(); - r = load_chunk_size(s); + r = load_chunk_size(s, IN_SET(operation, DIGEST_ARCHIVE, DIGEST_ARCHIVE_INDEX, DIGEST_DIRECTORY)); if (r < 0) return r; @@ -4013,6 +4123,7 @@ int main(int argc, char *argv[]) { free(arg_cache); strv_free(arg_extra_stores); strv_free(arg_seeds); + free(arg_cutmarks); /* fprintf(stderr, PID_FMT ": exiting with error code: %m", getpid()); */ diff --git a/src/casync.c b/src/casync.c index 9fee77f7..a3daef30 100644 --- a/src/casync.c +++ b/src/casync.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ +#include #include #include #include @@ -152,6 +153,10 @@ struct CaSync { uint64_t first_chunk_request_nsec; uint64_t last_chunk_request_nsec; + + /* List of defined cutmarks */ + CaCutmark *cutmarks; + size_t n_cutmarks; }; #define CA_SYNC_IS_STARTED(s) ((s)->start_nsec != 0) @@ -508,6 +513,8 @@ CaSync *ca_sync_unref(CaSync *s) { ca_digest_free(s->chunk_digest); + free(s->cutmarks); + return mfree(s); } @@ -1079,7 +1086,7 @@ int ca_sync_add_store_path(CaSync *s, const char *path) { return r; } - array = realloc_multiply(s->rstores, sizeof(CaStore*), s->n_rstores+1); + array = reallocarray(s->rstores, sizeof(CaStore*), s->n_rstores+1); if (!array) { ca_store_unref(store); return -ENOMEM; @@ -1110,7 +1117,7 @@ int ca_sync_add_store_remote(CaSync *s, const char *url) { return r; } - array = realloc_multiply(s->remote_rstores, sizeof(CaRemote*), s->n_remote_rstores+1); + array = 
reallocarray(s->remote_rstores, sizeof(CaRemote*), s->n_remote_rstores+1); if (!array) { ca_remote_unref(remote); return -ENOMEM; @@ -1145,7 +1152,7 @@ static int ca_sync_extend_seeds_array(CaSync *s) { assert(s); - new_seeds = realloc_multiply(s->seeds, sizeof(CaSeed*), s->n_seeds+1); + new_seeds = reallocarray(s->seeds, sizeof(CaSeed*), s->n_seeds+1); if (!new_seeds) return -ENOMEM; @@ -1549,6 +1556,14 @@ static int ca_sync_start(CaSync *s) { if (r < 0) return r; + r = ca_index_set_cutmarks(s->index, s->chunker.cutmarks, s->chunker.n_cutmarks); + if (r < 0) + return r; + + r = ca_index_set_cutmark_delta_max(s->index, ca_chunker_cutmark_delta_max(&s->chunker)); + if (r < 0) + return r; + if (s->make_mode != (mode_t) -1) { r = ca_index_set_make_mode(s->index, s->make_mode); if (r < 0 && r != -ENOTTY) @@ -1697,50 +1712,39 @@ static int ca_sync_write_chunks(CaSync *s, const void *p, size_t l, CaLocation * while (l > 0) { _cleanup_(ca_origin_unrefp) CaOrigin *chunk_origin = NULL; + size_t chunk_size; const void *chunk; - size_t chunk_size, k; + int verdict; - k = ca_chunker_scan(&s->chunker, p, l); - if (k == (size_t) -1) { - if (!realloc_buffer_append(&s->buffer, p, l)) - return -ENOMEM; + verdict = ca_chunker_extract_chunk(&s->chunker, &s->buffer, &p, &l, &chunk, &chunk_size); + if (verdict < 0) + return verdict; + if (verdict == CA_CHUNKER_NOT_YET) return 0; - } - - if (realloc_buffer_size(&s->buffer) == 0) { - chunk = p; - chunk_size = k; - } else { - if (!realloc_buffer_append(&s->buffer, p, k)) - return -ENOMEM; - - chunk = realloc_buffer_data(&s->buffer); - chunk_size = realloc_buffer_size(&s->buffer); - } if (s->buffer_origin) { - if (chunk_size == ca_origin_bytes(s->buffer_origin)) { - chunk_origin = s->buffer_origin; - s->buffer_origin = NULL; - } else { - r = ca_origin_extract_bytes(s->buffer_origin, chunk_size, &chunk_origin); - if (r < 0) - return r; - - r = ca_origin_advance_bytes(s->buffer_origin, chunk_size); - if (r < 0) - return r; - } + r = 
ca_origin_extract_bytes(s->buffer_origin, chunk_size, &chunk_origin); + if (r < 0) + return r; } r = ca_sync_write_one_chunk(s, chunk, chunk_size, chunk_origin); if (r < 0) return r; - realloc_buffer_empty(&s->buffer); + /* If the verdict was "indirect", then chunk/chunk_size don't point into p/l but into the + * temporary buffer "buffer". In that case, we need to advance it after using it. */ + if (verdict == CA_CHUNKER_INDIRECT) { + r = realloc_buffer_advance(&s->buffer, chunk_size); + if (r < 0) + return r; + } - p = (const uint8_t*) p + k; - l -= k; + if (s->buffer_origin) { + r = ca_origin_advance_bytes(s->buffer_origin, chunk_size); + if (r < 0) + return r; + } } return 0; @@ -2058,6 +2062,14 @@ static int ca_sync_step_encode(CaSync *s) { if (r < 0) return r; + if (s->wstore) { + /* Make sure the store ends all worker threads and pick up any pending errors from + * it */ + r = ca_store_finalize(s->wstore); + if (r < 0) + return r; + } + r = ca_sync_install_archive(s); if (r < 0) return r; @@ -3058,7 +3070,16 @@ static int ca_sync_propagate_flags_to_stores(CaSync *s, uint64_t flags) { return 0; } -static int ca_sync_propagate_flags_to_seeds(CaSync *s, uint64_t flags, size_t cmin, size_t cavg, size_t cmax) { +static int ca_sync_propagate_flags_to_seeds( + CaSync *s, + uint64_t flags, + size_t cmin, + size_t cavg, + size_t cmax, + const CaCutmark *cutmarks, + size_t n_cutmarks, + uint64_t cutmark_delta_max) { + size_t i; int r; @@ -3073,6 +3094,16 @@ static int ca_sync_propagate_flags_to_seeds(CaSync *s, uint64_t flags, size_t cm r = ca_seed_set_chunk_size(s->seeds[i], cmin, cavg, cmax); if (r < 0) return r; + + if (n_cutmarks > 0) { + r = ca_seed_set_cutmarks(s->seeds[i], cutmarks, n_cutmarks); + if (r < 0) + return r; + + r = ca_seed_set_cutmark_delta_max(s->seeds[i], cutmark_delta_max); + if (r < 0) + return r; + } } return 0; @@ -3124,8 +3155,9 @@ static int ca_sync_propagate_flags_to_decoder(CaSync *s, uint64_t flags) { } static int 
ca_sync_propagate_index_flags(CaSync *s) { - size_t cmin, cavg, cmax; - uint64_t flags; + size_t cmin, cavg, cmax, n_cutmarks; + uint64_t flags, cutmark_delta_max; + const CaCutmark *cutmarks; int r; assert(s); @@ -3155,11 +3187,19 @@ static int ca_sync_propagate_index_flags(CaSync *s) { if (r < 0) return r; + r = ca_index_get_cutmarks(s->index, &cutmarks, &n_cutmarks); + if (r < 0) + return r; + + r = ca_index_get_cutmark_delta_max(s->index, &cutmark_delta_max); + if (r < 0) + return r; + r = ca_sync_propagate_flags_to_stores(s, flags); if (r < 0) return r; - r = ca_sync_propagate_flags_to_seeds(s, flags, cmin, cavg, cmax); + r = ca_sync_propagate_flags_to_seeds(s, flags, cmin, cavg, cmax, cutmarks, n_cutmarks, cutmark_delta_max); if (r < 0) return r; @@ -4514,3 +4554,145 @@ int ca_sync_current_cache_added(CaSync *s, uint64_t *ret) { *ret = s->n_cache_added; return 0; } + +int ca_sync_set_cutmarks(CaSync *s, const CaCutmark *c, size_t n) { + if (!s) + return -EINVAL; + + if (n == 0) { + s->cutmarks = mfree(s->cutmarks); + s->n_cutmarks = 0; + } else { + _cleanup_free_ CaCutmark *copy = NULL; + size_t i; + + if (!c) + return -EINVAL; + + copy = newdup(CaCutmark, c, n); + if (!copy) + return -ENOMEM; + + /* Bring into a defined order */ + ca_cutmark_sort(copy, n); + + if (copy[0].mask == 0) + return -EINVAL; + + /* Refuse duplicate and bad entries */ + for (i = 1; i < n; i++) { + if (ca_cutmark_cmp(copy + i - 1, copy + i) == 0) + return -ENOTUNIQ; + + if (copy[i].mask == 0) + return -EINVAL; + } + + free_and_replace(s->cutmarks, copy); + s->n_cutmarks = n; + } + + /* Propagate to chunker */ + s->chunker.cutmarks = s->cutmarks; + s->chunker.n_cutmarks = s->n_cutmarks; + + return 0; +} + +int ca_sync_set_cutmarks_catar(CaSync *s) { + + /* Set the cutmarks in a way suitable for catar streams */ + + static const CaCutmark catar_cutmarks[] = { + { + /* Cut before each new directory entry */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + .value = CA_FORMAT_ENTRY, +#else + .value 
= __builtin_bswap64(CA_FORMAT_ENTRY), +#endif + .mask = UINT64_C(0xFFFFFFFFFFFFFFFF), + .delta = -8, + }, + }; + + if (!s) + return -EINVAL; + + s->cutmarks = mfree(s->cutmarks); + s->n_cutmarks = 0; + + s->chunker.cutmarks = catar_cutmarks; + s->chunker.n_cutmarks = ELEMENTSOF(catar_cutmarks); + + return 0; +} + +int ca_sync_set_cutmark_delta_max(CaSync *s, uint64_t m) { + if (!s) + return -EINVAL; + if (m == 0) + return -EINVAL; + + s->chunker.cutmark_delta_max = m; + return 0; +} + +int ca_sync_get_cutmarks(CaSync *s, const CaCutmark **ret_cutmarks, size_t *ret_n) { + if (!s) + return -EINVAL; + if (!ret_cutmarks) + return -EINVAL; + if (!ret_n) + return -EINVAL; + + /* We return the cutmarks configured in the chunker instead of the ones configured in the CaSync + * object, since the latter are not set if we use the 'catar' default cutmarks. */ + + *ret_cutmarks = s->chunker.cutmarks; + *ret_n = s->chunker.n_cutmarks; + return 0; +} + +int ca_sync_get_cutmark_delta_max(CaSync *s, uint64_t *ret) { + if (!s) + return -EINVAL; + if (!ret) + return -EINVAL; + + *ret = ca_chunker_cutmark_delta_max(&s->chunker); + return 0; +} + +int ca_sync_current_cutmark_delta_sum(CaSync *s, int64_t *ret) { + if (!s) + return -EINVAL; + if (!ret) + return -EINVAL; + + if (s->direction != CA_SYNC_ENCODE) + return -ENODATA; + if (!s->wstore && !s->cache_store && !s->index) + return -ENODATA; + + *ret = s->chunker.cutmark_delta_sum; + return 0; +} + +int ca_sync_current_cutmarks_applied(CaSync *s, uint64_t *ret) { + if (!s) + return -EINVAL; + if (!ret) + return -EINVAL; + + if (s->direction != CA_SYNC_ENCODE) + return -ENODATA; + if (!s->wstore && !s->cache_store && !s->index) + return -ENODATA; + + if (s->chunker.n_cutmarks == 0) + return -ENODATA; + + *ret = s->chunker.n_cutmarks_applied; + return 0; +} diff --git a/src/casync.h b/src/casync.h index 982f67e5..01392ad9 100644 --- a/src/casync.h +++ b/src/casync.h @@ -9,6 +9,7 @@ #include "cachunk.h" #include "cachunkid.h" #include 
"cacommon.h" +#include "cacutmark.h" #include "caorigin.h" typedef struct CaSync CaSync; @@ -123,6 +124,16 @@ int ca_sync_set_chunk_size_min(CaSync *s, uint64_t v); int ca_sync_set_chunk_size_avg(CaSync *s, uint64_t v); int ca_sync_set_chunk_size_max(CaSync *s, uint64_t v); +int ca_sync_set_cutmarks(CaSync *s, const CaCutmark *c, size_t n); +int ca_sync_set_cutmarks_catar(CaSync *s); +int ca_sync_set_cutmark_delta_max(CaSync *s, uint64_t m); + +int ca_sync_get_cutmarks(CaSync *s, const CaCutmark **ret_cutmarks, size_t *ret_n); +int ca_sync_get_cutmark_delta_max(CaSync *s, uint64_t *ret); + +int ca_sync_current_cutmark_delta_sum(CaSync *s, int64_t *ret); +int ca_sync_current_cutmarks_applied(CaSync *s, uint64_t *ret); + int ca_sync_get_chunk_size_avg(CaSync *s, uint64_t *ret); int ca_sync_get_chunk_size_min(CaSync *s, uint64_t *ret); int ca_sync_get_chunk_size_max(CaSync *s, uint64_t *ret); diff --git a/src/meson.build b/src/meson.build index 5b1ce782..2672c27d 100644 --- a/src/meson.build +++ b/src/meson.build @@ -10,6 +10,8 @@ util_sources = files(''' '''.split()) libshared_sources = files(''' + affinity-count.c + affinity-count.h cacache.c cacache.h cachunk.c @@ -21,6 +23,8 @@ libshared_sources = files(''' cacommon.h cacompression.c cacompression.h + cacutmark.c + cacutmark.h cadecoder.c cadecoder.h cadigest.c diff --git a/src/util.c b/src/util.c index 299e4bba..64922341 100644 --- a/src/util.c +++ b/src/util.c @@ -429,6 +429,14 @@ int unhexchar(char c) { return -EINVAL; } +int undecchar(char c) { + + if (c >= '0' && c <= '9') + return c - '0'; + + return -EINVAL; +} + char *hexmem(const void *p, size_t l) { const uint8_t *x; char *r, *z; @@ -919,7 +927,7 @@ int strv_push(char ***l, char *value) { if (m < n) return -ENOMEM; - c = realloc_multiply(*l, sizeof(char*), m); + c = reallocarray(*l, sizeof(char*), m); if (!c) return -ENOMEM; diff --git a/src/util.h b/src/util.h index dc9d03e4..f4b62914 100644 --- a/src/util.h +++ b/src/util.h @@ -24,10 +24,32 @@ #include 
"gcc-macro.h" #include "log.h" -#define new(t, n) ((t*) malloc((n) * sizeof(t))) -#define new0(t, n) ((t*) calloc((n), sizeof(t))) +/* If for some reason more than 4M are allocated on the stack, let's abort immediately. It's better than + * proceeding and smashing the stack limits. Note that by default RLIMIT_STACK is 8M on Linux. */ +#define ALLOCA_MAX (4U*1024U*1024U) -#define newa(t, n) ((t*) alloca((n) * sizeof(t))) +#define new(t, n) ((t*) malloc_multiply(sizeof(t), (n))) +#define new0(t, n) ((t*) calloc((n) ?: 1, sizeof(t))) + +#define newa(t, n) \ + ({ \ + size_t _n_ = n; \ + assert(!size_multiply_overflow(sizeof(t), _n_)); \ + assert(sizeof(t)*_n_ <= ALLOCA_MAX); \ + (t*) alloca(sizeof(t)*_n_); \ + }) + +#define newa0(t, n) \ + ({ \ + size_t _n_ = n; \ + assert(!size_multiply_overflow(sizeof(t), _n_)); \ + assert(sizeof(t)*_n_ <= ALLOCA_MAX); \ + (t*) alloca0(sizeof(t)*_n_); \ + }) + +#define newdup(t, p, n) ((t*) memdup_multiply(p, sizeof(t), (n))) + +#define malloc0(n) (calloc(1, (n))) #define XCONCATENATE(x, y) x ## y #define CONCATENATE(x, y) XCONCATENATE(x, y) @@ -63,8 +85,6 @@ ((_A) > (_B)) ? 
(_A) : (_B), \ (void)0)) - - int loop_write(int fd, const void *p, size_t l); int loop_write_block(int fd, const void *p, size_t l); ssize_t loop_read(int fd, void *p, size_t l); @@ -226,6 +246,7 @@ static inline uint64_t random_u64(void) { char hexchar(int x); int unhexchar(char c); +int undecchar(char c); char octchar(int x); char *hexmem(const void *p, size_t l); @@ -330,23 +351,40 @@ char *strv_find(char **l, const char *name) _pure_; #define strv_contains(l, s) (!!strv_find((l), (s))) static inline bool size_multiply_overflow(size_t size, size_t need) { - return need != 0 && size > (SIZE_MAX / need); + return _unlikely_(need != 0 && size > (SIZE_MAX / need)); } _malloc_ _alloc_(1, 2) static inline void *malloc_multiply(size_t size, size_t need) { - if (_unlikely_(size_multiply_overflow(size, need))) + if (size_multiply_overflow(size, need)) + return NULL; + + return malloc(size * need ?: 1); +} + +#if !HAVE_REALLOCARRAY +_alloc_(2, 3) static inline void *reallocarray(void *p, size_t need, size_t size) { + if (size_multiply_overflow(size, need)) return NULL; - return malloc(size * need); + return realloc(p, size * need ?: 1); } +#endif -_alloc_(2, 3) static inline void *realloc_multiply(void *p, size_t size, size_t need) { - if (_unlikely_(size_multiply_overflow(size, need))) +_alloc_(2, 3) static inline void *memdup_multiply(const void *p, size_t size, size_t need) { + if (size_multiply_overflow(size, need)) return NULL; - return realloc(p, size * need); + return memdup(p, size * need); } +#define free_and_replace(a, b) \ + ({ \ + free(a); \ + (a) = (b); \ + (b) = NULL; \ + 0; \ + }) + #define STRV_FOREACH(s, l) \ for ((s) = (l); (s) && *(s); (s)++) @@ -780,4 +818,13 @@ int read_line(FILE *f, size_t limit, char **ret); char *delete_trailing_chars(char *s, const char *bad); char *strstrip(char *s); +#define CMP(a, b) __CMP(UNIQ, (a), UNIQ, (b)) +#define __CMP(aq, a, bq, b) \ + ({ \ + const typeof(a) UNIQ_T(A, aq) = (a); \ + const typeof(b) UNIQ_T(B, bq) = (b); 
\ + UNIQ_T(A, aq) < UNIQ_T(B, bq) ? -1 : \ + UNIQ_T(A, aq) > UNIQ_T(B, bq) ? 1 : 0; \ + }) + #endif diff --git a/test/test-cachunker-histogram.c b/test/test-cachunker-histogram.c index 5f82232a..2b393fb0 100644 --- a/test/test-cachunker-histogram.c +++ b/test/test-cachunker-histogram.c @@ -41,7 +41,7 @@ static void* process(void *q) { for (;;) { size_t n; - n = ca_chunker_scan(&t->chunker, p, l); + n = ca_chunker_scan(&t->chunker, true, p, l); if (n == (size_t) -1) { previous += l; break; diff --git a/test/test-cachunker.c b/test/test-cachunker.c index abacf37b..0ae75fed 100644 --- a/test/test-cachunker.c +++ b/test/test-cachunker.c @@ -1,8 +1,9 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ #include -#include +#include #include +#include #include "cachunker.h" #include "util.h" @@ -62,7 +63,7 @@ static void test_chunk(void) { for (;;) { size_t k; - k = ca_chunker_scan(&x, p, n); + k = ca_chunker_scan(&x, true, p, n); if (k == (size_t) -1) { acc += n; break; @@ -142,11 +143,100 @@ static int test_set_size(void) { return 0; } +static const ssize_t CUTMARK_BUFFER_SIZE = (1024*1024); + +static void test_cutmarks(void) { + struct CaChunker x = CA_CHUNKER_INIT; + struct CaCutmark marks[] = { + { + .value = htole64(UINT64_C(0xEBCDABCDABCDABCF)), + .mask = htole64(UINT64_C(0xFFFFFFFFFFFFFFFF)), + .delta = -8, + }, + { + .value = htole64(UINT64_C(0x00EFFEFFEE000000)), + .mask = htole64(UINT64_C(0x00FFFFFFFF000000)), + .delta = -2, + }, + { + .value = htole64(UINT64_C(0x1122113311441155)), + .mask = htole64(UINT64_C(0xFFFFFFFFFFFFFFFF)), + .delta = 3, + }, + + }; + + _cleanup_free_ uint8_t *buffer = NULL; + uint8_t *p; + size_t n, offset = 0, i; + unsigned found = 0; + uint32_t z; + + x.cutmarks = marks; + x.n_cutmarks = ELEMENTSOF(marks); + + buffer = new0(uint8_t, CUTMARK_BUFFER_SIZE); + assert_se(buffer); + + /* Fill an array with (constant) rubbish */ + srand(0); + for (i = 0; i < (size_t) CUTMARK_BUFFER_SIZE; i++) + buffer[i] = (uint8_t) rand(); + + /* Insert the cutmarks 
at five places */ + z = le64toh(marks[1].value) >> 24; + memcpy(buffer + 444, &marks[0].value, 8); + memcpy(buffer + CUTMARK_BUFFER_SIZE/3-9, &z, 4); + memcpy(buffer + CUTMARK_BUFFER_SIZE/2+7, &marks[0].value, 8); + memcpy(buffer + CUTMARK_BUFFER_SIZE/3*2+5, &marks[2].value, 8); + memcpy(buffer + CUTMARK_BUFFER_SIZE - 333, &z, 4); + + p = buffer; + n = CUTMARK_BUFFER_SIZE; + + for (;;) { + size_t k; + ssize_t cm; + + k = ca_chunker_scan(&x, true, p, n); + + if (k == (size_t) -1) { + offset += n; + p += n; + n = 0; + } else { + offset += k; + p += k; + n -= k; + } + + cm = offset - x.last_cutmark; + + if (cm == 444) + found |= 1; + else if (cm == CUTMARK_BUFFER_SIZE/3-9+3) + found |= 2; + else if (cm == CUTMARK_BUFFER_SIZE/2+7) + found |= 4; + else if (cm == CUTMARK_BUFFER_SIZE/3*2+5+8+3) + found |= 8; + else if (cm == CUTMARK_BUFFER_SIZE - 333+3) + found |= 16; + + if (n == 0) + break; + } + + /* Make sure we found all five cutmarks */ + assert_se(found == 31); +} + int main(int argc, char *argv[]) { test_rolling(); test_chunk(); test_set_size(); + test_cutmarks(); return 0; } diff --git a/test/test-cacutmark.c b/test/test-cacutmark.c new file mode 100644 index 00000000..973b6ee1 --- /dev/null +++ b/test/test-cacutmark.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +
+#include "cacutmark.h" +#include "util.h" + +static void test_cutmark_parse_one(const char *s, int ret, uint64_t value, uint64_t mask, int64_t delta) { + CaCutmark c = {}; + + assert_se(ca_cutmark_parse(&c, s) == ret); + + assert_se(c.value == value); + assert_se(c.mask == mask); + assert_se(c.delta == delta); +} + +static void test_cutmark_parse(void) { + test_cutmark_parse_one("aaaaa", 0, 0xaaaaaU, 0xfffffU, 0); + test_cutmark_parse_one("0/1", 0, 0, 1, 0); + test_cutmark_parse_one("ff/ff+99", 0, 0xffU, 0xffU, 99); + test_cutmark_parse_one("ff/ff-99", 0, 0xffU, 0xffU, -99); + test_cutmark_parse_one("abc+99", 0, 0xabcU, 0xfffU, 99); + test_cutmark_parse_one("abc-99", 0, 0xabcU, 0xfffU, 
-99); + test_cutmark_parse_one("abc/eee", 0, 0xabcU, 0xeeeU, 0); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0+2147483647", 0, 0xabcdef0123456789U, 0x123456789abcdef0U, 2147483647); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0-2147483648", 0, 0xabcdef0123456789U, 0x123456789abcdef0U, -2147483648); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0+2147483647", 0, 0xabcdef0123456789U, 0x123456789abcdef0U, 2147483647); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0+9223372036854775807", 0, 0xabcdef0123456789U, 0x123456789abcdef0U, INT64_MAX); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0-9223372036854775808", 0, 0xabcdef0123456789U, 0x123456789abcdef0U, INT64_MIN); + test_cutmark_parse_one("1000000000000000/1000000000000000+0", 0, 0x1000000000000000U, 0x1000000000000000U, 0); + test_cutmark_parse_one("1000000000000000/1000000000000000-0", 0, 0x1000000000000000U, 0x1000000000000000U, 0); + + test_cutmark_parse_one("", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("fg", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("/", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("/f", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("+", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("+1", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("-", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("-1", -EINVAL, 0, 0, 0); + test_cutmark_parse_one("0/0", -EINVAL, 0, 0, 0); + + test_cutmark_parse_one("10000000000000000", -EOVERFLOW, 0, 0, 0); + test_cutmark_parse_one("0/10000000000000000", -EOVERFLOW, 0, 0, 0); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0+9223372036854775808", -EOVERFLOW, 0, 0, 0); + test_cutmark_parse_one("abcdef0123456789/123456789abcdef0-9223372036854775809", -EOVERFLOW, 0, 0, 0); +} + +int main(int argc, char *argv[]) { + + test_cutmark_parse(); + + return 0; +} diff --git a/test/test-seed.sh.in b/test/test-seed.sh.in new file mode 100755 index 00000000..a059371f --- /dev/null +++ b/test/test-seed.sh.in @@ -0,0 
+1,26 @@ +#!/bin/bash + +set -ex + +SCRATCH_DIR=${TMPDIR:-/var/tmp}/test-casync.$RANDOM +mkdir -p $SCRATCH_DIR/src + +mkdir $SCRATCH_DIR/src/casync +if [ $UID = 0 ]; then + cp -a @top_srcdir@/{test-files,src} $SCRATCH_DIR/src/casync/ +else + # If we lack privileges we use rsync rather than cp to copy, as it will just skip over device nodes + rsync -a --exclude=.cacac @top_srcdir@/{test-files,src} $SCRATCH_DIR/src/casync/ +fi + +cd $SCRATCH_DIR/src + +@top_builddir@/casync make -v $SCRATCH_DIR/test.caidx +@top_builddir@/casync extract -v $SCRATCH_DIR/test.caidx $SCRATCH_DIR/first +@top_builddir@/casync extract -v --seed=$SCRATCH_DIR/src $SCRATCH_DIR/test.caidx $SCRATCH_DIR/second + +# Now, let's flush out the chunk store. If everything works correctly, the seed +# should be sufficient as a source for chunks +rm -rf $SCRATCH_DIR/default.castr/* + +@top_builddir@/casync extract -v --seed=$SCRATCH_DIR/src $SCRATCH_DIR/test.caidx $SCRATCH_DIR/third