From c08692df2c43c4c4e71691b83f43e4012aa6315e Mon Sep 17 00:00:00 2001 From: edef Date: Thu, 28 Apr 2022 20:36:48 +0000 Subject: ripple/fossil: deduplicate using content-defined chunking This implements casync-style content-defined chunking and deduplication. Change-Id: I42a9b98e1140bed462a5ae1e0aba508bebc9fa0e --- ripple/fossil/src/chunker/buz.rs | 102 +++++++++ ripple/fossil/src/chunker/mod.rs | 432 +++++++++++++++++++++++++++++++++++++++ ripple/fossil/src/lib.rs | 112 +++++++--- ripple/fossil/src/store.proto | 8 +- 4 files changed, 627 insertions(+), 27 deletions(-) create mode 100644 ripple/fossil/src/chunker/buz.rs create mode 100644 ripple/fossil/src/chunker/mod.rs (limited to 'ripple/fossil') diff --git a/ripple/fossil/src/chunker/buz.rs b/ripple/fossil/src/chunker/buz.rs new file mode 100644 index 0000000..7e10b9f --- /dev/null +++ b/ripple/fossil/src/chunker/buz.rs @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: edef +// SPDX-License-Identifier: OSL-3.0 + +use std::mem; + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Hash(pub u32); + +impl Hash { + pub fn from(data: &[u8]) -> Hash { + let mut x: u32 = 0; + for &b in data { + x = x.rotate_left(1) ^ expand(b); + } + Hash(x) + } +} + +#[derive(Clone)] +pub struct Rolling { + i: usize, + x: u32, + /// `window[i]` is the least recently written byte + /// `window[i-1]` is the most recently written byte + window: [u8; N], +} + +impl Rolling { + pub fn from_slice(input: &[u8]) -> Rolling { + Self::try_from_slice(input).expect("need at least Rolling::WINDOW_SIZE bytes") + } + + pub fn try_from_slice(input: &[u8]) -> Option> { + let last_chunk = input.rchunks_exact(N).next()?; + + let mut window = [0; N]; + window.copy_from_slice(last_chunk); + + Some(Self::new(&window)) + } + + pub fn new(window: &[u8; N]) -> Rolling { + Rolling { + i: 0, + x: Hash::from(window).0, + window: *window, + } + } + + pub fn push(&mut self, next: u8) { + unsafe { + let ptr = self.window.get_unchecked_mut(self.i as usize); + let prev = mem::replace(ptr, next); + + self.i = (self.i + 1) % N; + self.x = self.x.rotate_left(1) ^ expand(prev).rotate_left(N as u32) ^ expand(next); + } + } + + pub fn sum(&self) -> Hash { + Hash(self.x) + } +} + +fn expand(byte: u8) -> u32 { + TABLE[byte as usize] +} + +const TABLE: [u32; 256] = [ + 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, + 0x7ebf5191, 0x841135c7, 0x65cc53b3, 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, + 0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa, 0x882bf287, 0x3116737c, 0x05569956, 0xe8cc1f68, + 0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7, 0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93, + 0x9bfd7c64, 0x0b3e7276, 0xf2688e77, 0x8fad8abc, 0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00, + 0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2, 0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2, + 0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e, 0x7b7c222f, 0x2955ed16, 0x9f10ca59, 0xe840c4c9, + 0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c, 0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5, + 0xb19165cd, 0x9891c393, 0x325384ac, 0x0308459d, 0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed, + 0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae, 0x977eb18c, 0xd8770976, 0x9833466a, 0xc674df7f, + 0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874, 0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4, + 0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493, 0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85, + 0xbef8f0e1, 0x21d73653, 0x4e3d977a, 0x1e7b3929, 0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6, + 0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617, 0xf5f7be70, 0xe795248a, 0x375a2fe9, 0x425570b6, + 0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f, 0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f, + 0x1bc0dfb5, 0xfb273589, 0x0ea47f7b, 0x3c1c2b50, 0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2, + 0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c, 0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b, + 0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729, 0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, 0xe0d8f8ae, + 0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328, 0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f, + 0x686a5b83, 0x50e072e5, 0xd9d3bb2a, 0x8befc475, 0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18, + 0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09, 0xc0d0a81c, 0x7fa3429b, 0xe9158a1b, 0x225ea19a, + 0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293, 0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140, + 0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661, 0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9, + 0x19727a23, 0x15a7e374, 0xc43a18d5, 0x3fb1aa73, 0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9, + 0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2, 0x5388e5ee, 0xcd8a7510, 0xf901b4fd, 0xdbc13dbc, + 0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49, 0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b, + 0x32baf4a9, 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a, 0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c, + 0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1, 0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319, + 0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1, 0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, 0xc6eb57bb, + 0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c, 0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b, + 0x329e5388, 0x91dd236b, 0x2ecb0d93, 0xf4d82a3d, 0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964, + 0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131, 0x7bf7cabc, 0xf9c18d66, 0x593ade65, 0xd95ddf11, +]; diff --git a/ripple/fossil/src/chunker/mod.rs b/ripple/fossil/src/chunker/mod.rs new file mode 100644 index 0000000..0045646 --- /dev/null +++ b/ripple/fossil/src/chunker/mod.rs @@ -0,0 +1,432 @@ +// SPDX-FileCopyrightText: edef +// SPDX-License-Identifier: OSL-3.0 + +use std::mem; + +mod buz; + +pub const MIN_CHUNK_SIZE: usize = AVG_CHUNK_SIZE / 4; +pub const AVG_CHUNK_SIZE: usize = 64 * 1024; +pub const MAX_CHUNK_SIZE: usize = AVG_CHUNK_SIZE * 4; + +fn discriminator_from_average(avg: u64) -> u32 { + let avg = avg as f64; + (avg / (-1.42888852e-7 * avg + 1.33237515)) as u32 +} + +#[derive(Clone)] +pub struct Chunker<'a> { + buffer: &'a [u8], +} + +impl<'a> Chunker<'a> { + pub fn from(buffer: &'a [u8]) -> Chunker<'a> { + Chunker { buffer } + } +} + +impl<'a> Iterator for Chunker<'a> { + type Item = &'a [u8]; + fn next(&mut self) -> Option<&'a [u8]> { + if self.buffer.is_empty() { + return None; + } + + if self.buffer.len() <= MIN_CHUNK_SIZE { + return Some(mem::take(&mut self.buffer)); + } + + let bytes = self + .buffer + .iter() + .cloned() + .enumerate() + .take(MAX_CHUNK_SIZE) + .skip(MIN_CHUNK_SIZE); + + let d = discriminator_from_average(AVG_CHUNK_SIZE as u64); + let mut hasher = buz::Rolling::<48>::from_slice(&self.buffer[..MIN_CHUNK_SIZE]); + let chunk; + for (idx, byte) in bytes { + hasher.push(byte); + let buz::Hash(x) = hasher.sum(); + if x % d == d.wrapping_sub(1) { + // split point + (chunk, self.buffer) = self.buffer.split_at(idx + 1); + return Some(chunk); + } + } + + (chunk, self.buffer) = self.buffer.split_at(MAX_CHUNK_SIZE.min(self.buffer.len())); + Some(chunk) + } + + fn size_hint(&self) -> (usize, Option) { + if self.buffer.is_empty() { + return (0, Some(0)); + } + let min = (self.buffer.len() + MAX_CHUNK_SIZE - 1) / MAX_CHUNK_SIZE; + let max = (self.buffer.len() + MIN_CHUNK_SIZE - 1) / MIN_CHUNK_SIZE; + (min, Some(max)) + } +} + +#[cfg(test)] +mod test { + use std::io::Read; + + fn generate(length: usize) -> Vec { + let mut h = blake3::Hasher::new(); + h.update(b"test vector"); + let mut buf = vec![0; length]; + h.finalize_xof().read_exact(&mut buf).unwrap(); + buf + } + + const GOLDEN: &[[u8; 32]] = &[ + [ + 0x66, 0x77, 0xd1, 0xf7, 0x78, 0xbf, 0x7b, 0x42, 0x91, 0x15, 0x16, 0xda, 0xde, 0xae, + 0x69, 0x17, 0x59, 0xe9, 0x73, 0xe8, 0x5b, 0xa2, 0xa0, 0x92, 0x6f, 0x98, 0x76, 0x8c, + 0xbb, 0xb4, 0x74, 0xc7, + ], + [ + 0xad, 0xd6, 0xc0, 0x6e, 0x0e, 0xef, 0xb5, 0x3b, 0x1f, 0x8b, 0xeb, 0xac, 0x21, 0xa8, + 0xf7, 0x0c, 0xc9, 0xeb, 0xc8, 0x20, 0x22, 0xf0, 0xfe, 0x0a, 0x72, 0x15, 0x15, 0x21, + 0x32, 0x7d, 0xe5, 0x0a, + ], + [ + 0xbc, 0xd0, 0x68, 0x57, 0x0a, 0x66, 0x70, 0xf3, 0x3e, 0x81, 0xc9, 0xa8, 0x7d, 0xac, + 0xd9, 0xde, 0x80, 0xb9, 0x8a, 0x19, 0x8d, 0x54, 0xcd, 0xd7, 0x5e, 0x76, 0xca, 0x0f, + 0xb0, 0x0d, 0x53, 0x11, + ], + [ + 0x91, 0x16, 0x6f, 0x72, 0x92, 0x3d, 0x00, 0xbb, 0x46, 0x82, 0x7a, 0xc5, 0x35, 0xda, + 0x0c, 0x6e, 0x20, 0xbc, 0x1a, 0xb0, 0xb8, 0x40, 0x58, 0x0b, 0xf4, 0xc4, 0xe3, 0xbe, + 0xb7, 0x81, 0xeb, 0xd8, + ], + [ + 0x51, 0x59, 0xea, 0xdf, 0xd1, 0x41, 0x38, 0x11, 0xa9, 0x91, 0xb3, 0xf4, 0x91, 0x8c, + 0x7b, 0x61, 0x38, 0x4a, 0x04, 0x59, 0x1f, 0xe9, 0xc1, 0xa6, 0x01, 0x19, 0x32, 0x5b, + 0xfa, 0x3b, 0x24, 0x92, + ], + [ + 0x7c, 0x64, 0x0c, 0xa5, 0xe8, 0x3a, 0x25, 0xeb, 0xaf, 0xe3, 0x1e, 0x5b, 0x58, 0xd4, + 0x8b, 0xb8, 0x0a, 0xa5, 0x50, 0x5d, 0x18, 0x8a, 0x79, 0x38, 0xe3, 0x71, 0x61, 0x6d, + 0x24, 0x4b, 0x0d, 0x07, + ], + [ + 0x6c, 0x4b, 0xc6, 0x17, 0xf2, 0xaf, 0x9b, 0xa6, 0x81, 0xd4, 0x1b, 0xf1, 0x77, 0xce, + 0xcf, 0x0b, 0xb7, 0xbf, 0x3a, 0x54, 0xc7, 0x26, 0x5f, 0x8c, 0x43, 0x5a, 0x40, 0x61, + 0xf5, 0x5c, 0x47, 0x75, + ], + [ + 0x98, 0x74, 0xd1, 0x45, 0x30, 0xe9, 0x47, 0x0c, 0xcf, 0x76, 0x52, 0x65, 0x85, 0x96, + 0xf6, 0x90, 0x6f, 0x1d, 0x84, 0x24, 0xe1, 0x4f, 0x08, 0x55, 0x03, 0xf8, 0xe8, 0x8f, + 0x10, 0xb4, 0xec, 0xc8, + ], + [ + 0xd2, 0x87, 0x27, 0x3d, 0xa5, 0x7d, 0x31, 0x75, 0x56, 0x82, 0xb7, 0x13, 0x4c, 0xea, + 0x68, 0xbf, 0xc4, 0x1c, 0x0b, 0xb9, 0xce, 0x64, 0xb1, 0x90, 0xfb, 0xde, 0x2a, 0xd8, + 0xce, 0xc0, 0x1b, 0x2c, + ], + [ + 0x68, 0xbd, 0xfb, 0x6c, 0x99, 0xf5, 0x3e, 0x82, 0xf6, 0x9e, 0x6a, 0xa6, 0x01, 0x56, + 0x00, 0x3f, 0xab, 0x8e, 0x9c, 0xf7, 0x98, 0x93, 0x1a, 0x70, 0xef, 0x2b, 0x19, 0x5b, + 0x1c, 0xf0, 0x1e, 0x38, + ], + [ + 0xae, 0x9f, 0xa2, 0x52, 0x89, 0x7b, 0xda, 0xcf, 0x65, 0x09, 0x93, 0xd1, 0xe0, 0x96, + 0x73, 0xb6, 0xfb, 0xa6, 0x0f, 0x58, 0x02, 0xec, 0x79, 0x22, 0xf8, 0xa7, 0xea, 0xe8, + 0x16, 0x60, 0x6d, 0x1b, + ], + [ + 0x8b, 0x9e, 0x83, 0xc2, 0x45, 0x75, 0x9c, 0x3a, 0xf4, 0x0f, 0x4b, 0xb4, 0xbd, 0x82, + 0xa1, 0x62, 0x66, 0xde, 0xd3, 0x2f, 0xda, 0xf6, 0xfc, 0x12, 0xc9, 0x57, 0x40, 0xe2, + 0x18, 0x90, 0x89, 0x67, + ], + [ + 0x6b, 0x0d, 0x3f, 0x9b, 0xec, 0x24, 0xb2, 0xf7, 0x90, 0x8c, 0x75, 0x28, 0x57, 0xc9, + 0x6f, 0x9e, 0xa7, 0x78, 0x94, 0x31, 0xaf, 0xa4, 0xa4, 0x61, 0x9c, 0x6e, 0xa7, 0xf9, + 0x64, 0x60, 0x48, 0xcd, + ], + [ + 0xbd, 0x04, 0x1a, 0x0c, 0x3e, 0xe2, 0x3e, 0x3a, 0xe6, 0x6c, 0x5b, 0xcd, 0x8d, 0x8a, + 0x1e, 0xfe, 0x4d, 0xd5, 0xac, 0x14, 0xd5, 0xe6, 0x59, 0x40, 0x0e, 0xa0, 0xcf, 0x75, + 0x32, 0xc6, 0xb6, 0x7a, + ], + [ + 0x66, 0x73, 0x94, 0x96, 0x23, 0x81, 0x69, 0x26, 0x34, 0x94, 0x4d, 0x20, 0xf3, 0x6d, + 0x67, 0x3d, 0xb6, 0xb9, 0x9b, 0xc7, 0xa7, 0x7c, 0x22, 0xc3, 0x56, 0xd1, 0x58, 0xe5, + 0x4e, 0x55, 0x0a, 0x33, + ], + [ + 0x26, 0xdd, 0xfc, 0x31, 0xac, 0xfc, 0xd2, 0x57, 0x9d, 0x52, 0x65, 0xdc, 0xf3, 0xe0, + 0x8a, 0xde, 0x13, 0x04, 0x5f, 0xee, 0xc4, 0xf4, 0x35, 0xce, 0xb6, 0xcf, 0xd1, 0x7f, + 0xc0, 0xc5, 0x4b, 0x7f, + ], + [ + 0xe2, 0xd5, 0x6a, 0xb7, 0xdb, 0x71, 0x1d, 0x4d, 0xb2, 0xb4, 0xdb, 0xee, 0x94, 0x4c, + 0xac, 0x92, 0x6e, 0xdc, 0xf3, 0xfd, 0x92, 0x44, 0xf1, 0x79, 0x66, 0x2a, 0x1e, 0x70, + 0xd6, 0x2e, 0xb4, 0x6b, + ], + [ + 0x86, 0x5d, 0x5f, 0x60, 0x75, 0x3d, 0xff, 0x7b, 0xac, 0x50, 0xa3, 0x11, 0x82, 0x7a, + 0xb8, 0xfe, 0xde, 0xbc, 0xb1, 0x0a, 0xfd, 0xea, 0xa8, 0x08, 0xaa, 0x28, 0xfc, 0x95, + 0x72, 0x04, 0x72, 0x13, + ], + [ + 0xa6, 0x2d, 0xcc, 0x3a, 0x2e, 0x8e, 0xe9, 0xf2, 0x87, 0xc7, 0xcc, 0xa8, 0xc8, 0xa3, + 0x18, 0x33, 0xc9, 0x43, 0x31, 0x4a, 0xab, 0xfe, 0xaf, 0xc1, 0x1d, 0xc6, 0x22, 0x3d, + 0xf3, 0x54, 0x45, 0xe2, + ], + [ + 0x75, 0x1d, 0x1f, 0x3d, 0x94, 0x8e, 0x7d, 0x27, 0x22, 0x7f, 0x1a, 0x0e, 0xcc, 0x61, + 0xef, 0xbd, 0x41, 0x16, 0x53, 0xb9, 0x5b, 0x78, 0x05, 0x06, 0x80, 0x60, 0xa3, 0x17, + 0x4f, 0x01, 0x10, 0x51, + ], + [ + 0x4e, 0x0f, 0x4d, 0x71, 0x65, 0x10, 0xa5, 0x81, 0xff, 0xfe, 0x62, 0x6b, 0x15, 0x94, + 0x17, 0x4c, 0x6d, 0x12, 0x85, 0xbe, 0x74, 0xcb, 0x56, 0x04, 0xc7, 0x38, 0x8f, 0xb7, + 0x6e, 0xaa, 0xfd, 0x12, + ], + [ + 0xe4, 0xb8, 0x64, 0x03, 0x92, 0xe5, 0x71, 0x89, 0x31, 0xfc, 0xc7, 0xe9, 0xee, 0xcd, + 0x92, 0xfb, 0x92, 0xb0, 0x1d, 0x32, 0x34, 0xa1, 0x91, 0xd4, 0x7f, 0x5b, 0x8b, 0x37, + 0xb8, 0x60, 0x2d, 0x6a, + ], + [ + 0xc6, 0x3c, 0x17, 0xc0, 0x94, 0x0d, 0x16, 0xec, 0xc6, 0x67, 0x1f, 0xd1, 0x7a, 0xec, + 0x12, 0x10, 0xf3, 0xf6, 0x1e, 0x59, 0x4b, 0x29, 0xab, 0x1c, 0xd4, 0xf6, 0xd1, 0x2e, + 0x8a, 0xd7, 0x42, 0xc1, + ], + [ + 0x8d, 0x59, 0x07, 0x06, 0x8c, 0x84, 0x68, 0x2a, 0x02, 0xb0, 0xe3, 0x2a, 0x88, 0x64, + 0x28, 0x1a, 0xbf, 0x58, 0x74, 0xb5, 0xf9, 0x75, 0x5f, 0xd8, 0xc8, 0x4d, 0xa6, 0x9b, + 0xd3, 0x40, 0x19, 0x05, + ], + [ + 0x45, 0xa7, 0xc6, 0x21, 0xb3, 0x14, 0x14, 0x95, 0x74, 0x2e, 0xa4, 0xdc, 0x1f, 0x11, + 0xe0, 0x77, 0xd5, 0xf3, 0x68, 0xe2, 0xc1, 0x9a, 0xb9, 0x83, 0xed, 0xd4, 0xf6, 0x77, + 0x83, 0x09, 0x8f, 0x54, + ], + [ + 0x94, 0x79, 0x72, 0xb1, 0x57, 0x2a, 0xb5, 0x2d, 0x81, 0x87, 0x9b, 0x32, 0xa8, 0xbf, + 0xa8, 0xb6, 0x69, 0xff, 0x82, 0x69, 0x37, 0x69, 0x56, 0x7f, 0x26, 0xa4, 0xf2, 0x6c, + 0x9a, 0xfd, 0x7e, 0x1a, + ], + [ + 0x6f, 0x7f, 0x6d, 0xf1, 0x1f, 0xbb, 0x57, 0x2b, 0x2f, 0x29, 0x9f, 0x3d, 0x4e, 0x00, + 0xd3, 0x75, 0x9b, 0xf2, 0x9f, 0x5d, 0xd4, 0xe5, 0x07, 0x9e, 0x46, 0xe0, 0x73, 0x3e, + 0x68, 0x52, 0xa4, 0x99, + ], + [ + 0xbb, 0x38, 0x7e, 0xc3, 0x7b, 0x3f, 0xe4, 0x9d, 0x05, 0xcc, 0x38, 0xa0, 0xe2, 0x3e, + 0xb0, 0x95, 0x23, 0x43, 0x92, 0x2b, 0x83, 0x77, 0x10, 0xe4, 0x33, 0x7d, 0xe9, 0x75, + 0xac, 0xdd, 0xe4, 0xcf, + ], + [ + 0x32, 0x36, 0x32, 0x5b, 0x9f, 0xa9, 0x47, 0xd3, 0xfb, 0xca, 0xc0, 0x4e, 0x3d, 0x4a, + 0xa6, 0xb8, 0x42, 0x16, 0x91, 0x43, 0xf2, 0x20, 0x04, 0x8d, 0x12, 0xb9, 0xb2, 0xa4, + 0xc8, 0x97, 0xff, 0x99, + ], + [ + 0x91, 0x11, 0xf3, 0xae, 0x36, 0x81, 0x03, 0xa5, 0x11, 0x74, 0xcd, 0x39, 0x06, 0x38, + 0x1b, 0xa6, 0x87, 0x06, 0x28, 0x8a, 0x8a, 0x12, 0xbd, 0x35, 0xf6, 0x3c, 0xfe, 0xa9, + 0x4a, 0x5b, 0x4e, 0x99, + ], + [ + 0x68, 0x58, 0x3d, 0x1b, 0x52, 0xcb, 0x10, 0x28, 0x13, 0x1c, 0x99, 0x3b, 0xba, 0x09, + 0x0b, 0x61, 0xc1, 0x51, 0x16, 0x03, 0xd9, 0x53, 0xba, 0x92, 0x6b, 0x45, 0x52, 0x8b, + 0x90, 0x3c, 0xe1, 0x64, + ], + [ + 0x5c, 0x26, 0xc4, 0xc9, 0x46, 0x08, 0x4d, 0x5d, 0xd2, 0xd7, 0x1e, 0xc7, 0xab, 0x95, + 0xe8, 0xa6, 0xc1, 0x03, 0xff, 0x1f, 0x48, 0x1f, 0x2a, 0x8f, 0xe6, 0xa7, 0x05, 0x3c, + 0xbb, 0xde, 0x9e, 0xbc, + ], + [ + 0xf2, 0x25, 0xf0, 0xee, 0xd0, 0xa8, 0x2c, 0x50, 0x1b, 0x02, 0xaf, 0xd4, 0x56, 0x7b, + 0x45, 0xea, 0x10, 0xf6, 0x8f, 0xb7, 0x2c, 0xe2, 0xe7, 0x88, 0xc6, 0x52, 0x91, 0x5d, + 0xfe, 0x18, 0x4a, 0x0e, + ], + [ + 0xb0, 0x38, 0xfe, 0x27, 0x98, 0xc4, 0x4d, 0xad, 0xfa, 0x07, 0x09, 0xa4, 0x40, 0xc8, + 0x5a, 0x85, 0x69, 0xb5, 0xbd, 0xf3, 0xac, 0xb2, 0x82, 0x2e, 0x55, 0x22, 0xb4, 0x19, + 0xf6, 0x30, 0xde, 0x82, + ], + [ + 0x5f, 0x98, 0x00, 0xfd, 0x1d, 0x88, 0xf0, 0x00, 0x39, 0x20, 0x70, 0x5c, 0x2b, 0x81, + 0xb3, 0x2d, 0x31, 0x7c, 0x1c, 0xa8, 0xcc, 0x21, 0x09, 0xb5, 0xfc, 0xd0, 0xae, 0x59, + 0xdb, 0x45, 0xbf, 0x03, + ], + [ + 0xf5, 0x92, 0xf4, 0x5f, 0xb9, 0x29, 0x1c, 0xc1, 0xee, 0xf1, 0x12, 0xf8, 0x0d, 0xc4, + 0x92, 0x3a, 0xc8, 0x45, 0x5b, 0x1e, 0x2b, 0xae, 0xfc, 0xb4, 0x35, 0x21, 0x15, 0xcf, + 0xc8, 0xc8, 0x3b, 0x19, + ], + [ + 0xc2, 0x2e, 0x11, 0x46, 0x01, 0x5f, 0x0c, 0xa2, 0x6a, 0xe1, 0x13, 0x6d, 0x30, 0xd4, + 0x86, 0xdf, 0xdd, 0xe0, 0x30, 0x8f, 0x3e, 0xaa, 0x1c, 0xd3, 0xf3, 0xda, 0x4c, 0x72, + 0xdb, 0xe3, 0xe3, 0x23, + ], + [ + 0xf9, 0x98, 0x5c, 0xcd, 0x6e, 0x19, 0xbf, 0xfa, 0xeb, 0x28, 0xf5, 0xf1, 0xab, 0xea, + 0x25, 0x8d, 0xd4, 0xa7, 0xb7, 0x11, 0x95, 0x4f, 0xed, 0x9b, 0x0d, 0xba, 0x08, 0x83, + 0xc4, 0x97, 0xad, 0x65, + ], + [ + 0x76, 0xb1, 0x15, 0x9a, 0x9a, 0x76, 0xac, 0x13, 0x7e, 0xfa, 0x0f, 0x3c, 0xaa, 0x70, + 0xb5, 0x9a, 0x60, 0x4d, 0xb9, 0x5b, 0x48, 0xd0, 0x38, 0x85, 0x1d, 0x4a, 0xcb, 0xa9, + 0xe0, 0x4d, 0x41, 0x10, + ], + [ + 0xbe, 0xf5, 0xbb, 0x3b, 0x80, 0x96, 0x52, 0x07, 0x8a, 0x39, 0xdc, 0xf4, 0xc4, 0x36, + 0x34, 0xd5, 0x2f, 0xb3, 0xf4, 0x81, 0xde, 0x92, 0x7f, 0x7b, 0xcc, 0x0c, 0xc6, 0x82, + 0x10, 0xd0, 0xdb, 0xa4, + ], + [ + 0x91, 0x8b, 0x9a, 0x12, 0x12, 0x04, 0x84, 0x7a, 0x08, 0x03, 0x88, 0x0d, 0x59, 0x0a, + 0xe1, 0x79, 0xd8, 0x21, 0x28, 0x96, 0xa4, 0x93, 0x3e, 0x66, 0xc3, 0xf0, 0x0d, 0x7f, + 0x63, 0xc7, 0xc0, 0x0b, + ], + [ + 0xa8, 0x0b, 0x46, 0xe0, 0x44, 0x86, 0xe1, 0xa4, 0xe8, 0x39, 0x61, 0x73, 0x69, 0x59, + 0xed, 0xf4, 0x63, 0xfd, 0x1a, 0x55, 0x9b, 0x12, 0xf3, 0xa1, 0xab, 0x4c, 0x89, 0x3b, + 0xb9, 0x31, 0x8b, 0xbf, + ], + [ + 0x9f, 0xc1, 0x32, 0x3f, 0xcf, 0x6b, 0x96, 0xd8, 0x26, 0xbe, 0x3e, 0x73, 0x30, 0xed, + 0x53, 0x10, 0xe7, 0x9d, 0x35, 0x72, 0x1e, 0x67, 0xe6, 0x77, 0x4f, 0x6b, 0xa8, 0xd3, + 0xdf, 0xd7, 0x13, 0x66, + ], + [ + 0x6a, 0x6b, 0x9c, 0xfa, 0xcd, 0x49, 0xb5, 0xd4, 0x17, 0x9f, 0xaa, 0xe0, 0x25, 0x9a, + 0xf4, 0xa4, 0x46, 0x39, 0x56, 0xff, 0x6f, 0xdf, 0x5f, 0x6f, 0xa5, 0xa2, 0x52, 0x98, + 0xe4, 0x1b, 0x64, 0x05, + ], + [ + 0x15, 0xd2, 0x00, 0x5b, 0x94, 0xa4, 0xa0, 0xa7, 0x93, 0xce, 0x66, 0x89, 0xc3, 0xa6, + 0xa6, 0x5e, 0x5c, 0xa1, 0x51, 0x4e, 0x3b, 0xf2, 0xbd, 0x9d, 0xc5, 0x27, 0x3c, 0x34, + 0xaa, 0xc6, 0x7b, 0xfc, + ], + [ + 0x85, 0xeb, 0xa0, 0x53, 0xce, 0xdd, 0xfc, 0x1e, 0x2d, 0x16, 0x09, 0xb0, 0x3d, 0xd1, + 0x1f, 0x3c, 0xf0, 0x71, 0x2d, 0xa2, 0xbd, 0x1e, 0xb6, 0x1a, 0x27, 0x0c, 0xe1, 0x34, + 0x42, 0x49, 0x8c, 0x41, + ], + [ + 0xd1, 0xaf, 0xae, 0x46, 0x0e, 0xb6, 0xc3, 0x7e, 0xb7, 0x83, 0x45, 0xab, 0xc5, 0x2e, + 0xc4, 0x3b, 0x16, 0x52, 0x84, 0x0f, 0xbf, 0x4f, 0x79, 0x0c, 0x2b, 0xa7, 0x98, 0xc5, + 0x4d, 0x0b, 0x17, 0xff, + ], + [ + 0x0e, 0xb6, 0x4e, 0xf4, 0x49, 0x54, 0xa9, 0x5d, 0x51, 0xf1, 0xf4, 0x5e, 0xf7, 0x56, + 0xe1, 0x84, 0x7b, 0xaa, 0x25, 0x3b, 0xa4, 0xa0, 0xe4, 0xf6, 0x5f, 0xc5, 0x55, 0x12, + 0x57, 0x1f, 0x30, 0x27, + ], + [ + 0xc6, 0x43, 0x9d, 0x8c, 0x49, 0x8c, 0x45, 0xb9, 0x05, 0x14, 0xe3, 0x07, 0xdf, 0x1f, + 0xa9, 0x9f, 0x41, 0xf2, 0xd0, 0xe1, 0xde, 0x90, 0xa8, 0x52, 0xa0, 0x55, 0x40, 0xa1, + 0x94, 0xf7, 0xad, 0xd3, + ], + [ + 0x8f, 0x8a, 0xc3, 0x9c, 0x0d, 0xbc, 0xf7, 0xd6, 0x65, 0xda, 0x26, 0x0a, 0x82, 0x6d, + 0x88, 0xe0, 0x88, 0xd8, 0xba, 0x73, 0x81, 0xce, 0x5d, 0x1e, 0xa6, 0xdd, 0x88, 0x94, + 0x56, 0x89, 0x17, 0xa6, + ], + [ + 0x97, 0xe1, 0xbf, 0xc8, 0x8f, 0x16, 0x99, 0xce, 0x00, 0x44, 0x9c, 0x5a, 0x5f, 0x69, + 0xb0, 0xfd, 0xf6, 0xc9, 0x94, 0x10, 0x73, 0x53, 0x10, 0xf2, 0xb6, 0x22, 0x2a, 0x6f, + 0x25, 0x67, 0xbe, 0xc2, + ], + [ + 0x80, 0x02, 0x68, 0x53, 0x7b, 0x24, 0xe0, 0x19, 0x7b, 0x10, 0x9c, 0x47, 0x77, 0xab, + 0x0f, 0x4c, 0x09, 0x28, 0x5d, 0xa5, 0x54, 0xd1, 0x25, 0x65, 0x97, 0x4d, 0x45, 0xd2, + 0xa0, 0x71, 0xec, 0x2d, + ], + [ + 0x55, 0xbf, 0x83, 0x5b, 0xa6, 0xba, 0xb1, 0x43, 0x1b, 0x18, 0x23, 0xd9, 0x64, 0x93, + 0x66, 0x0a, 0x92, 0x06, 0xfe, 0xa1, 0xec, 0xb5, 0xce, 0x95, 0x9f, 0x9f, 0x7d, 0xc4, + 0x92, 0x49, 0x84, 0xfd, + ], + [ + 0xdb, 0x6a, 0xcd, 0x67, 0x7a, 0x11, 0x65, 0x38, 0x56, 0x65, 0x89, 0xfd, 0x1b, 0x5f, + 0x86, 0x19, 0x1f, 0xb9, 0x38, 0x94, 0x98, 0x5d, 0x6b, 0x30, 0x82, 0xce, 0xae, 0x3e, + 0x53, 0x4c, 0xbb, 0x95, + ], + [ + 0xd1, 0x83, 0xc3, 0xd4, 0xb0, 0x65, 0xf0, 0x77, 0xad, 0x50, 0x65, 0x24, 0xec, 0x07, + 0x48, 0x1d, 0xe4, 0x9d, 0x2c, 0x6e, 0x94, 0xc9, 0x1d, 0x58, 0x43, 0x7a, 0x93, 0xaf, + 0xff, 0x6e, 0xd2, 0xbd, + ], + [ + 0x07, 0x17, 0x08, 0x3d, 0xcb, 0xb2, 0x4e, 0x38, 0x96, 0x1e, 0x68, 0x84, 0x39, 0xba, + 0x3a, 0x40, 0xc2, 0x6a, 0x72, 0xcf, 0xf8, 0x2d, 0x56, 0x18, 0x44, 0x42, 0x9d, 0xa8, + 0x08, 0x67, 0xfc, 0x10, + ], + [ + 0xed, 0x8d, 0xdb, 0xbf, 0x6e, 0x1a, 0xd0, 0x0a, 0x34, 0x61, 0xac, 0xab, 0x32, 0x3f, + 0x2a, 0x70, 0x51, 0xf7, 0x39, 0xb5, 0x63, 0xbb, 0xac, 0x83, 0xba, 0x56, 0x73, 0xc8, + 0x4a, 0x60, 0x09, 0x24, + ], + [ + 0x46, 0x63, 0x1d, 0x73, 0x51, 0xf6, 0xe2, 0x57, 0xa5, 0x6a, 0xbb, 0xb5, 0x88, 0x76, + 0x3c, 0xeb, 0x89, 0xbe, 0xc5, 0x1b, 0x95, 0x9c, 0xc5, 0x30, 0x02, 0x21, 0xad, 0x35, + 0x2f, 0xe5, 0xae, 0x51, + ], + [ + 0x69, 0xcb, 0x42, 0x2a, 0xd8, 0x9b, 0xcc, 0xcc, 0x51, 0x3b, 0x58, 0xe9, 0x21, 0x61, + 0xee, 0x81, 0xb1, 0x0d, 0x0a, 0xc4, 0xb1, 0xc7, 0xa0, 0x4f, 0x19, 0x76, 0x57, 0x5e, + 0xd8, 0x53, 0x0a, 0xee, + ], + [ + 0x9b, 0x39, 0x2e, 0x2e, 0xd5, 0x7f, 0x0f, 0x82, 0x04, 0xbb, 0x7c, 0xf6, 0x88, 0x39, + 0xd9, 0xd1, 0x91, 0xc8, 0x06, 0x71, 0x36, 0x1d, 0x96, 0xc2, 0x9d, 0xdf, 0xc2, 0x6c, + 0x18, 0xe1, 0x1b, 0x79, + ], + [ + 0xe7, 0xe0, 0x6b, 0xfa, 0x7c, 0xcd, 0x85, 0x9e, 0x10, 0xb8, 0xac, 0x33, 0xae, 0x61, + 0x5a, 0x29, 0x98, 0x2a, 0x65, 0xc1, 0x31, 0x33, 0xe2, 0x7e, 0xd2, 0x97, 0x5e, 0x9a, + 0x67, 0xe2, 0x16, 0x52, + ], + [ + 0x5b, 0x01, 0x92, 0x69, 0x45, 0xf7, 0xd6, 0x35, 0xf7, 0xca, 0x28, 0xa6, 0xd7, 0x1c, + 0xd8, 0x8d, 0xac, 0x15, 0x6e, 0x00, 0xeb, 0xaa, 0xe4, 0x9c, 0x2f, 0xe5, 0x03, 0x95, + 0x79, 0xf1, 0x55, 0xf1, + ], + [ + 0x9d, 0x8f, 0x23, 0x51, 0xb6, 0xae, 0x39, 0xe9, 0xae, 0x13, 0x5c, 0xcb, 0x86, 0xb5, + 0xdd, 0xc7, 0x77, 0xa8, 0x92, 0x23, 0x59, 0xb8, 0x1d, 0xea, 0x58, 0x64, 0x34, 0x0c, + 0x13, 0xbe, 0x95, 0xba, + ], + [ + 0xf2, 0x2f, 0x2d, 0x26, 0xa5, 0x78, 0xbe, 0x8d, 0xd2, 0xb1, 0xf9, 0xf4, 0x19, 0x01, + 0x29, 0xd6, 0x87, 0xc6, 0xcf, 0x57, 0x46, 0xb1, 0x3c, 0x63, 0x15, 0x1c, 0x65, 0xee, + 0x83, 0x7e, 0x16, 0x56, + ], + [ + 0xd5, 0x81, 0x12, 0x6b, 0x6b, 0xdc, 0xc4, 0xc6, 0x84, 0x25, 0x04, 0xdd, 0xc9, 0x8c, + 0xb1, 0x50, 0x56, 0x19, 0x6e, 0xbf, 0x69, 0x03, 0x12, 0x38, 0x93, 0x20, 0x98, 0xb7, + 0xf2, 0x5e, 0x4c, 0xd6, + ], + [ + 0x12, 0x4b, 0xbc, 0x2e, 0x30, 0x71, 0x18, 0xa5, 0x82, 0xd5, 0xae, 0x52, 0xfa, 0xfd, + 0xbf, 0xcb, 0x75, 0x2b, 0x94, 0x8e, 0x9e, 0xc3, 0xaf, 0x56, 0xa1, 0x41, 0xdc, 0x57, + 0xaf, 0x40, 0x50, 0xdb, + ], + [ + 0xaa, 0xaa, 0x52, 0xad, 0xa4, 0xf5, 0xde, 0xd6, 0x92, 0x9e, 0xbf, 0xb9, 0x12, 0xfa, + 0x70, 0x12, 0xa8, 0x19, 0x6e, 0x0c, 0xa4, 0xdf, 0x7c, 0x99, 0x88, 0x94, 0x2c, 0x3a, + 0x98, 0x3d, 0xa6, 0x4e, + ], + ]; + + #[test] + fn golden() { + let data = generate(1024 * 64 * 64); + let chunks = super::Chunker::from(&data).map(blake3::hash); + for (actual, &expected) in chunks.zip(GOLDEN.iter()) { + assert_eq!(actual, blake3::Hash::from(expected)); + } + } +} diff --git a/ripple/fossil/src/lib.rs b/ripple/fossil/src/lib.rs index 3b2b3f8..4cea16a 100644 --- a/ripple/fossil/src/lib.rs +++ b/ripple/fossil/src/lib.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: OSL-3.0 use { + crate::chunker::Chunker, anyhow::{Context, Result}, byteorder::{BigEndian, ByteOrder}, prost::Message, @@ -20,13 +21,15 @@ pub mod store { include!(concat!(env!("OUT_DIR"), "/fossil.store.rs")); } -const CHUNK_BYTES: usize = 0x400; +mod chunker; + const DIGEST_BYTES: usize = blake3::OUT_LEN; pub struct Store { meta: sled::Tree, blobs: sled::Tree, - chunks: RefCell, + chunks: sled::Tree, + chunks_file: RefCell, chunks_tail: Cell, } @@ -37,8 +40,9 @@ impl Store { let db = sled::open(path)?; let meta = (&*db).clone(); let blobs = db.open_tree("blobs")?; + let chunks = db.open_tree("chunks")?; - let chunks = fs::OpenOptions::new() + let chunks_file = fs::OpenOptions::new() .read(true) .append(true) .create(true) @@ -49,12 +53,13 @@ impl Store { .map(|v| BigEndian::read_u64(&v)) .unwrap_or_default(); - chunks.set_len(chunks_tail)?; + chunks_file.set_len(chunks_tail)?; Ok(Store { blobs, meta, - chunks: RefCell::new(chunks), + chunks, + chunks_file: RefCell::new(chunks_file), chunks_tail: Cell::new(chunks_tail), }) } @@ -124,34 +129,66 @@ impl Store { } fn write_blob_inner(&self, ident: &Digest, outboard: Vec, data: &[u8]) { - let mut chunks_file = self.chunks.borrow_mut(); - let offset = self.chunks_tail.get(); + let mut chunks_file = self.chunks_file.borrow_mut(); + let mut offset = self.chunks_tail.get(); + let mut batch = sled::Batch::default(); - chunks_file.write_all(data).unwrap(); - let chunks_tail = offset + data.len() as u64; + let chunks = Chunker::from(data) + .map(|chunk_data| { + self.write_chunk(&mut chunks_file, &mut offset, &mut batch, chunk_data) + }) + .collect::>(); let blob_buf = store::Blob { - offset, - length: data.len() as u64, + chunks, bao_inline: outboard, } .encode_to_vec(); let chunks_tail_buf = { let mut buf = [0u8; 8]; - BigEndian::write_u64(&mut buf, chunks_tail); + BigEndian::write_u64(&mut buf, offset); buf }; // TODO(edef): figure out fsync for durability - (&self.blobs, &self.meta) - .transaction(|(blobs, meta)| { + (&self.blobs, &self.chunks, &self.meta) + .transaction(|(blobs, chunks, meta)| { + chunks.apply_batch(&batch)?; blobs.insert(&*ident.as_bytes(), &*blob_buf)?; meta.insert("chunks_tail", &chunks_tail_buf)?; Ok::<_, ConflictableTransactionError>(()) }) .unwrap(); - self.chunks_tail.set(chunks_tail); + self.chunks_tail.set(offset); + } + + fn write_chunk( + &self, + chunks_file: &mut fs::File, + offset: &mut u64, + batch: &mut sled::Batch, + data: &[u8], + ) -> store::Chunk { + let ident = blake3::hash(data); + if let Some(chunk) = self.get_chunk(&ident) { + return chunk; + } + + chunks_file.write_all(data).unwrap(); + let chunk = store::Chunk { + offset: *offset, + length: data.len() as u32, + }; + *offset += data.len() as u64; + + batch.insert(ident.as_bytes(), chunk.encode_to_vec()); + chunk + } + + fn get_chunk(&self, ident: &Digest) -> Option { + let buf = self.chunks.get(&*ident.as_bytes()).unwrap()?; + Some(store::Chunk::decode(&*buf).unwrap()) } pub fn read_blob(&self, ident: Digest) -> Vec { @@ -168,15 +205,31 @@ impl Store { .expect("blob not found"); let store::Blob { - offset, - length, + mut chunks, bao_inline, } = store::Blob::decode(&*buf).unwrap(); + let mut blob_length: u64 = 0; + let chunks = chunks + .drain(..) + .map(|chunk| { + let chunk_offset = blob_length; + blob_length += chunk.length as u64; + ( + chunk_offset, + Slice { + offset: chunk.offset, + length: chunk.length, + }, + ) + }) + .collect(); + Blob(bao::decode::Decoder::new_outboard( RawBlob { store: self, - slice: Slice { offset, length }, + chunks, + length: blob_length, position: 0, }, io::Cursor::new(bao_inline), @@ -211,29 +264,38 @@ impl io::Seek for Blob<'_> { #[derive(Debug)] struct Slice { offset: u64, - length: u64, + length: u32, } struct RawBlob<'a> { store: &'a Store, - slice: Slice, + chunks: BTreeMap, + length: u64, position: u64, } impl io::Read for RawBlob<'_> { fn read(&mut self, dst: &mut [u8]) -> io::Result { + let (&chunk_offset, chunk_slice) = + if let Some(entry) = self.chunks.range(..=self.position).next_back() { + entry + } else { + // empty blob + return Ok(0); + }; + let prev_pos = self.position; let next_pos = Ord::min( self.position.saturating_add(dst.len() as u64), - self.slice.length, + chunk_offset + chunk_slice.length as u64, ); let len = (next_pos - prev_pos) as usize; let dst = &mut dst[..len]; - let offset = self.slice.offset + prev_pos; + let offset = prev_pos - chunk_offset + chunk_slice.offset; self.store - .chunks + .chunks_file .borrow() .read_exact_at(dst, offset) .context("Couldn't read blob data") @@ -257,12 +319,12 @@ impl io::Seek for RawBlob<'_> { fn seek(&mut self, pos: io::SeekFrom) -> io::Result { let pos = match pos { io::SeekFrom::Start(n) => Some(n), - io::SeekFrom::End(n) => checked_add_signed(self.slice.length, n), + io::SeekFrom::End(n) => checked_add_signed(self.length, n), io::SeekFrom::Current(n) => checked_add_signed(self.position, n), }; match pos { - Some(n) if n <= self.slice.length => { + Some(n) if n <= self.length => { self.position = n; Ok(self.position) } diff --git a/ripple/fossil/src/store.proto b/ripple/fossil/src/store.proto index cbeb16d..e0e3e59 100644 --- a/ripple/fossil/src/store.proto +++ b/ripple/fossil/src/store.proto @@ -30,7 +30,11 @@ message LinkNode { } message Blob { - uint64 offset = 1; - uint64 length = 2; bytes bao_inline = 3; + repeated Chunk chunks = 4; +} + +message Chunk { + uint64 offset = 1; + uint32 length = 2; } -- cgit 1.4.1