// SPDX-FileCopyrightText: edef // SPDX-License-Identifier: OSL-3.0 use std::mem; mod buz; pub const MIN_CHUNK_SIZE: usize = AVG_CHUNK_SIZE / 4; pub const AVG_CHUNK_SIZE: usize = 64 * 1024; pub const MAX_CHUNK_SIZE: usize = AVG_CHUNK_SIZE * 4; const WINDOW_SIZE: usize = 48; const DISCRIMINATOR: u32 = { let avg = AVG_CHUNK_SIZE as f64; (avg / (-1.42888852e-7 * avg + 1.33237515)) as u32 }; pub struct Chunker<'a> { buffer: &'a [u8], } impl<'a> Chunker<'a> { pub fn from(buffer: &'a [u8]) -> Chunker<'a> { Chunker { buffer } } /// SAFETY: `idx` must be an in-bounds index for `self.buffer` unsafe fn cut(&mut self, idx: usize) -> &'a [u8] { let ret; (ret, self.buffer) = ( self.buffer.get_unchecked(..idx), self.buffer.get_unchecked(idx..), ); ret } } impl<'a> Iterator for Chunker<'a> { type Item = &'a [u8]; fn next(&mut self) -> Option<&'a [u8]> { if self.buffer.is_empty() { return None; } let max_len = MAX_CHUNK_SIZE.min(self.buffer.len()); let bytes = match self.buffer.get(MIN_CHUNK_SIZE..max_len) { None | Some(&[]) => { return Some(mem::take(&mut self.buffer)); } Some(bytes) => bytes, }; // SAFETY: `self.buffer.len() > MIN_CHUNK_SIZE`, so this is in bounds let mut hasher = unsafe { buz::Rolling::::from_slice_unchecked( self.buffer.get_unchecked(..MIN_CHUNK_SIZE), ) }; for byte in bytes { let buz::Hash(x) = hasher.sum(); if x % DISCRIMINATOR == DISCRIMINATOR.wrapping_sub(1) { // SAFETY: `byte` is in bounds of `self.buffer`, so // computing `idx` is safe, and `idx` is in bounds return Some(unsafe { let origin = self.buffer.as_ptr(); let ptr = byte as *const u8; let idx = ptr.offset_from(origin) as usize; self.cut(idx) }); } hasher.push(*byte); } // SAFETY: `max_len` is clamped to `self.buffer.len()` Some(unsafe { self.cut(max_len) }) } fn size_hint(&self) -> (usize, Option) { let min = (self.buffer.len() + MAX_CHUNK_SIZE - 1) / MAX_CHUNK_SIZE; let max = (self.buffer.len() + MIN_CHUNK_SIZE - 1) / MIN_CHUNK_SIZE; (min, Some(max)) } } #[cfg(test)] mod test { use { super::{Chunker, MAX_CHUNK_SIZE, MIN_CHUNK_SIZE, WINDOW_SIZE}, hex_literal::hex, std::io::Read, }; fn generate(length: usize) -> Vec { let mut h = blake3::Hasher::new(); h.update(b"test vector"); let mut buf = vec![0; length]; h.finalize_xof().read_exact(&mut buf).unwrap(); buf } const GOLDEN: &[[u8; 32]] = &[ hex!("6677d1f778bf7b42911516dadeae691759e973e85ba2a0926f98768cbbb474c7"), hex!("add6c06e0eefb53b1f8bebac21a8f70cc9ebc82022f0fe0a72151521327de50a"), hex!("bcd068570a6670f33e81c9a87dacd9de80b98a198d54cdd75e76ca0fb00d5311"), hex!("91166f72923d00bb46827ac535da0c6e20bc1ab0b840580bf4c4e3beb781ebd8"), hex!("5159eadfd1413811a991b3f4918c7b61384a04591fe9c1a60119325bfa3b2492"), hex!("7c640ca5e83a25ebafe31e5b58d48bb80aa5505d188a7938e371616d244b0d07"), hex!("6c4bc617f2af9ba681d41bf177cecf0bb7bf3a54c7265f8c435a4061f55c4775"), hex!("9874d14530e9470ccf7652658596f6906f1d8424e14f085503f8e88f10b4ecc8"), hex!("d287273da57d31755682b7134cea68bfc41c0bb9ce64b190fbde2ad8cec01b2c"), hex!("68bdfb6c99f53e82f69e6aa60156003fab8e9cf798931a70ef2b195b1cf01e38"), hex!("ae9fa252897bdacf650993d1e09673b6fba60f5802ec7922f8a7eae816606d1b"), hex!("8b9e83c245759c3af40f4bb4bd82a16266ded32fdaf6fc12c95740e218908967"), hex!("6b0d3f9bec24b2f7908c752857c96f9ea7789431afa4a4619c6ea7f9646048cd"), hex!("bd041a0c3ee23e3ae66c5bcd8d8a1efe4dd5ac14d5e659400ea0cf7532c6b67a"), hex!("667394962381692634944d20f36d673db6b99bc7a77c22c356d158e54e550a33"), hex!("26ddfc31acfcd2579d5265dcf3e08ade13045feec4f435ceb6cfd17fc0c54b7f"), hex!("e2d56ab7db711d4db2b4dbee944cac926edcf3fd9244f179662a1e70d62eb46b"), hex!("865d5f60753dff7bac50a311827ab8fedebcb10afdeaa808aa28fc9572047213"), hex!("a62dcc3a2e8ee9f287c7cca8c8a31833c943314aabfeafc11dc6223df35445e2"), hex!("751d1f3d948e7d27227f1a0ecc61efbd411653b95b7805068060a3174f011051"), hex!("4e0f4d716510a581fffe626b1594174c6d1285be74cb5604c7388fb76eaafd12"), hex!("e4b8640392e5718931fcc7e9eecd92fb92b01d3234a191d47f5b8b37b8602d6a"), hex!("c63c17c0940d16ecc6671fd17aec1210f3f61e594b29ab1cd4f6d12e8ad742c1"), hex!("8d5907068c84682a02b0e32a8864281abf5874b5f9755fd8c84da69bd3401905"), hex!("45a7c621b3141495742ea4dc1f11e077d5f368e2c19ab983edd4f67783098f54"), hex!("947972b1572ab52d81879b32a8bfa8b669ff82693769567f26a4f26c9afd7e1a"), hex!("6f7f6df11fbb572b2f299f3d4e00d3759bf29f5dd4e5079e46e0733e6852a499"), hex!("bb387ec37b3fe49d05cc38a0e23eb0952343922b837710e4337de975acdde4cf"), hex!("3236325b9fa947d3fbcac04e3d4aa6b842169143f220048d12b9b2a4c897ff99"), hex!("9111f3ae368103a51174cd3906381ba68706288a8a12bd35f63cfea94a5b4e99"), hex!("68583d1b52cb1028131c993bba090b61c1511603d953ba926b45528b903ce164"), hex!("5c26c4c946084d5dd2d71ec7ab95e8a6c103ff1f481f2a8fe6a7053cbbde9ebc"), hex!("f225f0eed0a82c501b02afd4567b45ea10f68fb72ce2e788c652915dfe184a0e"), hex!("b038fe2798c44dadfa0709a440c85a8569b5bdf3acb2822e5522b419f630de82"), hex!("5f9800fd1d88f0003920705c2b81b32d317c1ca8cc2109b5fcd0ae59db45bf03"), hex!("f592f45fb9291cc1eef112f80dc4923ac8455b1e2baefcb4352115cfc8c83b19"), hex!("c22e1146015f0ca26ae1136d30d486dfdde0308f3eaa1cd3f3da4c72dbe3e323"), hex!("f9985ccd6e19bffaeb28f5f1abea258dd4a7b711954fed9b0dba0883c497ad65"), hex!("76b1159a9a76ac137efa0f3caa70b59a604db95b48d038851d4acba9e04d4110"), hex!("bef5bb3b809652078a39dcf4c43634d52fb3f481de927f7bcc0cc68210d0dba4"), hex!("918b9a121204847a0803880d590ae179d8212896a4933e66c3f00d7f63c7c00b"), hex!("a80b46e04486e1a4e83961736959edf463fd1a559b12f3a1ab4c893bb9318bbf"), hex!("9fc1323fcf6b96d826be3e7330ed5310e79d35721e67e6774f6ba8d3dfd71366"), hex!("6a6b9cfacd49b5d4179faae0259af4a4463956ff6fdf5f6fa5a25298e41b6405"), hex!("15d2005b94a4a0a793ce6689c3a6a65e5ca1514e3bf2bd9dc5273c34aac67bfc"), hex!("85eba053ceddfc1e2d1609b03dd11f3cf0712da2bd1eb61a270ce13442498c41"), hex!("d1afae460eb6c37eb78345abc52ec43b1652840fbf4f790c2ba798c54d0b17ff"), hex!("0eb64ef44954a95d51f1f45ef756e1847baa253ba4a0e4f65fc55512571f3027"), hex!("c6439d8c498c45b90514e307df1fa99f41f2d0e1de90a852a05540a194f7add3"), hex!("8f8ac39c0dbcf7d665da260a826d88e088d8ba7381ce5d1ea6dd8894568917a6"), hex!("97e1bfc88f1699ce00449c5a5f69b0fdf6c99410735310f2b6222a6f2567bec2"), hex!("800268537b24e0197b109c4777ab0f4c09285da554d12565974d45d2a071ec2d"), hex!("55bf835ba6bab1431b1823d96493660a9206fea1ecb5ce959f9f7dc4924984fd"), hex!("db6acd677a116538566589fd1b5f86191fb93894985d6b3082ceae3e534cbb95"), hex!("d183c3d4b065f077ad506524ec07481de49d2c6e94c91d58437a93afff6ed2bd"), hex!("0717083dcbb24e38961e688439ba3a40c26a72cff82d561844429da80867fc10"), hex!("ed8ddbbf6e1ad00a3461acab323f2a7051f739b563bbac83ba5673c84a600924"), hex!("46631d7351f6e257a56abbb588763ceb89bec51b959cc5300221ad352fe5ae51"), hex!("69cb422ad89bcccc513b58e92161ee81b10d0ac4b1c7a04f1976575ed8530aee"), hex!("9b392e2ed57f0f8204bb7cf68839d9d191c80671361d96c29ddfc26c18e11b79"), hex!("e7e06bfa7ccd859e10b8ac33ae615a29982a65c13133e27ed2975e9a67e21652"), hex!("5b01926945f7d635f7ca28a6d71cd88dac156e00ebaae49c2fe5039579f155f1"), hex!("9d8f2351b6ae39e9ae135ccb86b5ddc777a8922359b81dea5864340c13be95ba"), hex!("f22f2d26a578be8dd2b1f9f4190129d687c6cf5746b13c63151c65ee837e1656"), hex!("d581126b6bdcc4c6842504ddc98cb15056196ebf69031238932098b7f25e4cd6"), hex!("124bbc2e307118a582d5ae52fafdbfcb752b948e9ec3af56a141dc57af4050db"), hex!("aaaa52ada4f5ded6929ebfb912fa7012a8196e0ca4df7c9988942c3a983da64e"), ]; #[test] fn golden() { let data = generate(1024 * 64 * 64); let chunks = Chunker::from(&data).map(blake3::hash); for (actual, &expected) in chunks.zip(GOLDEN.iter()) { assert_eq!(actual, blake3::Hash::from(expected)); } } #[test] fn max_chunk() { // all-zero byte sequences don't contain any split points, // so the chunker is forced to limit the chunk size assert_eq!( Chunker::from(&[0u8; MAX_CHUNK_SIZE + 1]) .next() .unwrap() .len(), MAX_CHUNK_SIZE ); } #[test] fn min_chunk() { let data = generate(1024 * 32); // a "tail" is a window-sized byte sequence that terminates a chunk // we extract one from our test vectors, since every chunk smaller // than MAX_CHUNK_SIZE ends in a tail let mut tail = [0; WINDOW_SIZE]; tail.copy_from_slice({ let chunk = Chunker::from(&data).next().unwrap(); chunk.rchunks_exact(WINDOW_SIZE).next().unwrap() }); let mut data = vec![0; MIN_CHUNK_SIZE - WINDOW_SIZE]; data.extend(tail); data.push(0); assert_eq!(Chunker::from(&data).next().unwrap().len(), MIN_CHUNK_SIZE); } #[test] fn short_chunk() { assert_eq!( Chunker::from(&[0u8; MIN_CHUNK_SIZE / 2]) .next() .unwrap() .len(), MIN_CHUNK_SIZE / 2 ); } #[test] fn size_hint() { assert_eq!(Chunker::from(&[]).size_hint(), (0, Some(0))); assert_eq!(Chunker::from(&[0]).size_hint(), (1, Some(1))); assert_eq!( Chunker::from(&[0; MAX_CHUNK_SIZE]).size_hint(), (1, Some(MAX_CHUNK_SIZE / MIN_CHUNK_SIZE)) ); assert_eq!( Chunker::from(&[0; MAX_CHUNK_SIZE + 1]).size_hint(), (2, Some(MAX_CHUNK_SIZE / MIN_CHUNK_SIZE + 1)) ); } }