More efficient serialization for bitmap segments (#3492)

* More efficient serialization for bitmap segments

* Rename a const

* Correctly count number of chunks in a segment

* Enum for BitmapBlock (de)ser mode

* Add more segments in test

* Fix duplicate function
This commit is contained in:
jaspervdm 2020-11-24 15:19:07 +01:00 committed by GitHub
parent b3938de8b3
commit 055b684416
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 442 additions and 1 deletion

View file

@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::min;
use std::convert::TryFrom;
use std::time::Instant;
@ -19,9 +20,11 @@ use bit_vec::BitVec;
use croaring::Bitmap;
use crate::core::core::hash::{DefaultHashable, Hash};
use crate::core::core::pmmr::segment::{Segment, SegmentIdentifier, SegmentProof};
use crate::core::core::pmmr::{self, ReadablePMMR, ReadonlyPMMR, VecBackend, PMMR};
use crate::core::ser::{self, PMMRable, Readable, Reader, Writeable, Writer};
use crate::error::{Error, ErrorKind};
use enum_primitive::FromPrimitive;
/// The "bitmap accumulator" allows us to commit to a specific bitmap by splitting it into
/// fragments and inserting these fragments into an MMR to produce an overall root hash.
@ -187,7 +190,7 @@ impl BitmapAccumulator {
/// A bitmap "chunk" representing 1024 contiguous bits of the overall bitmap.
/// The first 1024 bits belong in one chunk. The next 1024 bits in the next chunk, etc.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BitmapChunk(BitVec);
impl BitmapChunk {
@ -242,3 +245,304 @@ impl Readable for BitmapChunk {
Ok(BitmapChunk::new())
}
}
/// A `Segment<BitmapChunk>` in a form that serializes compactly: the leaf
/// chunks are regrouped into `BitmapBlock`s, each of which picks the most
/// space-efficient encoding for its occupancy.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BitmapSegment {
	// Position of this segment within the overall bitmap MMR
	identifier: SegmentIdentifier,
	// The leaf data, regrouped into blocks of up to 2^16 bits each
	blocks: Vec<BitmapBlock>,
	// Merkle proof for this segment
	proof: SegmentProof,
}
impl Writeable for BitmapSegment {
	/// Serialize as: identifier, block count (u16), the blocks themselves,
	/// and finally the segment proof.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		Writeable::write(&self.identifier, writer)?;
		writer.write_u16(self.blocks.len() as u16)?;
		self.blocks
			.iter()
			.try_for_each(|block| Writeable::write(block, writer))?;
		Writeable::write(&self.proof, writer)
	}
}
impl Readable for BitmapSegment {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
let identifier: SegmentIdentifier = Readable::read(reader)?;
let n_blocks = reader.read_u16()? as usize;
let mut blocks = Vec::<BitmapBlock>::with_capacity(n_blocks);
for _ in 0..n_blocks {
blocks.push(Readable::read(reader)?);
}
let proof = Readable::read(reader)?;
Ok(Self {
identifier,
blocks,
proof,
})
}
}
// TODO: this can be sped up with some `unsafe` code
impl From<Segment<BitmapChunk>> for BitmapSegment {
fn from(segment: Segment<BitmapChunk>) -> Self {
let (identifier, _, _, _, leaf_data, proof) = segment.parts();
let mut chunks_left = leaf_data.len();
let mut blocks =
Vec::with_capacity((chunks_left + BitmapBlock::NCHUNKS - 1) / BitmapBlock::NCHUNKS);
while chunks_left > 0 {
let n_chunks = min(BitmapBlock::NCHUNKS, chunks_left);
chunks_left = chunks_left.saturating_sub(n_chunks);
blocks.push(BitmapBlock::new(n_chunks));
}
for (chunk_idx, chunk) in leaf_data.into_iter().enumerate() {
assert_eq!(chunk.0.len(), BitmapChunk::LEN_BITS);
let block = &mut blocks
.get_mut(chunk_idx / BitmapBlock::NCHUNKS)
.unwrap()
.inner;
let offset = (chunk_idx % BitmapBlock::NCHUNKS) * BitmapChunk::LEN_BITS;
for (i, _) in chunk.0.iter().enumerate().filter(|&(_, v)| v) {
block.set(offset + i, true);
}
}
Self {
identifier,
blocks,
proof,
}
}
}
// TODO: this can be sped up with some `unsafe` code
impl From<BitmapSegment> for Segment<BitmapChunk> {
	/// Expand the blocks back into 1024-bit chunks and rebuild the segment,
	/// recomputing the pmmr leaf position of every chunk.
	fn from(segment: BitmapSegment) -> Self {
		let BitmapSegment {
			identifier,
			blocks,
			proof,
		} = segment;

		// Count the number of chunks taking into account that the final block might be smaller
		let n_chunks = blocks.len().saturating_sub(1) * BitmapBlock::NCHUNKS
			+ blocks.last().map(|b| b.n_chunks()).unwrap_or(0);
		let mut leaf_pos = Vec::with_capacity(n_chunks);
		let mut chunks = Vec::with_capacity(n_chunks);
		// Leaf insertion index of the first chunk in this segment (1-based)
		let offset = (1 << identifier.height) * identifier.idx + 1;
		for i in 0..(n_chunks as u64) {
			leaf_pos.push(pmmr::insertion_to_pmmr_index(offset + i));
			chunks.push(BitmapChunk::new());
		}

		for (block_idx, block) in blocks.into_iter().enumerate() {
			// The final block may hold fewer than NBITS bits (it is created
			// with `BitmapBlock::new(n_chunks < NCHUNKS)` above), so only
			// require a whole number of chunks within the maximum size.
			// A strict `assert_eq!` against NBITS would panic on any
			// segment whose last block is partial.
			let n_bits = block.inner.len();
			assert!(n_bits <= BitmapBlock::NBITS as usize);
			assert_eq!(n_bits % BitmapChunk::LEN_BITS, 0);
			let offset = block_idx * BitmapBlock::NCHUNKS;
			for (i, _) in block.inner.iter().enumerate().filter(|&(_, v)| v) {
				chunks
					.get_mut(offset + i / BitmapChunk::LEN_BITS)
					.unwrap()
					.0
					.set(i % BitmapChunk::LEN_BITS, true);
			}
		}

		Segment::from_parts(identifier, Vec::new(), Vec::new(), leaf_pos, chunks, proof)
	}
}
/// A block of 2^16 bits that provides an efficient (de)serialization
/// depending on the bitmap occupancy.
#[derive(Clone, Debug, PartialEq, Eq)]
struct BitmapBlock {
	// The bits themselves; always a whole number of 1024-bit chunks,
	// and the final block of a segment may hold fewer than 2^16 bits.
	inner: BitVec,
}
impl BitmapBlock {
	/// Maximum number of bits in a block
	const NBITS: u32 = 1 << 16;
	/// Maximum number of chunks in a block
	const NCHUNKS: usize = Self::NBITS as usize / BitmapChunk::LEN_BITS;

	/// Create an all-zero block holding `n_chunks` chunks.
	fn new(n_chunks: usize) -> Self {
		assert!(n_chunks <= BitmapBlock::NCHUNKS);
		let inner = BitVec::from_elem(n_chunks * BitmapChunk::LEN_BITS, false);
		Self { inner }
	}

	/// Number of chunks currently held by this block.
	fn n_chunks(&self) -> usize {
		let n_bits = self.inner.len();
		assert_eq!(n_bits % BitmapChunk::LEN_BITS, 0);
		let count = n_bits / BitmapChunk::LEN_BITS;
		assert!(count <= Self::NCHUNKS);
		count
	}
}
impl Writeable for BitmapBlock {
	/// Serialize the block, choosing the most compact of three encodings
	/// based on occupancy: a list of set-bit indices (sparse), a list of
	/// unset-bit indices (near-full), or the raw bitmap bytes otherwise.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		let length = self.inner.len();
		assert!(length <= Self::NBITS as usize);
		assert_eq!(length % BitmapChunk::LEN_BITS, 0);
		writer.write_u8((length / BitmapChunk::LEN_BITS) as u8)?;

		let count_pos = self.inner.iter().filter(|&v| v).count() as u32;
		// Count unset bits relative to the actual block length: the final
		// block of a segment may hold fewer than NBITS bits, and computing
		// `NBITS - count_pos` would overstate the count, making the written
		// u16 disagree with the number of indices actually emitted in the
		// negative-index encoding (and corrupt the stream).
		let count_neg = length as u32 - count_pos;
		let threshold = Self::NBITS / 16;
		if count_pos < threshold {
			// Write positive indices
			Writeable::write(&BitmapBlockSerialization::Positive, writer)?;
			writer.write_u16(count_pos as u16)?;
			for (i, _) in self.inner.iter().enumerate().filter(|&(_, v)| v) {
				writer.write_u16(i as u16)?;
			}
		} else if count_neg < threshold {
			// Write negative indices
			Writeable::write(&BitmapBlockSerialization::Negative, writer)?;
			writer.write_u16(count_neg as u16)?;
			for (i, _) in self.inner.iter().enumerate().filter(|&(_, v)| !v) {
				writer.write_u16(i as u16)?;
			}
		} else {
			// Write raw bytes. A partial block emits length/8 bytes, which
			// is exactly what the reader consumes for this chunk count;
			// asserting NBITS/8 here would panic on any partial block.
			Writeable::write(&BitmapBlockSerialization::Raw, writer)?;
			let bytes = self.inner.to_bytes();
			assert_eq!(bytes.len(), length / 8);
			writer.write_fixed_bytes(&bytes)?;
		}

		Ok(())
	}
}
impl Readable for BitmapBlock {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
let n_chunks = reader.read_u8()?;
if n_chunks as usize > BitmapBlock::NCHUNKS {
return Err(ser::Error::TooLargeReadErr);
}
let n_bits = n_chunks as usize * BitmapChunk::LEN_BITS;
let mode = Readable::read(reader)?;
let inner = match mode {
BitmapBlockSerialization::Raw => {
// Raw bytes
let bytes = reader.read_fixed_bytes(n_bits / 8)?;
BitVec::from_bytes(&bytes)
}
BitmapBlockSerialization::Positive => {
// Positive indices
let mut inner = BitVec::from_elem(n_bits, false);
let n = reader.read_u16()?;
for _ in 0..n {
inner.set(reader.read_u16()? as usize, true);
}
inner
}
BitmapBlockSerialization::Negative => {
// Negative indices
let mut inner = BitVec::from_elem(n_bits, true);
let n = reader.read_u16()?;
for _ in 0..n {
inner.set(reader.read_u16()? as usize, false);
}
inner
}
};
Ok(BitmapBlock { inner })
}
}
// Wire tag identifying which of the three `BitmapBlock` encodings follows.
// Wrapped in `enum_from_primitive!` so the tag can be decoded from its u8
// representation (see `Readable for BitmapBlockSerialization`).
enum_from_primitive! {
	#[derive(Debug, Clone, Copy, PartialEq)]
	#[repr(u8)]
	enum BitmapBlockSerialization {
		Raw = 0,      // raw bitmap bytes
		Positive = 1, // indices of set bits (sparse block)
		Negative = 2, // indices of unset bits (near-full block)
	}
}
impl Writeable for BitmapBlockSerialization {
	/// Write the encoding tag as a single byte.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		let tag = *self as u8;
		writer.write_u8(tag)
	}
}
impl Readable for BitmapBlockSerialization {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
Self::from_u8(reader.read_u8()?).ok_or(ser::Error::CorruptedData)
}
}
#[cfg(test)]
mod tests {
	use super::*;
	use crate::core::ser::{BinReader, BinWriter, ProtocolVersion, Readable, Writeable};
	use byteorder::ReadBytesExt;
	use grin_util::secp::rand::Rng;
	use rand::thread_rng;
	use std::io::Cursor;

	/// Round-trip a full (64-chunk) block through (de)serialization.
	/// Starting from an all-zero block (or all-one if `inverse`), flip
	/// `entries` bits at random positions, then check that serialization
	/// picks the expected `encoding` mode and produces exactly `length`
	/// bytes, and that deserialization restores an equal block.
	fn test_roundtrip(entries: usize, inverse: bool, encoding: u8, length: usize) {
		let mut rng = thread_rng();
		let mut block = BitmapBlock::new(64);
		if inverse {
			block.inner.negate();
		}

		// Flip `entries` bits in random spots
		let mut count = 0;
		while count < entries {
			let idx = rng.gen_range(0, BitmapBlock::NBITS as usize);
			if block.inner.get(idx).unwrap() == inverse {
				count += 1;
				block.inner.set(idx, !inverse);
			}
		}

		// Serialize
		let mut cursor = Cursor::new(Vec::<u8>::new());
		let mut writer = BinWriter::new(&mut cursor, ProtocolVersion(1));
		Writeable::write(&block, &mut writer).unwrap();

		// Check encoding type (the byte right after the chunk count) and length
		cursor.set_position(1);
		assert_eq!(cursor.read_u8().unwrap(), encoding);
		let actual_length = cursor.get_ref().len();
		assert_eq!(actual_length, length);
		// Whatever the mode, never larger than the raw encoding
		assert!(actual_length <= 2 + BitmapBlock::NBITS as usize / 8);

		// Deserialize
		cursor.set_position(0);
		let mut reader = BinReader::new(&mut cursor, ProtocolVersion(1));
		let block2: BitmapBlock = Readable::read(&mut reader).unwrap();
		assert_eq!(block, block2);
	}

	/// Mid-occupancy blocks use the raw (mode 0) encoding.
	#[test]
	fn block_ser_roundtrip() {
		let threshold = BitmapBlock::NBITS as usize / 16;
		let entries = thread_rng().gen_range(threshold, 4 * threshold);
		test_roundtrip(entries, false, 0, 2 + BitmapBlock::NBITS as usize / 8);
		test_roundtrip(entries, true, 0, 2 + BitmapBlock::NBITS as usize / 8);
	}

	/// Sparse blocks use the positive-index (mode 1) encoding.
	#[test]
	fn sparse_block_ser_roundtrip() {
		let entries = thread_rng().gen_range(1024, BitmapBlock::NBITS as usize / 16);
		test_roundtrip(entries, false, 1, 4 + 2 * entries);
	}

	/// Nearly-full blocks use the negative-index (mode 2) encoding.
	/// (Renamed from `abdundant_…` — typo in the original test name.)
	#[test]
	fn abundant_block_ser_roundtrip() {
		let entries = thread_rng().gen_range(1024, BitmapBlock::NBITS as usize / 16);
		test_roundtrip(entries, true, 2, 4 + 2 * entries);
	}
}

View file

@ -0,0 +1,79 @@
use self::chain::txhashset::{BitmapAccumulator, BitmapSegment};
use self::core::core::pmmr::segment::{Segment, SegmentIdentifier};
use self::core::ser::{BinReader, BinWriter, ProtocolVersion, Readable, Writeable};
use croaring::Bitmap;
use grin_chain as chain;
use grin_core as core;
use grin_util::secp::rand::Rng;
use rand::thread_rng;
use std::io::Cursor;
/// Build a bitmap with `entries` random bits concentrated in a single
/// 2^16-bit window of a 2^12-leaf segment, then check that converting the
/// segment to `BitmapSegment`, serializing, deserializing and converting
/// back is lossless.
fn test_roundtrip(entries: usize) {
	let mut rng = thread_rng();
	let identifier = SegmentIdentifier {
		height: 12,
		idx: rng.gen_range(8, 16),
	};

	// Pick one 2^16-bit block inside the segment and fill it with
	// `entries` distinct random bits
	let block = rng.gen_range(2, 64);
	let mut bitmap = Bitmap::create();
	let block_size = 1 << 16;
	let offset = (1 << identifier.height) * 1024 * identifier.idx + block_size * block;
	let mut added = 0;
	while added < entries {
		let candidate = (offset + rng.gen_range(0, block_size)) as u32;
		if !bitmap.contains(candidate) {
			added += 1;
			bitmap.add(candidate);
		}
	}

	// Add a bunch of segments after the one we are interested in
	let size =
		bitmap.maximum().unwrap() as u64 + (1 << identifier.height) * 1024 * rng.gen_range(0, 64);

	// Construct the accumulator and extract the segment under test
	let mut accumulator = BitmapAccumulator::new();
	accumulator
		.init(bitmap.iter().map(|v| v as u64), size)
		.unwrap();
	let mmr = accumulator.readonly_pmmr();
	let segment = Segment::from_pmmr(identifier, &mmr, false).unwrap();

	// Convert to the compact representation and serialize it
	let bms = BitmapSegment::from(segment.clone());
	let mut cursor = Cursor::new(Vec::<u8>::new());
	let mut writer = BinWriter::new(&mut cursor, ProtocolVersion(1));
	Writeable::write(&bms, &mut writer).unwrap();

	// Deserialize and compare with what we serialized
	cursor.set_position(0);
	let mut reader = BinReader::new(&mut cursor, ProtocolVersion(1));
	let restored: BitmapSegment = Readable::read(&mut reader).unwrap();
	assert_eq!(bms, restored);

	// Converting back must reproduce the original segment
	let segment2 = Segment::from(restored);
	assert_eq!(segment, segment2);
}
/// Mid-occupancy segment: blocks fall back to the raw encoding.
#[test]
fn segment_ser_roundtrip() {
	let threshold = 4096;
	let entries = thread_rng().gen_range(threshold, 4 * threshold);
	test_roundtrip(entries);
}
/// Sparse segment: blocks use the positive-index encoding.
#[test]
fn sparse_segment_ser_roundtrip() {
	let entries = thread_rng().gen_range(1024, 4096);
	test_roundtrip(entries);
}
/// Nearly-full segment: blocks use the negative-index encoding.
#[test]
fn abundant_segment_ser_roundtrip() {
	let max = 1 << 16;
	let entries = thread_rng().gen_range(max - 4096, max - 1024);
	test_roundtrip(entries);
}

View file

@ -142,6 +142,64 @@ impl<T> Segment<T> {
.ok_or_else(|| SegmentError::MissingHash(pos))
}
/// Get the identifier associated with this segment (returned by value).
pub fn identifier(&self) -> SegmentIdentifier {
	self.identifier
}
/// Consume the segment and return its parts, in order:
/// `(identifier, hash_pos, hashes, leaf_pos, leaf_data, proof)`.
pub fn parts(
	self,
) -> (
	SegmentIdentifier,
	Vec<u64>,
	Vec<Hash>,
	Vec<u64>,
	Vec<T>,
	SegmentProof,
) {
	(
		self.identifier,
		self.hash_pos,
		self.hashes,
		self.leaf_pos,
		self.leaf_data,
		self.proof,
	)
}
/// Construct a segment from its parts.
///
/// # Panics
/// If the hash/leaf vectors disagree in length, or if either position
/// list is not strictly increasing (positions are nonzero, so 1-based).
pub fn from_parts(
	identifier: SegmentIdentifier,
	hash_pos: Vec<u64>,
	hashes: Vec<Hash>,
	leaf_pos: Vec<u64>,
	leaf_data: Vec<T>,
	proof: SegmentProof,
) -> Self {
	// Positions must be sorted, unique and strictly positive.
	fn assert_strictly_increasing(pos: &[u64]) {
		let mut prev = 0u64;
		for &p in pos {
			assert!(p > prev);
			prev = p;
		}
	}

	assert_eq!(hash_pos.len(), hashes.len());
	assert_strictly_increasing(&hash_pos);
	assert_eq!(leaf_pos.len(), leaf_data.len());
	assert_strictly_increasing(&leaf_pos);

	Self {
		identifier,
		hash_pos,
		hashes,
		leaf_pos,
		leaf_data,
		proof,
	}
}
/// Iterator of all the leaves in the segment
pub fn leaf_iter(&self) -> impl Iterator<Item = (u64, &T)> + '_ {
self.leaf_pos.iter().map(|&p| p).zip(&self.leaf_data)