More efficient serialization for bitmap segments (#3492)

* More efficient serialization for bitmap segments

* Rename a const

* Correctly count number of chunks in a segment

* Enum for BitmapBlock (de)ser mode

* Add more segments in test

* Fix duplicate function
This commit is contained in:
jaspervdm 2020-11-24 15:19:07 +01:00 committed by GitHub
parent b3938de8b3
commit 055b684416
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 442 additions and 1 deletion

View file

@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::min;
use std::convert::TryFrom;
use std::time::Instant;
@ -19,9 +20,11 @@ use bit_vec::BitVec;
use croaring::Bitmap;
use crate::core::core::hash::{DefaultHashable, Hash};
use crate::core::core::pmmr::segment::{Segment, SegmentIdentifier, SegmentProof};
use crate::core::core::pmmr::{self, ReadablePMMR, ReadonlyPMMR, VecBackend, PMMR};
use crate::core::ser::{self, PMMRable, Readable, Reader, Writeable, Writer};
use crate::error::{Error, ErrorKind};
use enum_primitive::FromPrimitive;
/// The "bitmap accumulator" allows us to commit to a specific bitmap by splitting it into
/// fragments and inserting these fragments into an MMR to produce an overall root hash.
@ -187,7 +190,7 @@ impl BitmapAccumulator {
/// A bitmap "chunk" representing 1024 contiguous bits of the overall bitmap.
/// The first 1024 bits belong in one chunk. The next 1024 bits in the next chunk, etc.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BitmapChunk(BitVec);
impl BitmapChunk {
@ -242,3 +245,304 @@ impl Readable for BitmapChunk {
Ok(BitmapChunk::new())
}
}
/// A `Segment<BitmapChunk>` in a form that serializes compactly: the leaf
/// chunks are regrouped into `BitmapBlock`s, each of which picks the most
/// space-efficient encoding for its occupancy.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BitmapSegment {
	// Position of this segment within the overall bitmap MMR
	identifier: SegmentIdentifier,
	// The leaf data, regrouped into blocks of up to 2^16 bits each
	blocks: Vec<BitmapBlock>,
	// Merkle proof for this segment
	proof: SegmentProof,
}
impl Writeable for BitmapSegment {
	/// Serialize as: identifier, block count (u16), the blocks themselves,
	/// and finally the segment proof.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		Writeable::write(&self.identifier, writer)?;
		writer.write_u16(self.blocks.len() as u16)?;
		self.blocks
			.iter()
			.try_for_each(|block| Writeable::write(block, writer))?;
		Writeable::write(&self.proof, writer)
	}
}
impl Readable for BitmapSegment {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
let identifier: SegmentIdentifier = Readable::read(reader)?;
let n_blocks = reader.read_u16()? as usize;
let mut blocks = Vec::<BitmapBlock>::with_capacity(n_blocks);
for _ in 0..n_blocks {
blocks.push(Readable::read(reader)?);
}
let proof = Readable::read(reader)?;
Ok(Self {
identifier,
blocks,
proof,
})
}
}
// TODO: this can be sped up with some `unsafe` code
impl From<Segment<BitmapChunk>> for BitmapSegment {
fn from(segment: Segment<BitmapChunk>) -> Self {
let (identifier, _, _, _, leaf_data, proof) = segment.parts();
let mut chunks_left = leaf_data.len();
let mut blocks =
Vec::with_capacity((chunks_left + BitmapBlock::NCHUNKS - 1) / BitmapBlock::NCHUNKS);
while chunks_left > 0 {
let n_chunks = min(BitmapBlock::NCHUNKS, chunks_left);
chunks_left = chunks_left.saturating_sub(n_chunks);
blocks.push(BitmapBlock::new(n_chunks));
}
for (chunk_idx, chunk) in leaf_data.into_iter().enumerate() {
assert_eq!(chunk.0.len(), BitmapChunk::LEN_BITS);
let block = &mut blocks
.get_mut(chunk_idx / BitmapBlock::NCHUNKS)
.unwrap()
.inner;
let offset = (chunk_idx % BitmapBlock::NCHUNKS) * BitmapChunk::LEN_BITS;
for (i, _) in chunk.0.iter().enumerate().filter(|&(_, v)| v) {
block.set(offset + i, true);
}
}
Self {
identifier,
blocks,
proof,
}
}
}
// TODO: this can be sped up with some `unsafe` code
impl From<BitmapSegment> for Segment<BitmapChunk> {
	/// Expand the blocks back into 1024-bit chunks and rebuild the segment,
	/// recomputing the pmmr leaf position of every chunk.
	fn from(segment: BitmapSegment) -> Self {
		let BitmapSegment {
			identifier,
			blocks,
			proof,
		} = segment;

		// Count the number of chunks taking into account that the final block might be smaller
		let n_chunks = blocks.len().saturating_sub(1) * BitmapBlock::NCHUNKS
			+ blocks.last().map(|b| b.n_chunks()).unwrap_or(0);
		let mut leaf_pos = Vec::with_capacity(n_chunks);
		let mut chunks = Vec::with_capacity(n_chunks);
		// Leaf insertion index of the first chunk in this segment (1-based)
		let offset = (1 << identifier.height) * identifier.idx + 1;
		for i in 0..(n_chunks as u64) {
			leaf_pos.push(pmmr::insertion_to_pmmr_index(offset + i));
			chunks.push(BitmapChunk::new());
		}

		for (block_idx, block) in blocks.into_iter().enumerate() {
			// The final block may hold fewer than NBITS bits (it is created
			// with `BitmapBlock::new(n_chunks < NCHUNKS)` above), so only
			// require a whole number of chunks within the maximum size.
			// A strict `assert_eq!` against NBITS would panic on any
			// segment whose last block is partial.
			let n_bits = block.inner.len();
			assert!(n_bits <= BitmapBlock::NBITS as usize);
			assert_eq!(n_bits % BitmapChunk::LEN_BITS, 0);
			let offset = block_idx * BitmapBlock::NCHUNKS;
			for (i, _) in block.inner.iter().enumerate().filter(|&(_, v)| v) {
				chunks
					.get_mut(offset + i / BitmapChunk::LEN_BITS)
					.unwrap()
					.0
					.set(i % BitmapChunk::LEN_BITS, true);
			}
		}

		Segment::from_parts(identifier, Vec::new(), Vec::new(), leaf_pos, chunks, proof)
	}
}
/// A block of 2^16 bits that provides an efficient (de)serialization
/// depending on the bitmap occupancy.
#[derive(Clone, Debug, PartialEq, Eq)]
struct BitmapBlock {
	// The bits themselves; always a whole number of 1024-bit chunks,
	// and the final block of a segment may hold fewer than 2^16 bits.
	inner: BitVec,
}
impl BitmapBlock {
	/// Maximum number of bits in a block
	const NBITS: u32 = 1 << 16;
	/// Maximum number of chunks in a block
	const NCHUNKS: usize = Self::NBITS as usize / BitmapChunk::LEN_BITS;

	/// Create an all-zero block holding `n_chunks` chunks.
	fn new(n_chunks: usize) -> Self {
		assert!(n_chunks <= BitmapBlock::NCHUNKS);
		let inner = BitVec::from_elem(n_chunks * BitmapChunk::LEN_BITS, false);
		Self { inner }
	}

	/// Number of chunks currently held by this block.
	fn n_chunks(&self) -> usize {
		let n_bits = self.inner.len();
		assert_eq!(n_bits % BitmapChunk::LEN_BITS, 0);
		let count = n_bits / BitmapChunk::LEN_BITS;
		assert!(count <= Self::NCHUNKS);
		count
	}
}
impl Writeable for BitmapBlock {
	/// Serialize the block, choosing the most compact of three encodings
	/// based on occupancy: a list of set-bit indices (sparse), a list of
	/// unset-bit indices (near-full), or the raw bitmap bytes otherwise.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		let length = self.inner.len();
		assert!(length <= Self::NBITS as usize);
		assert_eq!(length % BitmapChunk::LEN_BITS, 0);
		writer.write_u8((length / BitmapChunk::LEN_BITS) as u8)?;

		let count_pos = self.inner.iter().filter(|&v| v).count() as u32;
		// Count unset bits relative to the actual block length: the final
		// block of a segment may hold fewer than NBITS bits, and computing
		// `NBITS - count_pos` would overstate the count, making the written
		// u16 disagree with the number of indices actually emitted in the
		// negative-index encoding (and corrupt the stream).
		let count_neg = length as u32 - count_pos;
		let threshold = Self::NBITS / 16;
		if count_pos < threshold {
			// Write positive indices
			Writeable::write(&BitmapBlockSerialization::Positive, writer)?;
			writer.write_u16(count_pos as u16)?;
			for (i, _) in self.inner.iter().enumerate().filter(|&(_, v)| v) {
				writer.write_u16(i as u16)?;
			}
		} else if count_neg < threshold {
			// Write negative indices
			Writeable::write(&BitmapBlockSerialization::Negative, writer)?;
			writer.write_u16(count_neg as u16)?;
			for (i, _) in self.inner.iter().enumerate().filter(|&(_, v)| !v) {
				writer.write_u16(i as u16)?;
			}
		} else {
			// Write raw bytes. A partial block emits length/8 bytes, which
			// is exactly what the reader consumes for this chunk count;
			// asserting NBITS/8 here would panic on any partial block.
			Writeable::write(&BitmapBlockSerialization::Raw, writer)?;
			let bytes = self.inner.to_bytes();
			assert_eq!(bytes.len(), length / 8);
			writer.write_fixed_bytes(&bytes)?;
		}

		Ok(())
	}
}
impl Readable for BitmapBlock {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
let n_chunks = reader.read_u8()?;
if n_chunks as usize > BitmapBlock::NCHUNKS {
return Err(ser::Error::TooLargeReadErr);
}
let n_bits = n_chunks as usize * BitmapChunk::LEN_BITS;
let mode = Readable::read(reader)?;
let inner = match mode {
BitmapBlockSerialization::Raw => {
// Raw bytes
let bytes = reader.read_fixed_bytes(n_bits / 8)?;
BitVec::from_bytes(&bytes)
}
BitmapBlockSerialization::Positive => {
// Positive indices
let mut inner = BitVec::from_elem(n_bits, false);
let n = reader.read_u16()?;
for _ in 0..n {
inner.set(reader.read_u16()? as usize, true);
}
inner
}
BitmapBlockSerialization::Negative => {
// Negative indices
let mut inner = BitVec::from_elem(n_bits, true);
let n = reader.read_u16()?;
for _ in 0..n {
inner.set(reader.read_u16()? as usize, false);
}
inner
}
};
Ok(BitmapBlock { inner })
}
}
// Wire tag identifying which of the three `BitmapBlock` encodings follows.
// Wrapped in `enum_from_primitive!` so the tag can be decoded from its u8
// representation (see `Readable for BitmapBlockSerialization`).
enum_from_primitive! {
	#[derive(Debug, Clone, Copy, PartialEq)]
	#[repr(u8)]
	enum BitmapBlockSerialization {
		Raw = 0,      // raw bitmap bytes
		Positive = 1, // indices of set bits (sparse block)
		Negative = 2, // indices of unset bits (near-full block)
	}
}
impl Writeable for BitmapBlockSerialization {
	/// Write the encoding tag as a single byte.
	fn write<W: Writer>(&self, writer: &mut W) -> Result<(), ser::Error> {
		let tag = *self as u8;
		writer.write_u8(tag)
	}
}
impl Readable for BitmapBlockSerialization {
fn read<R: Reader>(reader: &mut R) -> Result<Self, ser::Error> {
Self::from_u8(reader.read_u8()?).ok_or(ser::Error::CorruptedData)
}
}
#[cfg(test)]
mod tests {
	use super::*;
	use crate::core::ser::{BinReader, BinWriter, ProtocolVersion, Readable, Writeable};
	use byteorder::ReadBytesExt;
	use grin_util::secp::rand::Rng;
	use rand::thread_rng;
	use std::io::Cursor;

	/// Round-trip a full (64-chunk) block through (de)serialization.
	/// Starting from an all-zero block (or all-one if `inverse`), flip
	/// `entries` bits at random positions, then check that serialization
	/// picks the expected `encoding` mode and produces exactly `length`
	/// bytes, and that deserialization restores an equal block.
	fn test_roundtrip(entries: usize, inverse: bool, encoding: u8, length: usize) {
		let mut rng = thread_rng();
		let mut block = BitmapBlock::new(64);
		if inverse {
			block.inner.negate();
		}

		// Flip `entries` bits in random spots
		let mut count = 0;
		while count < entries {
			let idx = rng.gen_range(0, BitmapBlock::NBITS as usize);
			if block.inner.get(idx).unwrap() == inverse {
				count += 1;
				block.inner.set(idx, !inverse);
			}
		}

		// Serialize
		let mut cursor = Cursor::new(Vec::<u8>::new());
		let mut writer = BinWriter::new(&mut cursor, ProtocolVersion(1));
		Writeable::write(&block, &mut writer).unwrap();

		// Check encoding type (the byte right after the chunk count) and length
		cursor.set_position(1);
		assert_eq!(cursor.read_u8().unwrap(), encoding);
		let actual_length = cursor.get_ref().len();
		assert_eq!(actual_length, length);
		// Whatever the mode, never larger than the raw encoding
		assert!(actual_length <= 2 + BitmapBlock::NBITS as usize / 8);

		// Deserialize
		cursor.set_position(0);
		let mut reader = BinReader::new(&mut cursor, ProtocolVersion(1));
		let block2: BitmapBlock = Readable::read(&mut reader).unwrap();
		assert_eq!(block, block2);
	}

	/// Mid-occupancy blocks use the raw (mode 0) encoding.
	#[test]
	fn block_ser_roundtrip() {
		let threshold = BitmapBlock::NBITS as usize / 16;
		let entries = thread_rng().gen_range(threshold, 4 * threshold);
		test_roundtrip(entries, false, 0, 2 + BitmapBlock::NBITS as usize / 8);
		test_roundtrip(entries, true, 0, 2 + BitmapBlock::NBITS as usize / 8);
	}

	/// Sparse blocks use the positive-index (mode 1) encoding.
	#[test]
	fn sparse_block_ser_roundtrip() {
		let entries = thread_rng().gen_range(1024, BitmapBlock::NBITS as usize / 16);
		test_roundtrip(entries, false, 1, 4 + 2 * entries);
	}

	/// Nearly-full blocks use the negative-index (mode 2) encoding.
	/// (Renamed from `abdundant_…` — typo in the original test name.)
	#[test]
	fn abundant_block_ser_roundtrip() {
		let entries = thread_rng().gen_range(1024, BitmapBlock::NBITS as usize / 16);
		test_roundtrip(entries, true, 2, 4 + 2 * entries);
	}
}

View file

@ -0,0 +1,79 @@
use self::chain::txhashset::{BitmapAccumulator, BitmapSegment};
use self::core::core::pmmr::segment::{Segment, SegmentIdentifier};
use self::core::ser::{BinReader, BinWriter, ProtocolVersion, Readable, Writeable};
use croaring::Bitmap;
use grin_chain as chain;
use grin_core as core;
use grin_util::secp::rand::Rng;
use rand::thread_rng;
use std::io::Cursor;
/// Build a bitmap with `entries` random bits concentrated in a single
/// 2^16-bit window of a 2^12-leaf segment, then check that converting the
/// segment to `BitmapSegment`, serializing, deserializing and converting
/// back is lossless.
fn test_roundtrip(entries: usize) {
	let mut rng = thread_rng();
	let identifier = SegmentIdentifier {
		height: 12,
		idx: rng.gen_range(8, 16),
	};

	// Pick one 2^16-bit block inside the segment and fill it with
	// `entries` distinct random bits
	let block = rng.gen_range(2, 64);
	let mut bitmap = Bitmap::create();
	let block_size = 1 << 16;
	let offset = (1 << identifier.height) * 1024 * identifier.idx + block_size * block;
	let mut added = 0;
	while added < entries {
		let candidate = (offset + rng.gen_range(0, block_size)) as u32;
		if !bitmap.contains(candidate) {
			added += 1;
			bitmap.add(candidate);
		}
	}

	// Add a bunch of segments after the one we are interested in
	let size =
		bitmap.maximum().unwrap() as u64 + (1 << identifier.height) * 1024 * rng.gen_range(0, 64);

	// Construct the accumulator and extract the segment under test
	let mut accumulator = BitmapAccumulator::new();
	accumulator
		.init(bitmap.iter().map(|v| v as u64), size)
		.unwrap();
	let mmr = accumulator.readonly_pmmr();
	let segment = Segment::from_pmmr(identifier, &mmr, false).unwrap();

	// Convert to the compact representation and serialize it
	let bms = BitmapSegment::from(segment.clone());
	let mut cursor = Cursor::new(Vec::<u8>::new());
	let mut writer = BinWriter::new(&mut cursor, ProtocolVersion(1));
	Writeable::write(&bms, &mut writer).unwrap();

	// Deserialize and compare with what we serialized
	cursor.set_position(0);
	let mut reader = BinReader::new(&mut cursor, ProtocolVersion(1));
	let restored: BitmapSegment = Readable::read(&mut reader).unwrap();
	assert_eq!(bms, restored);

	// Converting back must reproduce the original segment
	let segment2 = Segment::from(restored);
	assert_eq!(segment, segment2);
}
/// Mid-occupancy segment: blocks fall back to the raw encoding.
#[test]
fn segment_ser_roundtrip() {
	let threshold = 4096;
	let entries = thread_rng().gen_range(threshold, 4 * threshold);
	test_roundtrip(entries);
}
/// Sparse segment: blocks use the positive-index encoding.
#[test]
fn sparse_segment_ser_roundtrip() {
	let entries = thread_rng().gen_range(1024, 4096);
	test_roundtrip(entries);
}
/// Nearly-full segment: blocks use the negative-index encoding.
#[test]
fn abundant_segment_ser_roundtrip() {
	let max = 1 << 16;
	let entries = thread_rng().gen_range(max - 4096, max - 1024);
	test_roundtrip(entries);
}

View file

@ -142,6 +142,64 @@ impl<T> Segment<T> {
.ok_or_else(|| SegmentError::MissingHash(pos))
}
/// Get the identifier associated with this segment (returned by value).
pub fn identifier(&self) -> SegmentIdentifier {
	self.identifier
}
/// Consume the segment and return its parts, in order:
/// `(identifier, hash_pos, hashes, leaf_pos, leaf_data, proof)`.
pub fn parts(
	self,
) -> (
	SegmentIdentifier,
	Vec<u64>,
	Vec<Hash>,
	Vec<u64>,
	Vec<T>,
	SegmentProof,
) {
	(
		self.identifier,
		self.hash_pos,
		self.hashes,
		self.leaf_pos,
		self.leaf_data,
		self.proof,
	)
}
/// Construct a segment from its parts.
///
/// # Panics
/// If the hash/leaf vectors disagree in length, or if either position
/// list is not strictly increasing (positions are nonzero, so 1-based).
pub fn from_parts(
	identifier: SegmentIdentifier,
	hash_pos: Vec<u64>,
	hashes: Vec<Hash>,
	leaf_pos: Vec<u64>,
	leaf_data: Vec<T>,
	proof: SegmentProof,
) -> Self {
	// Positions must be sorted, unique and strictly positive.
	fn assert_strictly_increasing(pos: &[u64]) {
		let mut prev = 0u64;
		for &p in pos {
			assert!(p > prev);
			prev = p;
		}
	}

	assert_eq!(hash_pos.len(), hashes.len());
	assert_strictly_increasing(&hash_pos);
	assert_eq!(leaf_pos.len(), leaf_data.len());
	assert_strictly_increasing(&leaf_pos);

	Self {
		identifier,
		hash_pos,
		hashes,
		leaf_pos,
		leaf_data,
		proof,
	}
}
/// Iterator of all the leaves in the segment
pub fn leaf_iter(&self) -> impl Iterator<Item = (u64, &T)> + '_ {
self.leaf_pos.iter().map(|&p| p).zip(&self.leaf_data)