From 7725a05ac129a192fb1f55a48022ae6ba71b018e Mon Sep 17 00:00:00 2001
From: Yeastplume <yeastplume@protonmail.com>
Date: Mon, 6 Dec 2021 16:35:23 +0000
Subject: [PATCH] [SYNC PERFORMANCE] Replace header proof serialisation with
 more efficient algorithm (#3670)

* replace bitvec with more efficient bitpack algorithm

* optimise proof_unpack_len

* move proof pack length calculation

* small refactor

* integrate suggestions in #3670

* finish compressing compression function

* remove ordering cmp from pack function

* remainder fix for new logic

* remove println statements

* remove ordering import warning
---
 chain/tests/test_header_perf.rs | 119 ++++++++++++++++++++++++++++++++
 core/src/global.rs              |   4 +-
 core/src/pow/types.rs           |  89 +++++++++++++-----------
 3 files changed, 170 insertions(+), 42 deletions(-)
 create mode 100644 chain/tests/test_header_perf.rs

diff --git a/chain/tests/test_header_perf.rs b/chain/tests/test_header_perf.rs
new file mode 100644
index 000000000..f948606b9
--- /dev/null
+++ b/chain/tests/test_header_perf.rs
@@ -0,0 +1,119 @@
+// Copyright 2021 The Grin Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use grin_chain as chain;
+use grin_core as core;
+use grin_util as util;
+
+#[macro_use]
+extern crate log;
+
+use std::sync::Arc;
+
+use crate::chain::types::{NoopAdapter, Options};
+use crate::core::core::hash::Hashed;
+use crate::core::{genesis, global, pow};
+
+use self::chain_test_helper::clean_output_dir;
+
+mod chain_test_helper;
+
+fn test_header_perf_impl(is_test_chain: bool, src_root_dir: &str, dest_root_dir: &str) {
+	global::set_local_chain_type(global::ChainTypes::Mainnet);
+	let mut genesis = genesis::genesis_main();
+
+	if is_test_chain {
+		global::set_local_chain_type(global::ChainTypes::AutomatedTesting);
+		genesis = pow::mine_genesis_block().unwrap();
+	}
+
+	{
+		debug!("Reading Chain, genesis block: {}", genesis.hash());
+		let dummy_adapter = Arc::new(NoopAdapter {});
+
+		// The original chain we're reading from
+		let src_chain = Arc::new(
+			chain::Chain::init(
+				src_root_dir.into(),
+				dummy_adapter.clone(),
+				genesis.clone(),
+				pow::verify_size,
+				false,
+			)
+			.unwrap(),
+		);
+
+		// And the output chain we're writing to
+		let dest_chain = Arc::new(
+			chain::Chain::init(
+				dest_root_dir.into(),
+				dummy_adapter,
+				genesis.clone(),
+				pow::verify_size,
+				false,
+			)
+			.unwrap(),
+		);
+
+		let sh = src_chain.get_header_by_height(0).unwrap();
+		debug!("Source Genesis - {}", sh.hash());
+
+		let dh = dest_chain.get_header_by_height(0).unwrap();
+		debug!("Destination Genesis - {}", dh.hash());
+
+		let horizon_header = src_chain.txhashset_archive_header().unwrap();
+
+		debug!("Horizon header: {:?}", horizon_header);
+
+		// Copy the first 100000 headers from source to destination, syncing in chunks
+		let dest_sync_head = dest_chain.header_head().unwrap();
+		let copy_chunk_size = 1000;
+		let mut copied_header_index = 1;
+		let mut src_headers = vec![];
+		while copied_header_index <= 100000 {
+			let h = src_chain.get_header_by_height(copied_header_index).unwrap();
+			src_headers.push(h);
+			copied_header_index += 1;
+			if copied_header_index % copy_chunk_size == 0 {
+				debug!(
+					"Copying headers to {} of {}",
+					copied_header_index, horizon_header.height
+				);
+				dest_chain
+					.sync_block_headers(&src_headers, dest_sync_head, Options::NONE)
+					.unwrap();
+				src_headers = vec![];
+			}
+		}
+		if !src_headers.is_empty() {
+			dest_chain
+				.sync_block_headers(&src_headers, dest_sync_head, Options::NONE)
+				.unwrap();
+		}
+	}
+}
+
+#[test]
+#[ignore]
+// Ignored during CI; run manually against a real chain instance at the paths set below
+fn test_header_perf() {
+	util::init_test_logger();
+	// if testing against a real chain, insert its location here
+	// NOTE: Modify to point at your own paths
+	let src_root_dir = format!("/Users/yeastplume/Projects/grin_project/server/chain_data");
+	let dest_root_dir = format!("/Users/yeastplume/Projects/grin_project/server/.chain_data_copy");
+	clean_output_dir(&dest_root_dir);
+	test_header_perf_impl(false, &src_root_dir, &dest_root_dir);
+	clean_output_dir(&dest_root_dir);
+}
diff --git a/core/src/global.rs b/core/src/global.rs
index 9b67d9369..663d7e4d8 100644
--- a/core/src/global.rs
+++ b/core/src/global.rs
@@ -25,7 +25,7 @@ use crate::consensus::{
 use crate::core::block::HeaderVersion;
 use crate::pow::{
 	self, new_cuckaroo_ctx, new_cuckarood_ctx, new_cuckaroom_ctx, new_cuckarooz_ctx,
-	new_cuckatoo_ctx, no_cuckaroo_ctx, BitVec, PoWContext,
+	new_cuckatoo_ctx, no_cuckaroo_ctx, PoWContext, Proof,
 };
 use crate::ser::ProtocolVersion;
 use std::cell::Cell;
@@ -488,7 +488,7 @@ where
 #[inline]
 pub fn header_size_bytes(edge_bits: u8) -> usize {
 	let size = 2 + 2 * 8 + 5 * 32 + 32 + 2 * 8;
-	let proof_size = 8 + 4 + 8 + 1 + BitVec::bytes_len(edge_bits as usize * proofsize());
+	let proof_size = 8 + 4 + 8 + 1 + Proof::pack_len(edge_bits);
 	size + proof_size
 }
 
diff --git a/core/src/pow/types.rs b/core/src/pow/types.rs
index 89ed9a262..0fa81c1ba 100644
--- a/core/src/pow/types.rs
+++ b/core/src/pow/types.rs
@@ -23,6 +23,7 @@ use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
 /// proof of work within a block header.
 use std::cmp::{max, min};
 use std::ops::{Add, Div, Mul, Sub};
+use std::u64;
 use std::{fmt, iter};
 
 /// Generic trait for a solver/verifier providing common interface into Cuckoo-family PoW
@@ -325,8 +326,8 @@ impl ProofOfWork {
 /// The hash of the `Proof` is the hash of its packed nonces when serializing
 /// them at their exact bit size. The resulting bit sequence is padded to be
 /// byte-aligned. We form a PROOFSIZE*edge_bits integer by packing the PROOFSIZE edge
-/// indices together, with edge index i occupying bits i * edge_bits through 
-/// (i+1) * edge_bits - 1, padding it with up to 7 0-bits to a multiple of 8 bits, 
+/// indices together, with edge index i occupying bits i * edge_bits through
+/// (i+1) * edge_bits - 1, padding it with up to 7 0-bits to a multiple of 8 bits,
 /// writing as a little endian byte array, and hashing with blake2b using 256 bit digest.
 
 #[derive(Clone, PartialOrd, PartialEq, Serialize)]
@@ -372,6 +373,11 @@ impl Proof {
 		}
 	}
 
+	/// Number of bytes required to store a proof with the given edge bits
+	pub fn pack_len(bit_width: u8) -> usize {
+		(bit_width as usize * global::proofsize() + 7) / 8
+	}
+
 	/// Builds a proof with random POW data,
 	/// needed so that tests that ignore POW
 	/// don't fail due to duplicate hashes
@@ -396,6 +402,17 @@ impl Proof {
 		self.nonces.len()
 	}
 
+	/// Pack the proof's nonces at exactly `edge_bits` bits each, as described above
+	pub fn pack_nonces(&self) -> Vec<u8> {
+		let mut compressed = vec![0u8; Proof::pack_len(self.edge_bits)];
+		pack_bits(
+			self.edge_bits,
+			&self.nonces[0..self.nonces.len()],
+			&mut compressed,
+		);
+		compressed
+	}
+
 	/// Difficulty achieved by this proof with given scaling factor
 	fn scaled_difficulty(&self, scale: u64) -> u64 {
 		let diff = ((scale as u128) << 64) / (max(1, self.hash().to_u64()) as u128);
@@ -403,6 +420,34 @@ impl Proof {
 	}
 }
 
+/// Pack a slice of u64s into `compressed` at `bit_width` bits each (values are not
+/// masked to that width). Caller must ensure `compressed` is the right size
+fn pack_bits(bit_width: u8, uncompressed: &[u64], mut compressed: &mut [u8]) {
+	// A `u64` serves as a 64 bit mini buffer, filled from its low bits upwards, with
+	// `remaining` counting its free bits. When an element fills it, it is copied to
+	// `compressed` as 8 little endian bytes and restarted with the leftover bits.
+	let mut mini_buffer = 0u64;
+	let mut remaining = 64;
+	for el in uncompressed {
+		mini_buffer |= el << (64 - remaining);
+		if bit_width < remaining {
+			remaining -= bit_width;
+		} else {
+			compressed[..8].copy_from_slice(&mini_buffer.to_le_bytes());
+			compressed = &mut compressed[8..];
+			mini_buffer = el >> remaining;
+			remaining = 64 + remaining - bit_width;
+		}
+	}
+	let mut remainder = compressed.len() % 8;
+	if remainder == 0 {
+		remainder = 8;
+	}
+	if mini_buffer > 0 {
+		compressed[..].copy_from_slice(&mini_buffer.to_le_bytes()[..remainder]);
+	}
+}
+
 fn extract_bits(bits: &[u8], bit_start: usize, bit_count: usize, read_from: usize) -> u64 {
 	let mut buf: [u8; 8] = [0; 8];
 	buf.copy_from_slice(&bits[read_from..read_from + 8]);
@@ -448,8 +493,7 @@ impl Readable for Proof {
 		// prepare nonces and read the right number of bytes
 		let mut nonces = Vec::with_capacity(global::proofsize());
 		let nonce_bits = edge_bits as usize;
-		let bits_len = nonce_bits * global::proofsize();
-		let bytes_len = BitVec::bytes_len(bits_len);
+		let bytes_len = Proof::pack_len(edge_bits);
 		if bytes_len < 8 {
 			return Err(ser::Error::CorruptedData);
 		}
@@ -475,42 +519,7 @@ impl Writeable for Proof {
 		if writer.serialization_mode() != ser::SerializationMode::Hash {
 			writer.write_u8(self.edge_bits)?;
 		}
-		let nonce_bits = self.edge_bits as usize;
-		let mut bitvec = BitVec::new(nonce_bits * global::proofsize());
-		for (n, nonce) in self.nonces.iter().enumerate() {
-			for bit in 0..nonce_bits {
-				if nonce & (1 << bit) != 0 {
-					bitvec.set_bit_at(n * nonce_bits + (bit as usize))
-				}
-			}
-		}
-		writer.write_fixed_bytes(&bitvec.bits)?;
-		Ok(())
-	}
-}
-
-/// A bit vector
-// TODO this could likely be optimized by writing whole bytes (or even words)
-// in the `BitVec` at once, dealing with the truncation, instead of bits by bits
-pub struct BitVec {
-	bits: Vec<u8>,
-}
-
-impl BitVec {
-	/// Number of bytes required to store the provided number of bits
-	#[inline]
-	pub fn bytes_len(bits_len: usize) -> usize {
-		(bits_len + 7) / 8
-	}
-
-	fn new(bits_len: usize) -> BitVec {
-		BitVec {
-			bits: vec![0; BitVec::bytes_len(bits_len)],
-		}
-	}
-
-	fn set_bit_at(&mut self, pos: usize) {
-		self.bits[pos / 8] |= 1 << (pos % 8) as u8;
+		writer.write_fixed_bytes(&self.pack_nonces())
 	}
 }