From 502e13846c8e1f9daf158b616d2d207b9860886a Mon Sep 17 00:00:00 2001 From: Steven Watanabe Date: Fri, 13 Mar 2020 11:21:00 -0400 Subject: [PATCH 1/4] Use more efficient representation of null bytes in strings. --- include/eosio/to_key.hpp | 65 ++++++++++++++++++++++++++++++++++++---- src/key_test.cpp | 28 +++++++++++++++-- 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/include/eosio/to_key.hpp b/include/eosio/to_key.hpp index f590c94..1bcc1db 100644 --- a/include/eosio/to_key.hpp +++ b/include/eosio/to_key.hpp @@ -71,6 +71,46 @@ result to_key_optional(const bool* obj, S& stream) { return stream.write('\2'); } +template +result to_key_byte(const T& obj) { + static_assert(has_bitwise_serialization() && sizeof(T) == 1, "Only works for single byte types"); + char buf[1]; + fixed_buf_stream tmp_stream(buf, 1); + OUTCOME_TRY(to_key(obj, tmp_stream)); + if (tmp_stream.pos == tmp_stream.end) + return buf[0]; + else + return stream_error::overrun; // Just to be safe. This should never happen. 
+} + +template +result to_key_byte_range(const T& obj, S& stream) { + static_assert(has_bitwise_serialization() && sizeof(typename T::value_type) == 1, + "Only works for containers of single byte types"); + constexpr std::ptrdiff_t max_run_length = 127; + char pending_run = 0; + for (typename T::value_type item : obj) { + OUTCOME_TRY(ch, to_key_byte(item)); + if (pending_run) { + if (pending_run == max_run_length || ch != '\0') { + OUTCOME_TRY(stream.write('\0')); + OUTCOME_TRY(stream.write(-pending_run)); + pending_run = 0; + } else { + ++pending_run; + continue; + } + } + if (ch == '\0') { + pending_run = 1; + } else { + OUTCOME_TRY(stream.write(ch)); + } + } + OUTCOME_TRY(stream.write('\0')); + return stream.write(pending_run); +} + template result to_key_optional(const T* obj, S& stream) { if constexpr (has_bitwise_serialization() && sizeof(T) == 1) { @@ -104,14 +144,17 @@ result to_key(const std::pair& obj, S& stream) { template result to_key_range(const T& obj, S& stream) { - for (const auto& elem : obj) { OUTCOME_TRY(to_key_optional(&elem, stream)); } - return to_key_optional((decltype(&*std::begin(obj))) nullptr, stream); + if constexpr (has_bitwise_serialization() && sizeof(typename T::value_type) == 1) { + return to_key_byte_range(obj, stream); + } else { + for (const typename T::value_type& elem : obj) { OUTCOME_TRY(to_key_optional(&elem, stream)); } + return to_key_optional((const typename T::value_type*)nullptr, stream); + } } template result to_key(const std::vector& obj, S& stream) { - for (const T& elem : obj) { OUTCOME_TRY(to_key_optional(&elem, stream)); } - return to_key_optional((const T*)nullptr, stream); + return to_key_range(obj, stream); } template @@ -225,10 +268,20 @@ result to_key(const std::variant& obj, S& stream) { template result to_key(std::string_view obj, S& stream) { - for (char ch : obj) { + constexpr std::ptrdiff_t max_run_length = 127; + for (auto iter = obj.begin(), end = obj.end(); iter != end; ++iter) { + char ch = *iter; 
OUTCOME_TRY(stream.write(ch)); if (ch == '\0') { - OUTCOME_TRY(stream.write('\1')); + auto run_end = + std::find_if(iter + 1, iter + std::min(end - iter, max_run_length), [](char ch) { return ch != '\0'; }); + unsigned char run_length = (run_end - iter); + if (run_end == end) { + return stream.write(run_length); + } else { + OUTCOME_TRY(stream.write(-run_length)); + } + iter = run_end - 1; // will be incremented immediately } } return stream.write("\0", 2); diff --git a/src/key_test.cpp b/src/key_test.cpp index 98e2376..2b13a67 100644 --- a/src/key_test.cpp +++ b/src/key_test.cpp @@ -50,6 +50,12 @@ void test_key(const T& x, const T& y) { CHECK(std::lexicographical_compare(keyy.begin(), keyy.end(), keyx.begin(), keyx.end(), std::less()) == (y < x)); } +#define CHECK_EQUAL_KEY(x, y) CHECK(eosio::convert_to_key((x)).value() == eosio::convert_to_key((y)).value()) + +std::vector s2v(std::string_view s) { + return std::vector(s.begin(), s.end()); +} + enum class enum_u8 : unsigned char { v0, v1, @@ -127,9 +133,14 @@ void test_compare() { test_key(""s, "a"s); test_key("a"s, "b"s); test_key("aaaaa"s, "aaaaa"s); - test_key("\0"s, "\xFF"s); - test_key("\0"s, ""s); - test_key("\0\0\0"s, "\0\0"s); + test_key("a"s, "\xFF"s); + for(int i = 0; i < 257; ++i) { + test_key(std::string(i, '\0'), std::string(i + 1, '\0')); + } + test_key(std::string(256, '\0'), std::string(256, '\0') + "a"s); + for(int i = 257; i > 0; --i) { + test_key(std::string(i, '\0') + "a", std::string(i - 1, '\0') + "a"); + } test_key(std::vector{}, std::vector{}); test_key(std::vector{}, std::vector{0}); @@ -149,6 +160,17 @@ void test_compare() { test_key(std::vector{'\0'}, std::vector{255}); test_key(std::vector{'\1'}, std::vector{255}); test_key(std::vector{'b'}, std::vector{'a'}); + CHECK_EQUAL_KEY(std::vector(), std::string()); + CHECK_EQUAL_KEY(std::vector(1, '\0'), std::string(1, '\0')); + CHECK_EQUAL_KEY(std::vector(2, '\0'), std::string(2, '\0')); + CHECK_EQUAL_KEY(std::vector(127, '\0'), 
std::string(127, '\0')); + CHECK_EQUAL_KEY(std::vector(128, '\0'), std::string(128, '\0')); + CHECK_EQUAL_KEY(std::vector(257, '\0'), std::string(257, '\0')); + CHECK_EQUAL_KEY(s2v(std::string(1, '\0') + "a"), std::string(1, '\0') + "a"); + CHECK_EQUAL_KEY(s2v(std::string(2, '\0') + "a"), std::string(2, '\0') + "a"); + CHECK_EQUAL_KEY(s2v(std::string(127, '\0') + "a"), std::string(127, '\0') + "a"); + CHECK_EQUAL_KEY(s2v(std::string(128, '\0') + "a"), std::string(128, '\0') + "a"); + CHECK_EQUAL_KEY(s2v(std::string(257, '\0') + "a"), std::string(257, '\0') + "a"); test_key(std::vector{}, std::vector{true}); test_key(std::vector{false}, std::vector{true}); From caf2228ddc816cda724710d2ef05316ae4767d43 Mon Sep 17 00:00:00 2001 From: Steven Watanabe Date: Fri, 13 Mar 2020 11:29:27 -0400 Subject: [PATCH 2/4] Pack sequences of bool more efficiently. --- include/eosio/to_key.hpp | 17 ++++++++++++++++- src/key_test.cpp | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/eosio/to_key.hpp b/include/eosio/to_key.hpp index 1bcc1db..b2eacc8 100644 --- a/include/eosio/to_key.hpp +++ b/include/eosio/to_key.hpp @@ -144,7 +144,22 @@ result to_key(const std::pair& obj, S& stream) { template result to_key_range(const T& obj, S& stream) { - if constexpr (has_bitwise_serialization() && sizeof(typename T::value_type) == 1) { + if constexpr (std::is_same_v) { + // pack 5 boolean values into each byte + int offset = 0; + unsigned char val = 0; + for (bool item : obj) { + val *= 3; + val += item + 1; + if (++offset == 5) { + OUTCOME_TRY(stream.write(val)); + val = 0; + offset = 0; + } + } + while (++offset < 5) val *= 3; + return stream.write(val); + } else if constexpr (has_bitwise_serialization() && sizeof(typename T::value_type) == 1) { return to_key_byte_range(obj, stream); } else { for (const typename T::value_type& elem : obj) { OUTCOME_TRY(to_key_optional(&elem, stream)); } diff --git a/src/key_test.cpp b/src/key_test.cpp index 2b13a67..01cf1d0 100644 --- 
a/src/key_test.cpp +++ b/src/key_test.cpp @@ -175,6 +175,7 @@ void test_compare() { test_key(std::vector{}, std::vector{true}); test_key(std::vector{false}, std::vector{true}); test_key(std::vector{false}, std::vector{false, true}); + test_key(std::vector{false, false, false, true, false}, std::vector{false, false, false, true, false, false}); test_key(std::list{}, std::list{1}); test_key(std::list{0}, std::list{1}); From 2ecd5923b34cb48a6d1147edb46c034e1225628b Mon Sep 17 00:00:00 2001 From: Steven Watanabe Date: Fri, 13 Mar 2020 11:51:11 -0400 Subject: [PATCH 3/4] Add description of string encoding. --- include/eosio/to_key.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/eosio/to_key.hpp b/include/eosio/to_key.hpp index b2eacc8..6381a0a 100644 --- a/include/eosio/to_key.hpp +++ b/include/eosio/to_key.hpp @@ -83,6 +83,15 @@ result to_key_byte(const T& obj) { return stream_error::overrun; // Just to be safe. This should never happen. } +// After encoding each element of the range individually, applies the following transform: +// - Runs of 1-127 0's before the end of the range become two bytes: {0, -count} +// - Runs of 0-127 0's at the end of the range become two bytes: {0, count} +// +// Notes: +// - The second rule above will be applied exactly once +// - The byte sequence {0, 0x80} is unused +// - Runs are found greedily from the beginning of the input +// - For an input sequence of length N, the maximum output size is 1.5N + 2 template result to_key_byte_range(const T& obj, S& stream) { static_assert(has_bitwise_serialization() && sizeof(typename T::value_type) == 1, From 50ffa60abbb190157372ab680576c942ce8119e9 Mon Sep 17 00:00:00 2001 From: Steven Watanabe Date: Mon, 16 Mar 2020 09:29:37 -0400 Subject: [PATCH 4/4] Redo bool range again. This time the packing is nearly optimal with 7 bits per byte and only one unused value. 
--- include/eosio/to_key.hpp | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/include/eosio/to_key.hpp b/include/eosio/to_key.hpp index 6381a0a..45200cc 100644 --- a/include/eosio/to_key.hpp +++ b/include/eosio/to_key.hpp @@ -154,19 +154,41 @@ result to_key(const std::pair& obj, S& stream) { template result to_key_range(const T& obj, S& stream) { if constexpr (std::is_same_v) { - // pack 5 boolean values into each byte - int offset = 0; + // pack 7 boolean values into each byte + // There are $\sum_{i=0}^7 2^i = 2^8-1 = 255$ sequences + // of bool of length 7 or less. Therefore it is possible + // to represent any such sequence in one byte. + // + // Every bit in a group of 7 is assigned a position, k, starting + // with the highest bit: 6543210. + // + // If the bit in position k is 0, add 1; if it is 1, add 2^{k+1}. + // + // Proof: + // Base case: The empty sequence is represented by 0. + // + // Given any sequence of bits of size 7 or less, the lexicographically next + // such sequence can be found as follows: + // + // - If the sequence has fewer than 7 bits, append a 0: xxx -> xxx0 + // Since we add 1 to the result for each 0, this increments the encoding by 1. + // + // - If the sequence has 7 bits, then remove all trailing 1's, then + // change the last 0 to a 1: xxx01... -> xxx1 + // The difference in the encoded value is: + // 2^{k+1} - (1 + 2^{k} + ... + 2) = 2^{k+1} - (2^{k+1} - 1) = 1 + // + // - If the sequence is 1111111, it is the maximum and has no next sequence. + int offset = 7; unsigned char val = 0; for (bool item : obj) { - val *= 3; - val += item + 1; - if (++offset == 5) { + val += 1 << (item * offset); + if (--offset == 0) { OUTCOME_TRY(stream.write(val)); val = 0; - offset = 0; + offset = 7; } } - while (++offset < 5) val *= 3; return stream.write(val); } else if constexpr (has_bitwise_serialization() && sizeof(typename T::value_type) == 1) { return to_key_byte_range(obj, stream);