From 3eb78b27505b602c1ccfa952c4cc00f942ccb2b9 Mon Sep 17 00:00:00 2001 From: Spartan322 Date: Tue, 2 Jul 2024 03:51:59 -0400 Subject: Fix string interning pointer invalidity for AST Fix errorneous string intern compare Add symbol interning tests Optimize non-conversion identifier cases Add `File::size()` Add InstanceOf concept Remove `FlatValue::value(const symbol_interner_type&)` Add `AbstractSyntaxTree::intern/intern_cst(lexy::lexeme)` overload Add `DiagnosticLogger::intern/intern_cstr(lexy::lexeme)` overload Use pinned_vector to maintain string interning pointer validity for buffers Add vmcontainer submodule for pinned_vector pinned_vector reserves virtual memory at runtime using OS APIs to maintain pointer validity Remove Exception and RTTI requirement from range-v3 submodule --- include/openvic-dataloader/detail/SymbolIntern.hpp | 256 ++++++++++++++++++++- 1 file changed, 254 insertions(+), 2 deletions(-) (limited to 'include/openvic-dataloader/detail/SymbolIntern.hpp') diff --git a/include/openvic-dataloader/detail/SymbolIntern.hpp b/include/openvic-dataloader/detail/SymbolIntern.hpp index 8755887..d72a0ba 100644 --- a/include/openvic-dataloader/detail/SymbolIntern.hpp +++ b/include/openvic-dataloader/detail/SymbolIntern.hpp @@ -1,14 +1,266 @@ #pragma once #include +#include +#include + +#include #include namespace ovdl { + // Contains all unique symbols, null-terminated, in memory one after the other. + template + struct symbol_buffer { + static constexpr auto min_buffer_size = 16 * 1024; + + constexpr symbol_buffer() : _data_buffer(ovdl::detail::max_elements(min_buffer_size + 1)) {} + explicit symbol_buffer(std::size_t max_elements) + : _data_buffer(ovdl::detail::max_elements(std::max(max_elements, min_buffer_size + 1))) { + _data_buffer.reserve(min_buffer_size); + } + + void free() { + } + + bool reserve(std::size_t new_capacity) { + if (new_capacity <= _data_buffer.capacity()) + return true; + + if (new_capacity >= _data_buffer.max_size()) { + _data_buffer.reserve(_data_buffer.max_size()); + return false; + } + + _data_buffer.reserve(new_capacity * sizeof(CharT)); + + return true; + } + + bool reserve_new_string(std::size_t new_string_length) { + // +1 for null-terminator. + auto new_size = _data_buffer.size() + new_string_length + 1; + if (new_size <= _data_buffer.capacity()) + return true; + + auto new_capacity = new_size * 2; + if (new_capacity < min_buffer_size) + new_capacity = min_buffer_size; + + if (!reserve(new_capacity)) { + return _data_buffer.capacity() >= new_size; + } + + return true; + } + + const CharT* insert(const CharT* str, std::size_t length) { + DRYAD_PRECONDITION(_data_buffer.capacity() - _data_buffer.size() >= length + 1); + + auto index = _data_buffer.cend(); + + _data_buffer.insert(_data_buffer.cend(), str, str + (length * sizeof(CharT))); + _data_buffer.push_back(CharT(0)); + + return index; + } + + const CharT* c_str(std::size_t index) const { + DRYAD_PRECONDITION(index < _data_buffer.size()); + return _data_buffer.data() + index; + } + + std::size_t size() const { + return _data_buffer.size(); + } + + std::size_t capacity() const { + return _data_buffer.capacity(); + } + + std::size_t max_size() const { + return _data_buffer.max_size(); + } + + private: + detail::pinned_vector _data_buffer; + }; + + template + struct symbol_index_hash_traits { + const symbol_buffer* buffer; + + using value_type = IndexType; + + struct string_view { + const CharT* ptr; + std::size_t length; + }; + + static constexpr bool is_unoccupied(IndexType index) { + return index == IndexType(-1); + } + static void fill_unoccupied(IndexType* data, std::size_t size) { + // It has all bits set to 1, so we can do it per-byte. + std::memset(data, static_cast(-1), size * sizeof(IndexType)); + } + + static constexpr bool is_equal(IndexType entry, IndexType value) { + return entry == value; + } + bool is_equal(IndexType entry, string_view str) const { + auto existing_str = buffer->c_str(entry); + return std::strncmp(existing_str, str.ptr, str.length) == 0 && *(existing_str + str.length) == CharT(0); + } + + std::size_t hash(IndexType entry) const { + auto str = buffer->c_str(entry); + return dryad::default_hash_algorithm().hash_c_str(str).finish(); + } + static constexpr std::size_t hash(string_view str) { + return dryad::default_hash_algorithm() + .hash_bytes(reinterpret_cast(str.ptr), str.length * sizeof(CharT)) + .finish(); + } + }; + + template + class symbol; + + template + class symbol_interner { + static_assert(std::is_trivial_v); + static_assert(std::is_unsigned_v); + + using resource_ptr = dryad::_detail::memory_resource_ptr; + using traits = symbol_index_hash_traits; + + public: + using symbol = ovdl::symbol; + + //=== construction ===// + constexpr symbol_interner() : _resource(dryad::_detail::get_memory_resource()) {} + constexpr explicit symbol_interner(std::size_t max_elements) + : _buffer(max_elements), + _resource(dryad::_detail::get_memory_resource()) {} + constexpr explicit symbol_interner(std::size_t max_elements, MemoryResource* resource) + : _buffer(max_elements), + _resource(resource) {} + + ~symbol_interner() noexcept { + _buffer.free(); + _map.free(_resource); + } + + symbol_interner(symbol_interner&& other) noexcept + : _buffer(other._buffer), _map(other._map), _resource(other._resource) { + other._buffer = {}; + other._map = {}; + } + + symbol_interner& operator=(symbol_interner&& other) noexcept { + dryad::_detail::swap(_buffer, other._buffer); + dryad::_detail::swap(_map, other._map); + dryad::_detail::swap(_resource, other._resource); + return *this; + } + + //=== interning ===// + bool reserve(std::size_t number_of_symbols, std::size_t average_symbol_length) { + auto success = _buffer.reserve(number_of_symbols * average_symbol_length); + _map.rehash(_resource, _map.to_table_capacity(number_of_symbols), traits { &_buffer }); + return success; + } + + symbol intern(const CharT* str, std::size_t length) { + if (_map.should_rehash()) + _map.rehash(_resource, traits { &_buffer }); + + auto entry = _map.lookup_entry(typename traits::string_view { str, length }, traits { &_buffer }); + if (entry) + // Already interned, return index. + return symbol(_buffer.c_str(entry.get())); + + // Copy string data to buffer, as we don't have it yet. + if (!_buffer.reserve_new_string(length)) // Ran out of virtual memory space + return symbol(); + + auto begin = _buffer.insert(str, length); + auto idx = std::distance(_buffer.c_str(0), begin); + DRYAD_PRECONDITION(idx == IndexType(idx)); // Overflow of index type. + + // Store index in map. + entry.create(IndexType(idx)); + + // Return new symbol. + return symbol(begin); + } + template + symbol intern(const CharT (&literal)[N]) { + DRYAD_PRECONDITION(literal[N - 1] == CharT(0)); + return intern(literal, N - 1); + } + + private: + symbol_buffer _buffer; + dryad::_detail::hash_table _map; + DRYAD_EMPTY_MEMBER resource_ptr _resource; + + friend symbol; + }; + + template + struct symbol { + using char_type = CharT; + + constexpr symbol() = default; + constexpr explicit symbol(const CharT* begin) : _begin(begin) {} + + constexpr explicit operator bool() const { + return _begin != nullptr; + } + + constexpr const CharT* c_str() const { + return _begin; + } + + constexpr const std::basic_string_view view() const { + return _begin; + } + + //=== comparison ===// + friend constexpr bool operator==(symbol lhs, symbol rhs) { + return lhs._begin == rhs._begin; + } + friend constexpr bool operator!=(symbol lhs, symbol rhs) { + return lhs._begin != rhs._begin; + } + + friend constexpr bool operator<(symbol lhs, symbol rhs) { + return lhs._begin < rhs._begin; + } + friend constexpr bool operator<=(symbol lhs, symbol rhs) { + return lhs._begin <= rhs._begin; + } + friend constexpr bool operator>(symbol lhs, symbol rhs) { + return lhs._begin > rhs._begin; + } + friend constexpr bool operator>=(symbol lhs, symbol rhs) { + return lhs._begin >= rhs._begin; + } + + private: + const CharT* _begin = nullptr; + + template + friend class symbol_interner; + }; + struct SymbolIntern { struct SymbolId; using index_type = std::uint32_t; - using symbol_type = dryad::symbol; - using symbol_interner_type = dryad::symbol_interner; + using symbol_type = symbol; + using symbol_interner_type = symbol_interner; }; } \ No newline at end of file -- cgit v1.2.3-56-ga3b1