From 853d391ce183996a63f53ae91a1e17d4d729e180 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 00:13:19 -0700 Subject: [PATCH 01/19] Fix clang-tidy warnings --- .clang-tidy | 2 + Makefile | 16 ++-- lug/error.hpp | 2 +- lug/unicode.hpp | 192 +++++++++++++++++++++--------------------- lug/utf8.hpp | 67 +++++++++------ tools/makeunicode.cpp | 168 ++++++++++++++++++------------------ 6 files changed, 235 insertions(+), 212 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 8f8f033..a66ba9b 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -8,6 +8,7 @@ Checks: - -cert-dcl21-cpp - concurrency-* - cppcoreguidelines-* + - -cppcoreguidelines-avoid-magic-numbers - darwin-* - hicpp-* - -hicpp-braces-around-statements @@ -21,6 +22,7 @@ Checks: - readability-* - -readability-braces-around-statements - -readability-identifier-length + - -readability-magic-numbers WarningsAsErrors: '' HeaderFileExtensions: - '' diff --git a/Makefile b/Makefile index 8f99593..7aea5bf 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ PREFIX = /usr/local # toolchain CXXSTD = -std=c++17 CXXFLAGS = $(CXXSTD) -pedantic -Wall -Wconversion -Wextra -Wextra-semi -Wshadow -Wsign-conversion -Wsuggest-override -Wno-parentheses -Wno-logical-not-parentheses \ - -Os -ffunction-sections -fdata-sections -I. $$(if [ "$(CI_BUILD)" = "1" ]; then echo "-Werror"; fi) + -Os -ffunction-sections -fdata-sections -I. 
LDFLAGS = $(CXXSTD) -s CLANGTIDY = clang-tidy @@ -32,8 +32,8 @@ TOOLS = makeunicode TOOLS_BIN = $(TOOLS:%=tools/%) TOOLS_OBJ = $(TOOLS:%=tools/%.o) -# dependencies -DEPS = lug/lug.hpp lug/detail.hpp lug/error.hpp lug/unicode.hpp lug/utf8.hpp +# header dependencies +HEADERS = lug/detail.hpp lug/error.hpp lug/unicode.hpp lug/utf8.hpp lug/lug.hpp # distribution files DISTFILES = CHANGELOG.md LICENSE.md README.md CMakeLists.txt Makefile runtests.sh .clang-tidy .editorconfig .gitattributes .gitignore .github/ doc/ lug/ samples/ tests/ tools/ @@ -42,9 +42,9 @@ all: options samples tests .cpp.o: @echo CXX $< - @$(CXX) -c $(CXXFLAGS) -o $@ $< + @$(CXX) -c $(CXXFLAGS) $$(if [ "$(CI_BUILD)" = "1" ]; then echo "-Werror"; fi) -o $@ $< -$(SAMPLES_OBJ): $(DEPS) +$(SAMPLES_OBJ): $(HEADERS) $(SAMPLES_BIN): $(SAMPLES_OBJ) @echo LD $@ @@ -52,7 +52,7 @@ $(SAMPLES_BIN): $(SAMPLES_OBJ) samples: $(SAMPLES_BIN) -$(TESTS_OBJ): $(DEPS) +$(TESTS_OBJ): $(HEADERS) $(TESTS_BIN): $(TESTS_OBJ) @echo LD $@ @@ -64,9 +64,9 @@ check: tests @sh runtests.sh "tests" $(TESTS_BIN) lint: - @$(CLANGTIDY) --quiet $(CXXFLAGS:%=--extra-arg=%) lug/detail.hpp + @$(CLANGTIDY) --quiet $(CXXFLAGS:%=--extra-arg=%) $(HEADERS) -$(TOOLS_OBJ): $(DEPS) +$(TOOLS_OBJ): $(HEADERS) $(TOOLS_BIN): $(TOOLS_OBJ) @echo LD $@ diff --git a/lug/error.hpp b/lug/error.hpp index dfc81dc..7138a92 100644 --- a/lug/error.hpp +++ b/lug/error.hpp @@ -18,7 +18,7 @@ class reenterant_read_error : public lug_error { public: reenterant_read_error() class parse_context_error : public lug_error { public: parse_context_error() : lug_error{"operation valid only inside calling context of parser::parse" } {} }; class accept_context_error : public lug_error{ public: accept_context_error() : lug_error{"operation valid only inside calling context of parser::accept"} {} }; class attribute_stack_error : public lug_error{ public: attribute_stack_error() : lug_error{"incompatible or invalid stack frame"} {} }; -class bad_string_expression : public lug_error 
{ public: bad_string_expression(const std::string& s = "invalid string or bracket expression") : lug_error{s} {} }; +class bad_string_expression : public lug_error { public: explicit bad_string_expression(std::string const& s = "invalid string or bracket expression") : lug_error{s} {} }; class bad_character_class : public bad_string_expression { public: bad_character_class() : bad_string_expression{"invalid character class"} {} }; class bad_character_range : public bad_string_expression { public: bad_character_range() : bad_string_expression{"character range is reversed"} {} }; class bad_grammar : public lug_error { public: bad_grammar() : lug_error{"invalid or empty grammar"} {} }; diff --git a/lug/unicode.hpp b/lug/unicode.hpp index 97a291f..1ea8245 100644 --- a/lug/unicode.hpp +++ b/lug/unicode.hpp @@ -21,6 +21,8 @@ #include #include +// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + namespace lug::unicode { // POSIX compatibility properties @@ -704,7 +706,7 @@ enum class eawtype : std::uint_least8_t }; // Property Traits -enum class property_enum +enum class property_enum : std::uint_least8_t { invalid, ctype, @@ -747,34 +749,34 @@ class record std::array stage2; std::array records; }; - static std::int_least32_t case_mapping(std::size_t index) noexcept; - static std::unique_ptr decompress_table(); + [[nodiscard]] static std::int_least32_t case_mapping(std::size_t index) noexcept; + [[nodiscard]] static std::unique_ptr decompress_table(); friend record query(char32_t r); public: - ctype compatibility() const noexcept { return static_cast(raw_->cflags); } - ptype properties() const noexcept { return static_cast(raw_->pflags); } - gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } - sctype script() const noexcept { return static_cast(raw_->scindex); } - blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - agetype age() const noexcept { return 
static_cast(raw_->abfields >> 10); } - eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } - std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } - std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } - std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } - bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } - bool all_of(ptype p) const noexcept { return (properties() & p) == p; } - bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } - bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } - bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } - bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } - bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } - bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } - bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } + [[nodiscard]] ctype compatibility() const noexcept { return static_cast(raw_->cflags); } + [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } + [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } + [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } 
+ [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } + [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } + [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } + [[nodiscard]] bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } + [[nodiscard]] bool all_of(ptype p) const noexcept { return (properties() & p) == p; } + [[nodiscard]] bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } + [[nodiscard]] bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } + [[nodiscard]] bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } + [[nodiscard]] bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } + [[nodiscard]] bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } + [[nodiscard]] bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } + [[nodiscard]] bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } }; // Retrieves the UCD record for the given codepoint -inline record query(char32_t r) +[[nodiscard]] inline record query(char32_t r) { static auto const table = record::decompress_table(); std::size_t index = 1901; @@ -786,83 +788,80 @@ inline record query(char32_t r) } // Checks if the rune matches all of the string-packed property classes -inline bool all_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool all_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = 
rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches any of the string-packed property classes -inline bool any_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool any_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case 
property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches none of the string-packed property classes -inline bool none_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool none_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() != lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() != lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() != lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() != lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.none_of(lug::detail::string_unpack(str)); + case 
property_enum::gctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() != lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() != lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() != lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() != lug::detail::string_unpack(str); } - return result; + return false; } // Column width (-1 = non-displayable, 0 = non-spacing, 1 = normal, 2 = wide) -inline int cwidth(char32_t r) +[[nodiscard]] inline int cwidth(char32_t r) { return query(r).cwidth(); } // Absolute column width -inline unsigned int ucwidth(char32_t r) +[[nodiscard]] inline unsigned int ucwidth(char32_t r) { auto const cw = query(r).cwidth(); return static_cast(cw >= 0 ? cw : -cw); } // Simple casefold conversion -inline char32_t tocasefold(char32_t r) +[[nodiscard]] inline char32_t tocasefold(char32_t r) { return static_cast(static_cast(r) + query(r).casefold_mapping()); } // Simple lowercase conversion -inline char32_t tolower(char32_t r) +[[nodiscard]] inline char32_t tolower(char32_t r) { return static_cast(static_cast(r) + query(r).lowercase_mapping()); } // Simple uppercase conversion -inline char32_t toupper(char32_t r) +[[nodiscard]] inline char32_t toupper(char32_t r) { return static_cast(static_cast(r) + query(r).uppercase_mapping()); } @@ -893,9 +892,10 @@ inline void push_uniform_casefolded_range(rune_set& runes, ptype props, char32_t inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) { ptype p = query(start).properties(); - char32_t r1 = start, r2 = start; + char32_t r1 = start; + char32_t r2 = start; for (char32_t rn = start + 1; rn <= end; r2 = rn, ++rn) { - ptype q = query(start).properties(); + ptype const q = query(start).properties(); if (((p ^ q) & ptype::Cased) != ptype::None) { detail::push_uniform_casefolded_range(runes, p, r1, r2); r1 = rn; @@ -905,27 +905,27 @@ 
inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) detail::push_uniform_casefolded_range(runes, p, r1, r2); } -inline rune_set sort_and_optimize(rune_set runes) +[[nodiscard]] inline rune_set sort_and_optimize(rune_set runes) { rune_set optimized_runes; auto out = optimized_runes.end(); std::sort_heap(std::begin(runes), std::end(runes)); - for (auto curr = std::cbegin(runes), last = std::cend(runes); curr != last; ++curr) { - if (out == optimized_runes.end() || curr->first < out->first || out->second < curr->first) - out = optimized_runes.insert(optimized_runes.end(), *curr); + for (auto const& r : runes) { + if (out == optimized_runes.end() || r.first < out->first || out->second < r.first) + out = optimized_runes.insert(optimized_runes.end(), r); else - out->second = out->second < curr->second ? curr->second : out->second; + out->second = out->second < r.second ? r.second : out->second; } optimized_runes.shrink_to_fit(); return optimized_runes; } -inline rune_set negate(rune_set const& runes) +[[nodiscard]] inline rune_set negate(rune_set const& runes) { rune_set negated_runes; if (!runes.empty()) { - if (char32_t front = runes.front().first; U'\0' < front) - negated_runes.push_back({U'\0', front - 1}); + if (char32_t const front = runes.front().first; U'\0' < front) + negated_runes.emplace_back(U'\0', front - 1); if (runes.size() > 1) { auto const last = std::cend(runes); auto left = std::cbegin(runes); @@ -933,12 +933,12 @@ inline rune_set negate(rune_set const& runes) auto right = std::next(left); if (right == last) break; - negated_runes.push_back({left->second + 1, right->first - 1}); + negated_runes.emplace_back(left->second + 1, right->first - 1); left = right; } } - if (char32_t back = runes.back().second; back < U'\xFFFFFFFF') - negated_runes.push_back({back + 1, U'\xFFFFFFFF'}); + if (char32_t const back = runes.back().second; back < U'\xFFFFFFFF') + negated_runes.emplace_back(back + 1, U'\xFFFFFFFF'); 
negated_runes.shrink_to_fit(); } return negated_runes; @@ -946,10 +946,10 @@ inline rune_set negate(rune_set const& runes) namespace detail { -inline std::string normalize_property_label(std::string_view id) +[[nodiscard]] inline std::string normalize_property_label(std::string_view id) { std::string normid; - for (char c : id) + for (char const c : id) if (c != ' ' && c != '\t' && c != '_' && c != '-' && c != '.' && c != ';') normid.push_back(static_cast(std::tolower(c))); return normid; @@ -971,8 +971,8 @@ inline std::optional stoctype(std::string_view s) { "xdigit"sv, ct::xdigit } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1014,8 +1014,8 @@ inline std::optional stoptype(std::string_view s) { "xidcontinue"sv, pt::XID_Continue }, { "xidstart"sv, pt::XID_Start } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? 
std::optional{static_cast(c->second)} : std::nullopt; } @@ -1044,8 +1044,8 @@ inline std::optional stogctype(std::string_view s) { "zl"sv, gc::Zl }, { "zp"sv, gc::Zp }, { "zs"sv, gc::Zs } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1111,8 +1111,8 @@ inline std::optional stosctype(std::string_view s) { "zanabazarsquare"sv, sc::Zanabazar_Square } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1267,8 +1267,8 @@ inline std::optional stoblktype(std::string_view s) { "zanabazarsquare"sv, blk::Zanabazar_Square }, { "znamennymusicalnotation"sv, blk::Znamenny_Musical_Notation } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? 
std::optional{static_cast(c->second)} : std::nullopt; } @@ -1288,8 +1288,8 @@ inline std::optional stoagetype(std::string_view s) { "90"sv, at::v9_0 }, { "unassigned"sv, at::Unassigned } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1304,8 +1304,8 @@ inline std::optional stoeawtype(std::string_view s) { "a"sv, eaw::A }, { "f"sv, eaw::F }, { "h"sv, eaw::H }, { "n"sv, eaw::N }, { "na"sv, eaw::Na }, { "w"sv, eaw::W } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? 
std::optional{static_cast(c->second)} : std::nullopt; } @@ -1316,7 +1316,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ull << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1340,7 +1340,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace detail -inline std::int_least32_t record::case_mapping(std::size_t index) noexcept +[[nodiscard]] inline std::int_least32_t record::case_mapping(std::size_t index) noexcept { static constexpr std::array casemappings = { @@ -1358,7 +1358,7 @@ inline std::int_least32_t record::case_mapping(std::size_t index) noexcept return casemappings[index]; } -inline std::unique_ptr record::decompress_table() +[[nodiscard]] inline std::unique_ptr record::decompress_table() { using detail::run_length_decode; using lug::detail::make_member_accessor; @@ -2096,7 +2096,7 @@ inline std::unique_ptr record::decompress_table() 512, 896, 640, 2432, 3080, 3072, 7280, 7269, 7237, 7176, 7267, 7235, 7233, 7168, 0, 128 }; - std::array flyweights; + std::array flyweights{}; auto table = std::make_unique(); auto& records = table->records; @@ -2131,4 +2131,6 @@ inline std::unique_ptr record::decompress_table() } // namespace lug::unicode +// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + #endif diff --git a/lug/utf8.hpp b/lug/utf8.hpp index 1617d43..b516c31 100644 --- a/lug/utf8.hpp +++ b/lug/utf8.hpp @@ -16,8 +16,7 @@ namespace lug::utf8 { namespace detail { -inline constexpr unsigned int decode_accept = 0; -inline constexpr unsigned int decode_reject = 12; +enum class decode_state : unsigned char { accept = 0, 
reject = 12 }; inline constexpr std::array dfa_class_table { @@ -50,30 +49,48 @@ inline constexpr std::array dfa_transition_table 12,36,12,12,12,12,12,12,12,12,12,12 }; -} // namespace detail +inline constexpr std::array utf8_replacement_sequence +{ + static_cast(0xefU), + static_cast(0xbfU), + static_cast(0xbdU) +}; -[[nodiscard]] constexpr bool is_lead(char octet) noexcept +inline constexpr char32_t utf32_replacement = U'\U0000fffd'; + +[[nodiscard]] constexpr decode_state decode_rune_octet(char32_t& rune, char octet, decode_state state) noexcept { - return (static_cast(octet) & 0xc0) != 0x80; + auto const symbol = static_cast(static_cast(octet)); + auto const dfa_class = static_cast(dfa_class_table[symbol]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index) + rune = (state == decode_state::accept) ? (symbol & (0xffU >> dfa_class)) : ((symbol & 0x3fU) | (rune << 6U)); + return static_cast(dfa_transition_table[static_cast(state) + dfa_class]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index) } -[[nodiscard]] constexpr unsigned int decode_rune_octet(char32_t& rune, char octet, unsigned int state) +[[nodiscard]] constexpr unsigned int non_ascii_rune_length(char32_t rune) noexcept +{ + if (rune >= 0x00010000U) + return 4; + if (rune >= 0x00000800U) + return 3; + return 2; +} + +} // namespace detail + +[[nodiscard]] constexpr bool is_lead(char octet) noexcept { - unsigned int const symbol = static_cast(static_cast(octet)); - unsigned int const dfa_class = static_cast(detail::dfa_class_table[symbol]); - rune = state == detail::decode_accept ? 
(symbol & (0xffU >> dfa_class)) : ((symbol & 0x3fU) | (rune << 6)); - return detail::dfa_transition_table[state + dfa_class]; + return (static_cast(octet) & 0xc0U) != 0x80U; } template > [[nodiscard]] constexpr std::pair decode_rune(InputIt first, InputIt last) { char32_t rune = U'\0'; - unsigned int state = detail::decode_accept; - while (first != last && state != detail::decode_reject) - if (state = lug::utf8::decode_rune_octet(rune, *first++, state); state == detail::decode_accept) + detail::decode_state state = detail::decode_state::accept; + while ((first != last) && (state != detail::decode_state::reject)) + if (state = utf8::detail::decode_rune_octet(rune, *first++, state); state == detail::decode_state::accept) return std::make_pair(first, rune); - return std::make_pair(std::find_if(first, last, lug::utf8::is_lead), U'\U0000fffd'); + return std::make_pair(std::find_if(first, last, lug::utf8::is_lead), detail::utf32_replacement); } template > @@ -97,11 +114,11 @@ inline std::pair encode_rune(OutputIt dst, char32_t rune) if (rune < 0x80) { *dst++ = static_cast(rune); } else { - if (0x00110000U <= rune || (rune & 0xfffff800U) == 0x0000d800U) - return {std::copy_n(reinterpret_cast(u8"\U0000fffd"), 3, dst), false}; - unsigned int const n = rune >= 0x00010000U ? 4 : rune >= 0x00000800U ? 
3 : 2; - for (unsigned int i = 0, c = (0xf0 << (4 - n)) & 0xf0; i < n; ++i, c = 0x80) - *dst++ = static_cast(((rune >> (6 * (n - i - 1))) & 0x3f) | c); + if ((0x00110000U <= rune) || ((rune & 0xfffff800U) == 0x0000d800U)) + return {std::copy(detail::utf8_replacement_sequence.begin(), detail::utf8_replacement_sequence.end(), dst), false}; + unsigned int const n = detail::non_ascii_rune_length(rune); + for (unsigned int i = 0, c = ((0xf0U << (4 - n)) & 0xf0U); i < n; ++i, c = 0x80U) + *dst++ = static_cast(((rune >> (6 * (n - i - 1))) & 0x3fU) | c); } return {dst, true}; } @@ -116,7 +133,7 @@ inline std::pair encode_rune(OutputIt dst, char32_t rune) inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const { while (first != last) { auto [next, rune] = lug::utf8::decode_rune(first, last); @@ -126,7 +143,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); @@ -139,7 +156,7 @@ tocasefold{}; inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const { while (first != last) { auto [next, rune] = lug::utf8::decode_rune(first, last); @@ -149,7 +166,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); @@ -162,7 +179,7 @@ tolower{}; inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const { while (first != last) { auto [next, rune] = 
lug::utf8::decode_rune(first, last); @@ -172,7 +189,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); diff --git a/tools/makeunicode.cpp b/tools/makeunicode.cpp index 9337a26..648c63f 100644 --- a/tools/makeunicode.cpp +++ b/tools/makeunicode.cpp @@ -1073,8 +1073,8 @@ class enum_parser_printer return out << "\t\t" << line << "\n" << "\t} };\n\n" - << "\tauto l = detail::normalize_property_label(s);\n" - << "\tauto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; });\n" + << "\tauto const l = detail::normalize_property_label(s);\n" + << "\tauto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; });\n" << "\treturn c != labels.end() && c->first == l ? std::optional<" << p.name_ << ">{static_cast<" << p.name_ << ">(c->second)} : std::nullopt;\n" << "}\n"; } @@ -1149,6 +1149,8 @@ R"c++(// lug - Embedded DSL for PE grammar parser combinators in C++ #include #include +// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + namespace lug::unicode { )c++" << "\n" @@ -1220,7 +1222,7 @@ namespace lug::unicode { }) << R"c++( // Property Traits -enum class property_enum +enum class property_enum : std::uint_least8_t { invalid, ctype, @@ -1263,34 +1265,34 @@ class record << "\t\tstd::array<" << recordstagetable.typeinfo2.name << ", " << std::dec << recordstagetable.stage2.size() << "> stage2;\n" << "\t\tstd::array records;" << R"c++( }; - static std::int_least32_t case_mapping(std::size_t index) noexcept; - static std::unique_ptr decompress_table(); + [[nodiscard]] static std::int_least32_t case_mapping(std::size_t index) noexcept; + [[nodiscard]] static std::unique_ptr decompress_table(); friend record query(char32_t r); public: - ctype 
compatibility() const noexcept { return static_cast(raw_->cflags); } - ptype properties() const noexcept { return static_cast(raw_->pflags); } - gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } - sctype script() const noexcept { return static_cast(raw_->scindex); } - blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } - std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } - std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } - std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } - bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } - bool all_of(ptype p) const noexcept { return (properties() & p) == p; } - bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } - bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } - bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } - bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } - bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } - bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } - bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } + [[nodiscard]] ctype compatibility() const noexcept { return static_cast(raw_->cflags); } + [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } + [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } + [[nodiscard]] sctype 
script() const noexcept { return static_cast(raw_->scindex); } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } + [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } + [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } + [[nodiscard]] bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } + [[nodiscard]] bool all_of(ptype p) const noexcept { return (properties() & p) == p; } + [[nodiscard]] bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } + [[nodiscard]] bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } + [[nodiscard]] bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } + [[nodiscard]] bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } + [[nodiscard]] bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } + [[nodiscard]] bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } + [[nodiscard]] bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } }; // Retrieves the UCD record for the given codepoint -inline record query(char32_t r) +[[nodiscard]] inline record query(char32_t r) { static auto const table = record::decompress_table(); std::size_t index = )c++" << std::dec << invalidrecordindex << R"c++(; @@ -1302,83 +1304,80 @@ inline record query(char32_t r) } // Checks if the 
rune matches all of the string-packed property classes -inline bool all_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool all_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches any of the string-packed property classes -inline bool any_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool any_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - 
case property_enum::ptype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches none of the string-packed property classes -inline bool none_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool none_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() != lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() != 
lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() != lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() != lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() != lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() != lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() != lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() != lug::detail::string_unpack(str); } - return result; + return false; } // Column width (-1 = non-displayable, 0 = non-spacing, 1 = normal, 2 = wide) -inline int cwidth(char32_t r) +[[nodiscard]] inline int cwidth(char32_t r) { return query(r).cwidth(); } // Absolute column width -inline unsigned int ucwidth(char32_t r) +[[nodiscard]] inline unsigned int ucwidth(char32_t r) { auto const cw = query(r).cwidth(); return static_cast(cw >= 0 ? 
cw : -cw); } // Simple casefold conversion -inline char32_t tocasefold(char32_t r) +[[nodiscard]] inline char32_t tocasefold(char32_t r) { return static_cast(static_cast(r) + query(r).casefold_mapping()); } // Simple lowercase conversion -inline char32_t tolower(char32_t r) +[[nodiscard]] inline char32_t tolower(char32_t r) { return static_cast(static_cast(r) + query(r).lowercase_mapping()); } // Simple uppercase conversion -inline char32_t toupper(char32_t r) +[[nodiscard]] inline char32_t toupper(char32_t r) { return static_cast(static_cast(r) + query(r).uppercase_mapping()); } @@ -1409,9 +1408,10 @@ inline void push_uniform_casefolded_range(rune_set& runes, ptype props, char32_t inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) { ptype p = query(start).properties(); - char32_t r1 = start, r2 = start; + char32_t r1 = start; + char32_t r2 = start; for (char32_t rn = start + 1; rn <= end; r2 = rn, ++rn) { - ptype q = query(start).properties(); + ptype const q = query(start).properties(); if (((p ^ q) & ptype::Cased) != ptype::None) { detail::push_uniform_casefolded_range(runes, p, r1, r2); r1 = rn; @@ -1421,27 +1421,27 @@ inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) detail::push_uniform_casefolded_range(runes, p, r1, r2); } -inline rune_set sort_and_optimize(rune_set runes) +[[nodiscard]] inline rune_set sort_and_optimize(rune_set runes) { rune_set optimized_runes; auto out = optimized_runes.end(); std::sort_heap(std::begin(runes), std::end(runes)); - for (auto curr = std::cbegin(runes), last = std::cend(runes); curr != last; ++curr) { - if (out == optimized_runes.end() || curr->first < out->first || out->second < curr->first) - out = optimized_runes.insert(optimized_runes.end(), *curr); + for (auto const& r : runes) { + if (out == optimized_runes.end() || r.first < out->first || out->second < r.first) + out = optimized_runes.insert(optimized_runes.end(), r); else - out->second = out->second < 
curr->second ? curr->second : out->second; + out->second = out->second < r.second ? r.second : out->second; } optimized_runes.shrink_to_fit(); return optimized_runes; } -inline rune_set negate(rune_set const& runes) +[[nodiscard]] inline rune_set negate(rune_set const& runes) { rune_set negated_runes; if (!runes.empty()) { - if (char32_t front = runes.front().first; U'\0' < front) - negated_runes.push_back({U'\0', front - 1}); + if (char32_t const front = runes.front().first; U'\0' < front) + negated_runes.emplace_back(U'\0', front - 1); if (runes.size() > 1) { auto const last = std::cend(runes); auto left = std::cbegin(runes); @@ -1449,12 +1449,12 @@ inline rune_set negate(rune_set const& runes) auto right = std::next(left); if (right == last) break; - negated_runes.push_back({left->second + 1, right->first - 1}); + negated_runes.emplace_back(left->second + 1, right->first - 1); left = right; } } - if (char32_t back = runes.back().second; back < U'\xFFFFFFFF') - negated_runes.push_back({back + 1, U'\xFFFFFFFF'}); + if (char32_t const back = runes.back().second; back < U'\xFFFFFFFF') + negated_runes.emplace_back(back + 1, U'\xFFFFFFFF'); negated_runes.shrink_to_fit(); } return negated_runes; @@ -1462,10 +1462,10 @@ inline rune_set negate(rune_set const& runes) namespace detail { -inline std::string normalize_property_label(std::string_view id) +[[nodiscard]] inline std::string normalize_property_label(std::string_view id) { std::string normid; - for (char c : id) + for (char const c : id) if (c != ' ' && c != '\t' && c != '_' && c != '-' && c != '.' 
&& c != ';') normid.push_back(static_cast(std::tolower(c))); return normid; @@ -1538,7 +1538,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ull << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1562,7 +1562,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace detail -inline std::int_least32_t record::case_mapping(std::size_t index) noexcept +[[nodiscard]] inline std::int_least32_t record::case_mapping(std::size_t index) noexcept { )c++" << function_table_printer("casemappings", "std::int_least32_t", compressedrecords.cmapping_values) @@ -1570,7 +1570,7 @@ inline std::int_least32_t record::case_mapping(std::size_t index) noexcept return casemappings[index]; } -inline std::unique_ptr record::decompress_table() +[[nodiscard]] inline std::unique_ptr record::decompress_table() { using detail::run_length_decode; using lug::detail::make_member_accessor; @@ -1581,7 +1581,7 @@ inline std::unique_ptr record::decompress_table() << rle_stage_table_printer("rlestage2", recordstagetable.stage2, recordstagetable.typeinfo2) << "\n" << record_flyweight_printer(compressedrecords) -<< "\n\tstd::array flyweights;" +<< "\n\tstd::array flyweights{};" << R"c++( auto table = std::make_unique(); auto& records = table->records; @@ -1617,6 +1617,8 @@ inline std::unique_ptr record::decompress_table() } // namespace lug::unicode +// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + #endif )c++"; } From 4e8b9f6f24d8594b51af2f87255baa90ac53bfdb Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:56:07 -0700 Subject: [PATCH 02/19] 
Code cleanup --- lug/utf8.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lug/utf8.hpp b/lug/utf8.hpp index b516c31..c7d544f 100644 --- a/lug/utf8.hpp +++ b/lug/utf8.hpp @@ -68,11 +68,11 @@ inline constexpr char32_t utf32_replacement = U'\U0000fffd'; [[nodiscard]] constexpr unsigned int non_ascii_rune_length(char32_t rune) noexcept { - if (rune >= 0x00010000U) - return 4; - if (rune >= 0x00000800U) + if (rune < 0x00000800U) + return 2; + if (rune < 0x00010000U) return 3; - return 2; + return 4; } } // namespace detail From 9e9ee8918343ff0f4e0361e13045c19e5b4eba5b Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:57:02 -0700 Subject: [PATCH 03/19] Remove #include from that was left in during testing --- lug/lug.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index acb7982..2ebe5f9 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -15,7 +15,6 @@ #include #include #include -#include namespace lug { From 3aa4aa5cb2804e05aa74a35654a80be105c9bb7f Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:57:27 -0700 Subject: [PATCH 04/19] Update README fixing typos --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8b13503..76d1c8c 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,11 @@ A C++ embedded domain specific language for expressing parsers as extended [pars Features --- -- Natural syntax resembling external parser generator languages. +- Natural syntax resembling external parser generator languages, with support for attributes and semantic actions. +- Ability to handle context-sensitive grammars with symbol tables, conditions and syntactic predicates. - Generated parsers are compiled to special-purpose bytecode and executed in a virtual parsing machine. - Clear separation of syntactic and lexical rules, with the ability to customize implicit whitespace skipping. 
- Support for direct and indirect left recursion, with precedence levels to disambiguate subexpressions with mixed left/right recursion. -- Extended PEG syntax to include attribute grammars and semantic actions. -- Ability to handle context-sensitive grammars with symbol tables, conditions, and syntactic predicates. - Full support for UTF-8 text parsing, including Level 1 and partial Level 2 compliance with the UTS #18 Unicode Regular Expressions technical standard. - Automatic tracking of line and column numbers, with customizable tab width and alignment. - Header-only library utilizing C++17 language and library features. @@ -69,7 +68,7 @@ Syntax Reference | One-or-More | `+e` | Repetition matching of expression *e* one or more times. | | Optional | `~e` | Matches expression *e* zero or one times. | | Positive Lookahead | `&e` | Matches without consuming input if expression *e* succeeds to match the input. | -| Negative Lookahead | `~e` | Matches without consuming input if expression *e* fails to match the input. | +| Negative Lookahead | `!e` | Matches without consuming input if expression *e* fails to match the input. | | Cut Before | `--e` | Issues a cut instruction before the expression *e*. | | Cut After | `e--` | Issues a cut instruction after the expression *e*. | | Action Scheduling | `e < a` | Schedules a semantic action *a* to be evaluated if expression *e* successfully matches the input. 
| From 7025cae7577e37be6d4beb34d0eaac77184f1f9e Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:34:59 -0700 Subject: [PATCH 05/19] Fix clang-tidy warnings --- .clang-tidy | 4 ++++ lug/detail.hpp | 2 +- lug/unicode.hpp | 26 +++++++++++++------------- tools/makeunicode.cpp | 26 ++++++++++++-------------- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index a66ba9b..f7c52a8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -9,6 +9,9 @@ Checks: - concurrency-* - cppcoreguidelines-* - -cppcoreguidelines-avoid-magic-numbers + - -cppcoreguidelines-avoid-do-while + - -cppcoreguidelines-avoid-goto + - -cppcoreguidelines-pro-bounds-* - darwin-* - hicpp-* - -hicpp-braces-around-statements @@ -23,6 +26,7 @@ Checks: - -readability-braces-around-statements - -readability-identifier-length - -readability-magic-numbers + - -readability-qualified-auto WarningsAsErrors: '' HeaderFileExtensions: - '' diff --git a/lug/detail.hpp b/lug/detail.hpp index 3c7fccd..a7ebb5d 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -162,7 +162,7 @@ template using enable_if_char_contiguous_iterator_t = std::enable_if_t, T>; template -constexpr void ignore(Args&&...) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward,hicpp-named-parameter,readability-named-parameter) +constexpr void ignore([[maybe_unused]] Args&&... 
args) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward) struct identity { diff --git a/lug/unicode.hpp b/lug/unicode.hpp index 1ea8245..d1c7c48 100644 --- a/lug/unicode.hpp +++ b/lug/unicode.hpp @@ -21,10 +21,10 @@ #include #include -// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - namespace lug::unicode { +// NOLINTBEGIN(hicpp-signed-bitwise) + // POSIX compatibility properties enum class ctype : std::uint_least16_t { @@ -159,6 +159,8 @@ enum class gctype : std::uint_least32_t is_bitfield_enum }; +// NOLINTEND(hicpp-signed-bitwise) + // Scripts enum class sctype : std::uint_least8_t { @@ -757,10 +759,10 @@ class record [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } - [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x03ffU); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10U); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0fU); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4U) - 1; } [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { 
return case_mapping(raw_->cuindex); } @@ -781,8 +783,8 @@ class record static auto const table = record::decompress_table(); std::size_t index = 1901; if (r < 0x110000) { - index = table->stage1[r >> 7]; - index = table->stage2[(index << 7) | (r & 0x7f)]; + index = table->stage1[r >> 7U]; + index = table->stage2[(index << 7U) | (r & 0x7fU)]; } return record{&table->records[index]}; } @@ -1316,7 +1318,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << static_cast(std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1324,14 +1326,14 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) auto const tail = *first++; for (std::size_t i = 0; i < count; ++i) { if ((head & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(head & ~seqmask) + 1, tail); + dest = std::fill_n(dest, static_cast(head & ~static_cast(seqmask)) + 1, tail); } else { *dest++ = head; *dest++ = tail; } } } else if ((lead & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(lead & ~seqmask) + 1, *first++); + dest = std::fill_n(dest, static_cast(lead & ~static_cast(seqmask)) + 1, *first++); } else { *dest++ = lead; } @@ -2131,6 +2133,4 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace lug::unicode -// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - #endif diff --git a/tools/makeunicode.cpp b/tools/makeunicode.cpp index 648c63f..70b414e 100644 --- a/tools/makeunicode.cpp +++ b/tools/makeunicode.cpp @@ -1149,9 +1149,9 @@ R"c++(// lug - Embedded DSL for PE grammar parser combinators in C++ #include #include -// 
NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - namespace lug::unicode { + +// NOLINTBEGIN(hicpp-signed-bitwise) )c++" << "\n" << enum_printer(enum_type::bitfield, "ctype", "std::uint_least16_t", "POSIX compatibility properties", [](std::ostream& out) { @@ -1191,7 +1191,7 @@ namespace lug::unicode { out << "," << std::right << std::setw(21 - padcount) << " " << compound.second.first << " = " << compound.first << ",\n"; } }) -<< "\n" +<< "\n// NOLINTEND(hicpp-signed-bitwise)\n\n" << enum_printer(enum_type::index, "sctype", "std::uint_least8_t", "Scripts", [](std::ostream& out) { auto const pad = align_padding(max_element_size(script_names.cbegin(), script_names.cend())); for (std::size_t i = 0, n = script_names.size(); i < n; ++i) @@ -1273,10 +1273,10 @@ class record [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } - [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x03ffU); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10U); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0fU); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4U) - 1; } [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } [[nodiscard]] std::int_least32_t 
lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } @@ -1297,8 +1297,8 @@ class record static auto const table = record::decompress_table(); std::size_t index = )c++" << std::dec << invalidrecordindex << R"c++(; if (r < 0x)c++" << std::hex << ptable.size() << R"c++() { - index = table->stage1[r >> )c++" << std::dec << block_shift << R"c++(]; - index = table->stage2[(index << )c++" << std::dec << block_shift << R"c++() | (r & 0x)c++" << std::hex << block_mask << R"c++()]; + index = table->stage1[r >> )c++" << std::dec << block_shift << R"c++(U]; + index = table->stage2[(index << )c++" << std::dec << block_shift << R"c++(U) | (r & 0x)c++" << std::hex << block_mask << R"c++(U)]; } return record{&table->records[index]}; } @@ -1538,7 +1538,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << static_cast(std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1546,14 +1546,14 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) auto const tail = *first++; for (std::size_t i = 0; i < count; ++i) { if ((head & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(head & ~seqmask) + 1, tail); + dest = std::fill_n(dest, static_cast(head & ~static_cast(seqmask)) + 1, tail); } else { *dest++ = head; *dest++ = tail; } } } else if ((lead & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(lead & ~seqmask) + 1, *first++); + dest = std::fill_n(dest, static_cast(lead & ~static_cast(seqmask)) + 1, *first++); } else { *dest++ = lead; } @@ -1617,8 
+1617,6 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace lug::unicode -// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - #endif )c++"; } From 76c0a8f95d411660c578b55deb6cc41584169f1e Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:36:04 -0700 Subject: [PATCH 06/19] Ignore all files under tools/ucd/ --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6d8ad9..02d3c4c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Unicode Character Database files -tools/ucd/*.txt +tools/ucd/ # User-specific files *.suo From ea973fef42955a40517d8e3d6718fa730051f852 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:44:13 -0700 Subject: [PATCH 07/19] Update clang-tidy checks --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index f7c52a8..628c9b1 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -14,6 +14,7 @@ Checks: - -cppcoreguidelines-pro-bounds-* - darwin-* - hicpp-* + - -hicpp-avoid-goto - -hicpp-braces-around-statements - llvm-namespace-comment - misc-* From 4d156f8dbd5d52c3b6897f14cf4b5322872eeaa2 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 3 Jul 2024 19:02:15 -0700 Subject: [PATCH 08/19] Fixed clang-tidy warnings --- .clang-tidy | 38 ++-- CHANGELOG.md | 6 +- lug/detail.hpp | 37 ++-- lug/lug.hpp | 447 ++++++++++++++++++++++------------------ samples/basic/basic.cpp | 18 +- samples/calc/calc.cpp | 23 ++- tests/captures.cpp | 44 ++-- tests/leftrecursion.cpp | 6 +- 8 files changed, 343 insertions(+), 276 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 628c9b1..9cc6212 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,32 +2,42 @@ Checks: - clang-diagnostic-* - clang-analyzer-* + - -clang-analyzer-optin.core.EnumCastOutOfRange # interferes with enum bitfield flags - android-* - bugprone-* - cert-* - - -cert-dcl21-cpp + - -cert-dcl21-cpp # this 
check is deprecated, it is no longer part of the CERT standard - concurrency-* - cppcoreguidelines-* - -cppcoreguidelines-avoid-magic-numbers - -cppcoreguidelines-avoid-do-while - -cppcoreguidelines-avoid-goto - -cppcoreguidelines-pro-bounds-* + - -cppcoreguidelines-avoid-magic-numbers # revisit after new instruction scheme, maybe only disable for unicode tables + - -cppcoreguidelines-avoid-do-while # if removing do-while does not cause serious performance issues remove this check + - -cppcoreguidelines-avoid-goto # if removing goto does not cause serious performance issues remove this check + - -cppcoreguidelines-pro-bounds-* # requires gsl::at and std::span to suppress, would prefer Standard Library hardening approach + - -cppcoreguidelines-pro-type-union-access # remove after developing new instruction encoding scheme that doesn't use union - darwin-* + - fuchsia-* + - google-* + - -google-build-using-namespace # would require too many individual using-declarations to satisfy + - -google-readability-braces-around-statements # adversely affects line count + - -google-runtime-int # revisit after new instruction scheme - hicpp-* - - -hicpp-avoid-goto - - -hicpp-braces-around-statements + - -hicpp-avoid-goto # if removing goto does not cause serious performance issues remove this check + - -hicpp-braces-around-statements # adversely affects line count - llvm-namespace-comment - misc-* - - -misc-include-cleaner + - -misc-include-cleaner # brings in redundant headers that are already included - modernize-* - - -modernize-use-trailing-return-type + - -modernize-use-constraints # C++20 feature + - -modernize-use-trailing-return-type # stylistic preference, revisit later - performance-* - portability-* - readability-* - - -readability-braces-around-statements - - -readability-identifier-length - - -readability-magic-numbers - - -readability-qualified-auto + - -readability-braces-around-statements # adversely affects line count + - -readability-container-contains # 
C++20 feature + - -readability-function-cognitive-complexity # grammar::start() and basic_parser::parse() are complex, revisit or suppress only for these functions + - -readability-identifier-length # revisit later + - -readability-magic-numbers # revisit after new instruction scheme, maybe only disable for unicode tables + - -readability-qualified-auto # stylistic preference that unfortunately warns when marking 'auto*' as 'auto* const' or just 'auto const' WarningsAsErrors: '' HeaderFileExtensions: - '' @@ -284,7 +294,7 @@ CheckOptions: misc-header-include-cycle.IgnoredFilesList: '' misc-include-cleaner.DeduplicateFindings: 'true' misc-include-cleaner.IgnoreHeaders: '' - misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'false' + misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true' misc-non-private-member-variables-in-classes.IgnorePublicMemberVariables: 'false' misc-throw-by-value-catch-by-reference.CheckThrowTemporaries: 'true' misc-throw-by-value-catch-by-reference.WarnOnLargeObjects: 'false' diff --git a/CHANGELOG.md b/CHANGELOG.md index df68d94..7657b34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,12 @@ * Added support for parsing characters and character literals where applicable without explicitly needing to wrap them with `chr()` or `_cx`. * Symbols now respects `caseless` mode, allowing for case-insensitive matching against symbol definitions. * Allow for use of variables of all types in attribute bindings and removed the `lug::variable` template class that was used previously. Variable state is automatically saved and restored across rule boundaries. -* Allow for capturing text to a `lug::syntax` object or any string-like object that is convertible from `std::string_view`. 
+* Allow for capturing text to a `lug::syntax` object or any string-like object that is convertible from `std::string_view`, and renamed `syntax::capture` to `syntax::str` in order to match `std::sub_match::str`. * Added `lug::source_options::interactive` flag that ignores `eoi` tokens for TTY input sources. * Rewrote the expression function objects/lambdas as expression template classes. Allows for multiple passes over the expression tree as well as top-down and bottom-up traversal, which was needed when implementing attribute state tracking. This will also allow for additional optimizations to be implemented in the future. * Renamed `syntactic_capture` to `semantic_capture_action` to reflect that it is executed during the semantic action evaluation phase. * Make all variations of callables that return a non-void value that can be type-erased by `semantic_action` and `semantic_capture_action` push their result onto the attribute result stack. +* Removed `semantic_response` from the public API as it was only used internally inside of the parser. * Attempting to bind a variable to a nonexistent value from the attribute result stack now throws an `attribute_stack_error`. * `implicit_space_rule` no longer causes a compiler warning with Clang, uses RAII to push/pop the thread-local white space rule for grammars. * Moved `call_depth()`, `prune_depth()` and `escape()` functions into the `lug::environment` class since they are used exclusively during semantic action phase. @@ -18,12 +19,13 @@ * Turned `lug::parser` into an alias of a new `lug::basic_parser` template class parameterized with an input source strategy. This allows for parsing and capturing of text without making a copy of the input. * Placed all DSL operator overloads inside of an inline namespace `operators` within `lug::language`. This allows only the operators to be imported into the current scope if desired. * Enabled `-Wconversion` and `-Wshadow` warnings for Clang and GCC and fixed warnings. 
+* Full clang-tidy pass on all of the library headers and fixed all warnings. * Added CMake build support and removed old MSVS solution and vcxproj files. * Handle situation where compilation with RTTI is disabled. ## Release v0.2.0 (June 21, 2024) -* Implemented new support for context-sensitive grammars with symbol tables and parsing conditions, based on the PEG extensions described in the paper *"A Declarative Extension of Parsing Expression Grammars for Recognizing Most Programming Languages"* by Tetsuro Matsumura and Kimio Kuramitsu (2015). +* Implemented new support for context-sensitive grammars with symbol tables and parsing conditions based on the PEG extensions described in the paper *"A Declarative Extension of Parsing Expression Grammars for Recognizing Most Programming Languages"* by Tetsuro Matsumura and Kimio Kuramitsu (2015). * Added an XML Standard 1.0 matcher sample program demonstrating use of symbol tables. * Finished the BASIC language interpreter sample program, which is now feature complete, using parsing conditions. * Updated Unicode support to version 15.1.0 and automated Unicode table generation via Makefile build. 
diff --git a/lug/detail.hpp b/lug/detail.hpp index a7ebb5d..60a9b0e 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -87,43 +87,43 @@ inline namespace bitfield_ops { template > [[nodiscard]] constexpr T operator~(T x) noexcept { - return static_cast(~static_cast>(x)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(~static_cast>(x)); } template > [[nodiscard]] constexpr T operator&(T x, T y) noexcept { - return static_cast(static_cast>(x) & static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) & static_cast>(y)); } template > [[nodiscard]] constexpr T operator|(T x, T y) noexcept { - return static_cast(static_cast>(x) | static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) | static_cast>(y)); } template > [[nodiscard]] constexpr T operator^(T x, T y) noexcept { - return static_cast(static_cast>(x) ^ static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) ^ static_cast>(y)); } template > constexpr T& operator&=(T& x, T y) noexcept { - return (x = x & y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x & y); } template > constexpr T& operator|=(T& x, T y) noexcept { - return (x = x | y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x | y); } template > constexpr T& operator^=(T& x, T y) noexcept { - return (x = x ^ y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x ^ y); } } // namespace bitfield_ops @@ -161,9 +161,6 @@ using enable_if_char_input_iterator_t = std::enable_if_t< template using enable_if_char_contiguous_iterator_t = std::enable_if_t, T>; -template -constexpr void ignore([[maybe_unused]] Args&&... 
args) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward) - struct identity { template @@ -224,21 +221,21 @@ template template class dynamic_cast_if_base_of { - std::remove_reference_t& value_; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + std::reference_wrapper> value_; public: constexpr explicit dynamic_cast_if_base_of(std::remove_reference_t& x) noexcept : value_{x} {} template , std::decay_t>>> - [[nodiscard]] constexpr operator U&() const // NOLINT(hicpp-explicit-conversions) + [[nodiscard]] constexpr operator U&() const noexcept(std::is_same_v, std::decay_t>) // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) { #ifndef LUG_NO_RTTI if constexpr (std::is_same_v, std::decay_t>) #endif // LUG_NO_RTTI - return static_cast&>(value_); + return static_cast&>(value_.get()); #ifndef LUG_NO_RTTI else - return dynamic_cast&>(value_); + return dynamic_cast&>(value_.get()); #endif // LUG_NO_RTTI } }; @@ -246,20 +243,20 @@ class dynamic_cast_if_base_of template class reentrancy_sentinel { - bool& value; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + std::reference_wrapper value_; public: constexpr explicit reentrancy_sentinel(bool& x) - : value{x} + : value_{x} { - if (value) + if (value_.get()) throw Error(); - value = true; + value_.get() = true; } ~reentrancy_sentinel() { - value = false; + value_.get() = false; } reentrancy_sentinel(reentrancy_sentinel const&) = delete; @@ -341,7 +338,7 @@ inline std::size_t push_back_unique(Sequence& s, T&& x) template [[nodiscard]] inline auto pop_back(Sequence& s) -> typename Sequence::value_type { - typename Sequence::value_type result{std::move(s.back())}; + typename Sequence::value_type result{std::move(s.back())}; // NOLINT(misc-const-correctness) s.pop_back(); return result; } diff --git a/lug/lug.hpp b/lug/lug.hpp index 2ebe5f9..29f6fed 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -30,9 +30,8 @@ class string_view_input_source; template class basic_parser; using parser = 
basic_parser; struct program; -struct syntax_position { std::size_t column, line; }; -struct syntax_range { std::size_t index, size; }; -struct semantic_response { unsigned short call_depth, action_index; syntax_range range; }; +struct syntax_position { std::size_t column; std::size_t line; }; +struct syntax_range { std::size_t index; std::size_t size; }; using semantic_action = std::function; using semantic_capture_action = std::function; using syntactic_predicate = std::function; @@ -65,14 +64,14 @@ enum class operands : unsigned char { none = 0, off = 0x40, str = 0x80, is_bitfi union instruction { - static inline constexpr std::size_t maxstrlen = 256; + static constexpr std::size_t maxstrlen = 256; struct prefix { opcode op; operands aux; unsigned short val; } pf; int off; std::array str; instruction(opcode op, operands aux, immediate imm) noexcept : pf{op, aux, static_cast(imm)} {} explicit instruction(std::ptrdiff_t o) : off{static_cast(o)} { if (off != o) throw program_limit_error{}; } - explicit instruction(std::string_view s) { std::fill(std::copy_n(s.begin(), (std::min)(s.size(), std::size_t{4}), str.begin()), str.end(), char{0}); } + explicit instruction(std::string_view s) : str{} { std::fill(std::copy_n(s.begin(), (std::min)(s.size(), std::size_t{4}), str.begin()), str.end(), char{0}); } [[nodiscard]] static auto decode(std::vector const& code, std::ptrdiff_t& pc) { @@ -81,9 +80,10 @@ union instruction unsigned short imm = pf.val; std::string_view str; if ((pf.aux & operands::str) != operands::none) { - str = std::string_view{code[static_cast(pc)].str.data(), static_cast((imm & 0xff) + 1)}; - pc += ((imm & 0xff) + 4) >> 2; - imm = static_cast(imm >> 8); + auto const strsize = (static_cast(imm) & 0xffU) + 1U; + str = std::string_view{code[static_cast(pc)].str.data(), strsize}; + pc += static_cast((strsize + 3U) >> 2U); + imm = static_cast(static_cast(imm) >> 8U); } return std::make_tuple(pf.op, imm, off, str); } @@ -92,7 +92,7 @@ union instruction { 
std::ptrdiff_t len = 1; len += ((pf.aux & operands::off) != operands::none) ? 1 : 0; - len += ((pf.aux & operands::str) != operands::none) ? static_cast(((pf.val & 0xff) >> 2) + 1) : 0; + len += ((pf.aux & operands::str) != operands::none) ? static_cast(((static_cast(pf.val) & 0xffU) >> 2U) + 1U) : 0; return len; } }; @@ -102,7 +102,7 @@ static_assert(sizeof(unicode::sctype) <= sizeof(immediate), "immediate must be l static_assert(sizeof(instruction) == sizeof(int), "expected instruction to be same size as int"); static_assert(sizeof(int) <= sizeof(std::ptrdiff_t), "expected int to be no larger than ptrdiff_t"); -enum class directives : unsigned int { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; +enum class directives : std::uint_least8_t { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; using program_callees = std::vector>; struct program @@ -119,7 +119,7 @@ struct program instructions.reserve(detail::checked_add(instructions.size(), src.instructions.size())); for (auto i = src.instructions.begin(), j = i, e = src.instructions.end(); i != e; i = j) { instruction instr = *i; - std::size_t val; + std::size_t val = 0; switch (instr.pf.op) { case opcode::match_set: val = detail::push_back_unique(runesets, src.runesets[instr.pf.val]); break; case opcode::action: val = actions.size(); actions.push_back(src.actions[instr.pf.val]); break; @@ -128,7 +128,7 @@ struct program default: val = (std::numeric_limits::max)(); break; } if (val != (std::numeric_limits::max)()) { - detail::assure_in_range(val, 0u, (std::numeric_limits::max)()); + detail::assure_in_range(val, 0U, (std::numeric_limits::max)()); instr.pf.val = static_cast(val); } j = std::next(i, instruction::length(instr.pf)); @@ -153,57 +153,58 @@ class rule { friend class encoder; friend class rule_encoder; - friend grammar start(rule const&); + friend grammar start(rule const& start_rule); program 
program_; program_callees callees_; bool currently_encoding_{false}; public: - rule() = default; - template >> rule(E const& e); + rule() noexcept = default; + template && !std::is_same_v>> rule(E const& e); // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) rule(rule const& r); - rule(rule&& r) = default; + rule(rule&& r) noexcept = default; rule& operator=(rule const& r) { rule{r}.swap(*this); return *this; } - rule& operator=(rule&& r) = default; + rule& operator=(rule&& r) noexcept = default; + ~rule() = default; void swap(rule& r) noexcept { program_.swap(r.program_); callees_.swap(r.callees_); } [[nodiscard]] auto operator[](unsigned short precedence) const noexcept; }; class grammar { - friend grammar start(rule const&); + friend grammar start(rule const& start_rule); lug::program program_; - grammar(lug::program p) : program_{std::move(p)} {} + explicit grammar(lug::program&& p) noexcept : program_{std::move(p)} {} public: - grammar() = default; + grammar() noexcept = default; void swap(grammar& g) noexcept { program_.swap(g.program_); } [[nodiscard]] lug::program const& program() const noexcept { return program_; } - static thread_local std::shared_ptr> const implicit_space; + [[nodiscard]] static std::shared_ptr> const& implicit_space(); }; class syntax { - std::string_view capture_; + std::string_view str_; std::size_t index_{0}; public: constexpr syntax() noexcept = default; - constexpr syntax(std::string_view c, std::size_t i) noexcept : capture_{c}, index_{i} {} - [[nodiscard]] constexpr std::string_view capture() const noexcept { return capture_; } - [[nodiscard]] constexpr syntax_range range() const noexcept { return syntax_range{index_, capture_.size()}; } - [[nodiscard]] operator std::string() const noexcept { return std::string{capture_}; } - [[nodiscard]] constexpr operator std::string_view() const noexcept { return capture_; } - [[nodiscard]] constexpr operator syntax_range() const noexcept { return range(); } - [[nodiscard]] 
constexpr bool empty() const noexcept { return capture_.empty(); } - [[nodiscard]] constexpr std::size_t size() const noexcept { return capture_.size(); } - [[nodiscard]] constexpr bool operator==(syntax const& other) const noexcept { return capture_ == other.capture_ && index_ == other.index_; } - [[nodiscard]] constexpr bool operator!=(syntax const& other) const noexcept { return capture_ != other.capture_ || index_ != other.index_; } + constexpr syntax(std::string_view c, std::size_t i) noexcept : str_{c}, index_{i} {} + [[nodiscard]] constexpr std::string_view str() const noexcept { return str_; } + [[nodiscard]] constexpr syntax_range range() const noexcept { return syntax_range{index_, str_.size()}; } + [[nodiscard]] operator std::string() const { return std::string{str_}; } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr operator std::string_view() const noexcept { return str_; } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr operator syntax_range() const noexcept { return range(); } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr bool empty() const noexcept { return str_.empty(); } + [[nodiscard]] constexpr std::size_t size() const noexcept { return str_.size(); } + [[nodiscard]] constexpr bool operator==(syntax const& other) const noexcept { return str_ == other.str_ && index_ == other.index_; } + [[nodiscard]] constexpr bool operator!=(syntax const& other) const noexcept { return str_ != other.str_ || index_ != other.index_; } }; class environment { template friend class basic_parser; - static inline constexpr unsigned short max_call_depth = (std::numeric_limits::max)(); - static inline const std::vector empty_symbols_{}; + static constexpr unsigned short max_call_depth = (std::numeric_limits::max)(); + static inline std::vector const empty_symbols_{}; std::vector attribute_frame_stack_; std::vector attribute_result_stack_; 
std::unordered_set conditions_; @@ -250,13 +251,18 @@ class environment } public: + environment() = default; + environment(environment const&) = delete; + environment(environment&&) noexcept = default; + environment& operator=(environment const&) = delete; + environment& operator=(environment&&) noexcept = default; virtual ~environment() = default; [[nodiscard]] unsigned int tab_width() const { return tab_width_; } void tab_width(unsigned int w) { tab_width_ = w; } [[nodiscard]] unsigned int tab_alignment() const { return tab_alignment_; } void tab_alignment(unsigned int a) { tab_alignment_ = a; } [[nodiscard]] bool has_condition(std::string_view name) const noexcept { return (conditions_.count(name) > 0); } - bool set_condition(std::string_view name, bool value) { if (value) { return !conditions_.emplace(name).second; } else { return (conditions_.erase(name) > 0); } } + bool set_condition(std::string_view name, bool value) { return value ? (!conditions_.emplace(name).second) : (conditions_.erase(name) > 0); } void clear_conditions() { conditions_.clear(); } [[nodiscard]] bool has_symbol(std::string_view name) const noexcept { return (symbols_.count(name) > 0); } [[nodiscard]] std::vector const& get_symbols(std::string_view name) const { auto it = symbols_.find(name); if (it == symbols_.end()) return empty_symbols_; return it->second; } @@ -285,7 +291,8 @@ class environment } auto first = std::next(std::begin(match_), static_cast(startindex)); auto const last = std::next(std::begin(match_), static_cast(index)); - char32_t rune, prevrune = U'\0'; + char32_t rune = U'\0'; + char32_t prevrune = U'\0'; for (auto curr = first, next = curr; curr < last; curr = next, prevrune = rune) { std::tie(next, rune) = utf8::decode_rune(curr, last); if ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) != unicode::ptype::None && (prevrune != U'\r' || rune != U'\n')) { @@ -345,7 +352,7 @@ struct encoder_metadata template >> constexpr encoder_metadata() noexcept : 
attribute_frame{} {} template >> - constexpr encoder_metadata(Frame&& frame) noexcept : attribute_frame{std::forward(frame)} {} + constexpr explicit encoder_metadata(Frame&& frame) noexcept : attribute_frame{std::forward(frame)} {} }; encoder_metadata() -> encoder_metadata<>; @@ -353,17 +360,17 @@ template encoder_metadata(Frame&&) -> encoder_metadata mode_; - virtual void do_append(instruction) = 0; + virtual void do_append(instruction instr) = 0; virtual void do_append(program const&) = 0; - virtual immediate do_add_rune_set(unicode::rune_set) { return immediate{0}; } - virtual immediate do_add_semantic_action(semantic_action) { return immediate{0}; } - virtual immediate do_add_semantic_capture_action(semantic_capture_action) { return immediate{0}; } - virtual immediate do_add_syntactic_predicate(syntactic_predicate) { return immediate{0}; } - virtual void do_add_callee(rule const*, program const*, std::ptrdiff_t, directives) {} - virtual bool do_should_evaluate_length() const noexcept { return true; } - virtual std::ptrdiff_t do_length() const noexcept = 0; + [[nodiscard]] virtual immediate do_add_rune_set(unicode::rune_set&& /*r*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_semantic_action(semantic_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_syntactic_predicate(syntactic_predicate&& /*p*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + virtual void do_add_callee(rule const* /*r*/, program const* /*p*/, std::ptrdiff_t /*n*/, directives /*d*/) {} + [[nodiscard]] virtual bool do_should_evaluate_length() const noexcept { return true; } + [[nodiscard]] virtual 
std::ptrdiff_t do_length() const noexcept = 0; protected: encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, unsigned short prec) @@ -380,7 +387,7 @@ class encoder std::string_view subsequence = sequence.substr(0, instruction::maxstrlen); while (!subsequence.empty() && !utf8::is_lead(subsequence.back())) subsequence.remove_suffix(1); - subsequence.remove_suffix(!subsequence.empty()); + subsequence.remove_suffix(!subsequence.empty() ? 1 : 0); encode(op, subsequence); sequence.remove_prefix(subsequence.size()); } @@ -397,35 +404,40 @@ class encoder void do_skip() { mode_.back() = (mode_.back() & ~(directives::preskip | directives::postskip)) | directives::lexeme | directives::noskip; - (*grammar::implicit_space)(*this); + (*grammar::implicit_space())(*this); } public: - explicit encoder(directives initial) : mandate_{directives::none}, mode_{initial} {} + explicit encoder(directives initial) : mode_{initial} {} virtual ~encoder() = default; + encoder(encoder const&) = delete; + encoder(encoder&&) = delete; + encoder& operator=(encoder const&) = delete; + encoder& operator=(encoder&&) = delete; template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); template >> [[nodiscard]] std::ptrdiff_t evaluate_length(E const& e, M const& m); - encoder& dpsh(directives enable, directives disable) { directives prev = mode_.back(); mode_.push_back((prev & ~disable) | enable); return *this; } + encoder& dpsh(directives enable, directives disable) { directives const prev = mode_.back(); mode_.push_back((prev & ~disable) | enable); return *this; } encoder& append(instruction instr) { do_append(instr); return *this; } encoder& append(program const& p) { do_append(p); return *this; } encoder& call(program const& p, unsigned short prec) { return do_call(nullptr, &p, 0, prec); } encoder& call(grammar const& g, unsigned short prec) { return do_call(nullptr, &g.program(), 3, prec); } encoder& encode(opcode op, immediate imm = immediate{0}) { return 
append(instruction{op, operands::none, imm}); } - encoder& encode(opcode op, semantic_action a) { return append(instruction{op, operands::none, do_add_semantic_action(std::move(a))}); } - encoder& encode(opcode op, semantic_capture_action c) { return append(instruction{op, operands::none, do_add_semantic_capture_action(std::move(c))}); } - encoder& encode(opcode op, syntactic_predicate p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } + encoder& encode(opcode op, semantic_action&& a) { return append(instruction{op, operands::none, do_add_semantic_action(std::move(a))}); } + encoder& encode(opcode op, semantic_capture_action&& a) { return append(instruction{op, operands::none, do_add_semantic_capture_action(std::move(a))}); } + encoder& encode(opcode op, syntactic_predicate&& p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } encoder& encode(opcode op, std::ptrdiff_t off, immediate imm = immediate{0}) { return append(instruction{op, operands::off, imm}).append(instruction{off}); } [[nodiscard]] std::ptrdiff_t length() const noexcept { return do_length(); } [[nodiscard]] directives mandate() const noexcept { return (mandate_ & ~directives::eps) | mode_.back(); } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } - encoder& match(unicode::rune_set runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } + encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match); } encoder& match_any() { return skip().encode(opcode::match_any); } template >> encoder& match_class(T properties) { return skip().do_match_class(Op, properties); } encoder& dpop(directives relay) { - auto prev = detail::pop_back(mode_), next = (mode_.back() & ~relay) | (prev & relay); + auto const prev = detail::pop_back(mode_); 
+ auto const next = (mode_.back() & ~relay) | (prev & relay); if ((next & directives::postskip) == directives::none && (prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip) do_skip(); mode_.back() = next; @@ -434,7 +446,7 @@ class encoder encoder& skip(directives callee_mandate = directives::eps, directives callee_skip = directives::lexeme) { - auto mode = mode_.back(); + auto const mode = mode_.back(); if (mandate_ == directives::none) mandate_ = (mode & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; if ((((mode | callee_mandate)) & (callee_skip | directives::preskip)) == directives::preskip) @@ -446,7 +458,7 @@ class encoder encoder& call(rule const& r, unsigned short prec, bool allow_inlining = true) { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && - p.instructions.size() <= 8 && p.actions.size() <= 1 && p.captures.size() <= 1 && p.predicates.size() <= 1) + (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) return skip(p.mandate, directives::noskip).append(p); return do_call(&r, &r.program_, 0, prec); } @@ -465,9 +477,9 @@ class encoder encoder& encode(opcode op, std::string_view subsequence, immediate imm = immediate{0}) { if (!subsequence.empty()) { - detail::assure_in_range(static_cast(imm), 0u, instruction::maxstrlen - 1); - detail::assure_in_range(subsequence.size(), 1u, instruction::maxstrlen); - do_append(instruction{op, operands::str, static_cast((static_cast(imm) << 8) | static_cast(subsequence.size() - 1))}); + detail::assure_in_range(static_cast(imm), 0U, instruction::maxstrlen - 1); + detail::assure_in_range(subsequence.size(), 1U, instruction::maxstrlen); + do_append(instruction{op, operands::str, static_cast(static_cast((static_cast(imm) << 8U) | static_cast(subsequence.size() - 1)))}); do { 
do_append(instruction{subsequence}); subsequence.remove_prefix((std::min)(std::size_t{4}, subsequence.size())); @@ -481,40 +493,43 @@ class encoder skip(!subject.empty() ? directives::eps : directives::none); if ((mode() & directives::caseless) != directives::none) return do_match(opcode::match_cf, utf8::tocasefold(subject)); - else - return do_match(opcode::match, subject); + return do_match(opcode::match, subject); } }; class instruction_length_evaluator final : public encoder { - std::ptrdiff_t length_; - void do_append(instruction) final { length_ = detail::checked_add(length_, std::ptrdiff_t{1}); } + std::ptrdiff_t length_{0}; + void do_append(instruction instr) final { std::ignore = instr; length_ = detail::checked_add(length_, std::ptrdiff_t{1}); } void do_append(program const& p) final { length_ = detail::checked_add(length_, static_cast(p.instructions.size())); } - bool do_should_evaluate_length() const noexcept final { return false; } - std::ptrdiff_t do_length() const noexcept final { return length_; } + [[nodiscard]] bool do_should_evaluate_length() const noexcept final { return false; } + [[nodiscard]] std::ptrdiff_t do_length() const noexcept final { return length_; } public: - explicit instruction_length_evaluator(directives initial) : encoder{initial}, length_{0} {} + explicit instruction_length_evaluator(directives initial) : encoder{initial} {} ~instruction_length_evaluator() final = default; + instruction_length_evaluator(instruction_length_evaluator const&) = delete; + instruction_length_evaluator(instruction_length_evaluator&&) = delete; + instruction_length_evaluator& operator=(instruction_length_evaluator const&) = delete; + instruction_length_evaluator& operator=(instruction_length_evaluator&&) = delete; }; class program_encoder : public encoder { program& program_; program_callees& callees_; - std::ptrdiff_t do_length() const noexcept final { return static_cast(program_.instructions.size()); } + [[nodiscard]] std::ptrdiff_t do_length() 
const noexcept final { return static_cast(program_.instructions.size()); } void do_append(instruction instr) final { program_.instructions.push_back(instr); } void do_append(program const& p) final { program_.concatenate(p); } void do_add_callee(rule const* r, program const* p, std::ptrdiff_t n, directives d) final { callees_.emplace_back(r, p, n, d); } - immediate do_add_rune_set(unicode::rune_set r) final { return add_item(program_.runesets, std::move(r)); } - immediate do_add_semantic_action(semantic_action a) final { return add_item(program_.actions, std::move(a)); } - immediate do_add_semantic_capture_action(semantic_capture_action a) final { return add_item(program_.captures, std::move(a)); } - immediate do_add_syntactic_predicate(syntactic_predicate p) final { return add_item(program_.predicates, std::move(p)); } + [[nodiscard]] immediate do_add_rune_set(unicode::rune_set&& r) final { return add_item(program_.runesets, std::move(r)); } + [[nodiscard]] immediate do_add_semantic_action(semantic_action&& a) final { return add_item(program_.actions, std::move(a)); } + [[nodiscard]] immediate do_add_semantic_capture_action(semantic_capture_action&& a) final { return add_item(program_.captures, std::move(a)); } + [[nodiscard]] immediate do_add_syntactic_predicate(syntactic_predicate&& p) final { return add_item(program_.predicates, std::move(p)); } template - immediate add_item(std::vector& items, Item&& item) + [[nodiscard]] immediate add_item(std::vector& items, Item&& item) { - detail::assure_in_range(items.size(), 0u, (std::numeric_limits::max)() - 1u); + detail::assure_in_range(items.size(), 0U, (std::numeric_limits::max)() - 1U); items.push_back(std::forward(item)); return static_cast(items.size() - 1); } @@ -522,6 +537,10 @@ class program_encoder : public encoder public: program_encoder(program& p, program_callees& c, directives initial) : encoder{initial}, program_{p}, callees_{c} {} ~program_encoder() override { program_.mandate = mandate(); } + 
program_encoder(program_encoder const&) = delete; + program_encoder(program_encoder&&) = delete; + program_encoder& operator=(program_encoder const&) = delete; + program_encoder& operator=(program_encoder&&) = delete; }; class rule_encoder final : public program_encoder @@ -530,10 +549,14 @@ class rule_encoder final : public program_encoder public: explicit rule_encoder(rule& r) : program_encoder{r.program_, r.callees_, directives::eps}, rule_{r} { rule_.currently_encoding_ = true; } ~rule_encoder() final { rule_.currently_encoding_ = false; } + rule_encoder(rule_encoder const&) = delete; + rule_encoder(rule_encoder&&) = delete; + rule_encoder& operator=(rule_encoder const&) = delete; + rule_encoder& operator=(rule_encoder&&) = delete; }; template -inline auto&& add_rune_range(RuneSet&& runes, directives mode, char32_t first, char32_t last) +inline decltype(auto) add_rune_range(RuneSet&& runes, directives mode, char32_t first, char32_t last) { if (first > last) throw bad_character_range{}; @@ -541,7 +564,7 @@ inline auto&& add_rune_range(RuneSet&& runes, directives mode, char32_t first, c unicode::push_casefolded_range(runes, first, last); else unicode::push_range(runes, first, last); - return std::move(runes); + return std::forward(runes); } struct terminal_encoder_expression_interface @@ -553,20 +576,22 @@ template struct unary_encoder_expression_interface { using expression_trait = encoder_expression_trait_tag; E1 e1; - template constexpr explicit unary_encoder_expression_interface(X1&& x1) noexcept : e1(std::forward(x1)) {} + template >> + constexpr explicit unary_encoder_expression_interface(X1&& x1) : e1(std::forward(x1)) {} }; template struct binary_encoder_expression_interface { using expression_trait = encoder_expression_trait_tag; E1 e1; E2 e2; - template constexpr binary_encoder_expression_interface(X1&& x1, X2&& x2) noexcept : e1(std::forward(x1)), e2(std::forward(x2)) {} + template && std::is_constructible_v>> + constexpr 
binary_encoder_expression_interface(X1&& x1, X2&& x2) : e1(std::forward(x1)), e2(std::forward(x2)) {} }; class basic_regular_expression : public terminal_encoder_expression_interface { - std::string const expression_; - std::shared_ptr const program_; + std::string expression_; + std::shared_ptr program_; [[nodiscard]] static grammar make_grammar(); @@ -654,16 +679,16 @@ struct char32_range_expression : terminal_encoder_expression_interface { char32_t start; char32_t end; - constexpr char32_range_expression(char32_t s, char32_t e) noexcept : start{s}, end{e} {} + constexpr char32_range_expression(char32_t first, char32_t last) noexcept : start{first}, end{last} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(unicode::sort_and_optimize(add_rune_range(unicode::rune_set{}, d.mode(), start, end))); return m; } }; template struct callable_expression : terminal_encoder_expression_interface { - Target& target; + std::reference_wrapper target; constexpr explicit callable_expression(Target& t) noexcept : target{t} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target, 0); } + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), 0); } }; template struct is_callable_encoder_expression : std::false_type {}; @@ -674,7 +699,7 @@ template struct predicate_expression : terminal_encoder_expression_interface { Pred pred; - template constexpr explicit predicate_expression(P&& p) noexcept(std::is_nothrow_constructible_v) : pred(std::forward

(p)) {} + template >> constexpr explicit predicate_expression(P&& p) noexcept(std::is_nothrow_constructible_v) : pred(std::forward

(p)) {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::predicate, syntactic_predicate{pred}); return m; } }; @@ -695,7 +720,7 @@ template && is_ex else if constexpr (std::is_same_v, char32_t>) return char32_expression{std::forward(e)}; else if constexpr (std::is_convertible_v, std::string_view>) - return string_expression{std::forward(e)}; + return string_expression{std::forward(e)}; // NOLINT(cppcoreguidelines-pro-bounds-array-to-pointer-decay,hicpp-no-array-decay) else if constexpr (std::is_invocable_r_v, environment&>) return predicate_expression{std::forward(e)}; else @@ -703,9 +728,9 @@ template && is_ex } template >> -[[nodiscard]] constexpr auto make_space_expression(E&& e) +[[nodiscard]] constexpr auto make_space_expression(E const& e) { - return [x = make_expression(std::forward(e))](encoder& d) { (void)x(d, encoder_metadata{}); }; + return [x = make_expression(e)](encoder& d) { (void)x(d, encoder_metadata{}); }; } template @@ -738,10 +763,10 @@ inline rule::rule(rule const& r) struct rule_precedence_expression : terminal_encoder_expression_interface { - rule const& target; + std::reference_wrapper target; unsigned short precedence; - constexpr rule_precedence_expression(rule const& t, unsigned short p) noexcept : target{t}, precedence{p} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target, precedence); } + rule_precedence_expression(rule const& t, unsigned short p) noexcept : target{t}, precedence{p} {} + template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), precedence); } }; [[nodiscard]] inline auto rule::operator[](unsigned short precedence) const noexcept @@ -779,7 +804,7 @@ inline constexpr directive_modifier skip_after{}; inline constexpr directive_modifier skip_before{}; -struct nop_expression : terminal_encoder_expression_interface { 
template [[nodiscard]] constexpr auto operator()(encoder&, M const& m) const -> M const& { return m; } }; +struct nop_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{1}); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; @@ -792,7 +817,7 @@ struct match_class_combinator struct match_class_expression : terminal_encoder_expression_interface { Property property; - constexpr match_class_expression(Property p) noexcept : property{p} {} + constexpr explicit match_class_expression(Property p) noexcept : property{p} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_class(property); return m; } }; @@ -817,7 +842,7 @@ struct condition_test_combinator struct condition_test_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr condition_test_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit condition_test_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, immediate{Value ? 
1 : 0}); return m; } }; @@ -863,7 +888,7 @@ struct symbol_exists_combinator struct symbol_exists_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr symbol_exists_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit symbol_exists_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, immediate{Value ? 1 : 0}); return m; } }; @@ -876,7 +901,7 @@ struct symbol_match_combinator struct symbol_match_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr symbol_match_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit symbol_match_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? 
OpCf : Op, name); return m; } }; @@ -975,7 +1000,7 @@ template struct attribute_action_expression : unary_encoder_expression_interface { Operand operand; - template constexpr attribute_action_expression(X1&& x1, O&& o) noexcept : unary_encoder_expression_interface{std::forward(x1)}, operand(std::forward(o)) {} + template constexpr attribute_action_expression(X1&& x1, O&& o) : unary_encoder_expression_interface{std::forward(x1)}, operand(std::forward(o)) {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const @@ -990,17 +1015,17 @@ struct attribute_action_expression : unary_encoder_expression_interface }; template -struct attribute_bind_to_expression : attribute_action_expression +struct attribute_bind_to_expression : attribute_action_expression { - using attribute_action_expression::attribute_action_expression; - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::operator()(d, m)).attribute_frame, std::forward_as_tuple(this->operand))}; } + using attribute_action_expression::attribute_action_expression; + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::operator()(d, m)).attribute_frame, std::forward_as_tuple(*(this->operand)))}; } }; template struct action_expression : attribute_action_expression, E1, Action> { using attribute_action_expression, E1, Action>::attribute_action_expression; - constexpr void do_prologue(encoder&) const {} + constexpr void do_prologue(encoder& /*d*/) const {} constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[a = this->operand](environment& envr) { a(detail::dynamic_cast_if_base_of{envr}); }}); } template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, a = this->operand](environment& envr) 
mutable { envr.pop_attribute_frame(f); a(detail::dynamic_cast_if_base_of{envr}); }}); } }; @@ -1018,9 +1043,9 @@ template struct assign_to_expression : attribute_bind_to_expression, E1, Target> { using attribute_bind_to_expression, E1, Target>::attribute_bind_to_expression; - constexpr void do_prologue(encoder&) const {} - constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[t = &this->operand](environment& envr) { *t = envr.pop_attribute(); }}); } - template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, t = &this->operand](environment& envr) mutable { envr.pop_attribute_frame(f); *t = envr.pop_attribute(); }}); } + constexpr void do_prologue(encoder& /*d*/) const {} + constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[t = this->operand](environment& envr) { *t = envr.pop_attribute(); }}); } + template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, t = this->operand](environment& envr) mutable { envr.pop_attribute_frame(f); *t = envr.pop_attribute(); }}); } }; template @@ -1028,8 +1053,8 @@ struct capture_to_expression : attribute_bind_to_expression, E1, Target>::attribute_bind_to_expression; constexpr void do_prologue(encoder& d) const { d.skip().encode(opcode::capture_start); } - constexpr void do_epilogue(encoder& d) const { d.encode(opcode::capture_end, semantic_capture_action{[t = &this->operand](environment&, syntax const& sx) { *t = sx; }}); } - template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::capture_end, semantic_capture_action{[f = m.attribute_frame, t = &this->operand](environment& envr, syntax const& sx) mutable { envr.pop_attribute_frame(f); *t = sx; }}); } + constexpr void do_epilogue(encoder& d) const { d.encode(opcode::capture_end, semantic_capture_action{[t = 
this->operand](environment&, syntax const& sx) { *t = sx; }}); } + template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::capture_end, semantic_capture_action{[f = m.attribute_frame, t = this->operand](environment& envr, syntax const& sx) mutable { envr.pop_attribute_frame(f); *t = sx; }}); } }; template @@ -1093,8 +1118,8 @@ template choice_expression(X1&&, X2&&) -> choice_expression template sequence_expression(X1&&, X2&&) -> sequence_expression, std::decay_t>; template action_expression(X1&&, Action&&) -> action_expression, std::decay_t>; template capture_expression(X1&&, Action&&) -> capture_expression, std::decay_t>; -template assign_to_expression(X1&&, Target&) -> assign_to_expression, Target>; -template capture_to_expression(X1&&, Target&) -> capture_to_expression, Target>; +template assign_to_expression(X1&&, Target*) -> assign_to_expression, Target>; +template capture_to_expression(X1&&, Target*) -> capture_to_expression, Target>; template symbol_assign_expression(X1&&, std::string_view) -> symbol_assign_expression>; template symbol_block_expression(X1&&) -> symbol_block_expression>; template local_block_expression(X1&&) -> local_block_expression>; @@ -1163,12 +1188,12 @@ inline namespace operators { [[nodiscard]] inline auto operator ""_srx(char const* s, std::size_t n) { return cased[basic_regular_expression{std::string_view{s, n}}]; } template >> [[nodiscard]] constexpr auto operator!(E const& e) { return negative_lookahead_expression{matches_eps[e]}; } -template >> [[nodiscard]] constexpr auto operator&(E const& e) { return positive_lookahead_expression{matches_eps[e]}; } +template >> [[nodiscard]] constexpr auto operator&(E const& e) { return positive_lookahead_expression{matches_eps[e]}; } // NOLINT(google-runtime-operator) template >> [[nodiscard]] constexpr auto operator*(E const& e) { return repetition_expression{matches_eps[skip_after[e]]}; } template && is_expression_v>> [[nodiscard]] constexpr auto 
operator|(E1 const& e1, E2 const& e2) { return choice_expression{relays_eps[e1], relays_eps[e2]}; } template && is_expression_v>> [[nodiscard]] constexpr auto operator>(E1 const& e1, E2 const& e2) { return sequence_expression{make_expression(e1), skip_before[e2]}; } template && is_expression_v>> [[nodiscard]] constexpr auto operator>>(E1 const& e1, E2 const& e2) { return e1 > *(e2 > e1); } -template >> [[nodiscard]] constexpr auto operator%(T& target, E const& e) { return assign_to_expression{make_expression(e), target}; } +template >> [[nodiscard]] constexpr auto operator%(T& target, E const& e) { return assign_to_expression{make_expression(e), std::addressof(target)}; } template >> [[nodiscard]] constexpr auto operator+(E const& e) { auto x{make_expression(e)}; return x > *x; } template >> [[nodiscard]] constexpr auto operator~(E const& e) { return e | eps; } template >> [[nodiscard]] constexpr auto operator--(E const& e) { return cut > e; } @@ -1204,7 +1229,7 @@ inline constexpr struct template struct capture_to { - Target& target; + Target* target; template >> [[nodiscard]] constexpr auto operator[](E const& e) const noexcept { return capture_to_expression{make_expression(e), target}; } }; template @@ -1213,7 +1238,7 @@ inline constexpr struct Action action; template >> [[nodiscard]] constexpr auto operator[](E const& e) const noexcept { return e < action; } }; - template >> [[nodiscard]] constexpr capture_to operator()(Target& t) const noexcept { return capture_to{t}; } + template >> [[nodiscard]] constexpr capture_to operator()(Target& t) const noexcept { return capture_to{std::addressof(t)}; } template >> [[nodiscard]] constexpr capture_with> operator()(Action&& a) const noexcept { return capture_with>{std::forward(a)}; } } capture{}; @@ -1251,14 +1276,33 @@ class implicit_space_rule { std::function prev_rule_; std::weak_ptr> implicit_space_ref_; + public: - template >> implicit_space_rule(E const& e) : prev_rule_{std::exchange(*grammar::implicit_space, 
std::function{make_space_expression(e)})}, implicit_space_ref_{grammar::implicit_space} {} - ~implicit_space_rule() { if (auto const implicit_space = implicit_space_ref_.lock(); implicit_space) { *implicit_space = std::move(prev_rule_); } } + template >> + implicit_space_rule(E const& e) // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + : prev_rule_{std::exchange(*grammar::implicit_space(), std::function{make_space_expression(e)})} + , implicit_space_ref_{grammar::implicit_space()} + {} + + ~implicit_space_rule() + { + if (auto const implicit_space_instance = implicit_space_ref_.lock(); implicit_space_instance) + *implicit_space_instance = std::move(prev_rule_); + } + + implicit_space_rule(implicit_space_rule const&) = delete; + implicit_space_rule(implicit_space_rule&&) = delete; + implicit_space_rule& operator=(implicit_space_rule const&) = delete; + implicit_space_rule& operator=(implicit_space_rule&&) = delete; }; } // namespace language -inline thread_local std::shared_ptr> const grammar::implicit_space{std::make_shared>(make_space_expression(language::operator*(language::space)))}; +[[nodiscard]] inline std::shared_ptr> const& grammar::implicit_space() +{ + static thread_local std::shared_ptr> const instance{std::make_shared>(make_space_expression(language::operator*(language::space)))}; + return instance; +} [[nodiscard]] inline grammar start(rule const& start_rule) { @@ -1278,11 +1322,14 @@ inline thread_local std::shared_ptr> const grammar grprogram.concatenate(*subprogram); grprogram.instructions.emplace_back(opcode::ret, operands::none, immediate{0}); if (auto top_rule = callstack.back().first; top_rule) { - for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { + for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { // NOLINT(performance-for-range-copy) calls.emplace_back(callee_program, address + instr_offset); - if (callee_rule && (mode & directives::eps) != directives::none 
&& detail::escaping_find_if( - callstack.crbegin(), callstack.crend(), [rule = callee_rule](auto& caller) { - return caller.first == rule ? 1 : (caller.second ? 0 : -1); }) != callstack.crend()) { + if ((callee_rule != nullptr) && ((mode & directives::eps) != directives::none) && + detail::escaping_find_if(callstack.crbegin(), callstack.crend(), [callee = callee_rule](auto const& caller) { + if (caller.first == callee) + return 1; + return (caller.second ? 0 : -1); + }) != callstack.crend()) { left_recursive.insert(callee_program); } else { auto callee_callstack = callstack; @@ -1295,7 +1342,7 @@ inline thread_local std::shared_ptr> const grammar } while (!unprocessed.empty()); for (auto [subprogram, instr_addr] : calls) { if (auto& iprefix = grprogram.instructions[static_cast(instr_addr)]; iprefix.pf.op == opcode::call) - iprefix.pf.val = left_recursive.count(subprogram) != 0 ? (iprefix.pf.val != 0 ? iprefix.pf.val : 1) : 0; + iprefix.pf.val = (left_recursive.count(subprogram) != 0) ? (std::max)(iprefix.pf.val, static_cast(1)) : 0; auto& ioffset = grprogram.instructions[static_cast(instr_addr + 1)]; auto const rel_addr = ioffset.off + addresses[subprogram] - (instr_addr + 2); detail::assure_in_range(rel_addr, std::numeric_limits::lowest(), (std::numeric_limits::max)()); @@ -1304,7 +1351,7 @@ inline thread_local std::shared_ptr> const grammar return grammar{std::move(grprogram)}; } -enum class source_options : unsigned int { none = 0, interactive = 0x01, is_bitfield_enum }; +enum class source_options : std::uint_least8_t { none = 0, interactive = 0x01, is_bitfield_enum }; namespace detail { @@ -1386,7 +1433,7 @@ class string_view_input_source struct parser_registers { - std::size_t sr, mr, rc; std::ptrdiff_t pc; std::size_t fc; + std::size_t sr{0}; std::size_t mr{0}; std::size_t rc{0}; std::ptrdiff_t pc{0}; std::size_t fc{0}; [[nodiscard]] auto as_tuple() noexcept { return std::forward_as_tuple(sr, mr, rc, pc, fc); } [[nodiscard]] auto as_tuple() const noexcept { 
return std::forward_as_tuple(sr, mr, rc, pc, fc); } }; @@ -1396,11 +1443,12 @@ class basic_parser { enum class stack_frame_type : unsigned char { backtrack, call, capture, condition, lrcall, symbol_definition, symbol_table }; enum class subject_location : std::size_t {}; - struct lrmemo { std::size_t srr, sra, prec; std::ptrdiff_t pcr, pca; std::size_t rcr; std::vector responses; }; - static inline constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); - static inline constexpr std::size_t max_size = (std::numeric_limits::max)(); - lug::grammar const& grammar_; - lug::environment& environment_; + struct response { unsigned short call_depth; unsigned short action_index; syntax_range range; }; + struct lrmemo { std::size_t srr{0}; std::size_t sra{0}; std::size_t prec{0}; std::ptrdiff_t pcr{0}; std::ptrdiff_t pca{0}; std::size_t rcr{0}; std::vector responses; }; + static constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); + static constexpr std::size_t max_size = (std::numeric_limits::max)(); + lug::grammar const* grammar_; + lug::environment* environment_; InputSource input_source_; std::unordered_map casefolded_subjects_; parser_registers registers_{0, 0, 0, 0, 0}; @@ -1414,7 +1462,7 @@ class basic_parser std::vector> symbol_definition_stack_; // name, sr std::vector>> symbol_table_stack_; std::vector lrmemo_stack_; - std::vector responses_; + std::vector responses_; [[nodiscard]] bool available(std::size_t sr, std::size_t sn) { @@ -1440,16 +1488,16 @@ class basic_parser [[nodiscard]] bool casefold_compare(std::size_t sr, std::size_t sn, std::string_view str) { - auto& subject = casefolded_subjects_[sr]; + std::string& subject = casefolded_subjects_[sr]; if (subject.size() < sn) subject = utf8::tocasefold(input_source_.buffer().substr(sr, sn)); return subject.compare(0, sn, str) == 0; } template - [[nodiscard]] bool match_sequence(std::size_t& sr, std::string_view str, Compare&& comp) + [[nodiscard]] bool match_sequence(std::size_t& sr, 
std::string_view str, Compare const& comp) { - if (auto sn = str.size(); !sn || (available(sr, sn) && comp(*this, sr, sn, str))) { + if (std::size_t const sn = str.size(); !sn || (available(sr, sn) && comp(*this, sr, sn, str))) { sr += sn; return true; } @@ -1457,7 +1505,7 @@ class basic_parser } template - [[nodiscard]] bool match_single(std::size_t& sr, Match&& match) + [[nodiscard]] bool match_single(std::size_t& sr, Match const& match) { if (!available(sr, 1)) return false; @@ -1465,7 +1513,7 @@ class basic_parser auto const curr = buffer.cbegin() + static_cast(sr); auto const last = buffer.cend(); auto [next, rune] = utf8::decode_rune(curr, last); - bool matched; + bool matched = false; if constexpr (std::is_invocable_v) { matched = match(curr, last, next, rune); } else if constexpr(std::is_invocable_v) { @@ -1474,7 +1522,7 @@ class basic_parser matched = match(rune); } else { matched = match(); - detail::ignore(rune); + std::ignore = rune; } if (matched) sr += static_cast(std::distance(curr, next)); @@ -1482,9 +1530,9 @@ class basic_parser } template - [[nodiscard]] bool match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify&& mod, Compare&& comp) + [[nodiscard]] bool match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); if (std::size_t tsr = sr; std::all_of(symbols.begin(), symbols.end(), [&tsr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(tsr, mod(symbol), comp); })) { sr = tsr; return true; @@ -1493,23 +1541,23 @@ class basic_parser } template - [[nodiscard]] bool match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify&& mod, Compare&& comp) + [[nodiscard]] bool match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { - auto const& symbols = 
environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return std::any_of(symbols.begin(), symbols.end(), [&sr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(sr, mod(symbol), comp); }); } template [[nodiscard]] bool match_symbol_head(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbol_index]), std::forward(comp)) : false; } template [[nodiscard]] bool match_symbol_tail(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : false; } @@ -1521,7 +1569,7 @@ class basic_parser if constexpr (Opcode == opcode::commit_partial) { detail::make_tuple_view<0, 1>(backtrack_stack_.back()) = {sr, rc}; } else { - detail::ignore(sr, rc); + std::ignore = std::tie(sr, rc); if constexpr (Opcode == opcode::commit_back) sr = std::get<0>(backtrack_stack_.back()); pop_stack_frame(backtrack_stack_); @@ -1534,24 +1582,24 @@ class basic_parser { registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; auto const full_match = match(); - auto const prior_call_depth = environment_.start_accept(full_match, subject()); - detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_.end_accept(prior_call_depth); }}; - auto const& actions = grammar_.program().actions; - auto const& captures = grammar_.program().captures; - for (auto& response : responses_) { - if (environment_.prune_depth() <= response.call_depth) + auto const prior_call_depth = 
environment_->start_accept(full_match, subject()); + detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_->end_accept(prior_call_depth); }}; + auto const& actions = grammar_->program().actions; + auto const& captures = grammar_->program().captures; + for (auto& resp : responses_) { + if (environment_->prune_depth() <= resp.call_depth) continue; - environment_.reset_call_depth(response.call_depth); - if (response.range.index < max_size) - captures[response.action_index](environment_, syntax{full_match.substr(response.range.index, response.range.size), response.range.index}); + environment_->reset_call_depth(resp.call_depth); + if (resp.range.index < max_size) + captures[resp.action_index](*environment_, syntax{full_match.substr(resp.range.index, resp.range.size), resp.range.index}); else - actions[response.action_index](environment_); + actions[resp.action_index](*environment_); } } [[nodiscard]] auto drain() { - environment_.reset_origin(); + environment_->reset_origin(); input_source_.drain_buffer(registers_.sr); casefolded_subjects_.clear(); responses_.clear(); @@ -1569,7 +1617,7 @@ class basic_parser [[nodiscard]] auto drop_responses_after(std::size_t n) { - std::vector dropped; + std::vector dropped; if (n < responses_.size()) { dropped.assign(responses_.begin() + static_cast(n), responses_.end()); responses_.resize(n); @@ -1577,7 +1625,7 @@ class basic_parser return dropped; } - [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) + [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) { pop_responses_after(n); responses_.insert(responses_.end(), restore.begin(), restore.end()); @@ -1601,18 +1649,18 @@ class basic_parser } public: - basic_parser(lug::grammar const& g, lug::environment& e) : grammar_{g}, environment_{e} {} - [[nodiscard]] lug::grammar const& grammar() const noexcept { return grammar_; } - [[nodiscard]] lug::environment& environment() const noexcept { return 
environment_; } + basic_parser(lug::grammar const& g, lug::environment& e) : grammar_{&g}, environment_{&e} {} + [[nodiscard]] lug::grammar const& grammar() const noexcept { return *grammar_; } + [[nodiscard]] lug::environment& environment() const noexcept { return *environment_; } [[nodiscard]] std::string_view match() const noexcept { return input_source_.buffer().substr(0, registers_.sr); } [[nodiscard]] std::string_view subject() const noexcept { return input_source_.buffer().substr(registers_.sr, input_source_.buffer().size() - registers_.sr); } [[nodiscard]] std::size_t subject_index() const noexcept { return registers_.sr; } [[nodiscard]] std::size_t max_subject_index() const noexcept { return registers_.mr; } - [[nodiscard]] syntax_position subject_position() { return environment_.position_at(registers_.sr); } - [[nodiscard]] syntax_position max_subject_position() { return environment_.position_at(registers_.mr); } - [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_.position_at(index); } - [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_.position_at(range.index); } - [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_.position_at(range.index + range.size); } + [[nodiscard]] syntax_position subject_position() { return environment_->position_at(registers_.sr); } + [[nodiscard]] syntax_position max_subject_position() { return environment_->position_at(registers_.mr); } + [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_->position_at(index); } + [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_->position_at(range.index); } + [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_->position_at(range.index + range.size); } [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), 
position_end(range)}; } [[nodiscard]] parser_registers& registers() noexcept { return registers_; } [[nodiscard]] parser_registers const& registers() const noexcept { return registers_; } @@ -1649,12 +1697,14 @@ class basic_parser bool parse() { detail::reentrancy_sentinel const guard{parsing_}; - program const& prog = grammar_.program(); + program const& prog = grammar_->program(); if (prog.instructions.empty()) throw bad_grammar{}; auto [sr, mr, rc, pc, fc] = drain(); - bool result = false, done = false; - pc = 0, fc = 0; + bool result = false; + bool done = false; + pc = 0; + fc = 0; while (!done) { auto [op, imm, off, str] = instruction::decode(prog.instructions, pc); switch (op) { @@ -1668,7 +1718,7 @@ class basic_parser } break; case opcode::match_any: { if constexpr (detail::input_source_has_options::value) { - if (((imm & 0x8000) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) + if (((imm & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) goto failure; } if (!match_single(sr, []{ return true; })) @@ -1723,16 +1773,19 @@ class basic_parser } break; case opcode::call: { if (imm != 0) { - auto const memo = detail::escaping_find_if(lrmemo_stack_.crbegin(), lrmemo_stack_.crend(), - [srr = sr, pca = pc + off](auto const& m){ return m.srr == srr && m.pca == pca ? 1 : (m.srr < srr ? 0 : -1); }); + auto const memo = detail::escaping_find_if(lrmemo_stack_.crbegin(), lrmemo_stack_.crend(), [srr = sr, pca = pc + off](auto const& m) { + if ((m.srr == srr) && (m.pca == pca)) + return 1; + return ((m.srr < srr) ? 
0 : -1); + }); if (memo != lrmemo_stack_.crend()) { - if (memo->sra == lrfailcode || imm < memo->prec) + if ((memo->sra == lrfailcode) || (imm < memo->prec)) goto failure; sr = memo->sra, rc = restore_responses_after(rc, memo->responses); continue; } stack_frames_.push_back(stack_frame_type::lrcall); - lrmemo_stack_.push_back({sr, lrfailcode, imm, pc, pc + off, rc, std::vector{}}); + lrmemo_stack_.push_back({sr, lrfailcode, imm, pc, pc + off, rc, std::vector{}}); } else { stack_frames_.push_back(stack_frame_type::call); call_stack_.push_back(pc); @@ -1749,7 +1802,7 @@ class basic_parser } break; case stack_frame_type::lrcall: { auto& memo = lrmemo_stack_.back(); - if (memo.sra == lrfailcode || sr > memo.sra) { + if ((memo.sra == lrfailcode) || (sr > memo.sra)) { memo.sra = sr, memo.responses = drop_responses_after(memo.rcr); sr = memo.srr, pc = memo.pca, rc = memo.rcr; continue; @@ -1781,7 +1834,7 @@ class basic_parser } break; case stack_frame_type::condition: { auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_.set_condition(cond_name, cond_value); + environment_->set_condition(cond_name, cond_value); pop_stack_frame(condition_stack_), ++fc; } break; case stack_frame_type::lrcall: { @@ -1795,7 +1848,7 @@ class basic_parser pop_stack_frame(symbol_definition_stack_), ++fc; } break; case stack_frame_type::symbol_table: { - environment_.symbols_.swap(symbol_table_stack_.back()); + environment_->symbols_.swap(symbol_table_stack_.back()); pop_stack_frame(symbol_table_stack_), ++fc; } break; default: break; @@ -1804,7 +1857,7 @@ class basic_parser pop_responses_after(rc); } break; case opcode::accept: { - if (cut_deferred_ = !capture_stack_.empty() || !lrmemo_stack_.empty(); !cut_deferred_) { + if (cut_deferred_ = (!capture_stack_.empty() || !lrmemo_stack_.empty()); !cut_deferred_) { accept(sr, mr, rc, pc); std::tie(sr, mr, rc, pc, std::ignore) = drain(); } @@ -1818,8 +1871,8 @@ class basic_parser } break; case opcode::predicate: { registers_ 
= {sr, (std::max)(mr, sr), rc, pc, 0}; - environment_.reset_match_and_subject(match(), subject()); - bool const accepted = prog.predicates[imm](environment_); + environment_->reset_match_and_subject(match(), subject()); + bool const accepted = prog.predicates[imm](*environment_); std::tie(sr, mr, rc, pc, fc) = registers_.as_tuple(); pop_responses_after(rc); if (!accepted) @@ -1832,29 +1885,30 @@ class basic_parser case opcode::capture_end: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::capture)) goto failure; - auto const sr0 = static_cast(capture_stack_.back()), sr1 = sr; + auto const sr0 = static_cast(capture_stack_.back()); + auto const sr1 = sr; pop_stack_frame(capture_stack_, sr, mr, rc, pc); if (sr0 > sr1) goto failure; rc = push_response(call_stack_.size() + lrmemo_stack_.size(), imm, {sr0, sr1 - sr0}); } break; case opcode::condition_test: { - if (environment_.has_condition(str) != (imm != 0)) + if (environment_->has_condition(str) != (imm != 0)) goto failure; } break; case opcode::condition_push: { stack_frames_.push_back(stack_frame_type::condition); - condition_stack_.emplace_back(str, environment_.set_condition(str, imm != 0)); + condition_stack_.emplace_back(str, environment_->set_condition(str, imm != 0)); } break; case opcode::condition_pop: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::condition)) goto failure; auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_.set_condition(cond_name, cond_value); + environment_->set_condition(cond_name, cond_value); pop_stack_frame(condition_stack_); } break; case opcode::symbol_exists: { - if (environment_.has_symbol(str) != (imm != 0)) + if (environment_->has_symbol(str) != (imm != 0)) goto failure; } break; case opcode::symbol_all: { @@ -1897,24 +1951,25 @@ class basic_parser if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_definition)) goto failure; auto const [symbol_name, symbol_sr] = 
symbol_definition_stack_.back(); - auto const sr0 = static_cast(symbol_sr), sr1 = sr; + auto const sr0 = static_cast(symbol_sr); + auto const sr1 = sr; pop_stack_frame(symbol_definition_stack_); if (sr0 > sr1) goto failure; - environment_.add_symbol(symbol_name, std::string{match().substr(sr0, sr1 - sr0)}); + environment_->add_symbol(symbol_name, std::string{match().substr(sr0, sr1 - sr0)}); } break; case opcode::symbol_push: { stack_frames_.push_back(stack_frame_type::symbol_table); - symbol_table_stack_.emplace_back(environment_.symbols_); + symbol_table_stack_.emplace_back(environment_->symbols_); if (imm == 1) - environment_.symbols_.erase(str); + environment_->symbols_.erase(str); else if (imm == 2) - environment_.symbols_.clear(); + environment_->symbols_.clear(); } break; case opcode::symbol_pop: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_table)) goto failure; - environment_.symbols_.swap(symbol_table_stack_.back()); + environment_->symbols_.swap(symbol_table_stack_.back()); pop_stack_frame(symbol_table_stack_); } break; default: registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; throw bad_opcode{}; @@ -1982,15 +2037,17 @@ LUG_DIAGNOSTIC_PUSH_AND_IGNORE [[nodiscard]] inline grammar basic_regular_expression::make_grammar() { using namespace language; - implicit_space_rule default_space = nop; - rule Empty = eps <[](generator& g) { g.encoder.match_eps(); }; - rule Dot = chr('.') <[](generator& g) { g.encoder.match_any(); }; - rule Element = any > chr('-') > !chr(']') > any <[](generator& g, syntax const& x) { g.bracket_range(x.capture()); } - | str("[:") > +(!chr(':') > any) > str(":]") <[](generator& g, syntax const& x) { g.bracket_class(x.capture().substr(2, x.range().size - 4)); } - | any <[](generator& g, syntax const& x) { g.bracket_range(x.capture(), x.capture()); }; - rule Bracket = chr('[') > ~(chr('^') <[](generator& g) { g.circumflex = true; }) - > Element > *(!chr(']') > Element) > chr(']') <[](generator& g) { 
g.bracket_commit(); }; - rule Sequence = +(!(chr('.') | chr('[')) > any) <[](generator& g, syntax const& x) { g.encoder.match(x.capture()); }; + implicit_space_rule const default_space = nop; + // NOLINTBEGIN(bugprone-chained-comparison) + rule const Empty = eps <[](generator& g) { g.encoder.match_eps(); }; + rule const Dot = chr('.') <[](generator& g) { g.encoder.match_any(); }; + rule const Element = any > chr('-') > !chr(']') > any <[](generator& g, syntax const& x) { g.bracket_range(x.str()); } + | str("[:") > +(!chr(':') > any) > str(":]") <[](generator& g, syntax const& x) { g.bracket_class(x.str().substr(2, x.range().size - 4)); } + | any <[](generator& g, syntax const& x) { g.bracket_range(x.str(), x.str()); }; + rule const Bracket = chr('[') > ~(chr('^') <[](generator& g) { g.circumflex = true; }) + > Element > *(!chr(']') > Element) > chr(']') <[](generator& g) { g.bracket_commit(); }; + rule const Sequence = +(!(chr('.') | chr('[')) > any) <[](generator& g, syntax const& x) { g.encoder.match(x.str()); }; + // NOLINTEND(bugprone-chained-comparison) return start((+(Dot | Bracket | Sequence) | Empty) > eoi); } diff --git a/samples/basic/basic.cpp b/samples/basic/basic.cpp index 335f937..c3567fa 100644 --- a/samples/basic/basic.cpp +++ b/samples/basic/basic.cpp @@ -37,12 +37,12 @@ class basic_interpreter rule NL = lexeme["\n"_sx | "\r\n" | "\r"]; rule Delim = lexeme[","_sx | ";"]; - rule LineNo = lexeme[capture(stx_)[+"[0-9]"_rx]] <[this]{ return std::stoi(std::string{stx_}); }; - rule Real = lexeme[capture(stx_)[+"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) - > ~("[Ee]"_rx > ~"[+-]"_rx > +"[0-9]"_rx)]] <[this]{ return std::stod(std::string{stx_}); }; - rule String = lexeme["\"" > capture(stx_)[*"[^\"]"_rx] > "\""] <[this]{ return stx_.capture(); }; - rule Var = lexeme[capture(stx_)["[A-Za-z]"_rx > ~"[0-9]"_rx]] <[this]{ return lug::utf8::toupper(stx_); }; - rule Fn = lexeme["FN"_isx > capture(stx_)["[A-Za-z]"_rx]] <[this]{ return lug::utf8::toupper(stx_); }; + rule 
LineNo = lexeme[capture(tok_)[+"[0-9]"_rx]] <[this]{ return std::stoi(std::string{tok_}); }; + rule Real = lexeme[capture(tok_)[+"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) + > ~("[Ee]"_rx > ~"[+-]"_rx > +"[0-9]"_rx)]] <[this]{ return std::stod(std::string{tok_}); }; + rule String = lexeme["\"" > capture(tok_)[*"[^\"]"_rx] > "\""] <[this]{ return tok_.str(); }; + rule Var = lexeme[capture(tok_)["[A-Za-z]"_rx > ~"[0-9]"_rx]] <[this]{ return lug::utf8::toupper(tok_); }; + rule Fn = lexeme["FN"_isx > capture(tok_)["[A-Za-z]"_rx]] <[this]{ return lug::utf8::toupper(tok_); }; rule RelOp = "=" <[]() -> RelOpFn { return [](double x, double y) { return x == y; }; } | ">=" <[]() -> RelOpFn { return std::isgreaterequal; } @@ -110,7 +110,7 @@ class basic_interpreter | "GOTO"_isx > no_%LineNo <[this]{ goto_line(no_); } | "DEF"_isx > fn_%Fn > "(" > id_%Var > ")" - > "=" > capture(stx_)[*(!NL > any)] <[this]{ fn_param_body_[fn_] = { id_, std::string{stx_} }; } + > "=" > capture(tok_)[*(!NL > any)] <[this]{ fn_param_body_[fn_] = { id_, std::string{tok_} }; } | "LET"_isx > ref_%Ref > "=" > r1_%Expr <[this]{ *ref_ = r1_; } | "DIM"_isx > DimEl > *(Delim > DimEl) | "RESTORE"_isx <[this]{ read_itr_ = data_.cbegin(); } @@ -137,7 +137,7 @@ class basic_interpreter rule Line = Stmnt > ~Rem > NL | Cmnd > ~Rem > NL | no_%LineNo - > capture(stx_)[*(!NL > any) > NL] <[this]{ update_line(no_, stx_); } + > capture(tok_)[*(!NL > any) > NL] <[this]{ update_line(no_, tok_); } | Rem > NL | NL | ( *(!NL > any) > NL ) <[this]{ print_error("ILLEGAL FORMULA"); }; @@ -423,7 +423,7 @@ class basic_interpreter lug::environment environment_; std::string fn_; std::string id_; - lug::syntax stx_; + lug::syntax tok_; std::string_view txt_; double r1_{0.0}; double r2_{0.0}; diff --git a/samples/calc/calc.cpp b/samples/calc/calc.cpp index 8581763..e0acd09 100644 --- a/samples/calc/calc.cpp +++ b/samples/calc/calc.cpp @@ -11,9 +11,8 @@ namespace samples::calc { using namespace lug::language; -lug::syntax m; -double e, 
l, n, r, s; int i; +double e, l, n, r, s; double v[26]; extern rule Expr; @@ -21,22 +20,24 @@ extern rule Expr; implicit_space_rule BLANK = lexeme[ *"[ \t]"_rx ]; rule EOL = lexeme[ "[\n\r;]"_rx ]; -rule ID = lexeme[ capture(m)[ "[a-z]"_rx ] <[]() -> int { return m.capture().at(0) - 'a'; } ]; -rule NUMBER = lexeme[ capture(m)[ ~"[-+]"_rx > +"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) ] <[]{ return std::stod(std::string{m}); } ]; +rule ID = lexeme[ "[a-z]"_rx <[](syntax m) -> int { return m.str().at(0) - 'a'; } ]; +rule NUMBER = lexeme[ ( ~"[-+]"_rx > +"[0-9]"_rx > ~('.' > +"[0-9]"_rx) ) + <[](syntax m) -> double { return std::stod(std::string{m}); } ]; rule Value = n%NUMBER <[]{ return n; } | i%ID > !"="_sx <[]{ return v[i]; } - | "(" > e%Expr > ")" <[]{ return e; }; + | '(' > e%Expr > ')' <[]{ return e; }; rule Prod = l%Value > *( - "*" > r%Value <[]{ l *= r; } - | "/" > r%Value <[]{ l /= r; } + '*' > r%Value <[]{ l *= r; } + | '/' > r%Value <[]{ l /= r; } ) <[]{ return l; }; rule Sum = l%Prod > *( - "+" > r%Prod <[]{ l += r; } - | "-" > r%Prod <[]{ l -= r; } + '+' > r%Prod <[]{ l += r; } + | '-' > r%Prod <[]{ l -= r; } ) <[]{ return l; }; -rule Expr = i%ID > "=" > s%Sum <[]{ return v[i] = s; } +rule Expr = i%ID > '=' > s%Sum <[]{ return v[i] = s; } | s%Sum <[]{ return s; }; -rule Stmt = ( "quit"_isx <[]{ std::exit(EXIT_SUCCESS); } +rule Stmt = ( ( "exit"_isx + | "quit"_isx ) <[]{ std::exit(EXIT_SUCCESS); } | e%Expr <[]{ std::cout << e << "\n"; } ) > EOL | *( !EOL > any ) > EOL <[]{ std::cerr << "SYNTAX ERROR\n"; }; diff --git a/tests/captures.cpp b/tests/captures.cpp index e526bce..9bba93e 100644 --- a/tests/captures.cpp +++ b/tests/captures.cpp @@ -42,22 +42,22 @@ void test_capture_email_syntax() std::string_view const email = "user@example.com"; assert(lug::parse(email, G)); - assert(username.capture() == "user"); - assert(domain.capture() == "example"); - assert(tld.capture() == "com"); - assert(username.capture().data() == email.data()); - 
assert(domain.capture().data() == email.substr(5).data()); - assert(tld.capture().data() == email.substr(13).data()); + assert(username.str() == "user"); + assert(domain.str() == "example"); + assert(tld.str() == "com"); + assert(username.str().data() == email.data()); + assert(domain.str().data() == email.substr(5).data()); + assert(tld.str().data() == email.substr(13).data()); std::string const email2 = "not.an@email"; assert(!lug::parse(email2, G)); // failure to parse the above should not change captures, as no semantic actions should be executed - assert(username.capture() == "user"); - assert(domain.capture() == "example"); - assert(tld.capture() == "com"); - assert(username.capture().data() == email.data()); - assert(domain.capture().data() == email.substr(5).data()); - assert(tld.capture().data() == email.substr(13).data()); + assert(username.str() == "user"); + assert(domain.str() == "example"); + assert(tld.str() == "com"); + assert(username.str().data() == email.data()); + assert(domain.str().data() == email.substr(5).data()); + assert(tld.str().data() == email.substr(13).data()); } void test_capture_url_syntax() @@ -78,29 +78,29 @@ void test_capture_url_syntax() assert(lug::parse(url1, G)); assert(protocol == "https"); assert(domain == "www.example.com"); - assert(path.capture() == "/path/to/resource"); + assert(path.str() == "/path/to/resource"); assert(protocol.data() == url1.data()); assert(domain.data() != url1.data()); // std::string makes a copy - assert(path.capture().data() == url1.substr(23).data()); + assert(path.str().data() == url1.substr(23).data()); std::string const url2 = "http://api.example2.com/path/to/other/resource.html"; assert(lug::parse(url2, G)); assert(protocol == "http"); assert(domain == "api.example2.com"); - assert(path.capture() == "/path/to/other/resource.html"); + assert(path.str() == "/path/to/other/resource.html"); assert(protocol.data() == url2.c_str()); assert(domain.data() != url2.data()); // std::string makes a copy 
- assert(path.capture().data() == &url2[23]); + assert(path.str().data() == &url2[23]); std::string const url3 = "https://www.example3.com$path/to/resource"; assert(!lug::parse(url3, G)); // failure to parse the above should not change captures, as no semantic actions should be executed assert(protocol == "http"); assert(domain == "api.example2.com"); - assert(path.capture() == "/path/to/other/resource.html"); + assert(path.str() == "/path/to/other/resource.html"); assert(protocol.data() == url2.c_str()); assert(domain.data() != url2.data()); // std::string makes a copy - assert(path.capture().data() == &url2[23]); + assert(path.str().data() == &url2[23]); } void test_capture_comma_delimited_list() @@ -121,7 +121,7 @@ void test_capture_comma_delimited_list() assert(items[0] == "apple"); assert(items[1] == "banana"); assert(items[2] == "cherry"); - assert(item.capture() == "cherry"); // item should capture the last item parsed + assert(item.str() == "cherry"); // item should capture the last item parsed items.clear(); std::string const list2 = "123 , 456 ,789,987"; @@ -131,14 +131,14 @@ void test_capture_comma_delimited_list() assert(items[1] == "456"); assert(items[2] == "789"); assert(items[3] == "987"); - assert(item.capture() == "987"); // item should capture the last item parsed + assert(item.str() == "987"); // item should capture the last item parsed items.clear(); std::string_view const list3 = "one_single-item"; assert(lug::parse(list3, G)); assert(items.size() == 1); assert(items[0] == "one_single-item"); - assert(item.capture() == "one_single-item"); // item should capture the last item parsed + assert(item.str() == "one_single-item"); // item should capture the last item parsed // Test with an invalid list (no items) std::string const list4 = ""; @@ -146,7 +146,7 @@ void test_capture_comma_delimited_list() // After failing to parse, items should remain unchanged from the last successful parse assert(items.size() == 1); assert(items[0] == 
"one_single-item"); - assert(item.capture() == "one_single-item"); + assert(item.str() == "one_single-item"); } void test_capture_nested_calls() diff --git a/tests/leftrecursion.cpp b/tests/leftrecursion.cpp index 1d05e15..ccf631e 100644 --- a/tests/leftrecursion.cpp +++ b/tests/leftrecursion.cpp @@ -34,7 +34,7 @@ void test_indirect_left_recursion() Q = R > chr('a'); R = Q | chr('a'); S = R > !chr('a'); - grammar G = start(S); + grammar const G = start(S); assert(lug::parse("a", G)); assert(lug::parse("aa", G)); assert(lug::parse("aab", G)); @@ -55,9 +55,9 @@ void test_association_and_precedence() N = chr('1') | chr('2') | chr('3'); E = E[1] > chr('+') > E[2] <[&out]{ out += '+'; } | E[2] > chr('*') > E[3] <[&out]{ out += '*'; } - | N <[&out](syntax x){ out += x.capture(); }; + | N <[&out](syntax x){ out += x.str(); }; S = E > eoi; - grammar G = start(S); + grammar const G = start(S); out.clear(); assert(lug::parse("1", G) && out == "1"); out.clear(); From 64536767789201b187d7f002c957eb725bc8374d Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 00:13:19 -0700 Subject: [PATCH 09/19] Fix clang-tidy warnings --- .clang-tidy | 2 + Makefile | 16 ++-- lug/error.hpp | 2 +- lug/unicode.hpp | 192 +++++++++++++++++++++--------------------- lug/utf8.hpp | 67 +++++++++------ tools/makeunicode.cpp | 168 ++++++++++++++++++------------------ 6 files changed, 235 insertions(+), 212 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 8f8f033..a66ba9b 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -8,6 +8,7 @@ Checks: - -cert-dcl21-cpp - concurrency-* - cppcoreguidelines-* + - -cppcoreguidelines-avoid-magic-numbers - darwin-* - hicpp-* - -hicpp-braces-around-statements @@ -21,6 +22,7 @@ Checks: - readability-* - -readability-braces-around-statements - -readability-identifier-length + - -readability-magic-numbers WarningsAsErrors: '' HeaderFileExtensions: - '' diff --git a/Makefile b/Makefile index 8f99593..7aea5bf 100644 --- a/Makefile +++ b/Makefile @@ -10,7 
+10,7 @@ PREFIX = /usr/local # toolchain CXXSTD = -std=c++17 CXXFLAGS = $(CXXSTD) -pedantic -Wall -Wconversion -Wextra -Wextra-semi -Wshadow -Wsign-conversion -Wsuggest-override -Wno-parentheses -Wno-logical-not-parentheses \ - -Os -ffunction-sections -fdata-sections -I. $$(if [ "$(CI_BUILD)" = "1" ]; then echo "-Werror"; fi) + -Os -ffunction-sections -fdata-sections -I. LDFLAGS = $(CXXSTD) -s CLANGTIDY = clang-tidy @@ -32,8 +32,8 @@ TOOLS = makeunicode TOOLS_BIN = $(TOOLS:%=tools/%) TOOLS_OBJ = $(TOOLS:%=tools/%.o) -# dependencies -DEPS = lug/lug.hpp lug/detail.hpp lug/error.hpp lug/unicode.hpp lug/utf8.hpp +# header dependencies +HEADERS = lug/detail.hpp lug/error.hpp lug/unicode.hpp lug/utf8.hpp lug/lug.hpp # distribution files DISTFILES = CHANGELOG.md LICENSE.md README.md CMakeLists.txt Makefile runtests.sh .clang-tidy .editorconfig .gitattributes .gitignore .github/ doc/ lug/ samples/ tests/ tools/ @@ -42,9 +42,9 @@ all: options samples tests .cpp.o: @echo CXX $< - @$(CXX) -c $(CXXFLAGS) -o $@ $< + @$(CXX) -c $(CXXFLAGS) $$(if [ "$(CI_BUILD)" = "1" ]; then echo "-Werror"; fi) -o $@ $< -$(SAMPLES_OBJ): $(DEPS) +$(SAMPLES_OBJ): $(HEADERS) $(SAMPLES_BIN): $(SAMPLES_OBJ) @echo LD $@ @@ -52,7 +52,7 @@ $(SAMPLES_BIN): $(SAMPLES_OBJ) samples: $(SAMPLES_BIN) -$(TESTS_OBJ): $(DEPS) +$(TESTS_OBJ): $(HEADERS) $(TESTS_BIN): $(TESTS_OBJ) @echo LD $@ @@ -64,9 +64,9 @@ check: tests @sh runtests.sh "tests" $(TESTS_BIN) lint: - @$(CLANGTIDY) --quiet $(CXXFLAGS:%=--extra-arg=%) lug/detail.hpp + @$(CLANGTIDY) --quiet $(CXXFLAGS:%=--extra-arg=%) $(HEADERS) -$(TOOLS_OBJ): $(DEPS) +$(TOOLS_OBJ): $(HEADERS) $(TOOLS_BIN): $(TOOLS_OBJ) @echo LD $@ diff --git a/lug/error.hpp b/lug/error.hpp index dfc81dc..7138a92 100644 --- a/lug/error.hpp +++ b/lug/error.hpp @@ -18,7 +18,7 @@ class reenterant_read_error : public lug_error { public: reenterant_read_error() class parse_context_error : public lug_error { public: parse_context_error() : lug_error{"operation valid only inside calling 
context of parser::parse" } {} }; class accept_context_error : public lug_error{ public: accept_context_error() : lug_error{"operation valid only inside calling context of parser::accept"} {} }; class attribute_stack_error : public lug_error{ public: attribute_stack_error() : lug_error{"incompatible or invalid stack frame"} {} }; -class bad_string_expression : public lug_error { public: bad_string_expression(const std::string& s = "invalid string or bracket expression") : lug_error{s} {} }; +class bad_string_expression : public lug_error { public: explicit bad_string_expression(std::string const& s = "invalid string or bracket expression") : lug_error{s} {} }; class bad_character_class : public bad_string_expression { public: bad_character_class() : bad_string_expression{"invalid character class"} {} }; class bad_character_range : public bad_string_expression { public: bad_character_range() : bad_string_expression{"character range is reversed"} {} }; class bad_grammar : public lug_error { public: bad_grammar() : lug_error{"invalid or empty grammar"} {} }; diff --git a/lug/unicode.hpp b/lug/unicode.hpp index 97a291f..1ea8245 100644 --- a/lug/unicode.hpp +++ b/lug/unicode.hpp @@ -21,6 +21,8 @@ #include #include +// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + namespace lug::unicode { // POSIX compatibility properties @@ -704,7 +706,7 @@ enum class eawtype : std::uint_least8_t }; // Property Traits -enum class property_enum +enum class property_enum : std::uint_least8_t { invalid, ctype, @@ -747,34 +749,34 @@ class record std::array stage2; std::array records; }; - static std::int_least32_t case_mapping(std::size_t index) noexcept; - static std::unique_ptr decompress_table(); + [[nodiscard]] static std::int_least32_t case_mapping(std::size_t index) noexcept; + [[nodiscard]] static std::unique_ptr decompress_table(); friend record query(char32_t r); public: - ctype compatibility() const noexcept { return 
static_cast(raw_->cflags); } - ptype properties() const noexcept { return static_cast(raw_->pflags); } - gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } - sctype script() const noexcept { return static_cast(raw_->scindex); } - blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } - std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } - std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } - std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } - bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } - bool all_of(ptype p) const noexcept { return (properties() & p) == p; } - bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } - bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } - bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } - bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } - bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } - bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } - bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } + [[nodiscard]] ctype compatibility() const noexcept { return static_cast(raw_->cflags); } + [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } + [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } + [[nodiscard]] sctype script() const noexcept { return 
static_cast(raw_->scindex); } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } + [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } + [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } + [[nodiscard]] bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } + [[nodiscard]] bool all_of(ptype p) const noexcept { return (properties() & p) == p; } + [[nodiscard]] bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } + [[nodiscard]] bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } + [[nodiscard]] bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } + [[nodiscard]] bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } + [[nodiscard]] bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } + [[nodiscard]] bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } + [[nodiscard]] bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } }; // Retrieves the UCD record for the given codepoint -inline record query(char32_t r) +[[nodiscard]] inline record query(char32_t r) { static auto const table = record::decompress_table(); std::size_t index = 1901; @@ -786,83 +788,80 @@ inline record query(char32_t r) } // Checks if the rune matches all of the string-packed property classes -inline bool all_of(record 
const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool all_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches any of the string-packed property classes -inline bool any_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool any_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.any_of(lug::detail::string_unpack(str)); 
break; - case property_enum::gctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches none of the string-packed property classes -inline bool none_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool none_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() != lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() != lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() != 
lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() != lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() != lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() != lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() != lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() != lug::detail::string_unpack(str); } - return result; + return false; } // Column width (-1 = non-displayable, 0 = non-spacing, 1 = normal, 2 = wide) -inline int cwidth(char32_t r) +[[nodiscard]] inline int cwidth(char32_t r) { return query(r).cwidth(); } // Absolute column width -inline unsigned int ucwidth(char32_t r) +[[nodiscard]] inline unsigned int ucwidth(char32_t r) { auto const cw = query(r).cwidth(); return static_cast(cw >= 0 ? 
cw : -cw); } // Simple casefold conversion -inline char32_t tocasefold(char32_t r) +[[nodiscard]] inline char32_t tocasefold(char32_t r) { return static_cast(static_cast(r) + query(r).casefold_mapping()); } // Simple lowercase conversion -inline char32_t tolower(char32_t r) +[[nodiscard]] inline char32_t tolower(char32_t r) { return static_cast(static_cast(r) + query(r).lowercase_mapping()); } // Simple uppercase conversion -inline char32_t toupper(char32_t r) +[[nodiscard]] inline char32_t toupper(char32_t r) { return static_cast(static_cast(r) + query(r).uppercase_mapping()); } @@ -893,9 +892,10 @@ inline void push_uniform_casefolded_range(rune_set& runes, ptype props, char32_t inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) { ptype p = query(start).properties(); - char32_t r1 = start, r2 = start; + char32_t r1 = start; + char32_t r2 = start; for (char32_t rn = start + 1; rn <= end; r2 = rn, ++rn) { - ptype q = query(start).properties(); + ptype const q = query(start).properties(); if (((p ^ q) & ptype::Cased) != ptype::None) { detail::push_uniform_casefolded_range(runes, p, r1, r2); r1 = rn; @@ -905,27 +905,27 @@ inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) detail::push_uniform_casefolded_range(runes, p, r1, r2); } -inline rune_set sort_and_optimize(rune_set runes) +[[nodiscard]] inline rune_set sort_and_optimize(rune_set runes) { rune_set optimized_runes; auto out = optimized_runes.end(); std::sort_heap(std::begin(runes), std::end(runes)); - for (auto curr = std::cbegin(runes), last = std::cend(runes); curr != last; ++curr) { - if (out == optimized_runes.end() || curr->first < out->first || out->second < curr->first) - out = optimized_runes.insert(optimized_runes.end(), *curr); + for (auto const& r : runes) { + if (out == optimized_runes.end() || r.first < out->first || out->second < r.first) + out = optimized_runes.insert(optimized_runes.end(), r); else - out->second = out->second < 
curr->second ? curr->second : out->second; + out->second = out->second < r.second ? r.second : out->second; } optimized_runes.shrink_to_fit(); return optimized_runes; } -inline rune_set negate(rune_set const& runes) +[[nodiscard]] inline rune_set negate(rune_set const& runes) { rune_set negated_runes; if (!runes.empty()) { - if (char32_t front = runes.front().first; U'\0' < front) - negated_runes.push_back({U'\0', front - 1}); + if (char32_t const front = runes.front().first; U'\0' < front) + negated_runes.emplace_back(U'\0', front - 1); if (runes.size() > 1) { auto const last = std::cend(runes); auto left = std::cbegin(runes); @@ -933,12 +933,12 @@ inline rune_set negate(rune_set const& runes) auto right = std::next(left); if (right == last) break; - negated_runes.push_back({left->second + 1, right->first - 1}); + negated_runes.emplace_back(left->second + 1, right->first - 1); left = right; } } - if (char32_t back = runes.back().second; back < U'\xFFFFFFFF') - negated_runes.push_back({back + 1, U'\xFFFFFFFF'}); + if (char32_t const back = runes.back().second; back < U'\xFFFFFFFF') + negated_runes.emplace_back(back + 1, U'\xFFFFFFFF'); negated_runes.shrink_to_fit(); } return negated_runes; @@ -946,10 +946,10 @@ inline rune_set negate(rune_set const& runes) namespace detail { -inline std::string normalize_property_label(std::string_view id) +[[nodiscard]] inline std::string normalize_property_label(std::string_view id) { std::string normid; - for (char c : id) + for (char const c : id) if (c != ' ' && c != '\t' && c != '_' && c != '-' && c != '.' 
&& c != ';') normid.push_back(static_cast(std::tolower(c))); return normid; @@ -971,8 +971,8 @@ inline std::optional stoctype(std::string_view s) { "xdigit"sv, ct::xdigit } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1014,8 +1014,8 @@ inline std::optional stoptype(std::string_view s) { "xidcontinue"sv, pt::XID_Continue }, { "xidstart"sv, pt::XID_Start } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1044,8 +1044,8 @@ inline std::optional stogctype(std::string_view s) { "zl"sv, gc::Zl }, { "zp"sv, gc::Zp }, { "zs"sv, gc::Zs } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? 
std::optional{static_cast(c->second)} : std::nullopt; } @@ -1111,8 +1111,8 @@ inline std::optional stosctype(std::string_view s) { "zanabazarsquare"sv, sc::Zanabazar_Square } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1267,8 +1267,8 @@ inline std::optional stoblktype(std::string_view s) { "zanabazarsquare"sv, blk::Zanabazar_Square }, { "znamennymusicalnotation"sv, blk::Znamenny_Musical_Notation } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1288,8 +1288,8 @@ inline std::optional stoagetype(std::string_view s) { "90"sv, at::v9_0 }, { "unassigned"sv, at::Unassigned } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? 
std::optional{static_cast(c->second)} : std::nullopt; } @@ -1304,8 +1304,8 @@ inline std::optional stoeawtype(std::string_view s) { "a"sv, eaw::A }, { "f"sv, eaw::F }, { "h"sv, eaw::H }, { "n"sv, eaw::N }, { "na"sv, eaw::Na }, { "w"sv, eaw::W } } }; - auto l = detail::normalize_property_label(s); - auto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); + auto const l = detail::normalize_property_label(s); + auto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; }); return c != labels.end() && c->first == l ? std::optional{static_cast(c->second)} : std::nullopt; } @@ -1316,7 +1316,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ull << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1340,7 +1340,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace detail -inline std::int_least32_t record::case_mapping(std::size_t index) noexcept +[[nodiscard]] inline std::int_least32_t record::case_mapping(std::size_t index) noexcept { static constexpr std::array casemappings = { @@ -1358,7 +1358,7 @@ inline std::int_least32_t record::case_mapping(std::size_t index) noexcept return casemappings[index]; } -inline std::unique_ptr record::decompress_table() +[[nodiscard]] inline std::unique_ptr record::decompress_table() { using detail::run_length_decode; using lug::detail::make_member_accessor; @@ -2096,7 +2096,7 @@ inline std::unique_ptr record::decompress_table() 512, 896, 640, 2432, 3080, 3072, 7280, 7269, 7237, 7176, 7267, 7235, 7233, 7168, 0, 
128 }; - std::array flyweights; + std::array flyweights{}; auto table = std::make_unique(); auto& records = table->records; @@ -2131,4 +2131,6 @@ inline std::unique_ptr record::decompress_table() } // namespace lug::unicode +// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + #endif diff --git a/lug/utf8.hpp b/lug/utf8.hpp index 1617d43..b516c31 100644 --- a/lug/utf8.hpp +++ b/lug/utf8.hpp @@ -16,8 +16,7 @@ namespace lug::utf8 { namespace detail { -inline constexpr unsigned int decode_accept = 0; -inline constexpr unsigned int decode_reject = 12; +enum class decode_state : unsigned char { accept = 0, reject = 12 }; inline constexpr std::array dfa_class_table { @@ -50,30 +49,48 @@ inline constexpr std::array dfa_transition_table 12,36,12,12,12,12,12,12,12,12,12,12 }; -} // namespace detail +inline constexpr std::array utf8_replacement_sequence +{ + static_cast(0xefU), + static_cast(0xbfU), + static_cast(0xbdU) +}; -[[nodiscard]] constexpr bool is_lead(char octet) noexcept +inline constexpr char32_t utf32_replacement = U'\U0000fffd'; + +[[nodiscard]] constexpr decode_state decode_rune_octet(char32_t& rune, char octet, decode_state state) noexcept { - return (static_cast(octet) & 0xc0) != 0x80; + auto const symbol = static_cast(static_cast(octet)); + auto const dfa_class = static_cast(dfa_class_table[symbol]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index) + rune = (state == decode_state::accept) ? 
(symbol & (0xffU >> dfa_class)) : ((symbol & 0x3fU) | (rune << 6U)); + return static_cast(dfa_transition_table[static_cast(state) + dfa_class]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index) } -[[nodiscard]] constexpr unsigned int decode_rune_octet(char32_t& rune, char octet, unsigned int state) +[[nodiscard]] constexpr unsigned int non_ascii_rune_length(char32_t rune) noexcept +{ + if (rune >= 0x00010000U) + return 4; + if (rune >= 0x00000800U) + return 3; + return 2; +} + +} // namespace detail + +[[nodiscard]] constexpr bool is_lead(char octet) noexcept { - unsigned int const symbol = static_cast(static_cast(octet)); - unsigned int const dfa_class = static_cast(detail::dfa_class_table[symbol]); - rune = state == detail::decode_accept ? (symbol & (0xffU >> dfa_class)) : ((symbol & 0x3fU) | (rune << 6)); - return detail::dfa_transition_table[state + dfa_class]; + return (static_cast(octet) & 0xc0U) != 0x80U; } template > [[nodiscard]] constexpr std::pair decode_rune(InputIt first, InputIt last) { char32_t rune = U'\0'; - unsigned int state = detail::decode_accept; - while (first != last && state != detail::decode_reject) - if (state = lug::utf8::decode_rune_octet(rune, *first++, state); state == detail::decode_accept) + detail::decode_state state = detail::decode_state::accept; + while ((first != last) && (state != detail::decode_state::reject)) + if (state = utf8::detail::decode_rune_octet(rune, *first++, state); state == detail::decode_state::accept) return std::make_pair(first, rune); - return std::make_pair(std::find_if(first, last, lug::utf8::is_lead), U'\U0000fffd'); + return std::make_pair(std::find_if(first, last, lug::utf8::is_lead), detail::utf32_replacement); } template > @@ -97,11 +114,11 @@ inline std::pair encode_rune(OutputIt dst, char32_t rune) if (rune < 0x80) { *dst++ = static_cast(rune); } else { - if (0x00110000U <= rune || (rune & 0xfffff800U) == 0x0000d800U) - return {std::copy_n(reinterpret_cast(u8"\U0000fffd"), 3, dst), 
false}; - unsigned int const n = rune >= 0x00010000U ? 4 : rune >= 0x00000800U ? 3 : 2; - for (unsigned int i = 0, c = (0xf0 << (4 - n)) & 0xf0; i < n; ++i, c = 0x80) - *dst++ = static_cast(((rune >> (6 * (n - i - 1))) & 0x3f) | c); + if ((0x00110000U <= rune) || ((rune & 0xfffff800U) == 0x0000d800U)) + return {std::copy(detail::utf8_replacement_sequence.begin(), detail::utf8_replacement_sequence.end(), dst), false}; + unsigned int const n = detail::non_ascii_rune_length(rune); + for (unsigned int i = 0, c = ((0xf0U << (4 - n)) & 0xf0U); i < n; ++i, c = 0x80U) + *dst++ = static_cast(((rune >> (6 * (n - i - 1))) & 0x3fU) | c); } return {dst, true}; } @@ -116,7 +133,7 @@ inline std::pair encode_rune(OutputIt dst, char32_t rune) inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const { while (first != last) { auto [next, rune] = lug::utf8::decode_rune(first, last); @@ -126,7 +143,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); @@ -139,7 +156,7 @@ tocasefold{}; inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const { while (first != last) { auto [next, rune] = lug::utf8::decode_rune(first, last); @@ -149,7 +166,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); @@ -162,7 +179,7 @@ tolower{}; inline constexpr struct { template - inline OutputIt operator()(InputIt first, InputIt last, OutputIt dst) const + OutputIt operator()(InputIt first, InputIt last, 
OutputIt dst) const { while (first != last) { auto [next, rune] = lug::utf8::decode_rune(first, last); @@ -172,7 +189,7 @@ inline constexpr struct return dst; } - [[nodiscard]] inline std::string operator()(std::string_view src) const + [[nodiscard]] std::string operator()(std::string_view src) const { std::string result; result.reserve(src.size()); diff --git a/tools/makeunicode.cpp b/tools/makeunicode.cpp index 9337a26..648c63f 100644 --- a/tools/makeunicode.cpp +++ b/tools/makeunicode.cpp @@ -1073,8 +1073,8 @@ class enum_parser_printer return out << "\t\t" << line << "\n" << "\t} };\n\n" - << "\tauto l = detail::normalize_property_label(s);\n" - << "\tauto c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; });\n" + << "\tauto const l = detail::normalize_property_label(s);\n" + << "\tauto const c = std::lower_bound(labels.begin(), labels.end(), l, [](auto const& x, auto const& y) { return x.first < y; });\n" << "\treturn c != labels.end() && c->first == l ? 
std::optional<" << p.name_ << ">{static_cast<" << p.name_ << ">(c->second)} : std::nullopt;\n" << "}\n"; } @@ -1149,6 +1149,8 @@ R"c++(// lug - Embedded DSL for PE grammar parser combinators in C++ #include #include +// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + namespace lug::unicode { )c++" << "\n" @@ -1220,7 +1222,7 @@ namespace lug::unicode { }) << R"c++( // Property Traits -enum class property_enum +enum class property_enum : std::uint_least8_t { invalid, ctype, @@ -1263,34 +1265,34 @@ class record << "\t\tstd::array<" << recordstagetable.typeinfo2.name << ", " << std::dec << recordstagetable.stage2.size() << "> stage2;\n" << "\t\tstd::array records;" << R"c++( }; - static std::int_least32_t case_mapping(std::size_t index) noexcept; - static std::unique_ptr decompress_table(); + [[nodiscard]] static std::int_least32_t case_mapping(std::size_t index) noexcept; + [[nodiscard]] static std::unique_ptr decompress_table(); friend record query(char32_t r); public: - ctype compatibility() const noexcept { return static_cast(raw_->cflags); } - ptype properties() const noexcept { return static_cast(raw_->pflags); } - gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } - sctype script() const noexcept { return static_cast(raw_->scindex); } - blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } - std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } - std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } - std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } - bool all_of(ctype c) const noexcept { return 
(compatibility() & c) == c; } - bool all_of(ptype p) const noexcept { return (properties() & p) == p; } - bool all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } - bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } - bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } - bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } - bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } - bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } - bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } + [[nodiscard]] ctype compatibility() const noexcept { return static_cast(raw_->cflags); } + [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } + [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } + [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } + [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } + [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } + [[nodiscard]] bool all_of(ctype c) const noexcept { return (compatibility() & c) == c; } + [[nodiscard]] bool all_of(ptype p) const noexcept { return (properties() & p) == p; } + [[nodiscard]] bool 
all_of(gctype gc) const noexcept { return (general_category() & gc) == gc; } + [[nodiscard]] bool any_of(ctype c) const noexcept { return (compatibility() & c) != ctype::none; } + [[nodiscard]] bool any_of(ptype p) const noexcept { return (properties() & p) != ptype::None; } + [[nodiscard]] bool any_of(gctype gc) const noexcept { return (general_category() & gc) != gctype::None; } + [[nodiscard]] bool none_of(ctype c) const noexcept { return (compatibility() & c) == ctype::none; } + [[nodiscard]] bool none_of(ptype p) const noexcept { return (properties() & p) == ptype::None; } + [[nodiscard]] bool none_of(gctype gc) const noexcept { return (general_category() & gc) == gctype::None; } }; // Retrieves the UCD record for the given codepoint -inline record query(char32_t r) +[[nodiscard]] inline record query(char32_t r) { static auto const table = record::decompress_table(); std::size_t index = )c++" << std::dec << invalidrecordindex << R"c++(; @@ -1302,83 +1304,80 @@ inline record query(char32_t r) } // Checks if the rune matches all of the string-packed property classes -inline bool all_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool all_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.all_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case 
property_enum::invalid: return false; + case property_enum::ctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.all_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches any of the string-packed property classes -inline bool any_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool any_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.any_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() == lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() == lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() == lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() == lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.any_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() == 
lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() == lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() == lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() == lug::detail::string_unpack(str); } - return result; + return false; } // Checks if the rune matches none of the string-packed property classes -inline bool none_of(record const& rec, property_enum penum, std::string_view str) +[[nodiscard]] inline bool none_of(record const& rec, property_enum penum, std::string_view str) { - bool result; switch (penum) { - case property_enum::ctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::ptype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::gctype: result = rec.none_of(lug::detail::string_unpack(str)); break; - case property_enum::sctype: result = rec.script() != lug::detail::string_unpack(str); break; - case property_enum::blktype: result = rec.block() != lug::detail::string_unpack(str); break; - case property_enum::agetype: result = rec.age() != lug::detail::string_unpack(str); break; - case property_enum::eawtype: result = rec.eawidth() != lug::detail::string_unpack(str); break; - default: result = false; break; + case property_enum::invalid: return false; + case property_enum::ctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::ptype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::gctype: return rec.none_of(lug::detail::string_unpack(str)); + case property_enum::sctype: return rec.script() != lug::detail::string_unpack(str); + case property_enum::blktype: return rec.block() != lug::detail::string_unpack(str); + case property_enum::agetype: return rec.age() != lug::detail::string_unpack(str); + case property_enum::eawtype: return rec.eawidth() != lug::detail::string_unpack(str); } - return result; + return false; } // Column width (-1 = 
non-displayable, 0 = non-spacing, 1 = normal, 2 = wide) -inline int cwidth(char32_t r) +[[nodiscard]] inline int cwidth(char32_t r) { return query(r).cwidth(); } // Absolute column width -inline unsigned int ucwidth(char32_t r) +[[nodiscard]] inline unsigned int ucwidth(char32_t r) { auto const cw = query(r).cwidth(); return static_cast(cw >= 0 ? cw : -cw); } // Simple casefold conversion -inline char32_t tocasefold(char32_t r) +[[nodiscard]] inline char32_t tocasefold(char32_t r) { return static_cast(static_cast(r) + query(r).casefold_mapping()); } // Simple lowercase conversion -inline char32_t tolower(char32_t r) +[[nodiscard]] inline char32_t tolower(char32_t r) { return static_cast(static_cast(r) + query(r).lowercase_mapping()); } // Simple uppercase conversion -inline char32_t toupper(char32_t r) +[[nodiscard]] inline char32_t toupper(char32_t r) { return static_cast(static_cast(r) + query(r).uppercase_mapping()); } @@ -1409,9 +1408,10 @@ inline void push_uniform_casefolded_range(rune_set& runes, ptype props, char32_t inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) { ptype p = query(start).properties(); - char32_t r1 = start, r2 = start; + char32_t r1 = start; + char32_t r2 = start; for (char32_t rn = start + 1; rn <= end; r2 = rn, ++rn) { - ptype q = query(start).properties(); + ptype const q = query(start).properties(); if (((p ^ q) & ptype::Cased) != ptype::None) { detail::push_uniform_casefolded_range(runes, p, r1, r2); r1 = rn; @@ -1421,27 +1421,27 @@ inline void push_casefolded_range(rune_set& runes, char32_t start, char32_t end) detail::push_uniform_casefolded_range(runes, p, r1, r2); } -inline rune_set sort_and_optimize(rune_set runes) +[[nodiscard]] inline rune_set sort_and_optimize(rune_set runes) { rune_set optimized_runes; auto out = optimized_runes.end(); std::sort_heap(std::begin(runes), std::end(runes)); - for (auto curr = std::cbegin(runes), last = std::cend(runes); curr != last; ++curr) { - if (out == 
optimized_runes.end() || curr->first < out->first || out->second < curr->first) - out = optimized_runes.insert(optimized_runes.end(), *curr); + for (auto const& r : runes) { + if (out == optimized_runes.end() || r.first < out->first || out->second < r.first) + out = optimized_runes.insert(optimized_runes.end(), r); else - out->second = out->second < curr->second ? curr->second : out->second; + out->second = out->second < r.second ? r.second : out->second; } optimized_runes.shrink_to_fit(); return optimized_runes; } -inline rune_set negate(rune_set const& runes) +[[nodiscard]] inline rune_set negate(rune_set const& runes) { rune_set negated_runes; if (!runes.empty()) { - if (char32_t front = runes.front().first; U'\0' < front) - negated_runes.push_back({U'\0', front - 1}); + if (char32_t const front = runes.front().first; U'\0' < front) + negated_runes.emplace_back(U'\0', front - 1); if (runes.size() > 1) { auto const last = std::cend(runes); auto left = std::cbegin(runes); @@ -1449,12 +1449,12 @@ inline rune_set negate(rune_set const& runes) auto right = std::next(left); if (right == last) break; - negated_runes.push_back({left->second + 1, right->first - 1}); + negated_runes.emplace_back(left->second + 1, right->first - 1); left = right; } } - if (char32_t back = runes.back().second; back < U'\xFFFFFFFF') - negated_runes.push_back({back + 1, U'\xFFFFFFFF'}); + if (char32_t const back = runes.back().second; back < U'\xFFFFFFFF') + negated_runes.emplace_back(back + 1, U'\xFFFFFFFF'); negated_runes.shrink_to_fit(); } return negated_runes; @@ -1462,10 +1462,10 @@ inline rune_set negate(rune_set const& runes) namespace detail { -inline std::string normalize_property_label(std::string_view id) +[[nodiscard]] inline std::string normalize_property_label(std::string_view id) { std::string normid; - for (char c : id) + for (char const c : id) if (c != ' ' && c != '\t' && c != '_' && c != '-' && c != '.' 
&& c != ';') normid.push_back(static_cast(std::tolower(c))); return normid; @@ -1538,7 +1538,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ull << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1562,7 +1562,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace detail -inline std::int_least32_t record::case_mapping(std::size_t index) noexcept +[[nodiscard]] inline std::int_least32_t record::case_mapping(std::size_t index) noexcept { )c++" << function_table_printer("casemappings", "std::int_least32_t", compressedrecords.cmapping_values) @@ -1570,7 +1570,7 @@ inline std::int_least32_t record::case_mapping(std::size_t index) noexcept return casemappings[index]; } -inline std::unique_ptr record::decompress_table() +[[nodiscard]] inline std::unique_ptr record::decompress_table() { using detail::run_length_decode; using lug::detail::make_member_accessor; @@ -1581,7 +1581,7 @@ inline std::unique_ptr record::decompress_table() << rle_stage_table_printer("rlestage2", recordstagetable.stage2, recordstagetable.typeinfo2) << "\n" << record_flyweight_printer(compressedrecords) -<< "\n\tstd::array flyweights;" +<< "\n\tstd::array flyweights{};" << R"c++( auto table = std::make_unique(); auto& records = table->records; @@ -1617,6 +1617,8 @@ inline std::unique_ptr record::decompress_table() } // namespace lug::unicode +// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) + #endif )c++"; } From e50d8f3248a27d33060d6960a248973e9a602c73 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:56:07 -0700 Subject: [PATCH 10/19] 
Code cleanup --- lug/utf8.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lug/utf8.hpp b/lug/utf8.hpp index b516c31..c7d544f 100644 --- a/lug/utf8.hpp +++ b/lug/utf8.hpp @@ -68,11 +68,11 @@ inline constexpr char32_t utf32_replacement = U'\U0000fffd'; [[nodiscard]] constexpr unsigned int non_ascii_rune_length(char32_t rune) noexcept { - if (rune >= 0x00010000U) - return 4; - if (rune >= 0x00000800U) + if (rune < 0x00000800U) + return 2; + if (rune < 0x00010000U) return 3; - return 2; + return 4; } } // namespace detail From 8c3724b7ac0bd73ca5882687c8b5200b6264217f Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:57:02 -0700 Subject: [PATCH 11/19] Remove #include from that was left in during testing --- lug/lug.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index acb7982..2ebe5f9 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -15,7 +15,6 @@ #include #include #include -#include namespace lug { From 9e0b8ae22be623bc3ab33ddb6b7719587688fafa Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 21:57:27 -0700 Subject: [PATCH 12/19] Update README fixing typos --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8b13503..76d1c8c 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,11 @@ A C++ embedded domain specific language for expressing parsers as extended [pars Features --- -- Natural syntax resembling external parser generator languages. +- Natural syntax resembling external parser generator languages, with support for attributes and semantic actions. +- Ability to handle context-sensitive grammars with symbol tables, conditions and syntactic predicates. - Generated parsers are compiled to special-purpose bytecode and executed in a virtual parsing machine. - Clear separation of syntactic and lexical rules, with the ability to customize implicit whitespace skipping. 
- Support for direct and indirect left recursion, with precedence levels to disambiguate subexpressions with mixed left/right recursion. -- Extended PEG syntax to include attribute grammars and semantic actions. -- Ability to handle context-sensitive grammars with symbol tables, conditions, and syntactic predicates. - Full support for UTF-8 text parsing, including Level 1 and partial Level 2 compliance with the UTS #18 Unicode Regular Expressions technical standard. - Automatic tracking of line and column numbers, with customizable tab width and alignment. - Header-only library utilizing C++17 language and library features. @@ -69,7 +68,7 @@ Syntax Reference | One-or-More | `+e` | Repetition matching of expression *e* one or more times. | | Optional | `~e` | Matches expression *e* zero or one times. | | Positive Lookahead | `&e` | Matches without consuming input if expression *e* succeeds to match the input. | -| Negative Lookahead | `~e` | Matches without consuming input if expression *e* fails to match the input. | +| Negative Lookahead | `!e` | Matches without consuming input if expression *e* fails to match the input. | | Cut Before | `--e` | Issues a cut instruction before the expression *e*. | | Cut After | `e--` | Issues a cut instruction after the expression *e*. | | Action Scheduling | `e < a` | Schedules a semantic action *a* to be evaluated if expression *e* successfully matches the input. 
| From 23a8597b969c8cbb91236e10d4ae99fe7534c780 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:34:59 -0700 Subject: [PATCH 13/19] Fix clang-tidy warnings --- .clang-tidy | 4 ++++ lug/detail.hpp | 2 +- lug/unicode.hpp | 26 +++++++++++++------------- tools/makeunicode.cpp | 26 ++++++++++++-------------- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index a66ba9b..f7c52a8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -9,6 +9,9 @@ Checks: - concurrency-* - cppcoreguidelines-* - -cppcoreguidelines-avoid-magic-numbers + - -cppcoreguidelines-avoid-do-while + - -cppcoreguidelines-avoid-goto + - -cppcoreguidelines-pro-bounds-* - darwin-* - hicpp-* - -hicpp-braces-around-statements @@ -23,6 +26,7 @@ Checks: - -readability-braces-around-statements - -readability-identifier-length - -readability-magic-numbers + - -readability-qualified-auto WarningsAsErrors: '' HeaderFileExtensions: - '' diff --git a/lug/detail.hpp b/lug/detail.hpp index 3c7fccd..a7ebb5d 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -162,7 +162,7 @@ template using enable_if_char_contiguous_iterator_t = std::enable_if_t, T>; template -constexpr void ignore(Args&&...) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward,hicpp-named-parameter,readability-named-parameter) +constexpr void ignore([[maybe_unused]] Args&&... 
args) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward) struct identity { diff --git a/lug/unicode.hpp b/lug/unicode.hpp index 1ea8245..d1c7c48 100644 --- a/lug/unicode.hpp +++ b/lug/unicode.hpp @@ -21,10 +21,10 @@ #include #include -// NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - namespace lug::unicode { +// NOLINTBEGIN(hicpp-signed-bitwise) + // POSIX compatibility properties enum class ctype : std::uint_least16_t { @@ -159,6 +159,8 @@ enum class gctype : std::uint_least32_t is_bitfield_enum }; +// NOLINTEND(hicpp-signed-bitwise) + // Scripts enum class sctype : std::uint_least8_t { @@ -757,10 +759,10 @@ class record [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } - [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x03ffU); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10U); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0fU); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4U) - 1; } [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } [[nodiscard]] std::int_least32_t lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { 
return case_mapping(raw_->cuindex); } @@ -781,8 +783,8 @@ class record static auto const table = record::decompress_table(); std::size_t index = 1901; if (r < 0x110000) { - index = table->stage1[r >> 7]; - index = table->stage2[(index << 7) | (r & 0x7f)]; + index = table->stage1[r >> 7U]; + index = table->stage2[(index << 7U) | (r & 0x7fU)]; } return record{&table->records[index]}; } @@ -1316,7 +1318,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << static_cast(std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1324,14 +1326,14 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) auto const tail = *first++; for (std::size_t i = 0; i < count; ++i) { if ((head & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(head & ~seqmask) + 1, tail); + dest = std::fill_n(dest, static_cast(head & ~static_cast(seqmask)) + 1, tail); } else { *dest++ = head; *dest++ = tail; } } } else if ((lead & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(lead & ~seqmask) + 1, *first++); + dest = std::fill_n(dest, static_cast(lead & ~static_cast(seqmask)) + 1, *first++); } else { *dest++ = lead; } @@ -2131,6 +2133,4 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace lug::unicode -// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - #endif diff --git a/tools/makeunicode.cpp b/tools/makeunicode.cpp index 648c63f..70b414e 100644 --- a/tools/makeunicode.cpp +++ b/tools/makeunicode.cpp @@ -1149,9 +1149,9 @@ R"c++(// lug - Embedded DSL for PE grammar parser combinators in C++ #include #include -// 
NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - namespace lug::unicode { + +// NOLINTBEGIN(hicpp-signed-bitwise) )c++" << "\n" << enum_printer(enum_type::bitfield, "ctype", "std::uint_least16_t", "POSIX compatibility properties", [](std::ostream& out) { @@ -1191,7 +1191,7 @@ namespace lug::unicode { out << "," << std::right << std::setw(21 - padcount) << " " << compound.second.first << " = " << compound.first << ",\n"; } }) -<< "\n" +<< "\n// NOLINTEND(hicpp-signed-bitwise)\n\n" << enum_printer(enum_type::index, "sctype", "std::uint_least8_t", "Scripts", [](std::ostream& out) { auto const pad = align_padding(max_element_size(script_names.cbegin(), script_names.cend())); for (std::size_t i = 0, n = script_names.size(); i < n; ++i) @@ -1273,10 +1273,10 @@ class record [[nodiscard]] ptype properties() const noexcept { return static_cast(raw_->pflags); } [[nodiscard]] gctype general_category() const noexcept { return static_cast(UINT32_C(1) << raw_->gcindex); } [[nodiscard]] sctype script() const noexcept { return static_cast(raw_->scindex); } - [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x3ff); } - [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10); } - [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0f); } - [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4) - 1; } + [[nodiscard]] blktype block() const noexcept { return static_cast(raw_->abfields & 0x03ffU); } + [[nodiscard]] agetype age() const noexcept { return static_cast(raw_->abfields >> 10U); } + [[nodiscard]] eawtype eawidth() const noexcept { return static_cast(raw_->wfields & 0x0fU); } + [[nodiscard]] int cwidth() const noexcept { return static_cast(raw_->wfields >> 4U) - 1; } [[nodiscard]] std::int_least32_t casefold_mapping() const noexcept { return case_mapping(raw_->cfindex); } [[nodiscard]] std::int_least32_t 
lowercase_mapping() const noexcept { return case_mapping(raw_->clindex); } [[nodiscard]] std::int_least32_t uppercase_mapping() const noexcept { return case_mapping(raw_->cuindex); } @@ -1297,8 +1297,8 @@ class record static auto const table = record::decompress_table(); std::size_t index = )c++" << std::dec << invalidrecordindex << R"c++(; if (r < 0x)c++" << std::hex << ptable.size() << R"c++() { - index = table->stage1[r >> )c++" << std::dec << block_shift << R"c++(]; - index = table->stage2[(index << )c++" << std::dec << block_shift << R"c++() | (r & 0x)c++" << std::hex << block_mask << R"c++()]; + index = table->stage1[r >> )c++" << std::dec << block_shift << R"c++(U]; + index = table->stage2[(index << )c++" << std::dec << block_shift << R"c++(U) | (r & 0x)c++" << std::hex << block_mask << R"c++(U)]; } return record{&table->records[index]}; } @@ -1538,7 +1538,7 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) { using value_type = typename std::iterator_traits::value_type; constexpr auto ilseqcode = (std::numeric_limits::max)(); - constexpr auto seqmask = static_cast(0x03ULL << (std::numeric_limits::digits - 2)); + constexpr auto seqmask = static_cast(0x03ULL << static_cast(std::numeric_limits::digits - 2)); while (first != last) { if (auto const lead = *first++; lead == ilseqcode) { auto const count = static_cast(*first++); @@ -1546,14 +1546,14 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) auto const tail = *first++; for (std::size_t i = 0; i < count; ++i) { if ((head & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(head & ~seqmask) + 1, tail); + dest = std::fill_n(dest, static_cast(head & ~static_cast(seqmask)) + 1, tail); } else { *dest++ = head; *dest++ = tail; } } } else if ((lead & seqmask) == seqmask) { - dest = std::fill_n(dest, static_cast(lead & ~seqmask) + 1, *first++); + dest = std::fill_n(dest, static_cast(lead & ~static_cast(seqmask)) + 1, *first++); } else { *dest++ = lead; } @@ -1617,8 
+1617,6 @@ void run_length_decode(InputIt first, InputIt last, OutputIt dest) } // namespace lug::unicode -// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index,hicpp-signed-bitwise) - #endif )c++"; } From 1ffba6352451745985d63b7997be169ffdbe9320 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:36:04 -0700 Subject: [PATCH 14/19] Ignore all files under tools/ucd/ --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6d8ad9..02d3c4c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Unicode Character Database files -tools/ucd/*.txt +tools/ucd/ # User-specific files *.suo From 3df9f2b7399c86a1e776f1646744785dd6c9b4f7 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 2 Jul 2024 22:44:13 -0700 Subject: [PATCH 15/19] Update clang-tidy checks --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index f7c52a8..628c9b1 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -14,6 +14,7 @@ Checks: - -cppcoreguidelines-pro-bounds-* - darwin-* - hicpp-* + - -hicpp-avoid-goto - -hicpp-braces-around-statements - llvm-namespace-comment - misc-* From d72f7951cf298f5581e0fb3d0cb8cfde79bb5683 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 3 Jul 2024 19:02:15 -0700 Subject: [PATCH 16/19] Fixed clang-tidy warnings --- .clang-tidy | 38 ++-- CHANGELOG.md | 6 +- lug/detail.hpp | 37 ++-- lug/lug.hpp | 447 ++++++++++++++++++++++------------------ samples/basic/basic.cpp | 18 +- samples/calc/calc.cpp | 23 ++- tests/captures.cpp | 44 ++-- tests/leftrecursion.cpp | 6 +- 8 files changed, 343 insertions(+), 276 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 628c9b1..9cc6212 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,32 +2,42 @@ Checks: - clang-diagnostic-* - clang-analyzer-* + - -clang-analyzer-optin.core.EnumCastOutOfRange # interferes with enum bitfield flags - android-* - bugprone-* - cert-* - - -cert-dcl21-cpp + - -cert-dcl21-cpp # this 
check is deprecated, it is no longer part of the CERT standard - concurrency-* - cppcoreguidelines-* - -cppcoreguidelines-avoid-magic-numbers - -cppcoreguidelines-avoid-do-while - -cppcoreguidelines-avoid-goto - -cppcoreguidelines-pro-bounds-* + - -cppcoreguidelines-avoid-magic-numbers # revisit after new instruction scheme, maybe only disable for unicode tables + - -cppcoreguidelines-avoid-do-while # if removing do-while does not cause serious performance issues remove this check + - -cppcoreguidelines-avoid-goto # if removing goto does not cause serious performance issues remove this check + - -cppcoreguidelines-pro-bounds-* # requires gsl::at and std::span to suppress, would prefer Standard Library hardening approach + - -cppcoreguidelines-pro-type-union-access # remove after developing new instruction encoding scheme that doesn't use union - darwin-* + - fuschia-* + - google-* + - -google-build-using-namespace # would require too many individual using-declarations to satisfy + - -google-readability-braces-around-statements # adversely affects line count + - -google-runtime-int # revisit after new instruction scheme - hicpp-* - - -hicpp-avoid-goto - - -hicpp-braces-around-statements + - -hicpp-avoid-goto # if removing goto does not cause serious performance issues remove this check + - -hicpp-braces-around-statements # adversely affects line count - llvm-namespace-comment - misc-* - - -misc-include-cleaner + - -misc-include-cleaner # brings in redundant headers that are already included - modernize-* - - -modernize-use-trailing-return-type + - -modernize-use-constraints # C++20 feature + - -modernize-use-trailing-return-type # stylistic preference, revisit later - performance-* - portability-* - readability-* - - -readability-braces-around-statements - - -readability-identifier-length - - -readability-magic-numbers - - -readability-qualified-auto + - -readability-braces-around-statements # adversely affects line count + - -readability-container-contains # 
C++20 feature + - -readability-function-cognitive-complexity # grammar::start() and basic_parser::parse() are complex, revisit or suppress only for these functions + - -readability-identifier-length # revisit later + - -readability-magic-numbers # revisit after new instruction scheme, maybe only disable for unicode tables + - -readability-qualified-auto # stylistic preference that unfortunately warns when marking 'auto*' as 'auto* const' or just 'auto const' WarningsAsErrors: '' HeaderFileExtensions: - '' @@ -284,7 +294,7 @@ CheckOptions: misc-header-include-cycle.IgnoredFilesList: '' misc-include-cleaner.DeduplicateFindings: 'true' misc-include-cleaner.IgnoreHeaders: '' - misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'false' + misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true' misc-non-private-member-variables-in-classes.IgnorePublicMemberVariables: 'false' misc-throw-by-value-catch-by-reference.CheckThrowTemporaries: 'true' misc-throw-by-value-catch-by-reference.WarnOnLargeObjects: 'false' diff --git a/CHANGELOG.md b/CHANGELOG.md index df68d94..7657b34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,12 @@ * Added support for parsing characters and character literals where applicable without explicitly needing to wrap them with `chr()` or `_cx`. * Symbols now respects `caseless` mode, allowing for case-insensitive matching against symbol definitions. * Allow for use of variables of all types in attribute bindings and removed the `lug::variable` template class that was used previously. Variable state is automatically saved and restored across rule boundaries. -* Allow for capturing text to a `lug::syntax` object or any string-like object that is convertible from `std::string_view`. 
+* Allow for capturing text to a `lug::syntax` object or any string-like object that is convertible from `std::string_view`, and renamed `syntax::capture` to `syntax::str` in order to match `std::sub_match::str`. * Added `lug::source_options::interactive` flag that ignores `eoi` tokens for TTY input sources. * Rewrote the expression function objects/lambdas as expression template classes. Allows for multiple passes over the expression tree as well as top-down and bottom-up traversal, which was needed when implementing attribute state tracking. This will also allow for additional optimizations to be implemented in the future. * Renamed `syntactic_capture` to `semantic_capture_action` to reflect that it is executed during the semantic action evaluation phase. * Make all variations of callables that return a non-void value that can be type-erased by `semantic_action` and `semantic_capture_action` push their result onto the attribute result stack. +* Removed `semantic_response` from the public API as it was only used internally inside of the parser. * Attempting to bind a variable to a nonexistent value from the attribute result stack now throws an `attribute_stack_error`. * `implicit_space_rule` no longer causes a compiler warning with Clang, uses RAII to push/pop the thread-local white space rule for grammars. * Moved `call_depth()`, `prune_depth()` and `escape()` functions into the `lug::environment` class since they are used exclusively during semantic action phase. @@ -18,12 +19,13 @@ * Turned `lug::parser` into an alias of a new `lug::basic_parser` template class parameterized with an input source strategy. This allows for parsing and capturing of text without making a copy of the input. * Placed all DSL operator overloads inside of an inline namespace `operators` within `lug::language`. This allows only the operators to be imported into the current scope if desired. * Enabled `-Wconversion` and `-Wshadow` warnings for Clang and GCC and fixed warnings. 
+* Full clang-tidy pass on all of the library headers and fixed all warnings. * Added CMake build support and removed old MSVS solution and vcxproj files. * Handle situation where compilation with RTTI is disabled. ## Release v0.2.0 (June 21, 2024) -* Implemented new support for context-sensitive grammars with symbol tables and parsing conditions, based on the PEG extensions described in the paper *"A Declarative Extension of Parsing Expression Grammars for Recognizing Most Programming Languages"* by Tetsuro Matsumura and Kimio Kuramitsu (2015). +* Implemented new support for context-sensitive grammars with symbol tables and parsing conditions based on the PEG extensions described in the paper *"A Declarative Extension of Parsing Expression Grammars for Recognizing Most Programming Languages"* by Tetsuro Matsumura and Kimio Kuramitsu (2015). * Added an XML Standard 1.0 matcher sample program demonstrating use of symbol tables. * Finished the BASIC language interpreter sample program, which is now feature complete, using parsing conditions. * Updated Unicode support to version 15.1.0 and automated Unicode table generation via Makefile build. 
diff --git a/lug/detail.hpp b/lug/detail.hpp index a7ebb5d..60a9b0e 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -87,43 +87,43 @@ inline namespace bitfield_ops { template > [[nodiscard]] constexpr T operator~(T x) noexcept { - return static_cast(~static_cast>(x)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(~static_cast>(x)); } template > [[nodiscard]] constexpr T operator&(T x, T y) noexcept { - return static_cast(static_cast>(x) & static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) & static_cast>(y)); } template > [[nodiscard]] constexpr T operator|(T x, T y) noexcept { - return static_cast(static_cast>(x) | static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) | static_cast>(y)); } template > [[nodiscard]] constexpr T operator^(T x, T y) noexcept { - return static_cast(static_cast>(x) ^ static_cast>(y)); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return static_cast(static_cast>(x) ^ static_cast>(y)); } template > constexpr T& operator&=(T& x, T y) noexcept { - return (x = x & y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x & y); } template > constexpr T& operator|=(T& x, T y) noexcept { - return (x = x | y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x | y); } template > constexpr T& operator^=(T& x, T y) noexcept { - return (x = x ^ y); // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange) + return (x = x ^ y); } } // namespace bitfield_ops @@ -161,9 +161,6 @@ using enable_if_char_input_iterator_t = std::enable_if_t< template using enable_if_char_contiguous_iterator_t = std::enable_if_t, T>; -template -constexpr void ignore([[maybe_unused]] Args&&... 
args) noexcept {} // NOLINT(cppcoreguidelines-missing-std-forward) - struct identity { template @@ -224,21 +221,21 @@ template template class dynamic_cast_if_base_of { - std::remove_reference_t& value_; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + std::reference_wrapper> value_; public: constexpr explicit dynamic_cast_if_base_of(std::remove_reference_t& x) noexcept : value_{x} {} template , std::decay_t>>> - [[nodiscard]] constexpr operator U&() const // NOLINT(hicpp-explicit-conversions) + [[nodiscard]] constexpr operator U&() const noexcept(std::is_same_v, std::decay_t>) // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) { #ifndef LUG_NO_RTTI if constexpr (std::is_same_v, std::decay_t>) #endif // LUG_NO_RTTI - return static_cast&>(value_); + return static_cast&>(value_.get()); #ifndef LUG_NO_RTTI else - return dynamic_cast&>(value_); + return dynamic_cast&>(value_.get()); #endif // LUG_NO_RTTI } }; @@ -246,20 +243,20 @@ class dynamic_cast_if_base_of template class reentrancy_sentinel { - bool& value; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + std::reference_wrapper value_; public: constexpr explicit reentrancy_sentinel(bool& x) - : value{x} + : value_{x} { - if (value) + if (value_.get()) throw Error(); - value = true; + value_.get() = true; } ~reentrancy_sentinel() { - value = false; + value_.get() = false; } reentrancy_sentinel(reentrancy_sentinel const&) = delete; @@ -341,7 +338,7 @@ inline std::size_t push_back_unique(Sequence& s, T&& x) template [[nodiscard]] inline auto pop_back(Sequence& s) -> typename Sequence::value_type { - typename Sequence::value_type result{std::move(s.back())}; + typename Sequence::value_type result{std::move(s.back())}; // NOLINT(misc-const-correctness) s.pop_back(); return result; } diff --git a/lug/lug.hpp b/lug/lug.hpp index 2ebe5f9..29f6fed 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -30,9 +30,8 @@ class string_view_input_source; template class basic_parser; using parser = 
basic_parser; struct program; -struct syntax_position { std::size_t column, line; }; -struct syntax_range { std::size_t index, size; }; -struct semantic_response { unsigned short call_depth, action_index; syntax_range range; }; +struct syntax_position { std::size_t column; std::size_t line; }; +struct syntax_range { std::size_t index; std::size_t size; }; using semantic_action = std::function; using semantic_capture_action = std::function; using syntactic_predicate = std::function; @@ -65,14 +64,14 @@ enum class operands : unsigned char { none = 0, off = 0x40, str = 0x80, is_bitfi union instruction { - static inline constexpr std::size_t maxstrlen = 256; + static constexpr std::size_t maxstrlen = 256; struct prefix { opcode op; operands aux; unsigned short val; } pf; int off; std::array str; instruction(opcode op, operands aux, immediate imm) noexcept : pf{op, aux, static_cast(imm)} {} explicit instruction(std::ptrdiff_t o) : off{static_cast(o)} { if (off != o) throw program_limit_error{}; } - explicit instruction(std::string_view s) { std::fill(std::copy_n(s.begin(), (std::min)(s.size(), std::size_t{4}), str.begin()), str.end(), char{0}); } + explicit instruction(std::string_view s) : str{} { std::fill(std::copy_n(s.begin(), (std::min)(s.size(), std::size_t{4}), str.begin()), str.end(), char{0}); } [[nodiscard]] static auto decode(std::vector const& code, std::ptrdiff_t& pc) { @@ -81,9 +80,10 @@ union instruction unsigned short imm = pf.val; std::string_view str; if ((pf.aux & operands::str) != operands::none) { - str = std::string_view{code[static_cast(pc)].str.data(), static_cast((imm & 0xff) + 1)}; - pc += ((imm & 0xff) + 4) >> 2; - imm = static_cast(imm >> 8); + auto const strsize = (static_cast(imm) & 0xffU) + 1U; + str = std::string_view{code[static_cast(pc)].str.data(), strsize}; + pc += static_cast((strsize + 3U) >> 2U); + imm = static_cast(static_cast(imm) >> 8U); } return std::make_tuple(pf.op, imm, off, str); } @@ -92,7 +92,7 @@ union instruction { 
std::ptrdiff_t len = 1; len += ((pf.aux & operands::off) != operands::none) ? 1 : 0; - len += ((pf.aux & operands::str) != operands::none) ? static_cast(((pf.val & 0xff) >> 2) + 1) : 0; + len += ((pf.aux & operands::str) != operands::none) ? static_cast(((static_cast(pf.val) & 0xffU) >> 2U) + 1U) : 0; return len; } }; @@ -102,7 +102,7 @@ static_assert(sizeof(unicode::sctype) <= sizeof(immediate), "immediate must be l static_assert(sizeof(instruction) == sizeof(int), "expected instruction to be same size as int"); static_assert(sizeof(int) <= sizeof(std::ptrdiff_t), "expected int to be no larger than ptrdiff_t"); -enum class directives : unsigned int { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; +enum class directives : std::uint_least8_t { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; using program_callees = std::vector>; struct program @@ -119,7 +119,7 @@ struct program instructions.reserve(detail::checked_add(instructions.size(), src.instructions.size())); for (auto i = src.instructions.begin(), j = i, e = src.instructions.end(); i != e; i = j) { instruction instr = *i; - std::size_t val; + std::size_t val = 0; switch (instr.pf.op) { case opcode::match_set: val = detail::push_back_unique(runesets, src.runesets[instr.pf.val]); break; case opcode::action: val = actions.size(); actions.push_back(src.actions[instr.pf.val]); break; @@ -128,7 +128,7 @@ struct program default: val = (std::numeric_limits::max)(); break; } if (val != (std::numeric_limits::max)()) { - detail::assure_in_range(val, 0u, (std::numeric_limits::max)()); + detail::assure_in_range(val, 0U, (std::numeric_limits::max)()); instr.pf.val = static_cast(val); } j = std::next(i, instruction::length(instr.pf)); @@ -153,57 +153,58 @@ class rule { friend class encoder; friend class rule_encoder; - friend grammar start(rule const&); + friend grammar start(rule const& start_rule); program 
program_; program_callees callees_; bool currently_encoding_{false}; public: - rule() = default; - template >> rule(E const& e); + rule() noexcept = default; + template && !std::is_same_v>> rule(E const& e); // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) rule(rule const& r); - rule(rule&& r) = default; + rule(rule&& r) noexcept = default; rule& operator=(rule const& r) { rule{r}.swap(*this); return *this; } - rule& operator=(rule&& r) = default; + rule& operator=(rule&& r) noexcept = default; + ~rule() = default; void swap(rule& r) noexcept { program_.swap(r.program_); callees_.swap(r.callees_); } [[nodiscard]] auto operator[](unsigned short precedence) const noexcept; }; class grammar { - friend grammar start(rule const&); + friend grammar start(rule const& start_rule); lug::program program_; - grammar(lug::program p) : program_{std::move(p)} {} + explicit grammar(lug::program&& p) noexcept : program_{std::move(p)} {} public: - grammar() = default; + grammar() noexcept = default; void swap(grammar& g) noexcept { program_.swap(g.program_); } [[nodiscard]] lug::program const& program() const noexcept { return program_; } - static thread_local std::shared_ptr> const implicit_space; + [[nodiscard]] static std::shared_ptr> const& implicit_space(); }; class syntax { - std::string_view capture_; + std::string_view str_; std::size_t index_{0}; public: constexpr syntax() noexcept = default; - constexpr syntax(std::string_view c, std::size_t i) noexcept : capture_{c}, index_{i} {} - [[nodiscard]] constexpr std::string_view capture() const noexcept { return capture_; } - [[nodiscard]] constexpr syntax_range range() const noexcept { return syntax_range{index_, capture_.size()}; } - [[nodiscard]] operator std::string() const noexcept { return std::string{capture_}; } - [[nodiscard]] constexpr operator std::string_view() const noexcept { return capture_; } - [[nodiscard]] constexpr operator syntax_range() const noexcept { return range(); } - [[nodiscard]] 
constexpr bool empty() const noexcept { return capture_.empty(); } - [[nodiscard]] constexpr std::size_t size() const noexcept { return capture_.size(); } - [[nodiscard]] constexpr bool operator==(syntax const& other) const noexcept { return capture_ == other.capture_ && index_ == other.index_; } - [[nodiscard]] constexpr bool operator!=(syntax const& other) const noexcept { return capture_ != other.capture_ || index_ != other.index_; } + constexpr syntax(std::string_view c, std::size_t i) noexcept : str_{c}, index_{i} {} + [[nodiscard]] constexpr std::string_view str() const noexcept { return str_; } + [[nodiscard]] constexpr syntax_range range() const noexcept { return syntax_range{index_, str_.size()}; } + [[nodiscard]] operator std::string() const { return std::string{str_}; } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr operator std::string_view() const noexcept { return str_; } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr operator syntax_range() const noexcept { return range(); } // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + [[nodiscard]] constexpr bool empty() const noexcept { return str_.empty(); } + [[nodiscard]] constexpr std::size_t size() const noexcept { return str_.size(); } + [[nodiscard]] constexpr bool operator==(syntax const& other) const noexcept { return str_ == other.str_ && index_ == other.index_; } + [[nodiscard]] constexpr bool operator!=(syntax const& other) const noexcept { return str_ != other.str_ || index_ != other.index_; } }; class environment { template friend class basic_parser; - static inline constexpr unsigned short max_call_depth = (std::numeric_limits::max)(); - static inline const std::vector empty_symbols_{}; + static constexpr unsigned short max_call_depth = (std::numeric_limits::max)(); + static inline std::vector const empty_symbols_{}; std::vector attribute_frame_stack_; std::vector attribute_result_stack_; 
std::unordered_set conditions_; @@ -250,13 +251,18 @@ class environment } public: + environment() = default; + environment(environment const&) = delete; + environment(environment&&) noexcept = default; + environment& operator=(environment const&) = delete; + environment& operator=(environment&&) noexcept = default; virtual ~environment() = default; [[nodiscard]] unsigned int tab_width() const { return tab_width_; } void tab_width(unsigned int w) { tab_width_ = w; } [[nodiscard]] unsigned int tab_alignment() const { return tab_alignment_; } void tab_alignment(unsigned int a) { tab_alignment_ = a; } [[nodiscard]] bool has_condition(std::string_view name) const noexcept { return (conditions_.count(name) > 0); } - bool set_condition(std::string_view name, bool value) { if (value) { return !conditions_.emplace(name).second; } else { return (conditions_.erase(name) > 0); } } + bool set_condition(std::string_view name, bool value) { return value ? (!conditions_.emplace(name).second) : (conditions_.erase(name) > 0); } void clear_conditions() { conditions_.clear(); } [[nodiscard]] bool has_symbol(std::string_view name) const noexcept { return (symbols_.count(name) > 0); } [[nodiscard]] std::vector const& get_symbols(std::string_view name) const { auto it = symbols_.find(name); if (it == symbols_.end()) return empty_symbols_; return it->second; } @@ -285,7 +291,8 @@ class environment } auto first = std::next(std::begin(match_), static_cast(startindex)); auto const last = std::next(std::begin(match_), static_cast(index)); - char32_t rune, prevrune = U'\0'; + char32_t rune = U'\0'; + char32_t prevrune = U'\0'; for (auto curr = first, next = curr; curr < last; curr = next, prevrune = rune) { std::tie(next, rune) = utf8::decode_rune(curr, last); if ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) != unicode::ptype::None && (prevrune != U'\r' || rune != U'\n')) { @@ -345,7 +352,7 @@ struct encoder_metadata template >> constexpr encoder_metadata() noexcept : 
attribute_frame{} {} template >> - constexpr encoder_metadata(Frame&& frame) noexcept : attribute_frame{std::forward(frame)} {} + constexpr explicit encoder_metadata(Frame&& frame) noexcept : attribute_frame{std::forward(frame)} {} }; encoder_metadata() -> encoder_metadata<>; @@ -353,17 +360,17 @@ template encoder_metadata(Frame&&) -> encoder_metadata mode_; - virtual void do_append(instruction) = 0; + virtual void do_append(instruction instr) = 0; virtual void do_append(program const&) = 0; - virtual immediate do_add_rune_set(unicode::rune_set) { return immediate{0}; } - virtual immediate do_add_semantic_action(semantic_action) { return immediate{0}; } - virtual immediate do_add_semantic_capture_action(semantic_capture_action) { return immediate{0}; } - virtual immediate do_add_syntactic_predicate(syntactic_predicate) { return immediate{0}; } - virtual void do_add_callee(rule const*, program const*, std::ptrdiff_t, directives) {} - virtual bool do_should_evaluate_length() const noexcept { return true; } - virtual std::ptrdiff_t do_length() const noexcept = 0; + [[nodiscard]] virtual immediate do_add_rune_set(unicode::rune_set&& /*r*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_semantic_action(semantic_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual immediate do_add_syntactic_predicate(syntactic_predicate&& /*p*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + virtual void do_add_callee(rule const* /*r*/, program const* /*p*/, std::ptrdiff_t /*n*/, directives /*d*/) {} + [[nodiscard]] virtual bool do_should_evaluate_length() const noexcept { return true; } + [[nodiscard]] virtual 
std::ptrdiff_t do_length() const noexcept = 0; protected: encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, unsigned short prec) @@ -380,7 +387,7 @@ class encoder std::string_view subsequence = sequence.substr(0, instruction::maxstrlen); while (!subsequence.empty() && !utf8::is_lead(subsequence.back())) subsequence.remove_suffix(1); - subsequence.remove_suffix(!subsequence.empty()); + subsequence.remove_suffix(!subsequence.empty() ? 1 : 0); encode(op, subsequence); sequence.remove_prefix(subsequence.size()); } @@ -397,35 +404,40 @@ class encoder void do_skip() { mode_.back() = (mode_.back() & ~(directives::preskip | directives::postskip)) | directives::lexeme | directives::noskip; - (*grammar::implicit_space)(*this); + (*grammar::implicit_space())(*this); } public: - explicit encoder(directives initial) : mandate_{directives::none}, mode_{initial} {} + explicit encoder(directives initial) : mode_{initial} {} virtual ~encoder() = default; + encoder(encoder const&) = delete; + encoder(encoder&&) = delete; + encoder& operator=(encoder const&) = delete; + encoder& operator=(encoder&&) = delete; template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); template >> [[nodiscard]] std::ptrdiff_t evaluate_length(E const& e, M const& m); - encoder& dpsh(directives enable, directives disable) { directives prev = mode_.back(); mode_.push_back((prev & ~disable) | enable); return *this; } + encoder& dpsh(directives enable, directives disable) { directives const prev = mode_.back(); mode_.push_back((prev & ~disable) | enable); return *this; } encoder& append(instruction instr) { do_append(instr); return *this; } encoder& append(program const& p) { do_append(p); return *this; } encoder& call(program const& p, unsigned short prec) { return do_call(nullptr, &p, 0, prec); } encoder& call(grammar const& g, unsigned short prec) { return do_call(nullptr, &g.program(), 3, prec); } encoder& encode(opcode op, immediate imm = immediate{0}) { return 
append(instruction{op, operands::none, imm}); } - encoder& encode(opcode op, semantic_action a) { return append(instruction{op, operands::none, do_add_semantic_action(std::move(a))}); } - encoder& encode(opcode op, semantic_capture_action c) { return append(instruction{op, operands::none, do_add_semantic_capture_action(std::move(c))}); } - encoder& encode(opcode op, syntactic_predicate p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } + encoder& encode(opcode op, semantic_action&& a) { return append(instruction{op, operands::none, do_add_semantic_action(std::move(a))}); } + encoder& encode(opcode op, semantic_capture_action&& a) { return append(instruction{op, operands::none, do_add_semantic_capture_action(std::move(a))}); } + encoder& encode(opcode op, syntactic_predicate&& p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } encoder& encode(opcode op, std::ptrdiff_t off, immediate imm = immediate{0}) { return append(instruction{op, operands::off, imm}).append(instruction{off}); } [[nodiscard]] std::ptrdiff_t length() const noexcept { return do_length(); } [[nodiscard]] directives mandate() const noexcept { return (mandate_ & ~directives::eps) | mode_.back(); } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } - encoder& match(unicode::rune_set runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } + encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match); } encoder& match_any() { return skip().encode(opcode::match_any); } template >> encoder& match_class(T properties) { return skip().do_match_class(Op, properties); } encoder& dpop(directives relay) { - auto prev = detail::pop_back(mode_), next = (mode_.back() & ~relay) | (prev & relay); + auto const prev = detail::pop_back(mode_); 
+ auto const next = (mode_.back() & ~relay) | (prev & relay); if ((next & directives::postskip) == directives::none && (prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip) do_skip(); mode_.back() = next; @@ -434,7 +446,7 @@ class encoder encoder& skip(directives callee_mandate = directives::eps, directives callee_skip = directives::lexeme) { - auto mode = mode_.back(); + auto const mode = mode_.back(); if (mandate_ == directives::none) mandate_ = (mode & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; if ((((mode | callee_mandate)) & (callee_skip | directives::preskip)) == directives::preskip) @@ -446,7 +458,7 @@ class encoder encoder& call(rule const& r, unsigned short prec, bool allow_inlining = true) { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && - p.instructions.size() <= 8 && p.actions.size() <= 1 && p.captures.size() <= 1 && p.predicates.size() <= 1) + (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) return skip(p.mandate, directives::noskip).append(p); return do_call(&r, &r.program_, 0, prec); } @@ -465,9 +477,9 @@ class encoder encoder& encode(opcode op, std::string_view subsequence, immediate imm = immediate{0}) { if (!subsequence.empty()) { - detail::assure_in_range(static_cast(imm), 0u, instruction::maxstrlen - 1); - detail::assure_in_range(subsequence.size(), 1u, instruction::maxstrlen); - do_append(instruction{op, operands::str, static_cast((static_cast(imm) << 8) | static_cast(subsequence.size() - 1))}); + detail::assure_in_range(static_cast(imm), 0U, instruction::maxstrlen - 1); + detail::assure_in_range(subsequence.size(), 1U, instruction::maxstrlen); + do_append(instruction{op, operands::str, static_cast(static_cast((static_cast(imm) << 8U) | static_cast(subsequence.size() - 1)))}); do { 
do_append(instruction{subsequence}); subsequence.remove_prefix((std::min)(std::size_t{4}, subsequence.size())); @@ -481,40 +493,43 @@ class encoder skip(!subject.empty() ? directives::eps : directives::none); if ((mode() & directives::caseless) != directives::none) return do_match(opcode::match_cf, utf8::tocasefold(subject)); - else - return do_match(opcode::match, subject); + return do_match(opcode::match, subject); } }; class instruction_length_evaluator final : public encoder { - std::ptrdiff_t length_; - void do_append(instruction) final { length_ = detail::checked_add(length_, std::ptrdiff_t{1}); } + std::ptrdiff_t length_{0}; + void do_append(instruction instr) final { std::ignore = instr; length_ = detail::checked_add(length_, std::ptrdiff_t{1}); } void do_append(program const& p) final { length_ = detail::checked_add(length_, static_cast(p.instructions.size())); } - bool do_should_evaluate_length() const noexcept final { return false; } - std::ptrdiff_t do_length() const noexcept final { return length_; } + [[nodiscard]] bool do_should_evaluate_length() const noexcept final { return false; } + [[nodiscard]] std::ptrdiff_t do_length() const noexcept final { return length_; } public: - explicit instruction_length_evaluator(directives initial) : encoder{initial}, length_{0} {} + explicit instruction_length_evaluator(directives initial) : encoder{initial} {} ~instruction_length_evaluator() final = default; + instruction_length_evaluator(instruction_length_evaluator const&) = delete; + instruction_length_evaluator(instruction_length_evaluator&&) = delete; + instruction_length_evaluator& operator=(instruction_length_evaluator const&) = delete; + instruction_length_evaluator& operator=(instruction_length_evaluator&&) = delete; }; class program_encoder : public encoder { program& program_; program_callees& callees_; - std::ptrdiff_t do_length() const noexcept final { return static_cast(program_.instructions.size()); } + [[nodiscard]] std::ptrdiff_t do_length() 
const noexcept final { return static_cast(program_.instructions.size()); } void do_append(instruction instr) final { program_.instructions.push_back(instr); } void do_append(program const& p) final { program_.concatenate(p); } void do_add_callee(rule const* r, program const* p, std::ptrdiff_t n, directives d) final { callees_.emplace_back(r, p, n, d); } - immediate do_add_rune_set(unicode::rune_set r) final { return add_item(program_.runesets, std::move(r)); } - immediate do_add_semantic_action(semantic_action a) final { return add_item(program_.actions, std::move(a)); } - immediate do_add_semantic_capture_action(semantic_capture_action a) final { return add_item(program_.captures, std::move(a)); } - immediate do_add_syntactic_predicate(syntactic_predicate p) final { return add_item(program_.predicates, std::move(p)); } + [[nodiscard]] immediate do_add_rune_set(unicode::rune_set&& r) final { return add_item(program_.runesets, std::move(r)); } + [[nodiscard]] immediate do_add_semantic_action(semantic_action&& a) final { return add_item(program_.actions, std::move(a)); } + [[nodiscard]] immediate do_add_semantic_capture_action(semantic_capture_action&& a) final { return add_item(program_.captures, std::move(a)); } + [[nodiscard]] immediate do_add_syntactic_predicate(syntactic_predicate&& p) final { return add_item(program_.predicates, std::move(p)); } template - immediate add_item(std::vector& items, Item&& item) + [[nodiscard]] immediate add_item(std::vector& items, Item&& item) { - detail::assure_in_range(items.size(), 0u, (std::numeric_limits::max)() - 1u); + detail::assure_in_range(items.size(), 0U, (std::numeric_limits::max)() - 1U); items.push_back(std::forward(item)); return static_cast(items.size() - 1); } @@ -522,6 +537,10 @@ class program_encoder : public encoder public: program_encoder(program& p, program_callees& c, directives initial) : encoder{initial}, program_{p}, callees_{c} {} ~program_encoder() override { program_.mandate = mandate(); } + 
program_encoder(program_encoder const&) = delete; + program_encoder(program_encoder&&) = delete; + program_encoder& operator=(program_encoder const&) = delete; + program_encoder& operator=(program_encoder&&) = delete; }; class rule_encoder final : public program_encoder @@ -530,10 +549,14 @@ class rule_encoder final : public program_encoder public: explicit rule_encoder(rule& r) : program_encoder{r.program_, r.callees_, directives::eps}, rule_{r} { rule_.currently_encoding_ = true; } ~rule_encoder() final { rule_.currently_encoding_ = false; } + rule_encoder(rule_encoder const&) = delete; + rule_encoder(rule_encoder&&) = delete; + rule_encoder& operator=(rule_encoder const&) = delete; + rule_encoder& operator=(rule_encoder&&) = delete; }; template -inline auto&& add_rune_range(RuneSet&& runes, directives mode, char32_t first, char32_t last) +inline decltype(auto) add_rune_range(RuneSet&& runes, directives mode, char32_t first, char32_t last) { if (first > last) throw bad_character_range{}; @@ -541,7 +564,7 @@ inline auto&& add_rune_range(RuneSet&& runes, directives mode, char32_t first, c unicode::push_casefolded_range(runes, first, last); else unicode::push_range(runes, first, last); - return std::move(runes); + return std::forward(runes); } struct terminal_encoder_expression_interface @@ -553,20 +576,22 @@ template struct unary_encoder_expression_interface { using expression_trait = encoder_expression_trait_tag; E1 e1; - template constexpr explicit unary_encoder_expression_interface(X1&& x1) noexcept : e1(std::forward(x1)) {} + template >> + constexpr explicit unary_encoder_expression_interface(X1&& x1) : e1(std::forward(x1)) {} }; template struct binary_encoder_expression_interface { using expression_trait = encoder_expression_trait_tag; E1 e1; E2 e2; - template constexpr binary_encoder_expression_interface(X1&& x1, X2&& x2) noexcept : e1(std::forward(x1)), e2(std::forward(x2)) {} + template && std::is_constructible_v>> + constexpr 
binary_encoder_expression_interface(X1&& x1, X2&& x2) : e1(std::forward(x1)), e2(std::forward(x2)) {} }; class basic_regular_expression : public terminal_encoder_expression_interface { - std::string const expression_; - std::shared_ptr const program_; + std::string expression_; + std::shared_ptr program_; [[nodiscard]] static grammar make_grammar(); @@ -654,16 +679,16 @@ struct char32_range_expression : terminal_encoder_expression_interface { char32_t start; char32_t end; - constexpr char32_range_expression(char32_t s, char32_t e) noexcept : start{s}, end{e} {} + constexpr char32_range_expression(char32_t first, char32_t last) noexcept : start{first}, end{last} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(unicode::sort_and_optimize(add_rune_range(unicode::rune_set{}, d.mode(), start, end))); return m; } }; template struct callable_expression : terminal_encoder_expression_interface { - Target& target; + std::reference_wrapper target; constexpr explicit callable_expression(Target& t) noexcept : target{t} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target, 0); } + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), 0); } }; template struct is_callable_encoder_expression : std::false_type {}; @@ -674,7 +699,7 @@ template struct predicate_expression : terminal_encoder_expression_interface { Pred pred; - template constexpr explicit predicate_expression(P&& p) noexcept(std::is_nothrow_constructible_v) : pred(std::forward

(p)) {} + template >> constexpr explicit predicate_expression(P&& p) noexcept(std::is_nothrow_constructible_v) : pred(std::forward

(p)) {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::predicate, syntactic_predicate{pred}); return m; } }; @@ -695,7 +720,7 @@ template && is_ex else if constexpr (std::is_same_v, char32_t>) return char32_expression{std::forward(e)}; else if constexpr (std::is_convertible_v, std::string_view>) - return string_expression{std::forward(e)}; + return string_expression{std::forward(e)}; // NOLINT(cppcoreguidelines-pro-bounds-array-to-pointer-decay,hicpp-no-array-decay) else if constexpr (std::is_invocable_r_v, environment&>) return predicate_expression{std::forward(e)}; else @@ -703,9 +728,9 @@ template && is_ex } template >> -[[nodiscard]] constexpr auto make_space_expression(E&& e) +[[nodiscard]] constexpr auto make_space_expression(E const& e) { - return [x = make_expression(std::forward(e))](encoder& d) { (void)x(d, encoder_metadata{}); }; + return [x = make_expression(e)](encoder& d) { (void)x(d, encoder_metadata{}); }; } template @@ -738,10 +763,10 @@ inline rule::rule(rule const& r) struct rule_precedence_expression : terminal_encoder_expression_interface { - rule const& target; + std::reference_wrapper target; unsigned short precedence; - constexpr rule_precedence_expression(rule const& t, unsigned short p) noexcept : target{t}, precedence{p} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target, precedence); } + rule_precedence_expression(rule const& t, unsigned short p) noexcept : target{t}, precedence{p} {} + template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), precedence); } }; [[nodiscard]] inline auto rule::operator[](unsigned short precedence) const noexcept @@ -779,7 +804,7 @@ inline constexpr directive_modifier skip_after{}; inline constexpr directive_modifier skip_before{}; -struct nop_expression : terminal_encoder_expression_interface { 
template [[nodiscard]] constexpr auto operator()(encoder&, M const& m) const -> M const& { return m; } }; +struct nop_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{1}); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; @@ -792,7 +817,7 @@ struct match_class_combinator struct match_class_expression : terminal_encoder_expression_interface { Property property; - constexpr match_class_expression(Property p) noexcept : property{p} {} + constexpr explicit match_class_expression(Property p) noexcept : property{p} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_class(property); return m; } }; @@ -817,7 +842,7 @@ struct condition_test_combinator struct condition_test_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr condition_test_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit condition_test_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, immediate{Value ? 
1 : 0}); return m; } }; @@ -863,7 +888,7 @@ struct symbol_exists_combinator struct symbol_exists_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr symbol_exists_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit symbol_exists_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, immediate{Value ? 1 : 0}); return m; } }; @@ -876,7 +901,7 @@ struct symbol_match_combinator struct symbol_match_expression : terminal_encoder_expression_interface { std::string_view name; - constexpr symbol_match_expression(std::string_view n) noexcept : name{n} {} + constexpr explicit symbol_match_expression(std::string_view n) noexcept : name{n} {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? 
OpCf : Op, name); return m; } }; @@ -975,7 +1000,7 @@ template struct attribute_action_expression : unary_encoder_expression_interface { Operand operand; - template constexpr attribute_action_expression(X1&& x1, O&& o) noexcept : unary_encoder_expression_interface{std::forward(x1)}, operand(std::forward(o)) {} + template constexpr attribute_action_expression(X1&& x1, O&& o) : unary_encoder_expression_interface{std::forward(x1)}, operand(std::forward(o)) {} template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const @@ -990,17 +1015,17 @@ struct attribute_action_expression : unary_encoder_expression_interface }; template -struct attribute_bind_to_expression : attribute_action_expression +struct attribute_bind_to_expression : attribute_action_expression { - using attribute_action_expression::attribute_action_expression; - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::operator()(d, m)).attribute_frame, std::forward_as_tuple(this->operand))}; } + using attribute_action_expression::attribute_action_expression; + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::operator()(d, m)).attribute_frame, std::forward_as_tuple(*(this->operand)))}; } }; template struct action_expression : attribute_action_expression, E1, Action> { using attribute_action_expression, E1, Action>::attribute_action_expression; - constexpr void do_prologue(encoder&) const {} + constexpr void do_prologue(encoder& /*d*/) const {} constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[a = this->operand](environment& envr) { a(detail::dynamic_cast_if_base_of{envr}); }}); } template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, a = this->operand](environment& envr) 
mutable { envr.pop_attribute_frame(f); a(detail::dynamic_cast_if_base_of{envr}); }}); } }; @@ -1018,9 +1043,9 @@ template struct assign_to_expression : attribute_bind_to_expression, E1, Target> { using attribute_bind_to_expression, E1, Target>::attribute_bind_to_expression; - constexpr void do_prologue(encoder&) const {} - constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[t = &this->operand](environment& envr) { *t = envr.pop_attribute(); }}); } - template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, t = &this->operand](environment& envr) mutable { envr.pop_attribute_frame(f); *t = envr.pop_attribute(); }}); } + constexpr void do_prologue(encoder& /*d*/) const {} + constexpr void do_epilogue(encoder& d) const { d.encode(opcode::action, semantic_action{[t = this->operand](environment& envr) { *t = envr.pop_attribute(); }}); } + template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::action, semantic_action{[f = m.attribute_frame, t = this->operand](environment& envr) mutable { envr.pop_attribute_frame(f); *t = envr.pop_attribute(); }}); } }; template @@ -1028,8 +1053,8 @@ struct capture_to_expression : attribute_bind_to_expression, E1, Target>::attribute_bind_to_expression; constexpr void do_prologue(encoder& d) const { d.skip().encode(opcode::capture_start); } - constexpr void do_epilogue(encoder& d) const { d.encode(opcode::capture_end, semantic_capture_action{[t = &this->operand](environment&, syntax const& sx) { *t = sx; }}); } - template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::capture_end, semantic_capture_action{[f = m.attribute_frame, t = &this->operand](environment& envr, syntax const& sx) mutable { envr.pop_attribute_frame(f); *t = sx; }}); } + constexpr void do_epilogue(encoder& d) const { d.encode(opcode::capture_end, semantic_capture_action{[t = 
this->operand](environment&, syntax const& sx) { *t = sx; }}); } + template constexpr void do_epilogue_inlined(encoder& d, M const& m) const { d.encode(opcode::capture_end, semantic_capture_action{[f = m.attribute_frame, t = this->operand](environment& envr, syntax const& sx) mutable { envr.pop_attribute_frame(f); *t = sx; }}); } }; template @@ -1093,8 +1118,8 @@ template choice_expression(X1&&, X2&&) -> choice_expression template sequence_expression(X1&&, X2&&) -> sequence_expression, std::decay_t>; template action_expression(X1&&, Action&&) -> action_expression, std::decay_t>; template capture_expression(X1&&, Action&&) -> capture_expression, std::decay_t>; -template assign_to_expression(X1&&, Target&) -> assign_to_expression, Target>; -template capture_to_expression(X1&&, Target&) -> capture_to_expression, Target>; +template assign_to_expression(X1&&, Target*) -> assign_to_expression, Target>; +template capture_to_expression(X1&&, Target*) -> capture_to_expression, Target>; template symbol_assign_expression(X1&&, std::string_view) -> symbol_assign_expression>; template symbol_block_expression(X1&&) -> symbol_block_expression>; template local_block_expression(X1&&) -> local_block_expression>; @@ -1163,12 +1188,12 @@ inline namespace operators { [[nodiscard]] inline auto operator ""_srx(char const* s, std::size_t n) { return cased[basic_regular_expression{std::string_view{s, n}}]; } template >> [[nodiscard]] constexpr auto operator!(E const& e) { return negative_lookahead_expression{matches_eps[e]}; } -template >> [[nodiscard]] constexpr auto operator&(E const& e) { return positive_lookahead_expression{matches_eps[e]}; } +template >> [[nodiscard]] constexpr auto operator&(E const& e) { return positive_lookahead_expression{matches_eps[e]}; } // NOLINT(google-runtime-operator) template >> [[nodiscard]] constexpr auto operator*(E const& e) { return repetition_expression{matches_eps[skip_after[e]]}; } template && is_expression_v>> [[nodiscard]] constexpr auto 
operator|(E1 const& e1, E2 const& e2) { return choice_expression{relays_eps[e1], relays_eps[e2]}; } template && is_expression_v>> [[nodiscard]] constexpr auto operator>(E1 const& e1, E2 const& e2) { return sequence_expression{make_expression(e1), skip_before[e2]}; } template && is_expression_v>> [[nodiscard]] constexpr auto operator>>(E1 const& e1, E2 const& e2) { return e1 > *(e2 > e1); } -template >> [[nodiscard]] constexpr auto operator%(T& target, E const& e) { return assign_to_expression{make_expression(e), target}; } +template >> [[nodiscard]] constexpr auto operator%(T& target, E const& e) { return assign_to_expression{make_expression(e), std::addressof(target)}; } template >> [[nodiscard]] constexpr auto operator+(E const& e) { auto x{make_expression(e)}; return x > *x; } template >> [[nodiscard]] constexpr auto operator~(E const& e) { return e | eps; } template >> [[nodiscard]] constexpr auto operator--(E const& e) { return cut > e; } @@ -1204,7 +1229,7 @@ inline constexpr struct template struct capture_to { - Target& target; + Target* target; template >> [[nodiscard]] constexpr auto operator[](E const& e) const noexcept { return capture_to_expression{make_expression(e), target}; } }; template @@ -1213,7 +1238,7 @@ inline constexpr struct Action action; template >> [[nodiscard]] constexpr auto operator[](E const& e) const noexcept { return e < action; } }; - template >> [[nodiscard]] constexpr capture_to operator()(Target& t) const noexcept { return capture_to{t}; } + template >> [[nodiscard]] constexpr capture_to operator()(Target& t) const noexcept { return capture_to{std::addressof(t)}; } template >> [[nodiscard]] constexpr capture_with> operator()(Action&& a) const noexcept { return capture_with>{std::forward(a)}; } } capture{}; @@ -1251,14 +1276,33 @@ class implicit_space_rule { std::function prev_rule_; std::weak_ptr> implicit_space_ref_; + public: - template >> implicit_space_rule(E const& e) : prev_rule_{std::exchange(*grammar::implicit_space, 
std::function{make_space_expression(e)})}, implicit_space_ref_{grammar::implicit_space} {} - ~implicit_space_rule() { if (auto const implicit_space = implicit_space_ref_.lock(); implicit_space) { *implicit_space = std::move(prev_rule_); } } + template >> + implicit_space_rule(E const& e) // NOLINT(google-explicit-constructor,hicpp-explicit-conversions) + : prev_rule_{std::exchange(*grammar::implicit_space(), std::function{make_space_expression(e)})} + , implicit_space_ref_{grammar::implicit_space()} + {} + + ~implicit_space_rule() + { + if (auto const implicit_space_instance = implicit_space_ref_.lock(); implicit_space_instance) + *implicit_space_instance = std::move(prev_rule_); + } + + implicit_space_rule(implicit_space_rule const&) = delete; + implicit_space_rule(implicit_space_rule&&) = delete; + implicit_space_rule& operator=(implicit_space_rule const&) = delete; + implicit_space_rule& operator=(implicit_space_rule&&) = delete; }; } // namespace language -inline thread_local std::shared_ptr> const grammar::implicit_space{std::make_shared>(make_space_expression(language::operator*(language::space)))}; +[[nodiscard]] inline std::shared_ptr> const& grammar::implicit_space() +{ + static thread_local std::shared_ptr> const instance{std::make_shared>(make_space_expression(language::operator*(language::space)))}; + return instance; +} [[nodiscard]] inline grammar start(rule const& start_rule) { @@ -1278,11 +1322,14 @@ inline thread_local std::shared_ptr> const grammar grprogram.concatenate(*subprogram); grprogram.instructions.emplace_back(opcode::ret, operands::none, immediate{0}); if (auto top_rule = callstack.back().first; top_rule) { - for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { + for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { // NOLINT(performance-for-range-copy) calls.emplace_back(callee_program, address + instr_offset); - if (callee_rule && (mode & directives::eps) != directives::none 
&& detail::escaping_find_if( - callstack.crbegin(), callstack.crend(), [rule = callee_rule](auto& caller) { - return caller.first == rule ? 1 : (caller.second ? 0 : -1); }) != callstack.crend()) { + if ((callee_rule != nullptr) && ((mode & directives::eps) != directives::none) && + detail::escaping_find_if(callstack.crbegin(), callstack.crend(), [callee = callee_rule](auto const& caller) { + if (caller.first == callee) + return 1; + return (caller.second ? 0 : -1); + }) != callstack.crend()) { left_recursive.insert(callee_program); } else { auto callee_callstack = callstack; @@ -1295,7 +1342,7 @@ inline thread_local std::shared_ptr> const grammar } while (!unprocessed.empty()); for (auto [subprogram, instr_addr] : calls) { if (auto& iprefix = grprogram.instructions[static_cast(instr_addr)]; iprefix.pf.op == opcode::call) - iprefix.pf.val = left_recursive.count(subprogram) != 0 ? (iprefix.pf.val != 0 ? iprefix.pf.val : 1) : 0; + iprefix.pf.val = (left_recursive.count(subprogram) != 0) ? (std::max)(iprefix.pf.val, static_cast(1)) : 0; auto& ioffset = grprogram.instructions[static_cast(instr_addr + 1)]; auto const rel_addr = ioffset.off + addresses[subprogram] - (instr_addr + 2); detail::assure_in_range(rel_addr, std::numeric_limits::lowest(), (std::numeric_limits::max)()); @@ -1304,7 +1351,7 @@ inline thread_local std::shared_ptr> const grammar return grammar{std::move(grprogram)}; } -enum class source_options : unsigned int { none = 0, interactive = 0x01, is_bitfield_enum }; +enum class source_options : std::uint_least8_t { none = 0, interactive = 0x01, is_bitfield_enum }; namespace detail { @@ -1386,7 +1433,7 @@ class string_view_input_source struct parser_registers { - std::size_t sr, mr, rc; std::ptrdiff_t pc; std::size_t fc; + std::size_t sr{0}; std::size_t mr{0}; std::size_t rc{0}; std::ptrdiff_t pc{0}; std::size_t fc{0}; [[nodiscard]] auto as_tuple() noexcept { return std::forward_as_tuple(sr, mr, rc, pc, fc); } [[nodiscard]] auto as_tuple() const noexcept { 
return std::forward_as_tuple(sr, mr, rc, pc, fc); } }; @@ -1396,11 +1443,12 @@ class basic_parser { enum class stack_frame_type : unsigned char { backtrack, call, capture, condition, lrcall, symbol_definition, symbol_table }; enum class subject_location : std::size_t {}; - struct lrmemo { std::size_t srr, sra, prec; std::ptrdiff_t pcr, pca; std::size_t rcr; std::vector responses; }; - static inline constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); - static inline constexpr std::size_t max_size = (std::numeric_limits::max)(); - lug::grammar const& grammar_; - lug::environment& environment_; + struct response { unsigned short call_depth; unsigned short action_index; syntax_range range; }; + struct lrmemo { std::size_t srr{0}; std::size_t sra{0}; std::size_t prec{0}; std::ptrdiff_t pcr{0}; std::ptrdiff_t pca{0}; std::size_t rcr{0}; std::vector responses; }; + static constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); + static constexpr std::size_t max_size = (std::numeric_limits::max)(); + lug::grammar const* grammar_; + lug::environment* environment_; InputSource input_source_; std::unordered_map casefolded_subjects_; parser_registers registers_{0, 0, 0, 0, 0}; @@ -1414,7 +1462,7 @@ class basic_parser std::vector> symbol_definition_stack_; // name, sr std::vector>> symbol_table_stack_; std::vector lrmemo_stack_; - std::vector responses_; + std::vector responses_; [[nodiscard]] bool available(std::size_t sr, std::size_t sn) { @@ -1440,16 +1488,16 @@ class basic_parser [[nodiscard]] bool casefold_compare(std::size_t sr, std::size_t sn, std::string_view str) { - auto& subject = casefolded_subjects_[sr]; + std::string& subject = casefolded_subjects_[sr]; if (subject.size() < sn) subject = utf8::tocasefold(input_source_.buffer().substr(sr, sn)); return subject.compare(0, sn, str) == 0; } template - [[nodiscard]] bool match_sequence(std::size_t& sr, std::string_view str, Compare&& comp) + [[nodiscard]] bool match_sequence(std::size_t& sr, 
std::string_view str, Compare const& comp) { - if (auto sn = str.size(); !sn || (available(sr, sn) && comp(*this, sr, sn, str))) { + if (std::size_t const sn = str.size(); !sn || (available(sr, sn) && comp(*this, sr, sn, str))) { sr += sn; return true; } @@ -1457,7 +1505,7 @@ class basic_parser } template - [[nodiscard]] bool match_single(std::size_t& sr, Match&& match) + [[nodiscard]] bool match_single(std::size_t& sr, Match const& match) { if (!available(sr, 1)) return false; @@ -1465,7 +1513,7 @@ class basic_parser auto const curr = buffer.cbegin() + static_cast(sr); auto const last = buffer.cend(); auto [next, rune] = utf8::decode_rune(curr, last); - bool matched; + bool matched = false; if constexpr (std::is_invocable_v) { matched = match(curr, last, next, rune); } else if constexpr(std::is_invocable_v) { @@ -1474,7 +1522,7 @@ class basic_parser matched = match(rune); } else { matched = match(); - detail::ignore(rune); + std::ignore = rune; } if (matched) sr += static_cast(std::distance(curr, next)); @@ -1482,9 +1530,9 @@ class basic_parser } template - [[nodiscard]] bool match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify&& mod, Compare&& comp) + [[nodiscard]] bool match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); if (std::size_t tsr = sr; std::all_of(symbols.begin(), symbols.end(), [&tsr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(tsr, mod(symbol), comp); })) { sr = tsr; return true; @@ -1493,23 +1541,23 @@ class basic_parser } template - [[nodiscard]] bool match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify&& mod, Compare&& comp) + [[nodiscard]] bool match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { - auto const& symbols = 
environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return std::any_of(symbols.begin(), symbols.end(), [&sr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(sr, mod(symbol), comp); }); } template [[nodiscard]] bool match_symbol_head(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbol_index]), std::forward(comp)) : false; } template [[nodiscard]] bool match_symbol_tail(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { - auto const& symbols = environment_.get_symbols(symbol_name); + auto const& symbols = environment_->get_symbols(symbol_name); return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : false; } @@ -1521,7 +1569,7 @@ class basic_parser if constexpr (Opcode == opcode::commit_partial) { detail::make_tuple_view<0, 1>(backtrack_stack_.back()) = {sr, rc}; } else { - detail::ignore(sr, rc); + std::ignore = std::tie(sr, rc); if constexpr (Opcode == opcode::commit_back) sr = std::get<0>(backtrack_stack_.back()); pop_stack_frame(backtrack_stack_); @@ -1534,24 +1582,24 @@ class basic_parser { registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; auto const full_match = match(); - auto const prior_call_depth = environment_.start_accept(full_match, subject()); - detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_.end_accept(prior_call_depth); }}; - auto const& actions = grammar_.program().actions; - auto const& captures = grammar_.program().captures; - for (auto& response : responses_) { - if (environment_.prune_depth() <= response.call_depth) + auto const prior_call_depth = 
environment_->start_accept(full_match, subject()); + detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_->end_accept(prior_call_depth); }}; + auto const& actions = grammar_->program().actions; + auto const& captures = grammar_->program().captures; + for (auto& resp : responses_) { + if (environment_->prune_depth() <= resp.call_depth) continue; - environment_.reset_call_depth(response.call_depth); - if (response.range.index < max_size) - captures[response.action_index](environment_, syntax{full_match.substr(response.range.index, response.range.size), response.range.index}); + environment_->reset_call_depth(resp.call_depth); + if (resp.range.index < max_size) + captures[resp.action_index](*environment_, syntax{full_match.substr(resp.range.index, resp.range.size), resp.range.index}); else - actions[response.action_index](environment_); + actions[resp.action_index](*environment_); } } [[nodiscard]] auto drain() { - environment_.reset_origin(); + environment_->reset_origin(); input_source_.drain_buffer(registers_.sr); casefolded_subjects_.clear(); responses_.clear(); @@ -1569,7 +1617,7 @@ class basic_parser [[nodiscard]] auto drop_responses_after(std::size_t n) { - std::vector dropped; + std::vector dropped; if (n < responses_.size()) { dropped.assign(responses_.begin() + static_cast(n), responses_.end()); responses_.resize(n); @@ -1577,7 +1625,7 @@ class basic_parser return dropped; } - [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) + [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) { pop_responses_after(n); responses_.insert(responses_.end(), restore.begin(), restore.end()); @@ -1601,18 +1649,18 @@ class basic_parser } public: - basic_parser(lug::grammar const& g, lug::environment& e) : grammar_{g}, environment_{e} {} - [[nodiscard]] lug::grammar const& grammar() const noexcept { return grammar_; } - [[nodiscard]] lug::environment& environment() const noexcept { return 
environment_; } + basic_parser(lug::grammar const& g, lug::environment& e) : grammar_{&g}, environment_{&e} {} + [[nodiscard]] lug::grammar const& grammar() const noexcept { return *grammar_; } + [[nodiscard]] lug::environment& environment() const noexcept { return *environment_; } [[nodiscard]] std::string_view match() const noexcept { return input_source_.buffer().substr(0, registers_.sr); } [[nodiscard]] std::string_view subject() const noexcept { return input_source_.buffer().substr(registers_.sr, input_source_.buffer().size() - registers_.sr); } [[nodiscard]] std::size_t subject_index() const noexcept { return registers_.sr; } [[nodiscard]] std::size_t max_subject_index() const noexcept { return registers_.mr; } - [[nodiscard]] syntax_position subject_position() { return environment_.position_at(registers_.sr); } - [[nodiscard]] syntax_position max_subject_position() { return environment_.position_at(registers_.mr); } - [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_.position_at(index); } - [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_.position_at(range.index); } - [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_.position_at(range.index + range.size); } + [[nodiscard]] syntax_position subject_position() { return environment_->position_at(registers_.sr); } + [[nodiscard]] syntax_position max_subject_position() { return environment_->position_at(registers_.mr); } + [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_->position_at(index); } + [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_->position_at(range.index); } + [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_->position_at(range.index + range.size); } [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), 
position_end(range)}; } [[nodiscard]] parser_registers& registers() noexcept { return registers_; } [[nodiscard]] parser_registers const& registers() const noexcept { return registers_; } @@ -1649,12 +1697,14 @@ class basic_parser bool parse() { detail::reentrancy_sentinel const guard{parsing_}; - program const& prog = grammar_.program(); + program const& prog = grammar_->program(); if (prog.instructions.empty()) throw bad_grammar{}; auto [sr, mr, rc, pc, fc] = drain(); - bool result = false, done = false; - pc = 0, fc = 0; + bool result = false; + bool done = false; + pc = 0; + fc = 0; while (!done) { auto [op, imm, off, str] = instruction::decode(prog.instructions, pc); switch (op) { @@ -1668,7 +1718,7 @@ class basic_parser } break; case opcode::match_any: { if constexpr (detail::input_source_has_options::value) { - if (((imm & 0x8000) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) + if (((imm & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) goto failure; } if (!match_single(sr, []{ return true; })) @@ -1723,16 +1773,19 @@ class basic_parser } break; case opcode::call: { if (imm != 0) { - auto const memo = detail::escaping_find_if(lrmemo_stack_.crbegin(), lrmemo_stack_.crend(), - [srr = sr, pca = pc + off](auto const& m){ return m.srr == srr && m.pca == pca ? 1 : (m.srr < srr ? 0 : -1); }); + auto const memo = detail::escaping_find_if(lrmemo_stack_.crbegin(), lrmemo_stack_.crend(), [srr = sr, pca = pc + off](auto const& m) { + if ((m.srr == srr) && (m.pca == pca)) + return 1; + return ((m.srr < srr) ? 
0 : -1); + }); if (memo != lrmemo_stack_.crend()) { - if (memo->sra == lrfailcode || imm < memo->prec) + if ((memo->sra == lrfailcode) || (imm < memo->prec)) goto failure; sr = memo->sra, rc = restore_responses_after(rc, memo->responses); continue; } stack_frames_.push_back(stack_frame_type::lrcall); - lrmemo_stack_.push_back({sr, lrfailcode, imm, pc, pc + off, rc, std::vector{}}); + lrmemo_stack_.push_back({sr, lrfailcode, imm, pc, pc + off, rc, std::vector{}}); } else { stack_frames_.push_back(stack_frame_type::call); call_stack_.push_back(pc); @@ -1749,7 +1802,7 @@ class basic_parser } break; case stack_frame_type::lrcall: { auto& memo = lrmemo_stack_.back(); - if (memo.sra == lrfailcode || sr > memo.sra) { + if ((memo.sra == lrfailcode) || (sr > memo.sra)) { memo.sra = sr, memo.responses = drop_responses_after(memo.rcr); sr = memo.srr, pc = memo.pca, rc = memo.rcr; continue; @@ -1781,7 +1834,7 @@ class basic_parser } break; case stack_frame_type::condition: { auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_.set_condition(cond_name, cond_value); + environment_->set_condition(cond_name, cond_value); pop_stack_frame(condition_stack_), ++fc; } break; case stack_frame_type::lrcall: { @@ -1795,7 +1848,7 @@ class basic_parser pop_stack_frame(symbol_definition_stack_), ++fc; } break; case stack_frame_type::symbol_table: { - environment_.symbols_.swap(symbol_table_stack_.back()); + environment_->symbols_.swap(symbol_table_stack_.back()); pop_stack_frame(symbol_table_stack_), ++fc; } break; default: break; @@ -1804,7 +1857,7 @@ class basic_parser pop_responses_after(rc); } break; case opcode::accept: { - if (cut_deferred_ = !capture_stack_.empty() || !lrmemo_stack_.empty(); !cut_deferred_) { + if (cut_deferred_ = (!capture_stack_.empty() || !lrmemo_stack_.empty()); !cut_deferred_) { accept(sr, mr, rc, pc); std::tie(sr, mr, rc, pc, std::ignore) = drain(); } @@ -1818,8 +1871,8 @@ class basic_parser } break; case opcode::predicate: { registers_ 
= {sr, (std::max)(mr, sr), rc, pc, 0}; - environment_.reset_match_and_subject(match(), subject()); - bool const accepted = prog.predicates[imm](environment_); + environment_->reset_match_and_subject(match(), subject()); + bool const accepted = prog.predicates[imm](*environment_); std::tie(sr, mr, rc, pc, fc) = registers_.as_tuple(); pop_responses_after(rc); if (!accepted) @@ -1832,29 +1885,30 @@ class basic_parser case opcode::capture_end: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::capture)) goto failure; - auto const sr0 = static_cast(capture_stack_.back()), sr1 = sr; + auto const sr0 = static_cast(capture_stack_.back()); + auto const sr1 = sr; pop_stack_frame(capture_stack_, sr, mr, rc, pc); if (sr0 > sr1) goto failure; rc = push_response(call_stack_.size() + lrmemo_stack_.size(), imm, {sr0, sr1 - sr0}); } break; case opcode::condition_test: { - if (environment_.has_condition(str) != (imm != 0)) + if (environment_->has_condition(str) != (imm != 0)) goto failure; } break; case opcode::condition_push: { stack_frames_.push_back(stack_frame_type::condition); - condition_stack_.emplace_back(str, environment_.set_condition(str, imm != 0)); + condition_stack_.emplace_back(str, environment_->set_condition(str, imm != 0)); } break; case opcode::condition_pop: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::condition)) goto failure; auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_.set_condition(cond_name, cond_value); + environment_->set_condition(cond_name, cond_value); pop_stack_frame(condition_stack_); } break; case opcode::symbol_exists: { - if (environment_.has_symbol(str) != (imm != 0)) + if (environment_->has_symbol(str) != (imm != 0)) goto failure; } break; case opcode::symbol_all: { @@ -1897,24 +1951,25 @@ class basic_parser if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_definition)) goto failure; auto const [symbol_name, symbol_sr] = 
symbol_definition_stack_.back(); - auto const sr0 = static_cast(symbol_sr), sr1 = sr; + auto const sr0 = static_cast(symbol_sr); + auto const sr1 = sr; pop_stack_frame(symbol_definition_stack_); if (sr0 > sr1) goto failure; - environment_.add_symbol(symbol_name, std::string{match().substr(sr0, sr1 - sr0)}); + environment_->add_symbol(symbol_name, std::string{match().substr(sr0, sr1 - sr0)}); } break; case opcode::symbol_push: { stack_frames_.push_back(stack_frame_type::symbol_table); - symbol_table_stack_.emplace_back(environment_.symbols_); + symbol_table_stack_.emplace_back(environment_->symbols_); if (imm == 1) - environment_.symbols_.erase(str); + environment_->symbols_.erase(str); else if (imm == 2) - environment_.symbols_.clear(); + environment_->symbols_.clear(); } break; case opcode::symbol_pop: { if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_table)) goto failure; - environment_.symbols_.swap(symbol_table_stack_.back()); + environment_->symbols_.swap(symbol_table_stack_.back()); pop_stack_frame(symbol_table_stack_); } break; default: registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; throw bad_opcode{}; @@ -1982,15 +2037,17 @@ LUG_DIAGNOSTIC_PUSH_AND_IGNORE [[nodiscard]] inline grammar basic_regular_expression::make_grammar() { using namespace language; - implicit_space_rule default_space = nop; - rule Empty = eps <[](generator& g) { g.encoder.match_eps(); }; - rule Dot = chr('.') <[](generator& g) { g.encoder.match_any(); }; - rule Element = any > chr('-') > !chr(']') > any <[](generator& g, syntax const& x) { g.bracket_range(x.capture()); } - | str("[:") > +(!chr(':') > any) > str(":]") <[](generator& g, syntax const& x) { g.bracket_class(x.capture().substr(2, x.range().size - 4)); } - | any <[](generator& g, syntax const& x) { g.bracket_range(x.capture(), x.capture()); }; - rule Bracket = chr('[') > ~(chr('^') <[](generator& g) { g.circumflex = true; }) - > Element > *(!chr(']') > Element) > chr(']') <[](generator& g) { 
g.bracket_commit(); }; - rule Sequence = +(!(chr('.') | chr('[')) > any) <[](generator& g, syntax const& x) { g.encoder.match(x.capture()); }; + implicit_space_rule const default_space = nop; + // NOLINTBEGIN(bugprone-chained-comparison) + rule const Empty = eps <[](generator& g) { g.encoder.match_eps(); }; + rule const Dot = chr('.') <[](generator& g) { g.encoder.match_any(); }; + rule const Element = any > chr('-') > !chr(']') > any <[](generator& g, syntax const& x) { g.bracket_range(x.str()); } + | str("[:") > +(!chr(':') > any) > str(":]") <[](generator& g, syntax const& x) { g.bracket_class(x.str().substr(2, x.range().size - 4)); } + | any <[](generator& g, syntax const& x) { g.bracket_range(x.str(), x.str()); }; + rule const Bracket = chr('[') > ~(chr('^') <[](generator& g) { g.circumflex = true; }) + > Element > *(!chr(']') > Element) > chr(']') <[](generator& g) { g.bracket_commit(); }; + rule const Sequence = +(!(chr('.') | chr('[')) > any) <[](generator& g, syntax const& x) { g.encoder.match(x.str()); }; + // NOLINTEND(bugprone-chained-comparison) return start((+(Dot | Bracket | Sequence) | Empty) > eoi); } diff --git a/samples/basic/basic.cpp b/samples/basic/basic.cpp index 335f937..c3567fa 100644 --- a/samples/basic/basic.cpp +++ b/samples/basic/basic.cpp @@ -37,12 +37,12 @@ class basic_interpreter rule NL = lexeme["\n"_sx | "\r\n" | "\r"]; rule Delim = lexeme[","_sx | ";"]; - rule LineNo = lexeme[capture(stx_)[+"[0-9]"_rx]] <[this]{ return std::stoi(std::string{stx_}); }; - rule Real = lexeme[capture(stx_)[+"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) - > ~("[Ee]"_rx > ~"[+-]"_rx > +"[0-9]"_rx)]] <[this]{ return std::stod(std::string{stx_}); }; - rule String = lexeme["\"" > capture(stx_)[*"[^\"]"_rx] > "\""] <[this]{ return stx_.capture(); }; - rule Var = lexeme[capture(stx_)["[A-Za-z]"_rx > ~"[0-9]"_rx]] <[this]{ return lug::utf8::toupper(stx_); }; - rule Fn = lexeme["FN"_isx > capture(stx_)["[A-Za-z]"_rx]] <[this]{ return lug::utf8::toupper(stx_); }; + rule 
LineNo = lexeme[capture(tok_)[+"[0-9]"_rx]] <[this]{ return std::stoi(std::string{tok_}); }; + rule Real = lexeme[capture(tok_)[+"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) + > ~("[Ee]"_rx > ~"[+-]"_rx > +"[0-9]"_rx)]] <[this]{ return std::stod(std::string{tok_}); }; + rule String = lexeme["\"" > capture(tok_)[*"[^\"]"_rx] > "\""] <[this]{ return tok_.str(); }; + rule Var = lexeme[capture(tok_)["[A-Za-z]"_rx > ~"[0-9]"_rx]] <[this]{ return lug::utf8::toupper(tok_); }; + rule Fn = lexeme["FN"_isx > capture(tok_)["[A-Za-z]"_rx]] <[this]{ return lug::utf8::toupper(tok_); }; rule RelOp = "=" <[]() -> RelOpFn { return [](double x, double y) { return x == y; }; } | ">=" <[]() -> RelOpFn { return std::isgreaterequal; } @@ -110,7 +110,7 @@ class basic_interpreter | "GOTO"_isx > no_%LineNo <[this]{ goto_line(no_); } | "DEF"_isx > fn_%Fn > "(" > id_%Var > ")" - > "=" > capture(stx_)[*(!NL > any)] <[this]{ fn_param_body_[fn_] = { id_, std::string{stx_} }; } + > "=" > capture(tok_)[*(!NL > any)] <[this]{ fn_param_body_[fn_] = { id_, std::string{tok_} }; } | "LET"_isx > ref_%Ref > "=" > r1_%Expr <[this]{ *ref_ = r1_; } | "DIM"_isx > DimEl > *(Delim > DimEl) | "RESTORE"_isx <[this]{ read_itr_ = data_.cbegin(); } @@ -137,7 +137,7 @@ class basic_interpreter rule Line = Stmnt > ~Rem > NL | Cmnd > ~Rem > NL | no_%LineNo - > capture(stx_)[*(!NL > any) > NL] <[this]{ update_line(no_, stx_); } + > capture(tok_)[*(!NL > any) > NL] <[this]{ update_line(no_, tok_); } | Rem > NL | NL | ( *(!NL > any) > NL ) <[this]{ print_error("ILLEGAL FORMULA"); }; @@ -423,7 +423,7 @@ class basic_interpreter lug::environment environment_; std::string fn_; std::string id_; - lug::syntax stx_; + lug::syntax tok_; std::string_view txt_; double r1_{0.0}; double r2_{0.0}; diff --git a/samples/calc/calc.cpp b/samples/calc/calc.cpp index 8581763..e0acd09 100644 --- a/samples/calc/calc.cpp +++ b/samples/calc/calc.cpp @@ -11,9 +11,8 @@ namespace samples::calc { using namespace lug::language; -lug::syntax m; -double e, 
l, n, r, s; int i; +double e, l, n, r, s; double v[26]; extern rule Expr; @@ -21,22 +20,24 @@ extern rule Expr; implicit_space_rule BLANK = lexeme[ *"[ \t]"_rx ]; rule EOL = lexeme[ "[\n\r;]"_rx ]; -rule ID = lexeme[ capture(m)[ "[a-z]"_rx ] <[]() -> int { return m.capture().at(0) - 'a'; } ]; -rule NUMBER = lexeme[ capture(m)[ ~"[-+]"_rx > +"[0-9]"_rx > ~("."_sx > +"[0-9]"_rx) ] <[]{ return std::stod(std::string{m}); } ]; +rule ID = lexeme[ "[a-z]"_rx <[](syntax m) -> int { return m.str().at(0) - 'a'; } ]; +rule NUMBER = lexeme[ ( ~"[-+]"_rx > +"[0-9]"_rx > ~('.' > +"[0-9]"_rx) ) + <[](syntax m) -> double { return std::stod(std::string{m}); } ]; rule Value = n%NUMBER <[]{ return n; } | i%ID > !"="_sx <[]{ return v[i]; } - | "(" > e%Expr > ")" <[]{ return e; }; + | '(' > e%Expr > ')' <[]{ return e; }; rule Prod = l%Value > *( - "*" > r%Value <[]{ l *= r; } - | "/" > r%Value <[]{ l /= r; } + '*' > r%Value <[]{ l *= r; } + | '/' > r%Value <[]{ l /= r; } ) <[]{ return l; }; rule Sum = l%Prod > *( - "+" > r%Prod <[]{ l += r; } - | "-" > r%Prod <[]{ l -= r; } + '+' > r%Prod <[]{ l += r; } + | '-' > r%Prod <[]{ l -= r; } ) <[]{ return l; }; -rule Expr = i%ID > "=" > s%Sum <[]{ return v[i] = s; } +rule Expr = i%ID > '=' > s%Sum <[]{ return v[i] = s; } | s%Sum <[]{ return s; }; -rule Stmt = ( "quit"_isx <[]{ std::exit(EXIT_SUCCESS); } +rule Stmt = ( ( "exit"_isx + | "quit"_isx ) <[]{ std::exit(EXIT_SUCCESS); } | e%Expr <[]{ std::cout << e << "\n"; } ) > EOL | *( !EOL > any ) > EOL <[]{ std::cerr << "SYNTAX ERROR\n"; }; diff --git a/tests/captures.cpp b/tests/captures.cpp index e526bce..9bba93e 100644 --- a/tests/captures.cpp +++ b/tests/captures.cpp @@ -42,22 +42,22 @@ void test_capture_email_syntax() std::string_view const email = "user@example.com"; assert(lug::parse(email, G)); - assert(username.capture() == "user"); - assert(domain.capture() == "example"); - assert(tld.capture() == "com"); - assert(username.capture().data() == email.data()); - 
assert(domain.capture().data() == email.substr(5).data()); - assert(tld.capture().data() == email.substr(13).data()); + assert(username.str() == "user"); + assert(domain.str() == "example"); + assert(tld.str() == "com"); + assert(username.str().data() == email.data()); + assert(domain.str().data() == email.substr(5).data()); + assert(tld.str().data() == email.substr(13).data()); std::string const email2 = "not.an@email"; assert(!lug::parse(email2, G)); // failure to parse the above should not change captures, as no semantic actions should be executed - assert(username.capture() == "user"); - assert(domain.capture() == "example"); - assert(tld.capture() == "com"); - assert(username.capture().data() == email.data()); - assert(domain.capture().data() == email.substr(5).data()); - assert(tld.capture().data() == email.substr(13).data()); + assert(username.str() == "user"); + assert(domain.str() == "example"); + assert(tld.str() == "com"); + assert(username.str().data() == email.data()); + assert(domain.str().data() == email.substr(5).data()); + assert(tld.str().data() == email.substr(13).data()); } void test_capture_url_syntax() @@ -78,29 +78,29 @@ void test_capture_url_syntax() assert(lug::parse(url1, G)); assert(protocol == "https"); assert(domain == "www.example.com"); - assert(path.capture() == "/path/to/resource"); + assert(path.str() == "/path/to/resource"); assert(protocol.data() == url1.data()); assert(domain.data() != url1.data()); // std::string makes a copy - assert(path.capture().data() == url1.substr(23).data()); + assert(path.str().data() == url1.substr(23).data()); std::string const url2 = "http://api.example2.com/path/to/other/resource.html"; assert(lug::parse(url2, G)); assert(protocol == "http"); assert(domain == "api.example2.com"); - assert(path.capture() == "/path/to/other/resource.html"); + assert(path.str() == "/path/to/other/resource.html"); assert(protocol.data() == url2.c_str()); assert(domain.data() != url2.data()); // std::string makes a copy 
- assert(path.capture().data() == &url2[23]); + assert(path.str().data() == &url2[23]); std::string const url3 = "https://www.example3.com$path/to/resource"; assert(!lug::parse(url3, G)); // failure to parse the above should not change captures, as no semantic actions should be executed assert(protocol == "http"); assert(domain == "api.example2.com"); - assert(path.capture() == "/path/to/other/resource.html"); + assert(path.str() == "/path/to/other/resource.html"); assert(protocol.data() == url2.c_str()); assert(domain.data() != url2.data()); // std::string makes a copy - assert(path.capture().data() == &url2[23]); + assert(path.str().data() == &url2[23]); } void test_capture_comma_delimited_list() @@ -121,7 +121,7 @@ void test_capture_comma_delimited_list() assert(items[0] == "apple"); assert(items[1] == "banana"); assert(items[2] == "cherry"); - assert(item.capture() == "cherry"); // item should capture the last item parsed + assert(item.str() == "cherry"); // item should capture the last item parsed items.clear(); std::string const list2 = "123 , 456 ,789,987"; @@ -131,14 +131,14 @@ void test_capture_comma_delimited_list() assert(items[1] == "456"); assert(items[2] == "789"); assert(items[3] == "987"); - assert(item.capture() == "987"); // item should capture the last item parsed + assert(item.str() == "987"); // item should capture the last item parsed items.clear(); std::string_view const list3 = "one_single-item"; assert(lug::parse(list3, G)); assert(items.size() == 1); assert(items[0] == "one_single-item"); - assert(item.capture() == "one_single-item"); // item should capture the last item parsed + assert(item.str() == "one_single-item"); // item should capture the last item parsed // Test with an invalid list (no items) std::string const list4 = ""; @@ -146,7 +146,7 @@ void test_capture_comma_delimited_list() // After failing to parse, items should remain unchanged from the last successful parse assert(items.size() == 1); assert(items[0] == 
"one_single-item"); - assert(item.capture() == "one_single-item"); + assert(item.str() == "one_single-item"); } void test_capture_nested_calls() diff --git a/tests/leftrecursion.cpp b/tests/leftrecursion.cpp index 1d05e15..ccf631e 100644 --- a/tests/leftrecursion.cpp +++ b/tests/leftrecursion.cpp @@ -34,7 +34,7 @@ void test_indirect_left_recursion() Q = R > chr('a'); R = Q | chr('a'); S = R > !chr('a'); - grammar G = start(S); + grammar const G = start(S); assert(lug::parse("a", G)); assert(lug::parse("aa", G)); assert(lug::parse("aab", G)); @@ -55,9 +55,9 @@ void test_association_and_precedence() N = chr('1') | chr('2') | chr('3'); E = E[1] > chr('+') > E[2] <[&out]{ out += '+'; } | E[2] > chr('*') > E[3] <[&out]{ out += '*'; } - | N <[&out](syntax x){ out += x.capture(); }; + | N <[&out](syntax x){ out += x.str(); }; S = E > eoi; - grammar G = start(S); + grammar const G = start(S); out.clear(); assert(lug::parse("1", G) && out == "1"); out.clear(); From 74e575cd7839b6f26d7b1623940c77fb4f065d87 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 3 Jul 2024 23:29:21 -0700 Subject: [PATCH 17/19] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76d1c8c..4082923 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Features - Support for direct and indirect left recursion, with precedence levels to disambiguate subexpressions with mixed left/right recursion. - Full support for UTF-8 text parsing, including Level 1 and partial Level 2 compliance with the UTS #18 Unicode Regular Expressions technical standard. - Automatic tracking of line and column numbers, with customizable tab width and alignment. -- Header-only library utilizing C++17 language and library features. +- Header-only library utilizing C++17 language and library features. Forward compatible with C++20 and C++23. 
- Relatively small with the goal of keeping total line count across all header files under 5000 lines of terse code. It is based on research introduced in the following papers: From 37dc51be7ba70f65adc10e72ad6f5684cc2a45e1 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Thu, 4 Jul 2024 00:07:12 -0700 Subject: [PATCH 18/19] Update version for release --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7657b34..52473f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## Release v0.3.0 (Under Development) +## Release v0.3.0 (July 4, 2024) * Added list repetition operator `e1 >> e2` to the DSL that is shorthand for `e1 > *(e2 > e1)`. * Added support for parsing characters and character literals where applicable without explicitly needing to wrap them with `chr()` or `_cx`. From 3c11ef4a9960d99165bcae3da6143153e08ec7e2 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Thu, 4 Jul 2024 00:07:30 -0700 Subject: [PATCH 19/19] Updated version for release --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7aea5bf..fc89f24 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # See LICENSE file for copyright and license details # distribution version -VERSION = 0.3.0-pre +VERSION = 0.3.0 # paths PREFIX = /usr/local