From db7ad0bd76c5418399a24096b8dd13723a54015f Mon Sep 17 00:00:00 2001 From: Deukhoofd Date: Sun, 4 Oct 2020 17:15:28 +0200 Subject: [PATCH] Implements string lexing. --- src/Parser/Lexer/LexToken.hpp | 17 ++++++-- src/Parser/Lexer/LexTokenKind.hpp | 7 +-- src/Parser/Lexer/Lexer.cpp | 59 +++++++++++++++++++++----- src/Parser/Lexer/Lexer.hpp | 9 ++-- tests/LexerTests/NumericalLexTests.cpp | 16 ++++--- tests/LexerTests/StringLexTests.cpp | 33 ++++++++++++++ 6 files changed, 114 insertions(+), 27 deletions(-) create mode 100644 tests/LexerTests/StringLexTests.cpp diff --git a/src/Parser/Lexer/LexToken.hpp b/src/Parser/Lexer/LexToken.hpp index fc7ee18..896a34d 100644 --- a/src/Parser/Lexer/LexToken.hpp +++ b/src/Parser/Lexer/LexToken.hpp @@ -2,6 +2,7 @@ #define ELOHIMSCRIPT_LEXTOKEN_HPP #include +#include #include "LexTokenKind.hpp" namespace ElohimScript::Parser { @@ -22,21 +23,29 @@ namespace ElohimScript::Parser { [[nodiscard]] LexTokenKind GetKind() const noexcept override { return kind; } }; - class IntegerToken : public LexTokenImpl { + class IntegerLiteral : public LexTokenImpl { uint64_t _value; public: - IntegerToken(uint64_t value) : _value(value) {} + IntegerLiteral(uint64_t value) : _value(value) {} [[nodiscard]] uint64_t GetValue() const noexcept { return _value; } }; - class FloatToken : public LexTokenImpl { + class FloatLiteral : public LexTokenImpl { double _value; public: - FloatToken(double value) : _value(value) {} + FloatLiteral(double value) : _value(value) {} [[nodiscard]] double GetValue() const noexcept { return _value; } }; + + class StringLiteral : public LexTokenImpl { + std::u8string _value; + + public: + StringLiteral(std::u8string value) : _value(std::move(value)) {} + [[nodiscard]] const std::u8string& GetValue() const noexcept { return _value; } + }; } #endif // ELOHIMSCRIPT_LEXTOKEN_HPP diff --git a/src/Parser/Lexer/LexTokenKind.hpp b/src/Parser/Lexer/LexTokenKind.hpp index dfa443b..c33a5e7 100644 --- a/src/Parser/Lexer/LexTokenKind.hpp +++ b/src/Parser/Lexer/LexTokenKind.hpp @@ -62,9 +62,10 @@ namespace ElohimScript::Parser { ExclamationMarkIsSymbol, ColonColonSymbol, - // Misc - FloatToken, - IntegerToken, + // Literals + FloatLiteral, + IntegerLiteral, + StringLiteral, }; } diff --git a/src/Parser/Lexer/Lexer.cpp b/src/Parser/Lexer/Lexer.cpp index aef8cf6..231c66b 100644 --- a/src/Parser/Lexer/Lexer.cpp +++ b/src/Parser/Lexer/Lexer.cpp @@ -215,6 +215,13 @@ namespace ElohimScript::Parser { case u8'7': case u8'8': case u8'9': return LexNumerical(c); + case u8'\'': return LexString(u8'\'', false); + case u8'"': { + if (Peek() == '"' && Peek(2) == '\"') { + return LexString(u8'"', true); + } + return LexString(u8'"', false); + } default: return new LexTokenImpl(); } @@ -285,12 +292,12 @@ namespace ElohimScript::Parser { while (true) { auto v = (uint64_t)LexDecimalValue(Peek()); if (v == 255) { - if (!isDecimal && Peek() == '.') { + if (!isDecimal && Peek() == u8'.') { isDecimal = true; Progress(); continue; } - if (isDecimal && (Peek() == 'e' || Peek() == 'E')) { + if (isDecimal && (Peek() == u8'e' || Peek() == u8'E')) { isDecimal = false; isExponent = true; Progress(); @@ -316,12 +323,12 @@ namespace ElohimScript::Parser { if (isExponent) { val *= pow(10, exponentValue); } - return new FloatToken(val); + return new FloatLiteral(val); } - return new IntegerToken(value); + return new IntegerLiteral(value); } - IntegerToken* Lexer::LexHexadecimal() { + IntegerLiteral* Lexer::LexHexadecimal() { uint64_t value = 0; while (true) { auto v = LexHexadecimalValue(Peek()); @@ -332,9 +339,9 @@ namespace ElohimScript::Parser { value <<= 4; value += v; } - return new IntegerToken(value); + return new IntegerLiteral(value); } - IntegerToken* Lexer::LexOctal() { + IntegerLiteral* Lexer::LexOctal() { uint64_t value = 0; while (true) { auto v = LexOctalValue(Peek()); @@ -345,9 +352,9 @@ namespace ElohimScript::Parser { value <<= 3; value += v; } - return new IntegerToken(value); + return new IntegerLiteral(value); } - IntegerToken* Lexer::LexBinary() { + IntegerLiteral* Lexer::LexBinary() { uint64_t value = 0; while (true) { auto v = LexBinaryValue(Peek()); @@ -358,6 +365,38 @@ namespace ElohimScript::Parser { value <<= 1; value += v; } - return new IntegerToken(value); + return new IntegerLiteral(value); + } + StringLiteral* Lexer::LexString(char8_t opening, bool heredoc) { + Progress(); + if (heredoc) { + Progress(2); + } + auto start = _position; + size_t offset = 0; + while (true) { + auto current = Peek(offset); + if (heredoc) { + if (current == '"' && Peek(offset + 1) == '"' && Peek(offset + 2) == '"' && Peek(offset + 3) != '"') { + break; + } + } else if (current == opening) { + break; + } + if (current == u8'\0') { + // TODO: Log error + break; + } + if (!heredoc && (current == u8'\n' || current == u8'\r')) { + // TODO: log error + break; + } + offset++; + } + Progress(offset); + if (heredoc) { + Progress(2); + } + return new StringLiteral(std::u8string(_script.substr(start, offset))); } } diff --git a/src/Parser/Lexer/Lexer.hpp b/src/Parser/Lexer/Lexer.hpp index 0cfc59c..3d32991 100644 --- a/src/Parser/Lexer/Lexer.hpp +++ b/src/Parser/Lexer/Lexer.hpp @@ -8,6 +8,7 @@ namespace ElohimScript::Parser { class Lexer { public: Lexer(const char* script) : _script(reinterpret_cast(script)) {} + Lexer(const char8_t* script) : _script(script) {} Lexer(std::u8string_view script) : _script(script) {} const LexToken* Lex(); @@ -37,9 +38,11 @@ namespace ElohimScript::Parser { LexToken* LexNext(); LexToken* LexNumerical(char8_t); LexToken* LexDecimal(uint64_t initial); - IntegerToken* LexHexadecimal(); - IntegerToken* LexOctal(); - IntegerToken* LexBinary(); + IntegerLiteral* LexHexadecimal(); + IntegerLiteral* LexOctal(); + IntegerLiteral* LexBinary(); + + StringLiteral* LexString(char8_t opening, bool heredoc); }; } diff --git a/tests/LexerTests/NumericalLexTests.cpp b/tests/LexerTests/NumericalLexTests.cpp index 179ef2f..d2ffcaf 100644 --- a/tests/LexerTests/NumericalLexTests.cpp +++ b/tests/LexerTests/NumericalLexTests.cpp @@ -7,25 +7,24 @@ using namespace ElohimScript::Parser; TEST_CASE("Lex " script) { \ auto lexer = Lexer(script); \ const auto* token = lexer.Lex(); \ - REQUIRE(token->GetKind() == LexTokenKind::IntegerToken); \ - auto value = ((const IntegerToken*)token)->GetValue(); \ + REQUIRE(token->GetKind() == LexTokenKind::IntegerLiteral); \ + auto value = ((const IntegerLiteral*)token)->GetValue(); \ CHECK(value == (expected)); \ CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \ delete token; \ } -#define FLOAT_TEST(script, expected) \ +#define FLOAT_TEST(script, expected) \ TEST_CASE("Lex " script) { \ auto lexer = Lexer(script); \ const auto* token = lexer.Lex(); \ - REQUIRE(token->GetKind() == LexTokenKind::FloatToken); \ - auto value = ((const FloatToken*)token)->GetValue(); \ + REQUIRE(token->GetKind() == LexTokenKind::FloatLiteral); \ + auto value = ((const FloatLiteral*)token)->GetValue(); \ CHECK(value == (expected)); \ CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \ delete token; \ } - // Decimal lexing INTEGER_TEST("123456", 123456); INTEGER_TEST("0d123456", 123456); @@ -61,4 +60,7 @@ INTEGER_TEST("0b1", 1); INTEGER_TEST("0b11", 3); INTEGER_TEST("0b111", 7); INTEGER_TEST("0b1111", 15); -INTEGER_TEST("0b110011", 51); \ No newline at end of file +INTEGER_TEST("0b110011", 51); + +#undef INTEGER_TEST +#undef FLOAT_TEST \ No newline at end of file diff --git a/tests/LexerTests/StringLexTests.cpp b/tests/LexerTests/StringLexTests.cpp new file mode 100644 index 0000000..e1a8a47 --- /dev/null +++ b/tests/LexerTests/StringLexTests.cpp @@ -0,0 +1,33 @@ +#include "../../extern/doctest.hpp" +#include "../../src/Parser/Lexer/Lexer.hpp" + +using namespace ElohimScript::Parser; + +#define STRING_TEST(str, constraint) \ + TEST_CASE("Lex string " constraint str constraint) { \ + auto lexer = Lexer(constraint str constraint); \ + const auto* token = lexer.Lex(); \ + REQUIRE(token->GetKind() == LexTokenKind::StringLiteral); \ + auto value = ((const StringLiteral*)token)->GetValue(); \ + CHECK(value == std::u8string(reinterpret_cast(str))); \ + CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \ + delete token; \ + } + +STRING_TEST("foo bar", "'"); +STRING_TEST("foo bar", "\""); +STRING_TEST("foo bar", "\"\"\""); +STRING_TEST("\"foo bar\"", "\"\"\""); +STRING_TEST("\"\"foo bar\"\"", "\"\"\""); + +TEST_CASE("Lex multiline string") { + auto lexer = Lexer(R"("""foo +bar""")"); + const auto* token = lexer.Lex(); + REQUIRE(token->GetKind() == LexTokenKind::StringLiteral); + auto value = (dynamic_cast(token))->GetValue(); + CHECK(value == std::u8string(reinterpret_cast(R"(foo +bar)"))); + CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); + delete token; +} \ No newline at end of file