Implements string lexing.

This commit is contained in:
Deukhoofd 2020-10-04 17:15:28 +02:00
parent e0c52f4ae7
commit db7ad0bd76
Signed by: Deukhoofd
GPG Key ID: F63E044490819F6F
6 changed files with 114 additions and 27 deletions

View File

@ -2,6 +2,7 @@
#define ELOHIMSCRIPT_LEXTOKEN_HPP #define ELOHIMSCRIPT_LEXTOKEN_HPP
#include <memory> #include <memory>
#include <utility>
#include "LexTokenKind.hpp" #include "LexTokenKind.hpp"
namespace ElohimScript::Parser { namespace ElohimScript::Parser {
@ -22,21 +23,29 @@ namespace ElohimScript::Parser {
[[nodiscard]] LexTokenKind GetKind() const noexcept override { return kind; } [[nodiscard]] LexTokenKind GetKind() const noexcept override { return kind; }
}; };
class IntegerToken : public LexTokenImpl<LexTokenKind::IntegerToken> { class IntegerLiteral : public LexTokenImpl<LexTokenKind::IntegerLiteral> {
uint64_t _value; uint64_t _value;
public: public:
IntegerToken(uint64_t value) : _value(value) {} IntegerLiteral(uint64_t value) : _value(value) {}
[[nodiscard]] uint64_t GetValue() const noexcept { return _value; } [[nodiscard]] uint64_t GetValue() const noexcept { return _value; }
}; };
class FloatToken : public LexTokenImpl<LexTokenKind::FloatToken> { class FloatLiteral : public LexTokenImpl<LexTokenKind::FloatLiteral> {
double _value; double _value;
public: public:
FloatToken(double value) : _value(value) {} FloatLiteral(double value) : _value(value) {}
[[nodiscard]] double GetValue() const noexcept { return _value; } [[nodiscard]] double GetValue() const noexcept { return _value; }
}; };
/// Lex token that carries the text of a string literal as a std::u8string.
class StringLiteral : public LexTokenImpl<LexTokenKind::StringLiteral> {
    std::u8string _value;

public:
    /// Stores the given text in the token (moved in, so passing an rvalue avoids a copy).
    /// Explicit so that a bare std::u8string is never converted into a token implicitly.
    explicit StringLiteral(std::u8string value) : _value(std::move(value)) {}

    /// Returns the stored text by reference; the reference refers to this token's own member.
    [[nodiscard]] const std::u8string& GetValue() const noexcept { return _value; }
};
} }
#endif // ELOHIMSCRIPT_LEXTOKEN_HPP #endif // ELOHIMSCRIPT_LEXTOKEN_HPP

View File

@ -62,9 +62,10 @@ namespace ElohimScript::Parser {
ExclamationMarkIsSymbol, ExclamationMarkIsSymbol,
ColonColonSymbol, ColonColonSymbol,
// Misc // Literals
FloatToken, FloatLiteral,
IntegerToken, IntegerLiteral,
StringLiteral,
}; };
} }

View File

@ -215,6 +215,13 @@ namespace ElohimScript::Parser {
case u8'7': case u8'7':
case u8'8': case u8'8':
case u8'9': return LexNumerical(c); case u8'9': return LexNumerical(c);
case u8'\'': return LexString(u8'\'', false);
case u8'"': {
if (Peek() == '"' && Peek(2) == '\"') {
return LexString(u8'"', true);
}
return LexString(u8'"', false);
}
default: return new LexTokenImpl<LexTokenKind::Unknown>(); default: return new LexTokenImpl<LexTokenKind::Unknown>();
} }
@ -285,12 +292,12 @@ namespace ElohimScript::Parser {
while (true) { while (true) {
auto v = (uint64_t)LexDecimalValue(Peek()); auto v = (uint64_t)LexDecimalValue(Peek());
if (v == 255) { if (v == 255) {
if (!isDecimal && Peek() == '.') { if (!isDecimal && Peek() == u8'.') {
isDecimal = true; isDecimal = true;
Progress(); Progress();
continue; continue;
} }
if (isDecimal && (Peek() == 'e' || Peek() == 'E')) { if (isDecimal && (Peek() == u8'e' || Peek() == u8'E')) {
isDecimal = false; isDecimal = false;
isExponent = true; isExponent = true;
Progress(); Progress();
@ -316,12 +323,12 @@ namespace ElohimScript::Parser {
if (isExponent) { if (isExponent) {
val *= pow(10, exponentValue); val *= pow(10, exponentValue);
} }
return new FloatToken(val); return new FloatLiteral(val);
} }
return new IntegerToken(value); return new IntegerLiteral(value);
} }
IntegerToken* Lexer::LexHexadecimal() { IntegerLiteral* Lexer::LexHexadecimal() {
uint64_t value = 0; uint64_t value = 0;
while (true) { while (true) {
auto v = LexHexadecimalValue(Peek()); auto v = LexHexadecimalValue(Peek());
@ -332,9 +339,9 @@ namespace ElohimScript::Parser {
value <<= 4; value <<= 4;
value += v; value += v;
} }
return new IntegerToken(value); return new IntegerLiteral(value);
} }
IntegerToken* Lexer::LexOctal() { IntegerLiteral* Lexer::LexOctal() {
uint64_t value = 0; uint64_t value = 0;
while (true) { while (true) {
auto v = LexOctalValue(Peek()); auto v = LexOctalValue(Peek());
@ -345,9 +352,9 @@ namespace ElohimScript::Parser {
value <<= 3; value <<= 3;
value += v; value += v;
} }
return new IntegerToken(value); return new IntegerLiteral(value);
} }
IntegerToken* Lexer::LexBinary() { IntegerLiteral* Lexer::LexBinary() {
uint64_t value = 0; uint64_t value = 0;
while (true) { while (true) {
auto v = LexBinaryValue(Peek()); auto v = LexBinaryValue(Peek());
@ -358,6 +365,38 @@ namespace ElohimScript::Parser {
value <<= 1; value <<= 1;
value += v; value += v;
} }
return new IntegerToken(value); return new IntegerLiteral(value);
}
/// Lexes one string literal and returns it as a newly allocated token.
///
/// @param opening Quote character that terminates a regular (non-heredoc) literal.
/// @param heredoc True when the literal is delimited by three double quotes. A heredoc
///                is not cut short by line breaks, a regular literal is.
/// @return A StringLiteral holding the scanned characters, copied verbatim from the script
///         (no escape processing is applied here).
StringLiteral* Lexer::LexString(char8_t opening, bool heredoc) {
    // Step over the opening delimiter: one character, plus two more for a heredoc.
    // NOTE(review): presumes the lexer is positioned on the first opening quote - confirm in LexNext.
    Progress();
    if (heredoc) {
        Progress(2);
    }

    const auto contentBegin = _position;
    size_t length = 0;
    for (;; ++length) {
        const auto ch = Peek(length);

        // A heredoc only ends on three quotes that are NOT followed by a fourth one, so
        // quotes directly in front of the closing delimiter stay part of the content.
        const bool terminated =
            heredoc ? (ch == '"' && Peek(length + 1) == '"' && Peek(length + 2) == '"' && Peek(length + 3) != '"')
                    : (ch == opening);
        if (terminated) {
            break;
        }

        const bool endOfScript = (ch == u8'\0');
        if (endOfScript) {
            // TODO: Log error
            break;
        }

        const bool lineBreak = (ch == u8'\n' || ch == u8'\r');
        if (lineBreak && !heredoc) {
            // TODO: log error
            break;
        }
    }

    // Advance by the scanned length; a heredoc advances two further characters.
    // NOTE(review): presumably this leaves the lexer on the last closing quote - confirm against Progress/Consume.
    Progress(length);
    if (heredoc) {
        Progress(2);
    }
    return new StringLiteral(std::u8string(_script.substr(contentBegin, length)));
}
} }

View File

@ -8,6 +8,7 @@ namespace ElohimScript::Parser {
class Lexer { class Lexer {
public: public:
Lexer(const char* script) : _script(reinterpret_cast<const char8_t*>(script)) {} Lexer(const char* script) : _script(reinterpret_cast<const char8_t*>(script)) {}
Lexer(const char8_t* script) : _script(script) {}
Lexer(std::u8string_view script) : _script(script) {} Lexer(std::u8string_view script) : _script(script) {}
const LexToken* Lex(); const LexToken* Lex();
@ -37,9 +38,11 @@ namespace ElohimScript::Parser {
LexToken* LexNext(); LexToken* LexNext();
LexToken* LexNumerical(char8_t); LexToken* LexNumerical(char8_t);
LexToken* LexDecimal(uint64_t initial); LexToken* LexDecimal(uint64_t initial);
IntegerToken* LexHexadecimal(); IntegerLiteral* LexHexadecimal();
IntegerToken* LexOctal(); IntegerLiteral* LexOctal();
IntegerToken* LexBinary(); IntegerLiteral* LexBinary();
StringLiteral* LexString(char8_t opening, bool heredoc);
}; };
} }

View File

@ -7,25 +7,24 @@ using namespace ElohimScript::Parser;
TEST_CASE("Lex " script) { \ TEST_CASE("Lex " script) { \
auto lexer = Lexer(script); \ auto lexer = Lexer(script); \
const auto* token = lexer.Lex(); \ const auto* token = lexer.Lex(); \
REQUIRE(token->GetKind() == LexTokenKind::IntegerToken); \ REQUIRE(token->GetKind() == LexTokenKind::IntegerLiteral); \
auto value = ((const IntegerToken*)token)->GetValue(); \ auto value = ((const IntegerLiteral*)token)->GetValue(); \
CHECK(value == (expected)); \ CHECK(value == (expected)); \
CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \ CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \
delete token; \ delete token; \
} }
#define FLOAT_TEST(script, expected) \ #define FLOAT_TEST(script, expected) \
TEST_CASE("Lex " script) { \ TEST_CASE("Lex " script) { \
auto lexer = Lexer(script); \ auto lexer = Lexer(script); \
const auto* token = lexer.Lex(); \ const auto* token = lexer.Lex(); \
REQUIRE(token->GetKind() == LexTokenKind::FloatToken); \ REQUIRE(token->GetKind() == LexTokenKind::FloatLiteral); \
auto value = ((const FloatToken*)token)->GetValue(); \ auto value = ((const FloatLiteral*)token)->GetValue(); \
CHECK(value == (expected)); \ CHECK(value == (expected)); \
CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \ CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile); \
delete token; \ delete token; \
} }
// Decimal lexing // Decimal lexing
INTEGER_TEST("123456", 123456); INTEGER_TEST("123456", 123456);
INTEGER_TEST("0d123456", 123456); INTEGER_TEST("0d123456", 123456);
@ -62,3 +61,6 @@ INTEGER_TEST("0b11", 3);
INTEGER_TEST("0b111", 7); INTEGER_TEST("0b111", 7);
INTEGER_TEST("0b1111", 15); INTEGER_TEST("0b1111", 15);
INTEGER_TEST("0b110011", 51); INTEGER_TEST("0b110011", 51);
#undef INTEGER_TEST
#undef FLOAT_TEST

View File

@ -0,0 +1,33 @@
#include "../../extern/doctest.hpp"
#include "../../src/Parser/Lexer/Lexer.hpp"
using namespace ElohimScript::Parser;
// Declares a test case that lexes `str` wrapped in `constraint` on both sides and expects
// exactly one StringLiteral token whose value equals `str`, followed by EndOfFile.
#define STRING_TEST(str, constraint)                                                     \
    TEST_CASE("Lex string " constraint str constraint) {                                 \
        const std::u8string expected(reinterpret_cast<const char8_t*>(str));             \
        auto lexer = Lexer(constraint str constraint);                                   \
        const auto* token = lexer.Lex();                                                 \
        REQUIRE(token->GetKind() == LexTokenKind::StringLiteral);                        \
        auto value = static_cast<const StringLiteral*>(token)->GetValue();               \
        CHECK(value == expected);                                                        \
        CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile);                   \
        delete token;                                                                    \
    }

// Single-quoted and double-quoted literals.
STRING_TEST("foo bar", "'");
STRING_TEST("foo bar", "\"");
// Triple-quoted literals; quotes directly next to the delimiters belong to the value.
STRING_TEST("foo bar", "\"\"\"");
STRING_TEST("\"foo bar\"", "\"\"\"");
STRING_TEST("\"\"foo bar\"\"", "\"\"\"");
// A triple-quoted literal may contain a line break; the value keeps it verbatim.
TEST_CASE("Lex multiline string") {
    auto lexer = Lexer(R"("""foo
bar""")");
    const auto* token = lexer.Lex();
    REQUIRE(token->GetKind() == LexTokenKind::StringLiteral);
    // Check the downcast so a type mismatch is reported as a test failure instead of
    // dereferencing a null pointer.
    const auto* literal = dynamic_cast<const StringLiteral*>(token);
    REQUIRE(literal != nullptr);
    // GetValue() returns a const reference, so bind to it rather than copying the string.
    const auto& value = literal->GetValue();
    CHECK(value == std::u8string(reinterpret_cast<const char8_t*>(R"(foo
bar)")));
    CHECK(token->GetNext()->GetKind() == LexTokenKind::EndOfFile);
    delete token;
}