Initial work on parsing.

This commit is contained in:
Deukhoofd 2020-10-07 22:11:18 +02:00
parent f299d5183f
commit 2036f1ce43
Signed by: Deukhoofd
GPG Key ID: F63E044490819F6F
10 changed files with 276 additions and 55 deletions

View File

@ -3,7 +3,12 @@
#include <string> #include <string>
namespace MalachScript::Diagnostics { namespace MalachScript::Diagnostics {
enum class DiagnosticType : uint8_t { UnknownToken, InvalidNumericalBase, ExpectedEndOfString }; enum class DiagnosticType : uint8_t {
UnknownToken,
InvalidNumericalBase,
ExpectedEndOfString,
UnexpectedToken,
};
class DiagnosticTypeHelper { class DiagnosticTypeHelper {
static std::string ToEnglishString(DiagnosticType type) { static std::string ToEnglishString(DiagnosticType type) {
@ -11,6 +16,7 @@ namespace MalachScript::Diagnostics {
case DiagnosticType::UnknownToken: return "Unknown token"; case DiagnosticType::UnknownToken: return "Unknown token";
case DiagnosticType::InvalidNumericalBase: return "Invalid numerical base"; case DiagnosticType::InvalidNumericalBase: return "Invalid numerical base";
case DiagnosticType::ExpectedEndOfString: return "Expected end of string"; case DiagnosticType::ExpectedEndOfString: return "Expected end of string";
case DiagnosticType::UnexpectedToken: return "Unexpected Token";
} }
return std::to_string((uint8_t)type); return std::to_string((uint8_t)type);
} }

View File

@ -1,8 +0,0 @@
// Include guard for the ParsedExpression header.
// NOTE(review): the guard macro spells "EXPERSSION" (letters transposed); it is used consistently below.
#ifndef MALACHSCRIPT_PARSEDEXPERSSION_HPP
#define MALACHSCRIPT_PARSEDEXPERSSION_HPP
namespace MalachScript::Parser{
// Empty placeholder type: it declares no members and no base class.
class ParsedExpression{};
}
#endif // MALACHSCRIPT_PARSEDEXPERSSION_HPP

View File

@ -2,13 +2,11 @@
#define MALACHSCRIPT_LEXTOKEN_HPP #define MALACHSCRIPT_LEXTOKEN_HPP
#include <memory> #include <memory>
#include <utility>
#include "LexTokenKind.hpp" #include "LexTokenKind.hpp"
#include "../../TextSpan.hpp"
namespace MalachScript::Parser { namespace MalachScript::Parser {
class LexToken { class LexToken {
friend class Lexer;
std::unique_ptr<const LexToken> _next; std::unique_ptr<const LexToken> _next;
TextSpan _span; TextSpan _span;
@ -18,12 +16,16 @@ namespace MalachScript::Parser {
[[nodiscard]] virtual LexTokenKind GetKind() const noexcept = 0; [[nodiscard]] virtual LexTokenKind GetKind() const noexcept = 0;
[[nodiscard]] const std::unique_ptr<const LexToken>& GetNext() const noexcept { return _next; } [[nodiscard]] const std::unique_ptr<const LexToken>& GetNext() const noexcept { return _next; }
[[nodiscard]] const TextSpan& GetSpan() const noexcept { return _span; } [[nodiscard]] const TextSpan& GetSpan() const noexcept { return _span; }
/// Makes `token` the successor of this token.
/// The pointer is stored in the owning `_next` unique_ptr, so this token takes
/// ownership of it; any successor held before the call is destroyed.
void SetNext(LexToken* token) {
    _next.reset(token);
}
}; };
template <LexTokenKind kind> class LexTokenImpl : public LexToken { template <LexTokenKind kind> class LexTokenImpl : public LexToken {
public: public:
LexTokenImpl(TextSpan span) : LexToken(span){}; LexTokenImpl(TextSpan span) : LexToken(span){};
[[nodiscard]] LexTokenKind GetKind() const noexcept override { return kind; } [[nodiscard]] LexTokenKind GetKind() const noexcept final { return kind; }
}; };
class IntegerLiteral : public LexTokenImpl<LexTokenKind::IntegerLiteral> { class IntegerLiteral : public LexTokenImpl<LexTokenKind::IntegerLiteral> {
@ -53,12 +55,12 @@ namespace MalachScript::Parser {
}; };
class IdentifierToken : public LexTokenImpl<LexTokenKind::Identifier> { class IdentifierToken : public LexTokenImpl<LexTokenKind::Identifier> {
std::u8string _value; std::u8string_view _value;
public: public:
IdentifierToken(TextSpan span, std::u8string value) IdentifierToken(TextSpan span, std::u8string_view value)
: LexTokenImpl<LexTokenKind::Identifier>(span), _value(std::move(value)) {} : LexTokenImpl<LexTokenKind::Identifier>(span), _value(value) {}
[[nodiscard]] const std::u8string& GetValue() const noexcept { return _value; } [[nodiscard]] const std::u8string_view& GetValue() const noexcept { return _value; }
}; };
} }

View File

@ -12,7 +12,7 @@ namespace MalachScript::Parser {
auto* last = first; auto* last = first;
while (true) { while (true) {
auto* next = LexNext(); auto* next = LexNext();
last->_next = std::unique_ptr<const LexToken>(next); last->SetNext(next);
last = next; last = next;
if (next->GetKind() == LexTokenKind::EndOfFile) { if (next->GetKind() == LexTokenKind::EndOfFile) {
break; break;
@ -459,8 +459,21 @@ namespace MalachScript::Parser {
} }
static uint32_t constexpr Hash(const char8_t* input) { static uint32_t constexpr Hash(const char8_t* input) {
return *input != 0U ? static_cast<uint32_t>(*input) + 33 * Hash(input + 1) : 5381; if (*input != 0U) {
return static_cast<uint32_t>(*input) + 33 * Hash(input + 1);
} else {
return 5381;
}
}; };
/// Hashes the characters of `sv` with the same recurrence as the recursive Hash()
/// helper (seed 5381, multiplier 33, characters folded in from last to first), so
/// the result can be compared against `Hash(u8"...")` case labels.
///
/// @param sv The characters to hash; no terminating zero is required.
/// @return The 32-bit hash; an empty view yields the seed 5381.
static uint32_t HashStringView(const std::u8string_view& sv) {
    // The accumulator is explicitly unsigned: arithmetic wraps modulo 2^32. With a
    // deduced `int`, the multiplication overflows a signed type for longer inputs,
    // which is undefined behaviour.
    uint32_t hash = 5381;
    for (auto it = sv.rbegin(); it != sv.rend(); ++it) {
        hash = hash * 33 + static_cast<uint32_t>(*it);
    }
    return hash;
}
LexToken* Lexer::LexKeywordOrIdentifier() { LexToken* Lexer::LexKeywordOrIdentifier() {
auto start = _position; auto start = _position;
@ -468,9 +481,9 @@ namespace MalachScript::Parser {
while (IsAlphaNumericalOrUnderscore(Peek(offset))) { while (IsAlphaNumericalOrUnderscore(Peek(offset))) {
offset++; offset++;
} }
auto str = std::u8string(_script.substr(start, offset)); auto str = _script.substr(start, offset);
Progress(offset - 1); Progress(offset - 1);
switch (Hash(str.c_str())) { switch (HashStringView(str)) {
case Hash(u8"and"): return Create<LexTokenImpl<LexTokenKind::AndKeyword>>(TextSpan(start, _position)); case Hash(u8"and"): return Create<LexTokenImpl<LexTokenKind::AndKeyword>>(TextSpan(start, _position));
case Hash(u8"abstract"): case Hash(u8"abstract"):
return Create<LexTokenImpl<LexTokenKind::AbstractKeyword>>(TextSpan(start, _position)); return Create<LexTokenImpl<LexTokenKind::AbstractKeyword>>(TextSpan(start, _position));

View File

@ -1,15 +1,28 @@
#include "Parser.hpp" #include "Parser.hpp"
#include <iostream>
// Advances `token` to its successor and then keeps advancing while the current
// token is of kind Whitespace, leaving `token` on the first non-whitespace token.
// `token` must be an assignable pointer variable. The successor is dereferenced
// without a null check, so a following token has to exist.
// The body is wrapped in do { } while (false) so the macro expands to a single
// statement and is safe inside an unbraced if/else; invoke it with a trailing ';'.
#define PROGRESS_TOKEN(token)                                                                                          \
    do {                                                                                                               \
        (token) = (token)->GetNext().get();                                                                            \
        while ((token)->GetKind() == LexTokenKind::Whitespace) {                                                       \
            (token) = (token)->GetNext().get();                                                                        \
        }                                                                                                              \
    } while (false)
namespace MalachScript::Parser { namespace MalachScript::Parser {
ParsedScriptStatement* Parser::Parse() { ParsedScriptStatement* Parser::Parse() {
std::vector<const ParsedStatement*> statements(32); std::vector<const ParsedStatement*> statements;
statements.reserve(32);
size_t current = 0; size_t current = 0;
while (true) { while (true) {
auto next = this->Consume(); while (_currentToken->GetKind() == LexTokenKind::Whitespace) {
if (next->GetKind() == LexTokenKind::EndOfFile) { _currentToken = _currentToken->GetNext().get();
}
if (_currentToken->GetKind() == LexTokenKind::EndOfFile) {
break; break;
} }
statements[current] = this->ParseStatement(next); const ParsedStatement* statement;
if (ParseClass(statement)) {
}
statements.push_back(statement);
current++; current++;
} }
statements.resize(current); statements.resize(current);
@ -17,10 +30,84 @@ namespace MalachScript::Parser {
if (current > 0) { if (current > 0) {
end = statements.back()->GetSpan().GetEnd(); end = statements.back()->GetSpan().GetEnd();
} }
const auto* block = new ParsedBlockStatement(TextSpan(0, end), statements); return new ParsedScriptStatement(TextSpan(0, end), statements);
return new ParsedScriptStatement(block);
} }
const ParsedStatement* Parser::ParseStatement(const LexToken* token) { bool Parser::ParseClass(const ParsedStatement*& out) {
// If modifier (shared, external, private, protected, etc) push to buffer, continue const auto* current = _currentToken;
auto start = current->GetSpan().GetStart();
bool lookingForClass = true;
bool encounteredError = false;
while (lookingForClass) {
switch (current->GetKind()) {
case LexTokenKind::SharedKeyword: break;
case LexTokenKind::AbstractKeyword: break;
case LexTokenKind::FinalKeyword: break;
case LexTokenKind::ExternalKeyword: break;
case LexTokenKind::ClassKeyword: lookingForClass = false; break;
default: return false;
}
PROGRESS_TOKEN(current);
}
std::u8string_view identifier;
// After class keyword, an identifier should always follow, if it doesn't, log an error.
identifier = ParseIdentifier(current, encounteredError);
PROGRESS_TOKEN(current);
std::vector<std::u8string_view> inherits;
std::vector<const ParsedStatement*> body;
body.reserve(16);
switch (current->GetKind()) {
case LexTokenKind::SemicolonSymbol: {
PROGRESS_TOKEN(current);
break;
}
case LexTokenKind::ColonSymbol: {
PROGRESS_TOKEN(current);
auto id = ParseIdentifier(current, encounteredError);
inherits.push_back(id);
while (current->GetKind() == LexTokenKind::CommaSymbol) {
PROGRESS_TOKEN(current);
id = ParseIdentifier(current, encounteredError);
inherits.push_back(id);
PROGRESS_TOKEN(current);
}
if (!encounteredError && current->GetKind() != LexTokenKind::OpenCurlyParenthesisSymbol)
{
encounteredError = true;
LogError(Diagnostics::DiagnosticType::UnexpectedToken, current->GetSpan());
}
// Intentionally don't break so we continue into the inner body statement.
}
case LexTokenKind::OpenCurlyParenthesisSymbol: {
PROGRESS_TOKEN(current);
while (true) {
// Cheapest operation, check first
if (current->GetKind() == LexTokenKind::CloseCurlyParenthesisSymbol) {
PROGRESS_TOKEN(current);
break;
}
const ParsedStatement* statement;
// TODO: Sort by
if (!ParseVirtProp(statement) &&
!ParseFunc(statement) &&
!ParseVar(statement) &&
!ParseFuncDef(statement)){
LogError(Diagnostics::DiagnosticType::UnexpectedToken, current->GetSpan());
} else{
body.push_back(statement);
} }
} }
break;
}
default: throw;
}
out = new ParsedClassStatement(TextSpan(start, current->GetSpan().GetEnd()), identifier,
inherits, body);
_currentToken = current;
return true;
}
// Stub for the ParseVirtProp rule: it always reports "no match" and never writes to `out`.
bool Parser::ParseVirtProp([[maybe_unused]] const ParsedStatement*& out) {
    return false;
}
// Stub for the ParseFunc rule: it always reports "no match" and never writes to `out`.
bool Parser::ParseFunc([[maybe_unused]] const ParsedStatement*& out) {
    return false;
}
// Stub for the ParseVar rule: it always reports "no match" and never writes to `out`.
bool Parser::ParseVar([[maybe_unused]] const ParsedStatement*& out) {
    return false;
}
// Stub for the ParseFuncDef rule: it always reports "no match" and never writes to `out`.
bool Parser::ParseFuncDef([[maybe_unused]] const ParsedStatement*& out) {
    return false;
}
}

View File

@ -1,32 +1,43 @@
#ifndef MALACHSCRIPT_PARSER_HPP #ifndef MALACHSCRIPT_PARSER_HPP
#define MALACHSCRIPT_PARSER_HPP #define MALACHSCRIPT_PARSER_HPP
#include "../Diagnostics/Diagnostics.hpp"
#include "Lexer/LexToken.hpp" #include "Lexer/LexToken.hpp"
#include "Statements/ParsedStatement.hpp" #include "Statements/ParsedStatement.hpp"
namespace MalachScript::Parser { namespace MalachScript::Parser {
class Parser { class Parser {
public: public:
Parser(const LexToken* firstToken) : _currentToken(firstToken) {} Parser(const char* scriptName, const LexToken* firstToken, Diagnostics::Diagnostics* diagnostics)
: _scriptName(reinterpret_cast<const char8_t*>(scriptName)), _diagnostics(diagnostics),
_currentToken(firstToken) {}
Parser(std::u8string_view scriptName, const LexToken* firstToken, Diagnostics::Diagnostics* diagnostics)
: _scriptName(scriptName), _diagnostics(diagnostics), _currentToken(firstToken) {}
ParsedScriptStatement* Parse(); ParsedScriptStatement* Parse();
private: private:
std::u8string_view _scriptName;
Diagnostics::Diagnostics* _diagnostics;
const LexToken* _currentToken; const LexToken* _currentToken;
inline const LexToken* Peek() {
if (_currentToken->GetKind() == LexTokenKind::EndOfFile) { inline void LogError(Diagnostics::DiagnosticType type, const TextSpan& span) {
return _currentToken; _diagnostics->LogError(type, _scriptName, span);
}
return _currentToken->GetNext().get();
} }
inline const LexToken* Consume() { bool ParseClass(const ParsedStatement*& out);
if (_currentToken->GetKind() == LexTokenKind::EndOfFile) { bool ParseVirtProp(const ParsedStatement*& out);
return _currentToken; bool ParseFunc(const ParsedStatement*& out);
} bool ParseVar(const ParsedStatement*& out);
_currentToken = _currentToken->GetNext().get(); bool ParseFuncDef(const ParsedStatement*& out);
return _currentToken;
}
const ParsedStatement* ParseStatement(const LexToken* token); std::u8string_view ParseIdentifier(const LexToken* token, bool& logError) {
if (logError && token->GetKind() != LexTokenKind::Identifier) {
LogError(Diagnostics::DiagnosticType::UnexpectedToken, token->GetSpan());
logError = false;
return std::u8string_view();
}
return reinterpret_cast<const IdentifierToken*>(token)->GetValue();
}
}; };
} }

View File

@ -10,6 +10,7 @@ namespace MalachScript::Parser {
public: public:
ParsedStatement(TextSpan span) : _span(span) {} ParsedStatement(TextSpan span) : _span(span) {}
virtual ~ParsedStatement() = default;
[[nodiscard]] virtual ParsedStatementKind GetKind() const noexcept = 0; [[nodiscard]] virtual ParsedStatementKind GetKind() const noexcept = 0;
[[nodiscard]] inline const TextSpan& GetSpan() const noexcept { return _span; } [[nodiscard]] inline const TextSpan& GetSpan() const noexcept { return _span; }
}; };
@ -17,15 +18,15 @@ namespace MalachScript::Parser {
template <ParsedStatementKind kind> class ParsedStatementImpl : public ParsedStatement { template <ParsedStatementKind kind> class ParsedStatementImpl : public ParsedStatement {
public: public:
ParsedStatementImpl(TextSpan span) : ParsedStatement(span) {} ParsedStatementImpl(TextSpan span) : ParsedStatement(span) {}
[[nodiscard]] inline ParsedStatementKind GetKind() const noexcept override { return kind; } [[nodiscard]] inline ParsedStatementKind GetKind() const noexcept final { return kind; }
}; };
class ParsedBlockStatement : public ParsedStatementImpl<ParsedStatementKind::Block> { class ParsedScriptStatement : public ParsedStatementImpl<ParsedStatementKind::Script> {
std::vector<std::unique_ptr<const ParsedStatement>> _statements; std::vector<std::unique_ptr<const ParsedStatement>> _statements;
public: public:
ParsedBlockStatement(TextSpan span, const std::vector<const ParsedStatement*>& statements) ParsedScriptStatement(TextSpan span, const std::vector<const ParsedStatement*>& statements)
: ParsedStatementImpl<ParsedStatementKind::Block>(span), _statements(statements.size()) { : ParsedStatementImpl<ParsedStatementKind::Script>(span), _statements(statements.size()) {
for (size_t i = 0; i < statements.size(); i++) for (size_t i = 0; i < statements.size(); i++)
_statements[i] = std::unique_ptr<const ParsedStatement>(statements[i]); _statements[i] = std::unique_ptr<const ParsedStatement>(statements[i]);
} }
@ -35,15 +36,18 @@ namespace MalachScript::Parser {
} }
}; };
class ParsedScriptStatement : public ParsedStatementImpl<ParsedStatementKind::Script> { class ParsedClassStatement : public ParsedStatementImpl<ParsedStatementKind::Class> {
std::unique_ptr<const ParsedBlockStatement> _block; std::u8string_view _identifier;
std::vector<std::u8string_view> _inherits;
std::vector<std::unique_ptr<const ParsedStatement>> _body;
public: public:
ParsedScriptStatement(const ParsedBlockStatement* block) ParsedClassStatement(TextSpan span, std::u8string_view identifier, std::vector<std::u8string_view> inherits,
: ParsedStatementImpl<ParsedStatementKind::Script>(block->GetSpan()), _block(block) {} const std::vector<const ParsedStatement*>& body)
: ParsedStatementImpl<ParsedStatementKind::Class>(span), _identifier(identifier), _inherits(inherits),
[[nodiscard]] inline const std::unique_ptr<const ParsedBlockStatement>& GetBlock() const noexcept { _body(body.size()) {
return _block; for (size_t i = 0; i < body.size(); i++)
_body[i] = std::unique_ptr<const ParsedStatement>(body[i]);
} }
}; };
} }

View File

@ -2,10 +2,12 @@
#define MALACHSCRIPT_PARSEDSTATEMENTKIND_HPP #define MALACHSCRIPT_PARSEDSTATEMENTKIND_HPP
namespace MalachScript::Parser { namespace MalachScript::Parser {
enum class ParsedStatementKind{ enum class ParsedStatementKind : uint8_t {
Unknown, Unknown,
Block, Block,
Script, Script,
ExpressionStatement,
Class,
}; };
} }

View File

@ -0,0 +1,77 @@
#include "../../extern/doctest.hpp"
#include "../../src/Parser/Parser.hpp"
using namespace MalachScript;
// Feeds a hand-built token chain (class keyword, identifier "foobar", semicolon,
// EndOfFile) to the parser and expects no diagnostics and exactly one statement
// of kind Class in the resulting script.
TEST_CASE("Parse basic class without body") {
    std::vector<Parser::LexToken*> tokens;
    tokens.push_back(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)));
    tokens.push_back(new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"));
    tokens.push_back(new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0)));
    tokens.push_back(new Parser::LexTokenImpl<Parser::LexTokenKind::EndOfFile>(TextSpan(0, 0)));

    // Link every token to the one that follows it.
    for (size_t following = 1; following < tokens.size(); following++) {
        tokens[following - 1]->SetNext(tokens[following]);
    }

    Diagnostics::Diagnostics diags;
    auto parser = Parser::Parser(u8"class without body", tokens.front(), &diags);
    auto* script = parser.Parse();

    REQUIRE(diags.GetMessages().empty());
    REQUIRE(script->GetStatements().size() == 1);
    REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);

    // Only the head token and the script are deleted explicitly.
    delete tokens[0];
    delete script;
}
// PARSER_TEST(name, tokens, asserts) declares a doctest TEST_CASE called `name` that:
//   1. collects `tokens` plus a trailing EndOfFile token into the vector `vec`,
//   2. links each token to the one after it with SetNext,
//   3. runs Parser::Parser over the chain under the script name "scriptname",
//   4. requires that no diagnostics were recorded, then executes `asserts`.
// `asserts` is pasted into the test body, so it may refer to `vec`, `diags`, `parser` and `script`.
// Only vec[0] and `script` are deleted explicitly.
// NOTE(review): presumably the remaining tokens are released through the chain built by SetNext - confirm.
#define PARSER_TEST(name, tokens, asserts) \
TEST_CASE(name) { \
std::vector<Parser::LexToken*> vec = { \
tokens, \
new Parser::LexTokenImpl<Parser::LexTokenKind::EndOfFile>(TextSpan(0, 0)), \
}; \
for (size_t i = 0; i < vec.size() - 1; i++) { \
vec[i]->SetNext(vec[i + 1]); \
} \
Diagnostics::Diagnostics diags; \
auto parser = Parser::Parser(u8"scriptname", vec.front(), &diags); \
auto* script = parser.Parse(); \
REQUIRE(diags.GetMessages().empty()); \
asserts; \
delete vec[0]; \
delete script; \
}
// Expands to its arguments unchanged. Wrapping a comma-separated token list in it
// lets the whole list be passed as the single `tokens` argument of PARSER_TEST.
#define PARSER_TEST_TOKENS(...) __VA_ARGS__
// Token chain: class keyword, identifier "foobar", semicolon.
// Expects a script holding exactly one statement, of kind Class.
PARSER_TEST("Parse basic class without body",
PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0))),
{
REQUIRE(script->GetStatements().size() == 1);
REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
})
// Same chain as the body-less class case, but with a Whitespace token after the
// class keyword and after the identifier. Still expects exactly one Class statement.
PARSER_TEST("Parse basic class without body with whitespaces",
PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
new Parser::LexTokenImpl<Parser::LexTokenKind::Whitespace>(TextSpan(0, 0)),
new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
new Parser::LexTokenImpl<Parser::LexTokenKind::Whitespace>(TextSpan(0, 0)),
new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0))),
{
REQUIRE(script->GetStatements().size() == 1);
REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
})
// Token chain: class keyword, identifier "foobar", opening curly brace, closing curly brace.
// Expects a script holding exactly one statement, of kind Class.
PARSER_TEST(
"Parse basic class with empty body",
PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
new Parser::LexTokenImpl<Parser::LexTokenKind::OpenCurlyParenthesisSymbol>(TextSpan(0, 0)),
new Parser::LexTokenImpl<Parser::LexTokenKind::CloseCurlyParenthesisSymbol>(TextSpan(0, 0))),
{
REQUIRE(script->GetStatements().size() == 1);
REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
})

View File

@ -0,0 +1,27 @@
#include "../../extern/doctest.hpp"
#include "../../src/Parser/Lexer/Lexer.hpp"
#include "../../src/Parser/Parser.hpp"
using namespace MalachScript;
// PARSE_TEST(name, scriptText, asserts) declares a doctest TEST_CASE called `name`
// that lexes `scriptText`, parses the resulting tokens and then executes `asserts`.
// `name` is also passed as the script name to both the lexer and the parser.
// `asserts` is pasted into the test body, so it may refer to `diags`, `lexer`,
// `token`, `parser` and `script`.
// The script returned by Parse() is heap-allocated, so it is deleted once the
// assertions have run.
// NOTE(review): the token chain returned by Lex() is not released here; who owns
// it is not visible from this file - confirm.
#define PARSE_TEST(name, scriptText, asserts)                                                                          \
    TEST_CASE(name) {                                                                                                  \
        Diagnostics::Diagnostics diags;                                                                                \
        auto lexer = Parser::Lexer(name, scriptText, &diags);                                                          \
        auto token = lexer.Lex();                                                                                      \
        auto parser = Parser::Parser(name, token, &diags);                                                             \
        auto script = parser.Parse();                                                                                  \
        asserts;                                                                                                       \
        delete script;                                                                                                 \
    }
// Lexes and parses the text "class foobar;" and expects no diagnostics and a
// script holding exactly one statement, of kind Class.
PARSE_TEST("Parse class without definition", "class foobar;", {
REQUIRE(diags.GetMessages().empty());
REQUIRE(script->GetStatements().size() == 1);
REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
})
// Lexes and parses the text "class foobar {}" and expects no diagnostics and a
// script holding exactly one statement, of kind Class.
PARSE_TEST("Parse class with empty definition", "class foobar {}", {
REQUIRE(diags.GetMessages().empty());
REQUIRE(script->GetStatements().size() == 1);
REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
})