Initial work on parsing.

2020-10-07 22:11:18 +02:00
parent f299d5183f
commit 2036f1ce43
10 changed files with 276 additions and 55 deletions
--- a/src/Diagnostics/DiagnosticType.hpp
+++ b/src/Diagnostics/DiagnosticType.hpp
@@ -3,7 +3,12 @@
 #include <string>
 namespace MalachScript::Diagnostics {
-    enum class DiagnosticType : uint8_t { UnknownToken, InvalidNumericalBase, ExpectedEndOfString };
+    enum class DiagnosticType : uint8_t {
        // Kinds of diagnostics that can be reported. The quoted text next to each enumerator is the
        // message DiagnosticTypeHelper::ToEnglishString returns for it.
        UnknownToken,         // "Unknown token"
        InvalidNumericalBase, // "Invalid numerical base"
        ExpectedEndOfString,  // "Expected end of string"
        UnexpectedToken,      // "Unexpected Token"; logged by the parser through Parser::LogError
    };
    class DiagnosticTypeHelper {
        static std::string ToEnglishString(DiagnosticType type) {
@@ -11,6 +16,7 @@ namespace MalachScript::Diagnostics {
                case DiagnosticType::UnknownToken: return "Unknown token";
                case DiagnosticType::InvalidNumericalBase: return "Invalid numerical base";
                case DiagnosticType::ExpectedEndOfString: return "Expected end of string";
                case DiagnosticType::UnexpectedToken: return "Unexpected Token";
            }
            return std::to_string((uint8_t)type);
        }
--- a/src/Parser/Expressions/ParsedExperssion.hpp
+++ b/src/Parser/Expressions/ParsedExperssion.hpp
@@ -1,8 +0,0 @@
 #ifndef MALACHSCRIPT_PARSEDEXPERSSION_HPP
 #define MALACHSCRIPT_PARSEDEXPERSSION_HPP
 namespace MalachScript::Parser{
    class ParsedExpression{};
 }
 #endif // MALACHSCRIPT_PARSEDEXPERSSION_HPP
--- a/src/Parser/Lexer/LexToken.hpp
+++ b/src/Parser/Lexer/LexToken.hpp
@@ -2,13 +2,11 @@
 #define MALACHSCRIPT_LEXTOKEN_HPP
 #include <memory>
 #include <utility>
 #include "LexTokenKind.hpp"
 #include "../../TextSpan.hpp"
 namespace MalachScript::Parser {
    class LexToken {
        friend class Lexer;
        std::unique_ptr<const LexToken> _next;
        TextSpan _span;
@@ -18,12 +16,16 @@ namespace MalachScript::Parser {
        [[nodiscard]] virtual LexTokenKind GetKind() const noexcept = 0;
        [[nodiscard]] const std::unique_ptr<const LexToken>& GetNext() const noexcept { return _next; }
        [[nodiscard]] const TextSpan& GetSpan() const noexcept { return _span; }
        /// Installs `token` as the successor of this token.
        /// The raw pointer is handed to the owning `_next` unique_ptr, so this token becomes
        /// responsible for deleting it; a successor that was held before is destroyed.
        void SetNext(LexToken* token) { _next.reset(token); }
    };
    template <LexTokenKind kind> class LexTokenImpl : public LexToken {
    public:
        LexTokenImpl(TextSpan span) : LexToken(span){};
-        [[nodiscard]] LexTokenKind GetKind() const noexcept override { return kind; }
+        [[nodiscard]] LexTokenKind GetKind() const noexcept final { return kind; }
    };
    class IntegerLiteral : public LexTokenImpl<LexTokenKind::IntegerLiteral> {
@@ -53,12 +55,12 @@ namespace MalachScript::Parser {
    };
    // Lexer token for an identifier.
    // The identifier text is kept as a std::u8string_view, i.e. the token does NOT own the
    // characters: whoever constructs the token must keep the viewed buffer alive for as long as
    // GetValue() may be used.
    // NOTE(review): the lexer builds these from a substr of its script text, so the script
    // presumably has to outlive the token chain - confirm against the Lexer's ownership rules.
    class IdentifierToken : public LexTokenImpl<LexTokenKind::Identifier> {
-        std::u8string _value;
+        std::u8string_view _value;
    public:
-        IdentifierToken(TextSpan span, std::u8string value)
+        IdentifierToken(TextSpan span, std::u8string_view value)
-            : LexTokenImpl<LexTokenKind::Identifier>(span), _value(std::move(value)) {}
+            : LexTokenImpl<LexTokenKind::Identifier>(span), _value(value) {}
-        [[nodiscard]] const std::u8string& GetValue() const noexcept { return _value; }
+        [[nodiscard]] const std::u8string_view& GetValue() const noexcept { return _value; }
    };
 }
--- a/src/Parser/Lexer/Lexer.cpp
+++ b/src/Parser/Lexer/Lexer.cpp
@@ -12,7 +12,7 @@ namespace MalachScript::Parser {
        auto* last = first;
        while (true) {
            auto* next = LexNext();
-            last->_next = std::unique_ptr<const LexToken>(next);
+            last->SetNext(next);
            last = next;
            if (next->GetKind() == LexTokenKind::EndOfFile) {
                break;
@@ -459,8 +459,21 @@ namespace MalachScript::Parser {
    }
    // Compile-time hash of a NUL-terminated UTF-8 string; it produces the constants used as
    // `case Hash(u8"...")` labels when keywords are recognised.
    // Defined recursively: the empty string hashes to 5381, any other string hashes to
    // (first code unit) + 33 * hash(remaining string). The last character is therefore folded
    // in first, and the runtime string-view hash must process characters in that same order.
    static uint32_t constexpr Hash(const char8_t* input) {
-        return *input != 0U ? static_cast<uint32_t>(*input) + 33 * Hash(input + 1) : 5381;
+        if (*input != 0U) {
            // Not at the terminator yet: combine this code unit with the hash of the tail.
            return static_cast<uint32_t>(*input) + 33 * Hash(input + 1);
        } else {
            // Terminator reached: seed value of the hash.
            return 5381;
        }
    };
    // Runtime counterpart of Hash() for a string view, which need not be NUL-terminated.
    // The result is compared against `case Hash(u8"...")` labels, so it has to equal Hash() for
    // the same characters: start from 5381 and, walking from the LAST character to the first,
    // compute hash = hash * 33 + character.
    // The accumulator is explicitly uint32_t so the multiplication wraps modulo 2^32 exactly like
    // Hash() does; a plain `auto` accumulator would be a signed int and overflow (undefined
    // behaviour) for identifiers longer than a few characters.
    static uint32_t HashStringView(const std::u8string_view& sv) {
        uint32_t hash = 5381;
        for (auto it = sv.rbegin(); it != sv.rend(); ++it) {
            hash = hash * 33 + static_cast<uint32_t>(*it);
        }
        return hash;
    }
    LexToken* Lexer::LexKeywordOrIdentifier() {
        auto start = _position;
@@ -468,9 +481,9 @@ namespace MalachScript::Parser {
        while (IsAlphaNumericalOrUnderscore(Peek(offset))) {
            offset++;
        }
-        auto str = std::u8string(_script.substr(start, offset));
+        auto str = _script.substr(start, offset);
        Progress(offset - 1);
-        switch (Hash(str.c_str())) {
+        switch (HashStringView(str)) {
            case Hash(u8"and"): return Create<LexTokenImpl<LexTokenKind::AndKeyword>>(TextSpan(start, _position));
            case Hash(u8"abstract"):
                return Create<LexTokenImpl<LexTokenKind::AbstractKeyword>>(TextSpan(start, _position));
--- a/src/Parser/Parser.cpp
+++ b/src/Parser/Parser.cpp
@@ -1,15 +1,28 @@
 #include "Parser.hpp"
 #include <iostream>
 // Moves `token` (an lvalue pointer to a lex token) to its successor and then keeps stepping
 // while the token it points at is Whitespace, so afterwards `token` is the next
 // non-whitespace token.
 // The body is wrapped in do { ... } while (false) so that `PROGRESS_TOKEN(x);` is one single
 // statement and stays correct under an unbraced if/else; the argument is parenthesised.
 // NOTE(review): there is no null check. The lexer loop stops linking after EndOfFile, so that
 // token presumably has no successor - do not invoke this on the EndOfFile token.
 #define PROGRESS_TOKEN(token)                                                                                          \
    do {                                                                                                               \
        (token) = (token)->GetNext().get();                                                                            \
        while ((token)->GetKind() == LexTokenKind::Whitespace) {                                                       \
            (token) = (token)->GetNext().get();                                                                        \
        }                                                                                                              \
    } while (false)
 namespace MalachScript::Parser {
    ParsedScriptStatement* Parser::Parse() {
-        std::vector<const ParsedStatement*> statements(32);
+        std::vector<const ParsedStatement*> statements;
        statements.reserve(32);
        size_t current = 0;
        while (true) {
-            auto next = this->Consume();
+            while (_currentToken->GetKind() == LexTokenKind::Whitespace) {
-            if (next->GetKind() == LexTokenKind::EndOfFile) {
+                _currentToken = _currentToken->GetNext().get();
            }
            if (_currentToken->GetKind() == LexTokenKind::EndOfFile) {
                break;
            }
-            statements[current] = this->ParseStatement(next);
+            const ParsedStatement* statement;
            if (ParseClass(statement)) {
            }
            statements.push_back(statement);
            current++;
        }
        statements.resize(current);
@@ -17,10 +30,84 @@ namespace MalachScript::Parser {
        if (current > 0) {
            end = statements.back()->GetSpan().GetEnd();
        }
-        const auto* block = new ParsedBlockStatement(TextSpan(0, end), statements);
+        return new ParsedScriptStatement(TextSpan(0, end), statements);
        return new ParsedScriptStatement(block);
    }
-    const ParsedStatement* Parser::ParseStatement(const LexToken* token) {
+    bool Parser::ParseClass(const ParsedStatement*& out) {
-        // If modifier (shared, external, private, protected, etc) push to buffer, continue
+        const auto* current = _currentToken;
        // Attempts to parse a class declaration that begins at _currentToken:
        //   {shared|abstract|final|external} class <identifier> ;
        //   {shared|abstract|final|external} class <identifier> [: <identifier> {, <identifier>}] { <members> }
        //
        // Returns false - with `out` and _currentToken untouched - when the tokens are not a run
        // of class modifiers followed by the `class` keyword.
        // Otherwise returns true: `out` receives a ParsedClassStatement allocated with `new`
        // (the caller takes ownership) and _currentToken is moved past everything consumed.
        // Malformed input after `class` is reported through LogError (UnexpectedToken) instead of
        // aborting; header problems are reported at most once per class.
        const auto start = current->GetSpan().GetStart();

        // Steps to the next non-whitespace token, but never past EndOfFile (that token is the end
        // of the chain, so advancing from it would dereference a null successor).
        const auto advance = [&current]() {
            if (current->GetKind() != LexTokenKind::EndOfFile) {
                PROGRESS_TOKEN(current);
            }
        };

        // Skip any number of class modifiers; any other token before `class` means "not a class".
        while (current->GetKind() != LexTokenKind::ClassKeyword) {
            switch (current->GetKind()) {
                case LexTokenKind::SharedKeyword:
                case LexTokenKind::AbstractKeyword:
                case LexTokenKind::FinalKeyword:
                case LexTokenKind::ExternalKeyword: break;
                default: return false;
            }
            advance();
        }
        advance(); // consume the `class` keyword

        // True while no header error has been reported yet. This is the meaning ParseIdentifier
        // gives its `logError` parameter, so the flag starts out as true.
        bool logError = true;
        const auto reportOnce = [&]() {
            if (logError) {
                LogError(Diagnostics::DiagnosticType::UnexpectedToken, current->GetSpan());
                logError = false;
            }
        };
        // Consumes an identifier and returns its text. A token of any other kind is reported
        // (once), left in place for the caller to look at, and yields an empty view. The kind is
        // checked here so that only identifier tokens are ever handed to ParseIdentifier.
        const auto takeIdentifier = [&]() -> std::u8string_view {
            if (current->GetKind() != LexTokenKind::Identifier) {
                reportOnce();
                return {};
            }
            const auto value = ParseIdentifier(current, logError);
            advance();
            return value;
        };

        // After the class keyword an identifier should always follow; if it doesn't, log an error.
        const std::u8string_view identifier = takeIdentifier();

        std::vector<std::u8string_view> inherits;
        std::vector<const ParsedStatement*> body;
        body.reserve(16);

        // Optional inheritance list: ':' name {',' name}. Every name is followed by an advance,
        // including the first one.
        const bool hasInherits = current->GetKind() == LexTokenKind::ColonSymbol;
        if (hasInherits) {
            do {
                advance(); // consume ':' or ','
                const auto base = takeIdentifier();
                if (!base.empty()) {
                    inherits.push_back(base);
                }
            } while (current->GetKind() == LexTokenKind::CommaSymbol);
        }

        if (current->GetKind() == LexTokenKind::OpenCurlyParenthesisSymbol) {
            advance(); // consume '{'
            while (true) {
                // Cheapest operation, check first.
                if (current->GetKind() == LexTokenKind::CloseCurlyParenthesisSymbol) {
                    advance();
                    break;
                }
                // Unterminated body: report it and stop instead of spinning on EndOfFile.
                if (current->GetKind() == LexTokenKind::EndOfFile) {
                    LogError(Diagnostics::DiagnosticType::UnexpectedToken, current->GetSpan());
                    break;
                }
                const ParsedStatement* statement = nullptr;
                // TODO: decide the order in which the member parsers are tried.
                // NOTE(review): the member parsers receive no token and `current` is not written
                // back to _currentToken before they run - confirm how they are meant to find
                // their input (and advance it) once they are implemented.
                if (ParseVirtProp(statement) || ParseFunc(statement) || ParseVar(statement) ||
                    ParseFuncDef(statement)) {
                    body.push_back(statement);
                } else {
                    // Nothing recognises this token: report it and skip it so the loop progresses.
                    LogError(Diagnostics::DiagnosticType::UnexpectedToken, current->GetSpan());
                    advance();
                }
            }
        } else if (!hasInherits && current->GetKind() == LexTokenKind::SemicolonSymbol) {
            advance(); // declaration without a body: `class Name;`
        } else {
            // Neither a body nor (for a class without base list) a terminating ';'.
            reportOnce();
        }

        // NOTE(review): `current` has already been moved past the closing ';' or '}', so the span
        // ends at the end of the token FOLLOWING the class - confirm this is intended.
        out = new ParsedClassStatement(TextSpan(start, current->GetSpan().GetEnd()), identifier,
                                       inherits, body);
        _currentToken = current;
        return true;
    }
    // Placeholders for the class-member parsers tried by ParseClass. None of them is implemented
    // yet: each one ignores its argument, leaves `out` untouched and returns false ("no match").
    bool Parser::ParseVirtProp([[maybe_unused]]const ParsedStatement*& out) { return false; }
    bool Parser::ParseFunc([[maybe_unused]]const ParsedStatement*& out) { return false; }
    bool Parser::ParseVar([[maybe_unused]]const ParsedStatement*& out) { return false; }
    bool Parser::ParseFuncDef([[maybe_unused]]const ParsedStatement*& out) { return false; }
 }
--- a/src/Parser/Parser.hpp
+++ b/src/Parser/Parser.hpp
@@ -1,32 +1,43 @@
 #ifndef MALACHSCRIPT_PARSER_HPP
 #define MALACHSCRIPT_PARSER_HPP
 #include "../Diagnostics/Diagnostics.hpp"
 #include "Lexer/LexToken.hpp"
 #include "Statements/ParsedStatement.hpp"
 namespace MalachScript::Parser {
    class Parser {
    public:
-        Parser(const LexToken* firstToken) : _currentToken(firstToken) {}
+        Parser(const char* scriptName, const LexToken* firstToken, Diagnostics::Diagnostics* diagnostics)
            : _scriptName(reinterpret_cast<const char8_t*>(scriptName)), _diagnostics(diagnostics),
              _currentToken(firstToken) {}
        Parser(std::u8string_view scriptName, const LexToken* firstToken, Diagnostics::Diagnostics* diagnostics)
            : _scriptName(scriptName), _diagnostics(diagnostics), _currentToken(firstToken) {}
        ParsedScriptStatement* Parse();
    private:
        std::u8string_view _scriptName;
        Diagnostics::Diagnostics* _diagnostics;
        const LexToken* _currentToken;
-        inline const LexToken* Peek() {
+
-            if (_currentToken->GetKind() == LexTokenKind::EndOfFile) {
+        inline void LogError(Diagnostics::DiagnosticType type, const TextSpan& span) {
-                return _currentToken;
+            _diagnostics->LogError(type, _scriptName, span);
            }
            return _currentToken->GetNext().get();
        }
-        inline const LexToken* Consume() {
+        bool ParseClass(const ParsedStatement*& out);
-            if (_currentToken->GetKind() == LexTokenKind::EndOfFile) {
+        bool ParseVirtProp(const ParsedStatement*& out);
-                return _currentToken;
+        bool ParseFunc(const ParsedStatement*& out);
-            }
+        bool ParseVar(const ParsedStatement*& out);
-            _currentToken = _currentToken->GetNext().get();
+        bool ParseFuncDef(const ParsedStatement*& out);
            return _currentToken;
        }
-        const ParsedStatement* ParseStatement(const LexToken* token);
+        std::u8string_view ParseIdentifier(const LexToken* token, bool& logError) {
            // Returns the text of `token` when it is an identifier token.
            // For a token of any other kind an empty view is returned. An UnexpectedToken
            // diagnostic is emitted only while `logError` is true, and the flag is then cleared,
            // so a caller that shares one flag across several calls reports at most one error.
            // The kind is checked regardless of `logError`: a non-identifier token must never
            // reach the downcast below.
            if (token->GetKind() != LexTokenKind::Identifier) {
                if (logError) {
                    LogError(Diagnostics::DiagnosticType::UnexpectedToken, token->GetSpan());
                    logError = false;
                }
                return std::u8string_view();
            }
            // Tokens reporting the Identifier kind are expected to be IdentifierToken instances;
            // static_cast is the appropriate cast for a downcast within the class hierarchy.
            return static_cast<const IdentifierToken*>(token)->GetValue();
        }
    };
 }
--- a/src/Parser/Statements/ParsedStatement.hpp
+++ b/src/Parser/Statements/ParsedStatement.hpp
@@ -10,6 +10,7 @@ namespace MalachScript::Parser {
    public:
        ParsedStatement(TextSpan span) : _span(span) {}
        virtual ~ParsedStatement() = default;
        [[nodiscard]] virtual ParsedStatementKind GetKind() const noexcept = 0;
        [[nodiscard]] inline const TextSpan& GetSpan() const noexcept { return _span; }
    };
@@ -17,15 +18,15 @@ namespace MalachScript::Parser {
    template <ParsedStatementKind kind> class ParsedStatementImpl : public ParsedStatement {
    public:
        ParsedStatementImpl(TextSpan span) : ParsedStatement(span) {}
-        [[nodiscard]] inline ParsedStatementKind GetKind() const noexcept override { return kind; }
+        [[nodiscard]] inline ParsedStatementKind GetKind() const noexcept final { return kind; }
    };
-    class ParsedBlockStatement : public ParsedStatementImpl<ParsedStatementKind::Block> {
+    class ParsedScriptStatement : public ParsedStatementImpl<ParsedStatementKind::Script> {
        std::vector<std::unique_ptr<const ParsedStatement>> _statements;
    public:
-        ParsedBlockStatement(TextSpan span, const std::vector<const ParsedStatement*>& statements)
+        ParsedScriptStatement(TextSpan span, const std::vector<const ParsedStatement*>& statements)
-            : ParsedStatementImpl<ParsedStatementKind::Block>(span), _statements(statements.size()) {
+            : ParsedStatementImpl<ParsedStatementKind::Script>(span), _statements(statements.size()) {
            for (size_t i = 0; i < statements.size(); i++)
                _statements[i] = std::unique_ptr<const ParsedStatement>(statements[i]);
        }
@@ -35,15 +36,18 @@ namespace MalachScript::Parser {
        }
    };
-    class ParsedScriptStatement : public ParsedStatementImpl<ParsedStatementKind::Script> {
+    class ParsedClassStatement : public ParsedStatementImpl<ParsedStatementKind::Class> {
-        std::unique_ptr<const ParsedBlockStatement> _block;
+        std::u8string_view _identifier;
        std::vector<std::u8string_view> _inherits;
        std::vector<std::unique_ptr<const ParsedStatement>> _body;
    public:
-        ParsedScriptStatement(const ParsedBlockStatement* block)
+        ParsedClassStatement(TextSpan span, std::u8string_view identifier, std::vector<std::u8string_view> inherits,
-            : ParsedStatementImpl<ParsedStatementKind::Script>(block->GetSpan()), _block(block) {}
+                             const std::vector<const ParsedStatement*>& body)
-
+            : ParsedStatementImpl<ParsedStatementKind::Class>(span), _identifier(identifier), _inherits(inherits),
-        [[nodiscard]] inline const std::unique_ptr<const ParsedBlockStatement>& GetBlock() const noexcept {
+              _body(body.size()) {
-            return _block;
+            for (size_t i = 0; i < body.size(); i++)
                _body[i] = std::unique_ptr<const ParsedStatement>(body[i]);
        }
    };
 }
--- a/src/Parser/Statements/ParsedStatementKind.hpp
+++ b/src/Parser/Statements/ParsedStatementKind.hpp
@@ -2,10 +2,12 @@
 #define MALACHSCRIPT_PARSEDSTATEMENTKIND_HPP
 namespace MalachScript::Parser {
-    enum class ParsedStatementKind{
+    enum class ParsedStatementKind : uint8_t {
        Unknown,
        Block,
        Script,
        ExpressionStatement,
        Class,
    };
 }
--- a/tests/ParserTests/ClassTests.cpp
+++ b/tests/ParserTests/ClassTests.cpp
@@ -0,0 +1,77 @@
 #include "../../extern/doctest.hpp"
 #include "../../src/Parser/Parser.hpp"
 using namespace MalachScript;
 // Hand-written variant of the token-level parser test: feeds `class foobar ;` followed by
 // EndOfFile to the parser and expects one Class statement and no diagnostics.
 TEST_CASE("Parse basic class without body") {
    std::vector<Parser::LexToken*> tokens = {
        new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
        new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
        new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0)),
        new Parser::LexTokenImpl<Parser::LexTokenKind::EndOfFile>(TextSpan(0, 0)),
    };
    // Chain the tokens: each one takes ownership of its successor.
    for (size_t next = 1; next < tokens.size(); ++next) {
        tokens[next - 1]->SetNext(tokens[next]);
    }

    Diagnostics::Diagnostics diags;
    Parser::Parser parser(u8"class without body", tokens.front(), &diags);
    auto* script = parser.Parse();

    REQUIRE(diags.GetMessages().empty());
    REQUIRE(script->GetStatements().size() == 1);
    REQUIRE(script->GetStatements()[0]->GetKind() == Parser::ParsedStatementKind::Class);

    // Deleting the head token releases the whole chain through the owning next-pointers.
    delete tokens[0];
    delete script;
 }
 // Declares a doctest TEST_CASE called `name` that runs the parser on a hand-built token chain.
 //   tokens  - comma-separated `new`-allocated lex tokens (wrap them in PARSER_TEST_TOKENS so the
 //             commas survive macro argument splitting); an EndOfFile token is appended.
 //   asserts - statements executed after parsing; `script` (the parse result) and `diags` are in
 //             scope. The macro itself already requires that no diagnostics were produced.
 // The tokens are linked with SetNext, so deleting vec[0] at the end releases the whole chain;
 // the parsed script is deleted as well.
 #define PARSER_TEST(name, tokens, asserts)                                                                             \
    TEST_CASE(name) {                                                                                                  \
        std::vector<Parser::LexToken*> vec = {                                                                         \
            tokens,                                                                                                    \
            new Parser::LexTokenImpl<Parser::LexTokenKind::EndOfFile>(TextSpan(0, 0)),                                 \
        };                                                                                                             \
        for (size_t i = 0; i < vec.size() - 1; i++) {                                                                  \
            vec[i]->SetNext(vec[i + 1]);                                                                               \
        }                                                                                                              \
        Diagnostics::Diagnostics diags;                                                                                \
        auto parser = Parser::Parser(u8"scriptname", vec.front(), &diags);                                             \
        auto* script = parser.Parse();                                                                                 \
        REQUIRE(diags.GetMessages().empty());                                                                          \
        asserts;                                                                                                       \
        delete vec[0];                                                                                                 \
        delete script;                                                                                                 \
    }
 #define PARSER_TEST_TOKENS(...) __VA_ARGS__
 PARSER_TEST("Parse basic class without body",
            PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
                               new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
                               new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0))),
            {
                REQUIRE(script->GetStatements().size() == 1);
                REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
            })
 PARSER_TEST("Parse basic class without body with whitespaces",
            PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
                               new Parser::LexTokenImpl<Parser::LexTokenKind::Whitespace>(TextSpan(0, 0)),
                               new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
                               new Parser::LexTokenImpl<Parser::LexTokenKind::Whitespace>(TextSpan(0, 0)),
                               new Parser::LexTokenImpl<Parser::LexTokenKind::SemicolonSymbol>(TextSpan(0, 0))),
            {
                REQUIRE(script->GetStatements().size() == 1);
                REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
            })
 PARSER_TEST(
    "Parse basic class with empty body",
    PARSER_TEST_TOKENS(new Parser::LexTokenImpl<Parser::LexTokenKind::ClassKeyword>(TextSpan(0, 0)),
                       new Parser::IdentifierToken(TextSpan(0, 0), u8"foobar"),
                       new Parser::LexTokenImpl<Parser::LexTokenKind::OpenCurlyParenthesisSymbol>(TextSpan(0, 0)),
                       new Parser::LexTokenImpl<Parser::LexTokenKind::CloseCurlyParenthesisSymbol>(TextSpan(0, 0))),
    {
        REQUIRE(script->GetStatements().size() == 1);
        REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
    })
--- a/tests/ParserTests/ParserIntegrationTests.cpp
+++ b/tests/ParserTests/ParserIntegrationTests.cpp
@@ -0,0 +1,27 @@
 #include "../../extern/doctest.hpp"
 #include "../../src/Parser/Lexer/Lexer.hpp"
 #include "../../src/Parser/Parser.hpp"
 using namespace MalachScript;
 // Declares a doctest TEST_CASE called `name` that lexes `scriptText`, parses the resulting
 // tokens and then runs `asserts`, with `diags`, `token` and `script` in scope.
 // The script returned by Parse() is allocated with `new`, so it is deleted once the asserts
 // have run (the token-level parser tests do the same).
 // NOTE(review): who owns the token chain returned by Lex() is not visible here, so `token` is
 // deliberately not deleted - confirm against the Lexer.
 #define PARSE_TEST(name, scriptText, asserts)                                                                          \
    TEST_CASE(name) {                                                                                                  \
        Diagnostics::Diagnostics diags;                                                                                \
        auto lexer = Parser::Lexer(name, scriptText, &diags);                                                          \
        auto token = lexer.Lex();                                                                                      \
        auto parser = Parser::Parser(name, token, &diags);                                                             \
        auto script = parser.Parse();                                                                                  \
        asserts;                                                                                                       \
        delete script;                                                                                                 \
    }
 PARSE_TEST("Parse class without definition", "class foobar;", {
    REQUIRE(diags.GetMessages().empty());
    REQUIRE(script->GetStatements().size() == 1);
    REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
 })
 PARSE_TEST("Parse class with empty definition", "class foobar {}", {
    REQUIRE(diags.GetMessages().empty());
    REQUIRE(script->GetStatements().size() == 1);
    REQUIRE(script->GetStatements()[0].get()->GetKind() == Parser::ParsedStatementKind::Class);
 })