Lex Strings

2019-05-22 13:24:28 +02:00
parent 23991ab2ea
commit 6eb005ab3f
6 changed files with 125 additions and 4 deletions
--- a/src/Diagnostics/DiagnosticCode.hpp
+++ b/src/Diagnostics/DiagnosticCode.hpp
@@ -3,9 +3,14 @@
 #define PORYGONLANG_DIAGNOSTICCODE_HPP
 /// Codes identifying the kind of problem passed to the diagnostics logger.
 /// Enumerators are grouped by the compilation phase that reports them; their
 /// order (and therefore their underlying values) is kept as declared.
 enum class DiagnosticCode {
    // ---- Lexing ----
    UnexpectedCharacter,            ///< The lexer met a character it could not start or finish a token with.
    InvalidStringControlCharacter,  ///< A backslash in a string literal was followed by an unknown escape character.

    // ---- Parsing ----
    UnexpectedToken,

    // ---- Binding ----
    NoBinaryOperationFound,
    NoUnaryOperationFound,
 };
--- a/src/Parser/Lexer.cpp
+++ b/src/Parser/Lexer.cpp
@@ -1,5 +1,7 @@
 #include <utility>
 #include <cmath>
 #include <unordered_map>
 #include <sstream>
 #include "Lexer.hpp"
@@ -60,11 +62,15 @@ IToken* Lexer::LexNext(char c){
            return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1);
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            return LexNumber(c);
        case '"':
            return LexString(c);
        case '\'':
            return LexString(c);
        case '_':
-            return LexIdentifierOrKeyword(c);
+            return LexIdentifierOrKeyword();
        default:
            if (isalpha(c)){
-                return LexIdentifierOrKeyword(c);
+                return LexIdentifierOrKeyword();
            }
            this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);
            return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1);
@@ -143,7 +149,7 @@ unsigned constexpr const_hash(char const *input) {
           5381;
 }
-IToken* Lexer::LexIdentifierOrKeyword(char c){
+IToken * Lexer::LexIdentifierOrKeyword() {
    auto start = this -> Position - 1;
    auto end = start;
    while (true){
@@ -182,3 +188,56 @@ IToken* Lexer::LexIdentifierOrKeyword(char c){
        default: return new IdentifierToken(s, start, s.length());
    }
 }
 // Escape table for string literals: maps the character that follows a
 // backslash to the character it stands for. Characters missing from this
 // table are reported as invalid control characters by LexString.
 const unordered_map<char, char> ControlCharacters{
        // Characters that merely need escaping to appear literally.
        {'\\', '\\'},
        {'\'', '\''},
        {'"',  '"'},
        {'\?', '\?'},
        // Whitespace and layout controls.
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'v', '\v'},
        {'f', '\f'},
        // Remaining control codes.
        {'0', '\0'},
        {'a', '\a'},
        {'b', '\b'},
 };
 /// Lexes a string literal whose opening quote has already been consumed.
 ///
 /// @param c The opening quote character; the literal ends at the next
 ///          unescaped occurrence of this same character.
 /// @return A StringToken holding the literal's content with escape sequences
 ///         resolved through ControlCharacters, or a BadToken (after logging
 ///         UnexpectedCharacter) when the input ends before the closing quote.
 ///
 /// An unknown escape sequence logs InvalidStringControlCharacter and the
 /// character after the backslash is kept as-is.
 IToken* Lexer::LexString(char c){
    auto start = this -> Position - 1;
    auto end = start;
    // Tracks whether the character about to be read is escaped by an
    // immediately preceding, itself unescaped, backslash. Comparing only with
    // the previous character would misread an escaped backslash right before
    // the closing quote (e.g. "a\\") as escaping that quote.
    bool escaped = false;
    while (true){
        char next = this -> Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this -> Next();
        end++;
        escaped = (next == '\\' && !escaped);
    }
    auto closeToken = this -> Next();
    if (closeToken != c){
        // Input ended before the closing quote was found.
        this -> ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->Position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, start, end -start + 1);
    }

    // Raw text between the quotes, escape sequences still unresolved.
    string s = this -> ScriptString.substr(start + 1, end - start);
    stringstream stream;
    for (string::size_type i = 0; i < s.size(); i++){
        char current = s[i];
        if (current != '\\'){
            stream << current;
            continue;
        }
        // Step onto the character that follows the backslash. The scan above
        // never ends the literal on an unescaped backslash, so one exists.
        i++;
        current = s[i];
        auto replacement = ControlCharacters.find(current);
        if (replacement != ControlCharacters.end()) {
            stream << replacement -> second;
        } else{
            this -> ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
            stream << current;
        }
    }
    // NOTE(review): the reported length (end - start) equals the number of raw
    // characters between the quotes and does not count the two quote
    // characters themselves - confirm this is the intended token span.
    return new StringToken(stream.str(), start, end - start );
 }
--- a/src/Parser/Lexer.hpp
+++ b/src/Parser/Lexer.hpp
@@ -17,12 +17,14 @@ public:
    char Next();
    IToken* LexNext(char c);
    IToken* LexNumber(char c);
-    IToken *LexIdentifierOrKeyword(char c);
+    IToken *LexIdentifierOrKeyword();
    IToken *LexString(char c);
 public:
    Script* ScriptData;
    vector<IToken*> Lex();
    explicit Lexer(string scriptString, class Script* script);
 };
--- a/src/Parser/Token.hpp
+++ b/src/Parser/Token.hpp
@@ -1,3 +1,5 @@
 #include <utility>
 #ifndef PORYGONLANG_TOKEN_HPP
 #define PORYGONLANG_TOKEN_HPP
@@ -73,6 +75,19 @@ public:
    }
 };
 /// Token for a string literal; reports TokenKind::String and carries the
 /// literal's text in Value.
 class StringToken : public IToken{
 public:
    /// Text of the literal as supplied by the creator of the token.
    string Value;

    /// @param value    Text to store in the token (moved into Value).
    /// @param position Forwarded to the IToken base.
    /// @param length   Forwarded to the IToken base.
    explicit StringToken(string value, unsigned int position, unsigned int length)
            : IToken(position, length), Value(std::move(value)){
    }

    TokenKind GetKind() override{
        return TokenKind::String;
    }
 };
 class IdentifierToken : public IToken{
 public:
    string Value;
--- a/src/Parser/TokenKind.hpp
+++ b/src/Parser/TokenKind.hpp
@@ -19,6 +19,7 @@ enum class TokenKind{
    Integer,
    Float,
    String,
    AndKeyword,
    BreakKeyword,
--- a/tests/parser/LexerTests.cpp
+++ b/tests/parser/LexerTests.cpp
@@ -276,4 +276,43 @@ TEST_CASE( "Lex End Position", "[lexer]" ) {
    CHECK(((IdentifierToken*)tokens[3]) -> GetEndPosition() == 11);
    CHECK(((IdentifierToken*)tokens[4]) -> GetEndPosition() == 12);
 }
 // A double-quoted literal must lex to a String token holding the text
 // between the quotes; the lexer is expected to return two tokens in total.
 TEST_CASE("Lex Double Quote String", "[lexer]") {
    Lexer stringLexer("\"foo bar\"", nullptr);
    auto lexed = stringLexer.Lex();
    REQUIRE(lexed.size() == 2);

    IToken* head = lexed[0];
    REQUIRE(head -> GetKind() == TokenKind::String);

    auto* asString = static_cast<StringToken*>(head);
    REQUIRE(asString -> Value == "foo bar");
 }
 // A single-quoted literal must lex to a String token holding the text
 // between the quotes; the lexer is expected to return two tokens in total.
 TEST_CASE("Lex Single Quote String", "[lexer]") {
    Lexer stringLexer("'foo bar'", nullptr);
    auto lexed = stringLexer.Lex();
    REQUIRE(lexed.size() == 2);

    IToken* head = lexed[0];
    REQUIRE(head -> GetKind() == TokenKind::String);

    auto* asString = static_cast<StringToken*>(head);
    REQUIRE(asString -> Value == "foo bar");
 }
 // An escaped double quote must end up as a plain '"' in the token value.
 // The first script is double-quoted, so the escaped quote is also the
 // delimiter and must not terminate the literal (the case this test is named
 // for). The second script is the single-quoted form, where the escape is
 // only resolved through the control character table.
 TEST_CASE("Lex Double Quote String, Escape Quote", "[lexer]") {
    const char* scripts[] = {
        "\"foo\\\"bar\"",
        "'foo\\\"bar'",
    };
    for (const char* script : scripts) {
        Lexer lexer = Lexer(script, nullptr);
        auto tokens = lexer.Lex();
        REQUIRE(tokens.size() == 2);
        IToken* firstToken = tokens[0];
        REQUIRE(firstToken -> GetKind() == TokenKind::String);
        REQUIRE(((StringToken*)firstToken) -> Value == "foo\"bar");
    }
 }
 // The two-character escape sequence backslash + 'n' inside a literal must
 // become a single newline character in the token value.
 TEST_CASE("Lex String with newline", "[lexer]") {
    Lexer stringLexer("'foo\\nbar'", nullptr);
    auto lexed = stringLexer.Lex();
    REQUIRE(lexed.size() == 2);

    IToken* head = lexed[0];
    REQUIRE(head -> GetKind() == TokenKind::String);

    auto* asString = static_cast<StringToken*>(head);
    REQUIRE(asString -> Value == "foo\nbar");
 }
 #endif