From 6eb005ab3f1227c9cb9383956ae883ac5e0e3035 Mon Sep 17 00:00:00 2001 From: Deukhoofd Date: Wed, 22 May 2019 13:24:28 +0200 Subject: [PATCH] Lex Strings --- src/Diagnostics/DiagnosticCode.hpp | 5 +++ src/Parser/Lexer.cpp | 65 ++++++++++++++++++++++++++++-- src/Parser/Lexer.hpp | 4 +- src/Parser/Token.hpp | 15 +++++++ src/Parser/TokenKind.hpp | 1 + tests/parser/LexerTests.cpp | 39 ++++++++++++++++++ 6 files changed, 125 insertions(+), 4 deletions(-) diff --git a/src/Diagnostics/DiagnosticCode.hpp b/src/Diagnostics/DiagnosticCode.hpp index 72fb34c..a76b71f 100644 --- a/src/Diagnostics/DiagnosticCode.hpp +++ b/src/Diagnostics/DiagnosticCode.hpp @@ -3,9 +3,14 @@ #define PORYGONLANG_DIAGNOSTICCODE_HPP enum class DiagnosticCode{ + // Lex diagnostics UnexpectedCharacter, + InvalidStringControlCharacter, + + // Parse diagnostics UnexpectedToken, + // Bind diagnostics NoBinaryOperationFound, NoUnaryOperationFound, }; diff --git a/src/Parser/Lexer.cpp b/src/Parser/Lexer.cpp index 09b5860..fc5fe9e 100644 --- a/src/Parser/Lexer.cpp +++ b/src/Parser/Lexer.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include "Lexer.hpp" @@ -60,11 +62,15 @@ IToken* Lexer::LexNext(char c){ return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return LexNumber(c); + case '"': + return LexString(c); + case '\'': + return LexString(c); case '_': - return LexIdentifierOrKeyword(c); + return LexIdentifierOrKeyword(); default: if (isalpha(c)){ - return LexIdentifierOrKeyword(c); + return LexIdentifierOrKeyword(); } this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1); return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1); @@ -143,7 +149,7 @@ unsigned constexpr const_hash(char const *input) { 5381; } -IToken* Lexer::LexIdentifierOrKeyword(char c){ +IToken * Lexer::LexIdentifierOrKeyword() { auto start = this -> Position - 1; auto end = start; while (true){ @@ -181,4 +187,57 @@ IToken* Lexer::LexIdentifierOrKeyword(char c){ case const_hash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5); default: return new IdentifierToken(s, start, s.length()); } +} + +const unordered_map ControlCharacters{ + {'0', '\0'}, + {'a', '\a'}, + {'b', '\b'}, + {'t', '\t'}, + {'n', '\n'}, + {'v', '\v'}, + {'f', '\f'}, + {'r', '\r'}, + {'"', '\"'}, + {'\'', '\''}, + {'\?', '\?'}, + {'\\', '\\'}, +}; + +IToken* Lexer::LexString(char c){ + auto start = this -> Position - 1; + auto end = start; + char last = c; + while (true){ + char next = this -> Peek(); + if (next == '\0') break; + if (next == c && last != '\\') break; + this -> Next(); + end++; + last = next; + } + auto closeToken = this -> Next(); + if (closeToken != c){ + this -> ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->Position - 1, 1); + return new SimpleToken(TokenKind::BadToken, start, end -start + 1); + } + + string s = this -> ScriptString.substr(start + 1, end - start); + stringstream stream; + for (int i = 0; i < s.size(); i++){ + c = s[i]; + if (c == '\\'){ + i++; + c = s[i]; + if (ControlCharacters.find(c) != ControlCharacters.end()) { + stream << ControlCharacters.at(c); + } else{ + this -> ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1); + stream << c; + } + } else{ + stream << c; + } + } + return new StringToken(stream.str(), start, end - start ); } \ No newline at end of file diff --git a/src/Parser/Lexer.hpp b/src/Parser/Lexer.hpp index 6742e2b..a489420 100644 --- a/src/Parser/Lexer.hpp +++ b/src/Parser/Lexer.hpp @@ -17,12 +17,14 @@ public: char Next(); IToken* LexNext(char c); IToken* LexNumber(char c); - IToken *LexIdentifierOrKeyword(char c); + IToken *LexIdentifierOrKeyword(); + IToken *LexString(char c); public: Script* ScriptData; vector Lex(); explicit Lexer(string scriptString, class Script* script); + }; diff --git a/src/Parser/Token.hpp b/src/Parser/Token.hpp index 78ebeb3..184cf83 100644 --- a/src/Parser/Token.hpp +++ b/src/Parser/Token.hpp @@ -1,3 +1,5 @@ +#include + #ifndef PORYGONLANG_TOKEN_HPP #define PORYGONLANG_TOKEN_HPP @@ -73,6 +75,19 @@ public: } }; +class StringToken : public IToken{ +public: + string Value; + + explicit StringToken(string value, unsigned int position, unsigned int length) : IToken(position, length){ + Value = std::move(value); + } + + TokenKind GetKind() override{ + return TokenKind::String; + } +}; + class IdentifierToken : public IToken{ public: string Value; diff --git a/src/Parser/TokenKind.hpp b/src/Parser/TokenKind.hpp index 24d898c..ad4d7a0 100644 --- a/src/Parser/TokenKind.hpp +++ b/src/Parser/TokenKind.hpp @@ -19,6 +19,7 @@ enum class TokenKind{ Integer, Float, + String, AndKeyword, BreakKeyword, diff --git a/tests/parser/LexerTests.cpp b/tests/parser/LexerTests.cpp index a99264d..251e023 100644 --- a/tests/parser/LexerTests.cpp +++ b/tests/parser/LexerTests.cpp @@ -276,4 +276,43 @@ TEST_CASE( "Lex End Position", "[lexer]" ) { CHECK(((IdentifierToken*)tokens[3]) -> GetEndPosition() == 11); CHECK(((IdentifierToken*)tokens[4]) -> GetEndPosition() == 12); } + +TEST_CASE("Lex Double Quote String", "[lexer]") { + Lexer lexer = Lexer("\"foo bar\"", nullptr); + auto tokens = lexer.Lex(); + REQUIRE(tokens.size() == 2); + IToken* firstToken = tokens[0]; + REQUIRE(firstToken -> GetKind() == TokenKind::String); + REQUIRE(((StringToken*)firstToken) -> Value == "foo bar"); +} + +TEST_CASE("Lex Single Quote String", "[lexer]") { + Lexer lexer = Lexer("'foo bar'", nullptr); + auto tokens = lexer.Lex(); + REQUIRE(tokens.size() == 2); + IToken* firstToken = tokens[0]; + REQUIRE(firstToken -> GetKind() == TokenKind::String); + REQUIRE(((StringToken*)firstToken) -> Value == "foo bar"); +} + +TEST_CASE("Lex Double Quote String, Escape Quote", "[lexer]") { + Lexer lexer = Lexer("'foo\\\"bar'", nullptr); + auto tokens = lexer.Lex(); + REQUIRE(tokens.size() == 2); + IToken* firstToken = tokens[0]; + REQUIRE(firstToken -> GetKind() == TokenKind::String); + REQUIRE(((StringToken*)firstToken) -> Value == "foo\"bar"); +} + +TEST_CASE("Lex String with newline", "[lexer]") { + Lexer lexer = Lexer("'foo\\nbar'", nullptr); + auto tokens = lexer.Lex(); + REQUIRE(tokens.size() == 2); + IToken* firstToken = tokens[0]; + REQUIRE(firstToken -> GetKind() == TokenKind::String); + REQUIRE(((StringToken*)firstToken) -> Value == "foo\nbar"); +} + + + #endif \ No newline at end of file