From 6eb005ab3f1227c9cb9383956ae883ac5e0e3035 Mon Sep 17 00:00:00 2001
From: Deukhoofd <Deukhoofd@gmail.com>
Date: Wed, 22 May 2019 13:24:28 +0200
Subject: [PATCH] Lex Strings

---
 src/Diagnostics/DiagnosticCode.hpp |  5 +++
 src/Parser/Lexer.cpp               | 65 ++++++++++++++++++++++++++++--
 src/Parser/Lexer.hpp               |  4 +-
 src/Parser/Token.hpp               | 15 +++++++
 src/Parser/TokenKind.hpp           |  1 +
 tests/parser/LexerTests.cpp        | 39 ++++++++++++++++++
 6 files changed, 125 insertions(+), 4 deletions(-)
diff --git a/src/Diagnostics/DiagnosticCode.hpp b/src/Diagnostics/DiagnosticCode.hpp
index 72fb34c..a76b71f 100644
--- a/src/Diagnostics/DiagnosticCode.hpp
+++ b/src/Diagnostics/DiagnosticCode.hpp
@@ -3,9 +3,14 @@
 #define PORYGONLANG_DIAGNOSTICCODE_HPP
 
 enum class DiagnosticCode{
+    // Lex diagnostics: logged by the Lexer while it turns the script text into tokens
     UnexpectedCharacter,
+    InvalidStringControlCharacter, // backslash escape in a string literal that the lexer does not know
+
+    // Parse diagnostics
     UnexpectedToken,
 
+    // Bind diagnostics
     NoBinaryOperationFound,
     NoUnaryOperationFound,
 };
diff --git a/src/Parser/Lexer.cpp b/src/Parser/Lexer.cpp
index 09b5860..fc5fe9e 100644
--- a/src/Parser/Lexer.cpp
+++ b/src/Parser/Lexer.cpp
@@ -1,5 +1,7 @@
 #include <utility>
 #include <cmath>
+#include <unordered_map>
+#include <sstream>
 
 #include "Lexer.hpp"
 
@@ -60,11 +62,15 @@ IToken* Lexer::LexNext(char c){
             return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1);
         case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
             return LexNumber(c);
+        case '"':
+            return LexString(c);
+        case '\'':
+            return LexString(c);
         case '_':
-            return LexIdentifierOrKeyword(c);
+            return LexIdentifierOrKeyword();
         default:
             if (isalpha(c)){
-                return LexIdentifierOrKeyword(c);
+                return LexIdentifierOrKeyword();
             }
             this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);
             return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1);
@@ -143,7 +149,7 @@ unsigned constexpr const_hash(char const *input) {
            5381;
 }
 
-IToken* Lexer::LexIdentifierOrKeyword(char c){
+IToken * Lexer::LexIdentifierOrKeyword() {
     auto start = this -> Position - 1;
     auto end = start;
     while (true){
@@ -181,4 +187,57 @@ IToken* Lexer::LexIdentifierOrKeyword(char c){
         case const_hash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);
         default: return new IdentifierToken(s, start, s.length());
     }
+}
+
+// Escape table for string literals: the key is the character written after a backslash,
+// the value is the character that is emitted into the lexed string instead.
+const unordered_map<char, char> ControlCharacters{
+        // Escapes that produce control characters.
+        {'0', '\0'}, {'a', '\a'}, {'b', '\b'},
+        {'t', '\t'}, {'n', '\n'}, {'v', '\v'},
+        {'f', '\f'}, {'r', '\r'},
+
+        // Escapes that produce the escaped character itself.
+        {'"', '"'},
+        {'\'', '\''},
+        {'?', '?'},
+        {'\\', '\\'},
+};
+
+/// Lexes a string literal; `c` is the opening delimiter (" or '), already consumed.
+/// Escape sequences are translated through ControlCharacters; an unknown escape logs
+/// InvalidStringControlCharacter and keeps the escaped character as-is. If the input
+/// ends before the closing delimiter, logs UnexpectedCharacter and returns a BadToken.
+IToken* Lexer::LexString(char c){
+    auto start = this -> Position - 1;
+    auto end = start;
+    // Track escape state explicitly: checking only the previous char would treat the
+    // closing delimiter after an escaped backslash (e.g. "a\\") as escaped.
+    bool escaped = false;
+    while (true){
+        char next = this -> Peek();
+        if (next == '\0' || (next == c && !escaped)) break;
+        this -> Next();
+        end++;
+        escaped = !escaped && next == '\\';
+    }
+    auto closeToken = this -> Next();
+    if (closeToken != c){
+        this -> ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->Position - 1, 1);
+        return new SimpleToken(TokenKind::BadToken, start, end -start + 1);
+    }
+    const string s = this -> ScriptString.substr(start + 1, end - start);
+    stringstream stream;
+    for (size_t i = 0; i < s.size(); i++){
+        if (s[i] != '\\'){ stream << s[i]; continue; }
+        const char escape = s[++i]; // a terminated literal never ends on a lone backslash
+        const auto found = ControlCharacters.find(escape);
+        if (found != ControlCharacters.end()) {
+            stream << found->second;
+        } else{
+            this -> ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
+            stream << escape;
+        }
+    }
+    return new StringToken(stream.str(), start, end - start );
+}
\ No newline at end of file
diff --git a/src/Parser/Lexer.hpp b/src/Parser/Lexer.hpp
index 6742e2b..a489420 100644
--- a/src/Parser/Lexer.hpp
+++ b/src/Parser/Lexer.hpp
@@ -17,12 +17,14 @@ public:
     char Next();
     IToken* LexNext(char c);
     IToken* LexNumber(char c);
-    IToken *LexIdentifierOrKeyword(char c);
+    IToken *LexIdentifierOrKeyword();
+    IToken *LexString(char c);
 public:
     Script* ScriptData;
 
     vector<IToken*> Lex();
     explicit Lexer(string scriptString, class Script* script);
+
 };
 
 
diff --git a/src/Parser/Token.hpp b/src/Parser/Token.hpp
index 78ebeb3..184cf83 100644
--- a/src/Parser/Token.hpp
+++ b/src/Parser/Token.hpp
@@ -1,3 +1,5 @@
+#include <utility>
+
 #ifndef PORYGONLANG_TOKEN_HPP
 #define PORYGONLANG_TOKEN_HPP
 
@@ -73,6 +75,19 @@ public:
     }
 };
 
+/// Token for a string literal; Value is the text handed over by the lexer (escapes resolved there).
+class StringToken : public IToken{
+public:
+    string Value;
+
+    explicit StringToken(string value, unsigned int position, unsigned int length)
+        : IToken(position, length), Value(std::move(value)) {}
+
+    TokenKind GetKind() override{
+        return TokenKind::String;
+    }
+};
+
 class IdentifierToken : public IToken{
 public:
     string Value;
diff --git a/src/Parser/TokenKind.hpp b/src/Parser/TokenKind.hpp
index 24d898c..ad4d7a0 100644
--- a/src/Parser/TokenKind.hpp
+++ b/src/Parser/TokenKind.hpp
@@ -19,6 +19,7 @@ enum class TokenKind{
 
     Integer,
     Float,
+    String,
 
     AndKeyword,
     BreakKeyword,
diff --git a/tests/parser/LexerTests.cpp b/tests/parser/LexerTests.cpp
index a99264d..251e023 100644
--- a/tests/parser/LexerTests.cpp
+++ b/tests/parser/LexerTests.cpp
@@ -276,4 +276,43 @@ TEST_CASE( "Lex End Position", "[lexer]" ) {
     CHECK(((IdentifierToken*)tokens[3]) -> GetEndPosition() == 11);
     CHECK(((IdentifierToken*)tokens[4]) -> GetEndPosition() == 12);
 }
+
+TEST_CASE("Lex Double Quote String", "[lexer]") {
+    // One String token is expected, plus one trailing token (presumably end-of-file).
+    Lexer lexer("\"foo bar\"", nullptr);
+    const auto tokens = lexer.Lex();
+    REQUIRE(tokens.size() == 2);
+    REQUIRE(tokens[0] -> GetKind() == TokenKind::String);
+    REQUIRE(static_cast<StringToken*>(tokens[0]) -> Value == "foo bar");
+}
+
+TEST_CASE("Lex Single Quote String", "[lexer]") {
+    // Single quotes delimit a string just like double quotes do.
+    Lexer lexer("'foo bar'", nullptr);
+    const auto tokens = lexer.Lex();
+    REQUIRE(tokens.size() == 2);
+    REQUIRE(tokens[0] -> GetKind() == TokenKind::String);
+    REQUIRE(static_cast<StringToken*>(tokens[0]) -> Value == "foo bar");
+}
+
+TEST_CASE("Lex Double Quote String, Escape Quote", "[lexer]") {
+    // Double-quoted input, so the escaped quote must not be taken as the closing delimiter.
+    Lexer lexer("\"foo\\\"bar\"", nullptr);
+    const auto tokens = lexer.Lex();
+    REQUIRE(tokens.size() == 2);
+    REQUIRE(tokens[0] -> GetKind() == TokenKind::String);
+    REQUIRE(static_cast<StringToken*>(tokens[0]) -> Value == "foo\"bar");
+}
+
+TEST_CASE("Lex String with newline", "[lexer]") {
+    // The two-character escape \n in the script must become one newline character.
+    Lexer lexer("'foo\\nbar'", nullptr);
+    const auto tokens = lexer.Lex();
+    REQUIRE(tokens.size() == 2);
+    REQUIRE(tokens[0] -> GetKind() == TokenKind::String);
+    REQUIRE(static_cast<StringToken*>(tokens[0]) -> Value == "foo\nbar");
+}
+
+
+
 #endif
\ No newline at end of file