Lex Strings

This commit is contained in:
Deukhoofd 2019-05-22 13:24:28 +02:00
parent 23991ab2ea
commit 6eb005ab3f
No known key found for this signature in database
GPG Key ID: B4C087AC81641654
6 changed files with 125 additions and 4 deletions

View File

@ -3,9 +3,14 @@
#define PORYGONLANG_DIAGNOSTICCODE_HPP
/// Identifies the kind of problem reported to the diagnostics log.
/// Enumerators are grouped by the compilation stage that reports them;
/// their order determines their underlying values, so new codes should be
/// added with care.
enum class DiagnosticCode {
    // ---- Lex diagnostics ----
    UnexpectedCharacter,            ///< A character the lexer did not expect at that position.
    InvalidStringControlCharacter,  ///< An escape sequence in a string literal that is not recognised.

    // ---- Parse diagnostics ----
    UnexpectedToken,

    // ---- Bind diagnostics ----
    NoBinaryOperationFound,
    NoUnaryOperationFound,
};

View File

@ -1,5 +1,7 @@
#include <utility>
#include <cmath>
#include <unordered_map>
#include <sstream>
#include "Lexer.hpp"
@ -60,11 +62,15 @@ IToken* Lexer::LexNext(char c){
return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1);
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
return LexNumber(c);
case '"':
return LexString(c);
case '\'':
return LexString(c);
case '_':
return LexIdentifierOrKeyword(c);
return LexIdentifierOrKeyword();
default:
if (isalpha(c)){
return LexIdentifierOrKeyword(c);
return LexIdentifierOrKeyword();
}
this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);
return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1);
@ -143,7 +149,7 @@ unsigned constexpr const_hash(char const *input) {
5381;
}
IToken* Lexer::LexIdentifierOrKeyword(char c){
IToken * Lexer::LexIdentifierOrKeyword() {
auto start = this -> Position - 1;
auto end = start;
while (true){
@ -181,4 +187,57 @@ IToken* Lexer::LexIdentifierOrKeyword(char c){
case const_hash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);
default: return new IdentifierToken(s, start, s.length());
}
}
/// Lookup table for string escape sequences: the key is the character that
/// follows a backslash in the source text, the value is the character that
/// ends up in the lexed string.
const std::unordered_map<char, char> ControlCharacters{
    // Control codes
    {'0', '\0'},
    {'a', '\a'},
    {'b', '\b'},
    {'t', '\t'},
    {'n', '\n'},
    {'v', '\v'},
    {'f', '\f'},
    {'r', '\r'},
    // Characters that escape to themselves
    {'"', '"'},
    {'\'', '\''},
    {'?', '?'},
    {'\\', '\\'},
};
/**
 * Lexes a quoted string literal. The opening delimiter is expected to have
 * been read already, so it sits at Position - 1 when this is called.
 *
 * @param c The opening quote character; the same character closes the string.
 * @return A StringToken holding the unescaped contents, or a BadToken (after
 *         logging UnexpectedCharacter) when no closing delimiter is found
 *         before Peek() yields '\0'.
 *
 * Escape sequences are translated through ControlCharacters. An unknown
 * escape logs InvalidStringControlCharacter and keeps the escaped character
 * verbatim.
 */
IToken* Lexer::LexString(char c){
    auto start = this -> Position - 1;
    auto end = start;

    // Tracks whether the previous character was a backslash that is itself
    // NOT escaped. Looking only at the previous character is not enough:
    // in 'foo\\' the final quote follows a backslash, but that backslash is
    // the second half of an escaped backslash, so the quote must close the string.
    bool escaped = false;
    while (true){
        char next = this -> Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this -> Next();
        end++;
        escaped = (next == '\\') && !escaped;
    }

    auto closeToken = this -> Next();
    if (closeToken != c){
        // Ran out of input before the closing delimiter.
        this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, start, end - start + 1);
    }

    // Raw contents between (and excluding) the two delimiters.
    string raw = this -> ScriptString.substr(start + 1, end - start);
    string value;
    value.reserve(raw.size());
    for (unsigned int i = 0; i < raw.size(); i++){
        char current = raw[i];
        if (current != '\\'){
            value += current;
            continue;
        }
        // Skip the backslash and translate the character after it. The scan
        // above only accepts a closing quote after complete escape pairs, so
        // a backslash is never the last character of raw.
        i++;
        current = raw[i];
        auto found = ControlCharacters.find(current);
        if (found != ControlCharacters.end()){
            value += found -> second;
        } else{
            this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
            value += current;
        }
    }
    // NOTE(review): the length passed here counts only the characters between
    // the quotes, while the position points at the opening quote — confirm
    // whether the token span is meant to include both delimiters (+2).
    return new StringToken(std::move(value), start, end - start);
}

View File

@ -17,12 +17,14 @@ public:
char Next();
IToken* LexNext(char c);
IToken* LexNumber(char c);
IToken *LexIdentifierOrKeyword(char c);
IToken *LexIdentifierOrKeyword();
IToken *LexString(char c);
public:
Script* ScriptData;
vector<IToken*> Lex();
explicit Lexer(string scriptString, class Script* script);
};

View File

@ -1,3 +1,5 @@
#include <utility>
#ifndef PORYGONLANG_TOKEN_HPP
#define PORYGONLANG_TOKEN_HPP
@ -73,6 +75,19 @@ public:
}
};
/// Token representing a string literal; reports TokenKind::String.
class StringToken : public IToken {
public:
    /// The string carried by this token, as handed to the constructor.
    string Value;

    /// @param value    Text to store in the token (taken by value and moved in).
    /// @param position Forwarded to the IToken base.
    /// @param length   Forwarded to the IToken base.
    explicit StringToken(string value, unsigned int position, unsigned int length)
        : IToken(position, length), Value(std::move(value)) {
    }

    TokenKind GetKind() override {
        return TokenKind::String;
    }
};
class IdentifierToken : public IToken{
public:
string Value;

View File

@ -19,6 +19,7 @@ enum class TokenKind{
Integer,
Float,
String,
AndKeyword,
BreakKeyword,

View File

@ -276,4 +276,43 @@ TEST_CASE( "Lex End Position", "[lexer]" ) {
CHECK(((IdentifierToken*)tokens[3]) -> GetEndPosition() == 11);
CHECK(((IdentifierToken*)tokens[4]) -> GetEndPosition() == 12);
}
// A plain double-quoted literal yields a String token (plus one more token)
// whose value is the text between the quotes.
TEST_CASE("Lex Double Quote String", "[lexer]") {
    Lexer lexer("\"foo bar\"", nullptr);
    const auto tokens = lexer.Lex();
    REQUIRE(tokens.size() == 2);

    auto* stringToken = tokens[0];
    REQUIRE(stringToken->GetKind() == TokenKind::String);
    REQUIRE(static_cast<StringToken*>(stringToken)->Value == "foo bar");
}
// Single quotes delimit strings just like double quotes do.
TEST_CASE("Lex Single Quote String", "[lexer]") {
    Lexer lexer("'foo bar'", nullptr);
    const auto tokens = lexer.Lex();
    REQUIRE(tokens.size() == 2);

    auto* stringToken = tokens[0];
    REQUIRE(stringToken->GetKind() == TokenKind::String);
    REQUIRE(static_cast<StringToken*>(stringToken)->Value == "foo bar");
}
// A backslash-escaped double quote inside a double-quoted literal must not
// terminate the string, and must appear unescaped in the token value.
// The script text fed to the lexer is: "foo\"bar"
TEST_CASE("Lex Double Quote String, Escape Quote", "[lexer]") {
    Lexer lexer = Lexer("\"foo\\\"bar\"", nullptr);
    auto tokens = lexer.Lex();
    REQUIRE(tokens.size() == 2);
    IToken* firstToken = tokens[0];
    REQUIRE(firstToken -> GetKind() == TokenKind::String);
    REQUIRE(((StringToken*)firstToken) -> Value == "foo\"bar");
}
// The two-character escape sequence backslash + 'n' in the script text is
// turned into a real newline character in the token value.
TEST_CASE("Lex String with newline", "[lexer]") {
    Lexer lexer("'foo\\nbar'", nullptr);
    const auto tokens = lexer.Lex();
    REQUIRE(tokens.size() == 2);

    auto* stringToken = tokens[0];
    REQUIRE(stringToken->GetKind() == TokenKind::String);
    REQUIRE(static_cast<StringToken*>(stringToken)->Value == "foo\nbar");
}
#endif