2019-05-18 18:35:51 +00:00
|
|
|
#include <utility>
|
|
|
|
#include <cmath>
|
2019-05-22 11:24:28 +00:00
|
|
|
#include <unordered_map>
|
|
|
|
#include <sstream>
|
2019-05-18 18:35:51 +00:00
|
|
|
|
|
|
|
#include "Lexer.hpp"
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
namespace Porygon::Parser {
|
|
|
|
// Sets up a lexer over `scriptString`, reporting to `script`.
// The cursor starts at the first character and the script length is cached.
Lexer::Lexer(const u16string &scriptString, Porygon::Script *script)
        : _scriptString(scriptString) {
    ScriptData = script;
    _position = 0;
    _scriptSize = scriptString.size();
}
|
2019-06-05 17:11:56 +00:00
|
|
|
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Lexes the whole script and returns its tokens in source order.
// Whitespace tokens are deleted instead of being returned; the EndOfFile
// token is kept and is always the last element of the result.
vector<const IToken *> Lexer::Lex() {
    vector<const IToken *> result;
    for (;;) {
        IToken *token = LexNext(Next());
        auto kind = token->GetKind();
        if (kind == TokenKind::WhiteSpace) {
            // Whitespace carries no meaning for later stages; drop it right away.
            delete token;
            continue;
        }
        result.push_back(token);
        if (kind == TokenKind::EndOfFile) {
            return result;
        }
    }
}
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
char16_t Lexer::Peek() {
|
|
|
|
if (Lexer::_position >= this->_scriptSize)
|
|
|
|
return '\0';
|
|
|
|
return this->_scriptString.at(Lexer::_position);
|
|
|
|
}
|
2019-05-18 18:35:51 +00:00
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
char16_t Lexer::Next() {
|
|
|
|
char16_t next = Peek();
|
|
|
|
Lexer::_position++;
|
|
|
|
return next;
|
|
|
|
}
|
2019-05-18 18:35:51 +00:00
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Builds the token that begins with `c`, the character that was just consumed
// with Next() (so `c` sits one position behind the cursor).
//
// Single-character tokens are produced directly; '=', '<', '>' and '~' look one
// character ahead for a trailing '='. Digits, quotes and identifier starts are
// forwarded to LexNumber, LexString and LexIdentifierOrKeyword respectively.
// Anything else is reported as UnexpectedCharacter and becomes a BadToken.
//
// The returned token is allocated with `new`.
IToken *Lexer::LexNext(char16_t c) {
    // One-character token located at the character that was just consumed.
    const auto single = [this](TokenKind kind) -> IToken * {
        return new SimpleToken(kind, this->_position - 1, 1);
    };
    // Operator that is either `plain` on its own or `followedByEquals` when a '=' comes next.
    const auto withOptionalEquals = [this, &single](TokenKind plain, TokenKind followedByEquals) -> IToken * {
        if (this->Peek() != '=') {
            return single(plain);
        }
        this->Next();
        return new SimpleToken(followedByEquals, this->_position - 2, 2);
    };
    // Logs the diagnostic for a character that cannot start a token.
    const auto unexpectedCharacter = [this, &single]() -> IToken * {
        this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter,
                                                this->_position - 1, 1);
        return single(TokenKind::BadToken);
    };

    switch (c) {
        case '\0':
            return single(TokenKind::EndOfFile);
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
        case '\f':
            return single(TokenKind::WhiteSpace);
        case '+':
            return single(TokenKind::PlusToken);
        case '-':
            return single(TokenKind::MinusToken);
        case '/':
            return single(TokenKind::SlashToken);
        case '*':
            return single(TokenKind::StarToken);
        case '(':
            return single(TokenKind::OpenParenthesis);
        case ')':
            return single(TokenKind::CloseParenthesis);
        case '[':
            return single(TokenKind::OpenSquareBracket);
        case ']':
            return single(TokenKind::CloseSquareBracket);
        case '{':
            return single(TokenKind::OpenCurlyBracket);
        case '}':
            return single(TokenKind::CloseCurlyBracket);
        case ',':
            return single(TokenKind::CommaToken);
        case '.':
            return single(TokenKind::PeriodToken);
        case '=':
            return withOptionalEquals(TokenKind::AssignmentToken, TokenKind::EqualityToken);
        case '<':
            return withOptionalEquals(TokenKind::Less, TokenKind::LessEquals);
        case '>':
            return withOptionalEquals(TokenKind::Greater, TokenKind::GreaterEquals);
        case '~':
            // '~' is only meaningful as the first half of "~=".
            if (this->Peek() == '=') {
                this->Next();
                return new SimpleToken(TokenKind::InequalityToken, this->_position - 2, 2);
            }
            return unexpectedCharacter();
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            return LexNumber(c);
        case '"':
        case '\'':
            return LexString(c);
        case '_':
            return LexIdentifierOrKeyword();
        default:
            // The range check must come first: isalpha() has undefined behaviour for
            // values that do not fit in an unsigned char, which any char16_t above 255 is.
            if (c > 255 || isalpha(c)) {
                return LexIdentifierOrKeyword();
            }
            return unexpectedCharacter();
    }
}
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Converts an ASCII decimal digit ('0'..'9') to its numeric value.
// Returns -1 for every other character.
int CharToInt(char16_t c) {
    const bool isDecimalDigit = c >= u'0' && c <= u'9';
    if (!isDecimalDigit) {
        return -1;
    }
    return static_cast<int>(c - u'0');
}
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Lexes a numeric literal whose first digit `c` has already been consumed.
//
// Digits are accumulated into an integer until a '.' is seen; from then on they
// are accumulated as decimal places of a floating point value. Underscores are
// consumed and counted in the token length but do not contribute to the value.
// Returns a FloatToken when a decimal point was read, an IntegerToken otherwise.
//
// Only the first '.' belongs to the number. Previously a second '.' was also
// consumed and reset the float value to the integer part, silently throwing away
// the decimals read so far (e.g. "1.2.3" became 1.3); lexing now stops there and
// the second '.' is left for the next token.
IToken *Lexer::LexNumber(char16_t c) {
    long int_value = CharToInt(c);
    double float_value = 0;
    short decimal_index = 0;
    bool has_point = false;
    // `c` was consumed by the caller, so the literal starts one behind the cursor.
    unsigned int start = this->_position - 1;
    unsigned int length = 1;

    while (true) {
        const char16_t next = this->Peek();
        const int next_val = CharToInt(next);

        if (next_val != -1) {
            this->Next();
            length++;
            if (has_point) {
                decimal_index++;
                float_value += next_val / pow(10, decimal_index);
            } else {
                int_value *= 10;
                int_value += next_val;
            }
            continue;
        }

        if (next == '_') {
            // Digit separator: part of the token text, not of the value.
            this->Next();
            length++;
            continue;
        }

        if (next == '.' && !has_point) {
            this->Next();
            length++;
            has_point = true;
            float_value = int_value;
            continue;
        }

        // Any other character (including a second '.') ends the literal.
        break;
    }

    if (has_point) {
        return new FloatToken(float_value, start, length);
    }
    return new IntegerToken(int_value, start, length);
}
|
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Lexes an identifier or keyword whose first character has already been consumed.
//
// Characters are consumed while they are alphabetic, '_' or above 255. The
// resulting text is hashed and compared against the keyword hashes; a match
// yields a SimpleToken of the keyword's kind, anything else an IdentifierToken.
//
// NOTE(review): digits are not accepted here, so an identifier such as "x1" is
// split into an identifier followed by a number — confirm this is intended.
IToken *Lexer::LexIdentifierOrKeyword() {
    auto start = this->_position - 1;
    auto end = start;
    while (true) {
        const char16_t next = this->Peek();
        if (next == '\0') break;
        // The range check is evaluated before isalpha(): calling isalpha() with a
        // value that does not fit in an unsigned char is undefined behaviour.
        const bool continuesIdentifier = next > 255 || next == '_' || isalpha(next);
        if (!continuesIdentifier) break;
        this->Next();
        end++;
    }

    u16string s = this->_scriptString.substr(start, end - start + 1);

    // Keyword tokens always start where the word started and span the keyword's length.
    const auto keyword = [start](TokenKind kind, unsigned int length) -> IToken * {
        return new SimpleToken(kind, start, length);
    };

    switch (HashedString::ConstHash(s.c_str())) {
        case HashedString::ConstHash("and"):
            return keyword(TokenKind::AndKeyword, 3);
        case HashedString::ConstHash("break"):
            return keyword(TokenKind::BreakKeyword, 5);
        case HashedString::ConstHash("do"):
            return keyword(TokenKind::DoKeyword, 2);
        case HashedString::ConstHash("else"):
            return keyword(TokenKind::ElseKeyword, 4);
        case HashedString::ConstHash("elseif"):
            return keyword(TokenKind::ElseIfKeyword, 6);
        case HashedString::ConstHash("end"):
            return keyword(TokenKind::EndKeyword, 3);
        case HashedString::ConstHash("false"):
            return keyword(TokenKind::FalseKeyword, 5);
        case HashedString::ConstHash("for"):
            return keyword(TokenKind::ForKeyword, 3);
        case HashedString::ConstHash("function"):
            return keyword(TokenKind::FunctionKeyword, 8);
        case HashedString::ConstHash("if"):
            return keyword(TokenKind::IfKeyword, 2);
        case HashedString::ConstHash("in"):
            return keyword(TokenKind::InKeyword, 2);
        case HashedString::ConstHash("local"):
            return keyword(TokenKind::LocalKeyword, 5);
        case HashedString::ConstHash("nil"):
            return keyword(TokenKind::NilKeyword, 3);
        case HashedString::ConstHash("not"):
            return keyword(TokenKind::NotKeyword, 3);
        case HashedString::ConstHash("or"):
            return keyword(TokenKind::OrKeyword, 2);
        case HashedString::ConstHash("return"):
            return keyword(TokenKind::ReturnKeyword, 6);
        case HashedString::ConstHash("then"):
            return keyword(TokenKind::ThenKeyword, 4);
        case HashedString::ConstHash("true"):
            return keyword(TokenKind::TrueKeyword, 4);
        case HashedString::ConstHash("while"):
            return keyword(TokenKind::WhileKeyword, 5);
        default:
            return new IdentifierToken(HashedString(s), start, s.length());
    }
}
|
2019-05-22 11:24:28 +00:00
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Escape table for string literals: maps the character written after a
// backslash to the character that the escape sequence stands for.
const std::unordered_map<char16_t, char16_t> ControlCharacters{ // NOLINT(cert-err58-cpp)
        // Characters that escape to themselves.
        {u'\\', u'\\'},
        {u'\'', u'\''},
        {u'"',  u'"'},
        {u'?',  u'?'},
        // Control codes.
        {u'0', u'\0'},
        {u'a', u'\a'},
        {u'b', u'\b'},
        {u'f', u'\f'},
        {u'n', u'\n'},
        {u'r', u'\r'},
        {u't', u'\t'},
        {u'v', u'\v'},
};
|
2019-05-22 11:24:28 +00:00
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
// Lexes a string literal whose opening quote `c` (either '"' or '\'') has
// already been consumed.
//
// The literal runs up to the next unescaped occurrence of `c`. If the script
// ends first, UnexpectedCharacter is logged and a BadToken is returned.
// Otherwise backslash escapes are resolved through ControlCharacters; an
// unknown escape logs InvalidStringControlCharacter and keeps the escaped
// character as-is. The result is returned as a StringToken.
IToken *Lexer::LexString(char16_t c) {
    auto start = this->_position - 1;
    auto end = start;

    // Find the closing quote. A backslash escapes exactly the one character that
    // follows it, so the state has to be tracked explicitly: merely checking the
    // previous character would treat the quote in "\\" (an escaped backslash
    // followed by the closing quote) as escaped and run past the end of the literal.
    bool escaped = false;
    while (true) {
        const char16_t next = this->Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this->Next();
        end++;
        escaped = !escaped && next == '\\';
    }

    const char16_t closeToken = this->Next();
    if (closeToken != c) {
        // The text must live in a named string: taking c_str() of a temporary
        // would hand LogError a pointer to already-destroyed storage.
        const string unexpected(1, static_cast<char>(closeToken));
        this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter,
                                                this->_position - 1, 1, {unexpected.c_str()});
        return new SimpleToken(TokenKind::BadToken, start, end - start + 1);
    }

    // Text between the quotes, escapes still unresolved.
    const u16string raw = this->_scriptString.substr(start + 1, end - start);
    u16string value;
    value.reserve(raw.size());
    for (size_t i = 0; i < raw.size(); i++) {
        const char16_t current = raw[i];
        if (current != '\\') {
            value.push_back(current);
            continue;
        }

        // Step over the backslash and translate the character it escapes.
        i++;
        const char16_t code = raw[i];
        const auto known = ControlCharacters.find(code);
        if (known != ControlCharacters.end()) {
            value.push_back(known->second);
            continue;
        }

        // Unknown escape: report it (the span covers the backslash and the
        // character after it) and keep the character itself.
        const string sequence = "\\" + string(1, static_cast<char>(code));
        this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::InvalidStringControlCharacter,
                                                start + static_cast<decltype(start)>(i), 2,
                                                {sequence.c_str()});
        value.push_back(code);
    }

    // NOTE(review): the reported length excludes both quote characters while
    // `start` points at the opening quote — confirm this is the intended span.
    return new StringToken(std::move(value), start, end - start);
}
|
2019-06-05 17:11:56 +00:00
|
|
|
|
2019-06-17 16:35:12 +00:00
|
|
|
}
|