PorygonLang/src/Parser/Lexer.cpp

318 lines
12 KiB
C++
Raw Normal View History

2019-05-18 18:35:51 +00:00
#include <cmath>
2019-05-22 11:24:28 +00:00
#include <unordered_map>
#include <sstream>
2019-05-18 18:35:51 +00:00
#include "Lexer.hpp"
namespace Porygon::Parser {
/// Creates a lexer over `scriptString`.
///
/// @param scriptString The script source to tokenize; bound to `_scriptString`.
/// @param script       The script object stored in `ScriptData`; the lexer
///                     reports diagnostics through it.
Lexer::Lexer(const std::u16string &scriptString, Porygon::Script *script)
        : _scriptString(scriptString) {
    // Lexing always starts at the first code unit of the script.
    _position = 0;
    _scriptSize = scriptString.size();
    ScriptData = script;
}
2019-06-05 17:11:56 +00:00
vector<const Token *> Lexer::Lex() {
vector<const Token *> tokens;
while (true) {
Token *next = this->LexNext(this->Next());
auto nextKind = next->GetKind();
if (nextKind != TokenKind::WhiteSpace)
tokens.push_back(next);
else
delete next;
if (nextKind == TokenKind::EndOfFile)
break;
}
return tokens;
2019-05-18 18:35:51 +00:00
}
char16_t Lexer::Peek() {
if (Lexer::_position >= this->_scriptSize)
return '\0';
return this->_scriptString.at(Lexer::_position);
}
2019-05-18 18:35:51 +00:00
char16_t Lexer::Next() {
char16_t next = Peek();
Lexer::_position++;
return next;
}
2019-05-18 18:35:51 +00:00
/// Lexes the token that starts with the already-consumed code unit `c`.
///
/// Single-character operators and brackets map directly to a SimpleToken.
/// `=`, `<`, `>` and `~` additionally consume a following `=` to form the
/// two-character operators. Digits, quotes and identifier starts are
/// delegated to LexNumber, LexString and LexIdentifierOrKeyword.
/// Anything else (including a lone `~`) logs an UnexpectedCharacter
/// diagnostic and yields a BadToken.
///
/// @param c The code unit returned by the preceding call to Next().
/// @return A newly allocated token; the caller owns it.
Token *Lexer::LexNext(char16_t c) {
    // One-character token covering the code unit that was consumed last.
    const auto single = [this](TokenKind kind) -> Token * {
        return new SimpleToken(kind, this->_position - 1, 1);
    };
    // Two-character token covering the two code units consumed last.
    const auto pair = [this](TokenKind kind) -> Token * {
        return new SimpleToken(kind, this->_position - 2, 2);
    };
    // Consumes the next code unit if (and only if) it is '='.
    const auto consumeEquals = [this]() -> bool {
        if (this->Peek() != '=') {
            return false;
        }
        this->Next();
        return true;
    };
    // Reports the last consumed code unit as unexpected.
    const auto unexpected = [this]() -> Token * {
        this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter,
                                                this->_position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, this->_position - 1, 1);
    };

    switch (c) {
        case '\0':
            return single(TokenKind::EndOfFile);
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
        case '\f':
            return single(TokenKind::WhiteSpace);
        case '+':
            return single(TokenKind::PlusToken);
        case '-':
            return single(TokenKind::MinusToken);
        case '/':
            return single(TokenKind::SlashToken);
        case '*':
            return single(TokenKind::StarToken);
        case '(':
            return single(TokenKind::OpenParenthesis);
        case ')':
            return single(TokenKind::CloseParenthesis);
        case '[':
            return single(TokenKind::OpenSquareBracket);
        case ']':
            return single(TokenKind::CloseSquareBracket);
        case '{':
            return single(TokenKind::OpenCurlyBracket);
        case '}':
            return single(TokenKind::CloseCurlyBracket);
        case ',':
            return single(TokenKind::CommaToken);
        case '.':
            return single(TokenKind::PeriodToken);
        case '=':
            return consumeEquals() ? pair(TokenKind::EqualityToken) : single(TokenKind::AssignmentToken);
        case '<':
            return consumeEquals() ? pair(TokenKind::LessEquals) : single(TokenKind::Less);
        case '>':
            return consumeEquals() ? pair(TokenKind::GreaterEquals) : single(TokenKind::Greater);
        case '~':
            // '~' is only meaningful as the first half of '~='.
            return consumeEquals() ? pair(TokenKind::InequalityToken) : unexpected();
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            return LexNumber(c);
        case '"':
        case '\'':
            return LexString(c);
        case '_':
            return LexIdentifierOrKeyword();
        default:
            // The range check must come first: isalpha() has undefined
            // behaviour for values that do not fit in an unsigned char, and
            // a char16_t code unit can be far larger than 255.
            if (c > 255 || isalpha(c)) {
                return LexIdentifierOrKeyword();
            }
            return unexpected();
    }
}
2019-07-04 15:18:07 +00:00
/// Converts an ASCII decimal digit to its numeric value.
///
/// @param c The code unit to convert.
/// @return 0-9 for the characters '0'-'9', and -1 for every other code unit.
static int CharToInt(char16_t c) {
    const bool isDigit = c >= u'0' && c <= u'9';
    return isDigit ? static_cast<int>(c - u'0') : -1;
}
/// Lexes a numeric literal whose first digit `c` has already been consumed.
///
/// Digits are accumulated into an integer until a `.` is seen; from then on
/// they are accumulated as decimal fraction digits. Underscores are consumed
/// and counted in the token length but do not affect the value.
///
/// Only the first `.` belongs to the literal. Previously a second `.` reset
/// the value to its integer part and discarded the fraction digits parsed so
/// far (`1.2.3` became 1.3); now lexing of the number stops there and the
/// `.` is left for the next token.
///
/// @param c The first digit of the literal.
/// @return A newly allocated FloatToken if a decimal point was consumed,
///         otherwise an IntegerToken.
Token *Lexer::LexNumber(char16_t c) {
    int64_t int_value = CharToInt(c);
    double float_value = 0;
    // 10^(number of fraction digits consumed so far); avoids a pow() call per digit.
    double fraction_scale = 1;
    bool has_point = false;
    const unsigned int start = this->_position - 1;
    unsigned int length = 1;

    while (true) {
        const char16_t next = this->Peek();
        const int next_val = CharToInt(next);

        if (next_val != -1) {
            this->Next();
            length++;
            if (has_point) {
                fraction_scale *= 10;
                float_value += next_val / fraction_scale;
            } else {
                // NOTE(review): literals beyond the int64_t range overflow
                // here and no diagnostic is emitted.
                int_value = int_value * 10 + next_val;
            }
            continue;
        }

        if (next == '_') {
            // Separator: part of the token's span, not of its value.
            this->Next();
            length++;
            continue;
        }

        if (next == '.' && !has_point) {
            this->Next();
            length++;
            has_point = true;
            float_value = static_cast<double>(int_value);
            continue;
        }

        // Any other character (including a second '.') ends the literal.
        break;
    }

    if (has_point) {
        return new FloatToken(float_value, start, length);
    }
    return new IntegerToken(int_value, start, length);
}
/// Lexes an identifier or keyword whose first code unit has already been
/// consumed.
///
/// Consumes every following code unit that is `_`, greater than 255, or
/// alphabetic according to isalpha(). The resulting text is hashed with
/// HashedString::ConstHash and compared against the keyword hashes; a match
/// yields the keyword's SimpleToken, anything else an IdentifierToken.
///
/// @return A newly allocated token; the caller owns it.
Token *Lexer::LexIdentifierOrKeyword() {
    // The range checks must run before isalpha(): it has undefined behaviour
    // for values that do not fit in an unsigned char, and char16_t code
    // units can exceed 255.
    // NOTE(review): digits are not accepted here, so `foo1` ends the
    // identifier at `foo` — confirm this is intended.
    const auto isIdentifierChar = [](char16_t ch) -> bool {
        return ch == '_' || ch > 255 || isalpha(ch) != 0;
    };

    const auto start = this->_position - 1;
    auto end = start;
    // Peek() yields '\0' at the end of the script, which also ends the loop.
    while (isIdentifierChar(this->Peek())) {
        this->Next();
        end++;
    }

    std::u16string s = this->_scriptString.substr(start, end - start + 1);
    switch (HashedString::ConstHash(s.c_str())) {
        case HashedString::ConstHash("and"):
            return new SimpleToken(TokenKind::AndKeyword, start, 3);
        case HashedString::ConstHash("break"):
            return new SimpleToken(TokenKind::BreakKeyword, start, 5);
        case HashedString::ConstHash("do"):
            return new SimpleToken(TokenKind::DoKeyword, start, 2);
        case HashedString::ConstHash("else"):
            return new SimpleToken(TokenKind::ElseKeyword, start, 4);
        case HashedString::ConstHash("elseif"):
            return new SimpleToken(TokenKind::ElseIfKeyword, start, 6);
        case HashedString::ConstHash("end"):
            return new SimpleToken(TokenKind::EndKeyword, start, 3);
        case HashedString::ConstHash("false"):
            return new SimpleToken(TokenKind::FalseKeyword, start, 5);
        case HashedString::ConstHash("for"):
            return new SimpleToken(TokenKind::ForKeyword, start, 3);
        case HashedString::ConstHash("function"):
            return new SimpleToken(TokenKind::FunctionKeyword, start, 8);
        case HashedString::ConstHash("if"):
            return new SimpleToken(TokenKind::IfKeyword, start, 2);
        case HashedString::ConstHash("in"):
            return new SimpleToken(TokenKind::InKeyword, start, 2);
        case HashedString::ConstHash("local"):
            return new SimpleToken(TokenKind::LocalKeyword, start, 5);
        case HashedString::ConstHash("nil"):
            return new SimpleToken(TokenKind::NilKeyword, start, 3);
        case HashedString::ConstHash("not"):
            return new SimpleToken(TokenKind::NotKeyword, start, 3);
        case HashedString::ConstHash("or"):
            return new SimpleToken(TokenKind::OrKeyword, start, 2);
        case HashedString::ConstHash("return"):
            return new SimpleToken(TokenKind::ReturnKeyword, start, 6);
        case HashedString::ConstHash("then"):
            return new SimpleToken(TokenKind::ThenKeyword, start, 4);
        case HashedString::ConstHash("true"):
            return new SimpleToken(TokenKind::TrueKeyword, start, 4);
        case HashedString::ConstHash("while"):
            return new SimpleToken(TokenKind::WhileKeyword, start, 5);
        default:
            // Presumably HashedString takes ownership of the allocated
            // string — TODO confirm against HashedString.
            return new IdentifierToken(HashedString(new std::u16string(s)), start, s.length());
    }
}
2019-05-22 11:24:28 +00:00
// Maps the character that follows a backslash inside a string literal to the
// character the escape sequence stands for. Characters not present as keys
// are not valid escapes.
const std::unordered_map<char16_t, char16_t> ControlCharacters{ // NOLINT(cert-err58-cpp)
        {u'0',  u'\0'}, // null character
        {u'a',  u'\a'}, // alert (bell)
        {u'b',  u'\b'}, // backspace
        {u't',  u'\t'}, // horizontal tab
        {u'n',  u'\n'}, // line feed
        {u'v',  u'\v'}, // vertical tab
        {u'f',  u'\f'}, // form feed
        {u'r',  u'\r'}, // carriage return
        {u'"',  u'"'},  // double quote
        {u'\'', u'\''}, // single quote
        {u'?',  u'?'},  // question mark
        {u'\\', u'\\'}, // backslash
};
2019-05-22 11:24:28 +00:00
/// Lexes a string literal whose opening quote `c` has already been consumed.
///
/// Scans up to the first unescaped occurrence of `c`, then resolves
/// backslash escapes through ControlCharacters. If the script ends before a
/// closing quote is found, an UnexpectedCharacter diagnostic is logged and a
/// BadToken is returned. An escape that is not a key of ControlCharacters
/// logs InvalidStringControlCharacter and the escaped character is kept
/// as-is.
///
/// Fixes relative to the previous implementation:
///  - The diagnostic arguments were `c_str()` pointers into temporary
///    strings that were destroyed before LogError ran; the strings are now
///    named locals that outlive the call.
///  - The terminator check only looked at the previous character, so a
///    literal ending in an escaped backslash (`"a\\"`) was never closed.
///    The scan now tracks whether the current character is escaped.
///
/// @param c The quote character that opened the literal and must close it.
/// @return A newly allocated StringToken, or a BadToken if unterminated.
Token *Lexer::LexString(char16_t c) {
    const auto start = this->_position - 1;
    auto end = start;

    // True when the next character is preceded by an unescaped backslash.
    bool escaped = false;
    while (true) {
        const char16_t next = this->Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this->Next();
        end++;
        // A backslash that is itself escaped does not escape what follows.
        escaped = !escaped && next == '\\';
    }

    const char16_t closeToken = this->Next();
    if (closeToken != c) {
        // Keep the string alive for the duration of the LogError call.
        // The code unit is narrowed to char, as before.
        const std::string found(1, static_cast<char>(closeToken));
        this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter,
                                                this->_position - 1, 1, {found.c_str()});
        return new SimpleToken(TokenKind::BadToken, start, end - start + 1);
    }

    // Text between the quotes, escapes still unresolved.
    const std::u16string content = this->_scriptString.substr(start + 1, end - start);
    std::basic_ostringstream<char16_t> stream;
    for (size_t i = 0; i < content.size(); i++) {
        const char16_t current = content[i];
        if (current != '\\') {
            stream << current;
            continue;
        }

        // Step onto the escaped character.
        i++;
        const char16_t escapedChar = content[i];
        const auto replacement = ControlCharacters.find(escapedChar);
        if (replacement != ControlCharacters.end()) {
            stream << replacement->second;
        } else {
            const std::string sequence = "\\" + std::string(1, static_cast<char>(escapedChar));
            // `content` starts one past `start`, so the backslash of this
            // escape sits at script offset start + i.
            this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::InvalidStringControlCharacter,
                                                    start + i, 2, {sequence.c_str()});
            stream << escapedChar;
        }
    }
    return new StringToken(stream.str(), start, end - start);
}
2019-06-05 17:11:56 +00:00
}