#include #include #include #include #include "Lexer.hpp" namespace Porygon::Parser { Lexer::Lexer(const u16string &scriptString, Porygon::Script *script) : _scriptString(scriptString) { this->_scriptSize = scriptString.size(); this->ScriptData = script; this->_position = 0; } vector Lexer::Lex() { vector tokens; while (true) { IToken *next = this->LexNext(this->Next()); auto nextKind = next->GetKind(); if (nextKind != TokenKind::WhiteSpace) tokens.push_back(next); else delete next; if (nextKind == TokenKind::EndOfFile) break; } return tokens; } char16_t Lexer::Peek() { if (Lexer::_position >= this->_scriptSize) return '\0'; return this->_scriptString.at(Lexer::_position); } char16_t Lexer::Next() { char16_t next = Peek(); Lexer::_position++; return next; } IToken *Lexer::LexNext(char16_t c) { switch (c) { case '\0': return new SimpleToken(TokenKind::EndOfFile, this->_position - 1, 1); case ' ': case '\t': case '\n': case '\r': case '\v': case '\f': return new SimpleToken(TokenKind::WhiteSpace, this->_position - 1, 1); case '+': return new SimpleToken(TokenKind::PlusToken, this->_position - 1, 1); case '-': return new SimpleToken(TokenKind::MinusToken, this->_position - 1, 1); case '/': return new SimpleToken(TokenKind::SlashToken, this->_position - 1, 1); case '*': return new SimpleToken(TokenKind::StarToken, this->_position - 1, 1); case '(': return new SimpleToken(TokenKind::OpenParenthesis, this->_position - 1, 1); case ')': return new SimpleToken(TokenKind::CloseParenthesis, this->_position - 1, 1); case '[': return new SimpleToken(TokenKind::OpenSquareBracket, this->_position - 1, 1); case ']': return new SimpleToken(TokenKind::CloseSquareBracket, this->_position - 1, 1); case '{': return new SimpleToken(TokenKind::OpenCurlyBracket, this->_position - 1, 1); case '}': return new SimpleToken(TokenKind::CloseCurlyBracket, this->_position - 1, 1); case ',': return new SimpleToken(TokenKind::CommaToken, this->_position - 1, 1); case '.': return new SimpleToken(TokenKind::PeriodToken, this->_position - 1, 1); case '=': if (Lexer::Peek() == '=') { Lexer::Next(); return new SimpleToken(TokenKind::EqualityToken, this->_position - 2, 2); } return new SimpleToken(TokenKind::AssignmentToken, this->_position - 1, 1); case '<': if (Lexer::Peek() == '=') { Lexer::Next(); return new SimpleToken(TokenKind::LessEquals, this->_position - 2, 2); } return new SimpleToken(TokenKind::Less, this->_position - 1, 1); case '>': if (Lexer::Peek() == '=') { Lexer::Next(); return new SimpleToken(TokenKind::GreaterEquals, this->_position - 2, 2); } return new SimpleToken(TokenKind::Greater, this->_position - 1, 1); case '~': if (Lexer::Peek() == '=') { Lexer::Next(); return new SimpleToken(TokenKind::InequalityToken, this->_position - 2, 2); } this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter, this->_position - 1, 1); return new SimpleToken(TokenKind::BadToken, this->_position - 1, 1); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return LexNumber(c); case '"': return LexString(c); case '\'': return LexString(c); case '_': return LexIdentifierOrKeyword(); default: if (isalpha(c) || c > 255) { return LexIdentifierOrKeyword(); } this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter, this->_position - 1, 1); return new SimpleToken(TokenKind::BadToken, this->_position - 1, 1); } } int CharToInt(char16_t c) { switch (c) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; case '7': return 7; case '8': return 8; case '9': return 9; default: return -1; } } IToken *Lexer::LexNumber(char16_t c) { long int_value = CharToInt(c); double float_value = 0; short decimal_index = 0; bool has_point = false; bool is_searching = true; unsigned int start = this->_position - 1; unsigned int length = 1; while (is_searching) { char16_t next = this->Peek(); int next_val = CharToInt(next); if (next_val == -1) { switch (next) { case '_': this->Next(); length++; continue; case '.': this->Next(); has_point = true; decimal_index = 0; float_value = int_value; length++; continue; default: is_searching = false; continue; } } else { this->Next(); length++; if (has_point) { decimal_index++; float_value += next_val / pow(10, decimal_index); } else { int_value *= 10; int_value += next_val; } } } if (has_point) { return new FloatToken(float_value, start, length); } else { return new IntegerToken(int_value, start, length); } } IToken *Lexer::LexIdentifierOrKeyword() { auto start = this->_position - 1; auto end = start; while (true) { char16_t next = this->Peek(); if (next == '\0') break; if (isalpha(next) || next == '_' || next > 255) { this->Next(); end++; } else { break; } } u16string s = this->_scriptString.substr(start, end - start + 1); switch (HashedString::ConstHash(s.c_str())) { case HashedString::ConstHash("and"): return new SimpleToken(TokenKind::AndKeyword, start, 3); case HashedString::ConstHash("break"): return new SimpleToken(TokenKind::BreakKeyword, start, 5); case HashedString::ConstHash("do"): return new SimpleToken(TokenKind::DoKeyword, start, 2); case HashedString::ConstHash("else"): return new SimpleToken(TokenKind::ElseKeyword, start, 4); case HashedString::ConstHash("elseif"): return new SimpleToken(TokenKind::ElseIfKeyword, start, 6); case HashedString::ConstHash("end"): return new SimpleToken(TokenKind::EndKeyword, start, 3); case HashedString::ConstHash("false"): return new SimpleToken(TokenKind::FalseKeyword, start, 5); case HashedString::ConstHash("for"): return new SimpleToken(TokenKind::ForKeyword, start, 3); case HashedString::ConstHash("function"): return new SimpleToken(TokenKind::FunctionKeyword, start, 8); case HashedString::ConstHash("if"): return new SimpleToken(TokenKind::IfKeyword, start, 2); case HashedString::ConstHash("in"): return new SimpleToken(TokenKind::InKeyword, start, 2); case HashedString::ConstHash("local"): return new SimpleToken(TokenKind::LocalKeyword, start, 5); case HashedString::ConstHash("nil"): return new SimpleToken(TokenKind::NilKeyword, start, 3); case HashedString::ConstHash("not"): return new SimpleToken(TokenKind::NotKeyword, start, 3); case HashedString::ConstHash("or"): return new SimpleToken(TokenKind::OrKeyword, start, 2); case HashedString::ConstHash("return"): return new SimpleToken(TokenKind::ReturnKeyword, start, 6); case HashedString::ConstHash("then"): return new SimpleToken(TokenKind::ThenKeyword, start, 4); case HashedString::ConstHash("true"): return new SimpleToken(TokenKind::TrueKeyword, start, 4); case HashedString::ConstHash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5); default: return new IdentifierToken(HashedString(new u16string(s)), start, s.length()); } } const unordered_map ControlCharacters{ // NOLINT(cert-err58-cpp) {'0', '\0'}, {'a', '\a'}, {'b', '\b'}, {'t', '\t'}, {'n', '\n'}, {'v', '\v'}, {'f', '\f'}, {'r', '\r'}, {'"', '\"'}, {'\'', '\''}, {'\?', '\?'}, {'\\', '\\'}, }; IToken *Lexer::LexString(char16_t c) { auto start = this->_position - 1; auto end = start; char16_t last = c; while (true) { char16_t next = this->Peek(); if (next == '\0') break; if (next == c && last != '\\') break; this->Next(); end++; last = next; } auto closeToken = this->Next(); if (closeToken != c) { const char* s = string(1, closeToken).c_str(); this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::UnexpectedCharacter, this->_position - 1, 1, { s }); return new SimpleToken(TokenKind::BadToken, start, end - start + 1); } u16string s = this->_scriptString.substr(start + 1, end - start); std::basic_ostringstream stream; for (int i = 0; i < s.size(); i++) { c = s[i]; if (c == '\\') { i++; c = s[i]; if (ControlCharacters.find(c) != ControlCharacters.end()) { stream << ControlCharacters.at(c); } else { auto v = ("\\" + string(1, c)).c_str(); this->ScriptData->Diagnostics->LogError(Diagnostics::DiagnosticCode::InvalidStringControlCharacter, start + i, 2, {v}); stream << c; } } else { stream << c; } } return new StringToken(stream.str(), start, end - start); } }