PorygonLang/src/Parser/Lexer.cpp

275 lines
10 KiB
C++
Raw Normal View History

2019-05-18 18:35:51 +00:00
#include <utility>
#include <cmath>
2019-05-22 11:24:28 +00:00
#include <unordered_map>
#include <sstream>
2019-05-18 18:35:51 +00:00
#include "Lexer.hpp"
/// Creates a lexer over `scriptString`, reporting into `script`.
///
/// @param scriptString  Source text to tokenize; used to initialise `_scriptString`.
/// @param script        Stored as `ScriptData`; the lexing routines log diagnostics through it.
Lexer::Lexer(const u16string& scriptString, class Script* script)
        : _scriptString(scriptString)
{
    // Reading starts at the first character of the script.
    _position = 0;
    ScriptData = script;
    // Cache the length once; Peek() compares the cursor against it on every call.
    _scriptSize = scriptString.size();
}
2019-06-13 16:49:38 +00:00
/// Tokenizes the whole script.
///
/// Whitespace tokens are deleted as soon as they are produced; every other
/// token is appended to the result, including the final end-of-file token,
/// which also stops the loop.
///
/// @return The heap-allocated tokens in source order (raw pointers).
vector<const IToken*> Lexer::Lex() {
    vector<const IToken*> result;
    bool reachedEnd = false;
    while (!reachedEnd) {
        IToken* token = LexNext(Next());
        // Read the kind before the token is possibly deleted below.
        const auto kind = token->GetKind();
        reachedEnd = (kind == TokenKind::EndOfFile);
        if (kind == TokenKind::WhiteSpace) {
            delete token;
        } else {
            result.push_back(token);
        }
    }
    return result;
}
char16_t Lexer::Peek(){
2019-06-05 17:11:56 +00:00
if (Lexer::_position >= this -> _scriptSize)
2019-05-18 18:35:51 +00:00
return '\0';
return this -> _scriptString.at(Lexer::_position);
2019-05-18 18:35:51 +00:00
}
char16_t Lexer::Next(){
char16_t next = Peek();
2019-06-05 17:11:56 +00:00
Lexer::_position++;
2019-05-18 18:35:51 +00:00
return next;
}
/// Produces the token that starts with `c`.
///
/// `c` has already been consumed by the caller, so it sits at `_position - 1`.
/// One- and two-character operators are turned into SimpleTokens directly;
/// digits, quotes and identifier starts are delegated to LexNumber, LexString
/// and LexIdentifierOrKeyword. Anything else logs an UnexpectedCharacter
/// diagnostic and yields a BadToken.
///
/// @param c  The character just returned by Next().
/// @return   A newly allocated token; the caller takes ownership.
IToken* Lexer::LexNext(char16_t c){
    // Builds a SimpleToken spanning the last `length` consumed characters.
    const auto simple = [this](TokenKind kind, unsigned int length) -> IToken* {
        return new SimpleToken(kind, this -> _position - length, length);
    };
    // Consumes a following '=' if there is one and reports whether it did.
    const auto consumeEquals = [this]() -> bool {
        if (this -> Peek() != '=')
            return false;
        this -> Next();
        return true;
    };
    // Logs the current character as unexpected and wraps it in a BadToken.
    const auto unexpected = [this]() -> IToken* {
        this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> _position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, this -> _position - 1, 1);
    };

    switch (c) {
        case '\0':
            return simple(TokenKind::EndOfFile, 1);
        case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
            return simple(TokenKind::WhiteSpace, 1);
        case '+':
            return simple(TokenKind::PlusToken, 1);
        case '-':
            return simple(TokenKind::MinusToken, 1);
        case '/':
            return simple(TokenKind::SlashToken, 1);
        case '*':
            return simple(TokenKind::StarToken, 1);
        case '(':
            return simple(TokenKind::OpenParenthesis, 1);
        case ')':
            return simple(TokenKind::CloseParenthesis, 1);
        case '[':
            return simple(TokenKind::OpenSquareBracket, 1);
        case ']':
            return simple(TokenKind::CloseSquareBracket, 1);
        case '{':
            return simple(TokenKind::OpenCurlyBracket, 1);
        case '}':
            return simple(TokenKind::CloseCurlyBracket, 1);
        case ',':
            return simple(TokenKind::CommaToken, 1);
        case '.':
            return simple(TokenKind::PeriodToken, 1);
        case '=':
            return consumeEquals() ? simple(TokenKind::EqualityToken, 2)
                                   : simple(TokenKind::AssignmentToken, 1);
        case '<':
            return consumeEquals() ? simple(TokenKind::LessEquals, 2)
                                   : simple(TokenKind::Less, 1);
        case '>':
            return consumeEquals() ? simple(TokenKind::GreaterEquals, 2)
                                   : simple(TokenKind::Greater, 1);
        case '~':
            // '~' is only valid as the first half of "~=".
            return consumeEquals() ? simple(TokenKind::InequalityToken, 2)
                                   : unexpected();
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            return LexNumber(c);
        case '"':
        case '\'':
            return LexString(c);
        case '_':
            return LexIdentifierOrKeyword();
        default:
            // isalpha() is undefined for values that do not fit in an unsigned
            // char, and c is a 16-bit code unit, so range-check before calling.
            if (c <= 0xFF && isalpha(static_cast<unsigned char>(c))){
                return LexIdentifierOrKeyword();
            }
            return unexpected();
    }
}
/// Converts an ASCII decimal digit to its numeric value.
///
/// @param c  The character to convert.
/// @return   0-9 for the characters '0'-'9', and -1 for every other character.
int CharToInt(char16_t c){
    const bool isDigit = (c >= u'0') && (c <= u'9');
    if (!isDigit) {
        return -1;
    }
    return static_cast<int>(c - u'0');
}
/// Lexes a numeric literal whose first digit `c` has already been consumed.
///
/// Digits are accumulated until a character that cannot continue the literal
/// is seen. '_' is accepted anywhere as a separator and contributes nothing to
/// the value. The first '.' switches the literal to floating point; a second
/// '.' ends the literal and is left unconsumed for the caller (previously it
/// re-ran the decimal-point branch and discarded the fraction read so far, so
/// "1.2.3" produced 1.3).
///
/// NOTE(review): the integer part is accumulated in a `long` with no range
/// check, so a sufficiently long run of digits overflows.
///
/// @param c  The first digit of the literal.
/// @return   A new FloatToken if a decimal point was seen, otherwise a new
///           IntegerToken; both carry the start offset and consumed length.
IToken* Lexer::LexNumber(char16_t c){
    long int_value = CharToInt(c);
    double float_value = 0;
    int decimal_index = 0;
    bool has_point = false;
    // c was consumed by the caller, so the literal starts one before the cursor.
    unsigned int start = this -> _position - 1;
    unsigned int length = 1;
    while (true){
        char16_t next = this -> Peek();
        int next_val = CharToInt(next);
        if (next_val != -1){
            this -> Next();
            length++;
            if (has_point){
                // Each fractional digit contributes digit / 10^position.
                decimal_index++;
                float_value += next_val / pow(10, decimal_index);
            }
            else {
                int_value *= 10;
                int_value += next_val;
            }
            continue;
        }
        if (next == '_'){
            // Digit separator: consumed, counted in the length, otherwise ignored.
            this -> Next();
            length++;
            continue;
        }
        if (next == '.' && !has_point){
            this -> Next();
            length++;
            has_point = true;
            decimal_index = 0;
            float_value = int_value;
            continue;
        }
        // Any other character, including a second '.', ends the literal.
        break;
    }
    if (has_point){
        return new FloatToken(float_value, start, length);
    }
    return new IntegerToken(int_value, start, length);
}
2019-05-22 11:24:28 +00:00
/// Lexes an identifier or keyword whose first character has already been
/// consumed.
///
/// Consumes the following run of letters and underscores, then dispatches on
/// the hash of the collected text: the listed keywords become SimpleTokens of
/// the matching kind, anything else becomes an IdentifierToken.
///
/// NOTE(review): the keyword dispatch compares hashes only; presumably
/// collisions between an identifier and a keyword are considered negligible.
///
/// @return A newly allocated keyword or identifier token.
IToken * Lexer::LexIdentifierOrKeyword() {
    auto start = this -> _position - 1;
    auto end = start;
    while (true){
        char16_t next = this -> Peek();
        // isalpha() is undefined for values that do not fit in an unsigned
        // char, and `next` is a 16-bit code unit, so range-check before calling.
        const bool isLetter = next <= 0xFF && isalpha(static_cast<unsigned char>(next));
        // The NUL that Peek() returns at end of input is neither a letter nor
        // '_', so it ends the scan here as well.
        if (!isLetter && next != '_'){
            break;
        }
        this -> Next();
        end++;
    }

    u16string s = this -> _scriptString.substr(start, end - start + 1);
    switch (HashedString::ConstHash(s.c_str())){
        case HashedString::ConstHash("and"): return new SimpleToken(TokenKind::AndKeyword, start, 3);
        case HashedString::ConstHash("break"): return new SimpleToken(TokenKind::BreakKeyword, start, 5);
        case HashedString::ConstHash("do"): return new SimpleToken(TokenKind::DoKeyword, start, 2);
        case HashedString::ConstHash("else"): return new SimpleToken(TokenKind::ElseKeyword, start, 4);
        case HashedString::ConstHash("elseif"): return new SimpleToken(TokenKind::ElseIfKeyword, start, 6);
        case HashedString::ConstHash("end"): return new SimpleToken(TokenKind::EndKeyword, start, 3);
        case HashedString::ConstHash("false"): return new SimpleToken(TokenKind::FalseKeyword, start, 5);
        case HashedString::ConstHash("for"): return new SimpleToken(TokenKind::ForKeyword, start, 3);
        case HashedString::ConstHash("function"): return new SimpleToken(TokenKind::FunctionKeyword, start, 8);
        case HashedString::ConstHash("if"): return new SimpleToken(TokenKind::IfKeyword, start, 2);
        case HashedString::ConstHash("in"): return new SimpleToken(TokenKind::InKeyword, start, 2);
        case HashedString::ConstHash("local"): return new SimpleToken(TokenKind::LocalKeyword, start, 5);
        case HashedString::ConstHash("nil"): return new SimpleToken(TokenKind::NilKeyword, start, 3);
        case HashedString::ConstHash("not"): return new SimpleToken(TokenKind::NotKeyword, start, 3);
        case HashedString::ConstHash("or"): return new SimpleToken(TokenKind::OrKeyword, start, 2);
        case HashedString::ConstHash("return"): return new SimpleToken(TokenKind::ReturnKeyword, start, 6);
        case HashedString::ConstHash("then"): return new SimpleToken(TokenKind::ThenKeyword, start, 4);
        case HashedString::ConstHash("true"): return new SimpleToken(TokenKind::TrueKeyword, start, 4);
        case HashedString::ConstHash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);
        default: return new IdentifierToken(HashedString(s), start, s.length());
    }
}
/// Escape table for string literals: maps the character written after a
/// backslash to the character it stands for.
const std::unordered_map<char16_t, char16_t> ControlCharacters{ // NOLINT(cert-err58-cpp)
        // Characters that escape to themselves.
        {u'\\', u'\\'},
        {u'"',  u'"'},
        {u'\'', u'\''},
        {u'?',  u'?'},
        // Control characters.
        {u'0', u'\0'},
        {u'a', u'\a'},
        {u'b', u'\b'},
        {u'f', u'\f'},
        {u'n', u'\n'},
        {u'r', u'\r'},
        {u't', u'\t'},
        {u'v', u'\v'},
};
/// Lexes a string literal whose opening quote `c` has already been consumed.
///
/// Scans to the matching unescaped quote, then decodes backslash escapes via
/// ControlCharacters. An unknown escape logs InvalidStringControlCharacter and
/// keeps the escaped character as-is. If the input ends (or a NUL is met)
/// before the closing quote, UnexpectedCharacter is logged and a BadToken is
/// returned.
///
/// Escape state is tracked explicitly so that an escaped backslash right
/// before the closing quote (as in "a\\") no longer hides that quote; the
/// earlier check only looked at whether the previous character was a backslash.
///
/// NOTE(review): the StringToken length passed is `end - start`, which is
/// smaller than the BadToken length used for the same span; kept as in the
/// original — confirm whether the quotes are meant to be included.
///
/// @param c  The opening quote character (either '"' or '\'').
/// @return   A new StringToken holding the decoded text, or a BadToken.
IToken* Lexer::LexString(char16_t c){
    auto start = this -> _position - 1;
    auto end = start;
    // True while the character under the cursor is preceded by an unescaped backslash.
    bool escaped = false;
    while (true){
        char16_t next = this -> Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this -> Next();
        end++;
        // A backslash starts an escape unless it is itself being escaped.
        escaped = !escaped && next == '\\';
    }
    auto closeToken = this -> Next();
    if (closeToken != c){
        this -> ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->_position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, start, end -start + 1);
    }

    // Raw text between the quotes, escapes not yet decoded.
    u16string s = this -> _scriptString.substr(start + 1, end - start);
    u16string decoded;
    decoded.reserve(s.size());
    for (u16string::size_type i = 0; i < s.size(); i++){
        char16_t current = s[i];
        if (current != '\\'){
            decoded.push_back(current);
            continue;
        }
        // Step onto the escaped character. The scan above guarantees a
        // backslash is never the last character of `s`.
        i++;
        current = s[i];
        const auto found = ControlCharacters.find(current);
        if (found != ControlCharacters.end()){
            decoded.push_back(found->second);
        } else{
            this -> ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
            decoded.push_back(current);
        }
    }
    return new StringToken(std::move(decoded), start, end - start );
}