// PorygonLang/src/Parser/Lexer.cpp

#include <utility>
#include <cmath>
#include <unordered_map>
#include <sstream>
#include "Lexer.hpp"
Lexer::Lexer(string scriptString, class Script* script) {
    this->ScriptString = std::move(scriptString);
    this->ScriptData = script;
    this->Position = 0;
}
vector<IToken*> Lexer::Lex() {
    vector<IToken*> tokens;
    while (true) {
        IToken* next = this->LexNext(this->Next());
        auto nextKind = next->GetKind();
        if (nextKind != TokenKind::WhiteSpace)
            tokens.push_back(next);
        if (nextKind == TokenKind::EndOfFile)
            break;
    }
    return tokens;
}
char Lexer::Peek() {
    if (this->Position >= this->ScriptString.length())
        return '\0';
    return this->ScriptString[this->Position];
}
char Lexer::Next() {
    char next = this->Peek();
    this->Position++;
    return next;
}
IToken* Lexer::LexNext(char c) {
    switch (c) {
        case '\0':
            return new SimpleToken(TokenKind::EndOfFile, this->Position - 1, 1);
        case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
            return new SimpleToken(TokenKind::WhiteSpace, this->Position - 1, 1);
        case '+':
            return new SimpleToken(TokenKind::PlusToken, this->Position - 1, 1);
        case '-':
            return new SimpleToken(TokenKind::MinusToken, this->Position - 1, 1);
        case '/':
            return new SimpleToken(TokenKind::SlashToken, this->Position - 1, 1);
        case '*':
            return new SimpleToken(TokenKind::StarToken, this->Position - 1, 1);
        case '(':
            return new SimpleToken(TokenKind::OpenParenthesis, this->Position - 1, 1);
        case ')':
            return new SimpleToken(TokenKind::CloseParenthesis, this->Position - 1, 1);
        case '=':
            if (this->Peek() == '=') {
                this->Next();
                return new SimpleToken(TokenKind::EqualityToken, this->Position - 2, 2);
            }
            return new SimpleToken(TokenKind::AssignmentToken, this->Position - 1, 1);
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            return LexNumber(c);
        case '"': case '\'':
            return LexString(c);
        case '_':
            return LexIdentifierOrKeyword();
        default:
            if (isalpha(c)) {
                return LexIdentifierOrKeyword();
            }
            this->ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->Position - 1, 1);
            return new SimpleToken(TokenKind::BadToken, this->Position - 1, 1);
    }
}
int CharToInt(char c) {
    switch (c) {
        case '0': return 0;
        case '1': return 1;
        case '2': return 2;
        case '3': return 3;
        case '4': return 4;
        case '5': return 5;
        case '6': return 6;
        case '7': return 7;
        case '8': return 8;
        case '9': return 9;
        default: return -1;
    }
}
IToken* Lexer::LexNumber(char c) {
    long int_value = CharToInt(c);
    double float_value = 0;
    short decimal_index = 0;
    bool has_point = false;
    bool is_searching = true;
    unsigned int start = this->Position - 1;
    unsigned int length = 1;
    while (is_searching) {
        char next = this->Peek();
        int next_val = CharToInt(next);
        if (next_val == -1) {
            switch (next) {
                case '_':
                    // Underscores act as digit separators and are skipped.
                    this->Next();
                    length++;
                    continue;
                case '.':
                    this->Next();
                    has_point = true;
                    decimal_index = 0;
                    float_value = int_value;
                    length++;
                    continue;
                default:
                    is_searching = false;
                    continue;
            }
        } else {
            this->Next();
            length++;
            if (has_point) {
                // The n-th digit after the point contributes digit / 10^n.
                decimal_index++;
                float_value += next_val / pow(10, decimal_index);
            } else {
                int_value *= 10;
                int_value += next_val;
            }
        }
    }
    if (has_point) {
        return new FloatToken(float_value, start, length);
    } else {
        return new IntegerToken(int_value, start, length);
    }
}
// Compile-time string hash, used below to switch on keyword text.
unsigned constexpr const_hash(char const* input) {
    return *input
           ? static_cast<unsigned int>(*input) + 33 * const_hash(input + 1)
           : 5381;
}
IToken* Lexer::LexIdentifierOrKeyword() {
    auto start = this->Position - 1;
    auto end = start;
    while (true) {
        char next = this->Peek();
        if (next == '\0') break;
        if (isalpha(next) || next == '_') {
            this->Next();
            end++;
        } else {
            break;
        }
    }
    string s = this->ScriptString.substr(start, end - start + 1);
    switch (const_hash(s.c_str())) {
        case const_hash("and"): return new SimpleToken(TokenKind::AndKeyword, start, 3);
        case const_hash("break"): return new SimpleToken(TokenKind::BreakKeyword, start, 5);
        case const_hash("do"): return new SimpleToken(TokenKind::DoKeyword, start, 2);
        case const_hash("else"): return new SimpleToken(TokenKind::ElseKeyword, start, 4);
        case const_hash("elseif"): return new SimpleToken(TokenKind::ElseIfKeyword, start, 6);
        case const_hash("end"): return new SimpleToken(TokenKind::EndKeyword, start, 3);
        case const_hash("false"): return new SimpleToken(TokenKind::FalseKeyword, start, 5);
        case const_hash("for"): return new SimpleToken(TokenKind::ForKeyword, start, 3);
        case const_hash("function"): return new SimpleToken(TokenKind::FunctionKeyword, start, 8);
        case const_hash("if"): return new SimpleToken(TokenKind::IfKeyword, start, 2);
        case const_hash("in"): return new SimpleToken(TokenKind::InKeyword, start, 2);
        case const_hash("local"): return new SimpleToken(TokenKind::LocalKeyword, start, 5);
        case const_hash("nil"): return new SimpleToken(TokenKind::NilKeyword, start, 3);
        case const_hash("not"): return new SimpleToken(TokenKind::NotKeyword, start, 3);
        case const_hash("or"): return new SimpleToken(TokenKind::OrKeyword, start, 2);
        case const_hash("return"): return new SimpleToken(TokenKind::ReturnKeyword, start, 6);
        case const_hash("then"): return new SimpleToken(TokenKind::ThenKeyword, start, 4);
        case const_hash("true"): return new SimpleToken(TokenKind::TrueKeyword, start, 4);
        case const_hash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);
        default: return new IdentifierToken(s, start, s.length());
    }
}
// Escape sequences recognized inside string literals, keyed by the character after the backslash.
const unordered_map<char, char> ControlCharacters{
    {'0', '\0'},
    {'a', '\a'},
    {'b', '\b'},
    {'t', '\t'},
    {'n', '\n'},
    {'v', '\v'},
    {'f', '\f'},
    {'r', '\r'},
    {'"', '\"'},
    {'\'', '\''},
    {'\?', '\?'},
    {'\\', '\\'},
};
IToken* Lexer::LexString(char c) {
    auto start = this->Position - 1;
    auto end = start;
    bool escaped = false;
    while (true) {
        char next = this->Peek();
        if (next == '\0') break;
        if (next == c && !escaped) break;
        this->Next();
        end++;
        // A backslash escapes the next character; an escaped backslash does not escape what follows it.
        escaped = !escaped && next == '\\';
    }
    auto closeToken = this->Next();
    if (closeToken != c) {
        this->ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->Position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, start, end - start + 1);
    }
    string s = this->ScriptString.substr(start + 1, end - start);
    stringstream stream;
    for (size_t i = 0; i < s.size(); i++) {
        c = s[i];
        if (c == '\\') {
            i++;
            c = s[i];
            if (ControlCharacters.find(c) != ControlCharacters.end()) {
                stream << ControlCharacters.at(c);
            } else {
                this->ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
                stream << c;
            }
        } else {
            stream << c;
        }
    }
    return new StringToken(stream.str(), start, end - start);
}
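
// Illustrative usage (a sketch only, not part of the original file): driving the lexer over a
// source string and walking the resulting tokens. The way Script is constructed below is an
// assumption made for illustration; only Lexer(string, Script*), Lex(), and IToken::GetKind()
// are taken from this file.
//
//     Script script;
//     Lexer lexer("local x = 10 + 2.5", &script);
//     vector<IToken*> tokens = lexer.Lex();
//     // Expected kinds, in order: LocalKeyword, an IdentifierToken for "x", AssignmentToken,
//     // an IntegerToken, PlusToken, a FloatToken, and EndOfFile.
//     for (IToken* token : tokens) {
//         (void)token->GetKind();
//     }
//     // Lex() returns raw owning pointers; cleanup is left to the caller.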