PorygonLang/src/Parser/Lexer.cpp

#include <utility>
#include <cmath>

#include "Lexer.hpp"

// Builds a lexer over `scriptString`, starting at offset 0.
// `script` is stored as-is in ScriptData; LexNext reports diagnostics through it.
Lexer::Lexer(string scriptString, class Script* script) {
    Position = 0;
    ScriptData = script;
    // The parameter is taken by value, so moving it avoids a second copy of the text.
    ScriptString = std::move(scriptString);
}

// Lexes the whole script and returns the tokens in source order.
// The last element is always the EndOfFile token. Every token is allocated
// with `new` by LexNext and is not freed here.
vector<IToken*> Lexer::Lex() {
    vector<IToken*> result;
    IToken* current = nullptr;
    do {
        current = this -> LexNext(this -> Next());
        result.push_back(current);
    } while (current->GetKind() != TokenKind::EndOfFile);
    return result;
}

// Returns the character at the current position without consuming it,
// or '\0' once the position is past the end of the script.
char Lexer::Peek(){
    // At Position == length() the string's own operator[] yields '\0'
    // (guaranteed since C++11), so only strictly larger positions are special-cased.
    const bool past_end = this -> Position > this -> ScriptString.length();
    return past_end ? '\0' : this -> ScriptString[this -> Position];
}

char Lexer::Next(){
    char next = Peek();
    Lexer::Position++;
    return next;
}

// Produces the token that starts with `c`, a character the caller has already
// consumed (so its offset is Position - 1).
//
// Single-character tokens are returned directly; "==" consumes one extra
// character. Digits are handed to LexNumber, letters and '_' to
// LexIdentifierOrKeyword. Any other character is reported as
// DiagnosticCode::UnexpectedCharacter and returned as a BadToken.
// The returned token is allocated with `new`.
IToken* Lexer::LexNext(char c){
    switch (c) {
        case '\0':
            return new SimpleToken(TokenKind::EndOfFile, this -> Position - 1, 1);
        case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
            return new SimpleToken(TokenKind::WhiteSpace, this -> Position - 1, 1);
        case '+':
            return new SimpleToken(TokenKind::PlusToken, this -> Position - 1, 1);
        case '-':
            return new SimpleToken(TokenKind::MinusToken, this -> Position - 1, 1);
        case '/':
            return new SimpleToken(TokenKind::SlashToken, this -> Position - 1, 1);
        case '*':
            return new SimpleToken(TokenKind::StarToken, this -> Position - 1, 1);
        case '=':
            if (Lexer::Peek() == '='){
                Lexer::Next();
                // Both '=' characters are consumed now, so the token starts two back.
                return new SimpleToken(TokenKind::EqualityToken, this -> Position - 2, 2);
            }
            return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1);
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            return LexNumber(c);
        case '_':
            return LexIdentifierOrKeyword(c);
        default:
            // isalpha() is undefined for negative arguments other than EOF, and plain
            // `char` may be signed; convert through unsigned char so bytes >= 0x80
            // (e.g. UTF-8 in the script) are classified safely.
            if (isalpha(static_cast<unsigned char>(c))){
                return LexIdentifierOrKeyword(c);
            }
            this -> ScriptData->Diagnostics.LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);
            return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1);
    }
}

// Maps a decimal digit character to its numeric value (0-9).
// Returns -1 for every other character.
int CharToInt(char c){
    // The C++ standard guarantees '0'..'9' are contiguous and ascending.
    const bool is_digit = (c >= '0') && (c <= '9');
    return is_digit ? (c - '0') : -1;
}

// Lexes a numeric literal whose first digit `c` has already been consumed.
//
// Digits are accumulated until a character that cannot continue the literal is
// peeked; that character is left unconsumed. '_' is accepted anywhere as a
// digit separator and ignored. The first '.' switches to fractional
// accumulation; a second '.' ends the literal instead of discarding the
// fraction already read.
//
// Returns a FloatToken if a '.' was seen, otherwise an IntegerToken. The token
// is allocated with `new`; its length counts separators and the '.'.
//
// NOTE(review): int_value is a plain `long` and is not checked for overflow,
// so a very long digit run overflows it without any diagnostic.
IToken* Lexer::LexNumber(char c){
    long int_value = CharToInt(c);
    double float_value = 0;
    // Power of ten matching the current fractional digit (10, 100, ...). Kept as a
    // running double so it can never wrap negative the way a small integer
    // exponent counter could.
    double divisor = 1;
    bool has_point = false;
    const unsigned int start = this -> Position - 1;
    unsigned int length = 1;
    while (true){
        const char next = this -> Peek();
        if (next == '_'){
            // Digit separator: consumed, counted in the length, otherwise ignored.
            this -> Next();
            length++;
            continue;
        }
        if (next == '.'){
            // A literal holds at most one decimal point; leave a second one for the
            // next token rather than resetting the value parsed so far.
            if (has_point)
                break;
            this -> Next();
            length++;
            has_point = true;
            float_value = int_value;
            continue;
        }
        const int next_val = CharToInt(next);
        if (next_val == -1)
            break;
        this -> Next();
        length++;
        if (has_point){
            divisor *= 10;
            float_value += next_val / divisor;
        }
        else{
            int_value = int_value * 10 + next_val;
        }
    }
    if (has_point){
        return new FloatToken(float_value, start, length);
    }
    return new IntegerToken(int_value, start, length);
}

// Compile-time string hash used to switch over keyword spellings.
// The empty string hashes to 5381; each character contributes
// `char + 33 * hash(rest of string)`, evaluated recursively from the end of
// the string towards the front, in unsigned (wrapping) arithmetic.
// Kept as a single return statement so it remains valid C++11 constexpr.
constexpr unsigned int const_hash(const char* input) {
    return (*input == '\0')
           ? 5381u
           : 33u * const_hash(input + 1) + static_cast<unsigned int>(*input);
}

// Lexes an identifier or keyword whose first character `c` has already been
// consumed.
//
// Consumes the following run of letters and '_' (digits are not part of the
// run), then compares the collected text against the keyword table. A keyword
// yields a SimpleToken of the matching kind; anything else yields an
// IdentifierToken carrying the text. The token is allocated with `new`.
IToken* Lexer::LexIdentifierOrKeyword(char c){
    struct Keyword { const char* Text; TokenKind Kind; };
    // Keywords are matched by full string comparison. Selecting on a 32-bit hash
    // alone would turn any identifier that collides with a keyword's hash into
    // that keyword.
    static const Keyword keywords[] = {
        {"and", TokenKind::AndKeyword},
        {"break", TokenKind::BreakKeyword},
        {"do", TokenKind::DoKeyword},
        {"else", TokenKind::ElseKeyword},
        {"elseif", TokenKind::ElseIfKeyword},
        {"end", TokenKind::EndKeyword},
        {"false", TokenKind::FalseKeyword},
        {"for", TokenKind::ForKeyword},
        {"function", TokenKind::FunctionKeyword},
        {"if", TokenKind::IfKeyword},
        {"in", TokenKind::InKeyword},
        {"local", TokenKind::LocalKeyword},
        {"nil", TokenKind::NilKeyword},
        {"not", TokenKind::NotKeyword},
        {"or", TokenKind::OrKeyword},
        {"return", TokenKind::ReturnKeyword},
        {"then", TokenKind::ThenKeyword},
        {"true", TokenKind::TrueKeyword},
        {"while", TokenKind::WhileKeyword},
    };

    const auto start = this -> Position - 1;
    string s(1, c);
    while (true){
        const char next = this -> Peek();
        // Cast through unsigned char: isalpha() is undefined for negative values,
        // and plain `char` may be signed. '\0' (end of script) fails both tests,
        // so it also ends the run.
        if (next != '_' && !isalpha(static_cast<unsigned char>(next)))
            break;
        this -> Next();
        s.push_back(next);
    }

    for (const Keyword& keyword : keywords){
        if (s == keyword.Text)
            return new SimpleToken(keyword.Kind, start, static_cast<unsigned int>(s.length()));
    }
    return new IdentifierToken(s, start, s.length());
}
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`#include <utility>`
			`#include <cmath>`

			`#include "Lexer.hpp"`

Add support for diagnostics 2019-05-21 11:56:08 +00:00			`Lexer::Lexer(string scriptString, class Script* script) {`
			`this -> ScriptString = std::move(scriptString);`
			`this -> ScriptData = script;`
			`this -> Position = 0;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`

			`vector<IToken*> Lexer::Lex() {`
			`vector<IToken*> tokens;`
			`while (true){`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`IToken* next = this -> LexNext(this -> Next());`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`tokens.push_back(next);`
			`if (next->GetKind() == TokenKind::EndOfFile)`
			`break;`
			`}`
			`return tokens;`
			`}`

			`char Lexer::Peek(){`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`if (Lexer::Position > this -> ScriptString.length())`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`return '\0';`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return this -> ScriptString[Lexer::Position];`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`

			`char Lexer::Next(){`
			`char next = Peek();`
			`Lexer::Position++;`
			`return next;`
			`}`

			`IToken* Lexer::LexNext(char c){`
			`switch (c) {`
			`case '\0':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::EndOfFile, this -> Position - 1, 1);`
Adds \v and \f as whitespace characters 2019-05-19 13:28:45 +00:00			`case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::WhiteSpace, this -> Position - 1, 1);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`case '+':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::PlusToken, this -> Position - 1, 1);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`case '-':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::MinusToken, this -> Position - 1, 1);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`case '/':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::SlashToken, this -> Position - 1, 1);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`case '*':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::StarToken, this -> Position - 1, 1);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`case '=':`
			`if (Lexer::Peek() == '='){`
			`Lexer::Next();`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::EqualityToken, this -> Position - 2, 2);`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`}`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`return new SimpleToken(TokenKind::AssignmentToken, this -> Position - 1, 1);`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':`
			`return LexNumber(c);`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`case '_':`
			`return LexIdentifierOrKeyword(c);`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`default:`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`if (isalpha(c)){`
			`return LexIdentifierOrKeyword(c);`
			`}`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`this -> ScriptData->Diagnostics.LogError(DiagnosticCode::UnexpectedCharacter, this -> Position - 1, 1);`
			`return new SimpleToken(TokenKind::BadToken, this -> Position - 1, 1);`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`
			`}`

			`int CharToInt(char c){`
			`switch (c){`
			`case '0': return 0;`
			`case '1': return 1;`
			`case '2': return 2;`
			`case '3': return 3;`
			`case '4': return 4;`
			`case '5': return 5;`
			`case '6': return 6;`
			`case '7': return 7;`
			`case '8': return 8;`
			`case '9': return 9;`
			`default: return -1;`
			`}`
			`}`

			`IToken* Lexer::LexNumber(char c){`
			`long int_value = CharToInt(c);`
			`double float_value = 0;`
			`short decimal_index = 0;`
			`bool has_point = false;`
			`bool is_searching = true;`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`unsigned int start = this -> Position - 1;`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`unsigned int length = 1;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`while (is_searching){`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`char next = this -> Peek();`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`int next_val = CharToInt(next);`
			`if (next_val == -1){`
			`switch (next){`
Dont consume the character immediately following a number 2019-05-19 10:20:08 +00:00			`case '_':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`this -> Next();`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`length++;`
Dont consume the character immediately following a number 2019-05-19 10:20:08 +00:00			`continue;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`case '.':`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`this -> Next();`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`has_point = true;`
			`decimal_index = 0;`
			`float_value = int_value;`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`length++;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`continue;`
			`default:`
			`is_searching = false;`
			`continue;`
			`}`
			`}`
			`else{`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`this -> Next();`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`length++;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`if (has_point){`
			`decimal_index++;`
			`float_value += next_val / pow(10, decimal_index);`
			`}`
			`else {`
			`int_value *= 10;`
			`int_value += next_val;`
			`}`
			`}`
			`}`
			`if (has_point){`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`return new FloatToken(float_value, start, length);`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`
			`else{`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`return new IntegerToken(int_value, start, length);`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`
			`}`

Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`unsigned constexpr const_hash(char const *input) {`
			`return *input ?`
			`static_cast<unsigned int>(*input) + 33 * const_hash(input + 1) :`
			`5381;`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`

Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`IToken* Lexer::LexIdentifierOrKeyword(char c){`
			`vector<char> charVec(1, c);`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`auto start = this -> Position - 1;`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`while (true){`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`char next = this -> Peek();`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`if (next == '\0') break;`
			`if (isalpha(next) || next == '_'){`
Add support for diagnostics 2019-05-21 11:56:08 +00:00			`this -> Next();`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`charVec.push_back(next);`
			`}`
			`else{`
			`break;`
			`}`
Initial commit, adds very basic Lexing 2019-05-18 18:35:51 +00:00			`}`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`string s = string(charVec.begin(), charVec.end());`
			`switch (const_hash(s.c_str())){`
Save position and length of tokens 2019-05-19 14:11:16 +00:00			`case const_hash("and"): return new SimpleToken(TokenKind::AndKeyword, start, 3);`
			`case const_hash("break"): return new SimpleToken(TokenKind::BreakKeyword, start, 5);`
			`case const_hash("do"): return new SimpleToken(TokenKind::DoKeyword, start, 2);`
			`case const_hash("else"): return new SimpleToken(TokenKind::ElseKeyword, start, 4);`
			`case const_hash("elseif"): return new SimpleToken(TokenKind::ElseIfKeyword, start, 6);`
			`case const_hash("end"): return new SimpleToken(TokenKind::EndKeyword, start, 3);`
			`case const_hash("false"): return new SimpleToken(TokenKind::FalseKeyword, start, 5);`
			`case const_hash("for"): return new SimpleToken(TokenKind::ForKeyword, start, 3);`
			`case const_hash("function"): return new SimpleToken(TokenKind::FunctionKeyword, start, 8);`
			`case const_hash("if"): return new SimpleToken(TokenKind::IfKeyword, start, 2);`
			`case const_hash("in"): return new SimpleToken(TokenKind::InKeyword, start, 2);`
			`case const_hash("local"): return new SimpleToken(TokenKind::LocalKeyword, start, 5);`
			`case const_hash("nil"): return new SimpleToken(TokenKind::NilKeyword, start, 3);`
			`case const_hash("not"): return new SimpleToken(TokenKind::NotKeyword, start, 3);`
			`case const_hash("or"): return new SimpleToken(TokenKind::OrKeyword, start, 2);`
			`case const_hash("return"): return new SimpleToken(TokenKind::ReturnKeyword, start, 6);`
			`case const_hash("then"): return new SimpleToken(TokenKind::ThenKeyword, start, 4);`
			`case const_hash("true"): return new SimpleToken(TokenKind::TrueKeyword, start, 4);`
			`case const_hash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);`
			`default: return new IdentifierToken(s, start, s.length());`
Add a couple more characters to lex 2019-05-19 10:49:26 +00:00			`}`
Added lexing support for identifiers and keywords 2019-05-19 12:26:21 +00:00			`}`