// PorygonLang/src/Parser/Lexer.cpp

#include <utility>
#include <cmath>
#include <cctype>
#include <unordered_map>
#include <sstream>

#include "Lexer.hpp"

// Creates a lexer that reads from the string `scriptString` points to.
// The pointer is stored as given (no copy is made here), the string's size
// is cached, and reading starts at position 0. `script` is kept in
// ScriptData, which the lexing functions use to reach the diagnostics sink.
Lexer::Lexer(string* scriptString, class Script* script) {
    _position = 0;
    ScriptData = script;
    _scriptString = scriptString;
    _scriptSize = _scriptString->size();
}

// Creates a lexer that reads from a private heap copy of `scriptString`.
// The copy's size is cached and reading starts at position 0. `script` is
// kept in ScriptData, which the lexing functions use to reach the
// diagnostics sink.
// NOTE(review): the copy is allocated with `new` and nothing in this file
// releases it - confirm where (or whether) _scriptString is freed.
Lexer::Lexer(const string& scriptString, class Script *script) {
    _position = 0;
    ScriptData = script;
    _scriptString = new string(scriptString);
    _scriptSize = _scriptString->size();
}


vector<IToken*> Lexer::Lex() {
    vector<IToken*> tokens;
    while (true){
        IToken* next = this -> LexNext(this -> Next());
        auto nextKind = next -> GetKind();
        if (nextKind != TokenKind::WhiteSpace)
            tokens.push_back(next);
        else
            delete next;
        if (nextKind == TokenKind::EndOfFile)
            break;
    }
    return tokens;
}

char Lexer::Peek(){
    if (Lexer::_position >= this -> _scriptSize)
        return '\0';
    return this -> _scriptString->at(Lexer::_position);
}

char Lexer::Next(){
    char next = Peek();
    Lexer::_position++;
    return next;
}

// Produces the token that starts with `c`, the character most recently
// returned by Next() (so `c` sits at _position - 1).
//
// Single-character punctuation becomes a SimpleToken directly. '=', '<' and
// '>' look one character ahead for a trailing '=' to form the two-character
// operators; '~' is only valid as part of "~=". Digits, quotes, letters and
// '_' are handed to the dedicated number / string / identifier lexers.
// Anything else is reported as UnexpectedCharacter and returned as a
// BadToken of length 1.
//
// The returned token is allocated with `new`.
IToken* Lexer::LexNext(char c){
    // Builds a SimpleToken spanning the `length` most recently consumed characters.
    const auto simple = [this](TokenKind kind, int length) -> IToken* {
        return new SimpleToken(kind, this -> _position - length, length);
    };
    // Consumes the next character if it is '=' and reports whether it did.
    const auto consumeEquals = [this]() -> bool {
        if (this -> Peek() != '=')
            return false;
        this -> Next();
        return true;
    };
    // Logs the character just consumed as unexpected and wraps it in a BadToken.
    const auto unexpected = [this, &simple]() -> IToken* {
        this -> ScriptData -> Diagnostics -> LogError(DiagnosticCode::UnexpectedCharacter, this -> _position - 1, 1);
        return simple(TokenKind::BadToken, 1);
    };

    switch (c) {
        case '\0':
            return simple(TokenKind::EndOfFile, 1);
        case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
            return simple(TokenKind::WhiteSpace, 1);
        case '+':
            return simple(TokenKind::PlusToken, 1);
        case '-':
            return simple(TokenKind::MinusToken, 1);
        case '/':
            return simple(TokenKind::SlashToken, 1);
        case '*':
            return simple(TokenKind::StarToken, 1);
        case '(':
            return simple(TokenKind::OpenParenthesis, 1);
        case ')':
            return simple(TokenKind::CloseParenthesis, 1);
        case '[':
            return simple(TokenKind::OpenSquareBracket, 1);
        case ']':
            return simple(TokenKind::CloseSquareBracket, 1);
        case ',':
            return simple(TokenKind::CommaToken, 1);
        case '.':
            return simple(TokenKind::PeriodToken, 1);
        case '=':
            return consumeEquals() ? simple(TokenKind::EqualityToken, 2)
                                   : simple(TokenKind::AssignmentToken, 1);
        case '<':
            return consumeEquals() ? simple(TokenKind::LessEquals, 2)
                                   : simple(TokenKind::Less, 1);
        case '>':
            return consumeEquals() ? simple(TokenKind::GreaterEquals, 2)
                                   : simple(TokenKind::Greater, 1);
        case '~':
            // '~' has no meaning on its own; only "~=" is accepted.
            if (consumeEquals())
                return simple(TokenKind::InequalityToken, 2);
            return unexpected();
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            return LexNumber(c);
        case '"': case '\'':
            return LexString(c);
        case '_':
            return LexIdentifierOrKeyword();
        default:
            // The <cctype> classifiers are undefined for negative arguments,
            // which a plain char holds for any non-ASCII byte when char is
            // signed; go through unsigned char first.
            if (isalpha(static_cast<unsigned char>(c))){
                return LexIdentifierOrKeyword();
            }
            return unexpected();
    }
}

// Converts an ASCII decimal digit to its numeric value.
// Returns 0-9 for '0'-'9' and -1 for every other character.
int CharToInt(char c){
    // The digits '0'..'9' are guaranteed to be contiguous and ascending.
    const bool is_digit = c >= '0' && c <= '9';
    return is_digit ? c - '0' : -1;
}

// Lexes a numeric literal whose first digit `c` has already been consumed.
//
// Digits are accumulated into an integer until a '.' is seen; from then on
// they are added as decimal fractions. '_' characters are consumed and
// counted in the token length but otherwise ignored. Only the first '.'
// belongs to the number: a second one ends the literal and is left for the
// caller, so "1.2.3" lexes as 1.2 followed by further tokens instead of
// silently discarding the ".2" part.
//
// Returns a FloatToken when a decimal point was consumed, otherwise an
// IntegerToken; both are allocated with `new` and carry the start position
// and the number of characters consumed.
IToken* Lexer::LexNumber(char c){
    long int_value = CharToInt(c);
    double float_value = 0;
    // Power of ten matching the fractional digit currently being added.
    double fraction_scale = 1;
    bool has_point = false;
    unsigned int start = this -> _position - 1;
    unsigned int length = 1;
    while (true){
        const char next = this -> Peek();
        if (next == '_'){
            // Separator: part of the token text, not of its value.
            this -> Next();
            length++;
            continue;
        }
        if (next == '.'){
            if (has_point)
                break;  // A number has at most one decimal point.
            this -> Next();
            length++;
            has_point = true;
            float_value = int_value;
            continue;
        }
        const int digit = CharToInt(next);
        if (digit == -1)
            break;
        this -> Next();
        length++;
        if (has_point){
            fraction_scale *= 10;
            float_value += digit / fraction_scale;
        }
        else {
            int_value = int_value * 10 + digit;
        }
    }
    if (has_point){
        return new FloatToken(float_value, start, length);
    }
    return new IntegerToken(int_value, start, length);
}

// Lexes an identifier or keyword whose first character (a letter or '_')
// has already been consumed by the caller.
//
// Following letters, digits and underscores are consumed as part of the
// word. Digits are accepted here (never as the first character, which the
// caller dispatches on) so that a name such as `foo1` stays a single token
// instead of being split into `foo` and the integer 1.
//
// The word is then compared, by compile-time hash, against the keyword
// list. A keyword yields a SimpleToken of the matching kind; anything else
// yields an IdentifierToken holding the text. Tokens are allocated with
// `new`.
IToken * Lexer::LexIdentifierOrKeyword() {
    auto start = this -> _position - 1;
    auto end = start;
    while (true){
        const char next = this -> Peek();
        // The <cctype> classifiers are undefined for negative arguments, so
        // convert through unsigned char. The '\0' that Peek() returns at
        // the end of the script is not alphanumeric and ends the loop.
        if (next != '_' && !isalnum(static_cast<unsigned char>(next)))
            break;
        this -> Next();
        end++;
    }

    string s = this -> _scriptString->substr(start, end - start + 1);
    switch (HashedString::ConstHash(s.c_str())){
        case HashedString::ConstHash("and"): return new SimpleToken(TokenKind::AndKeyword, start, 3);
        case HashedString::ConstHash("break"): return new SimpleToken(TokenKind::BreakKeyword, start, 5);
        case HashedString::ConstHash("do"): return new SimpleToken(TokenKind::DoKeyword, start, 2);
        case HashedString::ConstHash("else"): return new SimpleToken(TokenKind::ElseKeyword, start, 4);
        case HashedString::ConstHash("elseif"): return new SimpleToken(TokenKind::ElseIfKeyword, start, 6);
        case HashedString::ConstHash("end"): return new SimpleToken(TokenKind::EndKeyword, start, 3);
        case HashedString::ConstHash("false"): return new SimpleToken(TokenKind::FalseKeyword, start, 5);
        case HashedString::ConstHash("for"): return new SimpleToken(TokenKind::ForKeyword, start, 3);
        case HashedString::ConstHash("function"): return new SimpleToken(TokenKind::FunctionKeyword, start, 8);
        case HashedString::ConstHash("if"): return new SimpleToken(TokenKind::IfKeyword, start, 2);
        case HashedString::ConstHash("in"): return new SimpleToken(TokenKind::InKeyword, start, 2);
        case HashedString::ConstHash("local"): return new SimpleToken(TokenKind::LocalKeyword, start, 5);
        case HashedString::ConstHash("nil"): return new SimpleToken(TokenKind::NilKeyword, start, 3);
        case HashedString::ConstHash("not"): return new SimpleToken(TokenKind::NotKeyword, start, 3);
        case HashedString::ConstHash("or"): return new SimpleToken(TokenKind::OrKeyword, start, 2);
        case HashedString::ConstHash("return"): return new SimpleToken(TokenKind::ReturnKeyword, start, 6);
        case HashedString::ConstHash("then"): return new SimpleToken(TokenKind::ThenKeyword, start, 4);
        case HashedString::ConstHash("true"): return new SimpleToken(TokenKind::TrueKeyword, start, 4);
        case HashedString::ConstHash("while"): return new SimpleToken(TokenKind::WhileKeyword, start, 5);
        default: return new IdentifierToken(s, start, s.length());
    }
}

// Escape table for string literals: maps the character written after a
// backslash to the character it stands for. Characters that are not a key
// here are not valid escapes.
const unordered_map<char, char> ControlCharacters{
        {'0',  '\0'},   // NUL
        {'a',  '\a'},   // alert (bell)
        {'b',  '\b'},   // backspace
        {'t',  '\t'},   // horizontal tab
        {'n',  '\n'},   // line feed
        {'v',  '\v'},   // vertical tab
        {'f',  '\f'},   // form feed
        {'r',  '\r'},   // carriage return
        {'"',  '"'},    // double quote
        {'\'', '\''},   // single quote
        {'?',  '?'},    // question mark
        {'\\', '\\'},   // backslash
};

// Lexes a string literal. `c` is the opening quote character (either '"' or
// '\''), already consumed by the caller; the literal ends at the next
// unescaped occurrence of that same character.
//
// While scanning, a backslash and the character after it are consumed as a
// pair. This keeps an escaped quote from ending the literal and, unlike a
// check against only the previous character, also lets a literal end right
// after an escaped backslash ("a\\").
//
// If the script ends before the closing quote, UnexpectedCharacter is
// logged and a BadToken covering the opening quote and the scanned text is
// returned. Otherwise the escape sequences are resolved through
// ControlCharacters; an unknown escape logs InvalidStringControlCharacter
// at its position and keeps the escaped character itself.
//
// The StringToken is created with `start` (the opening quote) and
// `end - start`, i.e. the number of characters between the quotes.
IToken* Lexer::LexString(char c){
    auto start = this -> _position - 1;
    auto end = start;
    while (true){
        const char next = this -> Peek();
        if (next == '\0' || next == c) break;
        this -> Next();
        end++;
        if (next == '\\' && this -> Peek() != '\0'){
            // Take the escaped character together with its backslash so it
            // can neither close the string nor start another escape.
            this -> Next();
            end++;
        }
    }
    auto closeToken = this -> Next();
    if (closeToken != c){
        this -> ScriptData->Diagnostics->LogError(DiagnosticCode::UnexpectedCharacter, this->_position - 1, 1);
        return new SimpleToken(TokenKind::BadToken, start, end -start + 1);
    }

    // Text between the quotes, escapes still unresolved.
    const string raw = this -> _scriptString->substr(start + 1, end - start);
    string value;
    value.reserve(raw.size());
    for (size_t i = 0; i < raw.size(); i++){
        const char current = raw[i];
        if (current != '\\'){
            value.push_back(current);
            continue;
        }
        // The scan above guarantees a backslash is never the last character.
        i++;
        const char escaped = raw[i];
        const auto known = ControlCharacters.find(escaped);
        if (known != ControlCharacters.end()){
            value.push_back(known->second);
        } else{
            this -> ScriptData->Diagnostics->LogError(DiagnosticCode::InvalidStringControlCharacter, start + 1 + i, 1);
            value.push_back(escaped);
        }
    }
    return new StringToken(std::move(value), start, end - start );
}