// NOTE(review): this copy of the file arrived with every `<...>` span stripped.
//  - The explicit template arguments of each Create call are missing (the calls
//    read `Create>(` or `Create(`). They cannot be recovered from this file and
//    are left exactly as found; restore them from version control before
//    building.
//  - The two angle-bracket includes were blank as well. They are filled in with
//    the headers the code below requires (pow, std::logic_error,
//    std::out_of_range).

#include "Lexer.hpp"
#include <cmath>
#include <stdexcept>
#include "NumericalLexers.hpp"

namespace MalachScript::Parser {
    /// Lexes the whole script into a singly linked list of tokens.
    ///
    /// Tokens are produced by LexNext and chained with SetNext until a token of
    /// kind LexTokenKind::EndOfFile has been produced; that token is the last
    /// element of the list.
    ///
    /// @return The first token of the list (the EndOfFile token itself when it
    ///         is the first token produced).
    const LexToken* Lexer::Lex() {
        auto* first = LexNext();
        auto* last = first;
        while (last->GetKind() != LexTokenKind::EndOfFile) {
            auto* next = LexNext();
            last->SetNext(next);
            last = next;
        }
        return first;
    }

    /// Lexes a single token starting at the current position.
    ///
    /// One character is taken with Consume and dispatched on. Operators are
    /// matched greedily (longest first) by peeking at the following characters;
    /// the comment above each return names the operator being produced. Digits,
    /// quotes and identifier characters are handed to the specialised lexers.
    /// Any other character is reported as UnknownCharacter.
    ///
    /// `start` is the value of _position before the character is consumed; all
    /// spans built here are expressed relative to it.
    LexToken* Lexer::LexNext() {
        auto start = _position;
        auto c = Consume();
        switch (c) {
            case '\0': return Create>(TextSpan(start + 1, start + 2));
            case '*': {
                auto n = Peek();
                if (n == '*') {
                    Progress();
                    n = Peek();
                    if (n == '=') {
                        Progress();
                        // **=
                        return Create>(TextSpan(start, start + 3));
                    }
                    // **
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '=') {
                    Progress();
                    // *=
                    return Create>(TextSpan(start, start + 2));
                }
                // *
                return Create>(TextSpan(start, start + 1));
            }
            case '/':
                if (Peek() == '=') {
                    Progress();
                    // /=
                    return Create>(TextSpan(start, start + 2));
                }
                // /
                return Create>(TextSpan(start, start + 1));
            case '%':
                if (Peek() == '=') {
                    Progress();
                    // %=
                    return Create>(TextSpan(start, start + 2));
                }
                // %
                return Create>(TextSpan(start, start + 1));
            case '+': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // +=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '+') {
                    Progress();
                    // ++
                    return Create>(TextSpan(start, start + 2));
                }
                // +
                return Create>(TextSpan(start, start + 1));
            }
            case '-': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // -=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '-') {
                    Progress();
                    // --
                    return Create>(TextSpan(start, start + 2));
                }
                // -
                return Create>(TextSpan(start, start + 1));
            }
            case '<': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // <=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '<') {
                    Progress();
                    if (Peek() == '=') {
                        Progress();
                        // <<=
                        return Create>(TextSpan(start, start + 3));
                    }
                    // <<
                    return Create>(TextSpan(start, start + 2));
                }
                // <
                return Create>(TextSpan(start, start + 1));
            }
            case '>': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // >=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '>') {
                    Progress();
                    n = Peek();
                    if (n == '=') {
                        Progress();
                        // >>=
                        return Create>(TextSpan(start, start + 3));
                    }
                    if (n == '>') {
                        Progress();
                        if (Peek() == '=') {
                            Progress();
                            // >>>=
                            return Create>(TextSpan(start, start + 4));
                        }
                        // >>>
                        return Create>(TextSpan(start, start + 3));
                    }
                    // >>
                    return Create>(TextSpan(start, start + 2));
                }
                // >
                return Create>(TextSpan(start, start + 1));
            }
            case '(': return Create>(TextSpan(start, start + 1));
            case ')': return Create>(TextSpan(start, start + 1));
            case '=': {
                if (Peek() == '=') {
                    Progress();
                    // ==
                    return Create>(TextSpan(start, start + 2));
                }
                // =
                return Create>(TextSpan(start, start + 1));
            }
            case '!': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // !=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == 'i' && Peek(2) == 's') {
                    Progress(2);
                    // !is
                    return Create>(TextSpan(start, start + 3));
                }
                // !
                return Create>(TextSpan(start, start + 1));
            }
            case '?': return Create>(TextSpan(start, start + 1));
            case ':': {
                if (Peek() == ':') {
                    Progress();
                    // ::
                    return Create>(TextSpan(start, start + 2));
                }
                // :
                return Create>(TextSpan(start, start + 1));
            }
            case '&': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // &=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '&') {
                    Progress();
                    // &&
                    return Create>(TextSpan(start, start + 2));
                }
                // &
                return Create>(TextSpan(start, start + 1));
            }
            case ',': return Create>(TextSpan(start, start + 1));
            case '{': return Create>(TextSpan(start, start + 1));
            case '}': return Create>(TextSpan(start, start + 1));
            case ';': return Create>(TextSpan(start, start + 1));
            case '|': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // |=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '|') {
                    Progress();
                    // ||
                    return Create>(TextSpan(start, start + 2));
                }
                // |
                return Create>(TextSpan(start, start + 1));
            }
            case '^': {
                auto n = Peek();
                if (n == '=') {
                    Progress();
                    // ^=
                    return Create>(TextSpan(start, start + 2));
                }
                if (n == '^') {
                    Progress();
                    // ^^
                    return Create>(TextSpan(start, start + 2));
                }
                // ^
                return Create>(TextSpan(start, start + 1));
            }
            case '~': return Create>(TextSpan(start, start + 1));
            case '.': return Create>(TextSpan(start, start + 1));
            case '[': return Create>(TextSpan(start, start + 1));
            case ']': return Create>(TextSpan(start, start + 1));
            case '@': return Create>(TextSpan(start, start + 1));
            // Each whitespace character becomes its own token.
            case ' ':
            case '\r':
            case '\n':
            case '\t': return Create>(TextSpan(start, start + 1));
            // Byte order mark
            case '\xEF': {
                if (Peek() == '\xBB' && Peek(2) == '\xBF') {
                    Progress(2);
                    return Create>(TextSpan(start, start + 3));
                }
                // A lone 0xEF that does not start a BOM is an unknown character.
                LogError(Diagnostics::DiagnosticType::UnknownCharacter, TextSpan(start, start + 1),
                         {std::string(1, c)});
                return Create>(TextSpan(start, start + 1));
            }
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9': return LexNumerical(c);
            case '\'': return LexString('\'', false);
            case '"': {
                // Two further quotes after the consumed one open a heredoc string.
                if (Peek() == '"' && Peek(2) == '\"') {
                    return LexString('"', true);
                }
                return LexString('"', false);
            }
            default:
                if (IsAlphaNumericalOrUnderscore(c)) {
                    return LexKeywordOrIdentifier();
                }
                LogError(Diagnostics::DiagnosticType::UnknownCharacter, TextSpan(start, start + 1),
                         {std::string(1, static_cast<char>(c))});
                return Create>(TextSpan(start, start + 1));
        }
    }

    /// Lexes a numeric literal whose first digit `c` has already been consumed.
    ///
    /// When the first digit is 0 and the following character is neither '.' nor
    /// a decimal digit, that character selects the numeral system: x/X
    /// hexadecimal, d/D decimal, o/O octal, b/B binary. Any other follower
    /// yields the literal 0 on its own. In every remaining case the literal is
    /// lexed as decimal.
    ///
    /// 255 is treated here as the "not a digit" result of LexDecimalValue
    /// (declared outside this file).
    ///
    /// @param c The already consumed first digit.
    /// @throws std::logic_error if the selected numeral system has no lexer
    ///         (unreachable with the systems assigned above).
    LexToken* Lexer::LexNumerical(char c) {
        auto initialValue = LexDecimalValue(c);
        auto numericalSystem = 10; // Default to decimal system.
        if (initialValue == 0) {
            auto secondChar = Peek();
            auto secondValue = LexDecimalValue(secondChar);
            if (secondChar != '.' && secondValue == 255) {
                switch (secondChar) {
                    case 'x':
                    case 'X':
                        Progress();
                        numericalSystem = 16;
                        break;
                    case 'd':
                    case 'D':
                        Progress();
                        numericalSystem = 10;
                        break;
                    case 'o':
                    case 'O':
                        Progress();
                        numericalSystem = 8;
                        break;
                    case 'b':
                    case 'B':
                        Progress();
                        numericalSystem = 2;
                        break;
                    default: return Create(TextSpan(_position - 1, _position), 0);
                }
            }
        }
        switch (numericalSystem) {
            case 10: return LexDecimal(initialValue);
            case 16: return LexHexadecimal();
            case 8: return LexOctal();
            case 2: return LexBinary();
            default: throw std::logic_error("Not implemented");
        }
    }

    /// Returns 10 raised to the power `n` from a lookup table.
    ///
    /// The table holds 10^0 through 10^18, the largest power of ten that fits
    /// in an int64_t.
    ///
    /// @param n Exponent, 0 <= n <= 18.
    /// @return 10^n.
    /// @throws std::out_of_range if n lies outside [0, 18].
    constexpr int64_t quick_pow10(int n) {
        constexpr int tableSize = 19;
        // One entry per line so a repeated or skipped power is easy to spot.
        constexpr int64_t pow10[tableSize] = {
            1,                         // 10^0
            10,                        // 10^1
            100,                       // 10^2
            1'000,                     // 10^3
            10'000,                    // 10^4
            100'000,                   // 10^5
            1'000'000,                 // 10^6
            10'000'000,                // 10^7
            100'000'000,               // 10^8
            1'000'000'000,             // 10^9
            10'000'000'000,            // 10^10
            100'000'000'000,           // 10^11
            1'000'000'000'000,         // 10^12
            10'000'000'000'000,        // 10^13
            100'000'000'000'000,       // 10^14
            1'000'000'000'000'000,     // 10^15
            10'000'000'000'000'000,    // 10^16
            100'000'000'000'000'000,   // 10^17
            1'000'000'000'000'000'000, // 10^18
        };
        if (n < 0 || n >= tableSize) {
            throw std::out_of_range("quick_pow10: exponent outside [0, 18]");
        }
        return pow10[n];
    }

    /// Lexes the remainder of a decimal literal (integer or floating point).
    ///
    /// Digits are accumulated into the integer part until a '.' switches to the
    /// fractional part; an 'e'/'E' met while in the fractional part switches to
    /// the exponent. A '.' after the exponent has started ends the literal.
    /// The first character that fits none of these ends the literal and is left
    /// unconsumed.
    ///
    /// At most 18 fractional digits contribute to the value; further fractional
    /// digits are consumed but ignored. This keeps the divisor inside the range
    /// of quick_pow10 and bounds the accumulated fraction below 10^18.
    ///
    /// @param initial Value of the digit that was consumed before this call.
    /// @return A floating point literal token when a '.' was seen, otherwise an
    ///         integer literal token.
    LexToken* Lexer::LexDecimal(ParseInt initial) {
        constexpr uint8_t maxFractionDigits = 18;

        auto start = _position;
        ParseInt value = initial;
        ParseInt decimalValue = 0;
        ParseInt exponentValue = 0;
        uint8_t decimalLength = 0;
        bool isDecimal = false;
        bool isExponent = false;
        while (true) {
            auto v = static_cast<ParseInt>(LexDecimalValue(Peek()));
            if (v == 255) {
                // Not a digit: it may still switch to the fraction or exponent.
                if (!isDecimal && !isExponent && Peek() == '.') {
                    isDecimal = true;
                    Progress();
                    continue;
                }
                if (isDecimal && (Peek() == 'e' || Peek() == 'E')) {
                    isDecimal = false;
                    isExponent = true;
                    Progress();
                    continue;
                }
                break;
            }
            Progress();
            if (isDecimal) {
                if (decimalLength < maxFractionDigits) {
                    decimalValue *= 10;
                    decimalValue += v;
                    decimalLength++;
                }
            } else if (isExponent) {
                exponentValue *= 10;
                exponentValue += v;
            } else {
                value *= 10;
                value += v;
            }
        }
        if (isDecimal || isExponent) {
            auto val = value + (static_cast<ParseFloat>(decimalValue) / quick_pow10(decimalLength));
            if (isExponent) {
                val *= pow(10, exponentValue);
            }
            return Create(TextSpan(start, _position), val);
        }
        return Create(TextSpan(start, _position), value);
    }

    /// Lexes the digits of a hexadecimal literal; the prefix has already been
    /// consumed by LexNumerical.
    ///
    /// Digits are read until LexHexadecimalValue reports 255; each digit shifts
    /// the accumulated value left by four bits. The span starts one position
    /// before the position at entry.
    IntegerLiteral* Lexer::LexHexadecimal() {
        auto start = _position;
        ParseInt value = 0;
        while (true) {
            auto v = LexHexadecimalValue(Peek());
            if (v == 255) {
                break;
            }
            Progress();
            value <<= 4;
            value += v;
        }
        return Create(TextSpan(start - 1, _position), value);
    }

    /// Lexes the digits of an octal literal; the prefix has already been
    /// consumed by LexNumerical.
    ///
    /// Digits are read until LexOctalValue reports 255; each digit shifts the
    /// accumulated value left by three bits. The span starts one position
    /// before the position at entry.
    IntegerLiteral* Lexer::LexOctal() {
        auto start = _position;
        ParseInt value = 0;
        while (true) {
            auto v = LexOctalValue(Peek());
            if (v == 255) {
                break;
            }
            Progress();
            value <<= 3;
            value += v;
        }
        return Create(TextSpan(start - 1, _position), value);
    }

    /// Lexes the digits of a binary literal; the prefix has already been
    /// consumed by LexNumerical.
    ///
    /// Digits are read until LexBinaryValue reports 255; each digit shifts the
    /// accumulated value left by one bit. The span starts one position before
    /// the position at entry.
    IntegerLiteral* Lexer::LexBinary() {
        auto start = _position;
        ParseInt value = 0;
        while (true) {
            auto v = LexBinaryValue(Peek());
            if (v == 255) {
                break;
            }
            Progress();
            value <<= 1;
            value += v;
        }
        return Create(TextSpan(start - 1, _position), value);
    }

    /// Lexes a string literal.
    ///
    /// A regular string ends at the next `opening` character. A heredoc string
    /// ends at a run of three double quotes that is not followed by a fourth.
    /// Reaching the end of the script, or a line break inside a non-heredoc
    /// string, logs ExpectedEndOfString and ends the literal at that point.
    ///
    /// @param opening The quote character that opened the string.
    /// @param heredoc True when the string was opened with three double quotes.
    /// @return A string literal token whose span runs from the position at entry
    ///         to the position after the closing delimiter has been skipped,
    ///         and whose value is ParseString applied to the raw contents.
    StringLiteral* Lexer::LexString(char opening, bool heredoc) {
        auto openingPos = _position;
        Progress();
        if (heredoc) {
            // Skip the two remaining quotes of the opening delimiter.
            Progress(2);
        }
        auto start = _position;
        size_t offset = 0;
        while (true) {
            auto current = Peek(offset);
            if (heredoc) {
                if (current == '"' && Peek(offset + 1) == '"' && Peek(offset + 2) == '"' &&
                    Peek(offset + 3) != '"') {
                    break;
                }
            } else if (current == opening) {
                break;
            }
            if (current == '\0') {
                LogError(Diagnostics::DiagnosticType::ExpectedEndOfString, TextSpan(start, start + offset),
                         {"EndOfFile"});
                break;
            }
            if (!heredoc && (current == '\n' || current == '\r')) {
                LogError(Diagnostics::DiagnosticType::ExpectedEndOfString, TextSpan(start, start + offset),
                         {"Newline"});
                break;
            }
            offset++;
        }
        Progress(offset);
        if (heredoc) {
            // Skip the two remaining quotes of the closing delimiter.
            Progress(2);
        }
        // _position is already an absolute offset into the script, so it is the
        // span end as-is (the same form the other lexers in this file use).
        return Create(TextSpan(openingPos, _position), ParseString(_script.substr(start, offset)));
    }

    /// Lexes a keyword or an identifier whose first character has already been
    /// consumed.
    ///
    /// The run of identifier characters beginning at the current position is
    /// measured with Peek, copied out of the script and hashed with
    /// Identifier::Hash. A hash equal to that of a known keyword yields the
    /// keyword's token; anything else yields an identifier token carrying the
    /// text, its length and its hash.
    LexToken* Lexer::LexKeywordOrIdentifier() {
        auto start = _position;
        auto offset = 0;
        while (IsAlphaNumericalOrUnderscore(Peek(offset))) {
            offset++;
        }
        auto str = _script.substr(start, offset);
        // Advance by one less than the length: the first character was consumed
        // by the caller.
        Progress(offset - 1);
        auto hash = Identifier::Hash(str);
        switch (hash) {
            case Identifier::Hash("and"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("abstract"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("auto"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("bool"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("break"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("case"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("cast"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("catch"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("class"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("const"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("continue"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("default"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("do"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("double"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("else"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("enum"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("explicit"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("external"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("false"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("final"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("float"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("for"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("from"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("funcdef"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("function"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("get"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("if"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("import"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("in"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("inout"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("int"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("interface"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("int8"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("int16"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("int32"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("int64"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("is"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("mixin"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("namespace"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("not"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("null"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("or"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("out"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("override"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("private"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("property"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("protected"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("return"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("set"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("shared"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("super"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("switch"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("this"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("true"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("try"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("typedef"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("uint"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("uint8"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("uint16"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("uint32"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("uint64"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("void"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("while"): return Create>(TextSpan(start, _position));
            case Identifier::Hash("xor"): return Create>(TextSpan(start, _position));
            default: return Create(TextSpan(start, _position), Identifier(str.data(), offset, hash));
        }
    }

    /// Reports whether `c` is an ASCII letter, an ASCII digit or an underscore.
    bool Lexer::IsAlphaNumericalOrUnderscore(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
    }
}