// Lexer implementation: turns the script text into a singly linked chain of LexToken objects.
//
// NOTE(review): in this copy of the file everything that used to sit between angle brackets appears
// to be missing (include targets, and template argument lists such as in `Create>(...)`,
// `std::unique_ptr(next)` and `static_cast(*input)`). Restore those from version control before
// building; the code below is left untouched.
//
// Overview of the definitions in this file:
//
// Lexer::Lex()
//   Calls LexNext() repeatedly, stores each new token in the previous token's `_next`, and stops
//   once a token whose kind is LexTokenKind::EndOfFile has been produced. Returns the first token.
//
// Lexer::LexNext()
//   Remembers the current position, consumes one character and switches on it. Operators are
//   matched longest-first by peeking at the following characters (for example `>`, `>=`, `>>`,
//   `>>=`, `>>>`, `>>>=`). Space, CR, LF and TAB share one case. Digits go to LexNumerical(),
//   quotes go to LexString() (a run of double quotes selects the heredoc form), and any other
//   character accepted by IsAlphaNumericalOrUnderscore() goes to LexKeywordOrIdentifier().
//   Anything else logs DiagnosticType::UnknownToken and still produces a token.
//   NOTE(review): the second TextSpan argument is a small count (1..4) in the early cases but
//   `start + n` from the `^` case onward; presumably only one of these matches TextSpan's
//   constructor - confirm which.
//   NOTE(review): the byte-order-mark case (`\xEF`) has no return or break when the next two bytes
//   are not `\xBB` `\xBF`, so control falls through into the digit cases and LexNumerical(c) is
//   called with `\xEF` - looks unintended.
//
// Lexer::LexNumerical(c)
//   Converts the first digit with LexDecimalValue(). When that digit is 0 and the next character
//   is neither '.' nor a decimal digit (LexDecimalValue() returning 255 is treated as "not a
//   digit"), the next character is consumed as a base prefix: x/X = 16, d/D = 10, o/O = 8,
//   b/B = 2. Any other character logs DiagnosticType::InvalidNumericalBase and falls back to base
//   16. The work is then handed to LexDecimal / LexHexadecimal / LexOctal / LexBinary.
//   NOTE(review): as written, a lone `0` followed by any other character (for example `0;`)
//   consumes that character and takes the InvalidNumericalBase path - confirm this is intended.
//
// quick_pow10(n)
//   Table lookup for powers of ten, 20 entries, no bounds check on n.
//   NOTE(review): the value 10000000000000 appears twice in a row (indices 13 and 14), so every
//   entry from index 14 onward is a factor of ten smaller than its index suggests.
//
// Lexer::LexDecimal(initial)
//   Accumulates digits into an integer part, then (after a '.') into a fractional part, then
//   (after 'e'/'E', which is only recognised while in the fractional part) into an exponent.
//   If a fraction or exponent was seen, the result is computed as a double
//   (value + fraction / quick_pow10(decimalLength), times pow(10, exponent)); otherwise the
//   integer value is used.
//   NOTE(review): decimalLength is a uint8_t used directly as the quick_pow10 index, so more than
//   19 fractional digits would index past the table. No sign is handled in the exponent.
//
// Lexer::LexHexadecimal() / LexOctal() / LexBinary()
//   Shift-and-add loops (4, 3 and 1 bits per digit) that stop at the first character for which
//   the matching Lex*Value() helper returns 255.
//
// Lexer::LexString(opening, heredoc)
//   Skips the opening delimiter (two extra characters for heredoc), then scans forward with
//   Peek(offset) until the closing delimiter. For heredoc that is three double quotes not
//   followed by a fourth. A NUL character, or a CR/LF in a non-heredoc string, logs
//   DiagnosticType::ExpectedEndOfString and ends the scan. The token text is
//   `_script.substr(start, offset)`.
//   NOTE(review): the span is built as TextSpan(start, start + _position); this looks like it
//   should end at `_position` - confirm.
//
// Hash(input)
//   Recursive constexpr string hash (character value + 33 * hash of the remainder, seeded with
//   5381 at the terminator), used so keywords can be matched in a switch statement.
//
// Lexer::LexKeywordOrIdentifier()
//   Measures the run of identifier characters, copies it out of `_script`, advances by
//   `offset - 1`, and switches on Hash() of the text to pick a keyword token. The default case
//   produces a token carrying the text itself.
//   NOTE(review): only hashes are compared, never the strings, so an identifier whose hash
//   collides with a keyword's hash would be lexed as that keyword.
//
// Lexer::IsAlphaNumericalOrUnderscore(c)
//   True for ASCII a-z, A-Z, 0-9 and '_'.
#include "Lexer.hpp" #include #include #include "NumericalLexers.hpp" namespace MalachScript::Parser { const LexToken* Lexer::Lex() { auto* first = LexNext(); if (first->GetKind() == LexTokenKind::EndOfFile) { return first; } auto* last = first; while (true) { auto* next = LexNext(); last->_next = std::unique_ptr(next); last = next; if (next->GetKind() == LexTokenKind::EndOfFile) { break; } } return first; } LexToken* Lexer::LexNext() { auto start = _position; auto c = Consume(); switch (c) { case u8'\0': return Create>(TextSpan(start, 1)); case u8'*': { auto n = Peek(); if (n == u8'*') { Progress(); n = Peek(); if (n == u8'=') { Progress(); // **= return Create>(TextSpan(start, 3)); } // ** return Create>(TextSpan(start, 2)); } if (n == u8'=') { Progress(); // *= return Create>(TextSpan(start, 2)); } // * return Create>(TextSpan(start, 1)); } case u8'/': if (Peek() == u8'=') { Progress(); // /= return Create>(TextSpan(start, 2)); } // / return Create>(TextSpan(start, 1)); case u8'%': if (Peek() == u8'=') { Progress(); // %= return Create>(TextSpan(start, 2)); } // % return Create>(TextSpan(start, 1)); case u8'+': { auto n = Peek(); if (n == u8'=') { Progress(); // += return Create>(TextSpan(start, 2)); } if (n == u8'+') { Progress(); // ++ return Create>(TextSpan(start, 2)); } // + return Create>(TextSpan(start, 1)); } case u8'-': { auto n = Peek(); if (n == u8'=') { Progress(); // -= return Create>(TextSpan(start, 2)); } if (n == u8'-') { Progress(); // -- return Create>(TextSpan(start, 2)); } // - return Create>(TextSpan(start, 1)); } case u8'<': { auto n = Peek(); if (n == u8'=') { Progress(); // <= return Create>(TextSpan(start, 2)); } if (n == u8'<') { Progress(); if (Peek() == u8'=') { Progress(); // <<= return Create>(TextSpan(start, 3)); } // << return Create>(TextSpan(start, 2)); } // < return Create>(TextSpan(start, 1)); } case u8'>': { auto n = Peek(); if (n == u8'=') { Progress(); // >= return Create>(TextSpan(start, 2)); } if (n == u8'>') { 
Progress(); n = Peek(); if (n == u8'=') { Progress(); // >>= return Create>( TextSpan(start, 3)); } if (n == u8'>') { Progress(); if (Peek() == u8'=') { Progress(); // >>>= return Create>( TextSpan(start, 4)); } // >>> return Create>( TextSpan(start, 3)); } // >> return Create>(TextSpan(start, 2)); } // > return Create>(TextSpan(start, 1)); } case u8'(': return Create>(TextSpan(start, 1)); case u8')': return Create>(TextSpan(start, 1)); case u8'=': { if (Peek() == u8'=') { Progress(); // == return Create>(TextSpan(start, 2)); } // = return Create>(TextSpan(start, 1)); } case u8'!': { auto n = Peek(); if (n == u8'=') { Progress(); // != return Create>(TextSpan(start, 2)); } if (n == u8'i' && Peek(2) == u8's') { Progress(2); // !is return Create>(TextSpan(start, 3)); } // ! return Create>(TextSpan(start, 1)); } case u8'?': return Create>(TextSpan(start, 1)); case u8':': { if (Peek() == u8':') { Progress(); // :: return Create>(TextSpan(start, 2)); } // : return Create>(TextSpan(start, 1)); } case u8'&': { auto n = Peek(); if (n == u8'=') { Progress(); // &= return Create>(TextSpan(start, 2)); } if (n == u8'&') { Progress(); // && return Create>(TextSpan(start, 2)); } // & return Create>(TextSpan(start, 1)); } case u8',': return Create>(TextSpan(start, 1)); case u8'{': return Create>(TextSpan(start, 1)); case u8'}': return Create>(TextSpan(start, 1)); case u8';': return Create>(TextSpan(start, 1)); case u8'|': { auto n = Peek(); if (n == u8'=') { Progress(); // |= return Create>(TextSpan(start, 2)); } if (n == u8'|') { Progress(); // || return Create>(TextSpan(start, 2)); } // | return Create>(TextSpan(start, 1)); } case u8'^': { auto n = Peek(); if (n == u8'=') { Progress(); // ^= return Create>(TextSpan(start, start + 2)); } if (n == u8'^') { Progress(); // ^^ return Create>(TextSpan(start, start + 2)); } // ^ return Create>(TextSpan(start, start + 1)); } case u8'~': return Create>(TextSpan(start, start + 1)); case u8'.': return Create>(TextSpan(start, start + 1)); 
case u8'[': return Create>(TextSpan(start, start + 1)); case u8']': return Create>(TextSpan(start, start + 1)); case u8'@': return Create>(TextSpan(start, start + 1)); case u8' ': case u8'\r': case u8'\n': case u8'\t': return Create>(TextSpan(start, start + 1)); // Byte order mark case u8'\xEF': { if (Peek() == u8'\xBB' && Peek(2) == u8'\xBF') { Progress(2); return Create>(TextSpan(start, start + 3)); } } case u8'0': case u8'1': case u8'2': case u8'3': case u8'4': case u8'5': case u8'6': case u8'7': case u8'8': case u8'9': return LexNumerical(c); case u8'\'': return LexString(u8'\'', false); case u8'"': { if (Peek() == '"' && Peek(2) == '\"') { return LexString(u8'"', true); } return LexString(u8'"', false); } default: if (IsAlphaNumericalOrUnderscore(c)) return LexKeywordOrIdentifier(); LogError(Diagnostics::DiagnosticType::UnknownToken, TextSpan(start, start + 1)); return Create>(TextSpan(start, start + 1)); } } LexToken* Lexer::LexNumerical(char8_t c) { auto initialValue = LexDecimalValue(c); auto numericalSystem = 10; // Default to decimal system. if (initialValue == 0) { auto secondChar = Peek(); auto secondValue = LexDecimalValue(secondChar); if (secondChar != '.' && secondValue == 255) { Progress(); switch (secondChar) { case 'x': case 'X': numericalSystem = 16; break; case 'd': case 'D': numericalSystem = 10; break; case 'o': case 'O': numericalSystem = 8; break; case 'b': case 'B': numericalSystem = 2; break; default: LogError(Diagnostics::DiagnosticType::InvalidNumericalBase, TextSpan(_position - 1, _position + 1)); // Set to the largest numerical system, so we can prevent errors down the line. 
numericalSystem = 16; break; } } } switch (numericalSystem) { case 10: return LexDecimal(initialValue); case 16: return LexHexadecimal(); case 8: return LexOctal(); case 2: return LexBinary(); default: throw std::logic_error("Not implemented"); } } constexpr int64_t quick_pow10(int n) { constexpr int64_t pow10[20] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 10000000000000, 100000000000000, 1000000000000000, 10000000000000000, 100000000000000000, 1000000000000000000}; return pow10[n]; } LexToken* Lexer::LexDecimal(uint64_t initial) { auto start = _position; uint64_t value = initial; uint64_t decimalValue = 0; uint64_t exponentValue = 0; uint8_t decimalLength = 0; bool isDecimal = false; bool isExponent = false; while (true) { auto v = (uint64_t)LexDecimalValue(Peek()); if (v == 255) { if (!isDecimal && Peek() == u8'.') { isDecimal = true; Progress(); continue; } if (isDecimal && (Peek() == u8'e' || Peek() == u8'E')) { isDecimal = false; isExponent = true; Progress(); continue; } break; } Progress(); if (isDecimal) { decimalValue *= 10; decimalValue += v; decimalLength++; } else if (isExponent) { exponentValue *= 10; exponentValue += v; } else { value *= 10; value += v; } } if (isDecimal || isExponent) { auto val = value + ((double)decimalValue / quick_pow10(decimalLength)); if (isExponent) { val *= pow(10, exponentValue); } return Create(TextSpan(start, _position), val); } return Create(TextSpan(start, _position), value); } IntegerLiteral* Lexer::LexHexadecimal() { auto start = _position; uint64_t value = 0; while (true) { auto v = LexHexadecimalValue(Peek()); if (v == 255) { break; } Progress(); value <<= 4; value += v; } return Create(TextSpan(start, _position), value); } IntegerLiteral* Lexer::LexOctal() { auto start = _position; uint64_t value = 0; while (true) { auto v = LexOctalValue(Peek()); if (v == 255) { break; } Progress(); value <<= 3; value += v; } return 
Create(TextSpan(start, _position), value); } IntegerLiteral* Lexer::LexBinary() { auto start = _position; uint64_t value = 0; while (true) { auto v = LexBinaryValue(Peek()); if (v == 255) { break; } Progress(); value <<= 1; value += v; } return Create(TextSpan(start, _position), value); } StringLiteral* Lexer::LexString(char8_t opening, bool heredoc) { Progress(); if (heredoc) { Progress(2); } auto start = _position; size_t offset = 0; while (true) { auto current = Peek(offset); if (heredoc) { if (current == '"' && Peek(offset + 1) == '"' && Peek(offset + 2) == '"' && Peek(offset + 3) != '"') { break; } } else if (current == opening) { break; } if (current == u8'\0') { LogError(Diagnostics::DiagnosticType::ExpectedEndOfString, TextSpan(start, start + offset)); break; } if (!heredoc && (current == u8'\n' || current == u8'\r')) { LogError(Diagnostics::DiagnosticType::ExpectedEndOfString, TextSpan(start, start + offset)); break; } offset++; } Progress(offset); if (heredoc) { Progress(2); } return Create(TextSpan(start, start + _position), std::u8string(_script.substr(start, offset))); } static uint32_t constexpr Hash(const char8_t* input) { return *input != 0U ? 
static_cast(*input) + 33 * Hash(input + 1) : 5381; }; LexToken* Lexer::LexKeywordOrIdentifier() { auto start = _position; auto offset = 0; while (IsAlphaNumericalOrUnderscore(Peek(offset))) { offset++; } auto str = std::u8string(_script.substr(start, offset)); Progress(offset - 1); switch (Hash(str.c_str())) { case Hash(u8"and"): return Create>(TextSpan(start, _position)); case Hash(u8"abstract"): return Create>(TextSpan(start, _position)); case Hash(u8"auto"): return Create>(TextSpan(start, _position)); case Hash(u8"bool"): return Create>(TextSpan(start, _position)); case Hash(u8"break"): return Create>(TextSpan(start, _position)); case Hash(u8"case"): return Create>(TextSpan(start, _position)); case Hash(u8"cast"): return Create>(TextSpan(start, _position)); case Hash(u8"catch"): return Create>(TextSpan(start, _position)); case Hash(u8"class"): return Create>(TextSpan(start, _position)); case Hash(u8"const"): return Create>(TextSpan(start, _position)); case Hash(u8"continue"): return Create>(TextSpan(start, _position)); case Hash(u8"default"): return Create>(TextSpan(start, _position)); case Hash(u8"do"): return Create>(TextSpan(start, _position)); case Hash(u8"double"): return Create>(TextSpan(start, _position)); case Hash(u8"else"): return Create>(TextSpan(start, _position)); case Hash(u8"enum"): return Create>(TextSpan(start, _position)); case Hash(u8"explicit"): return Create>(TextSpan(start, _position)); case Hash(u8"external"): return Create>(TextSpan(start, _position)); case Hash(u8"false"): return Create>(TextSpan(start, _position)); case Hash(u8"final"): return Create>(TextSpan(start, _position)); case Hash(u8"float"): return Create>(TextSpan(start, _position)); case Hash(u8"for"): return Create>(TextSpan(start, _position)); case Hash(u8"from"): return Create>(TextSpan(start, _position)); case Hash(u8"funcdef"): return Create>(TextSpan(start, _position)); case Hash(u8"function"): return Create>(TextSpan(start, _position)); case Hash(u8"get"): return 
Create>(TextSpan(start, _position)); case Hash(u8"if"): return Create>(TextSpan(start, _position)); case Hash(u8"import"): return Create>(TextSpan(start, _position)); case Hash(u8"in"): return Create>(TextSpan(start, _position)); case Hash(u8"inout"): return Create>(TextSpan(start, _position)); case Hash(u8"int"): return Create>(TextSpan(start, _position)); case Hash(u8"interface"): return Create>(TextSpan(start, _position)); case Hash(u8"int8"): return Create>(TextSpan(start, _position)); case Hash(u8"int16"): return Create>(TextSpan(start, _position)); case Hash(u8"int32"): return Create>(TextSpan(start, _position)); case Hash(u8"int64"): return Create>(TextSpan(start, _position)); case Hash(u8"is"): return Create>(TextSpan(start, _position)); case Hash(u8"mixin"): return Create>(TextSpan(start, _position)); case Hash(u8"namespace"): return Create>(TextSpan(start, _position)); case Hash(u8"not"): return Create>(TextSpan(start, _position)); case Hash(u8"null"): return Create>(TextSpan(start, _position)); case Hash(u8"or"): return Create>(TextSpan(start, _position)); case Hash(u8"out"): return Create>(TextSpan(start, _position)); case Hash(u8"override"): return Create>(TextSpan(start, _position)); case Hash(u8"private"): return Create>(TextSpan(start, _position)); case Hash(u8"property"): return Create>(TextSpan(start, _position)); case Hash(u8"protected"): return Create>(TextSpan(start, _position)); case Hash(u8"return"): return Create>(TextSpan(start, _position)); case Hash(u8"set"): return Create>(TextSpan(start, _position)); case Hash(u8"shared"): return Create>(TextSpan(start, _position)); case Hash(u8"super"): return Create>(TextSpan(start, _position)); case Hash(u8"switch"): return Create>(TextSpan(start, _position)); case Hash(u8"this"): return Create>(TextSpan(start, _position)); case Hash(u8"true"): return Create>(TextSpan(start, _position)); case Hash(u8"try"): return Create>(TextSpan(start, _position)); case Hash(u8"typedef"): return 
Create>(TextSpan(start, _position)); case Hash(u8"uint"): return Create>(TextSpan(start, _position)); case Hash(u8"uint8"): return Create>(TextSpan(start, _position)); case Hash(u8"uint16"): return Create>(TextSpan(start, _position)); case Hash(u8"uint32"): return Create>(TextSpan(start, _position)); case Hash(u8"uint64"): return Create>(TextSpan(start, _position)); case Hash(u8"void"): return Create>(TextSpan(start, _position)); case Hash(u8"while"): return Create>(TextSpan(start, _position)); case Hash(u8"xor"): return Create>(TextSpan(start, _position)); default: return Create(TextSpan(start, _position), str); } } bool Lexer::IsAlphaNumericalOrUnderscore(char8_t c) { if (c >= 'a' && c <= 'z') { return true; } if (c >= 'A' && c <= 'Z') { return true; } if (c >= '0' && c <= '9') { return true; } if (c == '_') { return true; } return false; } }