#include "Lexer.hpp"
#include <cmath>
#include <stdexcept>
#include "NumericalLexers.hpp"

// NOTE(review): every `LexTokenImpl` below is written without template arguments, exactly as in
// the reviewed text. If the class is templated on a LexTokenKind, the matching kind has to be
// supplied for each token; the symbol/keyword next to each return identifies which one.

namespace ElohimScript::Parser {
    // Lexes tokens until an EndOfFile token is produced. Every token is linked to its successor
    // through `_next`; the first token of the chain is returned.
    const LexToken* Lexer::Lex() {
        auto* first = LexNext();
        auto* last = first;
        while (last->GetKind() != LexTokenKind::EndOfFile) {
            auto* next = LexNext();
            // Hands `next` over to the `_next` smart pointer of the previous token.
            last->_next.reset(next);
            last = next;
        }
        return first;
    }

    // Consumes one character and produces the token that starts with it. Multi-character
    // operators are recognised by peeking ahead and progressing past the extra characters.
    // All spans are built as (start, end), matching the other lexing functions in this file.
    LexToken* Lexer::LexNext() {
        auto start = _position;
        auto c = Consume();
        switch (c) {
            case u8'\0': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'*': {
                auto n = Peek();
                if (n == u8'*') {
                    Progress();
                    n = Peek();
                    if (n == u8'=') {
                        Progress();
                        // **=
                        return new LexTokenImpl(TextSpan(start, start + 3));
                    }
                    // **
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'=') {
                    Progress();
                    // *=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // *
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'/':
                if (Peek() == u8'=') {
                    Progress();
                    // /=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // /
                return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'%':
                if (Peek() == u8'=') {
                    Progress();
                    // %=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // %
                return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'+': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // +=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'+') {
                    Progress();
                    // ++
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // +
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'-': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // -=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'-') {
                    Progress();
                    // --
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // -
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'<': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // <=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'<') {
                    Progress();
                    if (Peek() == u8'=') {
                        Progress();
                        // <<=
                        return new LexTokenImpl(TextSpan(start, start + 3));
                    }
                    // <<
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // <
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'>': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // >=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'>') {
                    Progress();
                    n = Peek();
                    if (n == u8'=') {
                        Progress();
                        // >>=
                        return new LexTokenImpl(TextSpan(start, start + 3));
                    }
                    if (n == u8'>') {
                        Progress();
                        if (Peek() == u8'=') {
                            Progress();
                            // >>>=
                            return new LexTokenImpl(TextSpan(start, start + 4));
                        }
                        // >>>
                        return new LexTokenImpl(TextSpan(start, start + 3));
                    }
                    // >>
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // >
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'(': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8')': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'=': {
                if (Peek() == u8'=') {
                    Progress();
                    // ==
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // =
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'!': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // !=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'i' && Peek(2) == u8's') {
                    Progress(2);
                    // !is
                    return new LexTokenImpl(TextSpan(start, start + 3));
                }
                // !
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'?': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8':': {
                if (Peek() == u8':') {
                    Progress();
                    // ::
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // :
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'&': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // &=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'&') {
                    Progress();
                    // &&
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // &
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8',': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'{': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'}': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8';': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'|': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // |=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'|') {
                    Progress();
                    // ||
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // |
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'^': {
                auto n = Peek();
                if (n == u8'=') {
                    Progress();
                    // ^=
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                if (n == u8'^') {
                    Progress();
                    // ^^
                    return new LexTokenImpl(TextSpan(start, start + 2));
                }
                // ^
                return new LexTokenImpl(TextSpan(start, start + 1));
            }
            case u8'~': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'.': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'[': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8']': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'@': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8' ':
            case u8'\r':
            case u8'\n':
            case u8'\t': return new LexTokenImpl(TextSpan(start, start + 1));
            case u8'0':
            case u8'1':
            case u8'2':
            case u8'3':
            case u8'4':
            case u8'5':
            case u8'6':
            case u8'7':
            case u8'8':
            case u8'9': return LexNumerical(c);
            case u8'\'': return LexString(u8'\'', false);
            case u8'"': {
                // Three consecutive double quotes open a heredoc string.
                if (Peek() == '"' && Peek(2) == '\"') {
                    return LexString(u8'"', true);
                }
                return LexString(u8'"', false);
            }
            // Byte order mark
            case u8'\xEF':
                if (Peek() == u8'\xBB' && Peek(2) == u8'\xBF') {
                    Progress(2);
                    return new LexTokenImpl(TextSpan(start, start + 3));
                }
                // A lone 0xEF is not a byte order mark. Treat it as an unknown character instead
                // of letting it run into the numerical cases.
                [[fallthrough]];
            default:
                if (IsAlphaNumericalOrUnderscore(c))
                    return LexKeywordOrIdentifier();
                _diagnostics->LogError(Diagnostics::DiagnosticType::UnknownToken, TextSpan(start, start + 1));
                return new LexTokenImpl(TextSpan(start, start + 1));
        }
    }

    // Lexes a numerical literal whose first digit `c` has already been consumed. A leading `0`
    // followed by a letter selects the numerical system (x = 16, d = 10, o = 8, b = 2); any
    // other letter is reported as InvalidNumericalBase.
    LexToken* Lexer::LexNumerical(char8_t c) {
        auto initialValue = LexDecimalValue(c);
        auto numericalSystem = 10; // Default to decimal system.
        if (initialValue == 0) {
            auto secondChar = Peek();
            auto secondValue = LexDecimalValue(secondChar);
            // 255 is the "not a digit" marker. Only a letter (or underscore) can be a base
            // specifier; anything else (`;`, `)`, whitespace, end of input, ...) simply ends
            // the literal `0` and must neither be consumed nor reported as an error.
            if (secondChar != '.' && secondValue == 255 && IsAlphaNumericalOrUnderscore(secondChar)) {
                Progress();
                switch (secondChar) {
                    case 'x': numericalSystem = 16; break;
                    case 'd': numericalSystem = 10; break;
                    case 'o': numericalSystem = 8; break;
                    case 'b': numericalSystem = 2; break;
                    default:
                        _diagnostics->LogError(Diagnostics::DiagnosticType::InvalidNumericalBase,
                                               TextSpan(_position - 1, _position + 1));
                        // Set to the largest numerical system, so we can prevent errors down the line.
                        numericalSystem = 16;
                        break;
                }
            }
        }
        switch (numericalSystem) {
            case 10: return LexDecimal(initialValue);
            case 16: return LexHexadecimal();
            case 8: return LexOctal();
            case 2: return LexBinary();
            default: throw std::logic_error("Not implemented");
        }
    }

    // Returns 10^n from a lookup table. Valid for 0 <= n <= 18 (10^18 is the largest power of
    // ten that fits in an int64_t); any other n throws std::out_of_range.
    constexpr int64_t quick_pow10(int n) {
        constexpr int64_t pow10[19] = {1,
                                       10,
                                       100,
                                       1000,
                                       10000,
                                       100000,
                                       1000000,
                                       10000000,
                                       100000000,
                                       1000000000,
                                       10000000000,
                                       100000000000,
                                       1000000000000,
                                       10000000000000,
                                       100000000000000,
                                       1000000000000000,
                                       10000000000000000,
                                       100000000000000000,
                                       1000000000000000000};
        if (n < 0 || n > 18) {
            throw std::out_of_range("quick_pow10 exponent out of range");
        }
        return pow10[n];
    }

    // Lexes the remainder of a decimal literal; `initial` is the value of the digit that was
    // already consumed. Produces a FloatLiteral when a fractional part (and optionally an
    // exponent, which is only accepted after a fractional part) was seen, and an IntegerLiteral
    // otherwise.
    LexToken* Lexer::LexDecimal(uint64_t initial) {
        // Largest number of fractional digits quick_pow10 can scale; it also keeps
        // decimalValue well inside uint64_t.
        constexpr uint8_t maxFractionDigits = 18;

        auto start = _position;
        uint64_t value = initial;
        uint64_t decimalValue = 0;
        uint64_t exponentValue = 0;
        uint8_t decimalLength = 0;
        bool isDecimal = false;
        bool isExponent = false;
        while (true) {
            auto v = static_cast<uint64_t>(LexDecimalValue(Peek()));
            if (v == 255) {
                // Not a digit: it is either the decimal point, the exponent marker, or the end
                // of the literal. A decimal point is only accepted before the exponent.
                if (!isDecimal && !isExponent && Peek() == u8'.') {
                    isDecimal = true;
                    Progress();
                    continue;
                }
                if (isDecimal && (Peek() == u8'e' || Peek() == u8'E')) {
                    isDecimal = false;
                    isExponent = true;
                    Progress();
                    continue;
                }
                break;
            }
            Progress();
            if (isDecimal) {
                // Digits beyond the supported precision are consumed but ignored.
                if (decimalLength < maxFractionDigits) {
                    decimalValue *= 10;
                    decimalValue += v;
                    decimalLength++;
                }
            } else if (isExponent) {
                exponentValue *= 10;
                exponentValue += v;
            } else {
                value *= 10;
                value += v;
            }
        }
        if (isDecimal || isExponent) {
            auto val = value + (static_cast<double>(decimalValue) / quick_pow10(decimalLength));
            if (isExponent) {
                val *= std::pow(10.0, static_cast<double>(exponentValue));
            }
            return new FloatLiteral(TextSpan(start, _position), val);
        }
        return new IntegerLiteral(TextSpan(start, _position), value);
    }

    // Accumulates hexadecimal digits (4 bits each) until a non-digit (255) is peeked.
    IntegerLiteral* Lexer::LexHexadecimal() {
        auto start = _position;
        uint64_t value = 0;
        for (auto digit = LexHexadecimalValue(Peek()); digit != 255; digit = LexHexadecimalValue(Peek())) {
            Progress();
            value <<= 4;
            value += digit;
        }
        return new IntegerLiteral(TextSpan(start, _position), value);
    }

    // Accumulates octal digits (3 bits each) until a non-digit (255) is peeked.
    IntegerLiteral* Lexer::LexOctal() {
        auto start = _position;
        uint64_t value = 0;
        for (auto digit = LexOctalValue(Peek()); digit != 255; digit = LexOctalValue(Peek())) {
            Progress();
            value <<= 3;
            value += digit;
        }
        return new IntegerLiteral(TextSpan(start, _position), value);
    }

    // Accumulates binary digits (1 bit each) until a non-digit (255) is peeked.
    IntegerLiteral* Lexer::LexBinary() {
        auto start = _position;
        uint64_t value = 0;
        for (auto digit = LexBinaryValue(Peek()); digit != 255; digit = LexBinaryValue(Peek())) {
            Progress();
            value <<= 1;
            value += digit;
        }
        return new IntegerLiteral(TextSpan(start, _position), value);
    }

    // Lexes a string literal. `opening` is the quote character that closes a regular string;
    // a heredoc string is closed by exactly three double quotes. Reaching the end of the script,
    // or a line break inside a regular string, logs ExpectedEndOfString and ends the literal.
    StringLiteral* Lexer::LexString(char8_t opening, bool heredoc) {
        Progress();
        if (heredoc) {
            Progress(2);
        }
        auto start = _position;
        size_t offset = 0;
        while (true) {
            auto current = Peek(offset);
            if (heredoc) {
                // Three quotes that are not followed by a fourth one close the heredoc.
                if (current == '"' && Peek(offset + 1) == '"' && Peek(offset + 2) == '"' &&
                    Peek(offset + 3) != '"') {
                    break;
                }
            } else if (current == opening) {
                break;
            }
            if (current == u8'\0') {
                _diagnostics->LogError(Diagnostics::DiagnosticType::ExpectedEndOfString,
                                       TextSpan(start, start + offset));
                break;
            }
            if (!heredoc && (current == u8'\n' || current == u8'\r')) {
                _diagnostics->LogError(Diagnostics::DiagnosticType::ExpectedEndOfString,
                                       TextSpan(start, start + offset));
                break;
            }
            offset++;
        }
        Progress(offset);
        if (heredoc) {
            Progress(2);
        }
        // The span covers exactly the text stored in the literal, like the diagnostics above.
        return new StringLiteral(TextSpan(start, start + offset), std::u8string(_script.substr(start, offset)));
    }

    // Hashes a NUL-terminated string. Used to build the case labels of the keyword switch.
    static uint32_t constexpr Hash(const char8_t* input) {
        return *input != 0U ? static_cast<uint32_t>(*input) + 33 * Hash(input + 1) : 5381;
    }

    // Hashes exactly `length` characters starting at `input`. Yields the same value the
    // NUL-terminated overload gives for a string with the same contents, so that text which is
    // not NUL-terminated (a slice of the script) can be matched against the keyword labels.
    static uint32_t constexpr Hash(const char8_t* input, size_t length) {
        uint32_t hash = 5381;
        for (size_t i = length; i > 0; --i) {
            hash = static_cast<uint32_t>(input[i - 1]) + 33 * hash;
        }
        return hash;
    }

    // Reads a run of alphanumerical/underscore characters starting at the current position and
    // returns the matching keyword token, or an IdentifierToken holding the text when it is not
    // a keyword.
    // NOTE(review): keywords are matched on the hash alone; an identifier whose hash collides
    // with a keyword would be lexed as that keyword.
    LexToken* Lexer::LexKeywordOrIdentifier() {
        auto start = _position;
        size_t offset = 0;
        while (IsAlphaNumericalOrUnderscore(Peek(offset))) {
            offset++;
        }
        auto str = _script.substr(_position, offset);
        Progress(offset);
        // Hash only the identifier itself: `str` is a slice and need not be NUL-terminated.
        switch (Hash(str.data(), str.size())) {
            case Hash(u8"and"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"abstract"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"auto"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"bool"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"break"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"case"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"cast"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"catch"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"class"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"const"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"continue"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"default"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"do"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"double"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"else"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"enum"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"explicit"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"external"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"false"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"final"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"float"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"for"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"from"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"funcdef"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"function"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"get"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"if"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"import"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"in"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"inout"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"int"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"interface"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"int8"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"int16"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"int32"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"int64"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"is"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"mixin"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"namespace"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"not"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"null"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"or"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"out"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"override"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"private"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"property"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"protected"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"return"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"set"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"shared"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"super"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"switch"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"this"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"true"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"try"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"typedef"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"uint"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"uint8"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"uint16"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"uint32"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"uint64"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"void"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"while"): return new LexTokenImpl(TextSpan(start, _position));
            case Hash(u8"xor"): return new LexTokenImpl(TextSpan(start, _position));
            default: return new IdentifierToken(TextSpan(start, _position), std::u8string(str));
        }
    }

    // True for ASCII letters, ASCII digits and the underscore.
    bool Lexer::IsAlphaNumericalOrUnderscore(char8_t c) {
        const bool isLower = c >= 'a' && c <= 'z';
        const bool isUpper = c >= 'A' && c <= 'Z';
        const bool isDigit = c >= '0' && c <= '9';
        return isLower || isUpper || isDigit || c == '_';
    }
}