Initial commit, support for lexing symbols and numericals.

This commit is contained in:
2020-10-04 16:33:12 +02:00
commit e0c52f4ae7
13 changed files with 6895 additions and 0 deletions

View File

@@ -0,0 +1,42 @@
#ifndef ELOHIMSCRIPT_LEXTOKEN_HPP
#define ELOHIMSCRIPT_LEXTOKEN_HPP
#include <memory>
#include "LexTokenKind.hpp"
namespace ElohimScript::Parser {
/// Abstract base of every token the Lexer produces.
///
/// Tokens form a singly linked chain: each token owns the token that follows it through `_next`.
/// Only the Lexer (a friend) can link tokens together.
class LexToken {
    friend class Lexer;
    std::unique_ptr<const LexToken> _next;

public:
    /// Destroys this token and every token chained after it.
    ///
    /// The chain is unlinked with a loop rather than left to the implicit member-wise destruction.
    /// The implicit form destroys `_next`, which destroys its `_next`, and so on, using one level
    /// of recursion per token; a long token stream could exhaust the stack that way.
    virtual ~LexToken() {
        const LexToken* pending = _next.release();
        while (pending != nullptr) {
            // Detach the successor before deleting, so that the deleted token's own destructor
            // finds an empty `_next` and returns immediately.
            // The const_cast only strips the constness added by the `_next` pointer type; the
            // Lexer allocates every chained token as a non-const object.
            const LexToken* following = const_cast<LexToken*>(pending)->_next.release();
            delete pending;
            pending = following;
        }
    }

    /// Returns the kind of this token.
    [[nodiscard]] virtual LexTokenKind GetKind() const noexcept = 0;

    /// Returns the owning pointer to the following token; it is empty when nothing has been linked.
    [[nodiscard]] const std::unique_ptr<const LexToken>& GetNext() const noexcept { return _next; }
};
/// Token whose kind is fixed at compile time by the template argument.
///
/// Used directly for tokens that carry no payload, and as the base class of tokens that do.
template <LexTokenKind Kind>
class LexTokenImpl : public LexToken {
public:
    LexTokenImpl() = default;

    /// Returns the kind this instantiation was created with.
    [[nodiscard]] LexTokenKind GetKind() const noexcept override {
        return Kind;
    }
};
/// Token holding the value of an integer literal.
class IntegerToken : public LexTokenImpl<LexTokenKind::IntegerToken> {
    uint64_t _value;

public:
    /// Creates an integer token holding `value`.
    ///
    /// Marked explicit so that a plain integer is never silently converted into a token.
    explicit IntegerToken(uint64_t value) : _value(value) {}

    /// Returns the literal's value.
    [[nodiscard]] uint64_t GetValue() const noexcept { return _value; }
};
/// Token holding the value of a floating point literal.
class FloatToken : public LexTokenImpl<LexTokenKind::FloatToken> {
    double _value;

public:
    /// Creates a float token holding `value`.
    ///
    /// Marked explicit so that a plain number is never silently converted into a token.
    explicit FloatToken(double value) : _value(value) {}

    /// Returns the literal's value.
    [[nodiscard]] double GetValue() const noexcept { return _value; }
};
}
#endif // ELOHIMSCRIPT_LEXTOKEN_HPP

View File

@@ -0,0 +1,71 @@
#ifndef ELOHIMSCRIPT_LEXTOKENKIND_HPP
#define ELOHIMSCRIPT_LEXTOKENKIND_HPP
#include <cstdint>
namespace ElohimScript::Parser {
// Identifies what a LexToken represents. Stored in a single byte.
// The trailing comment on each symbol shows the character sequence the lexer maps to it.
enum class LexTokenKind : uint8_t {
Unknown, // Any character the lexer has no rule for.
EndOfFile, // A NUL character, which is also what the lexer reads once the script is exhausted.
Whitespace, // A space, '\r', '\n', '\t', or a UTF-8 byte order mark.
// Symbols
StarSymbol, // *
StarStarSymbol, // **
SlashSymbol, // /
PercentSymbol, // %
PlusSymbol, // +
MinusSymbol, // -
LessThanEqualsSymbol, // <=
LessThanSymbol, // <
GreaterThanEqualsSymbol, // >=
GreaterThanSymbol, // >
OpenParenthesisSymbol, // (
CloseParenthesisSymbol, // )
EqualsEqualsSymbol, // ==
ExclamationMarkEqualsSymbol, // !=
QuestionMarkSymbol, // ?
ColonSymbol, // :
EqualsSymbol, // =
PlusEqualsSymbol, // +=
MinusEqualsSymbol, // -=
StarEqualsSymbol, // *=
SlashEqualsSymbol, // /=
PercentEqualsSymbol, // %=
StarStarEqualsSymbol, // **=
PlusPlusSymbol, // ++
MinusMinusSymbol, // --
AmpersandSymbol, // &
CommaSymbol, // ,
OpenCurlyParenthesisSymbol, // {
CloseCurlyParenthesisSymbol, // }
SemicolonSymbol, // ;
VerticalLineSymbol, // |
CaretSymbol, // ^
TildeSymbol, // ~
LessThanLessThanSymbol, // <<
GreaterThanGreaterThanSymbol, // >>
GreaterThanGreaterThanGreaterThanSymbol, // >>>
AmpersandEqualsSymbol, // &=
VerticalLineEqualsSymbol, // |=
CaretEqualsSymbol, // ^=
LessThanLessThanEqualsSymbol, // <<=
GreaterThanGreaterThanEqualsSymbol, // >>=
GreaterThanGreaterThanGreaterThanEqualsSymbol, // >>>=
DotSymbol, // .
AmpersandAmpersandSymbol, // &&
VerticalLineVerticalLineSymbol, // ||
ExclamationMarkSymbol, // !
OpenBlockParenthesisSymbol, // [
CloseBlockParenthesisSymbol, // ]
CaretCaretSymbol, // ^^
AtSymbol, // @
ExclamationMarkIsSymbol, // !is
ColonColonSymbol, // ::
// Misc
FloatToken, // Numeric literal carried by the FloatToken class.
IntegerToken, // Numeric literal carried by the IntegerToken class.
};
}
#endif // ELOHIMSCRIPT_LEXTOKENKIND_HPP

363
src/Parser/Lexer/Lexer.cpp Normal file
View File

@@ -0,0 +1,363 @@
#include "Lexer.hpp"
#include <cmath>
#include <stdexcept>
#include "NumericalLexers.hpp"
namespace ElohimScript::Parser {
/// Lexes the whole script into a chain of tokens.
///
/// Tokens are lexed one at a time and each is linked to its predecessor until an end-of-file token
/// has been produced; that end-of-file token is always the last element of the chain.
///
/// @return The first token of the chain, as a raw pointer. Every later token is owned by the token
///         before it.
const LexToken* Lexer::Lex() {
    LexToken* const head = LexNext();
    // Keep appending for as long as the most recently lexed token is not the end-of-file marker.
    for (LexToken* tail = head; tail->GetKind() != LexTokenKind::EndOfFile;) {
        LexToken* const following = LexNext();
        tail->_next.reset(following);
        tail = following;
    }
    return head;
}
/// Lexes exactly one token, starting at the next unread character.
///
/// Operators made of several characters are matched greedily: after the first character is
/// consumed, the following characters are peeked and only consumed when they extend the operator
/// (for example `>`, `>>`, `>>=`, `>>>`, `>>>=`).
///
/// @return A newly allocated token. A NUL character yields an EndOfFile token, and any character
///         without a rule yields an Unknown token.
LexToken* Lexer::LexNext() {
    auto c = Consume();
    switch (c) {
        case u8'\0': return new LexTokenImpl<LexTokenKind::EndOfFile>();
        case u8'*': {
            auto n = Peek();
            if (n == u8'*') {
                Progress();
                n = Peek();
                if (n == u8'=') {
                    Progress();
                    return new LexTokenImpl<LexTokenKind::StarStarEqualsSymbol>();
                }
                return new LexTokenImpl<LexTokenKind::StarStarSymbol>();
            }
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::StarEqualsSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::StarSymbol>();
        }
        case u8'/':
            if (Peek() == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::SlashEqualsSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::SlashSymbol>();
        case u8'%':
            if (Peek() == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::PercentEqualsSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::PercentSymbol>();
        case u8'+': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::PlusEqualsSymbol>();
            }
            if (n == u8'+') {
                Progress();
                return new LexTokenImpl<LexTokenKind::PlusPlusSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::PlusSymbol>();
        }
        case u8'-': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::MinusEqualsSymbol>();
            }
            if (n == u8'-') {
                Progress();
                return new LexTokenImpl<LexTokenKind::MinusMinusSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::MinusSymbol>();
        }
        case u8'<': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::LessThanEqualsSymbol>();
            }
            if (n == u8'<') {
                Progress();
                if (Peek() == u8'=') {
                    Progress();
                    return new LexTokenImpl<LexTokenKind::LessThanLessThanEqualsSymbol>();
                }
                return new LexTokenImpl<LexTokenKind::LessThanLessThanSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::LessThanSymbol>();
        }
        case u8'>': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::GreaterThanEqualsSymbol>();
            }
            if (n == u8'>') {
                Progress();
                n = Peek();
                if (n == u8'=') {
                    Progress();
                    return new LexTokenImpl<LexTokenKind::GreaterThanGreaterThanEqualsSymbol>();
                }
                if (n == u8'>') {
                    Progress();
                    if (Peek() == u8'=') {
                        Progress();
                        return new LexTokenImpl<LexTokenKind::GreaterThanGreaterThanGreaterThanEqualsSymbol>();
                    }
                    return new LexTokenImpl<LexTokenKind::GreaterThanGreaterThanGreaterThanSymbol>();
                }
                return new LexTokenImpl<LexTokenKind::GreaterThanGreaterThanSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::GreaterThanSymbol>();
        }
        case u8'(': return new LexTokenImpl<LexTokenKind::OpenParenthesisSymbol>();
        case u8')': return new LexTokenImpl<LexTokenKind::CloseParenthesisSymbol>();
        case u8'=': {
            if (Peek() == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::EqualsEqualsSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::EqualsSymbol>();
        }
        case u8'!': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::ExclamationMarkEqualsSymbol>();
            }
            // `!is` is a single three character operator.
            if (n == u8'i' && Peek(2) == u8's') {
                Progress(2);
                return new LexTokenImpl<LexTokenKind::ExclamationMarkIsSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::ExclamationMarkSymbol>();
        }
        case u8'?': return new LexTokenImpl<LexTokenKind::QuestionMarkSymbol>();
        case u8':': {
            if (Peek() == u8':') {
                Progress();
                return new LexTokenImpl<LexTokenKind::ColonColonSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::ColonSymbol>();
        }
        case u8'&': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::AmpersandEqualsSymbol>();
            }
            if (n == u8'&') {
                Progress();
                return new LexTokenImpl<LexTokenKind::AmpersandAmpersandSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::AmpersandSymbol>();
        }
        case u8',': return new LexTokenImpl<LexTokenKind::CommaSymbol>();
        case u8'{': return new LexTokenImpl<LexTokenKind::OpenCurlyParenthesisSymbol>();
        case u8'}': return new LexTokenImpl<LexTokenKind::CloseCurlyParenthesisSymbol>();
        case u8';': return new LexTokenImpl<LexTokenKind::SemicolonSymbol>();
        case u8'|': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::VerticalLineEqualsSymbol>();
            }
            if (n == u8'|') {
                Progress();
                return new LexTokenImpl<LexTokenKind::VerticalLineVerticalLineSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::VerticalLineSymbol>();
        }
        case u8'^': {
            auto n = Peek();
            if (n == u8'=') {
                Progress();
                return new LexTokenImpl<LexTokenKind::CaretEqualsSymbol>();
            }
            if (n == u8'^') {
                Progress();
                return new LexTokenImpl<LexTokenKind::CaretCaretSymbol>();
            }
            return new LexTokenImpl<LexTokenKind::CaretSymbol>();
        }
        case u8'~': return new LexTokenImpl<LexTokenKind::TildeSymbol>();
        case u8'.': return new LexTokenImpl<LexTokenKind::DotSymbol>();
        case u8'[': return new LexTokenImpl<LexTokenKind::OpenBlockParenthesisSymbol>();
        case u8']': return new LexTokenImpl<LexTokenKind::CloseBlockParenthesisSymbol>();
        case u8'@': return new LexTokenImpl<LexTokenKind::AtSymbol>();
        case u8' ':
        case u8'\r':
        case u8'\n':
        case u8'\t': return new LexTokenImpl<LexTokenKind::Whitespace>();
        // Byte order mark
        case u8'\xEF': {
            if (Peek() == u8'\xBB' && Peek(2) == u8'\xBF') {
                Progress(2);
                return new LexTokenImpl<LexTokenKind::Whitespace>();
            }
            // A 0xEF byte that does not start a byte order mark has no rule of its own. Return here
            // so that control does not fall through into the digit cases below, which would turn
            // the byte into an integer token.
            return new LexTokenImpl<LexTokenKind::Unknown>();
        }
        case u8'0':
        case u8'1':
        case u8'2':
        case u8'3':
        case u8'4':
        case u8'5':
        case u8'6':
        case u8'7':
        case u8'8':
        case u8'9': return LexNumerical(c);
        default: return new LexTokenImpl<LexTokenKind::Unknown>();
    }
}
/// Lexes a numeric literal whose first character `c` has already been consumed.
///
/// A leading `0` directly followed by `x`, `d`, `o` or `b` selects hexadecimal, decimal, octal or
/// binary notation. The prefix letter is consumed and the digits after it are lexed in that
/// system. Every other literal is lexed as decimal.
///
/// Only a recognised prefix letter is consumed. Whatever else follows a leading `0` (an operator,
/// whitespace, a separator, the end of the script) is left in place for the next token, so that
/// input such as `0;` or `0+1` keeps its `;` and `+`.
///
/// @param c The first character of the literal.
/// @return A newly allocated integer or float token.
LexToken* Lexer::LexNumerical(char8_t c) {
    const auto initialValue = LexDecimalValue(c);
    if (initialValue == 0) {
        switch (Peek()) {
            case u8'x': Progress(); return LexHexadecimal();
            case u8'o': Progress(); return LexOctal();
            case u8'b': Progress(); return LexBinary();
            case u8'd': Progress(); return LexDecimal(initialValue);
            default:
                // Either a plain decimal literal that starts with zero, or an unsupported prefix.
                // TODO: Log Invalid numerical system
                break;
        }
    }
    return LexDecimal(initialValue);
}
/// Returns 10 raised to the power `n` from a lookup table.
///
/// The table stops at 10^18, the largest power of ten that fits in an int64_t.
///
/// @param n The exponent, in the range 0 to 18 inclusive.
/// @return 10^n.
/// @throws std::out_of_range if `n` is negative or larger than 18.
constexpr int64_t quick_pow10(int n) {
    constexpr int tableSize = 19;
    constexpr int64_t pow10[tableSize] = {1,
                                          10,
                                          100,
                                          1'000,
                                          10'000,
                                          100'000,
                                          1'000'000,
                                          10'000'000,
                                          100'000'000,
                                          1'000'000'000,
                                          10'000'000'000,
                                          100'000'000'000,
                                          1'000'000'000'000,
                                          10'000'000'000'000,
                                          100'000'000'000'000,
                                          1'000'000'000'000'000,
                                          10'000'000'000'000'000,
                                          100'000'000'000'000'000,
                                          1'000'000'000'000'000'000};
    if (n < 0 || n >= tableSize) {
        throw std::out_of_range("quick_pow10: exponent must be between 0 and 18");
    }
    return pow10[n];
}
/// Lexes the remainder of a decimal literal.
///
/// The literal is read in up to three consecutive sections: the integer digits, an optional
/// fraction introduced by `.`, and an optional exponent introduced by `e` or `E`. An exponent is
/// only recognised after a fraction, and it consists of digits only (no sign). Sections can only
/// follow each other in that order, so a `.` that appears after the exponent ends the literal.
///
/// At most 18 fraction digits contribute to the value. Further fraction digits are still consumed
/// as part of the literal but are dropped, which keeps the fraction inside a uint64_t and inside
/// the range of quick_pow10.
///
/// @param initial The value of the digits that were consumed before this call.
/// @return A newly allocated IntegerToken when the literal has no fraction, otherwise a FloatToken.
LexToken* Lexer::LexDecimal(uint64_t initial) {
    enum class Section : uint8_t { Integer, Fraction, Exponent };
    constexpr uint8_t maxFractionDigits = 18;

    uint64_t value = initial;
    uint64_t fractionValue = 0;
    uint64_t exponentValue = 0;
    uint8_t fractionLength = 0;
    auto section = Section::Integer;

    while (true) {
        const auto next = Peek();
        const auto digit = static_cast<uint64_t>(LexDecimalValue(next));
        if (digit == 255) {
            // Not a digit: it either opens the next section or ends the literal.
            if (section == Section::Integer && next == u8'.') {
                section = Section::Fraction;
                Progress();
                continue;
            }
            if (section == Section::Fraction && (next == u8'e' || next == u8'E')) {
                section = Section::Exponent;
                Progress();
                continue;
            }
            break;
        }
        Progress();
        switch (section) {
            case Section::Integer: value = value * 10 + digit; break;
            case Section::Fraction:
                if (fractionLength < maxFractionDigits) {
                    fractionValue = fractionValue * 10 + digit;
                    fractionLength++;
                }
                break;
            case Section::Exponent: exponentValue = exponentValue * 10 + digit; break;
        }
    }

    if (section == Section::Integer) {
        return new IntegerToken(value);
    }
    auto result = static_cast<double>(value) +
                  static_cast<double>(fractionValue) / static_cast<double>(quick_pow10(fractionLength));
    if (section == Section::Exponent) {
        result *= std::pow(10.0, static_cast<double>(exponentValue));
    }
    return new FloatToken(result);
}
/// Lexes the digits of a hexadecimal literal; the `0x` prefix has already been consumed.
///
/// Digits are consumed until the next character is not a hexadecimal digit.
///
/// @return A newly allocated token holding the accumulated value (0 when no digit follows).
IntegerToken* Lexer::LexHexadecimal() {
    uint64_t accumulated = 0;
    for (auto digit = LexHexadecimalValue(Peek()); digit != 255; digit = LexHexadecimalValue(Peek())) {
        Progress();
        // Each hexadecimal digit contributes four bits.
        accumulated = (accumulated << 4) + digit;
    }
    return new IntegerToken(accumulated);
}
/// Lexes the digits of an octal literal; the `0o` prefix has already been consumed.
///
/// Digits are consumed until the next character is not an octal digit.
///
/// @return A newly allocated token holding the accumulated value (0 when no digit follows).
IntegerToken* Lexer::LexOctal() {
    uint64_t accumulated = 0;
    for (auto digit = LexOctalValue(Peek()); digit != 255; digit = LexOctalValue(Peek())) {
        Progress();
        // Each octal digit contributes three bits.
        accumulated = (accumulated << 3) + digit;
    }
    return new IntegerToken(accumulated);
}
/// Lexes the digits of a binary literal; the `0b` prefix has already been consumed.
///
/// Digits are consumed until the next character is not a binary digit.
///
/// @return A newly allocated token holding the accumulated value (0 when no digit follows).
IntegerToken* Lexer::LexBinary() {
    uint64_t accumulated = 0;
    for (auto digit = LexBinaryValue(Peek()); digit != 255; digit = LexBinaryValue(Peek())) {
        Progress();
        // Each binary digit contributes one bit.
        accumulated = (accumulated << 1) + digit;
    }
    return new IntegerToken(accumulated);
}
}

View File

@@ -0,0 +1,46 @@
#ifndef ELOHIMSCRIPT_LEXER_HPP
#define ELOHIMSCRIPT_LEXER_HPP
#include <string_view>
#include "LexToken.hpp"
namespace ElohimScript::Parser {
/// Turns a UTF-8 script into a chain of LexTokens.
///
/// The lexer only holds a view of the script, so the script's storage has to stay alive for as
/// long as the lexer is used.
class Lexer {
public:
    /// Creates a lexer over a NUL terminated script, reinterpreting its bytes as UTF-8 code units.
    Lexer(const char* script) : _script(reinterpret_cast<const char8_t*>(script)) {}

    /// Creates a lexer over the given view of a script.
    Lexer(std::u8string_view script) : _script(script) {}

    /// Lexes the script and returns the first token of the resulting chain.
    const LexToken* Lex();

private:
    std::u8string_view _script;

    /// Index of the most recently consumed character. The initial -1 converts to the largest
    /// size_t value, so the first increment in Consume() wraps around to index 0.
    size_t _position = -1;

    /// Moves to the next character and returns it, or NUL when that position is past the end.
    char8_t Consume() {
        ++_position;
        return _position < _script.size() ? _script[_position] : u8'\0';
    }

    /// Skips `steps` characters without reading them.
    void Progress(size_t steps = 1) {
        _position += steps;
    }

    /// Returns the character `offset` positions after the most recently consumed one without
    /// moving, or NUL when that position is past the end.
    char8_t Peek(size_t offset = 1) {
        const size_t target = _position + offset;
        return target < _script.size() ? _script[target] : u8'\0';
    }

    LexToken* LexNext();
    LexToken* LexNumerical(char8_t);
    LexToken* LexDecimal(uint64_t initial);
    IntegerToken* LexHexadecimal();
    IntegerToken* LexOctal();
    IntegerToken* LexBinary();
};
}
#endif // ELOHIMSCRIPT_LEXER_HPP

View File

@@ -0,0 +1,67 @@
#include <cstdint>
#include "NumericalLexers.hpp"
/// Converts a decimal digit character to its numeric value.
///
/// @param c The character to convert.
/// @return 0 to 9 for the characters '0' to '9', and 255 for every other character.
uint8_t LexDecimalValue(char8_t c) {
    const bool isDigit = c >= u8'0' && c <= u8'9';
    if (!isDigit) {
        return 255;
    }
    return static_cast<uint8_t>(c - u8'0');
}
/// Converts a hexadecimal digit character to its numeric value.
///
/// @param c The character to convert.
/// @return 0 to 9 for '0' to '9', 10 to 15 for 'a' to 'f' in either case, and 255 for every other
///         character.
uint8_t LexHexadecimalValue(char8_t c) {
    if (c >= u8'0' && c <= u8'9') {
        return static_cast<uint8_t>(c - u8'0');
    }
    if (c >= u8'a' && c <= u8'f') {
        return static_cast<uint8_t>(c - u8'a' + 10);
    }
    if (c >= u8'A' && c <= u8'F') {
        return static_cast<uint8_t>(c - u8'A' + 10);
    }
    return 255;
}
/// Converts an octal digit character to its numeric value.
///
/// @param c The character to convert.
/// @return 0 to 7 for the characters '0' to '7', and 255 for every other character.
uint8_t LexOctalValue(char8_t c) {
    const bool isOctalDigit = c >= u8'0' && c <= u8'7';
    if (!isOctalDigit) {
        return 255;
    }
    return static_cast<uint8_t>(c - u8'0');
}
/// Converts a binary digit character to its numeric value.
///
/// @param c The character to convert.
/// @return 0 for '0', 1 for '1', and 255 for every other character.
uint8_t LexBinaryValue(char8_t c) {
    if (c == u8'0' || c == u8'1') {
        return static_cast<uint8_t>(c - u8'0');
    }
    return 255;
}

View File

@@ -0,0 +1,11 @@
#ifndef ELOHIMSCRIPT_NUMERICALLEXERS_HPP
#define ELOHIMSCRIPT_NUMERICALLEXERS_HPP
#include <cstdint>
/// Digit conversion helpers for the numeric literal lexers.
/// Each function returns the numeric value of `c` when it is a digit of the function's numeral
/// system, and 255 when it is not.

/// Decimal: '0' to '9' map to 0 to 9.
uint8_t LexDecimalValue(char8_t c);
/// Hexadecimal: '0' to '9' map to 0 to 9, 'a' to 'f' and 'A' to 'F' map to 10 to 15.
uint8_t LexHexadecimalValue(char8_t c);
/// Octal: '0' to '7' map to 0 to 7.
uint8_t LexOctalValue(char8_t c);
/// Binary: '0' and '1' map to 0 and 1.
uint8_t LexBinaryValue(char8_t c);
#endif // ELOHIMSCRIPT_NUMERICALLEXERS_HPP