//! Lexer for SeraphScript (`src/parsing/lexer/mod.rs`).
//!
//! Converts raw source text into a flat stream of [`LexToken`]s, reporting
//! problems through a caller-supplied logging callback.
use crate::logger::messages::Message;
use crate::parsing::lexer::lex_tokens::LexToken;
use crate::span::Span;
use lex_numerical::lex_numeric;
use lex_tokens::TokenType;
use string_walker::StringWalker;
mod lex_numerical;
pub mod lex_tokens;
#[cfg(test)]
mod lexer_tests;
mod string_walker;
#[inline(always)]
fn lex_and_consume(
    chars: &mut StringWalker,
    eq: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // Single-character token: consume exactly one character and emit `eq`
    // spanning just that character.
    chars.next();
    let end = chars.real_position;
    f(eq, end - 1, end)
}
#[inline(always)]
fn lex_eq_or(
    chars: &mut StringWalker,
    eq: TokenType,
    or: TokenType,
    start_pos: usize,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // The operator's first character was matched (peeked) by the caller;
    // consume it, then pick the `=`-suffixed token when `=` follows,
    // otherwise the plain one.
    chars.next();
    let token = match chars.peek() {
        Some('=') => {
            chars.next();
            eq
        }
        _ => or,
    };
    f(token, start_pos, chars.real_position);
}
#[inline(always)]
fn lex_eq_rep_or(
    chars: &mut StringWalker,
    v: char,
    eq: TokenType,
    rep: TokenType,
    or: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // Handles operator families of the shape `x`, `xx`, `x=` (for example
    // `+` / `++` / `+=`): `rep` when the same character `v` repeats, `eq`
    // when `=` follows, `or` otherwise (including end of input).
    let start_pos = chars.real_position;
    chars.next();
    let token = match chars.peek() {
        Some(&next) if next == v => {
            chars.next();
            rep
        }
        Some(&'=') => {
            chars.next();
            eq
        }
        _ => or,
    };
    f(token, start_pos, chars.real_position);
}
/// Short alias to keep the large keyword table below readable.
type TT = TokenType;

/// Reads an identifier and classifies it as a keyword or `TT::Identifier`.
///
/// The first identifier character has already been *peeked* (not consumed)
/// by the caller, which is why `length` starts at 1 — the peek cursor
/// already sits one character past the identifier's start.
fn lex_keyword_or_identifier(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) {
    let mut reading = true;
    let mut length = 1;
    let start_pos = chars.real_position;
    // First pass: measure the identifier so `take` can consume it in one go.
    // Inclusive ranges (`..=`) are required here — the exclusive `'a'..'z'`
    // form would wrongly reject 'z', 'Z' and '9' and cut the scan short.
    while reading {
        match chars.peek() {
            Some(c) => match c {
                'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
                    length += 1;
                }
                _ => {
                    reading = false;
                }
            },
            None => {
                reading = false;
            }
        };
    }
    chars.reset_peek();
    // Second pass: consume the measured characters as one string.
    let c: String = chars.take(length);
    let token_type = match c.as_str() {
        "and" => TT::AndKeyword,
        "abstract" => TT::AbstractKeyword,
        "auto" => TT::AutoKeyword,
        "bool" => TT::BoolKeyword,
        "break" => TT::BreakKeyword,
        "case" => TT::CaseKeyword,
        "cast" => TT::CastKeyword,
        "catch" => TT::CatchKeyword,
        "class" => TT::ClassKeyword,
        "const" => TT::ConstKeyword,
        "continue" => TT::ContinueKeyword,
        "default" => TT::DefaultKeyword,
        "do" => TT::DoKeyword,
        "double" => TT::DoubleKeyword,
        "else" => TT::ElseKeyword,
        "enum" => TT::EnumKeyword,
        "explicit" => TT::ExplicitKeyword,
        "external" => TT::ExternalKeyword,
        "false" => TT::FalseKeyword,
        "final" => TT::FinalKeyword,
        "float" => TT::FloatKeyword,
        "for" => TT::ForKeyword,
        "from" => TT::FromKeyword,
        "funcdef" => TT::FuncDefKeyword,
        "function" => TT::FunctionKeyword,
        "get" => TT::GetKeyword,
        "if" => TT::IfKeyword,
        "import" => TT::ImportKeyword,
        "in" => TT::InKeyword,
        "inout" => TT::InOutKeyword,
        "int" => TT::IntKeyword,
        "interface" => TT::InterfaceKeyword,
        "int8" => TT::Int8Keyword,
        "int16" => TT::Int16Keyword,
        "int32" => TT::Int32Keyword,
        "int64" => TT::Int64Keyword,
        "is" => TT::IsKeyword,
        "mixin" => TT::MixinKeyword,
        "namespace" => TT::NamespaceKeyword,
        "not" => TT::NotKeyword,
        "null" => TT::NullKeyword,
        "or" => TT::OrKeyword,
        "out" => TT::OutKeyword,
        "override" => TT::OverrideKeyword,
        "private" => TT::PrivateKeyword,
        "property" => TT::PropertyKeyword,
        "protected" => TT::ProtectedKeyword,
        "return" => TT::ReturnKeyword,
        "set" => TT::SetKeyword,
        "shared" => TT::SharedKeyword,
        "super" => TT::SuperKeyword,
        "switch" => TT::SwitchKeyword,
        "this" => TT::ThisKeyword,
        "true" => TT::TrueKeyword,
        "try" => TT::TryKeyword,
        "typedef" => TT::TypeDefKeyword,
        "uint" => TT::UintKeyword,
        "uint8" => TT::Uint8Keyword,
        "uint16" => TT::Uint16Keyword,
        "uint32" => TT::Uint32Keyword,
        "uint64" => TT::Uint64Keyword,
        "void" => TT::VoidKeyword,
        "while" => TT::WhileKeyword,
        "xor" => TT::XorKeyword,
        // Anything not in the table is a user identifier; the consumed
        // string is moved into the token.
        _ => TT::Identifier(c),
    };
    f(token_type, start_pos, chars.real_position);
}
/// Lexes a string literal: normal (`'...'` / `"..."`) or heredoc (`"""..."""`).
///
/// `opening_char` is the quote that started the literal; the caller has only
/// peeked it, so it is consumed here. When `heredoc` is true, two extra quote
/// characters are consumed for the `"""` opener and closer. An unterminated
/// literal is reported through `log` as `Message::UnclosedStringLiteral`.
fn lex_string(
    chars: &mut StringWalker,
    opening_char: &char,
    heredoc: bool,
    log: &mut dyn FnMut(Message, Span),
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    let start_pos = chars.real_position;
    // Consume the opening quote (plus the heredoc's two additional quotes).
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }
    // `length` counts source characters in the body; `string_length` counts
    // decoded output characters (an escape collapses two source characters
    // into one output character). `last_was_control` tracks a pending `\`.
    let mut length: i32 = 0;
    let mut string_length = 0;
    let mut last_was_control = false;
    // We loop twice here. In the first loop we get the number of characters to read, the number of
    // characters the string should be, and whether it's valid. This reduces the amount of allocations
    // we need to do to read a string.
    loop {
        let p = chars.peek();
        match p {
            None => {
                // Ran off the end of input without a closing quote.
                log(
                    Message::UnclosedStringLiteral,
                    Span::new(chars.peek_position - 1, chars.peek_position),
                );
                break;
            }
            Some(&'\\') if !last_was_control => {
                // Escape introducer: contributes to `length` only; the
                // escaped character is counted on the next iteration.
                last_was_control = true;
                length += 1;
            }
            Some(c) => {
                if c == opening_char && !last_was_control {
                    if heredoc {
                        // NOTE(review): each peek() call advances the peek
                        // cursor, so these two calls inspect the next two
                        // characters; when they are NOT both '"' the cursor
                        // has still moved past them — confirm StringWalker's
                        // counting stays in sync in that case.
                        if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
                            break;
                        } else {
                            // NOTE(review): control also falls through to the
                            // increments below, so this quote appears to be
                            // counted twice — verify against heredoc tests.
                            length += 1;
                            string_length += 1;
                        }
                    } else {
                        // Closing quote of a normal literal.
                        break;
                    }
                }
                length += 1;
                string_length += 1;
                last_was_control = false;
            }
        }
    }
    chars.reset_peek();
    // Second pass: consume exactly `length` characters and decode the
    // supported escapes (\0 \n \r \t, and \\ for a literal backslash).
    // NOTE(review): `last_was_control` is reused from the counting loop; it
    // is false when the loop broke on a closing quote, but could be true if
    // input ended right after a lone backslash — confirm intended.
    let mut s: String = String::with_capacity(string_length);
    for _ in 0..length {
        let p = chars.next().unwrap();
        match p {
            '\\' => {
                if last_was_control {
                    s.push('\\');
                } else {
                    // Start of an escape: remember it and skip the reset
                    // below so the next character is treated as escaped.
                    last_was_control = true;
                    continue;
                }
            }
            '0' if last_was_control => s.push('\0'),
            'n' if last_was_control => s.push('\n'),
            'r' if last_was_control => s.push('\r'),
            't' if last_was_control => s.push('\t'),
            // Unrecognized escapes (and ordinary characters) are pushed
            // through verbatim.
            _ => s.push(p),
        };
        last_was_control = false;
    }
    // The two passes must agree on the decoded size; a mismatch is a bug.
    assert_eq!(s.len(), string_length);
    chars.reset_peek();
    // Consume the closing quote (and the heredoc's two extra quotes).
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }
    f(TT::StringLiteral(s), start_pos, chars.real_position);
}
/// Lexes the complete source string `s` into a flat token stream.
///
/// Problems (unexpected characters, unclosed string literals, ...) are
/// reported through `log` and lexing continues, so one pass collects as many
/// diagnostics as possible. The returned vector always ends with a single
/// `TT::EndOfFile` token. Whitespace is emitted as tokens rather than being
/// skipped, so callers can reconstruct trivia.
pub fn lex(s: &str, log: &mut dyn FnMut(Message, Span)) -> Vec<LexToken> {
    let mut tokens: Vec<LexToken> = Vec::new();
    let mut chars = StringWalker::create(s);
    let mut reading = true;
    // Shared sink turning (token, start, end) triples into `LexToken`s.
    let mut add_token = |token_type: TokenType, start: usize, end: usize| {
        tokens.push(LexToken {
            token_type,
            span: Span::new(start, end),
        })
    };
    while reading {
        let p = chars.peek().cloned();
        match p {
            Some(c) => match c {
                // Each whitespace character becomes its own token.
                ' ' | '\t' | '\r' | '\n' => {
                    chars.next();
                    add_token(TT::WhiteSpace, chars.real_position - 1, chars.real_position);
                }
                '=' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::EqualsEquals,
                        TT::Equals,
                        start_pos,
                        &mut add_token,
                    )
                }
                '+' => lex_eq_rep_or(
                    &mut chars,
                    '+',
                    TT::PlusEquals,
                    TT::PlusPlus,
                    TT::Plus,
                    &mut add_token,
                ),
                '-' => lex_eq_rep_or(
                    &mut chars,
                    '-',
                    TT::MinusEquals,
                    TT::MinusMinus,
                    TT::Minus,
                    &mut add_token,
                ),
                // `*`, `*=`, `**`, `**=`: look one character ahead to pick
                // the single- or double-star family before deferring to
                // `lex_eq_or` for the optional trailing `=`.
                '*' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'*') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::StarStarEquals,
                            TT::StarStar,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::StarEquals,
                            TT::Star,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '/' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::SlashEquals,
                        TT::Slash,
                        start_pos,
                        &mut add_token,
                    );
                }
                '%' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::PercentEquals,
                        TT::Percent,
                        start_pos,
                        &mut add_token,
                    );
                }
                '|' => lex_eq_rep_or(
                    &mut chars,
                    '|',
                    TT::LineEquals,
                    TT::LineLine,
                    TT::VerticalLine,
                    &mut add_token,
                ),
                '&' => lex_eq_rep_or(
                    &mut chars,
                    '&',
                    TT::AmpersandEquals,
                    TT::AmpersandAmpersand,
                    TT::Ampersand,
                    &mut add_token,
                ),
                '^' => lex_eq_rep_or(
                    &mut chars,
                    '^',
                    TT::RoofEquals,
                    TT::RoofRoof,
                    TT::Roof,
                    &mut add_token,
                ),
                // `<`, `<=`, `<<`, `<<=`.
                '<' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'<') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::LeftLeftEquals,
                            TT::LeftLeft,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::LessThanEquals,
                            TT::LessThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                // `>`, `>=`, `>>`, `>>=`, `>>>`, `>>>=`. The two nested
                // peeks look at the second and third characters (each peek
                // advances the peek cursor).
                '>' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'>') {
                        if chars.peek() == Some(&'>') {
                            chars.next();
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightRightEquals,
                                TT::RightRightRight,
                                start_pos,
                                &mut add_token,
                            )
                        } else {
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightEquals,
                                TT::RightRight,
                                start_pos,
                                &mut add_token,
                            )
                        }
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::GreaterThanEquals,
                            TT::GreaterThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                // `!`, `!=`, `!is`.
                '!' => {
                    let start_pos = chars.real_position;
                    let next = chars.peek();
                    if next == Some(&'=') {
                        chars.next();
                        chars.next();
                        add_token(TT::NotEquals, start_pos, chars.real_position);
                    } else if next == Some(&'i') && chars.peek() == Some(&'s') {
                        // NOTE(review): only two characters are checked, so
                        // `!is` is also matched when more identifier
                        // characters follow (e.g. `!island`) — confirm this
                        // is the intended tokenization.
                        chars.next();
                        chars.next();
                        chars.next();
                        add_token(TT::NotIsKeyword, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::ExclamationMark, start_pos, chars.real_position);
                    }
                }
                '~' => lex_and_consume(&mut chars, TT::Tilde, &mut add_token),
                '@' => lex_and_consume(&mut chars, TT::AtSymbol, &mut add_token),
                ';' => lex_and_consume(&mut chars, TT::Semicolon, &mut add_token),
                ':' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&':') {
                        chars.next();
                        chars.next();
                        add_token(TT::ColonColon, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::Colon, start_pos, chars.real_position);
                    }
                }
                '(' => lex_and_consume(&mut chars, TT::OpenBracket, &mut add_token),
                ')' => lex_and_consume(&mut chars, TT::CloseBracket, &mut add_token),
                '{' => lex_and_consume(&mut chars, TT::OpenCurlyBracket, &mut add_token),
                '}' => lex_and_consume(&mut chars, TT::CloseCurlyBracket, &mut add_token),
                '[' => lex_and_consume(&mut chars, TT::OpenBlockBracket, &mut add_token),
                ']' => lex_and_consume(&mut chars, TT::CloseBlockBracket, &mut add_token),
                ',' => lex_and_consume(&mut chars, TT::Comma, &mut add_token),
                '.' => lex_and_consume(&mut chars, TT::Dot, &mut add_token),
                '?' => lex_and_consume(&mut chars, TT::QuestionMark, &mut add_token),
                // Inclusive ranges (`..=`) are required: the exclusive
                // `'0'..'9'` / `'a'..'z'` forms would reject '9', 'z' and
                // 'Z', sending them to the unexpected-character path.
                '0'..='9' => lex_numeric(&mut chars, &mut add_token),
                'a'..='z' | 'A'..='Z' | '_' => lex_keyword_or_identifier(&mut chars, &mut add_token),
                '\'' => lex_string(&mut chars, &'\'', false, log, &mut add_token),
                // Heredoc: the guard peeks the second and third characters;
                // three quotes in a row start a `"""..."""` literal.
                '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
                    lex_string(&mut chars, &'"', true, log, &mut add_token)
                }
                '"' => lex_string(&mut chars, &'"', false, log, &mut add_token),
                _ => log(
                    Message::UnexpectedCharacter(c),
                    Span::new(chars.real_position, chars.real_position + 1),
                ),
            },
            None => {
                // End of input: emit the terminal token and stop.
                add_token(TT::EndOfFile, chars.real_position, chars.real_position);
                reading = false;
            }
        }
    }
    tokens
}