use crate::logger::messages::Message;
use crate::parsing::lexer::lex_tokens::LexToken;
use crate::span::Span;
use lex_numerical::lex_numeric;
use lex_tokens::TokenType;
use string_walker::StringWalker;

mod lex_numerical;
pub mod lex_tokens;
#[cfg(test)]
mod lexer_tests;
mod string_walker;

// Consumes a single character and emits `eq` spanning exactly that character.
#[inline(always)]
fn lex_and_consume(
    chars: &mut StringWalker,
    eq: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    chars.next();
    f(eq, chars.real_position - 1, chars.real_position)
}

// Consumes the current character and emits `eq` if it is followed by '=', otherwise `or`.
#[inline(always)]
fn lex_eq_or(
    chars: &mut StringWalker,
    eq: TokenType,
    or: TokenType,
    start_pos: usize,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    chars.next();
    if let Some('=') = chars.peek() {
        chars.next();
        f(eq, start_pos, chars.real_position);
    } else {
        f(or, start_pos, chars.real_position);
    }
}

// Consumes the current character and emits `rep` if it is doubled (e.g. `++`),
// `eq` if it is followed by '=', and `or` otherwise.
#[inline(always)]
fn lex_eq_rep_or(
    chars: &mut StringWalker,
    v: char,
    eq: TokenType,
    rep: TokenType,
    or: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    let start_pos = chars.real_position;
    chars.next();
    match chars.peek() {
        Some(c) => {
            if *c == v {
                chars.next();
                f(rep, start_pos, chars.real_position);
            } else if *c == '=' {
                chars.next();
                f(eq, start_pos, chars.real_position);
            } else {
                f(or, start_pos, chars.real_position);
            }
        }
        None => f(or, start_pos, chars.real_position),
    };
}

type TT = TokenType;

// Peeks ahead to find the length of the identifier, consumes it in one go, and
// classifies it as either a keyword or a plain identifier.
fn lex_keyword_or_identifier(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) {
    let mut reading = true;
    let mut length = 1;
    let start_pos = chars.real_position;
    while reading {
        match chars.peek() {
            Some(c) => match c {
                'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
                    length += 1;
                }
                _ => {
                    reading = false;
                }
            },
            None => {
                reading = false;
            }
        };
    }
    chars.reset_peek();

    let c: String = chars.take(length);
    let token_type = match c.as_str() {
        "and" => TT::AndKeyword,
        "abstract" => TT::AbstractKeyword,
        "auto" => TT::AutoKeyword,
        "bool" => TT::BoolKeyword,
        "break" => TT::BreakKeyword,
        "case" => TT::CaseKeyword,
        "cast" => TT::CastKeyword,
        "catch" => TT::CatchKeyword,
        "class" => TT::ClassKeyword,
        "const" => TT::ConstKeyword,
        "continue" => TT::ContinueKeyword,
        "default" => TT::DefaultKeyword,
        "do" => TT::DoKeyword,
        "double" => TT::DoubleKeyword,
        "else" => TT::ElseKeyword,
        "enum" => TT::EnumKeyword,
        "explicit" => TT::ExplicitKeyword,
        "external" => TT::ExternalKeyword,
        "false" => TT::FalseKeyword,
        "final" => TT::FinalKeyword,
        "float" => TT::FloatKeyword,
        "for" => TT::ForKeyword,
        "from" => TT::FromKeyword,
        "funcdef" => TT::FuncDefKeyword,
        "function" => TT::FunctionKeyword,
        "get" => TT::GetKeyword,
        "if" => TT::IfKeyword,
        "import" => TT::ImportKeyword,
        "in" => TT::InKeyword,
        "inout" => TT::InOutKeyword,
        "int" => TT::IntKeyword,
        "interface" => TT::InterfaceKeyword,
        "int8" => TT::Int8Keyword,
        "int16" => TT::Int16Keyword,
        "int32" => TT::Int32Keyword,
        "int64" => TT::Int64Keyword,
        "is" => TT::IsKeyword,
        "mixin" => TT::MixinKeyword,
        "namespace" => TT::NamespaceKeyword,
        "not" => TT::NotKeyword,
        "null" => TT::NullKeyword,
        "or" => TT::OrKeyword,
        "out" => TT::OutKeyword,
        "override" => TT::OverrideKeyword,
        "private" => TT::PrivateKeyword,
        "property" => TT::PropertyKeyword,
        "protected" => TT::ProtectedKeyword,
        "return" => TT::ReturnKeyword,
        "set" => TT::SetKeyword,
        "shared" => TT::SharedKeyword,
        "super" => TT::SuperKeyword,
        "switch" => TT::SwitchKeyword,
        "this" => TT::ThisKeyword,
        "true" => TT::TrueKeyword,
        "try" => TT::TryKeyword,
        "typedef" => TT::TypeDefKeyword,
        "uint" => TT::UintKeyword,
        "uint8" => TT::Uint8Keyword,
        "uint16" => TT::Uint16Keyword,
        "uint32" => TT::Uint32Keyword,
        "uint64" => TT::Uint64Keyword,
        "void" => TT::VoidKeyword,
        "while" => TT::WhileKeyword,
        "xor" => TT::XorKeyword,
        _ => TT::Identifier(c),
    };
    f(token_type, start_pos, chars.real_position);
}

// Lexes a single- or triple-quoted ("heredoc") string literal starting at the
// current position, reporting unterminated literals through `log`.
fn lex_string(
    chars: &mut StringWalker,
    opening_char: &char,
    heredoc: bool,
    log: &mut dyn FnMut(Message, Span),
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    let start_pos = chars.real_position;
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }

    let mut length: i32 = 0;
    let mut string_length = 0;
    let mut last_was_control = false;

    // We loop twice here. In the first loop we get the number of characters to read, the number
    // of characters the string should be, and whether it's valid. This reduces the number of
    // allocations needed to read a string.
    loop {
        let p = chars.peek();
        match p {
            None => {
                log(
                    Message::UnclosedStringLiteral,
                    Span::new(chars.peek_position - 1, chars.peek_position),
                );
                break;
            }
            Some(&'\\') if !last_was_control => {
                last_was_control = true;
                length += 1;
            }
            Some(c) => {
                if c == opening_char && !last_was_control {
                    if heredoc {
                        // A heredoc only closes on `"""`; successive peeks look at the next two
                        // characters without consuming them.
                        if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
                            break;
                        } else {
                            length += 1;
                            string_length += 1;
                        }
                    } else {
                        break;
                    }
                }
                length += 1;
                string_length += 1;
                last_was_control = false;
            }
        }
    }
    chars.reset_peek();

    // Second pass: consume `length` characters and build the unescaped string.
    let mut s: String = String::with_capacity(string_length);
    for _ in 0..length {
        let p = chars.next().unwrap();
        match p {
            '\\' => {
                if last_was_control {
                    s.push('\\');
                } else {
                    last_was_control = true;
                    continue;
                }
            }
            '0' if last_was_control => s.push('\0'),
            'n' if last_was_control => s.push('\n'),
            'r' if last_was_control => s.push('\r'),
            't' if last_was_control => s.push('\t'),
            _ => s.push(p),
        };
        last_was_control = false;
    }
    assert_eq!(s.len(), string_length);

    // Consume the closing quote(s).
    chars.reset_peek();
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }
    f(TT::StringLiteral(s), start_pos, chars.real_position);
}

// Lexes `s` into a flat list of tokens (including whitespace and a trailing
// EndOfFile token), reporting lexical errors through `log`.
pub fn lex(s: &str, log: &mut dyn FnMut(Message, Span)) -> Vec<LexToken> {
    let mut tokens: Vec<LexToken> = Vec::new();
    let mut chars = StringWalker::create(s);
    let mut reading = true;

    let mut add_token = |token_type: TokenType, start: usize, end: usize| {
        tokens.push(LexToken {
            token_type,
            span: Span::new(start, end),
        })
    };

    while reading {
        let p = chars.peek().cloned();
        match p {
            Some(c) => match c {
                ' ' | '\t' | '\r' | '\n' => {
                    chars.next();
                    add_token(TT::WhiteSpace, chars.real_position - 1, chars.real_position);
                }
                '=' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::EqualsEquals,
                        TT::Equals,
                        start_pos,
                        &mut add_token,
                    )
                }
                '+' => lex_eq_rep_or(
                    &mut chars,
                    '+',
                    TT::PlusEquals,
                    TT::PlusPlus,
                    TT::Plus,
                    &mut add_token,
                ),
                '-' => lex_eq_rep_or(
                    &mut chars,
                    '-',
                    TT::MinusEquals,
                    TT::MinusMinus,
                    TT::Minus,
                    &mut add_token,
                ),
                '*' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'*') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::StarStarEquals,
                            TT::StarStar,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::StarEquals,
                            TT::Star,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '/' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::SlashEquals,
                        TT::Slash,
                        start_pos,
                        &mut add_token,
                    );
                }
                '%' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::PercentEquals,
                        TT::Percent,
                        start_pos,
                        &mut add_token,
                    );
                }
                '|' => lex_eq_rep_or(
                    &mut chars,
                    '|',
                    TT::LineEquals,
                    TT::LineLine,
                    TT::VerticalLine,
                    &mut add_token,
                ),
                '&' => lex_eq_rep_or(
                    &mut chars,
                    '&',
                    TT::AmpersandEquals,
                    TT::AmpersandAmpersand,
                    TT::Ampersand,
                    &mut add_token,
                ),
                '^' => lex_eq_rep_or(
                    &mut chars,
                    '^',
                    TT::RoofEquals,
                    TT::RoofRoof,
                    TT::Roof,
                    &mut add_token,
                ),
                '<' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'<') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::LeftLeftEquals,
                            TT::LeftLeft,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::LessThanEquals,
                            TT::LessThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '>' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'>') {
                        if chars.peek() == Some(&'>') {
                            chars.next();
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightRightEquals,
                                TT::RightRightRight,
                                start_pos,
                                &mut add_token,
                            )
                        } else {
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightEquals,
                                TT::RightRight,
                                start_pos,
                                &mut add_token,
                            )
                        }
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::GreaterThanEquals,
                            TT::GreaterThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '!' => {
                    let start_pos = chars.real_position;
                    let next = chars.peek();
                    if next == Some(&'=') {
                        chars.next();
                        chars.next();
                        add_token(TT::NotEquals, start_pos, chars.real_position);
                    } else if next == Some(&'i') && chars.peek() == Some(&'s') {
                        chars.next();
                        chars.next();
                        chars.next();
                        add_token(TT::NotIsKeyword, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::ExclamationMark, start_pos, chars.real_position);
                    }
                }
                '~' => lex_and_consume(&mut chars, TT::Tilde, &mut add_token),
                '@' => lex_and_consume(&mut chars, TT::AtSymbol, &mut add_token),
                ';' => lex_and_consume(&mut chars, TT::Semicolon, &mut add_token),
                ':' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&':') {
                        chars.next();
                        chars.next();
                        add_token(TT::ColonColon, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::Colon, start_pos, chars.real_position);
                    }
                }
                '(' => lex_and_consume(&mut chars, TT::OpenBracket, &mut add_token),
                ')' => lex_and_consume(&mut chars, TT::CloseBracket, &mut add_token),
                '{' => lex_and_consume(&mut chars, TT::OpenCurlyBracket, &mut add_token),
                '}' => lex_and_consume(&mut chars, TT::CloseCurlyBracket, &mut add_token),
                '[' => lex_and_consume(&mut chars, TT::OpenBlockBracket, &mut add_token),
                ']' => lex_and_consume(&mut chars, TT::CloseBlockBracket, &mut add_token),
                ',' => lex_and_consume(&mut chars, TT::Comma, &mut add_token),
                '0'..='9' => lex_numeric(&mut chars, &mut add_token),
                'a'..='z' | 'A'..='Z' | '_' => {
                    lex_keyword_or_identifier(&mut chars, &mut add_token)
                }
                '\'' => lex_string(&mut chars, &'\'', false, log, &mut add_token),
                // Three consecutive double quotes open a heredoc string; successive peeks look
                // ahead at the two characters after the current one.
                '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
                    lex_string(&mut chars, &'"', true, log, &mut add_token)
                }
                '"' => lex_string(&mut chars, &'"', false, log, &mut add_token),
                _ => log(
                    Message::UnexpectedCharacter(c),
                    Span::new(chars.real_position, chars.real_position + 1),
                ),
            },
            None => {
                add_token(TT::EndOfFile, chars.real_position, chars.real_position);
                reading = false;
            }
        }
    }

    tokens
}
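
// A minimal usage sketch, not part of the original module: it drives `lex` with a
// closure that collects diagnostics and checks a couple of token kinds. The module
// name and the exact assertions are illustrative assumptions; the real test suite
// lives in `lexer_tests`.
#[cfg(test)]
mod lex_usage_sketch {
    use super::*;

    #[test]
    fn lexes_a_simple_declaration() {
        // Collect any reported diagnostics instead of panicking on them.
        let mut diagnostics: Vec<(Message, Span)> = Vec::new();
        let tokens = lex("int x;", &mut |message: Message, span: Span| {
            diagnostics.push((message, span))
        });

        assert!(diagnostics.is_empty());
        // "int" should lex to a keyword and the stream should end with EndOfFile.
        assert!(matches!(
            tokens.first().map(|t| &t.token_type),
            Some(TokenType::IntKeyword)
        ));
        assert!(matches!(
            tokens.last().map(|t| &t.token_type),
            Some(TokenType::EndOfFile)
        ));
    }
}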