use super::lex_numerical::lex_numeric; use crate::parsing::lex_tokens::LexToken; use itertools::{Itertools, MultiPeek}; use std::str::Chars; #[inline(always)] fn lex_and_consume(chars: &mut MultiPeek, eq: LexToken) -> LexToken { chars.next(); eq } #[inline(always)] fn lex_eq_or(chars: &mut MultiPeek, eq: LexToken, or: LexToken) -> LexToken { chars.next(); if let Some('=') = chars.peek() { chars.next(); eq } else { or } } #[inline(always)] fn lex_eq_rep_or( chars: &mut MultiPeek, v: char, eq: LexToken, rep: LexToken, or: LexToken, ) -> LexToken { chars.next(); return match chars.peek() { Some(c) => { if *c == v { chars.next(); rep } else if *c == '=' { chars.next(); eq } else { or } } None => or, }; } type LT = LexToken; fn lex_keyword_or_identifier(chars: &mut MultiPeek) -> LexToken { let mut reading = true; let mut length = 1; while reading { match chars.peek() { Some(c) => match c { 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => { length += 1; } _ => { reading = false; } }, None => { reading = false; } }; } chars.reset_peek(); let c: String = chars.take(length).collect(); match c.as_str() { "and" => LT::AndKeyword, "abstract" => LT::AbstractKeyword, "auto" => LT::AutoKeyword, "bool" => LT::BoolKeyword, "break" => LT::BreakKeyword, "case" => LT::CaseKeyword, "cast" => LT::CastKeyword, "catch" => LT::CatchKeyword, "class" => LT::ClassKeyword, "const" => LT::ConstKeyword, "continue" => LT::ContinueKeyword, "default" => LT::DefaultKeyword, "do" => LT::DoKeyword, "double" => LT::DoubleKeyword, "else" => LT::ElseKeyword, "enum" => LT::EnumKeyword, "explicit" => LT::ExplicitKeyword, "external" => LT::ExternalKeyword, "false" => LT::FalseKeyword, "final" => LT::FinalKeyword, "float" => LT::FloatKeyword, "for" => LT::ForKeyword, "from" => LT::FromKeyword, "funcdef" => LT::FuncDefKeyword, "function" => LT::FunctionKeyword, "get" => LT::GetKeyword, "if" => LT::IfKeyword, "import" => LT::ImportKeyword, "in" => LT::InKeyword, "inout" => LT::InOutKeyword, "int" => LT::IntKeyword, "interface" => LT::InterfaceKeyword, "int8" => LT::Int8Keyword, "int16" => LT::Int16Keyword, "int32" => LT::Int32Keyword, "int64" => LT::Int64Keyword, "is" => LT::IsKeyword, "mixin" => LT::MixinKeyword, "namespace" => LT::NamespaceKeyword, "not" => LT::NotKeyword, "null" => LT::NullKeyword, "or" => LT::OrKeyword, "out" => LT::OutKeyword, "override" => LT::OverrideKeyword, "private" => LT::PrivateKeyword, "property" => LT::PropertyKeyword, "protected" => LT::ProtectedKeyword, "return" => LT::ReturnKeyword, "set" => LT::SetKeyword, "shared" => LT::SharedKeyword, "super" => LT::SuperKeyword, "switch" => LT::SwitchKeyword, "this" => LT::ThisKeyword, "true" => LT::TrueKeyword, "try" => LT::TryKeyword, "typedef" => LT::TypeDefKeyword, "uint" => LT::UintKeyword, "uint8" => LT::Uint8Keyword, "uint16" => LT::Uint16Keyword, "uint32" => LT::Uint32Keyword, "uint64" => LT::Uint64Keyword, "void" => LT::VoidKeyword, "while" => LT::WhileKeyword, "xor" => LT::XorKeyword, _ => LT::Identifier(c), } } fn lex_string(chars: &mut MultiPeek, opening_char: &char, heredoc: bool) -> LexToken { chars.next(); if heredoc { chars.next(); chars.next(); } let mut length: i32 = 0; let mut string_length = 0; let mut last_was_control = false; // We loop twice here. In the first loop we get the number of characters to read, the number of // characters the string should be, and whether it's valid. This reduces the amount of allocations // we need to do to read a string. loop { let p = chars.peek(); match p { None => { // TODO: log error. Strings need to be closed, EOF should error. unimplemented!(); } Some(&'\\') if !last_was_control => { last_was_control = true; length += 1; } Some(c) => { if c == opening_char && !last_was_control { if heredoc { if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') { break; } else { length += 1; string_length += 1; } } else { break; } } length += 1; string_length += 1; last_was_control = false; } } } chars.reset_peek(); let mut s: String = String::with_capacity(string_length); for _ in 0..length { let p = chars.next().unwrap(); match p { '\\' => { if last_was_control { s.push('\\'); } else { last_was_control = true; continue; } } '0' if last_was_control => s.push('\0'), 'n' if last_was_control => s.push('\n'), 'r' if last_was_control => s.push('\r'), 't' if last_was_control => s.push('\t'), _ => s.push(p), }; last_was_control = false; } assert_eq!(s.len(), string_length); chars.reset_peek(); chars.next(); if heredoc { chars.next(); chars.next(); } LT::StringLiteral(s) } pub fn lex(s: &str) -> Vec { let mut tokens: Vec = Vec::new(); let mut chars = s.chars().multipeek(); let mut reading = true; while reading { let p = chars.peek().cloned(); match p { Some(c) => match c { ' ' | '\t' | '\r' | '\n' => { chars.next(); tokens.push(LT::WhiteSpace); } '=' => tokens.push(lex_eq_or(&mut chars, LT::EqualsEquals, LT::Equals)), '+' => tokens.push(lex_eq_rep_or( &mut chars, '+', LT::PlusEquals, LT::PlusPlus, LT::Plus, )), '-' => tokens.push(lex_eq_rep_or( &mut chars, '-', LT::MinusEquals, LT::MinusMinus, LT::Minus, )), '*' => { if chars.peek() == Some(&'*') { chars.next(); tokens.push(lex_eq_or(&mut chars, LT::StarStarEquals, LT::StarStar)) } else { tokens.push(lex_eq_or(&mut chars, LT::StarEquals, LT::Star)) } } '/' => tokens.push(lex_eq_or(&mut chars, LT::SlashEquals, LT::Slash)), '%' => tokens.push(lex_eq_or(&mut chars, LT::PercentEquals, LT::Percent)), '|' => tokens.push(lex_eq_rep_or( &mut chars, '|', LT::LineEquals, LT::LineLine, LT::VerticalLine, )), '&' => tokens.push(lex_eq_rep_or( &mut chars, '&', LT::AmpersandEquals, LT::AmpersandAmpersand, LT::Ampersand, )), '^' => tokens.push(lex_eq_rep_or( &mut chars, '^', LT::RoofEquals, LT::RoofRoof, LT::Roof, )), '<' => { if chars.peek() == Some(&'<') { chars.next(); tokens.push(lex_eq_or(&mut chars, LT::LeftLeftEquals, LT::LeftLeft)) } else { tokens.push(lex_eq_or(&mut chars, LT::LessThanEquals, LT::LessThan)) } } '>' => { if chars.peek() == Some(&'>') { if chars.peek() == Some(&'>') { chars.next(); chars.next(); tokens.push(lex_eq_or( &mut chars, LT::RightRightRightEquals, LT::RightRightRight, )) } else { chars.next(); tokens.push(lex_eq_or(&mut chars, LT::RightRightEquals, LT::RightRight)) } } else { tokens.push(lex_eq_or( &mut chars, LT::GreaterThanEquals, LT::GreaterThan, )) } } '!' => { let next = chars.peek(); if next == Some(&'=') { chars.next(); chars.next(); tokens.push(LT::NotEquals); } else if next == Some(&'i') && chars.peek() == Some(&'s') { chars.next(); chars.next(); chars.next(); tokens.push(LT::NotIsKeyword); } else { chars.next(); tokens.push(LT::ExclamationMark); } } '~' => tokens.push(lex_and_consume(&mut chars, LT::Tilde)), '@' => tokens.push(lex_and_consume(&mut chars, LT::AtSymbol)), ';' => tokens.push(lex_and_consume(&mut chars, LT::Semicolon)), ':' => tokens.push(lex_and_consume(&mut chars, LT::Colon)), '(' => tokens.push(lex_and_consume(&mut chars, LT::OpenBracket)), ')' => tokens.push(lex_and_consume(&mut chars, LT::CloseBracket)), '{' => tokens.push(lex_and_consume(&mut chars, LT::OpenCurlyBracket)), '}' => tokens.push(lex_and_consume(&mut chars, LT::CloseCurlyBracket)), '[' => tokens.push(lex_and_consume(&mut chars, LT::OpenBlockBracket)), ']' => tokens.push(lex_and_consume(&mut chars, LT::CloseBlockBracket)), '0'..'9' => tokens.push(lex_numeric(&mut chars)), 'a'..'z' | 'A'..'Z' | '_' => tokens.push(lex_keyword_or_identifier(&mut chars)), '\'' => tokens.push(lex_string(&mut chars, &'\'', false)), '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => { tokens.push(lex_string(&mut chars, &'"', true)) } '"' => tokens.push(lex_string(&mut chars, &'"', false)), // TODO: Definitely not unreachable. Log a proper error here. _ => unreachable!(), }, None => { tokens.push(LT::EndOfFile); reading = false; } } } tokens }