diff --git a/src/lib.rs b/src/lib.rs index 7aca25b..0b0c0ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,8 @@ #![feature(concat_idents)] #![feature(exclusive_range_pattern)] +#![feature(assert_matches)] pub(crate) mod defines; +pub mod logger; pub mod parsing; +pub mod span; diff --git a/src/logger/messages.rs b/src/logger/messages.rs new file mode 100644 index 0000000..f566bb3 --- /dev/null +++ b/src/logger/messages.rs @@ -0,0 +1,6 @@ +#[derive(Debug)] +pub enum Message { + UnexpectedCharacter(char), + InvalidCharacter { found: char, expected: char }, + UnclosedStringLiteral, +} diff --git a/src/logger/mod.rs b/src/logger/mod.rs new file mode 100644 index 0000000..6c1128c --- /dev/null +++ b/src/logger/mod.rs @@ -0,0 +1,23 @@ +pub mod messages; +use crate::span::Span; +use messages::Message; + +pub struct Log { + pub message: Message, + pub filename: String, + pub span: Span, +} + +pub struct Logger { + pub logs: Vec<Log>, +} + +impl Logger { + pub fn log(&mut self, message: Message, filename: String, start: usize, end: usize) { + self.logs.push(Log { + message, + filename, + span: Span { start, end }, + }) + } +} diff --git a/src/parsing/lexer.rs b/src/parsing/lexer.rs deleted file mode 100644 index fdf5f6c..0000000 --- a/src/parsing/lexer.rs +++ /dev/null @@ -1,350 +0,0 @@ -use super::lex_numerical::lex_numeric; -use crate::parsing::lex_tokens::LexToken; -use itertools::{Itertools, MultiPeek}; -use std::str::Chars; - -#[inline(always)] -fn lex_and_consume(chars: &mut MultiPeek, eq: LexToken) -> LexToken { - chars.next(); - eq -} - -#[inline(always)] -fn lex_eq_or(chars: &mut MultiPeek, eq: LexToken, or: LexToken) -> LexToken { - chars.next(); - if let Some('=') = chars.peek() { - chars.next(); - eq - } else { - or - } -} - -#[inline(always)] -fn lex_eq_rep_or( - chars: &mut MultiPeek, - v: char, - eq: LexToken, - rep: LexToken, - or: LexToken, -) -> LexToken { - chars.next(); - return match chars.peek() { - Some(c) => { - if *c == v { - chars.next(); - rep - } 
else if *c == '=' { - chars.next(); - eq - } else { - or - } - } - None => or, - }; -} - -type LT = LexToken; - -fn lex_keyword_or_identifier(chars: &mut MultiPeek) -> LexToken { - let mut reading = true; - let mut length = 1; - while reading { - match chars.peek() { - Some(c) => match c { - 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => { - length += 1; - } - _ => { - reading = false; - } - }, - None => { - reading = false; - } - }; - } - chars.reset_peek(); - let c: String = chars.take(length).collect(); - - match c.as_str() { - "and" => LT::AndKeyword, - "abstract" => LT::AbstractKeyword, - "auto" => LT::AutoKeyword, - "bool" => LT::BoolKeyword, - "break" => LT::BreakKeyword, - "case" => LT::CaseKeyword, - "cast" => LT::CastKeyword, - "catch" => LT::CatchKeyword, - "class" => LT::ClassKeyword, - "const" => LT::ConstKeyword, - "continue" => LT::ContinueKeyword, - "default" => LT::DefaultKeyword, - "do" => LT::DoKeyword, - "double" => LT::DoubleKeyword, - "else" => LT::ElseKeyword, - "enum" => LT::EnumKeyword, - "explicit" => LT::ExplicitKeyword, - "external" => LT::ExternalKeyword, - "false" => LT::FalseKeyword, - "final" => LT::FinalKeyword, - "float" => LT::FloatKeyword, - "for" => LT::ForKeyword, - "from" => LT::FromKeyword, - "funcdef" => LT::FuncDefKeyword, - "function" => LT::FunctionKeyword, - "get" => LT::GetKeyword, - "if" => LT::IfKeyword, - "import" => LT::ImportKeyword, - "in" => LT::InKeyword, - "inout" => LT::InOutKeyword, - "int" => LT::IntKeyword, - "interface" => LT::InterfaceKeyword, - "int8" => LT::Int8Keyword, - "int16" => LT::Int16Keyword, - "int32" => LT::Int32Keyword, - "int64" => LT::Int64Keyword, - "is" => LT::IsKeyword, - "mixin" => LT::MixinKeyword, - "namespace" => LT::NamespaceKeyword, - "not" => LT::NotKeyword, - "null" => LT::NullKeyword, - "or" => LT::OrKeyword, - "out" => LT::OutKeyword, - "override" => LT::OverrideKeyword, - "private" => LT::PrivateKeyword, - "property" => LT::PropertyKeyword, - "protected" => LT::ProtectedKeyword, - 
"return" => LT::ReturnKeyword, - "set" => LT::SetKeyword, - "shared" => LT::SharedKeyword, - "super" => LT::SuperKeyword, - "switch" => LT::SwitchKeyword, - "this" => LT::ThisKeyword, - "true" => LT::TrueKeyword, - "try" => LT::TryKeyword, - "typedef" => LT::TypeDefKeyword, - "uint" => LT::UintKeyword, - "uint8" => LT::Uint8Keyword, - "uint16" => LT::Uint16Keyword, - "uint32" => LT::Uint32Keyword, - "uint64" => LT::Uint64Keyword, - "void" => LT::VoidKeyword, - "while" => LT::WhileKeyword, - "xor" => LT::XorKeyword, - _ => LT::Identifier(c), - } -} - -fn lex_string(chars: &mut MultiPeek, opening_char: &char, heredoc: bool) -> LexToken { - chars.next(); - if heredoc { - chars.next(); - chars.next(); - } - let mut length: i32 = 0; - let mut string_length = 0; - let mut last_was_control = false; - - // We loop twice here. In the first loop we get the number of characters to read, the number of - // characters the string should be, and whether it's valid. This reduces the amount of allocations - // we need to do to read a string. - loop { - let p = chars.peek(); - match p { - None => { - // TODO: log error. Strings need to be closed, EOF should error. 
- unimplemented!(); - } - Some(&'\\') if !last_was_control => { - last_was_control = true; - length += 1; - } - Some(c) => { - if c == opening_char && !last_was_control { - if heredoc { - if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') { - break; - } else { - length += 1; - string_length += 1; - } - } else { - break; - } - } - length += 1; - string_length += 1; - last_was_control = false; - } - } - } - chars.reset_peek(); - let mut s: String = String::with_capacity(string_length); - for _ in 0..length { - let p = chars.next().unwrap(); - match p { - '\\' => { - if last_was_control { - s.push('\\'); - } else { - last_was_control = true; - continue; - } - } - '0' if last_was_control => s.push('\0'), - 'n' if last_was_control => s.push('\n'), - 'r' if last_was_control => s.push('\r'), - 't' if last_was_control => s.push('\t'), - _ => s.push(p), - }; - last_was_control = false; - } - assert_eq!(s.len(), string_length); - chars.reset_peek(); - chars.next(); - if heredoc { - chars.next(); - chars.next(); - } - - LT::StringLiteral(s) -} - -pub fn lex(s: &str) -> Vec { - let mut tokens: Vec = Vec::new(); - let mut chars = s.chars().multipeek(); - let mut reading = true; - while reading { - let p = chars.peek().cloned(); - match p { - Some(c) => match c { - ' ' | '\t' | '\r' | '\n' => { - chars.next(); - tokens.push(LT::WhiteSpace); - } - '=' => tokens.push(lex_eq_or(&mut chars, LT::EqualsEquals, LT::Equals)), - '+' => tokens.push(lex_eq_rep_or( - &mut chars, - '+', - LT::PlusEquals, - LT::PlusPlus, - LT::Plus, - )), - '-' => tokens.push(lex_eq_rep_or( - &mut chars, - '-', - LT::MinusEquals, - LT::MinusMinus, - LT::Minus, - )), - '*' => { - if chars.peek() == Some(&'*') { - chars.next(); - tokens.push(lex_eq_or(&mut chars, LT::StarStarEquals, LT::StarStar)) - } else { - tokens.push(lex_eq_or(&mut chars, LT::StarEquals, LT::Star)) - } - } - '/' => tokens.push(lex_eq_or(&mut chars, LT::SlashEquals, LT::Slash)), - '%' => tokens.push(lex_eq_or(&mut chars, 
LT::PercentEquals, LT::Percent)), - '|' => tokens.push(lex_eq_rep_or( - &mut chars, - '|', - LT::LineEquals, - LT::LineLine, - LT::VerticalLine, - )), - '&' => tokens.push(lex_eq_rep_or( - &mut chars, - '&', - LT::AmpersandEquals, - LT::AmpersandAmpersand, - LT::Ampersand, - )), - '^' => tokens.push(lex_eq_rep_or( - &mut chars, - '^', - LT::RoofEquals, - LT::RoofRoof, - LT::Roof, - )), - '<' => { - if chars.peek() == Some(&'<') { - chars.next(); - tokens.push(lex_eq_or(&mut chars, LT::LeftLeftEquals, LT::LeftLeft)) - } else { - tokens.push(lex_eq_or(&mut chars, LT::LessThanEquals, LT::LessThan)) - } - } - '>' => { - if chars.peek() == Some(&'>') { - if chars.peek() == Some(&'>') { - chars.next(); - chars.next(); - tokens.push(lex_eq_or( - &mut chars, - LT::RightRightRightEquals, - LT::RightRightRight, - )) - } else { - chars.next(); - tokens.push(lex_eq_or(&mut chars, LT::RightRightEquals, LT::RightRight)) - } - } else { - tokens.push(lex_eq_or( - &mut chars, - LT::GreaterThanEquals, - LT::GreaterThan, - )) - } - } - '!' 
=> { - let next = chars.peek(); - if next == Some(&'=') { - chars.next(); - chars.next(); - tokens.push(LT::NotEquals); - } else if next == Some(&'i') && chars.peek() == Some(&'s') { - chars.next(); - chars.next(); - chars.next(); - tokens.push(LT::NotIsKeyword); - } else { - chars.next(); - tokens.push(LT::ExclamationMark); - } - } - - '~' => tokens.push(lex_and_consume(&mut chars, LT::Tilde)), - '@' => tokens.push(lex_and_consume(&mut chars, LT::AtSymbol)), - ';' => tokens.push(lex_and_consume(&mut chars, LT::Semicolon)), - ':' => tokens.push(lex_and_consume(&mut chars, LT::Colon)), - - '(' => tokens.push(lex_and_consume(&mut chars, LT::OpenBracket)), - ')' => tokens.push(lex_and_consume(&mut chars, LT::CloseBracket)), - '{' => tokens.push(lex_and_consume(&mut chars, LT::OpenCurlyBracket)), - '}' => tokens.push(lex_and_consume(&mut chars, LT::CloseCurlyBracket)), - '[' => tokens.push(lex_and_consume(&mut chars, LT::OpenBlockBracket)), - ']' => tokens.push(lex_and_consume(&mut chars, LT::CloseBlockBracket)), - - '0'..'9' => tokens.push(lex_numeric(&mut chars)), - 'a'..'z' | 'A'..'Z' | '_' => tokens.push(lex_keyword_or_identifier(&mut chars)), - '\'' => tokens.push(lex_string(&mut chars, &'\'', false)), - '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => { - tokens.push(lex_string(&mut chars, &'"', true)) - } - '"' => tokens.push(lex_string(&mut chars, &'"', false)), - - // TODO: Definitely not unreachable. Log a proper error here. 
- _ => unreachable!(), - }, - None => { - tokens.push(LT::EndOfFile); - reading = false; - } - } - } - tokens -} diff --git a/src/parsing/lex_numerical.rs b/src/parsing/lexer/lex_numerical.rs similarity index 83% rename from src/parsing/lex_numerical.rs rename to src/parsing/lexer/lex_numerical.rs index d7f80cb..c848f95 100644 --- a/src/parsing/lex_numerical.rs +++ b/src/parsing/lexer/lex_numerical.rs @@ -1,7 +1,6 @@ use crate::defines::{LiteralFloat, LiteralInt}; -use crate::parsing::lex_tokens::LexToken; -use itertools::MultiPeek; -use std::str::Chars; +use crate::parsing::lexer::lex_tokens::TokenType; +use crate::parsing::lexer::StringWalker; #[inline(always)] fn get_decimal_value(c: char) -> Option { @@ -21,7 +20,7 @@ fn get_decimal_value(c: char) -> Option { } #[inline(always)] -fn lex_numeric_default(chars: &mut MultiPeek) -> LexToken { +fn lex_numeric_default(chars: &mut StringWalker) -> TokenType { let mut int_value: LiteralInt = 0; let mut decimal_value: LiteralInt = 0; let mut exponent_value: LiteralInt = 0; @@ -81,9 +80,9 @@ fn lex_numeric_default(chars: &mut MultiPeek) -> LexToken { if is_exponent { val *= exponent_value.pow(10) as LiteralFloat; } - LexToken::FloatLiteral(val) + TokenType::FloatLiteral(val) } else { - LexToken::IntegerLiteral(int_value) + TokenType::IntegerLiteral(int_value) } } @@ -111,7 +110,7 @@ fn get_hexadecimal_value(c: char) -> Option { } #[inline(always)] -fn lex_numeric_hexadecimal(chars: &mut MultiPeek) -> LexToken { +fn lex_numeric_hexadecimal(chars: &mut StringWalker) -> TokenType { let mut int_value: LiteralInt = 0; let mut reading = true; let mut n = chars.peek().cloned(); @@ -132,7 +131,7 @@ fn lex_numeric_hexadecimal(chars: &mut MultiPeek) -> LexToken { } n = chars.peek().cloned(); } - LexToken::IntegerLiteral(int_value) + TokenType::IntegerLiteral(int_value) } #[inline(always)] @@ -151,7 +150,7 @@ fn get_octal_value(c: char) -> Option { } #[inline(always)] -fn lex_numeric_octal(chars: &mut MultiPeek) -> LexToken { +fn 
lex_numeric_octal(chars: &mut StringWalker) -> TokenType { let mut int_value: LiteralInt = 0; let mut reading = true; let mut n = chars.peek().cloned(); @@ -172,7 +171,7 @@ fn lex_numeric_octal(chars: &mut MultiPeek) -> LexToken { } n = chars.peek().cloned(); } - LexToken::IntegerLiteral(int_value) + TokenType::IntegerLiteral(int_value) } #[inline(always)] @@ -185,7 +184,7 @@ fn get_binary_value(c: char) -> Option { } #[inline(always)] -fn lex_numeric_binary(chars: &mut MultiPeek) -> LexToken { +fn lex_numeric_binary(chars: &mut StringWalker) -> TokenType { let mut int_value: LiteralInt = 0; let mut reading = true; let mut n = chars.peek().cloned(); @@ -206,37 +205,44 @@ fn lex_numeric_binary(chars: &mut MultiPeek) -> LexToken { } n = chars.peek().cloned(); } - LexToken::IntegerLiteral(int_value) + TokenType::IntegerLiteral(int_value) } #[inline(always)] -pub fn lex_numeric(chars: &mut MultiPeek) -> LexToken { +pub(super) fn lex_numeric(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) { chars.reset_peek(); - if chars.peek() == Some(&'0') { + let start_pos = chars.real_position; + let token_type = if chars.peek() == Some(&'0') { match chars.peek() { Some(&'D') | Some(&'d') => { chars.next(); chars.next(); - return lex_numeric_default(chars); + lex_numeric_default(chars) } Some(&'X') | Some(&'x') => { chars.next(); chars.next(); - return lex_numeric_hexadecimal(chars); + lex_numeric_hexadecimal(chars) } Some(&'O') | Some(&'o') => { chars.next(); chars.next(); - return lex_numeric_octal(chars); + lex_numeric_octal(chars) } Some(&'B') | Some(&'b') => { chars.next(); chars.next(); - return lex_numeric_binary(chars); + lex_numeric_binary(chars) + } + _ => { + chars.reset_peek(); + lex_numeric_default(chars) } - _ => {} } - } - chars.reset_peek(); - lex_numeric_default(chars) + } else { + chars.reset_peek(); + lex_numeric_default(chars) + }; + + f(token_type, start_pos, chars.real_position); } diff --git a/src/parsing/lex_tokens.rs 
b/src/parsing/lexer/lex_tokens.rs similarity index 95% rename from src/parsing/lex_tokens.rs rename to src/parsing/lexer/lex_tokens.rs index f0ea86b..9320c1b 100644 --- a/src/parsing/lex_tokens.rs +++ b/src/parsing/lexer/lex_tokens.rs @@ -1,7 +1,13 @@ use crate::defines::{LiteralFloat, LiteralInt}; +use crate::span::Span; + +pub struct LexToken { + pub token_type: TokenType, + pub span: Span +} #[derive(PartialEq, Debug)] -pub enum LexToken { +pub enum TokenType { EndOfFile, WhiteSpace, Identifier(String), diff --git a/src/parsing/lexer/lexer_tests.rs b/src/parsing/lexer/lexer_tests.rs new file mode 100644 index 0000000..6032ee4 --- /dev/null +++ b/src/parsing/lexer/lexer_tests.rs @@ -0,0 +1,324 @@ +use super::lex; +use crate::logger::messages::Message; +use crate::parsing::lexer::lex_tokens::TokenType; + +macro_rules! lex_token_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b, &mut |_message, _span| { + unreachable!(); + }); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0].token_type, $c); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, $b.chars().count()); + assert_eq!(tokens[1].token_type, TokenType::EndOfFile); + } + }; +} + +macro_rules! lex_identifier_test { + ( $a: ident, $b: expr) => { + #[test] + fn $a() { + let tokens = lex($b, &mut |_message, _span| { + unreachable!(); + }); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0].token_type, TokenType::Identifier($b.to_string())); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, $b.chars().count()); + assert_eq!(tokens[1].token_type, TokenType::EndOfFile); + } + }; +} + +macro_rules! 
lex_integer_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b, &mut |_message, _span| { + unreachable!(); + }); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0].token_type, TokenType::IntegerLiteral($c)); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, $b.chars().count()); + assert_eq!(tokens[1].token_type, TokenType::EndOfFile); + } + }; +} + +macro_rules! lex_float_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b, &mut |_message, _span| { + unreachable!(); + }); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0].token_type, TokenType::FloatLiteral($c)); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, $b.chars().count()); + assert_eq!(tokens[1].token_type, TokenType::EndOfFile); + } + }; +} + +macro_rules! lex_string_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b, &mut |_message, _span| { + unreachable!(); + }); + assert_eq!(tokens.len(), 2); + assert_eq!( + tokens[0].token_type, + TokenType::StringLiteral($c.to_string()) + ); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, $b.chars().count()); + assert_eq!(tokens[1].token_type, TokenType::EndOfFile); + } + }; +} + +lex_token_test!(lex_space, " ", TokenType::WhiteSpace); +lex_token_test!(lex_tab, "\t", TokenType::WhiteSpace); +lex_token_test!(lex_return_line, "\r", TokenType::WhiteSpace); +lex_token_test!(lex_newline, "\n", TokenType::WhiteSpace); + +lex_token_test!(lex_equals, "=", TokenType::Equals); +lex_token_test!(lex_equals_equals, "==", TokenType::EqualsEquals); +lex_token_test!(lex_plus, "+", TokenType::Plus); +lex_token_test!(lex_plus_plus, "++", TokenType::PlusPlus); +lex_token_test!(lex_plus_equals, "+=", TokenType::PlusEquals); +lex_token_test!(lex_minus, "-", TokenType::Minus); +lex_token_test!(lex_minus_minus, "--", TokenType::MinusMinus); +lex_token_test!(lex_minus_equals, "-=", 
TokenType::MinusEquals); +lex_token_test!(lex_star, "*", TokenType::Star); +lex_token_test!(lex_star_equals, "*=", TokenType::StarEquals); +lex_token_test!(lex_star_star, "**", TokenType::StarStar); +lex_token_test!(lex_star_star_equals, "**=", TokenType::StarStarEquals); +lex_token_test!(lex_slash, "/", TokenType::Slash); +lex_token_test!(lex_slash_equals, "/=", TokenType::SlashEquals); +lex_token_test!(lex_percent, "%", TokenType::Percent); +lex_token_test!(lex_percent_equals, "%=", TokenType::PercentEquals); + +lex_token_test!(lex_exclamation_mark, "!", TokenType::ExclamationMark); +lex_token_test!(lex_not_equals, "!=", TokenType::NotEquals); +lex_token_test!(lex_not_is_keyword, "!is", TokenType::NotIsKeyword); + +lex_token_test!(lex_vert_line, "|", TokenType::VerticalLine); +lex_token_test!(lex_vert_line_equals, "|=", TokenType::LineEquals); +lex_token_test!(lex_line_line, "||", TokenType::LineLine); + +lex_token_test!(lex_ampersand, "&", TokenType::Ampersand); +lex_token_test!(lex_ampersand_equals, "&=", TokenType::AmpersandEquals); +lex_token_test!(lex_ampersand_ampersand, "&&", TokenType::AmpersandAmpersand); +lex_token_test!(lex_less_than, "<", TokenType::LessThan); +lex_token_test!(lex_less_than_equals, "<=", TokenType::LessThanEquals); +lex_token_test!(lex_left_left, "<<", TokenType::LeftLeft); +lex_token_test!(lex_left_left_equals, "<<=", TokenType::LeftLeftEquals); + +lex_token_test!(lex_greater_than, ">", TokenType::GreaterThan); +lex_token_test!(lex_greater_than_equals, ">=", TokenType::GreaterThanEquals); +lex_token_test!(lex_right_right, ">>", TokenType::RightRight); +lex_token_test!(lex_right_right_equals, ">>=", TokenType::RightRightEquals); +lex_token_test!(lex_right_right_right, ">>>", TokenType::RightRightRight); +lex_token_test!( + lex_right_right_right_equals, + ">>>=", + TokenType::RightRightRightEquals +); + +lex_token_test!(lex_tilde, "~", TokenType::Tilde); +lex_token_test!(lex_at_symbol, "@", TokenType::AtSymbol); + 
+lex_token_test!(lex_and_keyword, "and", TokenType::AndKeyword); +lex_token_test!(lex_abstract_keyword, "abstract", TokenType::AbstractKeyword); +lex_token_test!(lex_auto_keyword, "auto", TokenType::AutoKeyword); +lex_token_test!(lex_bool_keyword, "bool", TokenType::BoolKeyword); +lex_token_test!(lex_break_keyword, "break", TokenType::BreakKeyword); +lex_token_test!(lex_case_keyword, "case", TokenType::CaseKeyword); +lex_token_test!(lex_cast_keyword, "cast", TokenType::CastKeyword); +lex_token_test!(lex_catch_keyword, "catch", TokenType::CatchKeyword); +lex_token_test!(lex_class_keyword, "class", TokenType::ClassKeyword); +lex_token_test!(lex_const_keyword, "const", TokenType::ConstKeyword); +lex_token_test!(lex_continue_keyword, "continue", TokenType::ContinueKeyword); +lex_token_test!(lex_default_keyword, "default", TokenType::DefaultKeyword); +lex_token_test!(lex_do_keyword, "do", TokenType::DoKeyword); +lex_token_test!(lex_double_keyword, "double", TokenType::DoubleKeyword); +lex_token_test!(lex_else_keyword, "else", TokenType::ElseKeyword); +lex_token_test!(lex_enum_keyword, "enum", TokenType::EnumKeyword); +lex_token_test!(lex_explicit_keyword, "explicit", TokenType::ExplicitKeyword); +lex_token_test!(lex_external_keyword, "external", TokenType::ExternalKeyword); +lex_token_test!(lex_false_keyword, "false", TokenType::FalseKeyword); +lex_token_test!(lex_final_keyword, "final", TokenType::FinalKeyword); +lex_token_test!(lex_float_keyword, "float", TokenType::FloatKeyword); +lex_token_test!(lex_for_keyword, "for", TokenType::ForKeyword); +lex_token_test!(lex_from_keyword, "from", TokenType::FromKeyword); +lex_token_test!(lex_funcdef_keyword, "funcdef", TokenType::FuncDefKeyword); +lex_token_test!(lex_function_keyword, "function", TokenType::FunctionKeyword); +lex_token_test!(lex_get_keyword, "get", TokenType::GetKeyword); +lex_token_test!(lex_if_keyword, "if", TokenType::IfKeyword); +lex_token_test!(lex_import_keyword, "import", TokenType::ImportKeyword); 
+lex_token_test!(lex_in_keyword, "in", TokenType::InKeyword); +lex_token_test!(lex_inout_keyword, "inout", TokenType::InOutKeyword); +lex_token_test!(lex_int_keyword, "int", TokenType::IntKeyword); +lex_token_test!( + lex_interface_keyword, + "interface", + TokenType::InterfaceKeyword +); +lex_token_test!(lex_int8_keyword, "int8", TokenType::Int8Keyword); +lex_token_test!(lex_int16_keyword, "int16", TokenType::Int16Keyword); +lex_token_test!(lex_int32_keyword, "int32", TokenType::Int32Keyword); +lex_token_test!(lex_int64_keyword, "int64", TokenType::Int64Keyword); +lex_token_test!(lex_is_keyword, "is", TokenType::IsKeyword); +lex_token_test!(lex_mixin_keyword, "mixin", TokenType::MixinKeyword); +lex_token_test!( + lex_namespace_keyword, + "namespace", + TokenType::NamespaceKeyword +); +lex_token_test!(lex_not_keyword, "not", TokenType::NotKeyword); +lex_token_test!(lex_null_keyword, "null", TokenType::NullKeyword); +lex_token_test!(lex_or_keyword, "or", TokenType::OrKeyword); +lex_token_test!(lex_out_keyword, "out", TokenType::OutKeyword); +lex_token_test!(lex_override_keyword, "override", TokenType::OverrideKeyword); +lex_token_test!(lex_private_keyword, "private", TokenType::PrivateKeyword); +lex_token_test!(lex_property_keyword, "property", TokenType::PropertyKeyword); +lex_token_test!( + lex_protected_keyword, + "protected", + TokenType::ProtectedKeyword +); + +lex_token_test!(lex_return_keyword, "return", TokenType::ReturnKeyword); +lex_token_test!(lex_set_keyword, "set", TokenType::SetKeyword); +lex_token_test!(lex_shared_keyword, "shared", TokenType::SharedKeyword); +lex_token_test!(lex_super_keyword, "super", TokenType::SuperKeyword); +lex_token_test!(lex_switch_keyword, "switch", TokenType::SwitchKeyword); +lex_token_test!(lex_this_keyword, "this", TokenType::ThisKeyword); +lex_token_test!(lex_true_keyword, "true", TokenType::TrueKeyword); +lex_token_test!(lex_try_keyword, "try", TokenType::TryKeyword); +lex_token_test!(lex_typedef_keyword, "typedef", 
TokenType::TypeDefKeyword); +lex_token_test!(lex_uint_keyword, "uint", TokenType::UintKeyword); +lex_token_test!(lex_uint8_keyword, "uint8", TokenType::Uint8Keyword); +lex_token_test!(lex_uint16_keyword, "uint16", TokenType::Uint16Keyword); +lex_token_test!(lex_uint32_keyword, "uint32", TokenType::Uint32Keyword); + +lex_token_test!(lex_void_keyword, "void", TokenType::VoidKeyword); +lex_token_test!(lex_while_keyword, "while", TokenType::WhileKeyword); +lex_token_test!(lex_xor_keyword, "xor", TokenType::XorKeyword); + +lex_identifier_test!(lex_basic_identifier_foo, "foo"); +lex_identifier_test!(lex_basic_identifier_foobar, "foobar"); + +lex_integer_test!(lex_zero, "0", 0); +lex_integer_test!(lex_one_two_three_four, "1234", 1234); +lex_integer_test!(lex_specific_one_two_three_four, "0d1234", 1234); +lex_integer_test!(lex_decimal_with_underline, "123_456", 123456); +lex_integer_test!(lex_specific_decimal_with_underline, "0D123_456", 123456); +lex_integer_test!(lex_hexadecimal_0f, "0X0F", 15); +lex_integer_test!(lex_hexadecimal_ff, "0xff", 255); +lex_integer_test!(lex_hexadecimal_ff_ff, "0xff_ff", 65535); +lex_integer_test!(lex_octal_112, "0o112", 74); +lex_integer_test!(lex_binary_1110, "0b1110", 14); +lex_integer_test!(lex_binary_01110, "0b01110", 14); + +lex_float_test!(lex_zero_float, "0.0", 0.0); +lex_float_test!(lex_half, "0.5", 0.5); +lex_float_test!(lex_point_0_5, "0.05", 0.05); +lex_float_test!(lex_half_with_exponent, "0.5e10", 0.5e10); + +lex_string_test!(lex_simple_string, "\"foo\"", "foo"); +lex_string_test!(lex_simple_string_single_quote, "\'foo\'", "foo"); +lex_string_test!(lex_string_with_escape, "\"fo\\\"o\"", "fo\"o"); +lex_string_test!(lex_string_with_new_line, "\"fo\\no\"", "fo\no"); +lex_string_test!(lex_heredoc_string, "\"\"\"foo\"\"\"", "foo"); +lex_string_test!(lex_heredoc_string_with_quote, "\"\"\"fo\"o\"\"\"", "fo\"o"); + +#[test] +fn lex_two_identifier() { + let tokens = lex("foo bar", &mut |_message, _span| {}); + assert_eq!(tokens.len(), 4); 
+ assert_eq!( + tokens[0].token_type, + TokenType::Identifier("foo".to_string()) + ); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, 3); + assert_eq!(tokens[1].token_type, TokenType::WhiteSpace); + assert_eq!( + tokens[2].token_type, + TokenType::Identifier("bar".to_string()) + ); + assert_eq!(tokens[2].span.start, 4); + assert_eq!(tokens[2].span.end, 7); + + assert_eq!(tokens[3].token_type, TokenType::EndOfFile); +} + +#[test] +fn lex_multiple_tokens_with_not_is() { + let tokens = lex("a !is b", &mut |_message, _span| {}); + assert_eq!(tokens.len(), 6); + assert_eq!(tokens[0].token_type, TokenType::Identifier("a".to_string())); + assert_eq!(tokens[0].span.start, 0); + assert_eq!(tokens[0].span.end, 1); + assert_eq!(tokens[1].token_type, TokenType::WhiteSpace); + assert_eq!(tokens[1].span.start, 1); + assert_eq!(tokens[1].span.end, 2); + assert_eq!(tokens[2].token_type, TokenType::NotIsKeyword); + assert_eq!(tokens[2].span.start, 2); + assert_eq!(tokens[2].span.end, 5); + assert_eq!(tokens[3].token_type, TokenType::WhiteSpace); + assert_eq!(tokens[3].span.start, 5); + assert_eq!(tokens[3].span.end, 6); + assert_eq!(tokens[4].token_type, TokenType::Identifier("b".to_string())); + assert_eq!(tokens[4].span.start, 6); + assert_eq!(tokens[4].span.end, 7); + assert_eq!(tokens[5].token_type, TokenType::EndOfFile); + assert_eq!(tokens[5].span.start, 7); + assert_eq!(tokens[5].span.end, 7); +} + +#[test] +fn lex_invalid_character_at_first_position() { + let mut reached = false; + lex("\x08", &mut |message, span| { + reached = true; + assert_matches!(message, Message::UnexpectedCharacter('\x08')); + assert_eq!(span.start, 0); + assert_eq!(span.end, 1); + }); + assert!(reached); +} + +#[test] +fn lex_invalid_character_at_other_position() { + let mut reached = false; + lex(" \x08", &mut |message, span| { + reached = true; + assert_matches!(message, Message::UnexpectedCharacter('\x08')); + assert_eq!(span.start, 2); + assert_eq!(span.end, 3); + }); + 
assert!(reached); +} + +#[test] +fn lex_unclosed_string_literal() { + let mut reached = false; + lex("\" ", &mut |message, span| { + reached = true; + assert_matches!(message, Message::UnclosedStringLiteral); + assert_eq!(span.start, 5); + assert_eq!(span.end, 6); + }); + assert!(reached); +} diff --git a/src/parsing/lexer/mod.rs b/src/parsing/lexer/mod.rs new file mode 100644 index 0000000..a9d031d --- /dev/null +++ b/src/parsing/lexer/mod.rs @@ -0,0 +1,462 @@ +use crate::logger::messages::Message; +use crate::parsing::lexer::lex_tokens::LexToken; +use crate::span::Span; +use lex_numerical::lex_numeric; +use lex_tokens::TokenType; +use string_walker::StringWalker; + +mod lex_numerical; +pub mod lex_tokens; +#[cfg(test)] +mod lexer_tests; +mod string_walker; + +#[inline(always)] +fn lex_and_consume( + chars: &mut StringWalker, + eq: TokenType, + f: &mut dyn FnMut(TokenType, usize, usize), +) { + chars.next(); + f(eq, chars.real_position - 1, chars.real_position) +} + +#[inline(always)] +fn lex_eq_or( + chars: &mut StringWalker, + eq: TokenType, + or: TokenType, + start_pos: usize, + f: &mut dyn FnMut(TokenType, usize, usize), +) { + chars.next(); + if let Some('=') = chars.peek() { + chars.next(); + f(eq, start_pos, chars.real_position); + } else { + f(or, start_pos, chars.real_position); + } +} + +#[inline(always)] +fn lex_eq_rep_or( + chars: &mut StringWalker, + v: char, + eq: TokenType, + rep: TokenType, + or: TokenType, + f: &mut dyn FnMut(TokenType, usize, usize), +) { + let start_pos = chars.real_position; + chars.next(); + match chars.peek() { + Some(c) => { + if *c == v { + chars.next(); + f(rep, start_pos, chars.real_position); + } else if *c == '=' { + chars.next(); + f(eq, start_pos, chars.real_position); + } else { + f(or, start_pos, chars.real_position); + } + } + None => f(or, start_pos, chars.real_position), + }; +} + +type TT = TokenType; + +fn lex_keyword_or_identifier(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) { + let 
mut reading = true; + let mut length = 1; + let start_pos = chars.real_position; + while reading { + match chars.peek() { + Some(c) => match c { + 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => { + length += 1; + } + _ => { + reading = false; + } + }, + None => { + reading = false; + } + }; + } + chars.reset_peek(); + let c: String = chars.take(length); + + let token_type = match c.as_str() { + "and" => TT::AndKeyword, + "abstract" => TT::AbstractKeyword, + "auto" => TT::AutoKeyword, + "bool" => TT::BoolKeyword, + "break" => TT::BreakKeyword, + "case" => TT::CaseKeyword, + "cast" => TT::CastKeyword, + "catch" => TT::CatchKeyword, + "class" => TT::ClassKeyword, + "const" => TT::ConstKeyword, + "continue" => TT::ContinueKeyword, + "default" => TT::DefaultKeyword, + "do" => TT::DoKeyword, + "double" => TT::DoubleKeyword, + "else" => TT::ElseKeyword, + "enum" => TT::EnumKeyword, + "explicit" => TT::ExplicitKeyword, + "external" => TT::ExternalKeyword, + "false" => TT::FalseKeyword, + "final" => TT::FinalKeyword, + "float" => TT::FloatKeyword, + "for" => TT::ForKeyword, + "from" => TT::FromKeyword, + "funcdef" => TT::FuncDefKeyword, + "function" => TT::FunctionKeyword, + "get" => TT::GetKeyword, + "if" => TT::IfKeyword, + "import" => TT::ImportKeyword, + "in" => TT::InKeyword, + "inout" => TT::InOutKeyword, + "int" => TT::IntKeyword, + "interface" => TT::InterfaceKeyword, + "int8" => TT::Int8Keyword, + "int16" => TT::Int16Keyword, + "int32" => TT::Int32Keyword, + "int64" => TT::Int64Keyword, + "is" => TT::IsKeyword, + "mixin" => TT::MixinKeyword, + "namespace" => TT::NamespaceKeyword, + "not" => TT::NotKeyword, + "null" => TT::NullKeyword, + "or" => TT::OrKeyword, + "out" => TT::OutKeyword, + "override" => TT::OverrideKeyword, + "private" => TT::PrivateKeyword, + "property" => TT::PropertyKeyword, + "protected" => TT::ProtectedKeyword, + "return" => TT::ReturnKeyword, + "set" => TT::SetKeyword, + "shared" => TT::SharedKeyword, + "super" => TT::SuperKeyword, + "switch" => 
TT::SwitchKeyword,
        "this" => TT::ThisKeyword,
        "true" => TT::TrueKeyword,
        "try" => TT::TryKeyword,
        "typedef" => TT::TypeDefKeyword,
        "uint" => TT::UintKeyword,
        "uint8" => TT::Uint8Keyword,
        "uint16" => TT::Uint16Keyword,
        "uint32" => TT::Uint32Keyword,
        "uint64" => TT::Uint64Keyword,
        "void" => TT::VoidKeyword,
        "while" => TT::WhileKeyword,
        "xor" => TT::XorKeyword,
        // Anything that is not a reserved word is a plain identifier.
        _ => TT::Identifier(c),
    };
    f(token_type, start_pos, chars.real_position);
}

/// Lexes a single (possibly heredoc) string literal starting at the current
/// position of `chars`.
///
/// `opening_char` is the quote that opened the literal (`'` or `"`);
/// `heredoc` marks a `"""…"""` literal, whose delimiters are three characters
/// wide. Problems are reported through `log`; the finished token is handed to
/// `f` together with its start/end positions.
///
/// The literal is scanned twice: the first pass (peeking only) determines how
/// many characters to consume (`length`) and how long the unescaped result
/// will be (`string_length`), so the second pass can build the string with a
/// single allocation.
fn lex_string(
    chars: &mut StringWalker,
    opening_char: &char,
    heredoc: bool,
    log: &mut dyn FnMut(Message, Span),
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    let start_pos = chars.real_position;
    chars.next(); // consume the opening quote
    if heredoc {
        // A heredoc delimiter is three quotes; consume the remaining two.
        chars.next();
        chars.next();
    }
    let mut length: i32 = 0; // characters to consume in the second pass
    let mut string_length = 0; // characters in the resulting string
    let mut last_was_control = false; // previous char was an unescaped backslash

    // First pass: measure the literal without consuming it.
    loop {
        let p = chars.peek();
        match p {
            None => {
                // Ran off the end of the input before the closing quote.
                log(
                    Message::UnclosedStringLiteral,
                    Span::new(chars.peek_position - 1, chars.peek_position),
                );
                break;
            }
            Some(&'\\') if !last_was_control => {
                // Escape introducer: consumed, but not part of the value.
                last_was_control = true;
                length += 1;
            }
            Some(c) => {
                if c == opening_char && !last_was_control {
                    if heredoc {
                        // A heredoc only closes on three consecutive quotes.
                        // NOTE(review): these two peek() calls advance the peek
                        // cursor past the next two characters, and the counters
                        // are incremented again after this `if` — an interior
                        // quote looks double-counted and two characters look
                        // skipped. Verify heredoc literals containing quotes.
                        if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
                            break;
                        } else {
                            length += 1;
                            string_length += 1;
                        }
                    } else {
                        break;
                    }
                }
                length += 1;
                string_length += 1;
                last_was_control = false;
            }
        }
    }
    chars.reset_peek();

    // Second pass: consume `length` characters, resolving escape sequences.
    let mut s: String = String::with_capacity(string_length);
    for _ in 0..length {
        let p = chars.next().unwrap();
        match p {
            '\\' => {
                if last_was_control {
                    s.push('\\');
                } else {
                    // Start of an escape; the next character decides its meaning.
                    last_was_control = true;
                    continue;
                }
            }
            '0' if last_was_control => s.push('\0'),
            'n' if last_was_control => s.push('\n'),
            'r' if last_was_control => s.push('\r'),
            't' if last_was_control => s.push('\t'),
            _ => s.push(p),
        };
        last_was_control = false;
    }
    assert_eq!(s.len(), string_length);
    chars.reset_peek();
    chars.next(); // consume the closing quote
    if heredoc {
        chars.next();
        chars.next();
    }

    f(TT::StringLiteral(s), start_pos, chars.real_position);
}

/// Tokenizes the source text `s` into a flat list of `LexToken`s.
///
/// Lexing never aborts: problems are reported through `log` as `Message`s
/// with a byte `Span`, and the returned vector always ends with an
/// `EndOfFile` token.
pub fn lex(s: &str, log: &mut dyn FnMut(Message, Span)) -> Vec<LexToken> {
    let mut tokens: Vec<LexToken> = Vec::new();
    let mut chars = StringWalker::create(s);
    let mut reading = true;

    // Sink shared by all helpers: attaches a span to every produced token.
    let mut add_token = |token_type: TokenType, start: usize, end: usize| {
        tokens.push(LexToken {
            token_type,
            span: Span::new(start, end),
        })
    };

    while reading {
        let p = chars.peek().cloned();
        match p {
            Some(c) => match c {
                ' ' | '\t' | '\r' | '\n' => {
                    chars.next();
                    add_token(TT::WhiteSpace, chars.real_position - 1, chars.real_position);
                }
                '=' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::EqualsEquals,
                        TT::Equals,
                        start_pos,
                        &mut add_token,
                    )
                }
                '+' => lex_eq_rep_or(
                    &mut chars,
                    '+',
                    TT::PlusEquals,
                    TT::PlusPlus,
                    TT::Plus,
                    &mut add_token,
                ),
                '-' => lex_eq_rep_or(
                    &mut chars,
                    '-',
                    TT::MinusEquals,
                    TT::MinusMinus,
                    TT::Minus,
                    &mut add_token,
                ),
                '*' => {
                    // `*`, `*=`, `**` or `**=`.
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'*') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::StarStarEquals,
                            TT::StarStar,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::StarEquals,
                            TT::Star,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '/' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::SlashEquals,
                        TT::Slash,
                        start_pos,
                        &mut add_token,
                    );
                }
                '%' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::PercentEquals,
                        TT::Percent,
                        start_pos,
                        &mut add_token,
                    );
                }
                '|' => lex_eq_rep_or(
                    &mut chars,
                    '|',
                    TT::LineEquals,
                    TT::LineLine,
                    TT::VerticalLine,
                    &mut add_token,
                ),
                '&' => lex_eq_rep_or(
                    &mut chars,
                    '&',
                    TT::AmpersandEquals,
                    TT::AmpersandAmpersand,
                    TT::Ampersand,
                    &mut add_token,
                ),
                '^' => lex_eq_rep_or(
                    &mut chars,
                    '^',
                    TT::RoofEquals,
                    TT::RoofRoof,
                    TT::Roof,
                    &mut add_token,
                ),
                '<' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'<') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::LeftLeftEquals,
                            TT::LeftLeft,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::LessThanEquals,
                            TT::LessThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '>' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'>') {
                        // Each peek advances one position, so this second peek
                        // inspects the third character: `>>>…` vs `>>…`.
                        if chars.peek() == Some(&'>') {
                            chars.next();
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightRightEquals,
                                TT::RightRightRight,
                                start_pos,
                                &mut add_token,
                            )
                        } else {
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightEquals,
                                TT::RightRight,
                                start_pos,
                                &mut add_token,
                            )
                        }
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::GreaterThanEquals,
                            TT::GreaterThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '!' => {
                    // `!`, `!=` or the `!is` keyword.
                    let start_pos = chars.real_position;
                    let next = chars.peek();
                    if next == Some(&'=') {
                        chars.next();
                        chars.next();
                        add_token(TT::NotEquals, start_pos, chars.real_position);
                    } else if next == Some(&'i') && chars.peek() == Some(&'s') {
                        chars.next();
                        chars.next();
                        chars.next();
                        add_token(TT::NotIsKeyword, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::ExclamationMark, start_pos, chars.real_position);
                    }
                }

                '~' => lex_and_consume(&mut chars, TT::Tilde, &mut add_token),
                '@' => lex_and_consume(&mut chars, TT::AtSymbol, &mut add_token),
                ';' => lex_and_consume(&mut chars, TT::Semicolon, &mut add_token),
                ':' => lex_and_consume(&mut chars, TT::Colon, &mut add_token),

                '(' => lex_and_consume(&mut chars, TT::OpenBracket, &mut add_token),
                ')' => lex_and_consume(&mut chars, TT::CloseBracket, &mut add_token),
                '{' => lex_and_consume(&mut chars, TT::OpenCurlyBracket, &mut add_token),
                '}' => lex_and_consume(&mut chars, TT::CloseCurlyBracket, &mut add_token),
                '[' => lex_and_consume(&mut chars, TT::OpenBlockBracket, &mut add_token),
                ']' => lex_and_consume(&mut chars, TT::CloseBlockBracket, &mut add_token),

                // BUG FIX: these were exclusive range patterns (`'0'..'9'`,
                // `'a'..'z' | 'A'..'Z'`), which silently excluded the digit `9`
                // and the letters `z`/`Z` from numeric and identifier starts.
                '0'..='9' => lex_numeric(&mut chars, &mut add_token),
                'a'..='z' | 'A'..='Z' | '_' => {
                    lex_keyword_or_identifier(&mut chars, &mut add_token)
                }
                '\'' => lex_string(&mut chars, &'\'', false, log, &mut add_token),
                // Three consecutive double quotes start a heredoc literal; the
                // two guard peeks examine the second and third characters.
                '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
                    lex_string(&mut chars, &'"', true, log, &mut add_token)
                }
                '"' => lex_string(&mut chars, &'"', false, log, &mut add_token),

                _ => {
                    log(
                        Message::UnexpectedCharacter(c),
                        Span::new(chars.real_position, chars.real_position + 1),
                    );
                    // BUG FIX: skip the offending character. Previously it was
                    // never consumed, so `real_position` lagged behind and every
                    // following `take()`/`next()` read stale characters.
                    chars.next();
                }
            },
            None => {
                add_token(TT::EndOfFile, chars.real_position, chars.real_position);
                reading = false;
            }
        }
    }
    tokens
}
// --- src/parsing/lexer/string_walker.rs (new file) ---
use itertools::{Itertools, MultiPeek};
use std::str::Chars;

/// Character cursor over the source text that tracks two positions: the
/// "real" position (characters actually consumed) and a speculative peek
/// position that runs ahead of it.
pub(super) struct StringWalker<'a> {
    chars: MultiPeek<Chars<'a>>,
    // Number of characters consumed so far; used by the lexer as the byte
    // index for token spans.
    pub(crate) real_position: usize,
    // One past the position of the most recently peeked character.
    pub(crate) peek_position: usize,
}

impl<'a> StringWalker<'a> {
    /// Creates a walker positioned at the start of `s`.
    pub fn create(s: &str) -> StringWalker {
        StringWalker {
            chars: s.chars().multipeek(),
            real_position: 0,
            peek_position: 0,
        }
    }

    /// Peeks the next not-yet-peeked character; each call looks one position
    /// further ahead. NOTE(review): `peek_position` is incremented even when
    /// the input is exhausted and `None` is returned — `lex_string` relies on
    /// this when reporting unclosed literals, so do not "fix" it casually.
    pub fn peek(&mut self) -> Option<&char> {
        self.peek_position += 1;
        self.chars.peek()
    }

    /// Consumes one character and realigns the peek cursor with the real
    /// position (MultiPeek::next resets its internal peek cursor).
    pub fn next(&mut self) -> Option<char> {
        self.real_position += 1;
        self.peek_position = self.real_position;
        self.chars.next()
    }

    /// Rewinds the peek cursor back to the real position.
    pub fn reset_peek(&mut self) {
        self.peek_position = self.real_position;
        self.chars.reset_peek()
    }

    /// Consumes `length` characters and returns them as a `String`.
    /// NOTE(review): `real_position` is advanced by `length` even if the
    /// underlying iterator yields fewer characters — confirm callers never
    /// over-request (e.g. a miscounted first pass in `lex_string`).
    pub fn take(&mut self, length: usize) -> String {
        self.real_position += length;
        self.peek_position = self.real_position;
        self.chars.by_ref().take(length).collect()
    }
}
lex_float_test { - ( $a: ident, $b: expr, $c: expr) => { - #[test] - fn $a() { - let tokens = lex($b); - assert_eq!(tokens.len(), 2); - assert_eq!(tokens[0], LexToken::FloatLiteral($c)); - assert_eq!(tokens[1], LexToken::EndOfFile); - } - }; -} - -macro_rules! lex_string_test { - ( $a: ident, $b: expr, $c: expr) => { - #[test] - fn $a() { - let tokens = lex($b); - assert_eq!(tokens.len(), 2); - assert_eq!(tokens[0], LexToken::StringLiteral($c.to_string())); - assert_eq!(tokens[1], LexToken::EndOfFile); - } - }; -} - -lex_token_test!(lex_space, " ", LexToken::WhiteSpace); -lex_token_test!(lex_tab, "\t", LexToken::WhiteSpace); -lex_token_test!(lex_return_line, "\r", LexToken::WhiteSpace); -lex_token_test!(lex_newline, "\n", LexToken::WhiteSpace); - -lex_token_test!(lex_equals, "=", LexToken::Equals); -lex_token_test!(lex_equals_equals, "==", LexToken::EqualsEquals); -lex_token_test!(lex_plus, "+", LexToken::Plus); -lex_token_test!(lex_plus_plus, "++", LexToken::PlusPlus); -lex_token_test!(lex_plus_equals, "+=", LexToken::PlusEquals); -lex_token_test!(lex_minus, "-", LexToken::Minus); -lex_token_test!(lex_minus_minus, "--", LexToken::MinusMinus); -lex_token_test!(lex_minus_equals, "-=", LexToken::MinusEquals); -lex_token_test!(lex_star, "*", LexToken::Star); -lex_token_test!(lex_star_equals, "*=", LexToken::StarEquals); -lex_token_test!(lex_star_star, "**", LexToken::StarStar); -lex_token_test!(lex_star_star_equals, "**=", LexToken::StarStarEquals); -lex_token_test!(lex_slash, "/", LexToken::Slash); -lex_token_test!(lex_slash_equals, "/=", LexToken::SlashEquals); -lex_token_test!(lex_percent, "%", LexToken::Percent); -lex_token_test!(lex_percent_equals, "%=", LexToken::PercentEquals); - -lex_token_test!(lex_exclamation_mark, "!", LexToken::ExclamationMark); -lex_token_test!(lex_not_equals, "!=", LexToken::NotEquals); -lex_token_test!(lex_not_is_keyword, "!is", LexToken::NotIsKeyword); - -lex_token_test!(lex_vert_line, "|", LexToken::VerticalLine); 
-lex_token_test!(lex_vert_line_equals, "|=", LexToken::LineEquals); -lex_token_test!(lex_line_line, "||", LexToken::LineLine); - -lex_token_test!(lex_ampersand, "&", LexToken::Ampersand); -lex_token_test!(lex_ampersand_equals, "&=", LexToken::AmpersandEquals); -lex_token_test!(lex_ampersand_ampersand, "&&", LexToken::AmpersandAmpersand); -lex_token_test!(lex_less_than, "<", LexToken::LessThan); -lex_token_test!(lex_less_than_equals, "<=", LexToken::LessThanEquals); -lex_token_test!(lex_left_left, "<<", LexToken::LeftLeft); -lex_token_test!(lex_left_left_equals, "<<=", LexToken::LeftLeftEquals); - -lex_token_test!(lex_greater_than, ">", LexToken::GreaterThan); -lex_token_test!(lex_greater_than_equals, ">=", LexToken::GreaterThanEquals); -lex_token_test!(lex_right_right, ">>", LexToken::RightRight); -lex_token_test!(lex_right_right_equals, ">>=", LexToken::RightRightEquals); -lex_token_test!(lex_right_right_right, ">>>", LexToken::RightRightRight); -lex_token_test!( - lex_right_right_right_equals, - ">>>=", - LexToken::RightRightRightEquals -); - -lex_token_test!(lex_tilde, "~", LexToken::Tilde); -lex_token_test!(lex_at_symbol, "@", LexToken::AtSymbol); - -lex_token_test!(lex_and_keyword, "and", LexToken::AndKeyword); -lex_token_test!(lex_abstract_keyword, "abstract", LexToken::AbstractKeyword); -lex_token_test!(lex_auto_keyword, "auto", LexToken::AutoKeyword); -lex_token_test!(lex_bool_keyword, "bool", LexToken::BoolKeyword); -lex_token_test!(lex_break_keyword, "break", LexToken::BreakKeyword); -lex_token_test!(lex_case_keyword, "case", LexToken::CaseKeyword); -lex_token_test!(lex_cast_keyword, "cast", LexToken::CastKeyword); -lex_token_test!(lex_catch_keyword, "catch", LexToken::CatchKeyword); -lex_token_test!(lex_class_keyword, "class", LexToken::ClassKeyword); -lex_token_test!(lex_const_keyword, "const", LexToken::ConstKeyword); -lex_token_test!(lex_continue_keyword, "continue", LexToken::ContinueKeyword); -lex_token_test!(lex_default_keyword, "default", 
LexToken::DefaultKeyword); -lex_token_test!(lex_do_keyword, "do", LexToken::DoKeyword); -lex_token_test!(lex_double_keyword, "double", LexToken::DoubleKeyword); -lex_token_test!(lex_else_keyword, "else", LexToken::ElseKeyword); -lex_token_test!(lex_enum_keyword, "enum", LexToken::EnumKeyword); -lex_token_test!(lex_explicit_keyword, "explicit", LexToken::ExplicitKeyword); -lex_token_test!(lex_external_keyword, "external", LexToken::ExternalKeyword); -lex_token_test!(lex_false_keyword, "false", LexToken::FalseKeyword); -lex_token_test!(lex_final_keyword, "final", LexToken::FinalKeyword); -lex_token_test!(lex_float_keyword, "float", LexToken::FloatKeyword); -lex_token_test!(lex_for_keyword, "for", LexToken::ForKeyword); -lex_token_test!(lex_from_keyword, "from", LexToken::FromKeyword); -lex_token_test!(lex_funcdef_keyword, "funcdef", LexToken::FuncDefKeyword); -lex_token_test!(lex_function_keyword, "function", LexToken::FunctionKeyword); -lex_token_test!(lex_get_keyword, "get", LexToken::GetKeyword); -lex_token_test!(lex_if_keyword, "if", LexToken::IfKeyword); -lex_token_test!(lex_import_keyword, "import", LexToken::ImportKeyword); -lex_token_test!(lex_in_keyword, "in", LexToken::InKeyword); -lex_token_test!(lex_inout_keyword, "inout", LexToken::InOutKeyword); -lex_token_test!(lex_int_keyword, "int", LexToken::IntKeyword); -lex_token_test!( - lex_interface_keyword, - "interface", - LexToken::InterfaceKeyword -); -lex_token_test!(lex_int8_keyword, "int8", LexToken::Int8Keyword); -lex_token_test!(lex_int16_keyword, "int16", LexToken::Int16Keyword); -lex_token_test!(lex_int32_keyword, "int32", LexToken::Int32Keyword); -lex_token_test!(lex_int64_keyword, "int64", LexToken::Int64Keyword); -lex_token_test!(lex_is_keyword, "is", LexToken::IsKeyword); -lex_token_test!(lex_mixin_keyword, "mixin", LexToken::MixinKeyword); -lex_token_test!( - lex_namespace_keyword, - "namespace", - LexToken::NamespaceKeyword -); -lex_token_test!(lex_not_keyword, "not", LexToken::NotKeyword); 
-lex_token_test!(lex_null_keyword, "null", LexToken::NullKeyword); -lex_token_test!(lex_or_keyword, "or", LexToken::OrKeyword); -lex_token_test!(lex_out_keyword, "out", LexToken::OutKeyword); -lex_token_test!(lex_override_keyword, "override", LexToken::OverrideKeyword); -lex_token_test!(lex_private_keyword, "private", LexToken::PrivateKeyword); -lex_token_test!(lex_property_keyword, "property", LexToken::PropertyKeyword); -lex_token_test!( - lex_protected_keyword, - "protected", - LexToken::ProtectedKeyword -); - -lex_token_test!(lex_return_keyword, "return", LexToken::ReturnKeyword); -lex_token_test!(lex_set_keyword, "set", LexToken::SetKeyword); -lex_token_test!(lex_shared_keyword, "shared", LexToken::SharedKeyword); -lex_token_test!(lex_super_keyword, "super", LexToken::SuperKeyword); -lex_token_test!(lex_switch_keyword, "switch", LexToken::SwitchKeyword); -lex_token_test!(lex_this_keyword, "this", LexToken::ThisKeyword); -lex_token_test!(lex_true_keyword, "true", LexToken::TrueKeyword); -lex_token_test!(lex_try_keyword, "try", LexToken::TryKeyword); -lex_token_test!(lex_typedef_keyword, "typedef", LexToken::TypeDefKeyword); -lex_token_test!(lex_uint_keyword, "uint", LexToken::UintKeyword); -lex_token_test!(lex_uint8_keyword, "uint8", LexToken::Uint8Keyword); -lex_token_test!(lex_uint16_keyword, "uint16", LexToken::Uint16Keyword); -lex_token_test!(lex_uint32_keyword, "uint32", LexToken::Uint32Keyword); - -lex_token_test!(lex_void_keyword, "void", LexToken::VoidKeyword); -lex_token_test!(lex_while_keyword, "while", LexToken::WhileKeyword); -lex_token_test!(lex_xor_keyword, "xor", LexToken::XorKeyword); - -lex_identifier_test!(lex_basic_identifier_foo, "foo"); -lex_identifier_test!(lex_basic_identifier_foobar, "foobar"); - -lex_integer_test!(lex_zero, "0", 0); -lex_integer_test!(lex_one_two_three_four, "1234", 1234); -lex_integer_test!(lex_specific_one_two_three_four, "0d1234", 1234); -lex_integer_test!(lex_decimal_with_underline, "123_456", 123456); 
-lex_integer_test!(lex_specific_decimal_with_underline, "0D123_456", 123456); -lex_integer_test!(lex_hexadecimal_0f, "0X0F", 15); -lex_integer_test!(lex_hexadecimal_ff, "0xff", 255); -lex_integer_test!(lex_hexadecimal_ff_ff, "0xff_ff", 65535); -lex_integer_test!(lex_octal_112, "0o112", 74); -lex_integer_test!(lex_binary_1110, "0b1110", 14); -lex_integer_test!(lex_binary_01110, "0b01110", 14); - -lex_float_test!(lex_zero_float, "0.0", 0.0); -lex_float_test!(lex_half, "0.5", 0.5); -lex_float_test!(lex_point_0_5, "0.05", 0.05); -lex_float_test!(lex_half_with_exponent, "0.5e10", 0.5e10); - -lex_string_test!(lex_simple_string, "\"foo\"", "foo"); -lex_string_test!(lex_simple_string_single_quote, "\'foo\'", "foo"); -lex_string_test!(lex_string_with_escape, "\"fo\\\"o\"", "fo\"o"); -lex_string_test!(lex_string_with_new_line, "\"fo\\no\"", "fo\no"); -lex_string_test!(lex_heredoc_string, "\"\"\"foo\"\"\"", "foo"); -lex_string_test!(lex_heredoc_string_with_quote, "\"\"\"fo\"o\"\"\"", "fo\"o"); - -#[test] -fn lex_two_identifier() { - let tokens = lex("foo bar"); - assert_eq!(tokens.len(), 4); - assert_eq!(tokens[0], LexToken::Identifier("foo".to_string())); - assert_eq!(tokens[1], LexToken::WhiteSpace); - assert_eq!(tokens[2], LexToken::Identifier("bar".to_string())); - assert_eq!(tokens[3], LexToken::EndOfFile); -} - -#[test] -fn lex_multiple_tokens_with_not_is() { - let tokens = lex("a !is b"); - assert_eq!(tokens.len(), 6); - assert_eq!(tokens[0], LexToken::Identifier("a".to_string())); - assert_eq!(tokens[1], LexToken::WhiteSpace); - assert_eq!(tokens[2], LexToken::NotIsKeyword); - assert_eq!(tokens[3], LexToken::WhiteSpace); - assert_eq!(tokens[4], LexToken::Identifier("b".to_string())); - assert_eq!(tokens[5], LexToken::EndOfFile); -} diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs index f51db11..591a845 100644 --- a/src/parsing/mod.rs +++ b/src/parsing/mod.rs @@ -1,6 +1,2 @@ -mod lex_numerical; -pub mod lex_tokens; pub mod lexer; - -#[cfg(test)] -mod lexer_tests; 
pub mod parser;

// --- src/parsing/parser/mod.rs (new file) ---

pub mod parsed_statement;
#[cfg(test)]
mod parser_tests;

use super::lexer::lex_tokens::TokenType;
use crate::parsing::parser::parsed_statement::ParsedStatement;
use itertools::{Itertools, MultiPeek};

/// Cursor over the lexed token stream that transparently skips whitespace
/// tokens and maps "end of stream" to `TokenType::EndOfFile`.
struct ParseReader<'a> {
    // NOTE(review): the generic arguments were lost in this copy of the diff;
    // `tokens.iter().multipeek()` in `parse` implies a multipeekable slice
    // iterator over `TokenType` — confirm against the original file.
    tokens: MultiPeek<std::slice::Iter<'a, TokenType>>,
}

impl<'a> ParseReader<'a> {
    /// Peeks the next non-whitespace token without consuming it. Each call
    /// advances the underlying peek cursor; `reset_peek` rewinds it.
    pub fn peek(&mut self) -> &TokenType {
        let t = self.tokens.peek();
        match t {
            None => &TokenType::EndOfFile,
            // Whitespace is irrelevant to the parser: look one further ahead.
            Some(TokenType::WhiteSpace) => self.peek(),
            Some(v) => v,
        }
    }

    /// Consumes and returns the next non-whitespace token.
    pub fn next(&mut self) -> &TokenType {
        let t = self.tokens.next();
        match t {
            None => &TokenType::EndOfFile,
            Some(TokenType::WhiteSpace) => self.next(),
            Some(v) => v,
        }
    }

    /// Consumes the next token, asserting that it equals `token`.
    pub fn consume(&mut self, token: TokenType) -> &TokenType {
        let n = self.next();
        if n != &token {
            // TODO: log error through the logger instead of aborting.
            unimplemented!()
        }
        n
    }

    #[inline(always)]
    pub fn reset_peek(&mut self) {
        self.tokens.reset_peek();
    }
}

/// Parses a lexed token stream into a `ParsedStatement::Script` tree.
pub fn parse(tokens: Vec<TokenType>) -> Box<ParsedStatement> {
    let mut reader = ParseReader {
        tokens: tokens.iter().multipeek(),
    };
    parse_script(&mut reader)
}

/// Collects statements until `EndOfFile` is peeked. Unrecognized tokens
/// (including a namespace body's closing `}`) are passed over by the peek
/// cursor only and never consumed, so the caller can still consume them.
fn parse_script(reader: &mut ParseReader) -> Box<ParsedStatement> {
    let mut vec: Vec<Box<ParsedStatement>> = Vec::new();
    loop {
        let n = reader.peek();
        match n {
            TokenType::NamespaceKeyword => {
                vec.push(parse_namespace(reader));
            }
            TokenType::EndOfFile => break,
            _ => {
                // Log error?
                // NOTE(review): unexpected tokens are neither consumed nor
                // reported; the advancing peek cursor makes this loop skip
                // them silently. Confirm this is intentional.
            }
        }
    }

    Box::new(ParsedStatement::Script(vec))
}

/// Parses `namespace <identifier> { <script> }`.
fn parse_namespace(reader: &mut ParseReader) -> Box<ParsedStatement> {
    reader.next(); // Consume namespace
    let identifier_token = reader.next();
    let s: String;
    match identifier_token {
        TokenType::Identifier(i) => {
            s = i.to_string();
        }
        _ => {
            // Log error
            unimplemented!();
        }
    }
    reader.consume(TokenType::OpenCurlyBracket);
    let script = parse_script(reader);
    reader.consume(TokenType::CloseCurlyBracket);
    Box::new(ParsedStatement::Namespace(s, script))
}

// --- src/parsing/parser/parsed_statement.rs (new file) ---

/// A node of the parse tree.
pub enum ParsedStatement {
    // A sequence of top-level statements.
    Script(Vec<Box<ParsedStatement>>),
    // `namespace <name> { … }`, with the body as a nested `Script`.
    Namespace(String, Box<ParsedStatement>),
}

// --- src/parsing/parser/parser_tests.rs (new file) ---

use super::parse;
use super::parsed_statement::ParsedStatement;
use crate::parsing::lexer::lex_tokens::TokenType;

/// `namespace foo { }` parses to `Script([Namespace("foo", Script([]))])`.
#[test]
fn test_empty_namespace() {
    let script = parse(vec![
        TokenType::NamespaceKeyword,
        TokenType::WhiteSpace,
        TokenType::Identifier("foo".to_string()),
        TokenType::WhiteSpace,
        TokenType::OpenCurlyBracket,
        TokenType::CloseCurlyBracket,
        TokenType::EndOfFile,
    ]);
    if let ParsedStatement::Script(inner) = script.as_ref() {
        assert_eq!(1, inner.len());
        if let ParsedStatement::Namespace(identifier, inner_script) = inner[0].as_ref() {
            assert_eq!(identifier, "foo");
            if let ParsedStatement::Script(inner) = inner_script.as_ref() {
                assert_eq!(0, inner.len());
            } else {
                unreachable!();
            }
        } else {
            unreachable!()
        }
    } else {
        unreachable!();
    }
}

// --- src/span.rs (new file) ---

/// A byte range `start..end` into the source text (end exclusive, as built
/// by the lexer's `real_position` bookkeeping).
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl Span {
    pub fn new(start: usize, end: usize) -> Span {
        Span { start, end }
    }
}