commit f1af568cb84c69b961827bdb979568d8b4e35419 Author: Deukhoofd Date: Sat May 15 16:53:53 2021 +0200 Initial commit, implements parser. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2e04901 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +Cargo.lock +.idea/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..93fe82c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "seraph_script" +version = "0.1.0" +authors = ["Deukhoofd "] +edition = "2018" + +[dependencies] +itertools = "0.10.0" \ No newline at end of file diff --git a/grammar.ebnf b/grammar.ebnf new file mode 100644 index 0000000..203477d --- /dev/null +++ b/grammar.ebnf @@ -0,0 +1,99 @@ +letter ::= 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' + | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' + | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' + | 'V' | 'W' | 'X' | 'Y' | 'Z' | 'a' | 'b' + | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' + | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' | 'p' + | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' + | 'x' | 'y' | 'z' ; + +all_characters ::= ; +digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ; +hexadecimal_digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | + '8' | '9' | 'A' | 'a' | 'B' | 'b' | 'C' | 'c' | + 'D' | 'd' | 'E' | 'e' | 'F' | 'f'; +octal_digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7'; +binary_digit ::= '0' | '1'; + +character ::= letter | digit | '_' ; + +identifier ::= (letter | '_') { character }; + +float ::= digit { digit } '.' 
digit {digit}; +dec_integer ::= ['0D' | '0d'] digit {digit}; +hex_integer ::= ('0X'|'0x') hexadecimal_digit {hexadecimal_digit}; +octal_integer ::= ('0O'|'0o') octal_digit {octal_digit}; +binary_integer ::= ('0B'|'0b') binary_digit {binary_digit}; +integer ::= dec_integer | hex_integer | octal_integer | binary_integer; +number ::= integer | float; + +string ::= ('\'' {all_characters} '\'' | '\"' {all_characters} '\"' | '\"\"\"' {all_characters} '\"\"\"'); + +assignop ::= '=' | '+=' | '-=' | '*=' | '/=' | '|=' | '&=' | '^=' | '%=' | '**=' | '<<=' | '>>=' | '>>>='; +logicop ::= '&&' | '||' | '^^' | 'and' | 'or' | 'xor'; +compop ::= '==' | '!=' | '<' | '<=' | '>' | '>=' | 'is' | '!is'; +mathop ::= '+' | '-' | '*' | '/' | '%' | '**'; +bitop ::= '&' | '|' | '^' | '<<' | '>>' | '>>>'; + + +primtype ::= 'void' | 'int' | 'int8' | 'int16' | 'int32' | 'int64' | 'uint' | 'uint8' | 'uint16' | + 'uint32' | 'uint64' | 'float' | 'double' | 'bool'; +datatype ::= (identifier | primtype | 'auto'); +scope ::= ['::'] {identifier '::'} [identifier ['<' type {',' type} '>'] '::']; +type ::= ['const'] scope datatype ['<' type {',' type} '>'] { ('[' ']') | ('@' ['const']) }; +# ternary is defined further below due to a circular dependency: ternary->expr->exprterm->initlist->assign +assign ::= ternary [ assignop assign ]; +initlist ::= '{' [assign | initlist] {',' [assign | initlist]} '}'; +exprpreop ::= '-' | '+' | '!' | '++' | '--' | '~' | '@'; +arglist ::= '(' [identifier ':'] assign {',' [identifier ':'] assign} ')'; +funccall ::= scope identifier arglist; +constructcall ::= type arglist; +varaccess ::= scope | identifier; +cast ::= 'cast' '<' type '>' '(' assign ')'; +literal ::= number | string | 'true' | 'false' | 'null'; +typemod ::= ['&' ['in' | 'out' | 'inout']]; +# statblock is defined further below, as statements are higher level than expressions. 
+lambda ::= 'function' '(' [[type typemod] identifier {',' [type typemod] identifier}] ')' statblock; + +exprvalue ::= 'void' | constructcall | funccall | varaccess | cast | literal | '(' assign ')' | lambda; +exprpostop ::= ('.' (funccall | identifier)) | ('[' [identifier ':'] assign {',' [identifier ':'] assign} ']') | arglist | '++' | '--'; +exprterm ::= ([type '='] initlist) | ({exprpreop} exprvalue {exprpostop}); +expr ::= exprterm {(mathop | compop | logicop | bitop) exprterm}; +ternary ::= expr ['?' assign ':' assign]; + +return ::= 'return' [assign] ';'; +exprstat ::= assign ';'; +continue ::= 'continue' ';'; +break ::= 'break' ';'; + +# As these are all statements using other statements, they use the statement and statblock types defined further below. +if ::= 'if' '(' assign ')' statement ['else' statement]; +for ::= 'for' '(' (var | exprstat) exprstat [assign {',' assign}] ')' statement; +while ::= 'while' '(' assign ')' statement; +dowhile ::= 'do' statement 'while' '(' assign ')' ';'; +try ::= 'try' statblock 'catch' statblock; +case ::= (('case' expr) | 'default') ':' {statement}; +switch ::= 'switch' '(' assign ')' '{' {case} '}'; + +statement ::= (if | for | while | return | statblock | break | continue | dowhile | switch | exprstat | try ); +var ::= ['private'|'protected'] type identifier [( '=' (initlist | expr)) | arglist] {',' identifier [( '=' (initlist | expr)) | arglist]} ';'; +statblock ::= '{' {var | statement} '}'; + +funcattr ::= {'override' | 'final' | 'explicit' | 'property'}; +paramlist ::= '(' ['void' | (type typemod [identifier] ['=' expr] {',' type typemod [identifier] ['=' expr]})] ')'; + +virtprop ::= ['private' | 'protected'] type ['&'] identifier '{' {('get' | 'set') ['const'] funcattr (statblock | ';')} '}'; +func ::= {'shared' | 'external'} ['private' | 'protected'] [((type ['&']) | '~')] identifier paramlist ['const'] funcattr (';' | statblock); +funcdef ::= {'external' | 'shared'} 'funcdef' type ['&'] identifier paramlist ';'; 
+class ::= {'shared' | 'abstract' | 'final' | 'external'} 'class' identifier + (';' | ([':' identifier {',' identifier}] '{' {virtprop | func | var | funcdef | class} '}')); +mixin ::= 'mixin' class; +enum ::= {'shared' | 'external'} 'enum' identifier [ ':' primtype ] (';' | ('{' identifier ['=' expr] {',' identifier ['=' expr]} '}')); +import ::= 'import' type ['&'] identifier paramlist funcattr 'from' string ';'; +typedef ::= 'typedef' (primtype | identifier) identifier ';'; + +interfacemethod ::= type ['&'] identifier paramlist ['const'] ';'; +interface ::= {'external' | 'shared'} 'interface' identifier (';' | ([':' identifier {',' identifier}] '{' {virtprop | interfacemethod} '}')); + +namespace ::= 'namespace' identifier '{' script '}'; +script ::= {import | enum | typedef | class | mixin | interface | funcdef | virtprop | var | func | namespace | ';'}; diff --git a/src/defines.rs b/src/defines.rs new file mode 100644 index 0000000..8df6423 --- /dev/null +++ b/src/defines.rs @@ -0,0 +1,4 @@ +/// The size integers use internally to store literals and do compile time calculations. +pub type LiteralInt = i64; +/// The size floating point numbers use internally to store literals and do compile time calculations. 
+pub type LiteralFloat = f64;
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7aca25b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,5 @@
+#![feature(concat_idents)]
+#![feature(exclusive_range_pattern)]
+
+pub(crate) mod defines;
+pub mod parsing;
diff --git a/src/parsing/lex_numerical.rs b/src/parsing/lex_numerical.rs
new file mode 100644
index 0000000..d7f80cb
--- /dev/null
+++ b/src/parsing/lex_numerical.rs
@@ -0,0 +1,133 @@
+use crate::defines::{LiteralFloat, LiteralInt};
+use crate::parsing::lex_tokens::LexToken;
+use itertools::MultiPeek;
+use std::str::Chars;
+
+/// Maximum number of fraction digits that are accumulated. Further digits are
+/// consumed but ignored, so the fraction accumulator cannot overflow.
+const MAX_FRACTION_DIGITS: i32 = 18;
+
+/// The part of a decimal literal that is currently being read.
+#[derive(Clone, Copy, PartialEq)]
+enum DecimalPart {
+    /// Digits before the decimal point.
+    Integer,
+    /// Digits between the decimal point and the exponent marker.
+    Fraction,
+    /// Digits after the `e`/`E` exponent marker.
+    Exponent,
+}
+
+/// Lexes a decimal integer or float literal starting at the next character.
+///
+/// `_` is accepted anywhere as a digit separator. A `.` in the integer part
+/// starts the fraction, and `e`/`E` in the fraction starts an unsigned exponent.
+/// Any other character ends the literal and is left unconsumed. Overflow of the
+/// integer part is not handled here.
+///
+/// Returns a [`LexToken::FloatLiteral`] if a `.` was read and a
+/// [`LexToken::IntegerLiteral`] otherwise. The peek cursor is reset on return.
+fn lex_numeric_default(chars: &mut MultiPeek<Chars>) -> LexToken {
+    let mut int_value: LiteralInt = 0;
+    let mut fraction_value: LiteralInt = 0;
+    let mut fraction_digits: i32 = 0;
+    let mut exponent_value: i32 = 0;
+    let mut part = DecimalPart::Integer;
+
+    chars.reset_peek();
+    while let Some(c) = chars.peek().copied() {
+        match (c.to_digit(10), part) {
+            (Some(digit), DecimalPart::Integer) => {
+                int_value = int_value * 10 + LiteralInt::from(digit);
+            }
+            (Some(digit), DecimalPart::Fraction) => {
+                if fraction_digits < MAX_FRACTION_DIGITS {
+                    fraction_value = fraction_value * 10 + LiteralInt::from(digit);
+                    fraction_digits += 1;
+                }
+            }
+            (Some(digit), DecimalPart::Exponent) => {
+                // Saturate instead of overflowing on absurdly long exponents.
+                exponent_value = exponent_value.saturating_mul(10).saturating_add(digit as i32);
+            }
+            (None, _) if c == '_' => {}
+            (None, DecimalPart::Integer) if c == '.' => part = DecimalPart::Fraction,
+            // The exponent marker is only valid directly after a fraction.
+            (None, DecimalPart::Fraction) if c == 'e' || c == 'E' => {
+                part = DecimalPart::Exponent;
+            }
+            _ => break,
+        }
+        // `next` also resets the peek cursor for the following iteration.
+        chars.next();
+    }
+    chars.reset_peek();
+
+    if part == DecimalPart::Integer {
+        return LexToken::IntegerLiteral(int_value);
+    }
+    let mut value = int_value as LiteralFloat
+        + fraction_value as LiteralFloat / (10.0 as LiteralFloat).powi(fraction_digits);
+    if part == DecimalPart::Exponent {
+        // Scale by ten to the power of the exponent.
+        value *= (10.0 as LiteralFloat).powi(exponent_value);
+    }
+    LexToken::FloatLiteral(value)
+}
+
+/// Lexes the digits of a literal whose radix is a power of two.
+///
+/// The radix prefix (such as `0x`) must already have been consumed.
+/// `bits_per_digit` selects the radix: 4 for hexadecimal, 3 for octal and 1 for
+/// binary. `_` is accepted as a digit separator; any other non-digit ends the
+/// literal and is left unconsumed. The peek cursor is reset on return.
+fn lex_numeric_radix(chars: &mut MultiPeek<Chars>, bits_per_digit: u32) -> LexToken {
+    let radix = 1 << bits_per_digit;
+    let mut int_value: LiteralInt = 0;
+
+    chars.reset_peek();
+    while let Some(c) = chars.peek().copied() {
+        if let Some(digit) = c.to_digit(radix) {
+            int_value = (int_value << bits_per_digit) | LiteralInt::from(digit);
+        } else if c != '_' {
+            break;
+        }
+        chars.next();
+    }
+    chars.reset_peek();
+    LexToken::IntegerLiteral(int_value)
+}
+
+/// Lexes a numeric literal starting at the next character.
+///
+/// A leading `0d`, `0x`, `0o` or `0b` (in either case) selects decimal,
+/// hexadecimal, octal or binary digits; without such a prefix the literal is
+/// read as decimal. Only the decimal forms can produce a
+/// [`LexToken::FloatLiteral`].
+pub fn lex_numeric(chars: &mut MultiPeek<Chars>) -> LexToken {
+    chars.reset_peek();
+    let radix_marker = if chars.peek() == Some(&'0') {
+        chars.peek().copied()
+    } else {
+        None
+    };
+    // `None` selects the decimal lexer, `Some(bits)` a power-of-two radix.
+    let bits_per_digit = match radix_marker {
+        Some('D') | Some('d') => None,
+        Some('X') | Some('x') => Some(4),
+        Some('O') | Some('o') => Some(3),
+        Some('B') | Some('b') => Some(1),
+        _ => {
+            chars.reset_peek();
+            return lex_numeric_default(chars);
+        }
+    };
+
+    // Skip the two prefix characters.
+    chars.next();
+    chars.next();
+    match bits_per_digit {
+        Some(bits) => lex_numeric_radix(chars, bits),
+        None => lex_numeric_default(chars),
+    }
+}
diff --git a/src/parsing/lex_tokens.rs b/src/parsing/lex_tokens.rs
new file mode 100644
index 0000000..f0ea86b
--- 
/dev/null
+++ b/src/parsing/lex_tokens.rs
@@ -0,0 +1,139 @@
+use crate::defines::{LiteralFloat, LiteralInt};
+
+/// A single token produced by the lexer.
+#[derive(PartialEq, Debug)]
+pub enum LexToken {
+    EndOfFile,
+    WhiteSpace,
+    Identifier(String),
+    IntegerLiteral(LiteralInt),
+    FloatLiteral(LiteralFloat),
+    StringLiteral(String),
+
+    Semicolon,
+    Colon,
+    OpenBracket,
+    CloseBracket,
+    OpenCurlyBracket,
+    CloseCurlyBracket,
+    OpenBlockBracket,
+    CloseBlockBracket,
+
+    // Keywords
+    AndKeyword,
+    AbstractKeyword,
+    AutoKeyword,
+    BoolKeyword,
+    BreakKeyword,
+    CaseKeyword,
+    CastKeyword,
+    CatchKeyword,
+    ClassKeyword,
+    ConstKeyword,
+    ContinueKeyword,
+    DefaultKeyword,
+    DoKeyword,
+    DoubleKeyword,
+    ElseKeyword,
+    EnumKeyword,
+    ExplicitKeyword,
+    ExternalKeyword,
+    FalseKeyword,
+    FinalKeyword,
+    FloatKeyword,
+    ForKeyword,
+    FromKeyword,
+    FuncDefKeyword,
+    FunctionKeyword,
+    GetKeyword,
+    IfKeyword,
+    ImportKeyword,
+    InKeyword,
+    InOutKeyword,
+    IntKeyword,
+    InterfaceKeyword,
+    Int8Keyword,
+    Int16Keyword,
+    Int32Keyword,
+    Int64Keyword,
+    IsKeyword,
+    MixinKeyword,
+    NamespaceKeyword,
+    NotKeyword,
+    NullKeyword,
+    OrKeyword,
+    OutKeyword,
+    OverrideKeyword,
+    PrivateKeyword,
+    PropertyKeyword,
+    ProtectedKeyword,
+    ReturnKeyword,
+    SetKeyword,
+    SharedKeyword,
+    SuperKeyword,
+    SwitchKeyword,
+    ThisKeyword,
+    TrueKeyword,
+    TryKeyword,
+    TypeDefKeyword,
+    UintKeyword,
+    Uint8Keyword,
+    Uint16Keyword,
+    Uint32Keyword,
+    Uint64Keyword,
+    VoidKeyword,
+    WhileKeyword,
+    XorKeyword,
+
+    // AssignOp
+    Equals,
+    PlusEquals,
+    MinusEquals,
+    StarEquals,
+    SlashEquals,
+    LineEquals,
+    AmpersandEquals,
+    RoofEquals,
+    PercentEquals,
+    StarStarEquals,
+    LeftLeftEquals,
+    RightRightEquals,
+    RightRightRightEquals,
+
+    // LogicOp
+    AmpersandAmpersand,
+    LineLine,
+    RoofRoof,
+
+    // CompOp
+    EqualsEquals,
+    NotEquals,
+    NotIsKeyword,
+    GreaterThan,
+    GreaterThanEquals,
+    LessThan,
+    LessThanEquals,
+
+    // MathOp
+    Plus,
+    Minus,
+    Star,
+    Slash,
+    Percent,
+    StarStar,
+
+    // BitOp
+    Ampersand,
+    VerticalLine,
+    Roof,
+    LeftLeft,
+    RightRight,
+    RightRightRight,
+
+    // ExprPreOp
+    ExclamationMark,
+    PlusPlus,
+    MinusMinus,
+    Tilde,
+    AtSymbol,
+}
diff --git a/src/parsing/lexer.rs b/src/parsing/lexer.rs
new file mode 100644
index 0000000..fdf5f6c
--- /dev/null
+++ b/src/parsing/lexer.rs
@@ -0,0 +1,325 @@
+use super::lex_numerical::lex_numeric;
+use crate::parsing::lex_tokens::LexToken;
+use itertools::{Itertools, MultiPeek};
+use std::str::Chars;
+
+type LT = LexToken;
+
+/// Returns whether `c` can be part of an identifier or keyword.
+fn is_identifier_char(c: char) -> bool {
+    c.is_ascii_alphanumeric() || c == '_'
+}
+
+/// Consumes a single character and returns `eq` as its token.
+fn lex_and_consume(chars: &mut MultiPeek<Chars>, eq: LexToken) -> LexToken {
+    chars.next();
+    eq
+}
+
+/// Consumes one character, plus a directly following `=` if there is one.
+///
+/// Returns `eq` if the `=` was consumed and `or` otherwise. The peek cursor is
+/// reset on return.
+fn lex_eq_or(chars: &mut MultiPeek<Chars>, eq: LexToken, or: LexToken) -> LexToken {
+    chars.next();
+    if chars.peek() == Some(&'=') {
+        chars.next();
+        eq
+    } else {
+        // Undo the look-ahead so the next peek sees the unconsumed character.
+        chars.reset_peek();
+        or
+    }
+}
+
+/// Consumes one character, plus a directly following `v` or `=` if there is one.
+///
+/// Returns `rep` if `v` followed, `eq` if `=` followed and `or` otherwise. The
+/// peek cursor is reset on return.
+fn lex_eq_rep_or(
+    chars: &mut MultiPeek<Chars>,
+    v: char,
+    eq: LexToken,
+    rep: LexToken,
+    or: LexToken,
+) -> LexToken {
+    chars.next();
+    match chars.peek().copied() {
+        Some(c) if c == v => {
+            chars.next();
+            rep
+        }
+        Some('=') => {
+            chars.next();
+            eq
+        }
+        _ => {
+            // Undo the look-ahead so the next peek sees the unconsumed character.
+            chars.reset_peek();
+            or
+        }
+    }
+}
+
+/// Lexes a run of identifier characters into a keyword or an identifier token.
+///
+/// Letters, digits and `_` are consumed until another character or the end of
+/// the input is reached. Returns the matching keyword token, or
+/// [`LexToken::Identifier`] when the word is not a keyword.
+fn lex_keyword_or_identifier(chars: &mut MultiPeek<Chars>) -> LexToken {
+    let mut word = String::new();
+    chars.reset_peek();
+    while let Some(c) = chars.peek().copied() {
+        if !is_identifier_char(c) {
+            break;
+        }
+        word.push(c);
+        // `next` also resets the peek cursor for the following iteration.
+        chars.next();
+    }
+    chars.reset_peek();
+
+    match word.as_str() {
+        "and" => LT::AndKeyword,
+        "abstract" => LT::AbstractKeyword,
+        "auto" => LT::AutoKeyword,
+        "bool" => LT::BoolKeyword,
+        "break" => LT::BreakKeyword,
+        "case" => LT::CaseKeyword,
+        "cast" => LT::CastKeyword,
+        "catch" => LT::CatchKeyword,
+        "class" => LT::ClassKeyword,
+        "const" => LT::ConstKeyword,
+        "continue" => LT::ContinueKeyword,
+        "default" => LT::DefaultKeyword,
+        "do" => LT::DoKeyword,
+        "double" => LT::DoubleKeyword,
+        "else" => LT::ElseKeyword,
+        "enum" => LT::EnumKeyword,
+        "explicit" => LT::ExplicitKeyword,
+        "external" => LT::ExternalKeyword,
+        "false" => LT::FalseKeyword,
+        "final" => LT::FinalKeyword,
+        "float" => LT::FloatKeyword,
+        "for" => LT::ForKeyword,
+        "from" => LT::FromKeyword,
+        "funcdef" => LT::FuncDefKeyword,
+        "function" => LT::FunctionKeyword,
+        "get" => LT::GetKeyword,
+        "if" => LT::IfKeyword,
+        "import" => LT::ImportKeyword,
+        "in" => LT::InKeyword,
+        "inout" => LT::InOutKeyword,
+        "int" => LT::IntKeyword,
+        "interface" => LT::InterfaceKeyword,
+        "int8" => LT::Int8Keyword,
+        "int16" => LT::Int16Keyword,
+        "int32" => LT::Int32Keyword,
+        "int64" => LT::Int64Keyword,
+        "is" => LT::IsKeyword,
+        "mixin" => LT::MixinKeyword,
+        "namespace" => LT::NamespaceKeyword,
+        "not" => LT::NotKeyword,
+        "null" => LT::NullKeyword,
+        "or" => LT::OrKeyword,
+        "out" => LT::OutKeyword,
+        "override" => LT::OverrideKeyword,
+        "private" => LT::PrivateKeyword,
+        "property" => LT::PropertyKeyword,
+        "protected" => LT::ProtectedKeyword,
+        "return" => LT::ReturnKeyword,
+        "set" => LT::SetKeyword,
+        "shared" => LT::SharedKeyword,
+        "super" => LT::SuperKeyword,
+        "switch" => LT::SwitchKeyword,
+        "this" => LT::ThisKeyword,
+        "true" => LT::TrueKeyword,
+        "try" => LT::TryKeyword,
+        "typedef" => LT::TypeDefKeyword,
+        "uint" => LT::UintKeyword,
+        "uint8" => LT::Uint8Keyword,
+        "uint16" => LT::Uint16Keyword,
+        "uint32" => LT::Uint32Keyword,
+        "uint64" => LT::Uint64Keyword,
+        "void" => LT::VoidKeyword,
+        "while" => LT::WhileKeyword,
+        "xor" => LT::XorKeyword,
+        _ => LT::Identifier(word),
+    }
+}
+
+/// Lexes a string literal, including its delimiters.
+///
+/// `opening_char` is the quote character the string started with. A regular
+/// string ends at the next unescaped `opening_char`; a `heredoc` string is
+/// delimited by three of them, so lone quotes are part of its contents.
+/// `\0`, `\n`, `\r` and `\t` are translated to their control characters; any
+/// other escaped character stands for itself.
+///
+/// # Panics
+///
+/// Panics if the input ends before the string is closed.
+fn lex_string(chars: &mut MultiPeek<Chars>, opening_char: &char, heredoc: bool) -> LexToken {
+    // Skip the opening delimiter: one quote, or three for a heredoc string.
+    chars.next();
+    if heredoc {
+        chars.next();
+        chars.next();
+    }
+
+    let mut value = String::new();
+    loop {
+        let c = match chars.next() {
+            Some(c) => c,
+            // TODO: log error. Strings need to be closed, EOF should error.
+            None => unimplemented!("unterminated string literal"),
+        };
+        if c == '\\' {
+            let escaped = match chars.next() {
+                Some('0') => '\0',
+                Some('n') => '\n',
+                Some('r') => '\r',
+                Some('t') => '\t',
+                Some(other) => other,
+                // TODO: log error. Strings need to be closed, EOF should error.
+                None => unimplemented!("unterminated string literal"),
+            };
+            value.push(escaped);
+        } else if c != *opening_char {
+            value.push(c);
+        } else if !heredoc {
+            break;
+        } else if chars.peek() == Some(opening_char) && chars.peek() == Some(opening_char) {
+            // Two more quotes complete the closing delimiter of a heredoc string.
+            chars.next();
+            chars.next();
+            break;
+        } else {
+            // A lone quote inside a heredoc string is ordinary content.
+            chars.reset_peek();
+            value.push(c);
+        }
+    }
+    LT::StringLiteral(value)
+}
+
+/// Splits `s` into tokens.
+///
+/// Every whitespace character becomes its own [`LexToken::WhiteSpace`] token and
+/// the returned list always ends with [`LexToken::EndOfFile`].
+///
+/// # Panics
+///
+/// Panics on a character no token is defined for, and on an unterminated string
+/// literal.
+pub fn lex(s: &str) -> Vec<LexToken> {
+    let mut tokens: Vec<LexToken> = Vec::new();
+    let mut chars = s.chars().multipeek();
+    loop {
+        // The helpers look ahead while deciding on a token; always start peeking
+        // from the first unconsumed character.
+        chars.reset_peek();
+        let c = match chars.peek().copied() {
+            Some(c) => c,
+            None => {
+                tokens.push(LT::EndOfFile);
+                break;
+            }
+        };
+        let token = match c {
+            ' ' | '\t' | '\r' | '\n' => lex_and_consume(&mut chars, LT::WhiteSpace),
+            '=' => lex_eq_or(&mut chars, LT::EqualsEquals, LT::Equals),
+            '+' => lex_eq_rep_or(&mut chars, '+', LT::PlusEquals, LT::PlusPlus, LT::Plus),
+            '-' => lex_eq_rep_or(&mut chars, '-', LT::MinusEquals, LT::MinusMinus, LT::Minus),
+            '*' => {
+                if chars.peek() == Some(&'*') {
+                    chars.next();
+                    lex_eq_or(&mut chars, LT::StarStarEquals, LT::StarStar)
+                } else {
+                    lex_eq_or(&mut chars, LT::StarEquals, LT::Star)
+                }
+            }
+            '/' => lex_eq_or(&mut chars, LT::SlashEquals, LT::Slash),
+            '%' => lex_eq_or(&mut chars, LT::PercentEquals, LT::Percent),
+            '|' => lex_eq_rep_or(
+                &mut chars,
+                '|',
+                LT::LineEquals,
+                LT::LineLine,
+                LT::VerticalLine,
+            ),
+            '&' => lex_eq_rep_or(
+                &mut chars,
+                '&',
+                LT::AmpersandEquals,
+                LT::AmpersandAmpersand,
+                LT::Ampersand,
+            ),
+            '^' => lex_eq_rep_or(&mut chars, '^', LT::RoofEquals, LT::RoofRoof, LT::Roof),
+            '<' => {
+                if chars.peek() == Some(&'<') {
+                    chars.next();
+                    lex_eq_or(&mut chars, LT::LeftLeftEquals, LT::LeftLeft)
+                } else {
+                    lex_eq_or(&mut chars, LT::LessThanEquals, LT::LessThan)
+                }
+            }
+            '>' => {
+                if chars.peek() != Some(&'>') {
+                    lex_eq_or(&mut chars, LT::GreaterThanEquals, LT::GreaterThan)
+                } else if chars.peek() == Some(&'>') {
+                    chars.next();
+                    chars.next();
+                    lex_eq_or(&mut chars, LT::RightRightRightEquals, LT::RightRightRight)
+                } else {
+                    chars.next();
+                    lex_eq_or(&mut chars, LT::RightRightEquals, LT::RightRight)
+                }
+            }
+            '!' => {
+                chars.next();
+                let next = chars.peek().copied();
+                // `!is` only counts as an operator when `is` is a whole word, so
+                // `!isReady` lexes as `!` followed by an identifier.
+                let is_not_is = next == Some('i')
+                    && chars.peek() == Some(&'s')
+                    && !chars.peek().map_or(false, |c| is_identifier_char(*c));
+                if next == Some('=') {
+                    chars.next();
+                    LT::NotEquals
+                } else if is_not_is {
+                    chars.next();
+                    chars.next();
+                    LT::NotIsKeyword
+                } else {
+                    LT::ExclamationMark
+                }
+            }
+
+            '~' => lex_and_consume(&mut chars, LT::Tilde),
+            '@' => lex_and_consume(&mut chars, LT::AtSymbol),
+            ';' => lex_and_consume(&mut chars, LT::Semicolon),
+            ':' => lex_and_consume(&mut chars, LT::Colon),
+
+            '(' => lex_and_consume(&mut chars, LT::OpenBracket),
+            ')' => lex_and_consume(&mut chars, LT::CloseBracket),
+            '{' => lex_and_consume(&mut chars, LT::OpenCurlyBracket),
+            '}' => lex_and_consume(&mut chars, LT::CloseCurlyBracket),
+            '[' => lex_and_consume(&mut chars, LT::OpenBlockBracket),
+            ']' => lex_and_consume(&mut chars, LT::CloseBlockBracket),
+
+            '0'..='9' => lex_numeric(&mut chars),
+            'a'..='z' | 'A'..='Z' | '_' => lex_keyword_or_identifier(&mut chars),
+            '\'' => lex_string(&mut chars, &'\'', false),
+            '"' if chars.peek() == Some(&'"') && chars.peek() == Some(&'"') => {
+                lex_string(&mut chars, &'"', true)
+            }
+            '"' => lex_string(&mut chars, &'"', false),
+
+            // TODO: Log a proper error here instead of panicking.
+            other => unimplemented!("no token is defined for character {:?}", other),
+        };
+        tokens.push(token);
+    }
+    tokens
+}
diff --git a/src/parsing/lexer_tests.rs b/src/parsing/lexer_tests.rs
new file mode 100644
index 0000000..fa9e60d
--- /dev/null
+++ b/src/parsing/lexer_tests.rs
@@ -0,0 +1,241 @@
+use super::lex_tokens::LexToken;
+use super::lexer::lex;
+
+macro_rules! 
lex_token_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0], $c); + assert_eq!(tokens[1], LexToken::EndOfFile); + } + }; +} + +macro_rules! lex_identifier_test { + ( $a: ident, $b: expr) => { + #[test] + fn $a() { + let tokens = lex($b); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0], LexToken::Identifier($b.to_string())); + assert_eq!(tokens[1], LexToken::EndOfFile); + } + }; +} + +macro_rules! lex_integer_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0], LexToken::IntegerLiteral($c)); + assert_eq!(tokens[1], LexToken::EndOfFile); + } + }; +} + +macro_rules! lex_float_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0], LexToken::FloatLiteral($c)); + assert_eq!(tokens[1], LexToken::EndOfFile); + } + }; +} + +macro_rules! 
lex_string_test { + ( $a: ident, $b: expr, $c: expr) => { + #[test] + fn $a() { + let tokens = lex($b); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0], LexToken::StringLiteral($c.to_string())); + assert_eq!(tokens[1], LexToken::EndOfFile); + } + }; +} + +lex_token_test!(lex_space, " ", LexToken::WhiteSpace); +lex_token_test!(lex_tab, "\t", LexToken::WhiteSpace); +lex_token_test!(lex_return_line, "\r", LexToken::WhiteSpace); +lex_token_test!(lex_newline, "\n", LexToken::WhiteSpace); + +lex_token_test!(lex_equals, "=", LexToken::Equals); +lex_token_test!(lex_equals_equals, "==", LexToken::EqualsEquals); +lex_token_test!(lex_plus, "+", LexToken::Plus); +lex_token_test!(lex_plus_plus, "++", LexToken::PlusPlus); +lex_token_test!(lex_plus_equals, "+=", LexToken::PlusEquals); +lex_token_test!(lex_minus, "-", LexToken::Minus); +lex_token_test!(lex_minus_minus, "--", LexToken::MinusMinus); +lex_token_test!(lex_minus_equals, "-=", LexToken::MinusEquals); +lex_token_test!(lex_star, "*", LexToken::Star); +lex_token_test!(lex_star_equals, "*=", LexToken::StarEquals); +lex_token_test!(lex_star_star, "**", LexToken::StarStar); +lex_token_test!(lex_star_star_equals, "**=", LexToken::StarStarEquals); +lex_token_test!(lex_slash, "/", LexToken::Slash); +lex_token_test!(lex_slash_equals, "/=", LexToken::SlashEquals); +lex_token_test!(lex_percent, "%", LexToken::Percent); +lex_token_test!(lex_percent_equals, "%=", LexToken::PercentEquals); + +lex_token_test!(lex_exclamation_mark, "!", LexToken::ExclamationMark); +lex_token_test!(lex_not_equals, "!=", LexToken::NotEquals); +lex_token_test!(lex_not_is_keyword, "!is", LexToken::NotIsKeyword); + +lex_token_test!(lex_vert_line, "|", LexToken::VerticalLine); +lex_token_test!(lex_vert_line_equals, "|=", LexToken::LineEquals); +lex_token_test!(lex_line_line, "||", LexToken::LineLine); + +lex_token_test!(lex_ampersand, "&", LexToken::Ampersand); +lex_token_test!(lex_ampersand_equals, "&=", LexToken::AmpersandEquals); 
+lex_token_test!(lex_ampersand_ampersand, "&&", LexToken::AmpersandAmpersand); +lex_token_test!(lex_less_than, "<", LexToken::LessThan); +lex_token_test!(lex_less_than_equals, "<=", LexToken::LessThanEquals); +lex_token_test!(lex_left_left, "<<", LexToken::LeftLeft); +lex_token_test!(lex_left_left_equals, "<<=", LexToken::LeftLeftEquals); + +lex_token_test!(lex_greater_than, ">", LexToken::GreaterThan); +lex_token_test!(lex_greater_than_equals, ">=", LexToken::GreaterThanEquals); +lex_token_test!(lex_right_right, ">>", LexToken::RightRight); +lex_token_test!(lex_right_right_equals, ">>=", LexToken::RightRightEquals); +lex_token_test!(lex_right_right_right, ">>>", LexToken::RightRightRight); +lex_token_test!( + lex_right_right_right_equals, + ">>>=", + LexToken::RightRightRightEquals +); + +lex_token_test!(lex_tilde, "~", LexToken::Tilde); +lex_token_test!(lex_at_symbol, "@", LexToken::AtSymbol); + +lex_token_test!(lex_and_keyword, "and", LexToken::AndKeyword); +lex_token_test!(lex_abstract_keyword, "abstract", LexToken::AbstractKeyword); +lex_token_test!(lex_auto_keyword, "auto", LexToken::AutoKeyword); +lex_token_test!(lex_bool_keyword, "bool", LexToken::BoolKeyword); +lex_token_test!(lex_break_keyword, "break", LexToken::BreakKeyword); +lex_token_test!(lex_case_keyword, "case", LexToken::CaseKeyword); +lex_token_test!(lex_cast_keyword, "cast", LexToken::CastKeyword); +lex_token_test!(lex_catch_keyword, "catch", LexToken::CatchKeyword); +lex_token_test!(lex_class_keyword, "class", LexToken::ClassKeyword); +lex_token_test!(lex_const_keyword, "const", LexToken::ConstKeyword); +lex_token_test!(lex_continue_keyword, "continue", LexToken::ContinueKeyword); +lex_token_test!(lex_default_keyword, "default", LexToken::DefaultKeyword); +lex_token_test!(lex_do_keyword, "do", LexToken::DoKeyword); +lex_token_test!(lex_double_keyword, "double", LexToken::DoubleKeyword); +lex_token_test!(lex_else_keyword, "else", LexToken::ElseKeyword); +lex_token_test!(lex_enum_keyword, 
"enum", LexToken::EnumKeyword); +lex_token_test!(lex_explicit_keyword, "explicit", LexToken::ExplicitKeyword); +lex_token_test!(lex_external_keyword, "external", LexToken::ExternalKeyword); +lex_token_test!(lex_false_keyword, "false", LexToken::FalseKeyword); +lex_token_test!(lex_final_keyword, "final", LexToken::FinalKeyword); +lex_token_test!(lex_float_keyword, "float", LexToken::FloatKeyword); +lex_token_test!(lex_for_keyword, "for", LexToken::ForKeyword); +lex_token_test!(lex_from_keyword, "from", LexToken::FromKeyword); +lex_token_test!(lex_funcdef_keyword, "funcdef", LexToken::FuncDefKeyword); +lex_token_test!(lex_function_keyword, "function", LexToken::FunctionKeyword); +lex_token_test!(lex_get_keyword, "get", LexToken::GetKeyword); +lex_token_test!(lex_if_keyword, "if", LexToken::IfKeyword); +lex_token_test!(lex_import_keyword, "import", LexToken::ImportKeyword); +lex_token_test!(lex_in_keyword, "in", LexToken::InKeyword); +lex_token_test!(lex_inout_keyword, "inout", LexToken::InOutKeyword); +lex_token_test!(lex_int_keyword, "int", LexToken::IntKeyword); +lex_token_test!( + lex_interface_keyword, + "interface", + LexToken::InterfaceKeyword +); +lex_token_test!(lex_int8_keyword, "int8", LexToken::Int8Keyword); +lex_token_test!(lex_int16_keyword, "int16", LexToken::Int16Keyword); +lex_token_test!(lex_int32_keyword, "int32", LexToken::Int32Keyword); +lex_token_test!(lex_int64_keyword, "int64", LexToken::Int64Keyword); +lex_token_test!(lex_is_keyword, "is", LexToken::IsKeyword); +lex_token_test!(lex_mixin_keyword, "mixin", LexToken::MixinKeyword); +lex_token_test!( + lex_namespace_keyword, + "namespace", + LexToken::NamespaceKeyword +); +lex_token_test!(lex_not_keyword, "not", LexToken::NotKeyword); +lex_token_test!(lex_null_keyword, "null", LexToken::NullKeyword); +lex_token_test!(lex_or_keyword, "or", LexToken::OrKeyword); +lex_token_test!(lex_out_keyword, "out", LexToken::OutKeyword); +lex_token_test!(lex_override_keyword, "override", 
LexToken::OverrideKeyword); +lex_token_test!(lex_private_keyword, "private", LexToken::PrivateKeyword); +lex_token_test!(lex_property_keyword, "property", LexToken::PropertyKeyword); +lex_token_test!( + lex_protected_keyword, + "protected", + LexToken::ProtectedKeyword +); + +lex_token_test!(lex_return_keyword, "return", LexToken::ReturnKeyword); +lex_token_test!(lex_set_keyword, "set", LexToken::SetKeyword); +lex_token_test!(lex_shared_keyword, "shared", LexToken::SharedKeyword); +lex_token_test!(lex_super_keyword, "super", LexToken::SuperKeyword); +lex_token_test!(lex_switch_keyword, "switch", LexToken::SwitchKeyword); +lex_token_test!(lex_this_keyword, "this", LexToken::ThisKeyword); +lex_token_test!(lex_true_keyword, "true", LexToken::TrueKeyword); +lex_token_test!(lex_try_keyword, "try", LexToken::TryKeyword); +lex_token_test!(lex_typedef_keyword, "typedef", LexToken::TypeDefKeyword); +lex_token_test!(lex_uint_keyword, "uint", LexToken::UintKeyword); +lex_token_test!(lex_uint8_keyword, "uint8", LexToken::Uint8Keyword); +lex_token_test!(lex_uint16_keyword, "uint16", LexToken::Uint16Keyword); +lex_token_test!(lex_uint32_keyword, "uint32", LexToken::Uint32Keyword); + +lex_token_test!(lex_void_keyword, "void", LexToken::VoidKeyword); +lex_token_test!(lex_while_keyword, "while", LexToken::WhileKeyword); +lex_token_test!(lex_xor_keyword, "xor", LexToken::XorKeyword); + +lex_identifier_test!(lex_basic_identifier_foo, "foo"); +lex_identifier_test!(lex_basic_identifier_foobar, "foobar"); + +lex_integer_test!(lex_zero, "0", 0); +lex_integer_test!(lex_one_two_three_four, "1234", 1234); +lex_integer_test!(lex_specific_one_two_three_four, "0d1234", 1234); +lex_integer_test!(lex_decimal_with_underline, "123_456", 123456); +lex_integer_test!(lex_specific_decimal_with_underline, "0D123_456", 123456); +lex_integer_test!(lex_hexadecimal_0f, "0X0F", 15); +lex_integer_test!(lex_hexadecimal_ff, "0xff", 255); +lex_integer_test!(lex_hexadecimal_ff_ff, "0xff_ff", 65535); 
+lex_integer_test!(lex_octal_112, "0o112", 74); +lex_integer_test!(lex_binary_1110, "0b1110", 14); +lex_integer_test!(lex_binary_01110, "0b01110", 14); + +lex_float_test!(lex_zero_float, "0.0", 0.0); +lex_float_test!(lex_half, "0.5", 0.5); +lex_float_test!(lex_point_0_5, "0.05", 0.05); +lex_float_test!(lex_half_with_exponent, "0.5e10", 0.5e10); + +lex_string_test!(lex_simple_string, "\"foo\"", "foo"); +lex_string_test!(lex_simple_string_single_quote, "\'foo\'", "foo"); +lex_string_test!(lex_string_with_escape, "\"fo\\\"o\"", "fo\"o"); +lex_string_test!(lex_string_with_new_line, "\"fo\\no\"", "fo\no"); +lex_string_test!(lex_heredoc_string, "\"\"\"foo\"\"\"", "foo"); +lex_string_test!(lex_heredoc_string_with_quote, "\"\"\"fo\"o\"\"\"", "fo\"o"); + +#[test] +fn lex_two_identifier() { + let tokens = lex("foo bar"); + assert_eq!(tokens.len(), 4); + assert_eq!(tokens[0], LexToken::Identifier("foo".to_string())); + assert_eq!(tokens[1], LexToken::WhiteSpace); + assert_eq!(tokens[2], LexToken::Identifier("bar".to_string())); + assert_eq!(tokens[3], LexToken::EndOfFile); +} + +#[test] +fn lex_multiple_tokens_with_not_is() { + let tokens = lex("a !is b"); + assert_eq!(tokens.len(), 6); + assert_eq!(tokens[0], LexToken::Identifier("a".to_string())); + assert_eq!(tokens[1], LexToken::WhiteSpace); + assert_eq!(tokens[2], LexToken::NotIsKeyword); + assert_eq!(tokens[3], LexToken::WhiteSpace); + assert_eq!(tokens[4], LexToken::Identifier("b".to_string())); + assert_eq!(tokens[5], LexToken::EndOfFile); +} diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs new file mode 100644 index 0000000..f51db11 --- /dev/null +++ b/src/parsing/mod.rs @@ -0,0 +1,6 @@ +mod lex_numerical; +pub mod lex_tokens; +pub mod lexer; + +#[cfg(test)] +mod lexer_tests;