475 lines
16 KiB
Rust
475 lines
16 KiB
Rust
use crate::logger::messages::Message;
|
|
use crate::parsing::lexer::lex_tokens::LexToken;
|
|
use crate::span::Span;
|
|
use lex_numerical::lex_numeric;
|
|
use lex_tokens::TokenType;
|
|
use string_walker::StringWalker;
|
|
|
|
mod lex_numerical;
|
|
pub mod lex_tokens;
|
|
#[cfg(test)]
|
|
mod lexer_tests;
|
|
mod string_walker;
|
|
|
|
#[inline(always)]
|
|
fn lex_and_consume(
|
|
chars: &mut StringWalker,
|
|
eq: TokenType,
|
|
f: &mut dyn FnMut(TokenType, usize, usize),
|
|
) {
|
|
chars.next();
|
|
f(eq, chars.real_position - 1, chars.real_position)
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn lex_eq_or(
|
|
chars: &mut StringWalker,
|
|
eq: TokenType,
|
|
or: TokenType,
|
|
start_pos: usize,
|
|
f: &mut dyn FnMut(TokenType, usize, usize),
|
|
) {
|
|
chars.next();
|
|
if let Some('=') = chars.peek() {
|
|
chars.next();
|
|
f(eq, start_pos, chars.real_position);
|
|
} else {
|
|
f(or, start_pos, chars.real_position);
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn lex_eq_rep_or(
|
|
chars: &mut StringWalker,
|
|
v: char,
|
|
eq: TokenType,
|
|
rep: TokenType,
|
|
or: TokenType,
|
|
f: &mut dyn FnMut(TokenType, usize, usize),
|
|
) {
|
|
let start_pos = chars.real_position;
|
|
chars.next();
|
|
match chars.peek() {
|
|
Some(c) => {
|
|
if *c == v {
|
|
chars.next();
|
|
f(rep, start_pos, chars.real_position);
|
|
} else if *c == '=' {
|
|
chars.next();
|
|
f(eq, start_pos, chars.real_position);
|
|
} else {
|
|
f(or, start_pos, chars.real_position);
|
|
}
|
|
}
|
|
None => f(or, start_pos, chars.real_position),
|
|
};
|
|
}
|
|
|
|
// Short alias for `TokenType`, used to keep the token names in the match arms below compact.
type TT = TokenType;
|
|
|
|
fn lex_keyword_or_identifier(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) {
|
|
let mut reading = true;
|
|
let mut length = 1;
|
|
let start_pos = chars.real_position;
|
|
while reading {
|
|
match chars.peek() {
|
|
Some(c) => match c {
|
|
'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => {
|
|
length += 1;
|
|
}
|
|
_ => {
|
|
reading = false;
|
|
}
|
|
},
|
|
None => {
|
|
reading = false;
|
|
}
|
|
};
|
|
}
|
|
chars.reset_peek();
|
|
let c: String = chars.take(length);
|
|
|
|
let token_type = match c.as_str() {
|
|
"and" => TT::AndKeyword,
|
|
"abstract" => TT::AbstractKeyword,
|
|
"auto" => TT::AutoKeyword,
|
|
"bool" => TT::BoolKeyword,
|
|
"break" => TT::BreakKeyword,
|
|
"case" => TT::CaseKeyword,
|
|
"cast" => TT::CastKeyword,
|
|
"catch" => TT::CatchKeyword,
|
|
"class" => TT::ClassKeyword,
|
|
"const" => TT::ConstKeyword,
|
|
"continue" => TT::ContinueKeyword,
|
|
"default" => TT::DefaultKeyword,
|
|
"do" => TT::DoKeyword,
|
|
"double" => TT::DoubleKeyword,
|
|
"else" => TT::ElseKeyword,
|
|
"enum" => TT::EnumKeyword,
|
|
"explicit" => TT::ExplicitKeyword,
|
|
"external" => TT::ExternalKeyword,
|
|
"false" => TT::FalseKeyword,
|
|
"final" => TT::FinalKeyword,
|
|
"float" => TT::FloatKeyword,
|
|
"for" => TT::ForKeyword,
|
|
"from" => TT::FromKeyword,
|
|
"funcdef" => TT::FuncDefKeyword,
|
|
"function" => TT::FunctionKeyword,
|
|
"get" => TT::GetKeyword,
|
|
"if" => TT::IfKeyword,
|
|
"import" => TT::ImportKeyword,
|
|
"in" => TT::InKeyword,
|
|
"inout" => TT::InOutKeyword,
|
|
"int" => TT::IntKeyword,
|
|
"interface" => TT::InterfaceKeyword,
|
|
"int8" => TT::Int8Keyword,
|
|
"int16" => TT::Int16Keyword,
|
|
"int32" => TT::Int32Keyword,
|
|
"int64" => TT::Int64Keyword,
|
|
"is" => TT::IsKeyword,
|
|
"mixin" => TT::MixinKeyword,
|
|
"namespace" => TT::NamespaceKeyword,
|
|
"not" => TT::NotKeyword,
|
|
"null" => TT::NullKeyword,
|
|
"or" => TT::OrKeyword,
|
|
"out" => TT::OutKeyword,
|
|
"override" => TT::OverrideKeyword,
|
|
"private" => TT::PrivateKeyword,
|
|
"property" => TT::PropertyKeyword,
|
|
"protected" => TT::ProtectedKeyword,
|
|
"return" => TT::ReturnKeyword,
|
|
"set" => TT::SetKeyword,
|
|
"shared" => TT::SharedKeyword,
|
|
"super" => TT::SuperKeyword,
|
|
"switch" => TT::SwitchKeyword,
|
|
"this" => TT::ThisKeyword,
|
|
"true" => TT::TrueKeyword,
|
|
"try" => TT::TryKeyword,
|
|
"typedef" => TT::TypeDefKeyword,
|
|
"uint" => TT::UintKeyword,
|
|
"uint8" => TT::Uint8Keyword,
|
|
"uint16" => TT::Uint16Keyword,
|
|
"uint32" => TT::Uint32Keyword,
|
|
"uint64" => TT::Uint64Keyword,
|
|
"void" => TT::VoidKeyword,
|
|
"while" => TT::WhileKeyword,
|
|
"xor" => TT::XorKeyword,
|
|
_ => TT::Identifier(c),
|
|
};
|
|
f(token_type, start_pos, chars.real_position);
|
|
}
|
|
|
|
fn lex_string(
|
|
chars: &mut StringWalker,
|
|
opening_char: &char,
|
|
heredoc: bool,
|
|
log: &mut dyn FnMut(Message, Span),
|
|
f: &mut dyn FnMut(TokenType, usize, usize),
|
|
) {
|
|
let start_pos = chars.real_position;
|
|
chars.next();
|
|
if heredoc {
|
|
chars.next();
|
|
chars.next();
|
|
}
|
|
let mut length: i32 = 0;
|
|
let mut string_length = 0;
|
|
let mut last_was_control = false;
|
|
|
|
// We loop twice here. In the first loop we get the number of characters to read, the number of
|
|
// characters the string should be, and whether it's valid. This reduces the amount of allocations
|
|
// we need to do to read a string.
|
|
loop {
|
|
let p = chars.peek();
|
|
match p {
|
|
None => {
|
|
log(
|
|
Message::UnclosedStringLiteral,
|
|
Span::new(chars.peek_position - 1, chars.peek_position),
|
|
);
|
|
break;
|
|
}
|
|
Some(&'\\') if !last_was_control => {
|
|
last_was_control = true;
|
|
length += 1;
|
|
}
|
|
Some(c) => {
|
|
if c == opening_char && !last_was_control {
|
|
if heredoc {
|
|
if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
|
|
break;
|
|
} else {
|
|
length += 1;
|
|
string_length += 1;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
length += 1;
|
|
string_length += 1;
|
|
last_was_control = false;
|
|
}
|
|
}
|
|
}
|
|
chars.reset_peek();
|
|
let mut s: String = String::with_capacity(string_length);
|
|
for _ in 0..length {
|
|
let p = chars.next().unwrap();
|
|
match p {
|
|
'\\' => {
|
|
if last_was_control {
|
|
s.push('\\');
|
|
} else {
|
|
last_was_control = true;
|
|
continue;
|
|
}
|
|
}
|
|
'0' if last_was_control => s.push('\0'),
|
|
'n' if last_was_control => s.push('\n'),
|
|
'r' if last_was_control => s.push('\r'),
|
|
't' if last_was_control => s.push('\t'),
|
|
_ => s.push(p),
|
|
};
|
|
last_was_control = false;
|
|
}
|
|
assert_eq!(s.len(), string_length);
|
|
chars.reset_peek();
|
|
chars.next();
|
|
if heredoc {
|
|
chars.next();
|
|
chars.next();
|
|
}
|
|
|
|
f(TT::StringLiteral(s), start_pos, chars.real_position);
|
|
}
|
|
|
|
pub fn lex(s: &str, log: &mut dyn FnMut(Message, Span)) -> Vec<LexToken> {
|
|
let mut tokens: Vec<LexToken> = Vec::new();
|
|
let mut chars = StringWalker::create(s);
|
|
let mut reading = true;
|
|
|
|
let mut add_token = |token_type: TokenType, start: usize, end: usize| {
|
|
tokens.push(LexToken {
|
|
token_type,
|
|
span: Span::new(start, end),
|
|
})
|
|
};
|
|
|
|
while reading {
|
|
let p = chars.peek().cloned();
|
|
match p {
|
|
Some(c) => match c {
|
|
' ' | '\t' | '\r' | '\n' => {
|
|
chars.next();
|
|
add_token(TT::WhiteSpace, chars.real_position - 1, chars.real_position);
|
|
}
|
|
'=' => {
|
|
let start_pos = chars.real_position;
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::EqualsEquals,
|
|
TT::Equals,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
}
|
|
'+' => lex_eq_rep_or(
|
|
&mut chars,
|
|
'+',
|
|
TT::PlusEquals,
|
|
TT::PlusPlus,
|
|
TT::Plus,
|
|
&mut add_token,
|
|
),
|
|
'-' => lex_eq_rep_or(
|
|
&mut chars,
|
|
'-',
|
|
TT::MinusEquals,
|
|
TT::MinusMinus,
|
|
TT::Minus,
|
|
&mut add_token,
|
|
),
|
|
'*' => {
|
|
let start_pos = chars.real_position;
|
|
if chars.peek() == Some(&'*') {
|
|
chars.next();
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::StarStarEquals,
|
|
TT::StarStar,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
} else {
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::StarEquals,
|
|
TT::Star,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
}
|
|
}
|
|
'/' => {
|
|
let start_pos = chars.real_position;
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::SlashEquals,
|
|
TT::Slash,
|
|
start_pos,
|
|
&mut add_token,
|
|
);
|
|
}
|
|
'%' => {
|
|
let start_pos = chars.real_position;
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::PercentEquals,
|
|
TT::Percent,
|
|
start_pos,
|
|
&mut add_token,
|
|
);
|
|
}
|
|
'|' => lex_eq_rep_or(
|
|
&mut chars,
|
|
'|',
|
|
TT::LineEquals,
|
|
TT::LineLine,
|
|
TT::VerticalLine,
|
|
&mut add_token,
|
|
),
|
|
'&' => lex_eq_rep_or(
|
|
&mut chars,
|
|
'&',
|
|
TT::AmpersandEquals,
|
|
TT::AmpersandAmpersand,
|
|
TT::Ampersand,
|
|
&mut add_token,
|
|
),
|
|
'^' => lex_eq_rep_or(
|
|
&mut chars,
|
|
'^',
|
|
TT::RoofEquals,
|
|
TT::RoofRoof,
|
|
TT::Roof,
|
|
&mut add_token,
|
|
),
|
|
'<' => {
|
|
let start_pos = chars.real_position;
|
|
if chars.peek() == Some(&'<') {
|
|
chars.next();
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::LeftLeftEquals,
|
|
TT::LeftLeft,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
} else {
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::LessThanEquals,
|
|
TT::LessThan,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
}
|
|
}
|
|
'>' => {
|
|
let start_pos = chars.real_position;
|
|
if chars.peek() == Some(&'>') {
|
|
if chars.peek() == Some(&'>') {
|
|
chars.next();
|
|
chars.next();
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::RightRightRightEquals,
|
|
TT::RightRightRight,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
} else {
|
|
chars.next();
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::RightRightEquals,
|
|
TT::RightRight,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
}
|
|
} else {
|
|
lex_eq_or(
|
|
&mut chars,
|
|
TT::GreaterThanEquals,
|
|
TT::GreaterThan,
|
|
start_pos,
|
|
&mut add_token,
|
|
)
|
|
}
|
|
}
|
|
'!' => {
|
|
let start_pos = chars.real_position;
|
|
let next = chars.peek();
|
|
if next == Some(&'=') {
|
|
chars.next();
|
|
chars.next();
|
|
add_token(TT::NotEquals, start_pos, chars.real_position);
|
|
} else if next == Some(&'i') && chars.peek() == Some(&'s') {
|
|
chars.next();
|
|
chars.next();
|
|
chars.next();
|
|
add_token(TT::NotIsKeyword, start_pos, chars.real_position);
|
|
} else {
|
|
chars.next();
|
|
add_token(TT::ExclamationMark, start_pos, chars.real_position);
|
|
}
|
|
}
|
|
|
|
'~' => lex_and_consume(&mut chars, TT::Tilde, &mut add_token),
|
|
'@' => lex_and_consume(&mut chars, TT::AtSymbol, &mut add_token),
|
|
';' => lex_and_consume(&mut chars, TT::Semicolon, &mut add_token),
|
|
':' => {
|
|
let start_pos = chars.real_position;
|
|
if chars.peek() == Some(&':') {
|
|
chars.next();
|
|
chars.next();
|
|
add_token(TT::ColonColon, start_pos, chars.real_position);
|
|
} else {
|
|
chars.next();
|
|
add_token(TT::Colon, start_pos, chars.real_position);
|
|
}
|
|
}
|
|
|
|
'(' => lex_and_consume(&mut chars, TT::OpenBracket, &mut add_token),
|
|
')' => lex_and_consume(&mut chars, TT::CloseBracket, &mut add_token),
|
|
'{' => lex_and_consume(&mut chars, TT::OpenCurlyBracket, &mut add_token),
|
|
'}' => lex_and_consume(&mut chars, TT::CloseCurlyBracket, &mut add_token),
|
|
'[' => lex_and_consume(&mut chars, TT::OpenBlockBracket, &mut add_token),
|
|
']' => lex_and_consume(&mut chars, TT::CloseBlockBracket, &mut add_token),
|
|
',' => lex_and_consume(&mut chars, TT::Comma, &mut add_token),
|
|
'.' => lex_and_consume(&mut chars, TT::Dot, &mut add_token),
|
|
|
|
'0'..'9' => lex_numeric(&mut chars, &mut add_token),
|
|
'a'..'z' | 'A'..'Z' | '_' => lex_keyword_or_identifier(&mut chars, &mut add_token),
|
|
'\'' => lex_string(&mut chars, &'\'', false, log, &mut add_token),
|
|
'"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
|
|
lex_string(&mut chars, &'"', true, log, &mut add_token)
|
|
}
|
|
'"' => lex_string(&mut chars, &'"', false, log, &mut add_token),
|
|
|
|
_ => log(
|
|
Message::UnexpectedCharacter(c),
|
|
Span::new(chars.real_position, chars.real_position + 1),
|
|
),
|
|
},
|
|
None => {
|
|
add_token(TT::EndOfFile, chars.real_position, chars.real_position);
|
|
reading = false;
|
|
}
|
|
}
|
|
}
|
|
tokens
|
|
}
|