//! Lexer for SeraphScript (`src/parsing/lexer/mod.rs`).
//!
//! Converts raw source text into a flat stream of [`LexToken`]s, reporting
//! problems through a caller-supplied logging callback.
use crate::logger::messages::Message;
use crate::parsing::lexer::lex_tokens::LexToken;
use crate::span::Span;
use lex_numerical::lex_numeric;
use lex_tokens::TokenType;
use string_walker::StringWalker;
mod lex_numerical;
pub mod lex_tokens;
#[cfg(test)]
mod lexer_tests;
mod string_walker;
#[inline(always)]
fn lex_and_consume(
    chars: &mut StringWalker,
    eq: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // Single-character token: consume exactly one character and emit `eq`
    // spanning just that character.
    chars.next();
    let end = chars.real_position;
    f(eq, end - 1, end)
}
#[inline(always)]
fn lex_eq_or(
    chars: &mut StringWalker,
    eq: TokenType,
    or: TokenType,
    start_pos: usize,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // The operator's first character was matched (peeked) by the caller;
    // consume it, then pick the `=`-suffixed token when `=` follows,
    // otherwise the plain one.
    chars.next();
    let token = match chars.peek() {
        Some('=') => {
            chars.next();
            eq
        }
        _ => or,
    };
    f(token, start_pos, chars.real_position);
}
#[inline(always)]
fn lex_eq_rep_or(
    chars: &mut StringWalker,
    v: char,
    eq: TokenType,
    rep: TokenType,
    or: TokenType,
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    // Handles operator families of the shape `x`, `xx`, `x=` (for example
    // `+` / `++` / `+=`): `rep` when the same character `v` repeats, `eq`
    // when `=` follows, `or` otherwise (including end of input).
    let start_pos = chars.real_position;
    chars.next();
    let token = match chars.peek() {
        Some(&next) if next == v => {
            chars.next();
            rep
        }
        Some(&'=') => {
            chars.next();
            eq
        }
        _ => or,
    };
    f(token, start_pos, chars.real_position);
}
/// Short alias to keep the large keyword table below readable.
type TT = TokenType;

/// Reads an identifier and classifies it as a keyword or `TT::Identifier`.
///
/// The first identifier character has already been *peeked* (not consumed)
/// by the caller, which is why `length` starts at 1 — the peek cursor
/// already sits one character past the identifier's start.
fn lex_keyword_or_identifier(chars: &mut StringWalker, f: &mut dyn FnMut(TokenType, usize, usize)) {
    let mut reading = true;
    let mut length = 1;
    let start_pos = chars.real_position;
    // First pass: measure the identifier so `take` can consume it in one go.
    // Inclusive ranges (`..=`) are required here — the exclusive `'a'..'z'`
    // form would wrongly reject 'z', 'Z' and '9' and cut the scan short.
    while reading {
        match chars.peek() {
            Some(c) => match c {
                'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
                    length += 1;
                }
                _ => {
                    reading = false;
                }
            },
            None => {
                reading = false;
            }
        };
    }
    chars.reset_peek();
    // Second pass: consume the measured characters as one string.
    let c: String = chars.take(length);
    let token_type = match c.as_str() {
        "and" => TT::AndKeyword,
        "abstract" => TT::AbstractKeyword,
        "auto" => TT::AutoKeyword,
        "bool" => TT::BoolKeyword,
        "break" => TT::BreakKeyword,
        "case" => TT::CaseKeyword,
        "cast" => TT::CastKeyword,
        "catch" => TT::CatchKeyword,
        "class" => TT::ClassKeyword,
        "const" => TT::ConstKeyword,
        "continue" => TT::ContinueKeyword,
        "default" => TT::DefaultKeyword,
        "do" => TT::DoKeyword,
        "double" => TT::DoubleKeyword,
        "else" => TT::ElseKeyword,
        "enum" => TT::EnumKeyword,
        "explicit" => TT::ExplicitKeyword,
        "external" => TT::ExternalKeyword,
        "false" => TT::FalseKeyword,
        "final" => TT::FinalKeyword,
        "float" => TT::FloatKeyword,
        "for" => TT::ForKeyword,
        "from" => TT::FromKeyword,
        "funcdef" => TT::FuncDefKeyword,
        "function" => TT::FunctionKeyword,
        "get" => TT::GetKeyword,
        "if" => TT::IfKeyword,
        "import" => TT::ImportKeyword,
        "in" => TT::InKeyword,
        "inout" => TT::InOutKeyword,
        "int" => TT::IntKeyword,
        "interface" => TT::InterfaceKeyword,
        "int8" => TT::Int8Keyword,
        "int16" => TT::Int16Keyword,
        "int32" => TT::Int32Keyword,
        "int64" => TT::Int64Keyword,
        "is" => TT::IsKeyword,
        "mixin" => TT::MixinKeyword,
        "namespace" => TT::NamespaceKeyword,
        "not" => TT::NotKeyword,
        "null" => TT::NullKeyword,
        "or" => TT::OrKeyword,
        "out" => TT::OutKeyword,
        "override" => TT::OverrideKeyword,
        "private" => TT::PrivateKeyword,
        "property" => TT::PropertyKeyword,
        "protected" => TT::ProtectedKeyword,
        "return" => TT::ReturnKeyword,
        "set" => TT::SetKeyword,
        "shared" => TT::SharedKeyword,
        "super" => TT::SuperKeyword,
        "switch" => TT::SwitchKeyword,
        "this" => TT::ThisKeyword,
        "true" => TT::TrueKeyword,
        "try" => TT::TryKeyword,
        "typedef" => TT::TypeDefKeyword,
        "uint" => TT::UintKeyword,
        "uint8" => TT::Uint8Keyword,
        "uint16" => TT::Uint16Keyword,
        "uint32" => TT::Uint32Keyword,
        "uint64" => TT::Uint64Keyword,
        "void" => TT::VoidKeyword,
        "while" => TT::WhileKeyword,
        "xor" => TT::XorKeyword,
        // Anything not in the table is a user identifier; the consumed
        // string is moved into the token.
        _ => TT::Identifier(c),
    };
    f(token_type, start_pos, chars.real_position);
}
/// Lexes a string literal: normal (`'...'` / `"..."`) or heredoc (`"""..."""`).
///
/// `opening_char` is the quote that started the literal; the caller has only
/// peeked it, so it is consumed here. When `heredoc` is true, two extra quote
/// characters are consumed for the `"""` opener and closer. An unterminated
/// literal is reported through `log` as `Message::UnclosedStringLiteral`.
fn lex_string(
    chars: &mut StringWalker,
    opening_char: &char,
    heredoc: bool,
    log: &mut dyn FnMut(Message, Span),
    f: &mut dyn FnMut(TokenType, usize, usize),
) {
    let start_pos = chars.real_position;
    // Consume the opening quote (plus the heredoc's two additional quotes).
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }
    // `length` counts source characters in the body; `string_length` counts
    // decoded output characters (an escape collapses two source characters
    // into one output character). `last_was_control` tracks a pending `\`.
    let mut length: i32 = 0;
    let mut string_length = 0;
    let mut last_was_control = false;
    // We loop twice here. In the first loop we get the number of characters to read, the number of
    // characters the string should be, and whether it's valid. This reduces the amount of allocations
    // we need to do to read a string.
    loop {
        let p = chars.peek();
        match p {
            None => {
                // Ran off the end of input without a closing quote.
                log(
                    Message::UnclosedStringLiteral,
                    Span::new(chars.peek_position - 1, chars.peek_position),
                );
                break;
            }
            Some(&'\\') if !last_was_control => {
                // Escape introducer: contributes to `length` only; the
                // escaped character is counted on the next iteration.
                last_was_control = true;
                length += 1;
            }
            Some(c) => {
                if c == opening_char && !last_was_control {
                    if heredoc {
                        // NOTE(review): each peek() call advances the peek
                        // cursor, so these two calls inspect the next two
                        // characters; when they are NOT both '"' the cursor
                        // has still moved past them — confirm StringWalker's
                        // counting stays in sync in that case.
                        if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
                            break;
                        } else {
                            // NOTE(review): control also falls through to the
                            // increments below, so this quote appears to be
                            // counted twice — verify against heredoc tests.
                            length += 1;
                            string_length += 1;
                        }
                    } else {
                        // Closing quote of a normal literal.
                        break;
                    }
                }
                length += 1;
                string_length += 1;
                last_was_control = false;
            }
        }
    }
    chars.reset_peek();
    // Second pass: consume exactly `length` characters and decode the
    // supported escapes (\0 \n \r \t, and \\ for a literal backslash).
    // NOTE(review): `last_was_control` is reused from the counting loop; it
    // is false when the loop broke on a closing quote, but could be true if
    // input ended right after a lone backslash — confirm intended.
    let mut s: String = String::with_capacity(string_length);
    for _ in 0..length {
        let p = chars.next().unwrap();
        match p {
            '\\' => {
                if last_was_control {
                    s.push('\\');
                } else {
                    // Start of an escape: remember it and skip the reset
                    // below so the next character is treated as escaped.
                    last_was_control = true;
                    continue;
                }
            }
            '0' if last_was_control => s.push('\0'),
            'n' if last_was_control => s.push('\n'),
            'r' if last_was_control => s.push('\r'),
            't' if last_was_control => s.push('\t'),
            // Unrecognized escapes (and ordinary characters) are pushed
            // through verbatim.
            _ => s.push(p),
        };
        last_was_control = false;
    }
    // The two passes must agree on the decoded size; a mismatch is a bug.
    assert_eq!(s.len(), string_length);
    chars.reset_peek();
    // Consume the closing quote (and the heredoc's two extra quotes).
    chars.next();
    if heredoc {
        chars.next();
        chars.next();
    }
    f(TT::StringLiteral(s), start_pos, chars.real_position);
}
/// Lexes the complete source string `s` into a flat token stream.
///
/// Problems (unexpected characters, unclosed string literals, ...) are
/// reported through `log` and lexing continues, so one pass collects as many
/// diagnostics as possible. The returned vector always ends with a single
/// `TT::EndOfFile` token. Whitespace is emitted as tokens rather than being
/// skipped, so callers can reconstruct trivia.
pub fn lex(s: &str, log: &mut dyn FnMut(Message, Span)) -> Vec<LexToken> {
    let mut tokens: Vec<LexToken> = Vec::new();
    let mut chars = StringWalker::create(s);
    let mut reading = true;
    // Shared sink turning (token, start, end) triples into `LexToken`s.
    let mut add_token = |token_type: TokenType, start: usize, end: usize| {
        tokens.push(LexToken {
            token_type,
            span: Span::new(start, end),
        })
    };
    while reading {
        let p = chars.peek().cloned();
        match p {
            Some(c) => match c {
                // Each whitespace character becomes its own token.
                ' ' | '\t' | '\r' | '\n' => {
                    chars.next();
                    add_token(TT::WhiteSpace, chars.real_position - 1, chars.real_position);
                }
                '=' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::EqualsEquals,
                        TT::Equals,
                        start_pos,
                        &mut add_token,
                    )
                }
                '+' => lex_eq_rep_or(
                    &mut chars,
                    '+',
                    TT::PlusEquals,
                    TT::PlusPlus,
                    TT::Plus,
                    &mut add_token,
                ),
                '-' => lex_eq_rep_or(
                    &mut chars,
                    '-',
                    TT::MinusEquals,
                    TT::MinusMinus,
                    TT::Minus,
                    &mut add_token,
                ),
                // `*`, `*=`, `**`, `**=`: look one character ahead to pick
                // the single- or double-star family before deferring to
                // `lex_eq_or` for the optional trailing `=`.
                '*' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'*') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::StarStarEquals,
                            TT::StarStar,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::StarEquals,
                            TT::Star,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                '/' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::SlashEquals,
                        TT::Slash,
                        start_pos,
                        &mut add_token,
                    );
                }
                '%' => {
                    let start_pos = chars.real_position;
                    lex_eq_or(
                        &mut chars,
                        TT::PercentEquals,
                        TT::Percent,
                        start_pos,
                        &mut add_token,
                    );
                }
                '|' => lex_eq_rep_or(
                    &mut chars,
                    '|',
                    TT::LineEquals,
                    TT::LineLine,
                    TT::VerticalLine,
                    &mut add_token,
                ),
                '&' => lex_eq_rep_or(
                    &mut chars,
                    '&',
                    TT::AmpersandEquals,
                    TT::AmpersandAmpersand,
                    TT::Ampersand,
                    &mut add_token,
                ),
                '^' => lex_eq_rep_or(
                    &mut chars,
                    '^',
                    TT::RoofEquals,
                    TT::RoofRoof,
                    TT::Roof,
                    &mut add_token,
                ),
                // `<`, `<=`, `<<`, `<<=`.
                '<' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'<') {
                        chars.next();
                        lex_eq_or(
                            &mut chars,
                            TT::LeftLeftEquals,
                            TT::LeftLeft,
                            start_pos,
                            &mut add_token,
                        )
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::LessThanEquals,
                            TT::LessThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                // `>`, `>=`, `>>`, `>>=`, `>>>`, `>>>=`. The two nested
                // peeks look at the second and third characters (each peek
                // advances the peek cursor).
                '>' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&'>') {
                        if chars.peek() == Some(&'>') {
                            chars.next();
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightRightEquals,
                                TT::RightRightRight,
                                start_pos,
                                &mut add_token,
                            )
                        } else {
                            chars.next();
                            lex_eq_or(
                                &mut chars,
                                TT::RightRightEquals,
                                TT::RightRight,
                                start_pos,
                                &mut add_token,
                            )
                        }
                    } else {
                        lex_eq_or(
                            &mut chars,
                            TT::GreaterThanEquals,
                            TT::GreaterThan,
                            start_pos,
                            &mut add_token,
                        )
                    }
                }
                // `!`, `!=`, `!is`.
                '!' => {
                    let start_pos = chars.real_position;
                    let next = chars.peek();
                    if next == Some(&'=') {
                        chars.next();
                        chars.next();
                        add_token(TT::NotEquals, start_pos, chars.real_position);
                    } else if next == Some(&'i') && chars.peek() == Some(&'s') {
                        // NOTE(review): only two characters are checked, so
                        // `!is` is also matched when more identifier
                        // characters follow (e.g. `!island`) — confirm this
                        // is the intended tokenization.
                        chars.next();
                        chars.next();
                        chars.next();
                        add_token(TT::NotIsKeyword, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::ExclamationMark, start_pos, chars.real_position);
                    }
                }
                '~' => lex_and_consume(&mut chars, TT::Tilde, &mut add_token),
                '@' => lex_and_consume(&mut chars, TT::AtSymbol, &mut add_token),
                ';' => lex_and_consume(&mut chars, TT::Semicolon, &mut add_token),
                ':' => {
                    let start_pos = chars.real_position;
                    if chars.peek() == Some(&':') {
                        chars.next();
                        chars.next();
                        add_token(TT::ColonColon, start_pos, chars.real_position);
                    } else {
                        chars.next();
                        add_token(TT::Colon, start_pos, chars.real_position);
                    }
                }
                '(' => lex_and_consume(&mut chars, TT::OpenBracket, &mut add_token),
                ')' => lex_and_consume(&mut chars, TT::CloseBracket, &mut add_token),
                '{' => lex_and_consume(&mut chars, TT::OpenCurlyBracket, &mut add_token),
                '}' => lex_and_consume(&mut chars, TT::CloseCurlyBracket, &mut add_token),
                '[' => lex_and_consume(&mut chars, TT::OpenBlockBracket, &mut add_token),
                ']' => lex_and_consume(&mut chars, TT::CloseBlockBracket, &mut add_token),
                ',' => lex_and_consume(&mut chars, TT::Comma, &mut add_token),
                '.' => lex_and_consume(&mut chars, TT::Dot, &mut add_token),
                '?' => lex_and_consume(&mut chars, TT::QuestionMark, &mut add_token),
                // Inclusive ranges (`..=`) are required: the exclusive
                // `'0'..'9'` / `'a'..'z'` forms would reject '9', 'z' and
                // 'Z', sending them to the unexpected-character path.
                '0'..='9' => lex_numeric(&mut chars, &mut add_token),
                'a'..='z' | 'A'..='Z' | '_' => lex_keyword_or_identifier(&mut chars, &mut add_token),
                '\'' => lex_string(&mut chars, &'\'', false, log, &mut add_token),
                // Heredoc: the guard peeks the second and third characters;
                // three quotes in a row start a `"""..."""` literal.
                '"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
                    lex_string(&mut chars, &'"', true, log, &mut add_token)
                }
                '"' => lex_string(&mut chars, &'"', false, log, &mut add_token),
                _ => log(
                    Message::UnexpectedCharacter(c),
                    Span::new(chars.real_position, chars.real_position + 1),
                ),
            },
            None => {
                // End of input: emit the terminal token and stop.
                add_token(TT::EndOfFile, chars.real_position, chars.real_position);
                reading = false;
            }
        }
    }
    tokens
}