351 lines
12 KiB
Rust
351 lines
12 KiB
Rust
use super::lex_numerical::lex_numeric;
|
|
use crate::parsing::lex_tokens::LexToken;
|
|
use itertools::{Itertools, MultiPeek};
|
|
use std::str::Chars;
|
|
|
|
#[inline(always)]
|
|
fn lex_and_consume(chars: &mut MultiPeek<Chars>, eq: LexToken) -> LexToken {
|
|
chars.next();
|
|
eq
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn lex_eq_or(chars: &mut MultiPeek<Chars>, eq: LexToken, or: LexToken) -> LexToken {
|
|
chars.next();
|
|
if let Some('=') = chars.peek() {
|
|
chars.next();
|
|
eq
|
|
} else {
|
|
or
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn lex_eq_rep_or(
|
|
chars: &mut MultiPeek<Chars>,
|
|
v: char,
|
|
eq: LexToken,
|
|
rep: LexToken,
|
|
or: LexToken,
|
|
) -> LexToken {
|
|
chars.next();
|
|
return match chars.peek() {
|
|
Some(c) => {
|
|
if *c == v {
|
|
chars.next();
|
|
rep
|
|
} else if *c == '=' {
|
|
chars.next();
|
|
eq
|
|
} else {
|
|
or
|
|
}
|
|
}
|
|
None => or,
|
|
};
|
|
}
|
|
|
|
// Short alias for `LexToken`, used to keep the token tables in this file compact.
type LT = LexToken;
|
|
|
|
fn lex_keyword_or_identifier(chars: &mut MultiPeek<Chars>) -> LexToken {
|
|
let mut reading = true;
|
|
let mut length = 1;
|
|
while reading {
|
|
match chars.peek() {
|
|
Some(c) => match c {
|
|
'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => {
|
|
length += 1;
|
|
}
|
|
_ => {
|
|
reading = false;
|
|
}
|
|
},
|
|
None => {
|
|
reading = false;
|
|
}
|
|
};
|
|
}
|
|
chars.reset_peek();
|
|
let c: String = chars.take(length).collect();
|
|
|
|
match c.as_str() {
|
|
"and" => LT::AndKeyword,
|
|
"abstract" => LT::AbstractKeyword,
|
|
"auto" => LT::AutoKeyword,
|
|
"bool" => LT::BoolKeyword,
|
|
"break" => LT::BreakKeyword,
|
|
"case" => LT::CaseKeyword,
|
|
"cast" => LT::CastKeyword,
|
|
"catch" => LT::CatchKeyword,
|
|
"class" => LT::ClassKeyword,
|
|
"const" => LT::ConstKeyword,
|
|
"continue" => LT::ContinueKeyword,
|
|
"default" => LT::DefaultKeyword,
|
|
"do" => LT::DoKeyword,
|
|
"double" => LT::DoubleKeyword,
|
|
"else" => LT::ElseKeyword,
|
|
"enum" => LT::EnumKeyword,
|
|
"explicit" => LT::ExplicitKeyword,
|
|
"external" => LT::ExternalKeyword,
|
|
"false" => LT::FalseKeyword,
|
|
"final" => LT::FinalKeyword,
|
|
"float" => LT::FloatKeyword,
|
|
"for" => LT::ForKeyword,
|
|
"from" => LT::FromKeyword,
|
|
"funcdef" => LT::FuncDefKeyword,
|
|
"function" => LT::FunctionKeyword,
|
|
"get" => LT::GetKeyword,
|
|
"if" => LT::IfKeyword,
|
|
"import" => LT::ImportKeyword,
|
|
"in" => LT::InKeyword,
|
|
"inout" => LT::InOutKeyword,
|
|
"int" => LT::IntKeyword,
|
|
"interface" => LT::InterfaceKeyword,
|
|
"int8" => LT::Int8Keyword,
|
|
"int16" => LT::Int16Keyword,
|
|
"int32" => LT::Int32Keyword,
|
|
"int64" => LT::Int64Keyword,
|
|
"is" => LT::IsKeyword,
|
|
"mixin" => LT::MixinKeyword,
|
|
"namespace" => LT::NamespaceKeyword,
|
|
"not" => LT::NotKeyword,
|
|
"null" => LT::NullKeyword,
|
|
"or" => LT::OrKeyword,
|
|
"out" => LT::OutKeyword,
|
|
"override" => LT::OverrideKeyword,
|
|
"private" => LT::PrivateKeyword,
|
|
"property" => LT::PropertyKeyword,
|
|
"protected" => LT::ProtectedKeyword,
|
|
"return" => LT::ReturnKeyword,
|
|
"set" => LT::SetKeyword,
|
|
"shared" => LT::SharedKeyword,
|
|
"super" => LT::SuperKeyword,
|
|
"switch" => LT::SwitchKeyword,
|
|
"this" => LT::ThisKeyword,
|
|
"true" => LT::TrueKeyword,
|
|
"try" => LT::TryKeyword,
|
|
"typedef" => LT::TypeDefKeyword,
|
|
"uint" => LT::UintKeyword,
|
|
"uint8" => LT::Uint8Keyword,
|
|
"uint16" => LT::Uint16Keyword,
|
|
"uint32" => LT::Uint32Keyword,
|
|
"uint64" => LT::Uint64Keyword,
|
|
"void" => LT::VoidKeyword,
|
|
"while" => LT::WhileKeyword,
|
|
"xor" => LT::XorKeyword,
|
|
_ => LT::Identifier(c),
|
|
}
|
|
}
|
|
|
|
fn lex_string(chars: &mut MultiPeek<Chars>, opening_char: &char, heredoc: bool) -> LexToken {
|
|
chars.next();
|
|
if heredoc {
|
|
chars.next();
|
|
chars.next();
|
|
}
|
|
let mut length: i32 = 0;
|
|
let mut string_length = 0;
|
|
let mut last_was_control = false;
|
|
|
|
// We loop twice here. In the first loop we get the number of characters to read, the number of
|
|
// characters the string should be, and whether it's valid. This reduces the amount of allocations
|
|
// we need to do to read a string.
|
|
loop {
|
|
let p = chars.peek();
|
|
match p {
|
|
None => {
|
|
// TODO: log error. Strings need to be closed, EOF should error.
|
|
unimplemented!();
|
|
}
|
|
Some(&'\\') if !last_was_control => {
|
|
last_was_control = true;
|
|
length += 1;
|
|
}
|
|
Some(c) => {
|
|
if c == opening_char && !last_was_control {
|
|
if heredoc {
|
|
if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') {
|
|
break;
|
|
} else {
|
|
length += 1;
|
|
string_length += 1;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
length += 1;
|
|
string_length += 1;
|
|
last_was_control = false;
|
|
}
|
|
}
|
|
}
|
|
chars.reset_peek();
|
|
let mut s: String = String::with_capacity(string_length);
|
|
for _ in 0..length {
|
|
let p = chars.next().unwrap();
|
|
match p {
|
|
'\\' => {
|
|
if last_was_control {
|
|
s.push('\\');
|
|
} else {
|
|
last_was_control = true;
|
|
continue;
|
|
}
|
|
}
|
|
'0' if last_was_control => s.push('\0'),
|
|
'n' if last_was_control => s.push('\n'),
|
|
'r' if last_was_control => s.push('\r'),
|
|
't' if last_was_control => s.push('\t'),
|
|
_ => s.push(p),
|
|
};
|
|
last_was_control = false;
|
|
}
|
|
assert_eq!(s.len(), string_length);
|
|
chars.reset_peek();
|
|
chars.next();
|
|
if heredoc {
|
|
chars.next();
|
|
chars.next();
|
|
}
|
|
|
|
LT::StringLiteral(s)
|
|
}
|
|
|
|
pub fn lex(s: &str) -> Vec<LT> {
|
|
let mut tokens: Vec<LT> = Vec::new();
|
|
let mut chars = s.chars().multipeek();
|
|
let mut reading = true;
|
|
while reading {
|
|
let p = chars.peek().cloned();
|
|
match p {
|
|
Some(c) => match c {
|
|
' ' | '\t' | '\r' | '\n' => {
|
|
chars.next();
|
|
tokens.push(LT::WhiteSpace);
|
|
}
|
|
'=' => tokens.push(lex_eq_or(&mut chars, LT::EqualsEquals, LT::Equals)),
|
|
'+' => tokens.push(lex_eq_rep_or(
|
|
&mut chars,
|
|
'+',
|
|
LT::PlusEquals,
|
|
LT::PlusPlus,
|
|
LT::Plus,
|
|
)),
|
|
'-' => tokens.push(lex_eq_rep_or(
|
|
&mut chars,
|
|
'-',
|
|
LT::MinusEquals,
|
|
LT::MinusMinus,
|
|
LT::Minus,
|
|
)),
|
|
'*' => {
|
|
if chars.peek() == Some(&'*') {
|
|
chars.next();
|
|
tokens.push(lex_eq_or(&mut chars, LT::StarStarEquals, LT::StarStar))
|
|
} else {
|
|
tokens.push(lex_eq_or(&mut chars, LT::StarEquals, LT::Star))
|
|
}
|
|
}
|
|
'/' => tokens.push(lex_eq_or(&mut chars, LT::SlashEquals, LT::Slash)),
|
|
'%' => tokens.push(lex_eq_or(&mut chars, LT::PercentEquals, LT::Percent)),
|
|
'|' => tokens.push(lex_eq_rep_or(
|
|
&mut chars,
|
|
'|',
|
|
LT::LineEquals,
|
|
LT::LineLine,
|
|
LT::VerticalLine,
|
|
)),
|
|
'&' => tokens.push(lex_eq_rep_or(
|
|
&mut chars,
|
|
'&',
|
|
LT::AmpersandEquals,
|
|
LT::AmpersandAmpersand,
|
|
LT::Ampersand,
|
|
)),
|
|
'^' => tokens.push(lex_eq_rep_or(
|
|
&mut chars,
|
|
'^',
|
|
LT::RoofEquals,
|
|
LT::RoofRoof,
|
|
LT::Roof,
|
|
)),
|
|
'<' => {
|
|
if chars.peek() == Some(&'<') {
|
|
chars.next();
|
|
tokens.push(lex_eq_or(&mut chars, LT::LeftLeftEquals, LT::LeftLeft))
|
|
} else {
|
|
tokens.push(lex_eq_or(&mut chars, LT::LessThanEquals, LT::LessThan))
|
|
}
|
|
}
|
|
'>' => {
|
|
if chars.peek() == Some(&'>') {
|
|
if chars.peek() == Some(&'>') {
|
|
chars.next();
|
|
chars.next();
|
|
tokens.push(lex_eq_or(
|
|
&mut chars,
|
|
LT::RightRightRightEquals,
|
|
LT::RightRightRight,
|
|
))
|
|
} else {
|
|
chars.next();
|
|
tokens.push(lex_eq_or(&mut chars, LT::RightRightEquals, LT::RightRight))
|
|
}
|
|
} else {
|
|
tokens.push(lex_eq_or(
|
|
&mut chars,
|
|
LT::GreaterThanEquals,
|
|
LT::GreaterThan,
|
|
))
|
|
}
|
|
}
|
|
'!' => {
|
|
let next = chars.peek();
|
|
if next == Some(&'=') {
|
|
chars.next();
|
|
chars.next();
|
|
tokens.push(LT::NotEquals);
|
|
} else if next == Some(&'i') && chars.peek() == Some(&'s') {
|
|
chars.next();
|
|
chars.next();
|
|
chars.next();
|
|
tokens.push(LT::NotIsKeyword);
|
|
} else {
|
|
chars.next();
|
|
tokens.push(LT::ExclamationMark);
|
|
}
|
|
}
|
|
|
|
'~' => tokens.push(lex_and_consume(&mut chars, LT::Tilde)),
|
|
'@' => tokens.push(lex_and_consume(&mut chars, LT::AtSymbol)),
|
|
';' => tokens.push(lex_and_consume(&mut chars, LT::Semicolon)),
|
|
':' => tokens.push(lex_and_consume(&mut chars, LT::Colon)),
|
|
|
|
'(' => tokens.push(lex_and_consume(&mut chars, LT::OpenBracket)),
|
|
')' => tokens.push(lex_and_consume(&mut chars, LT::CloseBracket)),
|
|
'{' => tokens.push(lex_and_consume(&mut chars, LT::OpenCurlyBracket)),
|
|
'}' => tokens.push(lex_and_consume(&mut chars, LT::CloseCurlyBracket)),
|
|
'[' => tokens.push(lex_and_consume(&mut chars, LT::OpenBlockBracket)),
|
|
']' => tokens.push(lex_and_consume(&mut chars, LT::CloseBlockBracket)),
|
|
|
|
'0'..'9' => tokens.push(lex_numeric(&mut chars)),
|
|
'a'..'z' | 'A'..'Z' | '_' => tokens.push(lex_keyword_or_identifier(&mut chars)),
|
|
'\'' => tokens.push(lex_string(&mut chars, &'\'', false)),
|
|
'"' if chars.peek() == Some(&'\"') && chars.peek() == Some(&'\"') => {
|
|
tokens.push(lex_string(&mut chars, &'"', true))
|
|
}
|
|
'"' => tokens.push(lex_string(&mut chars, &'"', false)),
|
|
|
|
// TODO: Definitely not unreachable. Log a proper error here.
|
|
_ => unreachable!(),
|
|
},
|
|
None => {
|
|
tokens.push(LT::EndOfFile);
|
|
reading = false;
|
|
}
|
|
}
|
|
}
|
|
tokens
|
|
}
|