//! Byte-oriented tokenizer that splits text into parenthesis, whitespace, and atom tokens.

use std::ops::Range;
/// Half-open range of byte offsets (`start..end`) wrapped in a dedicated type.
///
/// The tokenizer in this file builds one from the cursor position before and
/// after each token, so it locates that token inside the tokenized text.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Span(Range<usize>);
impl From<Range<usize>> for Span {
|
|
fn from(value: Range<usize>) -> Self {
|
|
Self(value)
|
|
}
|
|
}
/// Reasons the tokenizer can fail to produce a well-formed token.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenError {
    /// A numeric literal was immediately followed by a byte that neither
    /// continues the literal nor delimits it (for example the `a` in `1a`).
    InvalidByteInNumericLiteral,
}

impl std::fmt::Display for TokenError {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidByteInNumericLiteral => f.write_str("invalid byte in numeric literal"),
        }
    }
}

impl std::error::Error for TokenError {}
#[derive(PartialEq, Debug)]
|
|
pub enum TokenKind {
|
|
Whitespace(String),
|
|
LPar(u8),
|
|
RPar(u8),
|
|
Atom(Atom),
|
|
Error(TokenError),
|
|
}
#[derive(PartialEq, Debug)]
|
|
pub struct Token {
|
|
pub kind: TokenKind,
|
|
pub span: Span,
|
|
}
impl Token {
|
|
}
/// A leaf value: a name or a literal.
///
/// `Eq` is not derived because the `Float` variant holds an `f64`.
#[derive(Clone, Debug, PartialEq)]
pub enum Atom {
    /// A keyword; the tokenizer in this file stores it with its leading `:`.
    Keyword(String),
    /// An identifier, stored as its source text.
    Identifier(String),
    /// A string value.
    // NOTE(review): no tokenizer branch visible in this file produces this
    // variant yet; confirm whether string literals are lexed elsewhere.
    String(String),
    /// A signed 64-bit integer literal.
    Integer(i64),
    /// A 64-bit floating-point literal.
    Float(f64),
}
/// Returns `true` when `b` is an ASCII digit or `.`, i.e. a byte accepted
/// inside a numeric literal.
fn is_numlit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'.')
}
fn is_identifier(b: u8) -> bool {
|
|
!ends_literal(b)
|
|
}
/// Returns `true` when `b` delimits a literal: a parenthesis or any byte for
/// which `u8::is_ascii_whitespace` holds.
fn ends_literal(b: u8) -> bool {
    matches!(b, b'(' | b')') || b.is_ascii_whitespace()
}
/// Tokenizer state: a borrowed input string plus a byte cursor into it.
#[derive(Clone, Debug)]
pub struct Tokens<'a> {
    /// Byte offset into `corpus` of the next unread byte.
    at: usize,
    /// The text being tokenized.
    corpus: &'a str,
}
impl<'a> Tokens<'a> {
|
|
pub fn new(corpus: &'a str) -> Self {
|
|
Self { corpus, at: 0 }
|
|
}
|
|
|
|
fn head(&self) -> Option<u8> {
|
|
if self.at >= self.corpus.len() {
|
|
return None;
|
|
}
|
|
|
|
self.corpus[self.at..].bytes().next()
|
|
}
|
|
|
|
fn pop_head(&mut self) -> Option<u8> {
|
|
let out = self.head()?;
|
|
self.at += 1;
|
|
|
|
Some(out)
|
|
}
|
|
|
|
fn chomp_while(&mut self, mut f: impl FnMut(u8) -> bool) {
|
|
loop {
|
|
let ch = match self.pop_head() {
|
|
None => return,
|
|
Some(ch) => ch,
|
|
};
|
|
|
|
if !f(ch) {
|
|
self.at -= 1;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
fn next(&mut self) -> Option<Token> {
|
|
let start = self.at;
|
|
let ch = self.pop_head()?;
|
|
|
|
let tk = match ch {
|
|
b'(' => TokenKind::LPar(ch),
|
|
b')' => TokenKind::RPar(ch),
|
|
_ if ch.is_ascii_whitespace() => {
|
|
self.chomp_while(|b| b.is_ascii_whitespace());
|
|
|
|
TokenKind::Whitespace(self.corpus[start..self.at].to_string())
|
|
},
|
|
_ if ch.is_ascii_digit() => {
|
|
let mut is_float = false;
|
|
|
|
self.chomp_while(|b| {
|
|
if b == b'.' {
|
|
is_float = true;
|
|
}
|
|
|
|
is_numlit(b)
|
|
});
|
|
|
|
if !self.head().map(ends_literal).unwrap_or(true) {
|
|
TokenKind::Error(TokenError::InvalidByteInNumericLiteral)
|
|
} else {
|
|
TokenKind::Atom(if is_float {
|
|
Atom::Float(
|
|
self.corpus[start..self.at].parse().unwrap(),
|
|
)
|
|
} else {
|
|
Atom::Integer(
|
|
self.corpus[start..self.at].parse().unwrap(),
|
|
)
|
|
})
|
|
}
|
|
},
|
|
b':' => {
|
|
self.chomp_while(is_identifier);
|
|
|
|
TokenKind::Atom(Atom::Keyword(
|
|
self.corpus[start..self.at].to_string(),
|
|
))
|
|
},
|
|
_ => {
|
|
self.chomp_while(is_identifier);
|
|
|
|
TokenKind::Atom(Atom::Identifier(
|
|
self.corpus[start..self.at].to_string(),
|
|
))
|
|
},
|
|
};
|
|
|
|
Some(Token {
|
|
kind: tk,
|
|
span: Span::from(start..self.at),
|
|
})
|
|
}
|
|
}
impl Iterator for Tokens<'_> {
|
|
type Item = Token;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
Self::next(self)
|
|
}
|
|
}
#[cfg(test)]
mod test_tokenize {
    use super::{Atom, TokenKind, Tokens};

    /// Tokenizes `src` and keeps only the token kinds, discarding the spans.
    fn kinds(src: &str) -> Vec<TokenKind> {
        Tokens::new(src).map(|tk| tk.kind).collect()
    }

    /// A parenthesised keyword yields `(`, the keyword with its colon, then `)`.
    #[test]
    fn simple1() {
        assert_eq!(
            kinds("(:hello)"),
            [
                TokenKind::LPar(b'('),
                TokenKind::Atom(Atom::Keyword(":hello".to_string())),
                TokenKind::RPar(b')'),
            ]
        )
    }

    /// Identifiers, whitespace runs, integers and floats are all emitted in order.
    #[test]
    fn simple2() {
        assert_eq!(
            kinds("(-> 1 2.4)"),
            [
                TokenKind::LPar(b'('),
                TokenKind::Atom(Atom::Identifier("->".to_string())),
                TokenKind::Whitespace(" ".to_string()),
                TokenKind::Atom(Atom::Integer(1)),
                TokenKind::Whitespace(" ".to_string()),
                TokenKind::Atom(Atom::Float(2.4)),
                TokenKind::RPar(b')'),
            ]
        )
    }

    /// Empty input produces no tokens at all.
    #[test]
    fn empty_input() {
        assert!(kinds("").is_empty());
    }
}