atelier/bake/lib/syn/tok.rs
2025-01-16 20:05:09 -05:00

201 lines
3.5 KiB
Rust

use std::ops::Range;
/// Half-open range locating a token inside the source text.
///
/// The tokenizer in this file fills it with the byte offsets at which a token
/// starts and ends.
#[derive(Debug, PartialEq, Eq)]
pub struct Span(Range<usize>);
impl From<Range<usize>> for Span {
fn from(value: Range<usize>) -> Self {
Self(value)
}
}
/// Reasons the tokenizer can fail to recognise a token.
#[derive(Debug, PartialEq, Eq)]
pub enum TokenError {
    /// A numeric literal ran into a byte that is not valid there,
    /// e.g. the `a` in `12abc`.
    InvalidByteInNumericLiteral,
}
/// The category of a token, together with any payload it carries.
#[derive(Debug, PartialEq)]
pub enum TokenKind {
    /// A run of ASCII whitespace; holds the exact source text of the run.
    Whitespace(String),
    /// An opening parenthesis; holds the byte that was read (`b'('`).
    LPar(u8),
    /// A closing parenthesis; holds the byte that was read (`b')'`).
    RPar(u8),
    /// A literal or name, see [`Atom`].
    Atom(Atom),
    /// Input that could not be tokenized, see [`TokenError`].
    Error(TokenError),
}
/// A single lexeme paired with the region of the source it was read from.
#[derive(Debug, PartialEq)]
pub struct Token {
    /// What was recognised.
    pub kind: TokenKind,
    /// Where in the source it was recognised.
    pub span: Span,
}

// `Token` has no inherent methods yet; the empty block is kept as a
// placeholder.
impl Token {}
/// A leaf value of the language: anything that is not a parenthesis or
/// whitespace.
#[derive(Debug, PartialEq)]
pub enum Atom {
    /// A name beginning with `:`; the stored text includes the leading colon.
    Keyword(String),
    /// Any other name.
    Identifier(String),
    /// A string literal.
    // NOTE(review): nothing in this file produces this variant yet.
    String(String),
    /// A numeric literal without a `.`.
    Integer(i64),
    /// A numeric literal containing a `.`.
    Float(f64),
}
/// Reports whether `b` may appear inside a numeric literal: an ASCII digit or
/// a decimal point.
fn is_numlit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'.')
}
/// Reports whether `byte` may continue an identifier or keyword, i.e. it is
/// anything other than a literal terminator.
fn is_identifier(byte: u8) -> bool {
    let terminates = ends_literal(byte);
    !terminates
}
/// Reports whether `b` terminates the literal being scanned: a parenthesis or
/// any ASCII whitespace byte.
fn ends_literal(b: u8) -> bool {
    matches!(b, b'(' | b')') || b.is_ascii_whitespace()
}
/// Tokenizer state: the text being scanned plus a cursor into it.
pub struct Tokens<'a> {
    /// Byte offset into `corpus` of the next byte to read.
    at: usize,
    /// The source text being tokenized.
    corpus: &'a str,
}
impl<'a> Tokens<'a> {
pub fn new(corpus: &'a str) -> Self {
Self { corpus, at: 0 }
}
fn head(&self) -> Option<u8> {
if self.at >= self.corpus.len() {
return None;
}
self.corpus[self.at..].bytes().next()
}
fn pop_head(&mut self) -> Option<u8> {
let out = self.head()?;
self.at += 1;
Some(out)
}
fn chomp_while(&mut self, mut f: impl FnMut(u8) -> bool) {
loop {
let ch = match self.pop_head() {
None => return,
Some(ch) => ch,
};
if !f(ch) {
self.at -= 1;
return;
}
}
}
fn next(&mut self) -> Option<Token> {
let start = self.at;
let ch = self.pop_head()?;
let tk = match ch {
b'(' => TokenKind::LPar(ch),
b')' => TokenKind::RPar(ch),
_ if ch.is_ascii_whitespace() => {
self.chomp_while(|b| b.is_ascii_whitespace());
TokenKind::Whitespace(self.corpus[start..self.at].to_string())
},
_ if ch.is_ascii_digit() => {
let mut is_float = false;
self.chomp_while(|b| {
if b == b'.' {
is_float = true;
}
is_numlit(b)
});
if !self.head().map(ends_literal).unwrap_or(true) {
TokenKind::Error(TokenError::InvalidByteInNumericLiteral)
} else {
TokenKind::Atom(if is_float {
Atom::Float(
self.corpus[start..self.at].parse().unwrap(),
)
} else {
Atom::Integer(
self.corpus[start..self.at].parse().unwrap(),
)
})
}
},
b':' => {
self.chomp_while(is_identifier);
TokenKind::Atom(Atom::Keyword(
self.corpus[start..self.at].to_string(),
))
},
_ => {
self.chomp_while(is_identifier);
TokenKind::Atom(Atom::Identifier(
self.corpus[start..self.at].to_string(),
))
},
};
Some(Token {
kind: tk,
span: Span::from(start..self.at),
})
}
}
impl Iterator for Tokens<'_> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
Self::next(self)
}
}
#[cfg(test)]
mod test_tokenize {
    use super::{Atom, TokenKind, Tokens};

    /// Tokenizes `src` and keeps only the token kinds, dropping the spans.
    fn kinds(src: &str) -> Vec<TokenKind> {
        Tokens::new(src).map(|token| token.kind).collect()
    }

    /// A parenthesised keyword yields exactly three tokens.
    #[test]
    fn simple1() {
        let expected = vec![
            TokenKind::LPar(b'('),
            TokenKind::Atom(Atom::Keyword(String::from(":hello"))),
            TokenKind::RPar(b')'),
        ];
        assert_eq!(kinds("(:hello)"), expected);
    }

    /// Identifiers, whitespace, integers and floats are told apart.
    #[test]
    fn simple2() {
        let expected = vec![
            TokenKind::LPar(b'('),
            TokenKind::Atom(Atom::Identifier(String::from("->"))),
            TokenKind::Whitespace(String::from(" ")),
            TokenKind::Atom(Atom::Integer(1)),
            TokenKind::Whitespace(String::from(" ")),
            TokenKind::Atom(Atom::Float(2.4)),
            TokenKind::RPar(b')'),
        ];
        assert_eq!(kinds("(-> 1 2.4)"), expected);
    }
}