// Byte-oriented tokenizer for a small s-expression syntax: parentheses,
// whitespace runs, numeric literals, `:keywords` and identifiers.

use std::ops::Range;

/// Half-open byte range (`start..end`) of a token within the tokenized text.
#[derive(Eq, PartialEq, Debug)]
pub struct Span(Range<usize>);

impl From<Range<usize>> for Span {
    fn from(value: Range<usize>) -> Self {
        Self(value)
    }
}

/// Reasons a piece of input could not be turned into a valid token.
#[derive(Eq, PartialEq, Debug)]
pub enum TokenError {
    /// A numeric literal was followed by a byte that cannot end a literal, or
    /// its digits/dots do not form a value that fits the target numeric type
    /// (for example `1.2.3` or an integer that overflows `i64`).
    InvalidByteInNumericLiteral,
}

/// The classification of a single token.
#[derive(PartialEq, Debug)]
pub enum TokenKind {
    /// A maximal run of ASCII whitespace, stored verbatim.
    Whitespace(String),
    /// An opening parenthesis; carries the matched byte.
    LPar(u8),
    /// A closing parenthesis; carries the matched byte.
    RPar(u8),
    /// A literal value or name.
    Atom(Atom),
    /// Input that could not be tokenized.
    Error(TokenError),
}

/// A token together with the byte range it was read from.
#[derive(PartialEq, Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

impl Token {}

/// Leaf values of the syntax.
#[derive(PartialEq, Debug)]
pub enum Atom {
    /// A name starting with `:`; the stored text includes the leading colon.
    Keyword(String),
    /// Any other run of non-delimiter bytes.
    Identifier(String),
    /// A string value. The tokenizer in this file never produces this variant.
    String(String),
    /// A numeric literal without a `.`.
    Integer(i64),
    /// A numeric literal containing a `.`.
    Float(f64),
}

/// Returns `true` for bytes that may appear inside a numeric literal.
fn is_numlit(b: u8) -> bool {
    b.is_ascii_digit() || b == b'.'
}

/// Returns `true` for bytes that may continue an identifier or keyword.
fn is_identifier(b: u8) -> bool {
    !ends_literal(b)
}

/// Returns `true` for bytes that terminate a literal: parentheses and ASCII
/// whitespace.
fn ends_literal(b: u8) -> bool {
    matches!(b, b'(' | b')') || b.is_ascii_whitespace()
}

/// Iterator that splits a source string into [`Token`]s.
pub struct Tokens<'a> {
    /// Byte offset of the next unread byte in `corpus`.
    at: usize,
    /// The text being tokenized.
    corpus: &'a str,
}

impl<'a> Tokens<'a> {
    /// Creates a tokenizer positioned at the start of `corpus`.
    pub fn new(corpus: &'a str) -> Self {
        Self { corpus, at: 0 }
    }

    /// Returns the next unread byte without consuming it, or `None` at the end
    /// of input.
    ///
    /// Reads from the byte view rather than slicing the `str`: the cursor
    /// advances one byte at a time, so it may sit in the middle of a
    /// multi-byte character, where slicing a `str` would panic.
    fn head(&self) -> Option<u8> {
        self.corpus.as_bytes().get(self.at).copied()
    }

    /// Consumes and returns the next byte, or `None` at the end of input.
    fn pop_head(&mut self) -> Option<u8> {
        let out = self.head()?;
        self.at += 1;
        Some(out)
    }

    /// Advances the cursor while `f` accepts the byte under it.
    ///
    /// `f` is also invoked on the first rejected byte, which is left unread.
    fn chomp_while(&mut self, mut f: impl FnMut(u8) -> bool) {
        while let Some(b) = self.head() {
            if !f(b) {
                return;
            }
            self.at += 1;
        }
    }

    /// Returns the text from `start` up to the cursor.
    ///
    /// Every token begins where the previous one ended and ends either at the
    /// end of input or right after/before an ASCII byte, so both offsets are
    /// character boundaries and the slice cannot panic.
    fn lexeme(&self, start: usize) -> &'a str {
        let corpus: &'a str = self.corpus;
        &corpus[start..self.at]
    }

    /// Lexes the rest of a numeric literal whose first digit, at `start`, has
    /// already been consumed.
    ///
    /// Produces a float when the literal contains a `.`, an integer otherwise,
    /// and an error token when the literal is followed by a non-delimiter byte
    /// or does not parse (several dots, integer overflow).
    fn lex_number(&mut self, start: usize) -> TokenKind {
        let mut is_float = false;
        self.chomp_while(|b| {
            is_float |= b == b'.';
            is_numlit(b)
        });

        // The literal must be followed by a delimiter or the end of input.
        // On failure the offending bytes are left for the next token.
        if !self.head().map_or(true, ends_literal) {
            return TokenKind::Error(TokenError::InvalidByteInNumericLiteral);
        }

        let text = self.lexeme(start);
        let atom = if is_float {
            text.parse::<f64>().ok().map(Atom::Float)
        } else {
            text.parse::<i64>().ok().map(Atom::Integer)
        };
        // Report unparsable literals instead of panicking on them.
        atom.map_or(
            TokenKind::Error(TokenError::InvalidByteInNumericLiteral),
            TokenKind::Atom,
        )
    }

    /// Reads one token starting at the cursor, or returns `None` when the
    /// input is exhausted.
    fn next_token(&mut self) -> Option<Token> {
        let start = self.at;
        let ch = self.pop_head()?;
        let kind = match ch {
            b'(' => TokenKind::LPar(ch),
            b')' => TokenKind::RPar(ch),
            _ if ch.is_ascii_whitespace() => {
                self.chomp_while(|b| b.is_ascii_whitespace());
                TokenKind::Whitespace(self.lexeme(start).to_string())
            }
            _ if ch.is_ascii_digit() => self.lex_number(start),
            b':' => {
                self.chomp_while(is_identifier);
                TokenKind::Atom(Atom::Keyword(self.lexeme(start).to_string()))
            }
            _ => {
                self.chomp_while(is_identifier);
                TokenKind::Atom(Atom::Identifier(self.lexeme(start).to_string()))
            }
        };
        Some(Token {
            kind,
            span: Span::from(start..self.at),
        })
    }
}

impl Iterator for Tokens<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        self.next_token()
    }
}

#[cfg(test)]
mod test_tokenize {
    use super::{Atom, TokenError, TokenKind, Tokens};

    #[test]
    fn simple1() {
        let tokens: Vec<_> = Tokens::new("(:hello)").map(|tk| tk.kind).collect();
        assert_eq!(
            tokens,
            [
                TokenKind::LPar(b'('),
                TokenKind::Atom(Atom::Keyword(":hello".to_string())),
                TokenKind::RPar(b')'),
            ]
        )
    }

    #[test]
    fn simple2() {
        let tokens: Vec<_> = Tokens::new("(-> 1 2.4)").map(|t| t.kind).collect();
        assert_eq!(
            tokens,
            [
                TokenKind::LPar(b'('),
                TokenKind::Atom(Atom::Identifier("->".to_string())),
                TokenKind::Whitespace(" ".to_string()),
                TokenKind::Atom(Atom::Integer(1)),
                TokenKind::Whitespace(" ".to_string()),
                TokenKind::Atom(Atom::Float(2.4)),
                TokenKind::RPar(b')'),
            ]
        )
    }

    #[test]
    fn non_ascii_identifier() {
        let tokens: Vec<_> = Tokens::new("(λ)").map(|t| t.kind).collect();
        assert_eq!(
            tokens,
            [
                TokenKind::LPar(b'('),
                TokenKind::Atom(Atom::Identifier("λ".to_string())),
                TokenKind::RPar(b')'),
            ]
        )
    }

    #[test]
    fn unparsable_number_is_an_error_token() {
        let tokens: Vec<_> = Tokens::new("1.2.3").map(|t| t.kind).collect();
        assert_eq!(
            tokens,
            [TokenKind::Error(TokenError::InvalidByteInNumericLiteral)]
        )
    }
}