// atelier/bake/lib/syn/tok.rs

use std::ops::Range;

/// Half-open range of byte offsets; the tokenizer attaches one to every
/// token to record where in the input text the token was read from.
#[derive(Debug, PartialEq, Eq)]
pub struct Span(Range<usize>);

impl From<Range<usize>> for Span {
	/// Wraps `range` in a [`Span`] without modifying it.
	fn from(range: Range<usize>) -> Self {
		Span(range)
	}
}

/// Reasons the tokenizer can fail to produce a well-formed token.
#[derive(Debug, PartialEq, Eq)]
pub enum TokenError {
	/// A numeric literal was malformed, e.g. it was directly followed by a
	/// byte that cannot end a literal.
	InvalidByteInNumericLiteral,
}

/// What a token is, together with the payload extracted for it.
#[derive(Debug, PartialEq)]
pub enum TokenKind {
	/// A run of consecutive ASCII whitespace, kept verbatim.
	Whitespace(String),
	/// An opening parenthesis; carries the byte that was matched (`b'('`).
	LPar(u8),
	/// A closing parenthesis; carries the byte that was matched (`b')'`).
	RPar(u8),
	/// A keyword, identifier or literal value.
	Atom(Atom),
	/// The input at this position could not be turned into a valid token.
	Error(TokenError),
}

/// A single lexed token: its classification plus its location in the input.
#[derive(Debug, PartialEq)]
pub struct Token {
	/// What was recognised, including any payload.
	pub kind: TokenKind,
	/// Byte range of the input that this token covers.
	pub span: Span,
}

// No inherent methods are defined on `Token` yet.
impl Token {}

/// A leaf value: anything that is neither a parenthesis nor whitespace.
#[derive(Debug, PartialEq)]
pub enum Atom {
	/// A `:`-prefixed word; the stored text includes the leading colon.
	Keyword(String),
	/// Any other bare word.
	Identifier(String),
	/// A string value.
	/// NOTE(review): no construction site for this variant is visible in
	/// this part of the file — confirm where it is produced.
	String(String),
	/// A numeric literal without a `.`, parsed as `i64`.
	Integer(i64),
	/// A numeric literal containing a `.`, parsed as `f64`.
	Float(f64),
}

/// Reports whether `b` may appear inside a numeric literal: an ASCII
/// decimal digit or a decimal point.
fn is_numlit(b: u8) -> bool {
	matches!(b, b'0'..=b'9' | b'.')
}

/// Reports whether `b` may continue an identifier or keyword, i.e. it is
/// any byte that does not terminate a literal.
fn is_identifier(b: u8) -> bool {
	let terminates_literal = ends_literal(b);

	!terminates_literal
}

/// Reports whether `b` terminates the literal before it: either
/// parenthesis or any ASCII whitespace byte.
fn ends_literal(b: u8) -> bool {
	matches!(b, b'(' | b')') || b.is_ascii_whitespace()
}

/// Tokenizer state: a cursor walking over a borrowed piece of text.
pub struct Tokens<'a> {
	/// The text being tokenized.
	corpus: &'a str,
	/// Byte offset into `corpus` of the next byte to read.
	at: usize,
}
impl<'a> Tokens<'a> {
	pub fn new(corpus: &'a str) -> Self {
		Self { corpus, at: 0 }
	}

	fn head(&self) -> Option<u8> {
		if self.at >= self.corpus.len() {
			return None;
		}

		self.corpus[self.at..].bytes().next()
	}

	fn pop_head(&mut self) -> Option<u8> {
		let out = self.head()?;
		self.at += 1;

		Some(out)
	}

	fn chomp_while(&mut self, mut f: impl FnMut(u8) -> bool) {
		loop {
			let ch = match self.pop_head() {
				None => return,
				Some(ch) => ch,
			};

			if !f(ch) {
				self.at -= 1;
				return;
			}
		}
	}

	fn next(&mut self) -> Option<Token> {
		let start = self.at;
		let ch = self.pop_head()?;

		let tk = match ch {
			b'(' => TokenKind::LPar(ch),
			b')' => TokenKind::RPar(ch),
			_ if ch.is_ascii_whitespace() => {
				self.chomp_while(|b| b.is_ascii_whitespace());

				TokenKind::Whitespace(self.corpus[start..self.at].to_string())
			},
			_ if ch.is_ascii_digit() => {
				let mut is_float = false;

				self.chomp_while(|b| {
					if b == b'.' {
						is_float = true;
					}

					is_numlit(b)
				});

				if !self.head().map(ends_literal).unwrap_or(true) {
					TokenKind::Error(TokenError::InvalidByteInNumericLiteral)
				} else {
					TokenKind::Atom(if is_float {
						Atom::Float(
							self.corpus[start..self.at].parse().unwrap(),
						)
					} else {
						Atom::Integer(
							self.corpus[start..self.at].parse().unwrap(),
						)
					})
				}
			},
			b':' => {
				self.chomp_while(is_identifier);

				TokenKind::Atom(Atom::Keyword(
					self.corpus[start..self.at].to_string(),
				))
			},
			_ => {
				self.chomp_while(is_identifier);

				TokenKind::Atom(Atom::Identifier(
					self.corpus[start..self.at].to_string(),
				))
			},
		};

		Some(Token {
			kind: tk,
			span: Span::from(start..self.at),
		})
	}
}

impl Iterator for Tokens<'_> {
	type Item = Token;

	fn next(&mut self) -> Option<Self::Item> {
		Self::next(self)
	}
}

#[cfg(test)]
mod test_tokenize {
	use super::{Atom, TokenKind, Tokens};

	/// Tokenizes `src` to completion and keeps only the token kinds.
	fn kinds(src: &str) -> Vec<TokenKind> {
		Tokens::new(src).map(|token| token.kind).collect()
	}

	/// A parenthesised keyword lexes to three tokens, colon included.
	#[test]
	fn simple1() {
		let expected = vec![
			TokenKind::LPar(b'('),
			TokenKind::Atom(Atom::Keyword(":hello".to_string())),
			TokenKind::RPar(b')'),
		];

		assert_eq!(kinds("(:hello)"), expected);
	}

	/// Identifiers, whitespace, integers and floats are all recognised.
	#[test]
	fn simple2() {
		let expected = vec![
			TokenKind::LPar(b'('),
			TokenKind::Atom(Atom::Identifier("->".to_string())),
			TokenKind::Whitespace(" ".to_string()),
			TokenKind::Atom(Atom::Integer(1)),
			TokenKind::Whitespace(" ".to_string()),
			TokenKind::Atom(Atom::Float(2.4)),
			TokenKind::RPar(b')'),
		];

		assert_eq!(kinds("(-> 1 2.4)"), expected);
	}
}