diff --git a/Cargo.lock b/Cargo.lock index 18e4f4b..362721a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,13 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "cb-calculator" -version = "0.1.0" -dependencies = [ - "thiserror", -] - [[package]] name = "proc-macro2" version = "1.0.32" @@ -27,6 +20,13 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "s5-cb-calculator" +version = "0.1.0" +dependencies = [ + "thiserror", +] + [[package]] name = "syn" version = "1.0.82" diff --git a/Cargo.toml b/Cargo.toml index 468a390..3f52b23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "cb-calculator" +name = "s5-cb-calculator" version = "0.1.0" edition = "2018" diff --git a/src/lexer/errors.rs b/src/lexer/errors.rs index c50f827..a08118c 100644 --- a/src/lexer/errors.rs +++ b/src/lexer/errors.rs @@ -2,11 +2,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum LexerErrors { - #[error("unexpected character {char} at position {pos} in context {context}")] + #[error("unexpected character {char} at position {pos}")] UnexpectedCharacter { char: char, - pos: u32, - context: String, + pos: usize, }, #[error("cannot lex an empty text sequence")] EmptyTextSequenceError, diff --git a/src/lexer/fsm.rs b/src/lexer/fsm.rs new file mode 100644 index 0000000..09d9962 --- /dev/null +++ b/src/lexer/fsm.rs @@ -0,0 +1,85 @@ +use std::str::FromStr; + +use crate::lexer::tokens::{OpType, Token, TokenMeta}; + +const STATES: [i32; 6] = [1, 2, 3, 4, 5, 6]; +const FINAL_STATES: [i32; 4] = [2, 3, 4, 5]; +const ERROR_STATE: i32 = 6; + +/// Transitions in a matrix in the form of this: +/// \CHR ( ) 0..=9 *|/|^|-|+ _ SPACE +/// ST +/// 1 2 3 4 5 6 1 +/// 2 6 6 6 6 6 6 +/// 3 6 6 6 6 6 6 +/// 4 6 6 4 6 6 6 +/// 5 6 6 6 6 6 6 +/// 6 6 6 6 6 6 6 +const TRANSITIONS: [[i32; 6]; 6] = [ + [2, 3, 4, 5, 6, 1], + [6, 6, 6, 6, 6, 6], + [6, 6, 6, 6, 6, 6], + [6, 6, 4, 6, 6, 6], + [6, 6, 6, 6, 6, 6], + [6, 6, 6, 6, 6, 6], +]; + + +#[derive(Debug, Copy, Clone)] +pub struct FSM { + state: i32, + last: i32, +} + +impl FSM { + #[inline] + pub fn new() -> FSM { + FSM { state: 1, last: 1 } + } + + pub fn is_final(&self) -> bool { + FINAL_STATES.contains(&self.state) + } + + pub fn is_error(&self) -> bool { + ERROR_STATE == self.state + } + + /// revert to last state + pub fn revert(&mut self) { + self.state = self.last; + } + + pub fn get_state(&self) -> i32 { + self.state + } + + pub fn transition(&mut self, c: char) { + let new_state = self.get_transition(c); + self.last = self.state; + self.state = new_state; + } + + fn get_transition(&self, c: char) -> i32 { + let lut_col = match c { + '(' => 0, + ')' => 1, + '0'..='9' => 2, + '*' | '/' | '^' | '-' | '+' => 3, + ' ' => 5, + _ => 4, + }; + TRANSITIONS[(&self.state - 1) as usize][lut_col] + } +} + +pub fn get_token(fsm: &FSM, cbuf: &Vec, meta: TokenMeta) -> Option { + match fsm.state { + 1 | 6 => None, + 2 => Some(Token::OBR(meta)), + 3 => Some(Token::CBR(meta)), + 4 => Some(Token::ID(meta, f64::from_str(&cbuf.iter().collect::()).unwrap())), + 5 => Some(Token::OP(meta, OpType::from_char(*cbuf.last().unwrap()).unwrap())), + _ => panic!("Invalid State {}!", fsm.state), + } +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index a6febd8..988871c 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,23 +1,29 @@ -mod errors; -mod tokens; - use std::collections::VecDeque; use std::fs::File; use std::path::Path; -use std::str::Chars; +use std::str::{Chars, FromStr}; + use tokens::Token; + use crate::lexer::errors::LexerErrors; use crate::lexer::errors::LexerErrors::EmptyTextSequenceError; +use crate::lexer::fsm::{FSM, get_token}; +use crate::lexer::tokens::{OpType, TokenMeta}; + +mod errors; +mod tokens; +mod fsm; pub type Result = std::result::Result; -pub struct Lexer<'a> { - input: String, - chars: Chars<'a>, +pub struct Lexer { + input: Vec, + pos: usize, } -impl Lexer<'_> { + +impl Lexer { /// Create a new Lexer for the given String /// Example: /// ``` @@ -26,17 +32,77 @@ impl Lexer<'_> { /// lexer::Lexer::new(String::from(text)); /// ``` #[inline] - pub fn new(input: String) -> Lexer { - Lexer { input, chars: input.chars() } + pub fn new(input: &str) -> Lexer { + Lexer { input: input.chars().collect(), pos: 0 } } // Get the next token pub fn next(&mut self) -> Result> { - let mut buffer: Vec = Vec::new(); - loop { - if let Some(c) = self.chars.next() { - buffer.push(c); + if let Some((fsm, cbuf)) = self.longest_token_prefix() { + if let Some(token) = fsm::get_token(&fsm, &cbuf, TokenMeta{ pos: self.pos }) { + self.pos += cbuf.len(); + Ok(Some(token)) + } else { + // TODO: handling of end of stream + Err(LexerErrors::UnexpectedCharacter { char: self.input.get(self.pos), pos: self.pos }) } + } else { + // if no more tokens are there + Ok(None) } } + + fn longest_token_prefix(&mut self) -> Option<(FSM, Vec)> { + let mut fsm = FSM::new(); + let mut chars = match self.input.get(self.pos..) { + Some(slice) => slice.iter(), + None => return None, + }; + let mut cbuf: Vec = vec![]; + + while let Some(cur) = chars.next() { + fsm.transition(*cur); + if fsm.is_final() { + cbuf.push(*cur); + } else if fsm.is_error() { + fsm.revert(); + break + } + } + Some((fsm, cbuf)) + } + +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn parser_test() { + let mut lexer = Lexer::new("15/3^2+20-(5*60)"); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 0 }, 15.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta { pos: 2 }, OpType::DIV))))); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 3 }, 3.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta { pos: 4 }, OpType::POW))))); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 5 }, 2.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta { pos: 6 }, OpType::ADD))))); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 7 }, 20.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta { pos: 9 }, OpType::SUB))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OBR(TokenMeta { pos: 10 }))))); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 11 }, 5.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta { pos: 12 }, OpType::MUL))))); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta { pos: 13 }, 60.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::CBR(TokenMeta { pos: 15 }))))); + // println!("{:?}", lexer.next()); + assert!(matches!(lexer.next(), Ok(None))); + } + + #[test] + fn parser_error_test() { + let mut lexer = Lexer::new("15+@"); + assert!(matches!(lexer.next(), Ok(Some(Token::ID(TokenMeta{ pos: 0 }, 15.0))))); + assert!(matches!(lexer.next(), Ok(Some(Token::OP(TokenMeta{ pos: 2 }, OpType::ADD))))); + assert!(matches!(lexer.next(), Err(LexerErrors::UnexpectedCharacter { char: _, pos: _ }))) + } } diff --git a/src/lexer/tokens.rs b/src/lexer/tokens.rs index 1e5c708..a9b064e 100644 --- a/src/lexer/tokens.rs +++ b/src/lexer/tokens.rs @@ -6,9 +6,7 @@ /// * Position of the *first character making up the token* in said line #[derive(Debug)] pub struct TokenMeta { - file: String, - line: u32, - pos: u32, + pub pos: usize, } #[derive(Debug)] @@ -20,6 +18,20 @@ pub enum OpType { POW, } +impl OpType { + #[inline] + pub fn from_char(c: char) -> Option { + match c { + '*' => Some(OpType::MUL), + '/' => Some(OpType::DIV), + '+' => Some(OpType::ADD), + '-' => Some(OpType::SUB), + '^' => Some(OpType::POW), + _ => None, + } + } +} + /// Bracket types, either OPEN or CLOSE. #[derive(Debug)] pub enum BrType { diff --git a/src/main.rs b/src/main.rs index 17212d4..06d5ee1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,9 @@ -use cb_calculator::lexer::Lexer; +use s5_cb_calculator::lexer::Lexer; fn main() { - let lexer = Lexer::new("15+(30^2-5)*2/4".chars().collect()); + let mut lexer = Lexer::new("15+(30^2-5)*2/4".chars().collect()); + while let Ok(Some(token)) = lexer.next() { + println!("Token: {:?}", token) + } println!("Hello, world!"); }