feat: completed lexer

This commit is contained in:
Yandrik 2021-11-29 15:06:14 +01:00
parent 3bed67c2b6
commit e1ae39199b
7 changed files with 195 additions and 30 deletions

14
Cargo.lock generated
View File

@ -2,13 +2,6 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "cb-calculator"
version = "0.1.0"
dependencies = [
"thiserror",
]
[[package]]
name = "proc-macro2"
version = "1.0.32"
@ -27,6 +20,13 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "s5-cb-calculator"
version = "0.1.0"
dependencies = [
"thiserror",
]
[[package]]
name = "syn"
version = "1.0.82"

View File

@ -1,5 +1,5 @@
[package]
name = "cb-calculator"
name = "s5-cb-calculator"
version = "0.1.0"
edition = "2018"

View File

@ -2,11 +2,10 @@ use thiserror::Error;
#[derive(Error, Debug)]
pub enum LexerErrors {
#[error("unexpected character {char} at position {pos} in context {context}")]
#[error("unexpected character {char} at position {pos}")]
UnexpectedCharacter {
char: char,
pos: u32,
context: String,
pos: usize,
},
#[error("cannot lex an empty text sequence")]
EmptyTextSequenceError,

85
src/lexer/fsm.rs Normal file
View File

@ -0,0 +1,85 @@
use std::str::FromStr;
use crate::lexer::tokens::{OpType, Token, TokenMeta};
/// All state numbers of the lexer FSM. `FSM::new` starts in state 1.
/// NOTE(review): not referenced by the code visible in this file.
const STATES: [i32; 6] = [1, 2, 3, 4, 5, 6];
/// Accepting states. `get_token` maps them to tokens:
/// 2 = `Token::OBR`, 3 = `Token::CBR`, 4 = `Token::ID`, 5 = `Token::OP`.
const FINAL_STATES: [i32; 4] = [2, 3, 4, 5];
/// Dead state reached when no valid transition exists; every transition out of it
/// leads back to it (see the last row of `TRANSITIONS`).
const ERROR_STATE: i32 = 6;
/// State-transition lookup table, indexed as `TRANSITIONS[state - 1][column]`.
///
/// Rows are the current state (1-based, so row 0 is state 1). Columns are the
/// character class of the input character, in the order chosen by
/// `FSM::get_transition`; `other` is any character not listed explicitly.
///
/// ```text
/// state | (   )   0..=9   *|/|^|-|+   other   SPACE
/// ------+------------------------------------------
///   1   | 2   3     4         5         6       1
///   2   | 6   6     6         6         6       6
///   3   | 6   6     6         6         6       6
///   4   | 6   6     4         6         6       6
///   5   | 6   6     6         6         6       6
///   6   | 6   6     6         6         6       6
/// ```
const TRANSITIONS: [[i32; 6]; 6] = [
[2, 3, 4, 5, 6, 1],
[6, 6, 6, 6, 6, 6],
[6, 6, 6, 6, 6, 6],
[6, 6, 4, 6, 6, 6],
[6, 6, 6, 6, 6, 6],
[6, 6, 6, 6, 6, 6],
];
/// Finite state machine that drives the lexer.
///
/// States are numbered `1..=6`: state 1 is the start state, the states listed in
/// `FINAL_STATES` are accepting and `ERROR_STATE` is the dead state. Only the current
/// and the immediately preceding state are stored, so exactly one transition can be
/// undone with [`FSM::revert`].
#[derive(Debug, Copy, Clone)]
pub struct FSM {
    /// Current state.
    state: i32,
    /// State the machine was in before the most recent call to `transition`.
    last: i32,
}

impl Default for FSM {
    /// Equivalent to [`FSM::new`]: a machine in the start state.
    fn default() -> Self {
        Self::new()
    }
}

impl FSM {
    /// Creates a machine in the start state (1).
    #[inline]
    pub fn new() -> FSM {
        FSM { state: 1, last: 1 }
    }

    /// Returns `true` if the current state is one of `FINAL_STATES`.
    pub fn is_final(&self) -> bool {
        FINAL_STATES.contains(&self.state)
    }

    /// Returns `true` if the current state is `ERROR_STATE`.
    pub fn is_error(&self) -> bool {
        self.state == ERROR_STATE
    }

    /// Reverts to the state held before the most recent transition.
    ///
    /// Only one step of history is kept; calling this twice in a row has no further effect.
    pub fn revert(&mut self) {
        self.state = self.last;
    }

    /// Returns the current state number.
    pub fn get_state(&self) -> i32 {
        self.state
    }

    /// Consumes `c`: remembers the current state and moves to the successor state
    /// given by `TRANSITIONS`.
    pub fn transition(&mut self, c: char) {
        let next = self.get_transition(c);
        self.last = self.state;
        self.state = next;
    }

    /// Looks up the successor state for `c` without modifying the machine.
    fn get_transition(&self, c: char) -> i32 {
        // Column order must match the header of the `TRANSITIONS` table.
        let column = match c {
            '(' => 0,
            ')' => 1,
            '0'..='9' => 2,
            '*' | '/' | '^' | '-' | '+' => 3,
            ' ' => 5,
            _ => 4,
        };
        // States are 1-based while table rows are 0-based.
        let row = (self.state - 1) as usize;
        TRANSITIONS[row][column]
    }
}
/// Builds the token that corresponds to the FSM's current state.
///
/// * `fsm` - the machine after it has consumed the token's characters
/// * `cbuf` - the characters making up the token
/// * `meta` - metadata attached to the produced token
///
/// Returns `None` for the start state (1) and the error state (6). It also returns
/// `None`, rather than panicking, when `cbuf` does not hold a valid lexeme for the
/// state: a number that does not parse, an empty buffer, or an unknown operator.
///
/// # Panics
/// Panics if the FSM is in a state outside `1..=6`, which indicates a broken invariant.
pub fn get_token(fsm: &FSM, cbuf: &[char], meta: TokenMeta) -> Option<Token> {
    match fsm.state {
        1 | 6 => None,
        2 => Some(Token::OBR(meta)),
        3 => Some(Token::CBR(meta)),
        4 => {
            let text: String = cbuf.iter().collect();
            f64::from_str(&text).ok().map(|value| Token::ID(meta, value))
        }
        // The operator is the last buffered character.
        5 => cbuf
            .last()
            .copied()
            .and_then(OpType::from_char)
            .map(|op| Token::OP(meta, op)),
        _ => panic!("Invalid State {}!", fsm.state),
    }
}

View File

@ -1,23 +1,29 @@
mod errors;
mod tokens;
use std::collections::VecDeque;
use std::fs::File;
use std::path::Path;
use std::str::Chars;
use std::str::{Chars, FromStr};
use tokens::Token;
use crate::lexer::errors::LexerErrors;
use crate::lexer::errors::LexerErrors::EmptyTextSequenceError;
use crate::lexer::fsm::{FSM, get_token};
use crate::lexer::tokens::{OpType, TokenMeta};
mod errors;
mod tokens;
mod fsm;
pub type Result<T> = std::result::Result<T, errors::LexerErrors>;
pub struct Lexer<'a> {
input: String,
chars: Chars<'a>,
pub struct Lexer {
input: Vec<char>,
pos: usize,
}
impl Lexer<'_> {
impl Lexer {
/// Create a new Lexer for the given String
/// Example:
/// ```
@ -26,17 +32,77 @@ impl Lexer<'_> {
/// lexer::Lexer::new(String::from(text));
/// ```
#[inline]
pub fn new(input: String) -> Lexer {
Lexer { input, chars: input.chars() }
pub fn new(input: &str) -> Lexer {
Lexer { input: input.chars().collect(), pos: 0 }
}
// Get the next token
pub fn next(&mut self) -> Result<Option<Token>> {
let mut buffer: Vec<char> = Vec::new();
loop {
if let Some(c) = self.chars.next() {
buffer.push(c);
if let Some((fsm, cbuf)) = self.longest_token_prefix() {
if let Some(token) = fsm::get_token(&fsm, &cbuf, TokenMeta{ pos: self.pos }) {
self.pos += cbuf.len();
Ok(Some(token))
} else {
// TODO: handling of end of stream
Err(LexerErrors::UnexpectedCharacter { char: self.input.get(self.pos), pos: self.pos })
}
} else {
// if no more tokens are there
Ok(None)
}
}
/// Runs a fresh FSM over the input starting at `self.pos` and collects the longest run
/// of characters that keeps the machine in an accepting state.
///
/// Leading spaces are consumed first by advancing `self.pos`. Afterwards `self.pos`
/// points at the first character of the token, and the length of the returned buffer
/// equals the number of input characters the token occupies.
///
/// Returns `None` once the input is exhausted. Otherwise returns the FSM (reverted to
/// the state before the failing transition, if one occurred) together with the token's
/// characters; the buffer is empty when the character at `self.pos` cannot start a token.
fn longest_token_prefix(&mut self) -> Option<(FSM, Vec<char>)> {
    let rest = self.input.get(self.pos..)?;

    // Skip separators here: the FSM stays in the start state on ' ' without buffering
    // it, so a caller advancing by the buffer length would otherwise fall short and
    // lex the same token again.
    let skipped = rest.iter().take_while(|&&c| c == ' ').count();
    self.pos += skipped;
    let rest = &rest[skipped..];

    // Nothing left to lex: signal end of input instead of an empty, tokenless match.
    if rest.is_empty() {
        return None;
    }

    let mut fsm = FSM::new();
    let mut cbuf: Vec<char> = Vec::new();
    for &cur in rest {
        fsm.transition(cur);
        if fsm.is_final() {
            cbuf.push(cur);
        } else if fsm.is_error() {
            // Undo the failing step so the FSM reports the last good state.
            fsm.revert();
            break;
        }
    }
    Some((fsm, cbuf))
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Asserts that the next result produced by the lexer matches the given pattern.
    macro_rules! expect_next {
        ($lexer:ident, $expected:pat) => {
            assert!(matches!($lexer.next(), $expected))
        };
    }

    /// Lexes an expression without spaces and checks every token and its start position.
    #[test]
    fn parser_test() {
        let mut lexer = Lexer::new("15/3^2+20-(5*60)");

        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 0 }, 15.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 2 }, OpType::DIV))));
        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 3 }, 3.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 4 }, OpType::POW))));
        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 5 }, 2.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 6 }, OpType::ADD))));
        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 7 }, 20.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 9 }, OpType::SUB))));
        expect_next!(lexer, Ok(Some(Token::OBR(TokenMeta { pos: 10 }))));
        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 11 }, 5.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 12 }, OpType::MUL))));
        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 13 }, 60.0))));
        expect_next!(lexer, Ok(Some(Token::CBR(TokenMeta { pos: 15 }))));

        // After the last token the lexer reports the end of the input.
        expect_next!(lexer, Ok(None));
    }

    /// A character outside the alphabet is reported as an error after the valid prefix.
    #[test]
    fn parser_error_test() {
        let mut lexer = Lexer::new("15+@");

        expect_next!(lexer, Ok(Some(Token::ID(TokenMeta { pos: 0 }, 15.0))));
        expect_next!(lexer, Ok(Some(Token::OP(TokenMeta { pos: 2 }, OpType::ADD))));
        expect_next!(lexer, Err(LexerErrors::UnexpectedCharacter { char: _, pos: _ }));
    }
}

View File

@ -6,9 +6,7 @@
/// * Position of the *first character making up the token* in said line
#[derive(Debug)]
pub struct TokenMeta {
file: String,
line: u32,
pos: u32,
pub pos: usize,
}
#[derive(Debug)]
@ -20,6 +18,20 @@ pub enum OpType {
POW,
}
impl OpType {
    /// Maps an operator character to its `OpType`.
    ///
    /// Recognizes `*`, `/`, `+`, `-` and `^`; any other character yields `None`.
    #[inline]
    pub fn from_char(c: char) -> Option<OpType> {
        let op = match c {
            '+' => OpType::ADD,
            '-' => OpType::SUB,
            '*' => OpType::MUL,
            '/' => OpType::DIV,
            '^' => OpType::POW,
            _ => return None,
        };
        Some(op)
    }
}
/// Bracket types, either OPEN or CLOSE.
#[derive(Debug)]
pub enum BrType {

View File

@ -1,6 +1,9 @@
use cb_calculator::lexer::Lexer;
use s5_cb_calculator::lexer::Lexer;
fn main() {
let lexer = Lexer::new("15+(30^2-5)*2/4".chars().collect());
let mut lexer = Lexer::new("15+(30^2-5)*2/4".chars().collect());
while let Ok(Some(token)) = lexer.next() {
println!("Token: {:?}", token)
}
println!("Hello, world!");
}