Guest User

Untitled

a guest
Jun 25th, 2018
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.53 KB | None | 0 0
  1. pub mod tokens;
  2.  
  3. use std::str::CharIndices;
  4. use self::tokens::Token;
  5.  
  6. #[inline]
  7. fn is_id_start(ch: char) -> bool {
  8. ch == '_' || ch.is_ascii_alphabetic()
  9. }
  10.  
  11. #[inline]
  12. fn is_id_continue(ch: char) -> bool {
  13. ch == '_' || ch.is_ascii_digit()
  14. }
  15.  
  16. pub type Location = usize;
  17.  
  18. #[derive(Debug, Fail, PartialEq)]
  19. pub enum LexicalError {
  20. #[fail(display = "Invalid character '{}' found at {}", ch, location)]
  21. InvalidCharacter { ch: char, location: Location },
  22.  
  23. #[fail(display = "String starting at {} was not terminated", location)]
  24. UnterminatedString { location: Location },
  25. }
  26.  
  27. pub type SpanResult<'input> = Result<(Location, Token<'input>, Location), LexicalError>;
  28.  
  29. pub struct Lexer<'input> {
  30. source: &'input str,
  31. chars: CharIndices<'input>,
  32. lookahead: Option<(usize, char)>,
  33. lookahead2: Option<(usize, char)>,
  34. }
  35.  
  36. impl<'input> Lexer<'input> {
  37. pub fn new(source: &'input str) -> Lexer<'input> {
  38. let mut chars = source.char_indices();
  39. let lookahead = chars.next();
  40. let lookahead2 = chars.next();
  41.  
  42. Lexer {
  43. source,
  44. chars,
  45. lookahead,
  46. lookahead2,
  47. }
  48. }
  49.  
  50. fn bump(&mut self) -> Option<(usize, char)> {
  51. let next = self.lookahead;
  52. self.lookahead = self.lookahead2;
  53. self.lookahead2 = self.chars.next();
  54. next
  55. }
  56.  
  57. fn take_until<F>(&mut self, mut terminate: F) -> Option<usize>
  58. where
  59. F: FnMut(char) -> bool,
  60. {
  61. while let Some((i, ch)) = self.lookahead {
  62. if terminate(ch) {
  63. return Some(i);
  64. } else {
  65. self.bump();
  66. }
  67. }
  68.  
  69. None
  70. }
  71.  
  72. fn take_while<F>(&mut self, mut condition: F) -> Option<usize>
  73. where
  74. F: FnMut(char) -> bool,
  75. {
  76. self.take_until(|ch| !condition(ch))
  77. }
  78.  
  79. fn skip_to_line_end(&mut self) {
  80. self.take_while(|ch| ch != '\n');
  81. }
  82.  
  83. fn skip_whitespace(&mut self) {
  84. self.take_while(|ch| ch.is_whitespace());
  85. }
  86.  
  87. fn read_string(&mut self, pos: usize) -> SpanResult<'input> {
  88. match self.take_until(|ch| ch == '"') {
  89. Some(i) => {
  90. self.bump();
  91. Ok((pos, Token::String(&self.source[pos + 1..i]), i + 1))
  92. }
  93. None => Err(LexicalError::UnterminatedString { location: pos }),
  94. }
  95. }
  96.  
  97. fn read_integer(&mut self, pos: usize) -> SpanResult<'input> {
  98. let mut end = self.take_while(|ch| ch.is_ascii_digit());
  99.  
  100. if let Some((_, _)) = self.lookahead {
  101. // Check if it's a decimal or a field access
  102. if let Some((_, next_ch)) = self.lookahead2 {
  103. if next_ch.is_ascii_digit() {
  104. self.bump();
  105. end = self.take_while(|ch| ch.is_ascii_digit());
  106. }
  107. }
  108. }
  109.  
  110. let end = end.unwrap_or_else(|| self.source.len());
  111.  
  112. Ok((
  113. pos,
  114. Token::Integer(self.source[pos..end].parse().expect("unparsable integer")),
  115. end,
  116. ))
  117. }
  118.  
  119. fn read_float(&mut self, pos: usize) -> SpanResult<'input> {
  120. let mut end = self.take_while(|ch| ch.is_ascii_digit());
  121.  
  122. if let Some((_, '.')) = self.lookahead {
  123. // Check if it's a decimal or a field access
  124. if let Some((_, next_ch)) = self.lookahead2 {
  125. if next_ch.is_ascii_digit() {
  126. self.bump();
  127. end = self.take_while(|ch| ch.is_ascii_digit());
  128. }
  129. }
  130. }
  131.  
  132. let end = end.unwrap_or_else(|| self.source.len());
  133.  
  134. Ok((
  135. pos,
  136. Token::Float(self.source[pos..end].parse().expect("unparsable number")),
  137. end,
  138. ))
  139. }
  140.  
  141. fn read_identifier(&mut self, pos: usize) -> SpanResult<'input> {
  142. let end = self.take_while(|ch| is_id_start(ch) || is_id_continue(ch))
  143. .unwrap_or_else(|| self.source.len());
  144.  
  145. match &self.source[pos..end] {
  146. "else" => Ok((pos, Token::Else, end)),
  147. "false" => Ok((pos, Token::False, end)),
  148. "def" => Ok((pos, Token::Def, end)),
  149. "for" => Ok((pos, Token::For, end)),
  150. "if" => Ok((pos, Token::If, end)),
  151. "nil" => Ok((pos, Token::Nil, end)),
  152. "print" => Ok((pos, Token::Print, end)),
  153. "return" => Ok((pos, Token::Return, end)),
  154. "this" => Ok((pos, Token::This, end)),
  155. "true" => Ok((pos, Token::True, end)),
  156. "let" => Ok((pos, Token::Let, end)),
  157. "while" => Ok((pos, Token::While, end)),
  158. id => Ok((pos, Token::Identifier(id), end)),
  159. }
  160. }
  161. }
  162.  
  163. impl<'input> Iterator for Lexer<'input> {
  164. type Item = SpanResult<'input>;
  165.  
  166. fn next(&mut self) -> Option<SpanResult<'input>> {
  167. self.skip_whitespace();
  168.  
  169. if let Some((i, ch)) = self.bump() {
  170. match ch {
  171. '{' => Some(Ok((i, Token::OpenBrace, i + 1))),
  172. '}' => Some(Ok((i, Token::CloseBrace, i + 1))),
  173. '(' => Some(Ok((i, Token::OpenParen, i + 1))),
  174. ')' => Some(Ok((i, Token::CloseParen, i + 1))),
  175. '[' => Some(Ok((i, Token::OpenBracket, i + 1))),
  176. ']' => Some(Ok((i, Token::CloseBracket, i + 1))),
  177. ';' => Some(Ok((i, Token::Semicolon, i + 1))),
  178. ',' => Some(Ok((i, Token::Comma, i + 1))),
  179. '.' => Some(Ok((i, Token::Dot, i + 1))),
  180. '+' => Some(Ok((i, Token::Plus, i + 1))),
  181. '-' => Some(Ok((i, Token::Minus, i + 1))),
  182. '*' => Some(Ok((i, Token::Star, i + 1))),
  183.  
  184. '/' => {
  185. if let Some((_, '/')) = self.lookahead {
  186. self.skip_to_line_end();
  187. self.next()
  188. } else {
  189. Some(Ok((i, Token::Slash, i + 1)))
  190. }
  191. }
  192.  
  193. '!' => {
  194. if let Some((_, '=')) = self.lookahead {
  195. self.bump();
  196. Some(Ok((i, Token::NotEqual, i + 2)))
  197. } else {
  198. Some(Ok((i, Token::Not, i + 1)))
  199. }
  200. }
  201.  
  202. '=' => {
  203. if let Some((_, '=')) = self.lookahead {
  204. self.bump();
  205. Some(Ok((i, Token::EqualEqual, i + 2)))
  206. } else {
  207. Some(Ok((i, Token::Equal, i + 1)))
  208. }
  209. }
  210.  
  211. '>' => {
  212. if let Some((_, '=')) = self.lookahead {
  213. self.bump();
  214. Some(Ok((i, Token::GreaterEqual, i + 2)))
  215. } else {
  216. Some(Ok((i, Token::Greater, i + 1)))
  217. }
  218. }
  219.  
  220. '<' => {
  221. if let Some((_, '=')) = self.lookahead {
  222. self.bump();
  223. Some(Ok((i, Token::LessEqual, i + 2)))
  224. } else {
  225. Some(Ok((i, Token::Less, i + 1)))
  226. }
  227. }
  228.  
  229. '&' => {
  230. if let Some((_, '&')) = self.lookahead {
  231. self.bump();
  232. Some(Ok((i, Token::AmpAmp, i + 2)))
  233. } else {
  234. Some(Err(LexicalError::InvalidCharacter { ch, location: i }))
  235. }
  236. }
  237.  
  238. '|' => {
  239. if let Some((_, '|')) = self.lookahead {
  240. self.bump();
  241. Some(Ok((i, Token::PipePipe, i + 2)))
  242. } else {
  243. Some(Err(LexicalError::InvalidCharacter { ch, location: i }))
  244. }
  245. }
  246.  
  247. '"' => Some(self.read_string(i)),
  248. ch if is_id_start(ch) => Some(self.read_identifier(i)),
  249. ch if ch.is_ascii_digit() => Some(self.read_integer(i)),
  250. ch if ch.is_ascii_digit() => Some(self.read_float(i)),
  251. ch => Some(Err(LexicalError::InvalidCharacter { ch, location: i })),
  252. }
  253. } else {
  254. None
  255. }
  256. }
  257. }
  258.  
  259. #[cfg(test)]
  260. mod test {
  261. use super::*;
  262.  
  263. fn lex(source: &str, expected: Vec<(usize, Token, usize)>) {
  264. let mut lexer = Lexer::new(source);
  265.  
  266. let mut actual_len = 0;
  267. let expected_len = expected.len();
  268.  
  269. for (expected, actual) in expected.into_iter().zip(lexer.by_ref()) {
  270. actual_len += 1;
  271. let actual = actual.unwrap();
  272. assert_eq!(expected, actual);
  273. }
  274.  
  275. assert_eq!(expected_len, actual_len);
  276. assert_eq!(None, lexer.next());
  277. }
  278.  
  279. #[test]
  280. fn delimiters() {
  281. lex(
  282. "{} [] ()",
  283. vec![
  284. (0, Token::OpenBrace, 1),
  285. (1, Token::CloseBrace, 2),
  286. (3, Token::OpenBracket, 4),
  287. (4, Token::CloseBracket, 5),
  288. (6, Token::OpenParen, 7),
  289. (7, Token::CloseParen, 8),
  290. ],
  291. );
  292. }
  293.  
  294. #[test]
  295. fn operators() {
  296. lex(
  297. ", . + - * / = == ! != > >= < <= && ||",
  298. vec![
  299. (0, Token::Comma, 1),
  300. (2, Token::Dot, 3),
  301. (4, Token::Plus, 5),
  302. (6, Token::Minus, 7),
  303. (8, Token::Star, 9),
  304. (10, Token::Slash, 11),
  305. (12, Token::Equal, 13),
  306. (14, Token::EqualEqual, 16),
  307. (17, Token::Not, 18),
  308. (19, Token::NotEqual, 21),
  309. (22, Token::Greater, 23),
  310. (24, Token::GreaterEqual, 26),
  311. (27, Token::Less, 28),
  312. (29, Token::LessEqual, 31),
  313. (32, Token::AmpAmp, 34),
  314. (35, Token::PipePipe, 37),
  315. ],
  316. );
  317. }
  318.  
  319. #[test]
  320. fn line_comment() {
  321. lex(
  322. "123; // comment\n 123",
  323. vec![
  324. (0, Token::Float(123.0), 3),
  325. (3, Token::Semicolon, 4),
  326. (17, Token::Float(123.0), 20),
  327. ],
  328. );
  329. }
  330.  
  331. #[test]
  332. fn string() {
  333. lex(
  334. "\"hello, world\"",
  335. vec![(0, Token::String("hello, world"), 14)],
  336. );
  337. }
  338.  
  339. #[test]
  340. fn float() {
  341. lex("123", vec![(0, Token::Float(123.0), 3)]);
  342. }
  343.  
  344. #[test]
  345. fn decimal() {
  346. lex("123.45", vec![(0, Token::Float(123.45), 6)]);
  347. }
  348.  
  349. #[test]
  350. fn number_field_access() {
  351. lex(
  352. "123.prop",
  353. vec![
  354. (0, Token::Float(123.0), 3),
  355. (0, Token::Integer(123), 3),
  356. (3, Token::Dot, 4),
  357. (4, Token::Identifier("prop"), 8),
  358. ],
  359. );
  360. }
  361.  
  362. #[test]
  363. fn identifiers() {
  364. lex("id", vec![(0, Token::Identifier("id"), 2)]);
  365. lex("_id", vec![(0, Token::Identifier("_id"), 3)]);
  366. lex("id123", vec![(0, Token::Identifier("id123"), 5)]);
  367. }
  368.  
  369. #[test]
  370. fn keywords() {
  371. lex("else", vec![(0, Token::Else, 4)]);
  372. lex("false", vec![(0, Token::False, 5)]);
  373. lex("def", vec![(0, Token::Def, 2)]);
  374. lex("for", vec![(0, Token::For, 3)]);
  375. lex("if", vec![(0, Token::If, 2)]);
  376. lex("nil", vec![(0, Token::Nil, 3)]);
  377. lex("print", vec![(0, Token::Print, 5)]);
  378. lex("return", vec![(0, Token::Return, 6)]);
  379. lex("this", vec![(0, Token::This, 4)]);
  380. lex("true", vec![(0, Token::True, 4)]);
  381. lex("let", vec![(0, Token::Let, 3)]);
  382. lex("while", vec![(0, Token::While, 5)]);
  383. }
  384. }
Add Comment
Please, Sign In to add comment