From 2c0b19eb0aae48f28735319a0634c3d0eb9b4b13 Mon Sep 17 00:00:00 2001 From: aNNiMON Date: Sat, 16 Sep 2023 20:12:01 +0300 Subject: [PATCH] More strict lexer, fixed HEX numbers and quote escaping --- .../com/annimon/ownlang/parser/Lexer.java | 71 ++++--- .../ownlang/parser/LexerPositionsTest.java | 22 +- .../com/annimon/ownlang/parser/LexerTest.java | 199 ++++++------------ .../parser/LexerValidDataProvider.java | 91 ++++++++ 4 files changed, 214 insertions(+), 169 deletions(-) create mode 100644 ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java diff --git a/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java b/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java index 3a15e61..48efd15 100644 --- a/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java +++ b/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java @@ -133,21 +133,20 @@ public final class Lexer { while (pos < length) { // Fast path for skipping whitespaces while (Character.isWhitespace(peek(0))) { - next(); + skip(); } final char current = peek(0); - if (Character.isDigit(current)) tokenizeNumber(); + if (isNumber(current)) tokenizeNumber(); else if (isOwnLangIdentifierStart(current)) tokenizeWord(); - else if (current == '`') tokenizeExtendedWord(); else if (current == '"') tokenizeText(); + else if (OPERATOR_CHARS.indexOf(current) != -1) tokenizeOperator(); + else if (Character.isWhitespace(current)) skip(); + else if (current == '`') tokenizeExtendedWord(); else if (current == '#') tokenizeHexNumber(1); - else if (OPERATOR_CHARS.indexOf(current) != -1) { - tokenizeOperator(); - } else { - // whitespaces - next(); - } + else if (current == ';') skip(); // ignore semicolon + else if (current == '\0') break; + else throw error("Unknown token " + current); } return tokens; } @@ -163,7 +162,7 @@ public final class Lexer { boolean hasDot = false; while (true) { if (current == '.') { - if (hasDot) throw error("Invalid float number"); + if (hasDot) throw error("Invalid float number " + buffer); hasDot = true; } else if (!Character.isDigit(current)) { break; @@ -178,7 +177,7 @@ public final class Lexer { clearBuffer(); final Pos startPos = markPos(); // Skip HEX prefix 0x or # - for (int i = 0; i < skipChars; i++) next(); + for (int i = 0; i < skipChars; i++) skip(); char current = peek(0); while (isHexNumber(current) || (current == '_')) { @@ -188,13 +187,18 @@ public final class Lexer { } current = next(); } - if (!buffer.isEmpty()) { - addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos); - } + + if (buffer.isEmpty()) throw error("Empty HEX value"); + if (peek(-1) == '_') throw error("HEX value cannot end with _"); + addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos); + } + + private static boolean isNumber(char current) { + return ('0' <= current && current <= '9'); } private static boolean isHexNumber(char current) { - return Character.isDigit(current) + return ('0' <= current && current <= '9') || ('a' <= current && current <= 'f') || ('A' <= current && current <= 'F'); } @@ -203,13 +207,9 @@ public final class Lexer { char current = peek(0); if (current == '/') { if (peek(1) == '/') { - next(); - next(); tokenizeComment(); return; } else if (peek(1) == '*') { - next(); - next(); tokenizeMultilineComment(); return; } @@ -247,7 +247,7 @@ public final class Lexer { private void tokenizeExtendedWord() { final Pos startPos = markPos(); - next();// skip ` + skip();// skip ` clearBuffer(); char current = peek(0); while (current != '`') { @@ -256,19 +256,20 @@ public final class Lexer { buffer.append(current); current = next(); } - next(); // skip closing ` + skip(); // skip closing ` addToken(TokenType.WORD, buffer.toString(), startPos); } private void tokenizeText() { final Pos startPos = markPos(); - next();// skip " + skip();// skip " clearBuffer(); char current = peek(0); while (true) { if (current == '\\') { current = next(); switch (current) { + case '\\': current = next(); buffer.append('\\'); continue; case '"': current = next(); buffer.append('"'); continue; case '0': current = next(); buffer.append('\0'); continue; case 'b': current = next(); buffer.append('\b'); continue; @@ -305,12 +306,14 @@ public final class Lexer { buffer.append(current); current = next(); } - next(); // skip closing " + skip(); // skip closing " addToken(TokenType.TEXT, buffer.toString(), startPos); } private void tokenizeComment() { + skip(); // / + skip(); // / char current = peek(0); while ("\r\n\0".indexOf(current) == -1) { current = next(); @@ -318,13 +321,15 @@ public final class Lexer { } private void tokenizeMultilineComment() { + skip(); // / + skip(); // * char current = peek(0); while (current != '*' || peek(1) != '/') { if (current == '\0') throw error("Reached end of file while parsing multiline comment"); current = next(); } - next(); // * - next(); // / + skip(); // * + skip(); // / } private boolean isOwnLangIdentifierStart(char current) { @@ -332,7 +337,7 @@ public final class Lexer { } private boolean isOwnLangIdentifierPart(char current) { - return (Character.isLetterOrDigit(current) || (current == '_') || (current == '$')); + return isOwnLangIdentifierStart(current) || isNumber(current); } private void clearBuffer() { @@ -342,18 +347,22 @@ public final class Lexer { private Pos markPos() { return new Pos(row, col); } - - private char next() { - final char result = peek(0); + + private void skip() { + if (pos >= length) return; + final char result = input.charAt(pos); if (result == '\n') { row++; col = 1; } else col++; - pos++; - return peek(0); } + private char next() { + skip(); + return peek(0); + } + private char peek(int relativePosition) { final int position = pos + relativePosition; if (position >= length) return '\0'; diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java index 900eaae..5f0c799 100644 --- a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java +++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java @@ -31,6 +31,26 @@ class LexerPositionsTest { text = "line1 line2 line3" + a = 3 + """.stripIndent(); + List result = Lexer.tokenize(input); + + assertThat(result) + .hasSize(6) + .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type) + .containsExactly( + tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT), + tuple(4, 1, WORD), tuple(4, 3, EQ), tuple(4, 5, NUMBER) + ); + } + + @Test + void testMultilineComment() { + String input = """ + /* + line2 + line*/a =/* + */3 """.stripIndent(); List result = Lexer.tokenize(input); @@ -38,7 +58,7 @@ class LexerPositionsTest { .hasSize(3) .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type) .containsExactly( - tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT) + tuple(3, 9, WORD), tuple(3, 11, EQ), tuple(4, 3, NUMBER) ); } } \ No newline at end of file diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java index 6e081d3..c3e9785 100644 --- a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java +++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java @@ -2,181 +2,106 @@ package com.annimon.ownlang.parser; import com.annimon.ownlang.exceptions.LexerException; import org.junit.jupiter.api.Test; - -import java.util.ArrayList; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import java.io.IOException; import java.util.List; - +import java.util.stream.Stream; import static com.annimon.ownlang.parser.TokenType.*; -import static org.junit.jupiter.api.Assertions.*; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.Assertions.assertEquals; /** * * @author aNNiMON */ public class LexerTest { + + public static Stream validData() { + return LexerValidDataProvider.getAll(); + } + + public static Stream invalidData() { + return Stream.builder() + .add(Arguments.of("Wrong float point", "3.14.15")) + .add(Arguments.of("Wrong HEX number", "0Xf7_p6_s5")) + .add(Arguments.of("HEX number ends with _", "0Xf7_")) + .add(Arguments.of("Empty rest of HEX number", "#")) + .add(Arguments.of("Unicode character identifier", "€ = 1")) + .add(Arguments.of("Unicode character only", "€")) + .add(Arguments.of("String error", "\"1\"\"")) + .add(Arguments.of("Multiline comment EOF", "/* 1234 \n")) + .add(Arguments.of("Extended word EOF", "` 1234")) + .build(); + } @Test public void testNumbers() { - String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF #"; - List expList = list(NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER); + String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF"; List result = Lexer.tokenize(input); - assertTokens(expList, result); - assertEquals("0", result.get(0).text()); - assertEquals("3.1415", result.get(1).text()); - assertEquals("CAFEBABE", result.get(2).text()); - assertEquals("f7d6c5", result.get(3).text()); - } - - @Test - public void testNumbersError() { - final String input = "3.14.15 0Xf7_p6_s5"; - assertThrows(LexerException.class, () -> Lexer.tokenize(input)); - } - - @Test - public void testArithmetic() { - String input = "x = -1 + 2 * 3 % 4 / 5"; - List expList = list(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - assertEquals("x", result.get(0).text()); - } - - @Test - public void testKeywords() { - String input = "if else while for include"; - List expList = list(IF, ELSE, WHILE, FOR, INCLUDE); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - - @Test - public void testWord() { - String input = "if bool include \"text\n\ntext\""; - List expList = list(IF, WORD, INCLUDE, TEXT); - List result = Lexer.tokenize(input); - assertTokens(expList, result); + assertTokens(result, NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER); + assertThat(result) + .extracting(Token::text) + .containsExactly("0", "3.1415", "CAFEBABE", "f7d6c5", "FFFF"); } @Test public void testString() { String input = "\"1\\\"2\""; - List expList = list(TEXT); List result = Lexer.tokenize(input); - assertTokens(expList, result); + assertTokens(result, TEXT); assertEquals("1\"2", result.get(0).text()); } + + @Test + public void testEscapeString() { + String input = """ + "\\\\/\\\\" + """.stripIndent(); + List result = Lexer.tokenize(input); + assertTokens(result, TEXT); + assertEquals("\\/\\", result.get(0).text()); + } @Test public void testEmptyString() { String input = "\"\""; - List expList = list(TEXT); List result = Lexer.tokenize(input); - assertTokens(expList, result); + assertTokens(result, TEXT); assertEquals("", result.get(0).text()); } - @Test - public void testStringError() { - String input = "\"1\"\""; - List expList = list(TEXT); - assertThrows(LexerException.class, () -> { - List result = Lexer.tokenize(input); - assertTokens(expList, result); - assertEquals("1", result.get(0).text()); - }); - } - - @Test - public void testOperators() { - String input = "=+-*/%<>!&|"; - List expList = list(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - - @Test - public void testOperators2Char() { - String input = "== != <= >= && || ==+ >=- ->"; - List expList = list(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR, - EQEQ, PLUS, GTEQ, MINUS, MINUS, GT); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - @Test public void testComments() { String input = "// 1234 \n /* */ 123 /* \n 12345 \n\n\n */"; - List expList = list(NUMBER); List result = Lexer.tokenize(input); - assertTokens(expList, result); + assertTokens(result, NUMBER); assertEquals("123", result.get(0).text()); } - - @Test - public void testComments2() { - String input = "// /* 1234 \n */"; - List expList = list(STAR, SLASH); + + @ParameterizedTest + @MethodSource("validData") + public void testValidInput(String name, String input, List tokenTypes) throws IOException { List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - - @Test - public void testCommentsError() { - final String input = "/* 1234 \n"; - assertThrows(LexerException.class, () -> Lexer.tokenize(input)); + assertThat(result) + .hasSize(tokenTypes.size()) + .extracting(Token::type) + .containsAll(tokenTypes); } - @Test - public void testExtendedWordError() { - final String input = "` 1234"; - assertThrows(LexerException.class, () -> Lexer.tokenize(input)); - } - - @Test - public void testUnicodeCharacterIdentifier() { - String input = "€ = 1"; - List expList = list(EQ, NUMBER); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - - @Test - public void testUnicodeCharacterExtendedWordIdentifier() { - String input = "`€` = 1"; - List expList = list(WORD, EQ, NUMBER); - List result = Lexer.tokenize(input); - assertTokens(expList, result); - } - - @Test - public void testUnicodeCharacterEOF() { - String input = "€"; - assertTrue(Lexer.tokenize(input).isEmpty()); + @ParameterizedTest + @MethodSource("invalidData") + public void testInvalidInput(String name, String input) throws IOException { + assertThatThrownBy(() -> Lexer.tokenize(input)) + .isInstanceOf(LexerException.class); } - private static void assertTokens(List expList, List result) { - final int length = expList.size(); - assertEquals(length, result.size()); - for (int i = 0; i < length; i++) { - assertEquals(expList.get(i).type(), result.get(i).type()); - } + private static void assertTokens(List result, TokenType... tokenTypes) { + assertThat(result) + .hasSize(tokenTypes.length) + .extracting(Token::type) + .containsExactly(tokenTypes); } - - private static List list(TokenType... types) { - final List list = new ArrayList<>(); - for (TokenType t : types) { - list.add(token(t)); - } - return list; - } - - private static Token token(TokenType type) { - return token(type, "", new Pos(0, 0)); - } - - private static Token token(TokenType type, String text, Pos pos) { - return new Token(type, text, pos); - } - } diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java new file mode 100644 index 0000000..5c0390c --- /dev/null +++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java @@ -0,0 +1,91 @@ +package com.annimon.ownlang.parser; + +import org.junit.jupiter.params.provider.Arguments; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import static com.annimon.ownlang.parser.TokenType.*; + +public class LexerValidDataProvider { + + public static Stream getAll() { + final var result = new ArrayList(); + result.addAll(numbers()); + result.addAll(keywords()); + result.addAll(words()); + result.addAll(operators()); + result.addAll(comments()); + result.addAll(other()); + result.addAll(notSupported()); + return result.stream(); + } + + private static List numbers() { + return List.of( + Arguments.of("Numbers", + "12 7.8 90000000 10.03", + List.of(NUMBER, NUMBER, NUMBER, NUMBER)), + Arguments.of("Hex numbers", + "#FF 0xCA 0x12fb 0xFF", + List.of(HEX_NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER)) + ); + } + + private static List keywords() { + return List.of( + Arguments.of("Keywords", + "if else while for include", + List.of(IF, ELSE, WHILE, FOR, INCLUDE)) + ); + } + + private static List words() { + return List.of( + Arguments.of("Word", + "if bool include \"text\n\ntext\"", + List.of(IF, WORD, INCLUDE, TEXT)), + Arguments.of("Extended word identifier", + "`€` = 1", + List.of(WORD, EQ, NUMBER)) + ); + } + + private static List operators() { + return List.of( + Arguments.of("Operators", + "=+-*/%<>!&|", + List.of(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR)), + Arguments.of("Operators 2 characters", + "== != <= >= && || ==+ >=- ->", + List.of(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR, + EQEQ, PLUS, GTEQ, MINUS, MINUS, GT)) + ); + } + + private static List comments() { + return List.of( + Arguments.of("Comments", + "// /* 1234 \n */", + List.of(STAR, SLASH)) + ); + } + + private static List other() { + return List.of( + Arguments.of("Arithmetic", + "x = -1 + 2 * 3 % 4 / 5", + List.of(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER)) + ); + } + + private static List notSupported() { + return List.of( + Arguments.of("Float notation", + "7e8", + List.of(NUMBER, WORD)), + Arguments.of("Float hex numbers", + "0Xf7p6", + List.of(HEX_NUMBER, WORD)) + ); + } +}