From 2c0b19eb0aae48f28735319a0634c3d0eb9b4b13 Mon Sep 17 00:00:00 2001
From: aNNiMON <annimon119@gmail.com>
Date: Sat, 16 Sep 2023 20:12:01 +0300
Subject: [PATCH] More strict lexer, fixed HEX numbers and quote escaping

---
 .../com/annimon/ownlang/parser/Lexer.java     |  71 ++++---
 .../ownlang/parser/LexerPositionsTest.java    |  22 +-
 .../com/annimon/ownlang/parser/LexerTest.java | 199 ++++++------------
 .../parser/LexerValidDataProvider.java        |  91 ++++++++
 4 files changed, 214 insertions(+), 169 deletions(-)
 create mode 100644 ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java

diff --git a/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java b/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java
index 3a15e61..48efd15 100644
--- a/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java
+++ b/ownlang-parser/src/main/java/com/annimon/ownlang/parser/Lexer.java
@@ -133,21 +133,20 @@ public final class Lexer {
         while (pos < length) {
             // Fast path for skipping whitespaces
             while (Character.isWhitespace(peek(0))) {
-                next();
+                skip();
             }
 
             final char current = peek(0);
-            if (Character.isDigit(current)) tokenizeNumber();
+            if (isNumber(current)) tokenizeNumber();
             else if (isOwnLangIdentifierStart(current)) tokenizeWord();
-            else if (current == '`') tokenizeExtendedWord();
             else if (current == '"') tokenizeText();
+            else if (OPERATOR_CHARS.indexOf(current) != -1) tokenizeOperator();
+            else if (Character.isWhitespace(current)) skip();
+            else if (current == '`') tokenizeExtendedWord();
             else if (current == '#') tokenizeHexNumber(1);
-            else if (OPERATOR_CHARS.indexOf(current) != -1) {
-                tokenizeOperator();
-            } else {
-                // whitespaces
-                next();
-            }
+            else if (current == ';') skip(); // ignore semicolon
+            else if (current == '\0') break;
+            else throw error("Unknown token " + current);
         }
         return tokens;
     }
@@ -163,7 +162,7 @@ public final class Lexer {
         boolean hasDot = false;
         while (true) {
             if (current == '.') {
-                if (hasDot) throw error("Invalid float number");
+                if (hasDot) throw error("Invalid float number " + buffer);
                 hasDot = true;
             } else if (!Character.isDigit(current)) {
                 break;
@@ -178,7 +177,7 @@ public final class Lexer {
         clearBuffer();
         final Pos startPos = markPos();
         // Skip HEX prefix 0x or #
-        for (int i = 0; i < skipChars; i++) next();
+        for (int i = 0; i < skipChars; i++) skip();
 
         char current = peek(0);
         while (isHexNumber(current) || (current == '_')) {
@@ -188,13 +187,18 @@ public final class Lexer {
             }
             current = next();
         }
-        if (!buffer.isEmpty()) {
-            addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
-        }
+
+        if (buffer.isEmpty()) throw error("Empty HEX value");
+        if (peek(-1) == '_') throw error("HEX value cannot end with _");
+        addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
+    }
+
+    private static boolean isNumber(char current) {
+        return ('0' <= current && current <= '9');
     }
 
     private static boolean isHexNumber(char current) {
-        return Character.isDigit(current)
+        return isNumber(current) // ASCII 0-9 only, same rule as decimal literals
                 || ('a' <= current && current <= 'f')
                 || ('A' <= current && current <= 'F');
     }
@@ -203,13 +207,9 @@ public final class Lexer {
         char current = peek(0);
         if (current == '/') {
             if (peek(1) == '/') {
-                next();
-                next();
                 tokenizeComment();
                 return;
             } else if (peek(1) == '*') {
-                next();
-                next();
                 tokenizeMultilineComment();
                 return;
             }
@@ -247,7 +247,7 @@ public final class Lexer {
 
     private void tokenizeExtendedWord() {
         final Pos startPos = markPos();
-        next();// skip `
+        skip();// skip `
         clearBuffer();
         char current = peek(0);
         while (current != '`') {
@@ -256,19 +256,20 @@ public final class Lexer {
             buffer.append(current);
             current = next();
         }
-        next(); // skip closing `
+        skip(); // skip closing `
         addToken(TokenType.WORD, buffer.toString(), startPos);
     }
     
     private void tokenizeText() {
         final Pos startPos = markPos();
-        next();// skip "
+        skip();// skip "
         clearBuffer();
         char current = peek(0);
         while (true) {
             if (current == '\\') {
                 current = next();
                 switch (current) {
+                    case '\\': current = next(); buffer.append('\\'); continue;
                     case '"': current = next(); buffer.append('"'); continue;
                     case '0': current = next(); buffer.append('\0'); continue;
                     case 'b': current = next(); buffer.append('\b'); continue;
@@ -305,12 +306,14 @@ public final class Lexer {
             buffer.append(current);
             current = next();
         }
-        next(); // skip closing "
+        skip(); // skip closing "
         
         addToken(TokenType.TEXT, buffer.toString(), startPos);
     }
     
     private void tokenizeComment() {
+        skip(); // /
+        skip(); // /
         char current = peek(0);
         while ("\r\n\0".indexOf(current) == -1) {
             current = next();
@@ -318,13 +321,15 @@ public final class Lexer {
      }
     
     private void tokenizeMultilineComment() {
+        skip(); // /
+        skip(); // *
         char current = peek(0);
         while (current != '*' || peek(1) != '/') {
             if (current == '\0') throw error("Reached end of file while parsing multiline comment");
             current = next();
         }
-        next(); // *
-        next(); // /
+        skip(); // *
+        skip(); // /
     }
 
     private boolean isOwnLangIdentifierStart(char current) {
@@ -332,7 +337,7 @@ public final class Lexer {
     }
 
     private boolean isOwnLangIdentifierPart(char current) {
-        return (Character.isLetterOrDigit(current) || (current == '_') || (current == '$'));
+        return isOwnLangIdentifierStart(current) || isNumber(current);
     }
     
     private void clearBuffer() {
@@ -342,18 +347,22 @@ public final class Lexer {
     private Pos markPos() {
         return new Pos(row, col);
     }
-    
-    private char next() {
-        final char result = peek(0);
+
+    private void skip() {
+        if (pos >= length) return;
+        final char result = input.charAt(pos);
         if (result == '\n') {
             row++;
             col = 1;
         } else col++;
-
         pos++;
-        return peek(0);
     }
     
+    private char next() {
+        skip();
+        return peek(0);
+    }
+
     private char peek(int relativePosition) {
         final int position = pos + relativePosition;
         if (position >= length) return '\0';
diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java
index 900eaae..5f0c799 100644
--- a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java
+++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerPositionsTest.java
@@ -31,6 +31,26 @@ class LexerPositionsTest {
                 text = "line1
                   line2
                   line3"
+                a = 3
+                """.stripIndent();
+        List<Token> result = Lexer.tokenize(input);
+
+        assertThat(result)
+                .hasSize(6)
+                .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
+                .containsExactly(
+                        tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT),
+                        tuple(4, 1, WORD), tuple(4, 3, EQ), tuple(4, 5, NUMBER)
+                );
+    }
+
+    @Test
+    void testMultilineComment() {
+        String input = """
+                /*
+                  line2
+                  line*/a =/*
+                */3
                 """.stripIndent();
         List<Token> result = Lexer.tokenize(input);
 
@@ -38,7 +58,7 @@ class LexerPositionsTest {
                 .hasSize(3)
                 .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
                 .containsExactly(
-                        tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT)
+                        tuple(3, 9, WORD), tuple(3, 11, EQ), tuple(4, 3, NUMBER)
                 );
     }
 }
\ No newline at end of file
diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java
index 6e081d3..c3e9785 100644
--- a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java
+++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerTest.java
@@ -2,181 +2,106 @@ package com.annimon.ownlang.parser;
 
 import com.annimon.ownlang.exceptions.LexerException;
 import org.junit.jupiter.api.Test;
-
-import java.util.ArrayList;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import java.io.IOException;
 import java.util.List;
-
+import java.util.stream.Stream;
 import static com.annimon.ownlang.parser.TokenType.*;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 
 /**
  *
  * @author aNNiMON
  */
 public class LexerTest {
+
+    public static Stream<Arguments> validData() {
+        return LexerValidDataProvider.getAll();
+    }
+
+    public static Stream<Arguments> invalidData() {
+        return Stream.<Arguments>builder()
+                .add(Arguments.of("Wrong float point", "3.14.15"))
+                .add(Arguments.of("Wrong HEX number", "0Xf7_p6_s5"))
+                .add(Arguments.of("HEX number ends with _", "0Xf7_"))
+                .add(Arguments.of("Empty rest of HEX number", "#"))
+                .add(Arguments.of("Unicode character identifier", "€ = 1"))
+                .add(Arguments.of("Unicode character only", "€"))
+                .add(Arguments.of("String error", "\"1\"\""))
+                .add(Arguments.of("Multiline comment EOF", "/* 1234 \n"))
+                .add(Arguments.of("Extended word EOF", "` 1234"))
+                .build();
+    }
     
     @Test
     public void testNumbers() {
-        String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF #";
-        List<Token> expList = list(NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER);
+        String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF";
         List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-        assertEquals("0", result.get(0).text());
-        assertEquals("3.1415", result.get(1).text());
-        assertEquals("CAFEBABE", result.get(2).text());
-        assertEquals("f7d6c5", result.get(3).text());
-    }
-    
-    @Test
-    public void testNumbersError() {
-        final String input = "3.14.15 0Xf7_p6_s5";
-        assertThrows(LexerException.class, () -> Lexer.tokenize(input));
-    }
-    
-    @Test
-    public void testArithmetic() {
-        String input = "x = -1 + 2 * 3 % 4 / 5";
-        List<Token> expList = list(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-        assertEquals("x", result.get(0).text());
-    }
-    
-    @Test
-    public void testKeywords() {
-        String input = "if else while for include";
-        List<Token> expList = list(IF, ELSE, WHILE, FOR, INCLUDE);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-    
-    @Test
-    public void testWord() {
-        String input = "if bool include \"text\n\ntext\"";
-        List<Token> expList = list(IF, WORD, INCLUDE, TEXT);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
+        assertTokens(result, NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER);
+        assertThat(result)
+                .extracting(Token::text)
+                .containsExactly("0", "3.1415", "CAFEBABE", "f7d6c5", "FFFF");
     }
     
     @Test
     public void testString() {
         String input = "\"1\\\"2\"";
-        List<Token> expList = list(TEXT);
         List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
+        assertTokens(result, TEXT);
         assertEquals("1\"2", result.get(0).text());
     }
+
+    @Test
+    public void testEscapeString() {
+        String input = """
+                "\\\\/\\\\"
+                """.stripIndent();
+        List<Token> result = Lexer.tokenize(input);
+        assertTokens(result, TEXT);
+        assertEquals("\\/\\", result.get(0).text());
+    }
     
     @Test
     public void testEmptyString() {
         String input = "\"\"";
-        List<Token> expList = list(TEXT);
         List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
+        assertTokens(result, TEXT);
         assertEquals("", result.get(0).text());
     }
     
-    @Test
-    public void testStringError() {
-        String input = "\"1\"\"";
-        List<Token> expList = list(TEXT);
-        assertThrows(LexerException.class, () -> {
-            List<Token> result = Lexer.tokenize(input);
-            assertTokens(expList, result);
-            assertEquals("1", result.get(0).text());
-        });
-    }
-    
-    @Test
-    public void testOperators() {
-        String input = "=+-*/%<>!&|";
-        List<Token> expList = list(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-    
-    @Test
-    public void testOperators2Char() {
-        String input = "== != <= >= && || ==+ >=- ->";
-        List<Token> expList = list(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR,
-                EQEQ, PLUS,   GTEQ, MINUS,  MINUS, GT);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-    
     @Test
     public void testComments() {
         String input = "// 1234 \n /* */ 123 /* \n 12345 \n\n\n */";
-        List<Token> expList = list(NUMBER);
         List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
+        assertTokens(result, NUMBER);
         assertEquals("123", result.get(0).text());
     }
-    
-    @Test
-    public void testComments2() {
-        String input = "// /* 1234 \n */";
-        List<Token> expList = list(STAR, SLASH);
+
+    @ParameterizedTest
+    @MethodSource("validData")
+    public void testValidInput(String name, String input, List<TokenType> tokenTypes) throws IOException {
         List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-    
-    @Test
-    public void testCommentsError() {
-        final String input = "/* 1234 \n";
-        assertThrows(LexerException.class, () -> Lexer.tokenize(input));
+        // Compare exactly and in order; containsAll ignores order and duplicate counts.
+        assertThat(result)
+                .extracting(Token::type)
+                .containsExactlyElementsOf(tokenTypes);
     }
 
-    @Test
-    public void testExtendedWordError() {
-        final String input = "` 1234";
-        assertThrows(LexerException.class, () -> Lexer.tokenize(input));
-    }
-
-    @Test
-    public void testUnicodeCharacterIdentifier() {
-        String input = "€ = 1";
-        List<Token> expList = list(EQ, NUMBER);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-
-    @Test
-    public void testUnicodeCharacterExtendedWordIdentifier() {
-        String input = "`€` = 1";
-        List<Token> expList = list(WORD, EQ, NUMBER);
-        List<Token> result = Lexer.tokenize(input);
-        assertTokens(expList, result);
-    }
-
-    @Test
-    public void testUnicodeCharacterEOF() {
-        String input = "€";
-        assertTrue(Lexer.tokenize(input).isEmpty());
+    @ParameterizedTest
+    @MethodSource("invalidData")
+    public void testInvalidInput(String name, String input) throws IOException {
+        assertThatThrownBy(() -> Lexer.tokenize(input))
+                .isInstanceOf(LexerException.class);
     }
     
-    private static void assertTokens(List<Token> expList, List<Token> result) {
-        final int length = expList.size();
-        assertEquals(length, result.size());
-        for (int i = 0; i < length; i++) {
-            assertEquals(expList.get(i).type(), result.get(i).type());
-        }
+    private static void assertTokens(List<Token> result, TokenType... tokenTypes) {
+        assertThat(result)
+                .hasSize(tokenTypes.length)
+                .extracting(Token::type)
+                .containsExactly(tokenTypes);
     }
-    
-    private static List<Token> list(TokenType... types) {
-        final List<Token> list = new ArrayList<>();
-        for (TokenType t : types) {
-            list.add(token(t));
-        }
-        return list;
-    }
-    
-    private static Token token(TokenType type) {
-        return token(type, "", new Pos(0, 0));
-    }
-    
-    private static Token token(TokenType type, String text, Pos pos) {
-        return new Token(type, text, pos);
-    }
-    
 }
diff --git a/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java
new file mode 100644
index 0000000..5c0390c
--- /dev/null
+++ b/ownlang-parser/src/test/java/com/annimon/ownlang/parser/LexerValidDataProvider.java
@@ -0,0 +1,91 @@
+package com.annimon.ownlang.parser;
+
+import org.junit.jupiter.params.provider.Arguments;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Stream;
+import static com.annimon.ownlang.parser.TokenType.*;
+
+public class LexerValidDataProvider {
+
+    public static Stream<Arguments> getAll() {
+        final var result = new ArrayList<Arguments>();
+        result.addAll(numbers());
+        result.addAll(keywords());
+        result.addAll(words());
+        result.addAll(operators());
+        result.addAll(comments());
+        result.addAll(other());
+        result.addAll(notSupported());
+        return result.stream();
+    }
+
+    private static List<Arguments> numbers() {
+        return List.of(
+                Arguments.of("Numbers",
+                        "12 7.8 90000000 10.03",
+                        List.of(NUMBER, NUMBER, NUMBER, NUMBER)),
+                Arguments.of("Hex numbers",
+                        "#FF 0xCA 0x12fb 0xFF",
+                        List.of(HEX_NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER))
+        );
+    }
+
+    private static List<Arguments> keywords() {
+        return List.of(
+                Arguments.of("Keywords",
+                        "if else while for include",
+                        List.of(IF, ELSE, WHILE, FOR, INCLUDE))
+        );
+    }
+
+    private static List<Arguments> words() {
+        return List.of(
+                Arguments.of("Word",
+                        "if bool include \"text\n\ntext\"",
+                        List.of(IF, WORD, INCLUDE, TEXT)),
+                Arguments.of("Extended word identifier",
+                        "`€` = 1",
+                        List.of(WORD, EQ, NUMBER))
+        );
+    }
+
+    private static List<Arguments> operators() {
+        return List.of(
+                Arguments.of("Operators",
+                        "=+-*/%<>!&|",
+                        List.of(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR)),
+                Arguments.of("Operators 2 characters",
+                        "== != <= >= && || ==+ >=- ->",
+                        List.of(EQEQ, EXCLEQ, LTEQ, GTEQ,  AMPAMP, BARBAR,
+                                EQEQ, PLUS,   GTEQ, MINUS, MINUS,  GT))
+        );
+    }
+
+    private static List<Arguments> comments() {
+        return List.of(
+                Arguments.of("Comments",
+                        "// /* 1234 \n */",
+                        List.of(STAR, SLASH))
+        );
+    }
+
+    private static List<Arguments> other() {
+        return List.of(
+                Arguments.of("Arithmetic",
+                        "x = -1 + 2 * 3 % 4 / 5",
+                        List.of(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER))
+        );
+    }
+
+    private static List<Arguments> notSupported() {
+        return List.of(
+                Arguments.of("Float notation",
+                        "7e8",
+                        List.of(NUMBER, WORD)),
+                Arguments.of("Float hex numbers",
+                        "0Xf7p6",
+                        List.of(HEX_NUMBER, WORD))
+        );
+    }
+}