Fix incorrect token positions in lexer

This commit is contained in:
aNNiMON 2023-09-11 19:15:01 +03:00 committed by Victor Melnik
parent fc73bce943
commit 7baf9f6fc8
4 changed files with 79 additions and 23 deletions

View File

@ -7,7 +7,8 @@ ext {
jline: '2.14.5', // jline:jline jline: '2.14.5', // jline:jline
junit: '5.9.2', // org.junit:junit-bom junit: '5.9.2', // org.junit:junit-bom
jmh: '1.37' // org.openjdk.jmh:jmh-core jmh: '1.37', // org.openjdk.jmh:jmh-core
assertj: '3.24.2' // org.assertj:assertj-core
] ]
} }

View File

@ -12,6 +12,7 @@ dependencies {
testImplementation platform("org.junit:junit-bom:${versions.junit}") testImplementation platform("org.junit:junit-bom:${versions.junit}")
testImplementation "org.junit.jupiter:junit-jupiter-params:${versions.junit}" testImplementation "org.junit.jupiter:junit-jupiter-params:${versions.junit}"
testImplementation 'org.junit.jupiter:junit-jupiter' testImplementation 'org.junit.jupiter:junit-jupiter'
testImplementation("org.assertj:assertj-core:${versions.assertj}")
testImplementation "org.openjdk.jmh:jmh-core:${versions.jmh}" testImplementation "org.openjdk.jmh:jmh-core:${versions.jmh}"
testImplementation "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}" testImplementation "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"
testAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}" testAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"

View File

@ -138,10 +138,7 @@ public final class Lexer {
else if (isOwnLangIdentifierStart(current)) tokenizeWord(); else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord(); else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText(); else if (current == '"') tokenizeText();
else if (current == '#') { else if (current == '#') tokenizeHexNumber(1);
next();
tokenizeHexNumber(1);
}
else if (OPERATOR_CHARS.indexOf(current) != -1) { else if (OPERATOR_CHARS.indexOf(current) != -1) {
tokenizeOperator(); tokenizeOperator();
} else { } else {
@ -154,10 +151,9 @@ public final class Lexer {
private void tokenizeNumber() { private void tokenizeNumber() {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
char current = peek(0); char current = peek(0);
if (current == '0' && (peek(1) == 'x' || (peek(1) == 'X'))) { if (current == '0' && (peek(1) == 'x' || (peek(1) == 'X'))) {
next();
next();
tokenizeHexNumber(2); tokenizeHexNumber(2);
return; return;
} }
@ -170,11 +166,15 @@ public final class Lexer {
buffer.append(current); buffer.append(current);
current = next(); current = next();
} }
addToken(TokenType.NUMBER, buffer.toString()); addToken(TokenType.NUMBER, buffer.toString(), startPos);
} }
private void tokenizeHexNumber(int skipped) { private void tokenizeHexNumber(int skipChars) {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
// Skip HEX prefix 0x or #
for (int i = 0; i < skipChars; i++) next();
char current = peek(0); char current = peek(0);
while (isHexNumber(current) || (current == '_')) { while (isHexNumber(current) || (current == '_')) {
if (current != '_') { if (current != '_') {
@ -185,7 +185,7 @@ public final class Lexer {
} }
final int length = buffer.length(); final int length = buffer.length();
if (length > 0) { if (length > 0) {
addToken(TokenType.HEX_NUMBER, buffer.toString()); addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
} }
} }
@ -210,11 +210,13 @@ public final class Lexer {
return; return;
} }
} }
final Pos startPos = markPos();
clearBuffer(); clearBuffer();
while (true) { while (true) {
final String text = buffer.toString(); final String text = buffer.toString();
if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) { if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) {
addToken(OPERATORS.get(text)); addToken(OPERATORS.get(text), startPos);
return; return;
} }
buffer.append(current); buffer.append(current);
@ -224,6 +226,7 @@ public final class Lexer {
private void tokenizeWord() { private void tokenizeWord() {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
buffer.append(peek(0)); buffer.append(peek(0));
char current = next(); char current = next();
while (true) { while (true) {
@ -236,13 +239,14 @@ public final class Lexer {
final String word = buffer.toString(); final String word = buffer.toString();
if (KEYWORDS.containsKey(word)) { if (KEYWORDS.containsKey(word)) {
addToken(KEYWORDS.get(word)); addToken(KEYWORDS.get(word), startPos);
} else { } else {
addToken(TokenType.WORD, word); addToken(TokenType.WORD, word, startPos);
} }
} }
private void tokenizeExtendedWord() { private void tokenizeExtendedWord() {
final Pos startPos = markPos();
next();// skip ` next();// skip `
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
@ -254,10 +258,11 @@ public final class Lexer {
current = next(); current = next();
} }
next(); // skip closing ` next(); // skip closing `
addToken(TokenType.WORD, buffer.toString()); addToken(TokenType.WORD, buffer.toString(), startPos);
} }
private void tokenizeText() { private void tokenizeText() {
final Pos startPos = markPos();
next();// skip " next();// skip "
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
@ -303,7 +308,7 @@ public final class Lexer {
} }
next(); // skip closing " next(); // skip closing "
addToken(TokenType.TEXT, buffer.toString()); addToken(TokenType.TEXT, buffer.toString(), startPos);
} }
private void tokenizeComment() { private void tokenizeComment() {
@ -336,14 +341,19 @@ public final class Lexer {
buffer.setLength(0); buffer.setLength(0);
} }
private Pos markPos() {
return new Pos(row, col);
}
private char next() { private char next() {
pos++;
final char result = peek(0); final char result = peek(0);
if (result == '\n') { if (result == '\n') {
row++; row++;
col = 1; col = 1;
} else col++; } else col++;
return result;
pos++;
return peek(0);
} }
private char peek(int relativePosition) { private char peek(int relativePosition) {
@ -352,15 +362,15 @@ public final class Lexer {
return input.charAt(position); return input.charAt(position);
} }
private void addToken(TokenType type) { private void addToken(TokenType type, Pos startPos) {
addToken(type, ""); addToken(type, "", startPos);
} }
private void addToken(TokenType type, String text) { private void addToken(TokenType type, String text, Pos startRow) {
tokens.add(new Token(type, text, new Pos(row, col))); tokens.add(new Token(type, text, startRow));
} }
private LexerException error(String text) { private LexerException error(String text) {
return new LexerException(new Pos(row, col), text); return new LexerException(markPos(), text);
} }
} }

View File

@ -0,0 +1,44 @@
package com.annimon.ownlang.parser;
import org.junit.jupiter.api.Test;
import java.util.List;
import static com.annimon.ownlang.parser.TokenType.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.tuple;
/**
 * Regression tests for token positions produced by {@code Lexer.tokenize}:
 * each token must carry the row/col of its FIRST character
 * (see commit "Fix incorrect token positions in lexer").
 * Rows and columns are 1-based, as asserted below.
 */
class LexerPositionsTest {

    @Test
    void testMultiline() {
        // Two one-line statements; tokens on line 2 must report row 2,
        // and every token's col is where it starts, not where lexing ended.
        String input = """
                x = 123
                y = "abc"
                """.stripIndent();
        List<Token> result = Lexer.tokenize(input);
        assertThat(result)
                .hasSize(6)
                // project the (row, col, type) triple of each token
                .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 3, EQ), tuple(1, 5, NUMBER),
                        tuple(2, 1, WORD), tuple(2, 3, EQ), tuple(2, 5, TEXT)
                );
    }

    @Test
    void testMultilineText() {
        // A string literal spanning three lines: the TEXT token's position
        // must be the opening quote (row 1, col 8), even though the lexer
        // advances rows while consuming the literal's body.
        String input = """
                text = "line1
                line2
                line3"
                """.stripIndent();
        List<Token> result = Lexer.tokenize(input);
        assertThat(result)
                .hasSize(3)
                .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT)
                );
    }
}