Fix incorrect token positions in lexer

This commit is contained in:
aNNiMON 2023-09-11 19:15:01 +03:00 committed by Victor Melnik
parent fc73bce943
commit 7baf9f6fc8
4 changed files with 79 additions and 23 deletions

View File

@@ -7,7 +7,8 @@ ext {
jline: '2.14.5', // jline:jline
junit: '5.9.2', // org.junit:junit-bom
jmh: '1.37' // org.openjdk.jmh:jmh-core
jmh: '1.37', // org.openjdk.jmh:jmh-core
assertj: '3.24.2' // org.assertj:assertj-core
]
}

View File

@@ -12,6 +12,7 @@ dependencies {
testImplementation platform("org.junit:junit-bom:${versions.junit}")
testImplementation "org.junit.jupiter:junit-jupiter-params:${versions.junit}"
testImplementation 'org.junit.jupiter:junit-jupiter'
testImplementation("org.assertj:assertj-core:${versions.assertj}")
testImplementation "org.openjdk.jmh:jmh-core:${versions.jmh}"
testImplementation "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"
testAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"

View File

@@ -138,10 +138,7 @@ public final class Lexer {
else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText();
else if (current == '#') {
next();
tokenizeHexNumber(1);
}
else if (current == '#') tokenizeHexNumber(1);
else if (OPERATOR_CHARS.indexOf(current) != -1) {
tokenizeOperator();
} else {
@@ -154,10 +151,9 @@ public final class Lexer {
private void tokenizeNumber() {
clearBuffer();
final Pos startPos = markPos();
char current = peek(0);
if (current == '0' && (peek(1) == 'x' || (peek(1) == 'X'))) {
next();
next();
tokenizeHexNumber(2);
return;
}
@@ -170,11 +166,15 @@ public final class Lexer {
buffer.append(current);
current = next();
}
addToken(TokenType.NUMBER, buffer.toString());
addToken(TokenType.NUMBER, buffer.toString(), startPos);
}
private void tokenizeHexNumber(int skipped) {
private void tokenizeHexNumber(int skipChars) {
clearBuffer();
final Pos startPos = markPos();
// Skip HEX prefix 0x or #
for (int i = 0; i < skipChars; i++) next();
char current = peek(0);
while (isHexNumber(current) || (current == '_')) {
if (current != '_') {
@@ -185,7 +185,7 @@ public final class Lexer {
}
final int length = buffer.length();
if (length > 0) {
addToken(TokenType.HEX_NUMBER, buffer.toString());
addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
}
}
@@ -210,11 +210,13 @@ public final class Lexer {
return;
}
}
final Pos startPos = markPos();
clearBuffer();
while (true) {
final String text = buffer.toString();
if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) {
addToken(OPERATORS.get(text));
addToken(OPERATORS.get(text), startPos);
return;
}
buffer.append(current);
@@ -224,6 +226,7 @@ public final class Lexer {
private void tokenizeWord() {
clearBuffer();
final Pos startPos = markPos();
buffer.append(peek(0));
char current = next();
while (true) {
@@ -236,13 +239,14 @@ public final class Lexer {
final String word = buffer.toString();
if (KEYWORDS.containsKey(word)) {
addToken(KEYWORDS.get(word));
addToken(KEYWORDS.get(word), startPos);
} else {
addToken(TokenType.WORD, word);
addToken(TokenType.WORD, word, startPos);
}
}
private void tokenizeExtendedWord() {
final Pos startPos = markPos();
next();// skip `
clearBuffer();
char current = peek(0);
@@ -254,10 +258,11 @@ public final class Lexer {
current = next();
}
next(); // skip closing `
addToken(TokenType.WORD, buffer.toString());
addToken(TokenType.WORD, buffer.toString(), startPos);
}
private void tokenizeText() {
final Pos startPos = markPos();
next();// skip "
clearBuffer();
char current = peek(0);
@@ -303,7 +308,7 @@ public final class Lexer {
}
next(); // skip closing "
addToken(TokenType.TEXT, buffer.toString());
addToken(TokenType.TEXT, buffer.toString(), startPos);
}
private void tokenizeComment() {
@@ -335,15 +340,20 @@ public final class Lexer {
private void clearBuffer() {
buffer.setLength(0);
}
private Pos markPos() {
return new Pos(row, col);
}
private char next() {
pos++;
final char result = peek(0);
if (result == '\n') {
row++;
col = 1;
} else col++;
return result;
pos++;
return peek(0);
}
private char peek(int relativePosition) {
@@ -352,15 +362,15 @@ public final class Lexer {
return input.charAt(position);
}
private void addToken(TokenType type) {
addToken(type, "");
private void addToken(TokenType type, Pos startPos) {
addToken(type, "", startPos);
}
private void addToken(TokenType type, String text) {
tokens.add(new Token(type, text, new Pos(row, col)));
private void addToken(TokenType type, String text, Pos startRow) {
tokens.add(new Token(type, text, startRow));
}
private LexerException error(String text) {
return new LexerException(new Pos(row, col), text);
return new LexerException(markPos(), text);
}
}

View File

@@ -0,0 +1,44 @@
package com.annimon.ownlang.parser;
import org.junit.jupiter.api.Test;
import java.util.List;
import static com.annimon.ownlang.parser.TokenType.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.tuple;
/**
 * Verifies that {@code Lexer.tokenize} reports the position where each token
 * STARTS (row/column), including tokens that follow a multi-line text literal.
 */
class LexerPositionsTest {

    @Test
    void testMultiline() {
        // Two simple assignments on consecutive lines; the text block's
        // incidental indentation is removed at compile time, and stripIndent()
        // keeps the fixture identical either way.
        String source = """
                x = 123
                y = "abc"
                """.stripIndent();

        List<Token> tokens = Lexer.tokenize(source);

        // Each token must carry the row/col of its FIRST character,
        // not the position where the lexer finished consuming it.
        assertThat(tokens)
                .hasSize(6)
                .extracting(t -> t.pos().row(), t -> t.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 3, EQ), tuple(1, 5, NUMBER),
                        tuple(2, 1, WORD), tuple(2, 3, EQ), tuple(2, 5, TEXT));
    }

    @Test
    void testMultilineText() {
        // A text literal spanning three lines: the TEXT token's position
        // must point at the opening quote on line 1.
        String source = """
                text = "line1
                line2
                line3"
                """.stripIndent();

        List<Token> tokens = Lexer.tokenize(source);

        assertThat(tokens)
                .hasSize(3)
                .extracting(t -> t.pos().row(), t -> t.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT));
    }
}