Fix incorrect token positions in lexer

This commit is contained in:
aNNiMON 2023-09-11 19:15:01 +03:00 committed by Victor Melnik
parent fc73bce943
commit 7baf9f6fc8
4 changed files with 79 additions and 23 deletions

View File

@ -7,7 +7,8 @@ ext {
jline: '2.14.5', // jline:jline jline: '2.14.5', // jline:jline
junit: '5.9.2', // org.junit:junit-bom junit: '5.9.2', // org.junit:junit-bom
jmh: '1.37' // org.openjdk.jmh:jmh-core jmh: '1.37', // org.openjdk.jmh:jmh-core
assertj: '3.24.2' // org.assertj:assertj-core
] ]
} }

View File

@ -12,6 +12,7 @@ dependencies {
testImplementation platform("org.junit:junit-bom:${versions.junit}") testImplementation platform("org.junit:junit-bom:${versions.junit}")
testImplementation "org.junit.jupiter:junit-jupiter-params:${versions.junit}" testImplementation "org.junit.jupiter:junit-jupiter-params:${versions.junit}"
testImplementation 'org.junit.jupiter:junit-jupiter' testImplementation 'org.junit.jupiter:junit-jupiter'
testImplementation("org.assertj:assertj-core:${versions.assertj}")
testImplementation "org.openjdk.jmh:jmh-core:${versions.jmh}" testImplementation "org.openjdk.jmh:jmh-core:${versions.jmh}"
testImplementation "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}" testImplementation "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"
testAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}" testAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${versions.jmh}"

View File

@ -138,10 +138,7 @@ public final class Lexer {
else if (isOwnLangIdentifierStart(current)) tokenizeWord(); else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord(); else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText(); else if (current == '"') tokenizeText();
else if (current == '#') { else if (current == '#') tokenizeHexNumber(1);
next();
tokenizeHexNumber(1);
}
else if (OPERATOR_CHARS.indexOf(current) != -1) { else if (OPERATOR_CHARS.indexOf(current) != -1) {
tokenizeOperator(); tokenizeOperator();
} else { } else {
@ -154,10 +151,9 @@ public final class Lexer {
private void tokenizeNumber() { private void tokenizeNumber() {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
char current = peek(0); char current = peek(0);
if (current == '0' && (peek(1) == 'x' || (peek(1) == 'X'))) { if (current == '0' && (peek(1) == 'x' || (peek(1) == 'X'))) {
next();
next();
tokenizeHexNumber(2); tokenizeHexNumber(2);
return; return;
} }
@ -170,11 +166,15 @@ public final class Lexer {
buffer.append(current); buffer.append(current);
current = next(); current = next();
} }
addToken(TokenType.NUMBER, buffer.toString()); addToken(TokenType.NUMBER, buffer.toString(), startPos);
} }
private void tokenizeHexNumber(int skipped) { private void tokenizeHexNumber(int skipChars) {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
// Skip HEX prefix 0x or #
for (int i = 0; i < skipChars; i++) next();
char current = peek(0); char current = peek(0);
while (isHexNumber(current) || (current == '_')) { while (isHexNumber(current) || (current == '_')) {
if (current != '_') { if (current != '_') {
@ -185,7 +185,7 @@ public final class Lexer {
} }
final int length = buffer.length(); final int length = buffer.length();
if (length > 0) { if (length > 0) {
addToken(TokenType.HEX_NUMBER, buffer.toString()); addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
} }
} }
@ -210,11 +210,13 @@ public final class Lexer {
return; return;
} }
} }
final Pos startPos = markPos();
clearBuffer(); clearBuffer();
while (true) { while (true) {
final String text = buffer.toString(); final String text = buffer.toString();
if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) { if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) {
addToken(OPERATORS.get(text)); addToken(OPERATORS.get(text), startPos);
return; return;
} }
buffer.append(current); buffer.append(current);
@ -224,6 +226,7 @@ public final class Lexer {
private void tokenizeWord() { private void tokenizeWord() {
clearBuffer(); clearBuffer();
final Pos startPos = markPos();
buffer.append(peek(0)); buffer.append(peek(0));
char current = next(); char current = next();
while (true) { while (true) {
@ -236,13 +239,14 @@ public final class Lexer {
final String word = buffer.toString(); final String word = buffer.toString();
if (KEYWORDS.containsKey(word)) { if (KEYWORDS.containsKey(word)) {
addToken(KEYWORDS.get(word)); addToken(KEYWORDS.get(word), startPos);
} else { } else {
addToken(TokenType.WORD, word); addToken(TokenType.WORD, word, startPos);
} }
} }
private void tokenizeExtendedWord() { private void tokenizeExtendedWord() {
final Pos startPos = markPos();
next();// skip ` next();// skip `
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
@ -254,10 +258,11 @@ public final class Lexer {
current = next(); current = next();
} }
next(); // skip closing ` next(); // skip closing `
addToken(TokenType.WORD, buffer.toString()); addToken(TokenType.WORD, buffer.toString(), startPos);
} }
private void tokenizeText() { private void tokenizeText() {
final Pos startPos = markPos();
next();// skip " next();// skip "
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
@ -303,7 +308,7 @@ public final class Lexer {
} }
next(); // skip closing " next(); // skip closing "
addToken(TokenType.TEXT, buffer.toString()); addToken(TokenType.TEXT, buffer.toString(), startPos);
} }
private void tokenizeComment() { private void tokenizeComment() {
@ -336,14 +341,19 @@ public final class Lexer {
buffer.setLength(0); buffer.setLength(0);
} }
private Pos markPos() {
return new Pos(row, col);
}
private char next() { private char next() {
pos++;
final char result = peek(0); final char result = peek(0);
if (result == '\n') { if (result == '\n') {
row++; row++;
col = 1; col = 1;
} else col++; } else col++;
return result;
pos++;
return peek(0);
} }
private char peek(int relativePosition) { private char peek(int relativePosition) {
@ -352,15 +362,15 @@ public final class Lexer {
return input.charAt(position); return input.charAt(position);
} }
private void addToken(TokenType type) { private void addToken(TokenType type, Pos startPos) {
addToken(type, ""); addToken(type, "", startPos);
} }
private void addToken(TokenType type, String text) { private void addToken(TokenType type, String text, Pos startRow) {
tokens.add(new Token(type, text, new Pos(row, col))); tokens.add(new Token(type, text, startRow));
} }
private LexerException error(String text) { private LexerException error(String text) {
return new LexerException(new Pos(row, col), text); return new LexerException(markPos(), text);
} }
} }

View File

@ -0,0 +1,44 @@
package com.annimon.ownlang.parser;
import org.junit.jupiter.api.Test;
import java.util.List;
import static com.annimon.ownlang.parser.TokenType.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.tuple;
/**
 * Regression tests for token positions produced by {@code Lexer.tokenize}:
 * each token must carry the row/col of its FIRST character
 * (see commit "Fix incorrect token positions in lexer").
 * Rows and columns are 1-based, as asserted below.
 */
class LexerPositionsTest {

    @Test
    void testMultiline() {
        // Two one-line statements; tokens on line 2 must report row 2,
        // and every token's col is where it starts, not where lexing ended.
        String input = """
                x = 123
                y = "abc"
                """.stripIndent();
        List<Token> result = Lexer.tokenize(input);
        assertThat(result)
                .hasSize(6)
                // project the (row, col, type) triple of each token
                .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 3, EQ), tuple(1, 5, NUMBER),
                        tuple(2, 1, WORD), tuple(2, 3, EQ), tuple(2, 5, TEXT)
                );
    }

    @Test
    void testMultilineText() {
        // A string literal spanning three lines: the TEXT token's position
        // must be the opening quote (row 1, col 8), even though the lexer
        // advances rows while consuming the literal's body.
        String input = """
                text = "line1
                line2
                line3"
                """.stripIndent();
        List<Token> result = Lexer.tokenize(input);
        assertThat(result)
                .hasSize(3)
                .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
                .containsExactly(
                        tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT)
                );
    }
}