Stricter lexer: fixed HEX number parsing and quote escaping

This commit is contained in:
aNNiMON 2023-09-16 20:12:01 +03:00 committed by Victor Melnik
parent 15c277d145
commit 2c0b19eb0a
4 changed files with 214 additions and 169 deletions

View File

@ -133,21 +133,20 @@ public final class Lexer {
while (pos < length) {
// Fast path for skipping whitespaces
while (Character.isWhitespace(peek(0))) {
next();
skip();
}
final char current = peek(0);
if (Character.isDigit(current)) tokenizeNumber();
if (isNumber(current)) tokenizeNumber();
else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText();
else if (OPERATOR_CHARS.indexOf(current) != -1) tokenizeOperator();
else if (Character.isWhitespace(current)) skip();
else if (current == '`') tokenizeExtendedWord();
else if (current == '#') tokenizeHexNumber(1);
else if (OPERATOR_CHARS.indexOf(current) != -1) {
tokenizeOperator();
} else {
// whitespaces
next();
}
else if (current == ';') skip(); // ignore semicolon
else if (current == '\0') break;
else throw error("Unknown token " + current);
}
return tokens;
}
@ -163,7 +162,7 @@ public final class Lexer {
boolean hasDot = false;
while (true) {
if (current == '.') {
if (hasDot) throw error("Invalid float number");
if (hasDot) throw error("Invalid float number " + buffer);
hasDot = true;
} else if (!Character.isDigit(current)) {
break;
@ -178,7 +177,7 @@ public final class Lexer {
clearBuffer();
final Pos startPos = markPos();
// Skip HEX prefix 0x or #
for (int i = 0; i < skipChars; i++) next();
for (int i = 0; i < skipChars; i++) skip();
char current = peek(0);
while (isHexNumber(current) || (current == '_')) {
@ -188,13 +187,18 @@ public final class Lexer {
}
current = next();
}
if (!buffer.isEmpty()) {
addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
}
if (buffer.isEmpty()) throw error("Empty HEX value");
if (peek(-1) == '_') throw error("HEX value cannot end with _");
addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
}
// ASCII-only digit check ('0'..'9'); deliberately narrower than Character.isDigit,
// which also accepts non-ASCII Unicode digits.
private static boolean isNumber(char current) {
    return current >= '0' && current <= '9';
}
private static boolean isHexNumber(char current) {
return Character.isDigit(current)
return ('0' <= current && current <= '9')
|| ('a' <= current && current <= 'f')
|| ('A' <= current && current <= 'F');
}
@ -203,13 +207,9 @@ public final class Lexer {
char current = peek(0);
if (current == '/') {
if (peek(1) == '/') {
next();
next();
tokenizeComment();
return;
} else if (peek(1) == '*') {
next();
next();
tokenizeMultilineComment();
return;
}
@ -247,7 +247,7 @@ public final class Lexer {
private void tokenizeExtendedWord() {
final Pos startPos = markPos();
next();// skip `
skip();// skip `
clearBuffer();
char current = peek(0);
while (current != '`') {
@ -256,19 +256,20 @@ public final class Lexer {
buffer.append(current);
current = next();
}
next(); // skip closing `
skip(); // skip closing `
addToken(TokenType.WORD, buffer.toString(), startPos);
}
private void tokenizeText() {
final Pos startPos = markPos();
next();// skip "
skip();// skip "
clearBuffer();
char current = peek(0);
while (true) {
if (current == '\\') {
current = next();
switch (current) {
case '\\': current = next(); buffer.append('\\'); continue;
case '"': current = next(); buffer.append('"'); continue;
case '0': current = next(); buffer.append('\0'); continue;
case 'b': current = next(); buffer.append('\b'); continue;
@ -305,12 +306,14 @@ public final class Lexer {
buffer.append(current);
current = next();
}
next(); // skip closing "
skip(); // skip closing "
addToken(TokenType.TEXT, buffer.toString(), startPos);
}
private void tokenizeComment() {
skip(); // /
skip(); // /
char current = peek(0);
while ("\r\n\0".indexOf(current) == -1) {
current = next();
@ -318,13 +321,15 @@ public final class Lexer {
}
/**
 * Consumes a block comment. Expects the cursor on the opening "/" of the
 * "/" + "*" pair; throws if EOF is reached before the closing "*" + "/".
 * (Collapses the diff artifact where old next() and new skip() tail calls
 * coexisted, which would have consumed four characters at the end.)
 */
private void tokenizeMultilineComment() {
    skip(); // /
    skip(); // *
    char current = peek(0);
    while (current != '*' || peek(1) != '/') {
        if (current == '\0') throw error("Reached end of file while parsing multiline comment");
        current = next();
    }
    skip(); // *
    skip(); // /
}
private boolean isOwnLangIdentifierStart(char current) {
@ -332,7 +337,7 @@ public final class Lexer {
}
// An identifier continuation is any identifier-start character or an ASCII digit.
// (Collapses the old Character.isLetterOrDigit line that the diff view left interleaved.)
private boolean isOwnLangIdentifierPart(char current) {
    return isOwnLangIdentifierStart(current) || isNumber(current);
}
private void clearBuffer() {
@ -342,18 +347,22 @@ public final class Lexer {
// Snapshots the current row/col as the start position of the token being scanned.
private Pos markPos() {
return new Pos(row, col);
}
private char next() {
final char result = peek(0);
private void skip() {
if (pos >= length) return;
final char result = input.charAt(pos);
if (result == '\n') {
row++;
col = 1;
} else col++;
pos++;
return peek(0);
}
private char next() {
skip();
return peek(0);
}
private char peek(int relativePosition) {
final int position = pos + relativePosition;
if (position >= length) return '\0';

View File

@ -31,6 +31,26 @@ class LexerPositionsTest {
text = "line1
line2
line3"
a = 3
""".stripIndent();
List<Token> result = Lexer.tokenize(input);
assertThat(result)
.hasSize(6)
.extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
.containsExactly(
tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT),
tuple(4, 1, WORD), tuple(4, 3, EQ), tuple(4, 5, NUMBER)
);
}
@Test
void testMultilineComment() {
String input = """
/*
line2
line*/a =/*
*/3
""".stripIndent();
List<Token> result = Lexer.tokenize(input);
@ -38,7 +58,7 @@ class LexerPositionsTest {
.hasSize(3)
.extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
.containsExactly(
tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT)
tuple(3, 9, WORD), tuple(3, 11, EQ), tuple(4, 3, NUMBER)
);
}
}

View File

@ -2,181 +2,106 @@ package com.annimon.ownlang.parser;
import com.annimon.ownlang.exceptions.LexerException;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.List;
import java.util.stream.Stream;
import static com.annimon.ownlang.parser.TokenType.*;
import static org.junit.jupiter.api.Assertions.*;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
*
* @author aNNiMON
*/
public class LexerTest {
// Feeds testValidInput; the cases themselves live in LexerValidDataProvider.
public static Stream<Arguments> validData() {
return LexerValidDataProvider.getAll();
}
// Inputs that must make the lexer throw, each paired with a display name.
public static Stream<Arguments> invalidData() {
    return Stream.of(
            Arguments.of("Wrong float point", "3.14.15"),
            Arguments.of("Wrong HEX number", "0Xf7_p6_s5"),
            Arguments.of("HEX number ends with _", "0Xf7_"),
            Arguments.of("Empty rest of HEX number", "#"),
            Arguments.of("Unicode character identifier", "€ = 1"),
            // NOTE(review): the empty input below looks like a non-ASCII char lost in this view — verify.
            Arguments.of("Unicode character only", ""),
            Arguments.of("String error", "\"1\"\""),
            Arguments.of("Multiline comment EOF", "/* 1234 \n"),
            Arguments.of("Extended word EOF", "` 1234")
    );
}
/**
 * Decimal, float, 0x-prefixed and #-prefixed HEX numbers.
 * 0Xf7_d6_c5 checks that '_' separators are dropped from the token text.
 * (Collapses removed sibling tests the diff view left interleaved; their
 * cases moved to LexerValidDataProvider / invalidData.)
 */
@Test
public void testNumbers() {
    String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF";
    List<Token> result = Lexer.tokenize(input);
    assertTokens(result, NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER);
    assertThat(result)
            .extracting(Token::text)
            .containsExactly("0", "3.1415", "CAFEBABE", "f7d6c5", "FFFF");
}
// A string literal containing an escaped quote: source text is "1\"2".
// (Collapses the old expList-style assertion lines the diff view left interleaved.)
@Test
public void testString() {
    String input = "\"1\\\"2\"";
    List<Token> result = Lexer.tokenize(input);
    assertTokens(result, TEXT);
    assertEquals("1\"2", result.get(0).text());
}
@Test
public void testEscapeString() {
// Source seen by the lexer is "\\/\\" — each \\ escape collapses to one backslash.
String input = """
"\\\\/\\\\"
""".stripIndent();
List<Token> result = Lexer.tokenize(input);
assertTokens(result, TEXT);
assertEquals("\\/\\", result.get(0).text());
}
// The empty string literal "" must produce a single TEXT token with empty text.
// (Collapses the old expList-style assertion lines the diff view left interleaved.)
@Test
public void testEmptyString() {
    String input = "\"\"";
    List<Token> result = Lexer.tokenize(input);
    assertTokens(result, TEXT);
    assertEquals("", result.get(0).text());
}
@Test
public void testStringError() {
// Unterminated trailing quote after a valid string literal must throw.
String input = "\"1\"\"";
List<Token> expList = list(TEXT);
assertThrows(LexerException.class, () -> {
List<Token> result = Lexer.tokenize(input);
// NOTE(review): the asserts below never run — tokenize throws first.
assertTokens(expList, result);
assertEquals("1", result.get(0).text());
});
}
@Test
public void testOperators() {
// Every single-character operator back to back, with no separators.
String input = "=+-*/%<>!&|";
List<Token> expList = list(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testOperators2Char() {
// Two-character operators, plus cases like "==+" checking maximal-munch splitting.
String input = "== != <= >= && || ==+ >=- ->";
List<Token> expList = list(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR,
EQEQ, PLUS, GTEQ, MINUS, MINUS, GT);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
// A line comment, an empty block comment and a multi-line block comment
// around a single number: only that number must survive.
// (Collapses the old expList-style assertion lines the diff view left interleaved.)
@Test
public void testComments() {
    String input = "// 1234 \n /* */ 123 /* \n 12345 \n\n\n */";
    List<Token> result = Lexer.tokenize(input);
    assertTokens(result, NUMBER);
    assertEquals("123", result.get(0).text());
}
@Test
public void testComments2() {
String input = "// /* 1234 \n */";
List<Token> expList = list(STAR, SLASH);
@ParameterizedTest
@MethodSource("validData")
public void testValidInput(String name, String input, List<TokenType> tokenTypes) throws IOException {
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testCommentsError() {
final String input = "/* 1234 \n";
assertThrows(LexerException.class, () -> Lexer.tokenize(input));
assertThat(result)
.hasSize(tokenTypes.size())
.extracting(Token::type)
.containsAll(tokenTypes);
}
@Test
public void testExtendedWordError() {
// A backquoted identifier that is never closed before EOF must throw.
final String input = "` 1234";
assertThrows(LexerException.class, () -> Lexer.tokenize(input));
}
@Test
public void testUnicodeCharacterIdentifier() {
// '€' is not a valid identifier start, so only EQ and NUMBER are expected.
// NOTE(review): assumes the lexer silently drops the unknown character — verify
// against the current tokenize loop, which may throw on unknown tokens instead.
String input = "€ = 1";
List<Token> expList = list(EQ, NUMBER);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testUnicodeCharacterExtendedWordIdentifier() {
// Backquotes allow arbitrary characters inside an identifier.
String input = "`€` = 1";
List<Token> expList = list(WORD, EQ, NUMBER);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testUnicodeCharacterEOF() {
String input = "";
assertTrue(Lexer.tokenize(input).isEmpty());
@ParameterizedTest
@MethodSource("invalidData")
public void testInvalidInput(String name, String input) throws IOException {
assertThatThrownBy(() -> Lexer.tokenize(input))
.isInstanceOf(LexerException.class);
}
private static void assertTokens(List<Token> expList, List<Token> result) {
final int length = expList.size();
assertEquals(length, result.size());
for (int i = 0; i < length; i++) {
assertEquals(expList.get(i).type(), result.get(i).type());
}
private static void assertTokens(List<Token> result, TokenType... tokenTypes) {
assertThat(result)
.hasSize(tokenTypes.length)
.extracting(Token::type)
.containsExactly(tokenTypes);
}
// Builds expected-token lists for type-only comparisons (text/position are placeholders).
private static List<Token> list(TokenType... types) {
    final List<Token> result = new ArrayList<>(types.length);
    for (TokenType type : types) {
        result.add(token(type));
    }
    return result;
}

// A token whose type is all that matters to the comparison helpers.
private static Token token(TokenType type) {
    return token(type, "", new Pos(0, 0));
}

private static Token token(TokenType type, String text, Pos pos) {
    return new Token(type, text, pos);
}
}

View File

@ -0,0 +1,91 @@
package com.annimon.ownlang.parser;
import org.junit.jupiter.params.provider.Arguments;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
import static com.annimon.ownlang.parser.TokenType.*;
/**
 * Valid-input test data for LexerTest#testValidInput: each case is
 * (display name, source snippet, expected token types in order).
 */
public class LexerValidDataProvider {

    public static Stream<Arguments> getAll() {
        return Stream.of(numbers(), keywords(), words(), operators(), comments(), other(), notSupported())
                .flatMap(List::stream);
    }

    // Shorthand for a single case; the token types are wrapped into an immutable list.
    private static Arguments args(String name, String input, TokenType... types) {
        return Arguments.of(name, input, List.of(types));
    }

    private static List<Arguments> numbers() {
        return List.of(
                args("Numbers", "12 7.8 90000000 10.03",
                        NUMBER, NUMBER, NUMBER, NUMBER),
                args("Hex numbers", "#FF 0xCA 0x12fb 0xFF",
                        HEX_NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER)
        );
    }

    private static List<Arguments> keywords() {
        return List.of(
                args("Keywords", "if else while for include",
                        IF, ELSE, WHILE, FOR, INCLUDE)
        );
    }

    private static List<Arguments> words() {
        return List.of(
                args("Word", "if bool include \"text\n\ntext\"",
                        IF, WORD, INCLUDE, TEXT),
                args("Extended word identifier", "`€` = 1",
                        WORD, EQ, NUMBER)
        );
    }

    private static List<Arguments> operators() {
        return List.of(
                args("Operators", "=+-*/%<>!&|",
                        EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR),
                args("Operators 2 characters", "== != <= >= && || ==+ >=- ->",
                        EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR,
                        EQEQ, PLUS, GTEQ, MINUS, MINUS, GT)
        );
    }

    private static List<Arguments> comments() {
        return List.of(
                // The line comment swallows the block-comment opener, leaving bare "*" and "/".
                args("Comments", "// /* 1234 \n */",
                        STAR, SLASH)
        );
    }

    private static List<Arguments> other() {
        return List.of(
                args("Arithmetic", "x = -1 + 2 * 3 % 4 / 5",
                        WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER)
        );
    }

    // Notations the lexer deliberately does not support: each splits into two tokens.
    private static List<Arguments> notSupported() {
        return List.of(
                args("Float notation", "7e8",
                        NUMBER, WORD),
                args("Float hex numbers", "0Xf7p6",
                        HEX_NUMBER, WORD)
        );
    }
}