More strict lexer, fixed HEX numbers and quote escaping

This commit is contained in:
aNNiMON 2023-09-16 20:12:01 +03:00 committed by Victor Melnik
parent 15c277d145
commit 2c0b19eb0a
4 changed files with 214 additions and 169 deletions

View File

@ -133,21 +133,20 @@ public final class Lexer {
while (pos < length) { while (pos < length) {
// Fast path for skipping whitespaces // Fast path for skipping whitespaces
while (Character.isWhitespace(peek(0))) { while (Character.isWhitespace(peek(0))) {
next(); skip();
} }
final char current = peek(0); final char current = peek(0);
if (Character.isDigit(current)) tokenizeNumber(); if (isNumber(current)) tokenizeNumber();
else if (isOwnLangIdentifierStart(current)) tokenizeWord(); else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText(); else if (current == '"') tokenizeText();
else if (OPERATOR_CHARS.indexOf(current) != -1) tokenizeOperator();
else if (Character.isWhitespace(current)) skip();
else if (current == '`') tokenizeExtendedWord();
else if (current == '#') tokenizeHexNumber(1); else if (current == '#') tokenizeHexNumber(1);
else if (OPERATOR_CHARS.indexOf(current) != -1) { else if (current == ';') skip(); // ignore semicolon
tokenizeOperator(); else if (current == '\0') break;
} else { else throw error("Unknown token " + current);
// whitespaces
next();
}
} }
return tokens; return tokens;
} }
@ -163,7 +162,7 @@ public final class Lexer {
boolean hasDot = false; boolean hasDot = false;
while (true) { while (true) {
if (current == '.') { if (current == '.') {
if (hasDot) throw error("Invalid float number"); if (hasDot) throw error("Invalid float number " + buffer);
hasDot = true; hasDot = true;
} else if (!Character.isDigit(current)) { } else if (!Character.isDigit(current)) {
break; break;
@ -178,7 +177,7 @@ public final class Lexer {
clearBuffer(); clearBuffer();
final Pos startPos = markPos(); final Pos startPos = markPos();
// Skip HEX prefix 0x or # // Skip HEX prefix 0x or #
for (int i = 0; i < skipChars; i++) next(); for (int i = 0; i < skipChars; i++) skip();
char current = peek(0); char current = peek(0);
while (isHexNumber(current) || (current == '_')) { while (isHexNumber(current) || (current == '_')) {
@ -188,13 +187,18 @@ public final class Lexer {
} }
current = next(); current = next();
} }
if (!buffer.isEmpty()) {
if (buffer.isEmpty()) throw error("Empty HEX value");
if (peek(-1) == '_') throw error("HEX value cannot end with _");
addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos); addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
} }
private static boolean isNumber(char current) {
return ('0' <= current && current <= '9');
} }
private static boolean isHexNumber(char current) { private static boolean isHexNumber(char current) {
return Character.isDigit(current) return ('0' <= current && current <= '9')
|| ('a' <= current && current <= 'f') || ('a' <= current && current <= 'f')
|| ('A' <= current && current <= 'F'); || ('A' <= current && current <= 'F');
} }
@ -203,13 +207,9 @@ public final class Lexer {
char current = peek(0); char current = peek(0);
if (current == '/') { if (current == '/') {
if (peek(1) == '/') { if (peek(1) == '/') {
next();
next();
tokenizeComment(); tokenizeComment();
return; return;
} else if (peek(1) == '*') { } else if (peek(1) == '*') {
next();
next();
tokenizeMultilineComment(); tokenizeMultilineComment();
return; return;
} }
@ -247,7 +247,7 @@ public final class Lexer {
private void tokenizeExtendedWord() { private void tokenizeExtendedWord() {
final Pos startPos = markPos(); final Pos startPos = markPos();
next();// skip ` skip();// skip `
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
while (current != '`') { while (current != '`') {
@ -256,19 +256,20 @@ public final class Lexer {
buffer.append(current); buffer.append(current);
current = next(); current = next();
} }
next(); // skip closing ` skip(); // skip closing `
addToken(TokenType.WORD, buffer.toString(), startPos); addToken(TokenType.WORD, buffer.toString(), startPos);
} }
private void tokenizeText() { private void tokenizeText() {
final Pos startPos = markPos(); final Pos startPos = markPos();
next();// skip " skip();// skip "
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
while (true) { while (true) {
if (current == '\\') { if (current == '\\') {
current = next(); current = next();
switch (current) { switch (current) {
case '\\': current = next(); buffer.append('\\'); continue;
case '"': current = next(); buffer.append('"'); continue; case '"': current = next(); buffer.append('"'); continue;
case '0': current = next(); buffer.append('\0'); continue; case '0': current = next(); buffer.append('\0'); continue;
case 'b': current = next(); buffer.append('\b'); continue; case 'b': current = next(); buffer.append('\b'); continue;
@ -305,12 +306,14 @@ public final class Lexer {
buffer.append(current); buffer.append(current);
current = next(); current = next();
} }
next(); // skip closing " skip(); // skip closing "
addToken(TokenType.TEXT, buffer.toString(), startPos); addToken(TokenType.TEXT, buffer.toString(), startPos);
} }
private void tokenizeComment() { private void tokenizeComment() {
skip(); // /
skip(); // /
char current = peek(0); char current = peek(0);
while ("\r\n\0".indexOf(current) == -1) { while ("\r\n\0".indexOf(current) == -1) {
current = next(); current = next();
@ -318,13 +321,15 @@ public final class Lexer {
} }
private void tokenizeMultilineComment() { private void tokenizeMultilineComment() {
skip(); // /
skip(); // *
char current = peek(0); char current = peek(0);
while (current != '*' || peek(1) != '/') { while (current != '*' || peek(1) != '/') {
if (current == '\0') throw error("Reached end of file while parsing multiline comment"); if (current == '\0') throw error("Reached end of file while parsing multiline comment");
current = next(); current = next();
} }
next(); // * skip(); // *
next(); // / skip(); // /
} }
private boolean isOwnLangIdentifierStart(char current) { private boolean isOwnLangIdentifierStart(char current) {
@ -332,7 +337,7 @@ public final class Lexer {
} }
private boolean isOwnLangIdentifierPart(char current) { private boolean isOwnLangIdentifierPart(char current) {
return (Character.isLetterOrDigit(current) || (current == '_') || (current == '$')); return isOwnLangIdentifierStart(current) || isNumber(current);
} }
private void clearBuffer() { private void clearBuffer() {
@ -343,14 +348,18 @@ public final class Lexer {
return new Pos(row, col); return new Pos(row, col);
} }
private char next() { private void skip() {
final char result = peek(0); if (pos >= length) return;
final char result = input.charAt(pos);
if (result == '\n') { if (result == '\n') {
row++; row++;
col = 1; col = 1;
} else col++; } else col++;
pos++; pos++;
}
private char next() {
skip();
return peek(0); return peek(0);
} }

View File

@ -31,6 +31,26 @@ class LexerPositionsTest {
text = "line1 text = "line1
line2 line2
line3" line3"
a = 3
""".stripIndent();
List<Token> result = Lexer.tokenize(input);
assertThat(result)
.hasSize(6)
.extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
.containsExactly(
tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT),
tuple(4, 1, WORD), tuple(4, 3, EQ), tuple(4, 5, NUMBER)
);
}
@Test
void testMultilineComment() {
String input = """
/*
line2
line*/a =/*
*/3
""".stripIndent(); """.stripIndent();
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
@ -38,7 +58,7 @@ class LexerPositionsTest {
.hasSize(3) .hasSize(3)
.extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type) .extracting(s -> s.pos().row(), s -> s.pos().col(), Token::type)
.containsExactly( .containsExactly(
tuple(1, 1, WORD), tuple(1, 6, EQ), tuple(1, 8, TEXT) tuple(3, 9, WORD), tuple(3, 11, EQ), tuple(4, 3, NUMBER)
); );
} }
} }

View File

@ -2,12 +2,16 @@ package com.annimon.ownlang.parser;
import com.annimon.ownlang.exceptions.LexerException; import com.annimon.ownlang.exceptions.LexerException;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import java.util.ArrayList; import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.stream.Stream;
import static com.annimon.ownlang.parser.TokenType.*; import static com.annimon.ownlang.parser.TokenType.*;
import static org.junit.jupiter.api.Assertions.*; import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.junit.jupiter.api.Assertions.assertEquals;
/** /**
* *
@ -15,168 +19,89 @@ import static org.junit.jupiter.api.Assertions.*;
*/ */
public class LexerTest { public class LexerTest {
public static Stream<Arguments> validData() {
return LexerValidDataProvider.getAll();
}
public static Stream<Arguments> invalidData() {
return Stream.<Arguments>builder()
.add(Arguments.of("Wrong float point", "3.14.15"))
.add(Arguments.of("Wrong HEX number", "0Xf7_p6_s5"))
.add(Arguments.of("HEX number ends with _", "0Xf7_"))
.add(Arguments.of("Empty rest of HEX number", "#"))
.add(Arguments.of("Unicode character identifier", "€ = 1"))
.add(Arguments.of("Unicode character only", ""))
.add(Arguments.of("String error", "\"1\"\""))
.add(Arguments.of("Multiline comment EOF", "/* 1234 \n"))
.add(Arguments.of("Extended word EOF", "` 1234"))
.build();
}
@Test @Test
public void testNumbers() { public void testNumbers() {
String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF #"; String input = "0 3.1415 0xCAFEBABE 0Xf7_d6_c5 #FFFF";
List<Token> expList = list(NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER);
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result); assertTokens(result, NUMBER, NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER);
assertEquals("0", result.get(0).text()); assertThat(result)
assertEquals("3.1415", result.get(1).text()); .extracting(Token::text)
assertEquals("CAFEBABE", result.get(2).text()); .containsExactly("0", "3.1415", "CAFEBABE", "f7d6c5", "FFFF");
assertEquals("f7d6c5", result.get(3).text());
}
@Test
public void testNumbersError() {
final String input = "3.14.15 0Xf7_p6_s5";
assertThrows(LexerException.class, () -> Lexer.tokenize(input));
}
@Test
public void testArithmetic() {
String input = "x = -1 + 2 * 3 % 4 / 5";
List<Token> expList = list(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
assertEquals("x", result.get(0).text());
}
@Test
public void testKeywords() {
String input = "if else while for include";
List<Token> expList = list(IF, ELSE, WHILE, FOR, INCLUDE);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testWord() {
String input = "if bool include \"text\n\ntext\"";
List<Token> expList = list(IF, WORD, INCLUDE, TEXT);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
} }
@Test @Test
public void testString() { public void testString() {
String input = "\"1\\\"2\""; String input = "\"1\\\"2\"";
List<Token> expList = list(TEXT);
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result); assertTokens(result, TEXT);
assertEquals("1\"2", result.get(0).text()); assertEquals("1\"2", result.get(0).text());
} }
@Test
public void testEscapeString() {
String input = """
"\\\\/\\\\"
""".stripIndent();
List<Token> result = Lexer.tokenize(input);
assertTokens(result, TEXT);
assertEquals("\\/\\", result.get(0).text());
}
@Test @Test
public void testEmptyString() { public void testEmptyString() {
String input = "\"\""; String input = "\"\"";
List<Token> expList = list(TEXT);
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result); assertTokens(result, TEXT);
assertEquals("", result.get(0).text()); assertEquals("", result.get(0).text());
} }
@Test
public void testStringError() {
String input = "\"1\"\"";
List<Token> expList = list(TEXT);
assertThrows(LexerException.class, () -> {
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
assertEquals("1", result.get(0).text());
});
}
@Test
public void testOperators() {
String input = "=+-*/%<>!&|";
List<Token> expList = list(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testOperators2Char() {
String input = "== != <= >= && || ==+ >=- ->";
List<Token> expList = list(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR,
EQEQ, PLUS, GTEQ, MINUS, MINUS, GT);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test @Test
public void testComments() { public void testComments() {
String input = "// 1234 \n /* */ 123 /* \n 12345 \n\n\n */"; String input = "// 1234 \n /* */ 123 /* \n 12345 \n\n\n */";
List<Token> expList = list(NUMBER);
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result); assertTokens(result, NUMBER);
assertEquals("123", result.get(0).text()); assertEquals("123", result.get(0).text());
} }
@Test @ParameterizedTest
public void testComments2() { @MethodSource("validData")
String input = "// /* 1234 \n */"; public void testValidInput(String name, String input, List<TokenType> tokenTypes) throws IOException {
List<Token> expList = list(STAR, SLASH);
List<Token> result = Lexer.tokenize(input); List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result); assertThat(result)
.hasSize(tokenTypes.size())
.extracting(Token::type)
.containsAll(tokenTypes);
} }
@Test @ParameterizedTest
public void testCommentsError() { @MethodSource("invalidData")
final String input = "/* 1234 \n"; public void testInvalidInput(String name, String input) throws IOException {
assertThrows(LexerException.class, () -> Lexer.tokenize(input)); assertThatThrownBy(() -> Lexer.tokenize(input))
.isInstanceOf(LexerException.class);
} }
@Test private static void assertTokens(List<Token> result, TokenType... tokenTypes) {
public void testExtendedWordError() { assertThat(result)
final String input = "` 1234"; .hasSize(tokenTypes.length)
assertThrows(LexerException.class, () -> Lexer.tokenize(input)); .extracting(Token::type)
} .containsExactly(tokenTypes);
@Test
public void testUnicodeCharacterIdentifier() {
String input = "€ = 1";
List<Token> expList = list(EQ, NUMBER);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testUnicodeCharacterExtendedWordIdentifier() {
String input = "`€` = 1";
List<Token> expList = list(WORD, EQ, NUMBER);
List<Token> result = Lexer.tokenize(input);
assertTokens(expList, result);
}
@Test
public void testUnicodeCharacterEOF() {
String input = "";
assertTrue(Lexer.tokenize(input).isEmpty());
}
private static void assertTokens(List<Token> expList, List<Token> result) {
final int length = expList.size();
assertEquals(length, result.size());
for (int i = 0; i < length; i++) {
assertEquals(expList.get(i).type(), result.get(i).type());
} }
} }
private static List<Token> list(TokenType... types) {
final List<Token> list = new ArrayList<>();
for (TokenType t : types) {
list.add(token(t));
}
return list;
}
private static Token token(TokenType type) {
return token(type, "", new Pos(0, 0));
}
private static Token token(TokenType type, String text, Pos pos) {
return new Token(type, text, pos);
}
}

View File

@ -0,0 +1,91 @@
package com.annimon.ownlang.parser;
import org.junit.jupiter.params.provider.Arguments;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
import static com.annimon.ownlang.parser.TokenType.*;
/**
 * Static catalogue of inputs the lexer is expected to tokenize without errors.
 *
 * <p>Every case is an {@link Arguments} triple: a display name, the source text
 * to tokenize, and the list of token types expected from that text.
 */
public final class LexerValidDataProvider {

    // Static holder only: there is no state, so instantiation is disallowed.
    private LexerValidDataProvider() {
    }

    /**
     * Returns all cases of every group, concatenated in the order the groups
     * are listed below.
     *
     * @return a fresh stream of (name, input, expected token types) triples
     */
    public static Stream<Arguments> getAll() {
        return Stream.of(
                numbers(),
                keywords(),
                words(),
                operators(),
                comments(),
                other(),
                notSupported()
        ).flatMap(List::stream);
    }

    /** Decimal literals (integer and with a fraction) and HEX literals with both prefixes. */
    private static List<Arguments> numbers() {
        return List.of(
                Arguments.of("Numbers",
                        "12 7.8 90000000 10.03",
                        List.of(NUMBER, NUMBER, NUMBER, NUMBER)),
                Arguments.of("Hex numbers",
                        "#FF 0xCA 0x12fb 0xFF",
                        List.of(HEX_NUMBER, HEX_NUMBER, HEX_NUMBER, HEX_NUMBER))
        );
    }

    /** Reserved words, each expected to produce its own token type. */
    private static List<Arguments> keywords() {
        return List.of(
                Arguments.of("Keywords",
                        "if else while for include",
                        List.of(IF, ELSE, WHILE, FOR, INCLUDE))
        );
    }

    /** Plain identifiers mixed with keywords and text, plus a backtick-quoted identifier. */
    private static List<Arguments> words() {
        return List.of(
                Arguments.of("Word",
                        "if bool include \"text\n\ntext\"",
                        List.of(IF, WORD, INCLUDE, TEXT)),
                Arguments.of("Extended word identifier",
                        "`€` = 1",
                        List.of(WORD, EQ, NUMBER))
        );
    }

    /** Single-character operators and two-character operators, including adjacent runs. */
    private static List<Arguments> operators() {
        return List.of(
                Arguments.of("Operators",
                        "=+-*/%<>!&|",
                        List.of(EQ, PLUS, MINUS, STAR, SLASH, PERCENT, LT, GT, EXCL, AMP, BAR)),
                Arguments.of("Operators 2 characters",
                        "== != <= >= && || ==+ >=- ->",
                        List.of(EQEQ, EXCLEQ, LTEQ, GTEQ, AMPAMP, BARBAR,
                                EQEQ, PLUS, GTEQ, MINUS, MINUS, GT))
        );
    }

    /**
     * A block-comment opener hidden inside a line comment: only the star and
     * slash on the following line are expected as tokens.
     */
    private static List<Arguments> comments() {
        return List.of(
                Arguments.of("Comments",
                        "// /* 1234 \n */",
                        List.of(STAR, SLASH))
        );
    }

    /** Cases that combine several token kinds in one expression. */
    private static List<Arguments> other() {
        return List.of(
                Arguments.of("Arithmetic",
                        "x = -1 + 2 * 3 % 4 / 5",
                        List.of(WORD, EQ, MINUS, NUMBER, PLUS, NUMBER, STAR, NUMBER, PERCENT, NUMBER, SLASH, NUMBER))
        );
    }

    /**
     * Number notations that are not recognised as a single literal: the
     * expected result is a number token followed by a word token rather than
     * an error.
     */
    private static List<Arguments> notSupported() {
        return List.of(
                Arguments.of("Float notation",
                        "7e8",
                        List.of(NUMBER, WORD)),
                Arguments.of("Float hex numbers",
                        "0Xf7p6",
                        List.of(HEX_NUMBER, WORD))
        );
    }
}