mirror of https://github.com/aNNiMON/HotaruFX.git

Add lexer

Victor 2017-08-21 16:39:31 +03:00
parent dc0dbe0db7
commit 37cf836c40
8 changed files with 560 additions and 0 deletions

View File: com/annimon/hotarufx/Main.java

@@ -0,0 +1,4 @@
package com.annimon.hotarufx;

public class Main {
}

View File: com/annimon/hotarufx/exceptions/LexerException.java

@@ -0,0 +1,14 @@
package com.annimon.hotarufx.exceptions;

import com.annimon.hotarufx.lexer.SourcePosition;

public class LexerException extends RuntimeException {

    public LexerException(String message) {
        super(message);
    }

    public LexerException(SourcePosition position, String message) {
        super(position.toString() + " " + message);
    }
}

View File: com/annimon/hotarufx/lexer/HotaruLexer.java

@@ -0,0 +1,200 @@
package com.annimon.hotarufx.lexer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.val;

public class HotaruLexer extends Lexer {

    public static List<Token> tokenize(String input) {
        val lexer = new HotaruLexer(input);
        lexer.tokenize();
        return lexer.getTokens();
    }

    private static final String TEXT_CHARS = "'\"";
    private static final String OPERATOR_CHARS = "(){}=.,";

    private static final Map<String, HotaruTokenId> OPERATORS;
    static {
        OPERATORS = new HashMap<>();
        OPERATORS.put("(", HotaruTokenId.LPAREN);
        OPERATORS.put(")", HotaruTokenId.RPAREN);
        OPERATORS.put("{", HotaruTokenId.LBRACE);
        OPERATORS.put("}", HotaruTokenId.RBRACE);
        OPERATORS.put("=", HotaruTokenId.EQ);
        OPERATORS.put(".", HotaruTokenId.DOT);
        OPERATORS.put(",", HotaruTokenId.COMMA);
    }

    public HotaruLexer(String input) {
        super(input);
    }

    @Override
    public Token nextToken() {
        val current = peek(0);
        if (Character.isDigit(current)) return tokenizeNumber();
        else if (Character.isJavaIdentifierStart(current)) return tokenizeWord();
        else if (current == '#') return tokenizeComment();
        else if (current == '/' && peek(1) == '*') {
            return tokenizeMultilineComment();
        } else if (TEXT_CHARS.indexOf(current) != -1) {
            return tokenizeText(current);
        } else if (OPERATOR_CHARS.indexOf(current) != -1) {
            return tokenizeOperator();
        } else if (Character.isWhitespace(current)) {
            return tokenizeWhitespaces();
        } else {
            // unknown character: skip it
            next();
        }
        return createToken(HotaruTokenId.WS, "", 1);
    }

    private Token tokenizeNumber() {
        clearBuffer();
        char current = peek(0);
        while (true) {
            if (current == '.') {
                // a second dot means a malformed float, e.g. "1.2.3"
                if (getBuffer().indexOf(".") != -1)
                    throw error("Invalid float number");
            } else if (!Character.isDigit(current)) {
                break;
            }
            getBuffer().append(current);
            current = next();
        }
        return addToken(HotaruTokenId.NUMBER);
    }

    private Token tokenizeWord() {
        clearBuffer();
        getBuffer().append(peek(0));
        char current = next();
        while (!isEOF()) {
            if (!Character.isJavaIdentifierPart(current)) {
                break;
            }
            getBuffer().append(current);
            current = next();
        }
        val word = getBuffer().toString();
        return addToken(HotaruTokenId.WORD, word);
    }

    private Token tokenizeText(char openChar) {
        next(); // skip opening quote (' or ")
        clearBuffer();
        char current = peek(0);
        while (true) {
            if (current == '\\') {
                val buffer = getBuffer();
                current = next();
                if (current == openChar) {
                    current = next();
                    buffer.append(openChar);
                    continue;
                }
                switch (current) {
                    case '0': current = next(); buffer.append('\0'); continue;
                    case 'b': current = next(); buffer.append('\b'); continue;
                    case 'f': current = next(); buffer.append('\f'); continue;
                    case 'n': current = next(); buffer.append('\n'); continue;
                    case 'r': current = next(); buffer.append('\r'); continue;
                    case 't': current = next(); buffer.append('\t'); continue;
                    case 'u': // unicode escape, see http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3
                        int rollbackPosition = getPos();
                        while (current == 'u') current = next();
                        int escapedValue = 0;
                        // read exactly four hex digits into escapedValue
                        for (int i = 12; i >= 0 && escapedValue != -1; i -= 4) {
                            if (isHexNumber(current)) {
                                escapedValue |= (Character.digit(current, 16) << i);
                            } else {
                                escapedValue = -1;
                            }
                            current = next();
                        }
                        if (escapedValue >= 0) {
                            buffer.append((char) escapedValue);
                        } else {
                            // rollback: keep the sequence as plain text
                            buffer.append("\\u");
                            setPos(rollbackPosition);
                        }
                        continue;
                }
                // unknown escape: keep the backslash as-is
                buffer.append('\\');
                continue;
            }
            if (current == openChar) break;
            if (current == '\0') {
                throw error("Reached end of file while parsing text");
            }
            getBuffer().append(current);
            current = next();
        }
        next(); // skip closing quote
        return addToken(HotaruTokenId.TEXT, getBuffer().toString(), getBuffer().length() + 2);
    }

    private Token tokenizeOperator() {
        char current = peek(0);
        clearBuffer();
        while (true) {
            val text = getBuffer().toString();
            if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) {
                return addToken(OPERATORS.get(text), "", text.length());
            }
            getBuffer().append(current);
            current = next();
        }
    }

    private Token tokenizeComment() {
        next(); // skip #
        clearBuffer();
        getBuffer().append("#");
        char current = peek(0);
        while ("\r\n\0".indexOf(current) == -1) {
            getBuffer().append(current);
            current = next();
        }
        return createToken(HotaruTokenId.SINGLE_LINE_COMMENT);
    }

    private Token tokenizeMultilineComment() {
        next(); // skip /
        next(); // skip *
        clearBuffer();
        getBuffer().append("/*");
        char current = peek(0);
        while (true) {
            if (current == '*' && peek(1) == '/') break;
            if (current == '\0') {
                throw error("Reached end of file while parsing multiline comment");
            }
            getBuffer().append(current);
            current = next();
        }
        next(); // skip *
        next(); // skip /
        getBuffer().append("*/");
        return createToken(HotaruTokenId.MULTI_LINE_COMMENT);
    }

    private Token tokenizeWhitespaces() {
        clearBuffer();
        char current = peek(0);
        while (Character.isWhitespace(current)) {
            getBuffer().append(current);
            current = next();
        }
        return createToken(HotaruTokenId.WS);
    }
}
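
For orientation, here is a minimal usage sketch of the lexer this commit adds. The LexerDemo class and the sample script line are illustrative only, not part of the change:

import com.annimon.hotarufx.lexer.HotaruLexer;
import com.annimon.hotarufx.lexer.Token;

public class LexerDemo {

    public static void main(String[] args) {
        // HotaruLexer.tokenize() returns only the tokens registered via addToken(),
        // so whitespace and comments (built with createToken()) are filtered out.
        for (Token token : HotaruLexer.tokenize("A = node() # create a node")) {
            // Token.toString() prints "TYPE [row, column] text";
            // operator tokens carry an empty text.
            System.out.println(token);
        }
        // Expected token types, in order: WORD, EQ, WORD, LPAREN, RPAREN
    }
}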

View File: com/annimon/hotarufx/lexer/HotaruTokenId.java

@@ -0,0 +1,36 @@
package com.annimon.hotarufx.lexer;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;

@AllArgsConstructor(access = AccessLevel.PACKAGE)
public enum HotaruTokenId {

    NUMBER(Category.NUMBER),
    WORD(Category.IDENTIFIER),
    TEXT(Category.STRING),

    EQ(Category.OPERATOR),
    LPAREN(Category.OPERATOR),
    RPAREN(Category.OPERATOR),
    LBRACE(Category.OPERATOR),
    RBRACE(Category.OPERATOR),
    COMMA(Category.OPERATOR),
    DOT(Category.OPERATOR),

    SINGLE_LINE_COMMENT(Category.COMMENT),
    MULTI_LINE_COMMENT(Category.COMMENT),

    WS(Category.WHITESPACE),
    EOF(Category.WHITESPACE);

    private enum Category {
        NUMBER, IDENTIFIER, STRING, OPERATOR, COMMENT, WHITESPACE
    }

    private final Category category;

    public String getPrimaryCategory() {
        return category.name().toLowerCase();
    }
}

View File: com/annimon/hotarufx/lexer/Lexer.java

@@ -0,0 +1,113 @@
package com.annimon.hotarufx.lexer;

import com.annimon.hotarufx.exceptions.LexerException;
import java.util.ArrayList;
import java.util.List;
import lombok.val;

public abstract class Lexer {

    private final String input;
    private final int length;

    private final List<Token> tokens;
    private final StringBuilder buffer;

    private int pos;
    private int row, col;

    public Lexer(String input) {
        this.input = input;
        length = input.length();
        tokens = new ArrayList<>();
        buffer = new StringBuilder();
        pos = 0;
        row = col = 1;
    }

    public List<Token> getTokens() {
        return tokens;
    }

    protected StringBuilder getBuffer() {
        return buffer;
    }

    protected int getPos() {
        return pos;
    }

    protected void setPos(int pos) {
        this.pos = pos;
    }

    public boolean isEOF() {
        return pos >= length;
    }

    public List<Token> tokenize() {
        final List<Token> allTokens = new ArrayList<>();
        while (!isEOF()) {
            allTokens.add(nextToken());
        }
        return allTokens;
    }

    public abstract Token nextToken();

    protected void clearBuffer() {
        buffer.setLength(0);
    }

    protected char next() {
        pos++;
        final char result = peek(0);
        if (result == '\n') {
            row++;
            col = 1;
        } else col++;
        return result;
    }

    protected char peek(int relativePosition) {
        final int position = pos + relativePosition;
        if (position >= length) return '\0';
        return input.charAt(position);
    }

    protected SourcePosition currentPosition() {
        return new SourcePosition(pos, row, col);
    }

    protected Token addToken(HotaruTokenId tokenId) {
        return addToken(tokenId, buffer.toString());
    }

    protected Token addToken(HotaruTokenId tokenId, String text) {
        return addToken(tokenId, text, text.length());
    }

    protected Token addToken(HotaruTokenId tokenId, String text, int length) {
        val token = createToken(tokenId, text, length);
        tokens.add(token);
        return token;
    }

    protected Token createToken(HotaruTokenId tokenId) {
        return createToken(tokenId, buffer.toString(), buffer.length());
    }

    protected Token createToken(HotaruTokenId tokenId, String text, int length) {
        return new Token(tokenId, text, length, currentPosition());
    }

    protected LexerException error(String message) {
        return new LexerException(currentPosition(), message);
    }

    protected boolean isHexNumber(char current) {
        return Character.isDigit(current)
                || ('a' <= current && current <= 'f')
                || ('A' <= current && current <= 'F');
    }
}

View File: com/annimon/hotarufx/lexer/SourcePosition.java

@@ -0,0 +1,16 @@
package com.annimon.hotarufx.lexer;

import lombok.Data;

@Data
public class SourcePosition {

    private final int position;
    private final int row;
    private final int column;

    @Override
    public String toString() {
        return "[" + row + ", " + column + "]";
    }
}

View File: com/annimon/hotarufx/lexer/Token.java

@@ -0,0 +1,17 @@
package com.annimon.hotarufx.lexer;

import lombok.Data;

@Data
public class Token {

    private final HotaruTokenId type;
    private final String text;
    private final int length;
    private final SourcePosition position;

    @Override
    public String toString() {
        return type.name() + " " + position + " " + text;
    }
}

View File: com/annimon/hotarufx/lexer/HotaruLexerTest.java

@@ -0,0 +1,160 @@
package com.annimon.hotarufx.lexer;

import com.annimon.hotarufx.exceptions.LexerException;
import java.util.List;
import org.hamcrest.FeatureMatcher;
import org.hamcrest.Matcher;
import org.junit.jupiter.api.Test;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.is;
import static org.junit.jupiter.api.Assertions.*;

class HotaruLexerTest {

    // significant tokens only: whitespace and comments are skipped
    static List<Token> t(String input) {
        return HotaruLexer.tokenize(input);
    }

    // every token, including whitespace and comments
    static List<Token> all(String input) {
        return new HotaruLexer(input).tokenize();
    }

    static Token single(String input) {
        List<Token> tokens = t(input);
        if (tokens.isEmpty()) {
            throw new AssertionError("Tokens list is empty");
        }
        return tokens.get(0);
    }

    @Test
    void testTokenizeNumbers() {
        assertThat(all("1 1.5 2"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.NUMBER)
        ));
        assertThrows(LexerException.class, () -> {
            all("1.2.3");
        });
    }

    @Test
    void testTokenizeWords() {
        assertThat(all("a b c"), contains(
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.WORD)
        ));
    }

    @Test
    void testTokenizeText() {
        assertThat(t("1 \" 1\n2 3 '\""), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.TEXT)
        ));
        assertThat(t("1 ' 1\n2 3 ' 2"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.TEXT),
                tokenId(HotaruTokenId.NUMBER)
        ));
        assertThrows(LexerException.class, () -> {
            all("' ... ");
        });
        assertThat(single("'\\\''").getText(), is("'"));
        assertThat(single("\'\\\"\'").getText(), is("\\\""));
        assertThat(single("\"\\\"\"").getText(), is("\""));
        assertThat(single("\"\\\'\"").getText(), is("\\\'"));
    }

    @Test
    void testTokenizeOperators() {
        assertThat(t("(){}=,."), contains(
                tokenId(HotaruTokenId.LPAREN),
                tokenId(HotaruTokenId.RPAREN),
                tokenId(HotaruTokenId.LBRACE),
                tokenId(HotaruTokenId.RBRACE),
                tokenId(HotaruTokenId.EQ),
                tokenId(HotaruTokenId.COMMA),
                tokenId(HotaruTokenId.DOT)
        ));
    }

    @Test
    void testTokenizeComments() {
        assertThat(all("1 # 2 3 4"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.SINGLE_LINE_COMMENT)
        ));
        assertThat(t("1 # 2 3 4\n 2"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.NUMBER)
        ));
    }

    @Test
    void testTokenizeMultilineComments() {
        assertThat(all("1 /* 2\n3\n4 */"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.WS),
                tokenId(HotaruTokenId.MULTI_LINE_COMMENT)
        ));
        assertThat(t("1 /* 2 3 4 */ 2"), contains(
                tokenId(HotaruTokenId.NUMBER),
                tokenId(HotaruTokenId.NUMBER)
        ));
        assertThrows(LexerException.class, () -> {
            all("/* ... ");
        });
    }

    @Test
    void testStatements() {
        assertThat(t("A = node()"), contains(
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.EQ),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.LPAREN),
                tokenId(HotaruTokenId.RPAREN)
        ));
        assertThat(t("B.x = 100"), contains(
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.DOT),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.EQ),
                tokenId(HotaruTokenId.NUMBER)
        ));
        assertThat(t("G1 = group(A, B)"), contains(
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.EQ),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.LPAREN),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.COMMA),
                tokenId(HotaruTokenId.WORD),
                tokenId(HotaruTokenId.RPAREN)
        ));
    }

    Matcher<Token> tokenId(HotaruTokenId tokenId) {
        return tokenId(is(tokenId));
    }

    Matcher<Token> tokenId(Matcher<HotaruTokenId> matcher) {
        return new FeatureMatcher<Token, HotaruTokenId>(matcher, "tokenId", "tokenId") {
            @Override
            protected HotaruTokenId featureValueOf(Token actual) {
                return actual.getType();
            }
        };
    }
}