From 37cf836c4009f19891657529ad11b129d72bfcf8 Mon Sep 17 00:00:00 2001
From: Victor
Date: Mon, 21 Aug 2017 16:39:31 +0300
Subject: [PATCH] Add lexer

---
Note: the "index" blob ids below are those of the original commit; the
sources in this copy were re-flowed and amended afterwards.

 .../main/java/com/annimon/hotarufx/Main.java  |   4 +
 .../hotarufx/exceptions/LexerException.java   |  15 ++
 .../annimon/hotarufx/lexer/HotaruLexer.java   | 218 +++++++++++++++++++
 .../annimon/hotarufx/lexer/HotaruTokenId.java |  44 ++++
 .../com/annimon/hotarufx/lexer/Lexer.java     | 126 +++++++++++
 .../hotarufx/lexer/SourcePosition.java        |  17 ++
 .../com/annimon/hotarufx/lexer/Token.java     |  18 ++
 .../hotarufx/lexer/HotaruLexerTest.java       | 168 ++++++++++++++
 8 files changed, 610 insertions(+)
 create mode 100644 app/src/main/java/com/annimon/hotarufx/Main.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/exceptions/LexerException.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/lexer/HotaruLexer.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/lexer/HotaruTokenId.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/lexer/Lexer.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/lexer/SourcePosition.java
 create mode 100644 app/src/main/java/com/annimon/hotarufx/lexer/Token.java
 create mode 100644 app/src/test/java/com/annimon/hotarufx/lexer/HotaruLexerTest.java

diff --git a/app/src/main/java/com/annimon/hotarufx/Main.java b/app/src/main/java/com/annimon/hotarufx/Main.java
new file mode 100644
index 0000000..4de9237
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/Main.java
@@ -0,0 +1,4 @@
+package com.annimon.hotarufx;
+
+public class Main {
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/exceptions/LexerException.java b/app/src/main/java/com/annimon/hotarufx/exceptions/LexerException.java
new file mode 100644
index 0000000..ba6a701
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/exceptions/LexerException.java
@@ -0,0 +1,15 @@
+package com.annimon.hotarufx.exceptions;
+
+import com.annimon.hotarufx.lexer.SourcePosition;
+
+/** Signals that the lexer could not tokenize its input. */
+public class LexerException extends RuntimeException {
+
+    public LexerException(String message) {
+        super(message);
+    }
+
+    public LexerException(SourcePosition position, String message) {
+        super(position.toString() + " " + message);
+    }
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/lexer/HotaruLexer.java b/app/src/main/java/com/annimon/hotarufx/lexer/HotaruLexer.java
new file mode 100644
index 0000000..8de3259
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/lexer/HotaruLexer.java
@@ -0,0 +1,218 @@
+package com.annimon.hotarufx.lexer;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import lombok.val;
+
+/**
+ * Lexer for the HotaruFX script language.
+ *
+ * <p>Numbers, words, texts and operators are registered in the token list
+ * returned by {@link #getTokens()}; whitespaces and comments are only
+ * returned from {@link #nextToken()}.
+ */
+public class HotaruLexer extends Lexer {
+
+    /**
+     * Tokenizes the input and returns only the registered tokens,
+     * i.e. without whitespaces and comments.
+     */
+    public static List<Token> tokenize(String input) {
+        val lexer = new HotaruLexer(input);
+        lexer.tokenize();
+        return lexer.getTokens();
+    }
+
+    private static final String TEXT_CHARS = "'\"";
+    private static final String OPERATOR_CHARS = "(){}=.,";
+
+    private static final Map<String, HotaruTokenId> OPERATORS;
+    static {
+        OPERATORS = new HashMap<>();
+        OPERATORS.put("(", HotaruTokenId.LPAREN);
+        OPERATORS.put(")", HotaruTokenId.RPAREN);
+        OPERATORS.put("{", HotaruTokenId.LBRACE);
+        OPERATORS.put("}", HotaruTokenId.RBRACE);
+        OPERATORS.put("=", HotaruTokenId.EQ);
+        OPERATORS.put(".", HotaruTokenId.DOT);
+        OPERATORS.put(",", HotaruTokenId.COMMA);
+    }
+
+    public HotaruLexer(String input) {
+        super(input);
+    }
+
+    @Override
+    public Token nextToken() {
+        val current = peek(0);
+        if (Character.isDigit(current)) return tokenizeNumber();
+        else if (Character.isJavaIdentifierStart(current)) return tokenizeWord();
+        else if (current == '#') return tokenizeComment();
+        else if (current == '/' && peek(1) == '*') {
+            return tokenizeMultilineComment();
+        }
+        else if (TEXT_CHARS.indexOf(current) != -1) {
+            return tokenizeText(current);
+        }
+        else if (OPERATOR_CHARS.indexOf(current) != -1) {
+            return tokenizeOperator();
+        }
+        else if (Character.isWhitespace(current)) {
+            return tokenizeWhitespaces();
+        }
+        else {
+            // other
+            next();
+        }
+        return createToken(HotaruTokenId.WS, "", 1);
+    }
+
+    private Token tokenizeNumber() {
+        clearBuffer();
+        char current = peek(0);
+        while (true) {
+            if (current == '.') {
+                if (getBuffer().indexOf(".") != -1)
+                    throw error("Invalid float number");
+            } else if (!Character.isDigit(current)) {
+                break;
+            }
+            getBuffer().append(current);
+            current = next();
+        }
+        return addToken(HotaruTokenId.NUMBER);
+    }
+
+    private Token tokenizeWord() {
+        clearBuffer();
+        getBuffer().append(peek(0));
+        char current = next();
+        while (!isEOF()) {
+            if (!Character.isJavaIdentifierPart(current)) {
+                break;
+            }
+            getBuffer().append(current);
+            current = next();
+        }
+
+        val word = getBuffer().toString();
+        return addToken(HotaruTokenId.WORD, word);
+    }
+
+    private Token tokenizeText(char openChar) {
+        // Remember where the literal starts to compute its real source length
+        final int startPos = getPos();
+        next();// "
+        clearBuffer();
+        char current = peek(0);
+        while (true) {
+            if (current == '\\') {
+                val buffer = getBuffer();
+                current = next();
+                if (current == openChar) {
+                    current = next();
+                    buffer.append(openChar);
+                    continue;
+                }
+                switch (current) {
+                    case '0': current = next(); buffer.append('\0'); continue;
+                    case 'b': current = next(); buffer.append('\b'); continue;
+                    case 'f': current = next(); buffer.append('\f'); continue;
+                    case 'n': current = next(); buffer.append('\n'); continue;
+                    case 'r': current = next(); buffer.append('\r'); continue;
+                    case 't': current = next(); buffer.append('\t'); continue;
+                    case 'u': // http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3
+                        // Look ahead with peek() so that nothing has to be
+                        // rolled back when the escape turns out to be invalid
+                        int hexStart = 0;
+                        while (peek(hexStart) == 'u') hexStart++;
+                        int escapedValue = 0;
+                        for (int i = 0; i < 4 && escapedValue != -1; i++) {
+                            final char digit = peek(hexStart + i);
+                            if (isHexNumber(digit)) {
+                                escapedValue = (escapedValue << 4) | Character.digit(digit, 16);
+                            } else {
+                                escapedValue = -1;
+                            }
+                        }
+                        if (escapedValue >= 0) {
+                            buffer.append((char) escapedValue);
+                            // skip the run of 'u' and the four hex digits
+                            for (int i = 0; i < hexStart + 4; i++) current = next();
+                        } else {
+                            // not a valid escape: keep the backslash and 'u' literally
+                            buffer.append("\\u");
+                            current = next();
+                        }
+                        continue;
+                }
+                buffer.append('\\');
+                continue;
+            }
+            if (current == openChar) break;
+            if (current == '\0') {
+                throw error("Reached end of file while parsing text");
+            }
+            getBuffer().append(current);
+            current = next();
+        }
+        next(); // "
+        return addToken(HotaruTokenId.TEXT, getBuffer().toString(), getPos() - startPos);
+    }
+
+    private Token tokenizeOperator() {
+        char current = peek(0);
+        clearBuffer();
+        while (true) {
+            val text = getBuffer().toString();
+            if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) {
+                return addToken(OPERATORS.get(text), "", text.length());
+            }
+            getBuffer().append(current);
+            current = next();
+        }
+    }
+
+    private Token tokenizeComment() {
+        next(); // #
+        clearBuffer();
+        getBuffer().append("#");
+        char current = peek(0);
+        while ("\r\n\0".indexOf(current) == -1) {
+            getBuffer().append(current);
+            current = next();
+        }
+        return createToken(HotaruTokenId.SINGLE_LINE_COMMENT);
+    }
+
+    private Token tokenizeMultilineComment() {
+        next(); // /
+        next(); // *
+        clearBuffer();
+        getBuffer().append("/*");
+        char current = peek(0);
+        while (true) {
+            if (current == '*' && peek(1) == '/') break;
+            if (current == '\0') {
+                throw error("Reached end of file while parsing multiline comment");
+            }
+            getBuffer().append(current);
+            current = next();
+        }
+        next(); // *
+        next(); // /
+        getBuffer().append("*/");
+        return createToken(HotaruTokenId.MULTI_LINE_COMMENT);
+    }
+
+    private Token tokenizeWhitespaces() {
+        clearBuffer();
+        char current = peek(0);
+        while (Character.isWhitespace(current)) {
+            getBuffer().append(current);
+            current = next();
+        }
+        return createToken(HotaruTokenId.WS);
+    }
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/lexer/HotaruTokenId.java b/app/src/main/java/com/annimon/hotarufx/lexer/HotaruTokenId.java
new file mode 100644
index 0000000..1382bc9
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/lexer/HotaruTokenId.java
@@ -0,0 +1,44 @@
+package com.annimon.hotarufx.lexer;
+
+import java.util.Locale;
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+
+/**
+ * Token types produced by {@link HotaruLexer}, grouped by category.
+ */
+@AllArgsConstructor(access = AccessLevel.PACKAGE)
+public enum HotaruTokenId {
+
+    NUMBER(Category.NUMBER),
+    WORD(Category.IDENTIFIER),
+    TEXT(Category.STRING),
+
+    EQ(Category.OPERATOR),
+    LPAREN(Category.OPERATOR),
+    RPAREN(Category.OPERATOR),
+    LBRACE(Category.OPERATOR),
+    RBRACE(Category.OPERATOR),
+    COMMA(Category.OPERATOR),
+    DOT(Category.OPERATOR),
+
+    SINGLE_LINE_COMMENT(Category.COMMENT),
+    MULTI_LINE_COMMENT(Category.COMMENT),
+
+    WS(Category.WHITESPACE),
+    EOF(Category.WHITESPACE);
+
+    private enum Category {
+        NUMBER, IDENTIFIER, STRING, OPERATOR, COMMENT, WHITESPACE
+    }
+
+    private final Category category;
+
+    /**
+     * Returns the lower-cased category name, e.g. {@code "operator"}.
+     * {@link Locale#ROOT} keeps the result stable in every default locale.
+     */
+    public String getPrimaryCategory() {
+        return category.name().toLowerCase(Locale.ROOT);
+    }
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/lexer/Lexer.java b/app/src/main/java/com/annimon/hotarufx/lexer/Lexer.java
new file mode 100644
index 0000000..c52b819
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/lexer/Lexer.java
@@ -0,0 +1,126 @@
+package com.annimon.hotarufx.lexer;
+
+import com.annimon.hotarufx.exceptions.LexerException;
+import java.util.ArrayList;
+import java.util.List;
+import lombok.val;
+
+/**
+ * Base class for hand-written lexers: tracks the read position in the
+ * input and collects the tokens registered with {@code addToken}.
+ */
+public abstract class Lexer {
+
+    private final String input;
+    private final int length;
+
+    private final List<Token> tokens;
+    private final StringBuilder buffer;
+
+    private int pos;
+    private int row, col;
+
+    public Lexer(String input) {
+        this.input = input;
+        length = input.length();
+        tokens = new ArrayList<>();
+        buffer = new StringBuilder();
+        pos = 0;
+        row = col = 1;
+    }
+
+    /** Returns the tokens registered so far with {@code addToken}. */
+    public List<Token> getTokens() {
+        return tokens;
+    }
+
+    protected StringBuilder getBuffer() {
+        return buffer;
+    }
+
+    protected int getPos() {
+        return pos;
+    }
+
+    protected void setPos(int pos) {
+        this.pos = pos;
+    }
+
+    public boolean isEOF() {
+        return pos >= length;
+    }
+
+    /**
+     * Reads the input to its end and returns every token produced by
+     * {@link #nextToken()}, including those not registered in the token list.
+     */
+    public List<Token> tokenize() {
+        final List<Token> allTokens = new ArrayList<>();
+        while (!isEOF()) {
+            allTokens.add(nextToken());
+        }
+        return allTokens;
+    }
+
+    public abstract Token nextToken();
+
+    protected void clearBuffer() {
+        buffer.setLength(0);
+    }
+
+    protected char next() {
+        pos++;
+        final char result = peek(0);
+        if (result == '\n') {
+            row++;
+            col = 1;
+        } else col++;
+        return result;
+    }
+
+    /**
+     * Returns the character at the given offset from the current position,
+     * or {@code '\0'} when that position lies outside of the input.
+     */
+    protected char peek(int relativePosition) {
+        final int position = pos + relativePosition;
+        if (position < 0 || position >= length) return '\0';
+        return input.charAt(position);
+    }
+
+    protected SourcePosition currentPosition() {
+        return new SourcePosition(pos, row, col);
+    }
+
+    protected Token addToken(HotaruTokenId tokenId) {
+        return addToken(tokenId, buffer.toString());
+    }
+
+    protected Token addToken(HotaruTokenId tokenId, String text) {
+        return addToken(tokenId, text, text.length());
+    }
+
+    protected Token addToken(HotaruTokenId tokenId, String text, int length) {
+        val token = createToken(tokenId, text, length);
+        tokens.add(token);
+        return token;
+    }
+
+    protected Token createToken(HotaruTokenId tokenId) {
+        return createToken(tokenId, buffer.toString(), buffer.length());
+    }
+
+    protected Token createToken(HotaruTokenId tokenId, String text, int length) {
+        return new Token(tokenId, text, length, currentPosition());
+    }
+
+    protected LexerException error(String message) {
+        return new LexerException(currentPosition(), message);
+    }
+
+    protected boolean isHexNumber(char current) {
+        return Character.isDigit(current)
+                || ('a' <= current && current <= 'f')
+                || ('A' <= current && current <= 'F');
+    }
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/lexer/SourcePosition.java b/app/src/main/java/com/annimon/hotarufx/lexer/SourcePosition.java
new file mode 100644
index 0000000..3d7372f
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/lexer/SourcePosition.java
@@ -0,0 +1,17 @@
+package com.annimon.hotarufx.lexer;
+
+import lombok.Data;
+
+/** Location in the lexer input: absolute offset, row and column. */
+@Data
+public class SourcePosition {
+
+    private final int position;
+    private final int row;
+    private final int column;
+
+    @Override
+    public String toString() {
+        return "[" + row + ", " + column + "]";
+    }
+}
diff --git a/app/src/main/java/com/annimon/hotarufx/lexer/Token.java b/app/src/main/java/com/annimon/hotarufx/lexer/Token.java
new file mode 100644
index 0000000..1343cbb
--- /dev/null
+++ b/app/src/main/java/com/annimon/hotarufx/lexer/Token.java
@@ -0,0 +1,18 @@
+package com.annimon.hotarufx.lexer;
+
+import lombok.Data;
+
+/** Token produced by the lexer: its type, text, source length and position. */
+@Data
+public class Token {
+
+    private final HotaruTokenId type;
+    private final String text;
+    private final int length;
+    private final SourcePosition position;
+
+    @Override
+    public String toString() {
+        return type.name() + " " + position + " " + text;
+    }
+}
diff --git a/app/src/test/java/com/annimon/hotarufx/lexer/HotaruLexerTest.java b/app/src/test/java/com/annimon/hotarufx/lexer/HotaruLexerTest.java
new file mode 100644
index 0000000..2f33ec3
--- /dev/null
+++ b/app/src/test/java/com/annimon/hotarufx/lexer/HotaruLexerTest.java
@@ -0,0 +1,168 @@
+package com.annimon.hotarufx.lexer;
+
+import com.annimon.hotarufx.exceptions.LexerException;
+import java.util.List;
+import org.hamcrest.FeatureMatcher;
+import org.hamcrest.Matcher;
+import org.junit.jupiter.api.Test;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.is;
+import static org.junit.jupiter.api.Assertions.*;
+
+class HotaruLexerTest {
+
+    static List<Token> t(String input) {
+        return HotaruLexer.tokenize(input);
+    }
+
+    static List<Token> all(String input) {
+        return new HotaruLexer(input).tokenize();
+    }
+
+    static Token single(String input) {
+        List<Token> tokens = t(input);
+        if (tokens.isEmpty()) {
+            throw new AssertionError("Tokens list is empty");
+        }
+        return tokens.get(0);
+    }
+
+    @Test
+    void testTokenizeNumbers() {
+        assertThat(all("1 1.5 2"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.NUMBER)
+        ));
+
+        assertThrows(LexerException.class, () -> {
+            all("1.2.3");
+        });
+    }
+
+    @Test
+    void testTokenizeWords() {
+        assertThat(all("a b c"), contains(
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.WORD)
+        ));
+    }
+
+    @Test
+    void testTokenizeText() {
+        assertThat(t("1 \" 1\n2 3 '\""), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.TEXT)
+        ));
+        assertThat(t("1 ' 1\n2 3 ' 2"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.TEXT),
+                tokenId(HotaruTokenId.NUMBER)
+        ));
+        assertThrows(LexerException.class, () -> {
+            all("' ... ");
+        });
+        assertThat(single("'\\\''").getText(), is("'"));
+        assertThat(single("\'\\\"\'").getText(), is("\\\""));
+        assertThat(single("\"\\\"\"").getText(), is("\""));
+        assertThat(single("\"\\\'\"").getText(), is("\\\'"));
+    }
+
+    @Test
+    void testTokenizeTextUnicodeEscapes() {
+        assertThat(single("'\\u0041'").getText(), is("A"));
+        assertThat(single("'\\uu0041'").getText(), is("A"));
+        assertThat(single("'\\u00ZZ'").getText(), is("\\u00ZZ"));
+        assertThat(single("'\\u0041'").getLength(), is(8));
+    }
+
+    @Test
+    void testTokenizeOperators() {
+        assertThat(t("(){}=,."), contains(
+                tokenId(HotaruTokenId.LPAREN),
+                tokenId(HotaruTokenId.RPAREN),
+                tokenId(HotaruTokenId.LBRACE),
+                tokenId(HotaruTokenId.RBRACE),
+                tokenId(HotaruTokenId.EQ),
+                tokenId(HotaruTokenId.COMMA),
+                tokenId(HotaruTokenId.DOT)
+        ));
+    }
+
+    @Test
+    void testTokenizeComments() {
+        assertThat(all("1 # 2 3 4"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.SINGLE_LINE_COMMENT)
+        ));
+        assertThat(t("1 # 2 3 4\n 2"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.NUMBER)
+        ));
+    }
+
+    @Test
+    void testTokenizeMultilineComments() {
+        assertThat(all("1 /* 2\n3\n4 */"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.WS),
+                tokenId(HotaruTokenId.MULTI_LINE_COMMENT)
+        ));
+        assertThat(t("1 /* 2 3 4 */ 2"), contains(
+                tokenId(HotaruTokenId.NUMBER),
+                tokenId(HotaruTokenId.NUMBER)
+        ));
+        assertThrows(LexerException.class, () -> {
+            all("/* ... ");
+        });
+    }
+
+    @Test
+    void testStatements() {
+        assertThat(t("A = node()"), contains(
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.EQ),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.LPAREN),
+                tokenId(HotaruTokenId.RPAREN)
+        ));
+        assertThat(t("B.x = 100"), contains(
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.DOT),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.EQ),
+                tokenId(HotaruTokenId.NUMBER)
+        ));
+        assertThat(t("G1 = group(A, B)"), contains(
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.EQ),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.LPAREN),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.COMMA),
+                tokenId(HotaruTokenId.WORD),
+                tokenId(HotaruTokenId.RPAREN)
+        ));
+    }
+
+    Matcher<Token> tokenId(HotaruTokenId tokenId) {
+        return tokenId(is(tokenId));
+    }
+
+    Matcher<Token> tokenId(Matcher<HotaruTokenId> matcher) {
+        return new FeatureMatcher<Token, HotaruTokenId>(matcher, "tokenId", "tokenId") {
+
+            @Override
+            protected HotaruTokenId featureValueOf(Token actual) {
+                return actual.getType();
+            }
+        };
+    }
+}
\ No newline at end of file