Use immutable maps; fast-skip whitespace in the lexer

This commit is contained in:
aNNiMON 2023-09-11 19:57:01 +03:00 committed by Victor Melnik
parent 7baf9f6fc8
commit 15c277d145

View File

@ -1,11 +1,7 @@
package com.annimon.ownlang.parser; package com.annimon.ownlang.parser;
import com.annimon.ownlang.exceptions.LexerException; import com.annimon.ownlang.exceptions.LexerException;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/** /**
* *
@ -21,92 +17,94 @@ public final class Lexer {
private static final Map<String, TokenType> OPERATORS; private static final Map<String, TokenType> OPERATORS;
static { static {
OPERATORS = new HashMap<>(); final var operators = new HashMap<String, TokenType>();
OPERATORS.put("+", TokenType.PLUS); operators.put("+", TokenType.PLUS);
OPERATORS.put("-", TokenType.MINUS); operators.put("-", TokenType.MINUS);
OPERATORS.put("*", TokenType.STAR); operators.put("*", TokenType.STAR);
OPERATORS.put("/", TokenType.SLASH); operators.put("/", TokenType.SLASH);
OPERATORS.put("%", TokenType.PERCENT); operators.put("%", TokenType.PERCENT);
OPERATORS.put("(", TokenType.LPAREN); operators.put("(", TokenType.LPAREN);
OPERATORS.put(")", TokenType.RPAREN); operators.put(")", TokenType.RPAREN);
OPERATORS.put("[", TokenType.LBRACKET); operators.put("[", TokenType.LBRACKET);
OPERATORS.put("]", TokenType.RBRACKET); operators.put("]", TokenType.RBRACKET);
OPERATORS.put("{", TokenType.LBRACE); operators.put("{", TokenType.LBRACE);
OPERATORS.put("}", TokenType.RBRACE); operators.put("}", TokenType.RBRACE);
OPERATORS.put("=", TokenType.EQ); operators.put("=", TokenType.EQ);
OPERATORS.put("<", TokenType.LT); operators.put("<", TokenType.LT);
OPERATORS.put(">", TokenType.GT); operators.put(">", TokenType.GT);
OPERATORS.put(".", TokenType.DOT); operators.put(".", TokenType.DOT);
OPERATORS.put(",", TokenType.COMMA); operators.put(",", TokenType.COMMA);
OPERATORS.put("^", TokenType.CARET); operators.put("^", TokenType.CARET);
OPERATORS.put("~", TokenType.TILDE); operators.put("~", TokenType.TILDE);
OPERATORS.put("?", TokenType.QUESTION); operators.put("?", TokenType.QUESTION);
OPERATORS.put(":", TokenType.COLON); operators.put(":", TokenType.COLON);
OPERATORS.put("!", TokenType.EXCL); operators.put("!", TokenType.EXCL);
OPERATORS.put("&", TokenType.AMP); operators.put("&", TokenType.AMP);
OPERATORS.put("|", TokenType.BAR); operators.put("|", TokenType.BAR);
OPERATORS.put("==", TokenType.EQEQ); operators.put("==", TokenType.EQEQ);
OPERATORS.put("!=", TokenType.EXCLEQ); operators.put("!=", TokenType.EXCLEQ);
OPERATORS.put("<=", TokenType.LTEQ); operators.put("<=", TokenType.LTEQ);
OPERATORS.put(">=", TokenType.GTEQ); operators.put(">=", TokenType.GTEQ);
OPERATORS.put("+=", TokenType.PLUSEQ); operators.put("+=", TokenType.PLUSEQ);
OPERATORS.put("-=", TokenType.MINUSEQ); operators.put("-=", TokenType.MINUSEQ);
OPERATORS.put("*=", TokenType.STAREQ); operators.put("*=", TokenType.STAREQ);
OPERATORS.put("/=", TokenType.SLASHEQ); operators.put("/=", TokenType.SLASHEQ);
OPERATORS.put("%=", TokenType.PERCENTEQ); operators.put("%=", TokenType.PERCENTEQ);
OPERATORS.put("&=", TokenType.AMPEQ); operators.put("&=", TokenType.AMPEQ);
OPERATORS.put("^=", TokenType.CARETEQ); operators.put("^=", TokenType.CARETEQ);
OPERATORS.put("|=", TokenType.BAREQ); operators.put("|=", TokenType.BAREQ);
OPERATORS.put("::=", TokenType.COLONCOLONEQ); operators.put("::=", TokenType.COLONCOLONEQ);
OPERATORS.put("<<=", TokenType.LTLTEQ); operators.put("<<=", TokenType.LTLTEQ);
OPERATORS.put(">>=", TokenType.GTGTEQ); operators.put(">>=", TokenType.GTGTEQ);
OPERATORS.put(">>>=", TokenType.GTGTGTEQ); operators.put(">>>=", TokenType.GTGTGTEQ);
OPERATORS.put("++", TokenType.PLUSPLUS); operators.put("++", TokenType.PLUSPLUS);
OPERATORS.put("--", TokenType.MINUSMINUS); operators.put("--", TokenType.MINUSMINUS);
OPERATORS.put("::", TokenType.COLONCOLON); operators.put("::", TokenType.COLONCOLON);
OPERATORS.put("&&", TokenType.AMPAMP); operators.put("&&", TokenType.AMPAMP);
OPERATORS.put("||", TokenType.BARBAR); operators.put("||", TokenType.BARBAR);
OPERATORS.put("<<", TokenType.LTLT); operators.put("<<", TokenType.LTLT);
OPERATORS.put(">>", TokenType.GTGT); operators.put(">>", TokenType.GTGT);
OPERATORS.put(">>>", TokenType.GTGTGT); operators.put(">>>", TokenType.GTGTGT);
OPERATORS.put("@", TokenType.AT); operators.put("@", TokenType.AT);
OPERATORS.put("@=", TokenType.ATEQ); operators.put("@=", TokenType.ATEQ);
OPERATORS.put("..", TokenType.DOTDOT); operators.put("..", TokenType.DOTDOT);
OPERATORS.put("**", TokenType.STARSTAR); operators.put("**", TokenType.STARSTAR);
OPERATORS.put("^^", TokenType.CARETCARET); operators.put("^^", TokenType.CARETCARET);
OPERATORS.put("?:", TokenType.QUESTIONCOLON); operators.put("?:", TokenType.QUESTIONCOLON);
OPERATORS.put("??", TokenType.QUESTIONQUESTION); operators.put("??", TokenType.QUESTIONQUESTION);
OPERATORS = Map.copyOf(operators);
} }
private static final Map<String, TokenType> KEYWORDS; private static final Map<String, TokenType> KEYWORDS;
static { static {
KEYWORDS = new HashMap<>(); final var keywords = new HashMap<String, TokenType>();
KEYWORDS.put("print", TokenType.PRINT); keywords.put("print", TokenType.PRINT);
KEYWORDS.put("println", TokenType.PRINTLN); keywords.put("println", TokenType.PRINTLN);
KEYWORDS.put("if", TokenType.IF); keywords.put("if", TokenType.IF);
KEYWORDS.put("else", TokenType.ELSE); keywords.put("else", TokenType.ELSE);
KEYWORDS.put("while", TokenType.WHILE); keywords.put("while", TokenType.WHILE);
KEYWORDS.put("for", TokenType.FOR); keywords.put("for", TokenType.FOR);
KEYWORDS.put("do", TokenType.DO); keywords.put("do", TokenType.DO);
KEYWORDS.put("break", TokenType.BREAK); keywords.put("break", TokenType.BREAK);
KEYWORDS.put("continue", TokenType.CONTINUE); keywords.put("continue", TokenType.CONTINUE);
KEYWORDS.put("def", TokenType.DEF); keywords.put("def", TokenType.DEF);
KEYWORDS.put("return", TokenType.RETURN); keywords.put("return", TokenType.RETURN);
KEYWORDS.put("use", TokenType.USE); keywords.put("use", TokenType.USE);
KEYWORDS.put("match", TokenType.MATCH); keywords.put("match", TokenType.MATCH);
KEYWORDS.put("case", TokenType.CASE); keywords.put("case", TokenType.CASE);
KEYWORDS.put("extract", TokenType.EXTRACT); keywords.put("extract", TokenType.EXTRACT);
KEYWORDS.put("include", TokenType.INCLUDE); keywords.put("include", TokenType.INCLUDE);
KEYWORDS.put("class", TokenType.CLASS); keywords.put("class", TokenType.CLASS);
KEYWORDS.put("new", TokenType.NEW); keywords.put("new", TokenType.NEW);
KEYWORDS = Map.copyOf(keywords);
} }
public static Set<String> getKeywords() { public static Set<String> getKeywords() {
@ -133,6 +131,11 @@ public final class Lexer {
public List<Token> tokenize() { public List<Token> tokenize() {
while (pos < length) { while (pos < length) {
// Fast path for skipping whitespaces
while (Character.isWhitespace(peek(0))) {
next();
}
final char current = peek(0); final char current = peek(0);
if (Character.isDigit(current)) tokenizeNumber(); if (Character.isDigit(current)) tokenizeNumber();
else if (isOwnLangIdentifierStart(current)) tokenizeWord(); else if (isOwnLangIdentifierStart(current)) tokenizeWord();
@ -157,9 +160,11 @@ public final class Lexer {
tokenizeHexNumber(2); tokenizeHexNumber(2);
return; return;
} }
boolean hasDot = false;
while (true) { while (true) {
if (current == '.') { if (current == '.') {
if (buffer.indexOf(".") != -1) throw error("Invalid float number"); if (hasDot) throw error("Invalid float number");
hasDot = true;
} else if (!Character.isDigit(current)) { } else if (!Character.isDigit(current)) {
break; break;
} }
@ -183,8 +188,7 @@ public final class Lexer {
} }
current = next(); current = next();
} }
final int length = buffer.length(); if (!buffer.isEmpty()) {
if (length > 0) {
addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos); addToken(TokenType.HEX_NUMBER, buffer.toString(), startPos);
} }
} }
@ -214,9 +218,8 @@ public final class Lexer {
final Pos startPos = markPos(); final Pos startPos = markPos();
clearBuffer(); clearBuffer();
while (true) { while (true) {
final String text = buffer.toString(); if (!buffer.isEmpty() && !OPERATORS.containsKey(buffer.toString() + current)) {
if (!text.isEmpty() && !OPERATORS.containsKey(text + current)) { addToken(OPERATORS.get(buffer.toString()), startPos);
addToken(OPERATORS.get(text), startPos);
return; return;
} }
buffer.append(current); buffer.append(current);
@ -229,10 +232,7 @@ public final class Lexer {
final Pos startPos = markPos(); final Pos startPos = markPos();
buffer.append(peek(0)); buffer.append(peek(0));
char current = next(); char current = next();
while (true) { while (isOwnLangIdentifierPart(current)) {
if (!isOwnLangIdentifierPart(current)) {
break;
}
buffer.append(current); buffer.append(current);
current = next(); current = next();
} }
@ -250,8 +250,7 @@ public final class Lexer {
next();// skip ` next();// skip `
clearBuffer(); clearBuffer();
char current = peek(0); char current = peek(0);
while (true) { while (current != '`') {
if (current == '`') break;
if (current == '\0') throw error("Reached end of file while parsing extended word."); if (current == '\0') throw error("Reached end of file while parsing extended word.");
if (current == '\n' || current == '\r') throw error("Reached end of line while parsing extended word."); if (current == '\n' || current == '\r') throw error("Reached end of line while parsing extended word.");
buffer.append(current); buffer.append(current);
@ -320,8 +319,7 @@ public final class Lexer {
private void tokenizeMultilineComment() { private void tokenizeMultilineComment() {
char current = peek(0); char current = peek(0);
while (true) { while (current != '*' || peek(1) != '/') {
if (current == '*' && peek(1) == '/') break;
if (current == '\0') throw error("Reached end of file while parsing multiline comment"); if (current == '\0') throw error("Reached end of file while parsing multiline comment");
current = next(); current = next();
} }