Исправлено зависание лексера на некоторых юникодных символах

This commit is contained in:
Victor 2016-06-29 12:39:04 +03:00
parent 1dcabe6d61
commit 3b0f1f009e
2 changed files with 41 additions and 4 deletions

View File

@ -120,7 +120,7 @@ public final class Lexer {
while (pos < length) {
final char current = peek(0);
if (Character.isDigit(current)) tokenizeNumber();
else if (Character.isJavaIdentifierStart(current)) tokenizeWord();
else if (isOwnLangIdentifierStart(current)) tokenizeWord();
else if (current == '`') tokenizeExtendedWord();
else if (current == '"') tokenizeText();
else if (current == '#') {
@ -208,9 +208,10 @@ public final class Lexer {
private void tokenizeWord() {
clearBuffer();
char current = peek(0);
buffer.append(peek(0));
char current = next();
while (true) {
if (!Character.isLetterOrDigit(current) && (current != '_') && (current != '$')) {
if (!isOwnLangIdentifierPart(current)) {
break;
}
buffer.append(current);
@ -224,7 +225,7 @@ public final class Lexer {
addToken(TokenType.WORD, word);
}
}
private void tokenizeExtendedWord() {
next();// skip `
clearBuffer();
@ -306,6 +307,14 @@ public final class Lexer {
next(); // *
next(); // /
}
/**
 * Tests whether {@code current} may begin an identifier in this language:
 * any Unicode letter, '_' or '$'. Unlike {@link Character#isJavaIdentifierStart(char)}
 * this rejects currency symbols, keeping the start check consistent with
 * {@code isOwnLangIdentifierPart} so the word tokenizer always consumes
 * at least one character and cannot loop forever.
 */
private boolean isOwnLangIdentifierStart(char current) {
    switch (current) {
        case '_':
        case '$':
            return true;
        default:
            return Character.isLetter(current);
    }
}
/**
 * Tests whether {@code current} may appear after the first character of an
 * identifier: any Unicode letter or digit, '_' or '$'.
 */
private boolean isOwnLangIdentifierPart(char current) {
    if (current == '_' || current == '$') {
        return true;
    }
    return Character.isLetterOrDigit(current);
}
private void clearBuffer() {
buffer.setLength(0);

View File

@ -122,6 +122,34 @@ public class LexerTest {
String input = "/* 1234 \n";
Lexer.tokenize(input);
}
/** A backquote-opened identifier that is never closed must raise LexerException. */
@Test(expected = LexerException.class)
public void testExtendedWordError() {
    Lexer.tokenize("` 1234");
}
/**
 * A bare non-letter Unicode character ('€') yields no token — the expected
 * list contains only EQ and NUMBER — instead of hanging the lexer.
 */
@Test
public void testUnicodeCharacterIdentifier() {
    final List<Token> result = Lexer.tokenize("€ = 1");
    assertTokens(list(EQ, NUMBER), result);
}
/** Wrapped in backquotes, the same Unicode character is a valid WORD token. */
@Test
public void testUnicodeCharacterExtendedWordIdentifier() {
    final List<Token> result = Lexer.tokenize("`€` = 1");
    assertTokens(list(WORD, EQ, NUMBER), result);
}
// NOTE(review): the method name mentions a Unicode character, but the input
// here is an empty string — the character may have been lost in transcoding
// of this diff; confirm against the original source. As written, this only
// checks that empty input produces no tokens.
@Test
public void testUnicodeCharacterEOF() {
String input = "";
assertTrue(Lexer.tokenize(input).isEmpty());
}
private static void assertTokens(List<Token> expList, List<Token> result) {
final int length = expList.size();