From 3b0f1f009e7c8ac55d19b4f899b4175db2f78758 Mon Sep 17 00:00:00 2001 From: Victor Date: Wed, 29 Jun 2016 12:39:04 +0300 Subject: [PATCH] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=BE=20=D0=B7=D0=B0=D0=B2=D0=B8=D1=81=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=BB=D0=B5=D0=BA=D1=81=D0=B5=D1=80=D0=B0=20?= =?UTF-8?q?=D0=BD=D0=B0=20=D0=BD=D0=B5=D0=BA=D0=BE=D1=82=D0=BE=D1=80=D1=8B?= =?UTF-8?q?=D1=85=20=D1=8E=D0=BD=D0=B8=D0=BA=D0=BE=D0=B4=D0=BD=D1=8B=D1=85?= =?UTF-8?q?=20=D1=81=D0=B8=D0=BC=D0=B2=D0=BE=D0=BB=D0=B0=D1=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/com/annimon/ownlang/parser/Lexer.java | 17 ++++++++--- .../com/annimon/ownlang/parser/LexerTest.java | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/com/annimon/ownlang/parser/Lexer.java b/src/com/annimon/ownlang/parser/Lexer.java index ebb9428..8ad26ca 100644 --- a/src/com/annimon/ownlang/parser/Lexer.java +++ b/src/com/annimon/ownlang/parser/Lexer.java @@ -120,7 +120,7 @@ public final class Lexer { while (pos < length) { final char current = peek(0); if (Character.isDigit(current)) tokenizeNumber(); - else if (Character.isJavaIdentifierStart(current)) tokenizeWord(); + else if (isOwnLangIdentifierStart(current)) tokenizeWord(); else if (current == '`') tokenizeExtendedWord(); else if (current == '"') tokenizeText(); else if (current == '#') { @@ -208,9 +208,10 @@ public final class Lexer { private void tokenizeWord() { clearBuffer(); - char current = peek(0); + buffer.append(peek(0)); + char current = next(); while (true) { - if (!Character.isLetterOrDigit(current) && (current != '_') && (current != '$')) { + if (!isOwnLangIdentifierPart(current)) { break; } buffer.append(current); @@ -224,7 +225,7 @@ public final class Lexer { addToken(TokenType.WORD, word); } } - + private void tokenizeExtendedWord() { next();// skip ` clearBuffer(); @@ -306,6 +307,14 @@ public final class Lexer { next(); // * next(); // / } + + private boolean isOwnLangIdentifierStart(char current) { + return (Character.isLetter(current) || (current == '_') || (current == '$')); + } + + private boolean isOwnLangIdentifierPart(char current) { + return (Character.isLetterOrDigit(current) || (current == '_') || (current == '$')); + } private void clearBuffer() { buffer.setLength(0); diff --git a/test/com/annimon/ownlang/parser/LexerTest.java b/test/com/annimon/ownlang/parser/LexerTest.java index 6633a23..6e9b5eb 100644 --- a/test/com/annimon/ownlang/parser/LexerTest.java +++ b/test/com/annimon/ownlang/parser/LexerTest.java @@ -122,6 +122,34 @@ public class LexerTest { String input = "/* 1234 \n"; Lexer.tokenize(input); } + + @Test(expected = LexerException.class) + public void testExtendedWordError() { + String input = "` 1234"; + Lexer.tokenize(input); + } + + @Test + public void testUnicodeCharacterIdentifier() { + String input = "€ = 1"; + List expList = list(EQ, NUMBER); + List result = Lexer.tokenize(input); + assertTokens(expList, result); + } + + @Test + public void testUnicodeCharacterExtendedWordIdentifier() { + String input = "`€` = 1"; + List expList = list(WORD, EQ, NUMBER); + List result = Lexer.tokenize(input); + assertTokens(expList, result); + } + + @Test + public void testUnicodeCharacterEOF() { + String input = "€"; + assertTrue(Lexer.tokenize(input).isEmpty()); + } private static void assertTokens(List expList, List result) { final int length = expList.size();