From 3b0f1f009e7c8ac55d19b4f899b4175db2f78758 Mon Sep 17 00:00:00 2001
From: Victor <melnikukr@mail.ru>
Date: Wed, 29 Jun 2016 12:39:04 +0300
Subject: [PATCH] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=B5=D0=BD=D0=BE=20=D0=B7=D0=B0=D0=B2=D0=B8=D1=81=D0=B0=D0=BD?=
 =?UTF-8?q?=D0=B8=D0=B5=20=D0=BB=D0=B5=D0=BA=D1=81=D0=B5=D1=80=D0=B0=20?=
 =?UTF-8?q?=D0=BD=D0=B0=20=D0=BD=D0=B5=D0=BA=D0=BE=D1=82=D0=BE=D1=80=D1=8B?=
 =?UTF-8?q?=D1=85=20=D1=8E=D0=BD=D0=B8=D0=BA=D0=BE=D0=B4=D0=BD=D1=8B=D1=85?=
 =?UTF-8?q?=20=D1=81=D0=B8=D0=BC=D0=B2=D0=BE=D0=BB=D0=B0=D1=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/com/annimon/ownlang/parser/Lexer.java     | 17 ++++++++---
 .../com/annimon/ownlang/parser/LexerTest.java | 28 +++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/src/com/annimon/ownlang/parser/Lexer.java b/src/com/annimon/ownlang/parser/Lexer.java
index ebb9428..8ad26ca 100644
--- a/src/com/annimon/ownlang/parser/Lexer.java
+++ b/src/com/annimon/ownlang/parser/Lexer.java
@@ -120,7 +120,7 @@ public final class Lexer {
         while (pos < length) {
             final char current = peek(0);
             if (Character.isDigit(current)) tokenizeNumber();
-            else if (Character.isJavaIdentifierStart(current)) tokenizeWord();
+            else if (isOwnLangIdentifierStart(current)) tokenizeWord();
             else if (current == '`') tokenizeExtendedWord();
             else if (current == '"') tokenizeText();
             else if (current == '#') {
@@ -208,9 +208,10 @@ public final class Lexer {
     
     private void tokenizeWord() {
         clearBuffer();
-        char current = peek(0);
+        buffer.append(peek(0));
+        char current = next();
         while (true) {
-            if (!Character.isLetterOrDigit(current) && (current != '_')  && (current != '$')) {
+            if (!isOwnLangIdentifierPart(current)) {
                 break;
             }
             buffer.append(current);
@@ -224,7 +225,7 @@ public final class Lexer {
             addToken(TokenType.WORD, word);
         }
     }
-    
+
     private void tokenizeExtendedWord() {
         next();// skip `
         clearBuffer();
@@ -306,6 +307,14 @@ public final class Lexer {
         next(); // *
         next(); // /
     }
+
+    private boolean isOwnLangIdentifierStart(char current) { // true for a letter, '_' or '$'
+        return (current == '_') || (current == '$') || Character.isLetter(current);
+    }
+
+    private boolean isOwnLangIdentifierPart(char current) { // true for a letter, digit, '_' or '$'
+        return (current == '_') || (current == '$') || Character.isLetterOrDigit(current);
+    }
     
     private void clearBuffer() {
         buffer.setLength(0);
diff --git a/test/com/annimon/ownlang/parser/LexerTest.java b/test/com/annimon/ownlang/parser/LexerTest.java
index 6633a23..6e9b5eb 100644
--- a/test/com/annimon/ownlang/parser/LexerTest.java
+++ b/test/com/annimon/ownlang/parser/LexerTest.java
@@ -122,6 +122,34 @@ public class LexerTest {
         String input = "/* 1234 \n";
         Lexer.tokenize(input);
     }
+
+    @Test(expected = LexerException.class)
+    public void testExtendedWordError() {
+        // Input opens a backtick word but never closes it; a LexerException is expected.
+        Lexer.tokenize("` 1234");
+    }
+
+    @Test
+    public void testUnicodeCharacterIdentifier() {
+        // A bare '€' is expected to yield no token: only EQ and NUMBER remain.
+        final List<Token> expected = list(EQ, NUMBER);
+        final List<Token> actual = Lexer.tokenize("€ = 1");
+        assertTokens(expected, actual);
+    }
+
+    @Test
+    public void testUnicodeCharacterExtendedWordIdentifier() {
+        // Wrapped in backticks, '€' is expected to produce a WORD token.
+        final List<Token> expected = list(WORD, EQ, NUMBER);
+        final List<Token> actual = Lexer.tokenize("`€` = 1");
+        assertTokens(expected, actual);
+    }
+
+    @Test
+    public void testUnicodeCharacterEOF() {
+        final List<Token> tokens = Lexer.tokenize("€"); // '€' is the entire input
+        assertTrue(tokens.isEmpty());
+    }
     
     private static void assertTokens(List<Token> expList, List<Token> result) {
         final int length = expList.size();