001package jmri.jmrit.logixng.util.parser;
002
003import java.util.ArrayList;
004import java.util.List;
005import java.util.concurrent.atomic.AtomicInteger;
006
007/**
008 * Splits an expression, for example "sin(2*pi*x)/3", into a list of tokens.
009 *
010 * @author Daniel Bergqvist 2019
011 */
012public class Tokenizer {
013
014    // This class should never be instanciated.
015    private Tokenizer() {
016    }
017
018    private static void addToken(Token currentToken, List<Token> tokens) {
019        if ((currentToken._tokenType == TokenType.FLOATING_NUMBER) && isIntegerNumber(currentToken._string)) {
020            currentToken._tokenType = TokenType.INTEGER_NUMBER;
021        }
022
023        tokens.add(currentToken);
024    }
025
026    public static List<Token> getTokens(String expression) throws InvalidSyntaxException {
027
028        List<Token> tokens = new ArrayList<>();
029        Token currentToken = new Token();
030
031//        System.out.format("%n%n%n");
032//        System.out.format("getTokens(): %s%n", expression);
033
034        AtomicInteger eatNextChar = new AtomicInteger(0);
035
036        char ch = ' ';
037        char lastChar;
038
039        for (int i=0; i < expression.length(); i++) {
040            lastChar = ch;
041            ch = expression.charAt(i);
042            char nextChar = ' ';    // An extra space at the end of the _string doesn't matter
043            if (i+1 < expression.length()) {
044                nextChar = expression.charAt(i+1);
045            }
046
047//            System.out.format("index %d: %s, %s, %c, %c%n", i, currentToken._tokenType.name(), currentToken._string, ch, nextChar);
048
049
050
051            // Check for token type STRING
052            if (ch == '\"') {
053                if (Character.isLetterOrDigit(lastChar)) {
054                    throw new InvalidSyntaxException(Bundle.getMessage("InvalidSyntaxAtIndex", i));
055                }
056
057                if (currentToken._tokenType == TokenType.SPACE) {
058                    currentToken = new Token();
059                } else if (currentToken._tokenType != TokenType.NONE) {
060//                    System.out.format("Add: index %d: %s, %s, %c, %c%n", i, currentToken._tokenType.name(), currentToken._string, ch, nextChar);
061                    addToken(currentToken, tokens);
062                    currentToken = new Token();
063                }
064                currentToken._tokenType = TokenType.STRING;
065
066                boolean done = false;
067                while (!done) {
068                    i++;
069                    if (i >= expression.length()) {
070                        throw new InvalidSyntaxException(Bundle.getMessage("UnexpectedEndOfString"));
071                    }
072                    ch = expression.charAt(i);
073                    nextChar = ' ';    // An extra space at the end of the _string doesn't matter
074                    if (i+1 < expression.length()) {
075                        nextChar = expression.charAt(i+1);
076                    }
077                    // Handle escaped characters
078                    if ((ch == '\\') && ((nextChar == '\\') || (nextChar == '"'))) {
079
080                        currentToken._string += nextChar;
081                        i++;
082                    } else if (ch != '\"') {
083                        currentToken._string += ch;
084                    }
085
086                    done = (ch == '\"');
087                }
088
089                if (Character.isLetterOrDigit(nextChar)) {
090                    throw new InvalidSyntaxException(Bundle.getMessage("InvalidSyntaxAtIndex", i));
091                }
092
093//                System.out.format("Add: index %d: %s, %s, %c, %c%n", i, currentToken._tokenType.name(), currentToken._string, ch, nextChar);
094                addToken(currentToken, tokens);
095                currentToken = new Token();
096
097                // Continue for loop
098                continue;
099            }
100
101
102            char nextNextChar = ' ';        // An extra space at the end of the _string doesn't matter
103            char nextNextNextChar = ' ';    // An extra space at the end of the _string doesn't matter
104            if (i+2 < expression.length()) {
105                nextNextChar = expression.charAt(i+2);
106            }
107            if (i+3 < expression.length()) {
108                nextNextNextChar = expression.charAt(i+3);
109            }
110
111            TokenType nextToken = getTokenType(currentToken, ch, nextChar, nextNextChar, nextNextNextChar, eatNextChar);
112//            System.out.format("index %d: %s, %c%n", i, nextToken.name(), ch);
113
114            if (nextToken == TokenType.SAME_AS_LAST) {
115                currentToken._string += ch;
116                continue;
117            }
118
119            switch (nextToken) {
120                case ERROR:
121                    throw new InvalidSyntaxException(Bundle.getMessage("InvalidSyntaxAtIndex", i));
122
123                case ASSIGN:
124                case ASSIGN_ADD:
125                case ASSIGN_SUBTRACKT:
126                case ASSIGN_MULTIPLY:
127                case ASSIGN_DIVIDE:
128                case ASSIGN_MODULO:
129                case ASSIGN_AND:
130                case ASSIGN_OR:
131                case ASSIGN_XOR:
132                case ASSIGN_SHIFT_LEFT:
133                case ASSIGN_SHIFT_RIGHT:
134                case ASSIGN_UNSIGNED_SHIFT_RIGHT:
135                case TERNARY_QUESTION_MARK:
136                case TERNARY_COLON:
137                case LEFT_PARENTHESIS:
138                case RIGHT_PARENTHESIS:
139                case LEFT_SQUARE_BRACKET:
140                case RIGHT_SQUARE_BRACKET:
141                case LEFT_CURLY_BRACKET:
142                case RIGHT_CURLY_BRACKET:
143                case DOT:
144                case DOT_DOT:
145                case COMMA:
146                case EQUAL:
147                case NOT_EQUAL:
148                case LESS_THAN:
149                case LESS_OR_EQUAL:
150                case GREATER_THAN:
151                case GREATER_OR_EQUAL:
152                case ADD:
153                case SUBTRACKT:
154                case MULTIPLY:
155                case DIVIDE:
156                case MODULO:
157                case SHIFT_LEFT:
158                case SHIFT_RIGHT:
159                case UNSIGNED_SHIFT_RIGHT:
160                case BOOLEAN_AND:
161                case BOOLEAN_OR:
162                case BOOLEAN_XOR:
163                case BOOLEAN_NOT:
164                case BINARY_AND:
165                case BINARY_OR:
166                case BINARY_XOR:
167                case BINARY_NOT:
168                case INCREMENT:
169                case DECREMENT:
170                case IDENTIFIER:
171                case SPACE:
172                case NONE:
173                    if ((currentToken._tokenType != TokenType.NONE) && (currentToken._tokenType != TokenType.SPACE)) {
174                        addToken(currentToken, tokens);
175                        currentToken = new Token();
176                    }
177                    currentToken._tokenType = nextToken;
178                    break;
179
180                case FLOATING_NUMBER:
181                    if ((currentToken._tokenType == TokenType.FLOATING_NUMBER) && !currentToken._string.isEmpty() && !isFloatingNumber(currentToken._string)) {
182//                        System.out.format("Not a number: '%s'%n", currentToken._string);
183                        throw new InvalidSyntaxException(Bundle.getMessage("InvalidSyntaxAtIndex", i));
184                    }
185                    if ((currentToken._tokenType != TokenType.NONE) && (currentToken._tokenType != TokenType.SPACE)) {
186                        addToken(currentToken, tokens);
187                        currentToken = new Token();
188                    }
189                    currentToken._tokenType = nextToken;
190                    break;
191
192                case STRING:
193                    if (!currentToken._string.endsWith("\"")) {
194//                        System.err.format("String: %s%n", currentToken._string);
195                        throw new InvalidSyntaxException(Bundle.getMessage("InvalidSyntaxAtIndex", i));
196                    }
197                    if ((currentToken._tokenType != TokenType.NONE) && (currentToken._tokenType != TokenType.SPACE)) {
198                        addToken(currentToken, tokens);
199                        currentToken = new Token();
200                    }
201                    currentToken._tokenType = nextToken;
202                    break;
203
204                default:
205                    throw new RuntimeException("unknown token type: "+nextToken.name());
206            }
207
208            if (currentToken._tokenType != TokenType.SPACE) {
209                currentToken._string += ch;
210            }
211
212            i += eatNextChar.get();
213//            System.out.format("New string: '%s'%n", currentToken._string);
214        }
215
216        if (currentToken._tokenType != TokenType.NONE) {
217            addToken(currentToken, tokens);
218        }
219
220        return tokens;
221    }
222
223    private static TokenType getTokenType(Token currentToken, char ch, char nextChar, char nextNextChar, char nextNextNextChar, AtomicInteger eatNextChar) {
224
225        eatNextChar.set(0);
226
227        if (ch == '"') {
228            return TokenType.STRING;
229        }
230
231        if (Character.isWhitespace(ch)) {
232            return TokenType.SPACE;
233        }
234
235        if (currentToken._tokenType == TokenType.STRING) {
236            return TokenType.SAME_AS_LAST;
237        }
238
239        if (ch == '.') {
240            if (nextChar == '.') {
241                if ((currentToken._tokenType != TokenType.DOT_DOT)) {
242                    eatNextChar.set(1);
243                    return TokenType.DOT_DOT;
244                } else {
245                    // Three dots in a row is an error
246                    return TokenType.ERROR;
247                }
248            } else if ((currentToken._tokenType == TokenType.IDENTIFIER)
249                    || (currentToken._tokenType == TokenType.NONE)
250                    || (currentToken._tokenType == TokenType.RIGHT_PARENTHESIS)
251                    || (currentToken._tokenType == TokenType.RIGHT_SQUARE_BRACKET)
252                    || (currentToken._tokenType == TokenType.RIGHT_CURLY_BRACKET)
253                    ) {
254                return TokenType.DOT;
255            }
256        }
257
258        if (ch == '?') {
259            return TokenType.TERNARY_QUESTION_MARK;
260        }
261
262        if (ch == ':') {
263            return TokenType.TERNARY_COLON;
264        }
265
266        if ((ch == '=') && (nextChar != '=')) {
267            return TokenType.ASSIGN;
268        }
269
270        if (nextChar == '=') {
271            switch (ch) {
272                case '+':
273                    eatNextChar.set(1);
274                    return TokenType.ASSIGN_ADD;
275                case '-':
276                    eatNextChar.set(1);
277                    return TokenType.ASSIGN_SUBTRACKT;
278                case '*':
279                    eatNextChar.set(1);
280                    return TokenType.ASSIGN_MULTIPLY;
281                case '/':
282                    eatNextChar.set(1);
283                    return TokenType.ASSIGN_DIVIDE;
284                case '%':
285                    eatNextChar.set(1);
286                    return TokenType.ASSIGN_MODULO;
287                default:
288                    // Do nothing
289            }
290        }
291
292        if (ch == '<') {
293            switch (nextChar) {
294                case '=':
295                    eatNextChar.set(1);
296                    return TokenType.LESS_OR_EQUAL;
297                case '<':
298                    if (nextNextChar == '=') {
299                        eatNextChar.set(2);
300                        return TokenType.ASSIGN_SHIFT_LEFT;
301                    } else {
302                        eatNextChar.set(1);
303                        return TokenType.SHIFT_LEFT;
304                    }
305                default:
306                    return TokenType.LESS_THAN;
307            }
308        }
309
310        if (ch == '>') {
311            switch (nextChar) {
312                case '=':
313                    eatNextChar.set(1);
314                    return TokenType.GREATER_OR_EQUAL;
315                case '>':
316                    if (nextNextChar == '=') {
317                        eatNextChar.set(2);
318                        return TokenType.ASSIGN_SHIFT_RIGHT;
319                    } else if (nextNextChar == '>') {
320                        if (nextNextNextChar == '=') {
321                            eatNextChar.set(3);
322                            return TokenType.ASSIGN_UNSIGNED_SHIFT_RIGHT;
323                        } else {
324                            eatNextChar.set(2);
325                            return TokenType.UNSIGNED_SHIFT_RIGHT;
326                        }
327                    } else {
328                        eatNextChar.set(1);
329                        return TokenType.SHIFT_RIGHT;
330                    }
331                default:
332                    return TokenType.GREATER_THAN;
333            }
334        }
335
336        if (ch == '=') {
337            if (nextChar == '=') {
338                eatNextChar.set(1);
339                return TokenType.EQUAL;
340            } else {
341                return TokenType.ERROR;
342            }
343        }
344
345        if (ch == '!') {
346            if (nextChar == '=') {
347                eatNextChar.set(1);
348                return TokenType.NOT_EQUAL;
349            } else {
350                return TokenType.BOOLEAN_NOT;
351            }
352        }
353
354        if (ch == '|') {
355            if (nextChar == '|') {
356                eatNextChar.set(1);
357                return TokenType.BOOLEAN_OR;
358            } else if (nextChar == '=') {
359                eatNextChar.set(1);
360                return TokenType.ASSIGN_OR;
361            } else {
362                return TokenType.BINARY_OR;
363            }
364        }
365
366        if (ch == '&') {
367            if (nextChar == '&') {
368                eatNextChar.set(1);
369                return TokenType.BOOLEAN_AND;
370            } else if (nextChar == '=') {
371                eatNextChar.set(1);
372                return TokenType.ASSIGN_AND;
373            } else {
374                return TokenType.BINARY_AND;
375            }
376        }
377
378        if (ch == '~') {
379            return TokenType.BINARY_NOT;
380        }
381
382        if (ch == ',') {
383            return TokenType.COMMA;
384        }
385
386        if (ch == '+') {
387            if (nextChar == '+') {
388                eatNextChar.set(1);
389                return TokenType.INCREMENT;
390            } else {
391                return TokenType.ADD;
392            }
393        }
394
395        if (ch == '-') {
396            if (nextChar == '-') {
397                eatNextChar.set(1);
398                return TokenType.DECREMENT;
399            } else {
400                return TokenType.SUBTRACKT;
401            }
402        }
403
404        if (ch == '*') {
405            return TokenType.MULTIPLY;
406        }
407
408        if (ch == '/') {
409            return TokenType.DIVIDE;
410        }
411
412        if (ch == '%') {
413            return TokenType.MODULO;
414        }
415
416        if (ch == '^') {
417            if (nextChar == '^') {
418                eatNextChar.set(1);
419                return TokenType.BOOLEAN_XOR;
420            } else if (nextChar == '=') {
421                eatNextChar.set(1);
422                return TokenType.ASSIGN_XOR;
423            } else {
424                return TokenType.BINARY_XOR;
425            }
426        }
427
428        if (ch == '(') {
429            return TokenType.LEFT_PARENTHESIS;
430        }
431
432        if (ch == ')') {
433            return TokenType.RIGHT_PARENTHESIS;
434        }
435
436        if (ch == '[') {
437            return TokenType.LEFT_SQUARE_BRACKET;
438        }
439
440        if (ch == ']') {
441            return TokenType.RIGHT_SQUARE_BRACKET;
442        }
443
444        if (ch == '{') {
445            return TokenType.LEFT_CURLY_BRACKET;
446        }
447
448        if (ch == '}') {
449            return TokenType.RIGHT_CURLY_BRACKET;
450        }
451
452        if ((currentToken._tokenType == TokenType.FLOATING_NUMBER) &&
453                (isFloatingNumber(currentToken._string+ch) || isFloatingNumber(currentToken._string+ch+nextChar))) {
454            return TokenType.SAME_AS_LAST;
455        }
456
457        if ((currentToken._tokenType == TokenType.IDENTIFIER) && (Character.isLetterOrDigit(ch) || (ch == '_'))) {
458            return TokenType.SAME_AS_LAST;
459        }
460
461        if (Character.isDigit(ch)) {
462            return TokenType.FLOATING_NUMBER;
463        }
464
465        if ((currentToken._tokenType == TokenType.FLOATING_NUMBER) &&
466                (Character.isLetterOrDigit(ch))) {
467            return TokenType.ERROR;
468        }
469
470        if (Character.isDigit(ch)) {
471            return TokenType.FLOATING_NUMBER;
472        }
473
474        if (Character.isLetter(ch) || (ch == '_')) {
475            return TokenType.IDENTIFIER;
476        }
477
478        return TokenType.ERROR;
479    }
480
481    private static boolean isIntegerNumber(String str) {
482        return str.matches("\\d+");
483    }
484
485    private static boolean isFloatingNumber(String str) {
486        return str.matches("\\d+") || str.matches("\\d+\\.\\d+");
487    }
488
489}