View Javadoc

1   /*
2    * $Header: /home/projects/jaxen/scm/jaxen/src/java/main/org/jaxen/saxpath/base/XPathLexer.java,v 1.17 2006/02/05 21:47:42 elharo Exp $
3    * $Revision: 1.17 $
4    * $Date: 2006/02/05 21:47:42 $
5    *
6    * ====================================================================
7    *
8    * Copyright 2000-2002 bob mcwhirter & James Strachan.
9    * All rights reserved.
10   *
11   *
12   * Redistribution and use in source and binary forms, with or without
13   * modification, are permitted provided that the following conditions are
14   * met:
15   * 
16   *   * Redistributions of source code must retain the above copyright
17   *     notice, this list of conditions and the following disclaimer.
18   * 
19   *   * Redistributions in binary form must reproduce the above copyright
20   *     notice, this list of conditions and the following disclaimer in the
21   *     documentation and/or other materials provided with the distribution.
22   * 
23   *   * Neither the name of the Jaxen Project nor the names of its
24   *     contributors may be used to endorse or promote products derived 
25   *     from this software without specific prior written permission.
26   * 
27   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
28   * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29   * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
30   * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
31   * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33   * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34   * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35   * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36   * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38   *
39   * ====================================================================
40   * This software consists of voluntary contributions made by many
41   * individuals on behalf of the Jaxen Project and was originally
42   * created by bob mcwhirter <bob@werken.com> and
43   * James Strachan <jstrachan@apache.org>.  For more information on the
44   * Jaxen Project, please see <http://www.jaxen.org/>.
45   *
46   * $Id: XPathLexer.java,v 1.17 2006/02/05 21:47:42 elharo Exp $
47   */
48  
49  
50  
51  
52  package org.jaxen.saxpath.base;
53  
54  class XPathLexer
55  {
56      private String xpath;
57      private int    currentPosition;
58      private int    endPosition;
59  
60      private Token  previousToken;
61  
62      XPathLexer(String xpath)
63      {
64          setXPath( xpath );
65      }
66  
67      private void setXPath(String xpath)
68      {
69          this.xpath           = xpath;
70          this.currentPosition = 0;
71          this.endPosition     = xpath.length();
72      }
73  
74      String getXPath()
75      {
76          return this.xpath;
77      }
78  
79      Token nextToken()
80      {
81          Token token = null;
82  
83          do
84          {
85              token = null;
86  
87              switch ( LA(1) )
88              {
89                  case '$':
90                  {
91                      token = dollar();
92                      break;
93                  }
94                      
95                  case '"':
96                  case '\'':
97                  {
98                      token = literal();
99                      break;
100                 }
101                     
102                 case '/':
103                 {
104                     token = slashes();
105                     break;
106                 }
107 
108                 case ',':
109                 {
110                     token = comma();
111                     break;
112                 }
113                     
114                 case '(':
115                 {
116                     token = leftParen();
117                     break;
118                 }
119                     
120                 case ')':
121                 {
122                     token = rightParen();
123                     break;
124                 }
125                     
126                 case '[':
127                 {
128                     token = leftBracket();
129                     break;
130                 }
131                     
132                 case ']':
133                 {
134                     token = rightBracket();
135                     break;
136                 }
137                     
138                 case '+':
139                 {
140                     token = plus();
141                     break;
142                 }
143                     
144                 case '-':
145                 {
146                     token = minus();
147                     break;
148                 }
149                     
150                 case '<':
151                 case '>':
152                 {
153                     token = relationalOperator();
154                     break;
155                 }        
156 
157                 case '=':
158                 {
159                     token = equals();
160                     break;
161                 }
162                     
163                 case '!':
164                 {
165                     if ( LA(2) == '=' )
166                     {
167                         token = notEquals();
168                     }
169                     break;
170                 }
171                     
172                 case '|':
173                 {
174                     token = pipe();
175                     break;
176                 }
177                     
178                 case '@':
179                 {
180                     token = at();
181                     break;
182                 }
183                     
184                 case ':':
185                 {
186                     if ( LA(2) == ':' )
187                     {
188                         token = doubleColon();
189                     }
190                     else
191                     {
192                         token = colon();
193                     }
194                     break;
195                 }
196                     
197                 case '*':
198                 {
199                     token = star();
200                     break;
201                 }
202                     
203                 case '.':
204                 {
205                     switch ( LA(2) )
206                     {
207                         case '0':
208                         case '1':
209                         case '2':
210                         case '3':
211                         case '4':
212                         case '5':
213                         case '6':
214                         case '7':
215                         case '8':
216                         case '9':
217                         {
218                             token = number();
219                             break;
220                         }
221                         default:
222                         {
223                             token = dots();
224                             break;
225                         }
226                     }
227                     break;
228                 }
229 
230                 case '0':
231                 case '1':
232                 case '2':
233                 case '3':
234                 case '4':
235                 case '5':
236                 case '6':
237                 case '7':
238                 case '8':
239                 case '9':
240                 {
241                     token = number();
242                     break;
243                 }
244 
245                 case ' ':
246                 case '\t':
247                 case '\n':
248                 case '\r':
249                 {
250                     token = whitespace();
251                     break;
252                 }
253                     
254                 default:
255                 {
256                     if ( isIdentifierStartChar( LA(1) ) )
257                     {
258                         token = identifierOrOperatorName();
259                     }
260                 }
261             }
262 
263             if ( token == null )
264             {
265                 if (!hasMoreChars())
266                 {
267                     token = new Token( TokenTypes.EOF,
268                                    getXPath(),
269                                    currentPosition(),
270                                    endPosition() );
271             }
272                 else
273                 {
274                     token = new Token( TokenTypes.ERROR,
275                                    getXPath(),
276                                    currentPosition(),
277                                    endPosition() );
278                 }
279             }
280 
281         }
282         while ( token.getTokenType() == TokenTypes.SKIP );
283 
284         setPreviousToken( token );
285         
286         return token;
287     }
288 
289     private Token identifierOrOperatorName()
290     {
291         Token token = null;
292     
293         if ( previousToken != null )
294         {
295             // For some reason, section 3.7, Lexical structure,
296             // doesn't seem to feel like it needs to mention the
297             // SLASH, DOUBLE_SLASH, and COLON tokens for the test
298             // if an NCName is an operator or not.
299             //
300             // According to section 3.7, "/foo" should be considered
301             // as a SLASH following by an OperatorName being 'foo'.
302             // Which is just simply, clearly, wrong, in my mind.
303             //
304             //     -bob
305             
306             switch ( previousToken.getTokenType() )
307             {
308                 case TokenTypes.AT:
309                 case TokenTypes.DOUBLE_COLON:
310                 case TokenTypes.LEFT_PAREN:
311                 case TokenTypes.LEFT_BRACKET:
312                 case TokenTypes.AND:
313                 case TokenTypes.OR:
314                 case TokenTypes.MOD:
315                 case TokenTypes.DIV:
316                 case TokenTypes.COLON:
317                 case TokenTypes.SLASH:
318                 case TokenTypes.DOUBLE_SLASH:
319                 case TokenTypes.PIPE:
320                 case TokenTypes.DOLLAR:
321                 case TokenTypes.PLUS:
322                 case TokenTypes.MINUS:
323                 case TokenTypes.STAR:
324                 case TokenTypes.COMMA:
325                 case TokenTypes.LESS_THAN_SIGN:
326                 case TokenTypes.GREATER_THAN_SIGN:
327                 case TokenTypes.LESS_THAN_OR_EQUALS_SIGN:
328                 case TokenTypes.GREATER_THAN_OR_EQUALS_SIGN:
329                 case TokenTypes.EQUALS:
330                 case TokenTypes.NOT_EQUALS:
331                 {
332                     token = identifier();
333                     break;
334                 }
335                 default:
336                 {
337                     token = operatorName();
338                     break;
339                 }
340             }
341         }
342         else
343         {
344             token = identifier();
345         }
346     
347         return token;
348     }
349     
350     private Token identifier()
351     {
352         Token token = null;
353     
354         int start = currentPosition();
355     
356         while ( hasMoreChars() )
357         {
358             if ( isIdentifierChar( LA(1) ) )
359             {
360                 consume();
361             }
362             else
363             {
364                 break;
365             }
366         }
367     
368         token = new Token( TokenTypes.IDENTIFIER,
369                            getXPath(),
370                            start,
371                            currentPosition() );
372     
373         return token;
374     }
375     
376     private Token operatorName()
377     {
378         Token token = null;
379     
380         switch ( LA(1) )
381         {
382             case 'a':
383             {
384                 token = and();
385                 break;
386             }
387     
388             case 'o':
389             {
390                 token = or();
391                 break;
392             }
393     
394             case 'm':
395             {
396                 token = mod();
397                 break;
398             }
399     
400             case 'd':
401             {
402                 token = div();
403                 break;
404             }
405         }
406     
407         return token;
408     }
409     
410     private Token mod()
411     {
412         Token token = null;
413     
414         if ( ( LA(1) == 'm' )
415              &&
416              ( LA(2) == 'o' )
417              &&
418              ( LA(3) == 'd' )
419            )
420         {
421             token = new Token( TokenTypes.MOD,
422                                getXPath(),
423                                currentPosition(),
424                                currentPosition()+3 );
425     
426             consume();
427             consume();
428             consume();
429         }
430     
431         return token;
432     }
433     
434     private Token div()
435     {
436         Token token = null;
437     
438         if ( ( LA(1) == 'd' )
439              &&
440              ( LA(2) == 'i' )
441              &&
442              ( LA(3) == 'v' )
443             )
444         {
445             token = new Token( TokenTypes.DIV,
446                                getXPath(),
447                                currentPosition(),
448                                currentPosition()+3 );
449     
450             consume();
451             consume();
452             consume();
453         }
454     
455         return token;
456     }
457     
458     private Token and()
459     {
460         Token token = null;
461     
462         if ( ( LA(1) == 'a' )
463              &&
464              ( LA(2) == 'n' )
465              &&
466              ( LA(3) == 'd' )
467            )
468         {
469             token = new Token( TokenTypes.AND,
470                                getXPath(),
471                                currentPosition(),
472                                currentPosition()+3 );
473     
474             consume();
475             consume();
476             consume();
477         }
478     
479         return token;
480     }
481     
482     private Token or()
483     {
484         Token token = null;
485     
486         if ( ( LA(1) == 'o' )
487              &&
488              ( LA(2) == 'r' )
489            )
490         {
491             token = new Token( TokenTypes.OR,
492                                getXPath(),
493                                currentPosition(),
494                                currentPosition()+2 );
495     
496             consume();
497             consume();
498         }
499     
500         return token;
501     }
502     
503     private Token number()
504     {
505         int     start         = currentPosition();
506         boolean periodAllowed = true;
507     
508       loop:
509         while( true )
510         {
511             switch ( LA(1) )
512             {
513                 case '.':
514                     if ( periodAllowed )
515                     {
516                         periodAllowed = false;
517                         consume();
518                     }
519                     else
520                     {
521                         break loop;
522                     }
523                     break;
524                 case '0':
525                 case '1':
526                 case '2':
527                 case '3':
528                 case '4':
529                 case '5':
530                 case '6':
531                 case '7':
532                 case '8':
533                 case '9':
534                     consume();
535                     break;
536                 default:
537                     break loop;
538             }
539         }
540     
541         return new Token( TokenTypes.DOUBLE,
542                                getXPath(),
543                                start,
544                                currentPosition() );
545     }
546     
547     private Token whitespace()
548     {
549         consume();
550             
551       loop:
552         while( hasMoreChars() )
553         {
554             switch ( LA(1) )
555             {
556                 case ' ':
557                 case '\t':
558                 case '\n':
559                 case '\r':
560                 {
561                     consume();
562                     break;
563                 }
564                     
565                 default:
566                 {
567                     break loop;
568                 }
569             }
570         }
571     
572         return new Token( TokenTypes.SKIP,
573                           getXPath(),
574                           0,
575                           0 );
576     }
577     
578     private Token comma()
579     {
580         Token token = new Token( TokenTypes.COMMA,
581                                  getXPath(),
582                                  currentPosition(),
583                                  currentPosition()+1 );
584     
585         consume();
586     
587         return token;
588     }
589     
590     private Token equals()
591     {
592         Token token = new Token( TokenTypes.EQUALS,
593                                  getXPath(),
594                                  currentPosition(),
595                                  currentPosition()+1 );
596     
597         consume();
598     
599         return token;
600     }
601     
602     private Token minus()
603     {
604         Token token = new Token( TokenTypes.MINUS,
605                                  getXPath(),
606                                  currentPosition(),
607                                  currentPosition()+1 );
608         consume();
609             
610         return token;
611     }
612     
613     private Token plus()
614     {
615         Token token = new Token( TokenTypes.PLUS,
616                                  getXPath(),
617                                  currentPosition(),
618                                  currentPosition()+1 );
619         consume();
620     
621         return token;
622     }
623     
624     private Token dollar()
625     {
626         Token token = new Token( TokenTypes.DOLLAR,
627                                  getXPath(),
628                                  currentPosition(),
629                                  currentPosition()+1 );
630         consume();
631     
632         return token;
633     }
634     
635     private Token pipe()
636     {
637         Token token = new Token( TokenTypes.PIPE,
638                                  getXPath(),
639                                  currentPosition(),
640                                  currentPosition()+1 );
641     
642         consume();
643     
644         return token;
645     }
646     
647     private Token at()
648     {
649         Token token = new Token( TokenTypes.AT,
650                                  getXPath(),
651                                  currentPosition(),
652                                  currentPosition()+1 );
653     
654         consume();
655     
656         return token;
657     }
658     
659     private Token colon()
660     {
661         Token token = new Token( TokenTypes.COLON,
662                                  getXPath(),
663                                  currentPosition(),
664                                  currentPosition()+1 );
665         consume();
666     
667         return token;
668     }
669     
670     private Token doubleColon()
671     {
672         Token token = new Token( TokenTypes.DOUBLE_COLON,
673                                  getXPath(),
674                                  currentPosition(),
675                                  currentPosition()+2 );
676     
677         consume();
678         consume();
679     
680         return token;
681     }
682     
683     private Token notEquals()
684     {
685         Token token = new Token( TokenTypes.NOT_EQUALS,
686                                  getXPath(),
687                                  currentPosition(),
688                                  currentPosition() + 2 );
689     
690         consume();
691         consume();
692     
693         return token;
694     }
695     
696     private Token relationalOperator()
697     {
698         Token token = null;
699     
700         switch ( LA(1) )
701         {
702             case '<':
703             {
704                 if ( LA(2) == '=' )
705                 {
706                     token = new Token( TokenTypes.LESS_THAN_OR_EQUALS_SIGN,
707                                        getXPath(),
708                                        currentPosition(),
709                                        currentPosition() + 2 );
710                     consume();
711                 }
712                 else
713                 {
714                     token = new Token( TokenTypes.LESS_THAN_SIGN,
715                                        getXPath(),
716                                        currentPosition(),
717                                        currentPosition() + 1);
718                 }
719     
720                 consume();
721                 break;
722             }
723             case '>':
724             {
725                 if ( LA(2) == '=' )
726                 {
727                     token = new Token( TokenTypes.GREATER_THAN_OR_EQUALS_SIGN,
728                                        getXPath(),
729                                        currentPosition(),
730                                        currentPosition() + 2 );
731                     consume();
732                 }
733                 else
734                 {
735                     token = new Token( TokenTypes.GREATER_THAN_SIGN,
736                                        getXPath(),
737                                        currentPosition(),
738                                        currentPosition() + 1 );
739                 }
740     
741                 consume();
742                 break;
743             }
744         }
745     
746         return token;
747                 
748     }
749     
750     private Token star()
751     {
752         Token token = new Token( TokenTypes.STAR,
753                                  getXPath(),
754                                  currentPosition(),
755                                  currentPosition()+1 );
756     
757         consume();
758             
759         return token;
760     }
761     
762     private Token literal()
763     {
764         Token token = null;
765     
766         char match  = LA(1);
767     
768         consume();
769     
770         int start = currentPosition();
771             
772         while ( ( token == null )
773                 &&
774                 hasMoreChars() )
775         {
776             if ( LA(1) == match )
777             {
778                 token = new Token( TokenTypes.LITERAL,
779                                    getXPath(),
780                                    start,
781                                    currentPosition() );
782             }
783             consume();
784         }
785     
786         return token;
787     }
788     
789     private Token dots()
790     {
791         Token token = null;
792     
793         switch ( LA(2) )
794         {
795             case '.':
796             {
797                 token = new Token( TokenTypes.DOT_DOT,
798                                    getXPath(),
799                                    currentPosition(),
800                                    currentPosition()+2 ) ;
801                 consume();
802                 consume();
803                 break;
804             }
805             default:
806             {
807                 token = new Token( TokenTypes.DOT,
808                                    getXPath(),
809                                    currentPosition(),
810                                    currentPosition()+1 );
811                 consume();
812                 break;
813             }
814         }
815     
816         return token;
817     }
818     
819     private Token leftBracket()
820     {
821         Token token = new Token( TokenTypes.LEFT_BRACKET,
822                                  getXPath(),
823                                  currentPosition(),
824                                  currentPosition()+1 );
825     
826         consume();
827     
828         return token;
829     }
830     
831     private Token rightBracket()
832     {
833         Token token = new Token( TokenTypes.RIGHT_BRACKET,
834                                  getXPath(),
835                                  currentPosition(),
836                                  currentPosition()+1 );
837     
838         consume();
839     
840         return token;
841     }
842     
843     private Token leftParen()
844     {
845         Token token = new Token( TokenTypes.LEFT_PAREN,
846                                  getXPath(),
847                                  currentPosition(),
848                                  currentPosition()+1 );
849     
850         consume();
851     
852         return token;
853     }
854     
855     private Token rightParen()
856     {
857         Token token = new Token( TokenTypes.RIGHT_PAREN,
858                                  getXPath(),
859                                  currentPosition(),
860                                  currentPosition()+1 );
861     
862         consume();
863     
864         return token;
865     }
866     
867     private Token slashes()
868     {
869         Token token = null;
870     
871         switch ( LA(2) )
872         {
873             case '/':
874             {
875                 token = new Token( TokenTypes.DOUBLE_SLASH,
876                                    getXPath(),
877                                    currentPosition(),
878                                    currentPosition()+2 );
879                 consume();
880                 consume();
881                 break;
882             }
883             default:
884             {
885                 token = new Token( TokenTypes.SLASH,
886                                    getXPath(),
887                                    currentPosition(),
888                                    currentPosition()+1 );
889                 consume();
890             }
891         }
892     
893         return token;
894     }
895     
896     private char LA(int i) 
897     {
898         if ( currentPosition + ( i - 1 ) >= endPosition() )
899         {
900             return (char) -1;
901         }
902     
903         return getXPath().charAt( currentPosition() + (i - 1) );
904     }
905     
906     private void consume()
907     {
908         ++this.currentPosition;
909     }
910     
911     private int currentPosition()
912     {
913         return this.currentPosition;
914     }
915     
916     private int endPosition()
917     {
918         return this.endPosition;
919     }
920     
921     private void setPreviousToken(Token previousToken)
922     {
923         this.previousToken = previousToken;
924     }
925     
926     private boolean hasMoreChars()
927     {
928         return currentPosition() < endPosition();
929     }
930     
931     private boolean isIdentifierChar(char c)
932     {
933         return Verifier.isXMLNCNameCharacter( c );
934     }
935     
936     private boolean isIdentifierStartChar(char c)
937     {
938         return Verifier.isXMLNCNameStartCharacter( c );
939     }
940 
941 }