d7e943662235bd65697fd62f84124b491c63822d
[supertux.git] / src / squirrel / sqstdlib / sqstdrex.c
1 /* see copyright notice in squirrel.h */
2 #include <squirrel.h>
3 #include <string.h>
4 #include <ctype.h>
5 #include <setjmp.h>
6 #include "sqstdstring.h"
7
8 #ifdef _DEBUG
9 #include <stdio.h>
10
11 static const SQChar *g_nnames[] =
12 {
13         _SC("NONE"),_SC("OP_GREEDY"),   _SC("OP_OR"),
14         _SC("OP_EXPR"),_SC("OP_NOCAPEXPR"),_SC("OP_DOT"),       _SC("OP_CLASS"),
15         _SC("OP_CCLASS"),_SC("OP_NCLASS"),_SC("OP_RANGE"),_SC("OP_CHAR"),
16         _SC("OP_EOL"),_SC("OP_BOL"),_SC("OP_WB")
17 };
18
19 #endif
20
21 #define OP_GREEDY               MAX_CHAR+1 // * + ? {n}
22 #define OP_OR                   MAX_CHAR+2
23 #define OP_EXPR                 MAX_CHAR+3 //parentesis ()
24 #define OP_NOCAPEXPR    MAX_CHAR+4 //parentesis (?:)
25 #define OP_DOT                  MAX_CHAR+5
26 #define OP_CLASS                MAX_CHAR+6
27 #define OP_CCLASS               MAX_CHAR+7
28 #define OP_NCLASS               MAX_CHAR+8 //negates class the [^
29 #define OP_RANGE                MAX_CHAR+9
30 #define OP_CHAR                 MAX_CHAR+10
31 #define OP_EOL                  MAX_CHAR+11
32 #define OP_BOL                  MAX_CHAR+12
33 #define OP_WB                   MAX_CHAR+13
34
35 #define SQREX_SYMBOL_ANY_CHAR '.'
36 #define SQREX_SYMBOL_GREEDY_ONE_OR_MORE '+'
37 #define SQREX_SYMBOL_GREEDY_ZERO_OR_MORE '*'
38 #define SQREX_SYMBOL_GREEDY_ZERO_OR_ONE '?'
39 #define SQREX_SYMBOL_BRANCH '|'
40 #define SQREX_SYMBOL_END_OF_STRING '$'
41 #define SQREX_SYMBOL_BEGINNING_OF_STRING '^'
42 #define SQREX_SYMBOL_ESCAPE_CHAR '\\'
43
44
45 typedef int SQRexNodeType;
46
47 typedef struct tagSQRexNode{
48         SQRexNodeType type;
49         long left;
50         long right;
51         int next;
52 }SQRexNode;
53
54 struct SQRex{
55         const SQChar *_eol;
56         const SQChar *_bol;
57         const SQChar *_p;
58         int _first;
59         int _op;
60         SQRexNode *_nodes;
61         int _nallocated;
62         int _nsize;
63         int _nsubexpr;
64         SQRexMatch *_matches;
65         int _currsubexp;
66         void *_jmpbuf;
67         const SQChar **_error;
68 };
69
70 static int sqstd_rex_list(SQRex *exp);
71
72 static int sqstd_rex_newnode(SQRex *exp, SQRexNodeType type)
73 {
74         SQRexNode n;
75         n.type = type;
76         n.next = n.right = n.left = -1;
77         if(type == OP_EXPR)
78                 n.right = exp->_nsubexpr++;
79         if(exp->_nallocated < (exp->_nsize + 1)) {
80                 int oldsize = exp->_nallocated;
81                 exp->_nallocated *= 2;
82                 exp->_nodes = (SQRexNode *)sq_realloc(exp->_nodes, oldsize * sizeof(SQRexNode) ,exp->_nallocated * sizeof(SQRexNode));
83         }
84         exp->_nodes[exp->_nsize++] = n;
85         return (int)exp->_nsize - 1;
86 }
87
88 static void sqstd_rex_error(SQRex *exp,const SQChar *error)
89 {
90         if(exp->_error) *exp->_error = error;
91         longjmp(*((jmp_buf*)exp->_jmpbuf),-1);
92 }
93
94 static void sqstd_rex_expect(SQRex *exp, int n){
95         if((*exp->_p) != n) 
96                 sqstd_rex_error(exp, _SC("expected paren"));
97         exp->_p++;
98 }
99
100 static SQBool sqstd_rex_ischar(SQChar c)
101 {
102         switch(c) {
103         case SQREX_SYMBOL_BRANCH:case SQREX_SYMBOL_GREEDY_ZERO_OR_MORE:
104         case SQREX_SYMBOL_GREEDY_ZERO_OR_ONE:case SQREX_SYMBOL_GREEDY_ONE_OR_MORE:
105         case SQREX_SYMBOL_BEGINNING_OF_STRING:case SQREX_SYMBOL_END_OF_STRING:
106         case SQREX_SYMBOL_ANY_CHAR:case SQREX_SYMBOL_ESCAPE_CHAR:case '(':case ')':case '[':case '{': case '}':
107                 return SQFalse;
108     }
109         return SQTrue;
110 }
111
112 static SQChar sqstd_rex_escapechar(SQRex *exp)
113 {
114         if(*exp->_p == SQREX_SYMBOL_ESCAPE_CHAR){
115                 exp->_p++;
116                 switch(*exp->_p) {
117                 case 'v': exp->_p++; return '\v';
118                 case 'n': exp->_p++; return '\n';
119                 case 't': exp->_p++; return '\t';
120                 case 'r': exp->_p++; return '\r';
121                 case 'f': exp->_p++; return '\f';
122                 default: return (*exp->_p++);
123                 }
124         } else if(!sqstd_rex_ischar(*exp->_p)) sqstd_rex_error(exp,_SC("letter expected"));
125         return (*exp->_p++);
126 }
127
128 static int sqstd_rex_charclass(SQRex *exp,int classid)
129 {
130         int n = sqstd_rex_newnode(exp,OP_CCLASS);
131         exp->_nodes[n].left = classid;
132         return n;
133 }
134
135 static int sqstd_rex_charnode(SQRex *exp,SQBool isclass)
136 {
137         if(*exp->_p == SQREX_SYMBOL_ESCAPE_CHAR) {
138                 exp->_p++;
139                 switch(*exp->_p) {
140                         case 'n': exp->_p++; return sqstd_rex_newnode(exp,'\n');
141                         case 't': exp->_p++; return sqstd_rex_newnode(exp,'\t');
142                         case 'r': exp->_p++; return sqstd_rex_newnode(exp,'\r');
143                         case 'f': exp->_p++; return sqstd_rex_newnode(exp,'\f');
144                         case 'v': exp->_p++; return sqstd_rex_newnode(exp,'\v');
145                         case 'a': case 'A': case 'w': case 'W': case 's': case 'S': 
146                         case 'd': case 'D': case 'x': case 'X': case 'c': case 'C': 
147                         case 'p': case 'P': case 'l': case 'u': 
148                                 {
149                                 SQChar t = *exp->_p;
150                                 exp->_p++; 
151                                 return sqstd_rex_charclass(exp,t);
152                                 }
153                         case 'b': 
154                         case 'B':
155                                 if(!isclass) {
156                                         int node = sqstd_rex_newnode(exp,OP_WB);
157                                         exp->_nodes[node].left = *exp->_p;
158                                         exp->_p++; 
159                                         return node;
160                                 } //else default
161                         default: return sqstd_rex_newnode(exp,(*exp->_p++));
162                 }
163         }
164         else if(!sqstd_rex_ischar(*exp->_p)) {
165                 
166                 sqstd_rex_error(exp,_SC("letter expected"));
167         }
168         return sqstd_rex_newnode(exp,*exp->_p++);
169 }
170 static int sqstd_rex_class(SQRex *exp)
171 {
172         int ret = -1;
173         int first = -1,chain;
174         if(*exp->_p == SQREX_SYMBOL_BEGINNING_OF_STRING){
175                 ret = sqstd_rex_newnode(exp,OP_NCLASS);
176                 exp->_p++;
177         }else ret = sqstd_rex_newnode(exp,OP_CLASS);
178         
179         if(*exp->_p == ']' || *exp->_p == '-'){
180                 first = *exp->_p;
181                 exp->_p++;
182         }
183         chain = ret;
184         while(*exp->_p != ']' && exp->_p != exp->_eol) {
185                 if(*exp->_p == '-' && first != -1){ 
186                         int r;
187                         if(*exp->_p++ == ']') sqstd_rex_error(exp,_SC("unfinished range"));
188                         r = sqstd_rex_newnode(exp,OP_RANGE);
189                         if(first>*exp->_p) sqstd_rex_error(exp,_SC("invalid range"));
190                         if(exp->_nodes[first].type == OP_CCLASS) sqstd_rex_error(exp,_SC("cannot use character classes in ranges"));
191                         exp->_nodes[r].left = exp->_nodes[first].type;
192                         exp->_nodes[r].right = sqstd_rex_escapechar(exp);
193             exp->_nodes[chain].next = r;
194                         chain = r;
195                         first = -1;
196                 }
197                 else{
198                         if(first!=-1){
199                                 int c = first;
200                                 exp->_nodes[chain].next = c;
201                                 chain = c;
202                                 first = sqstd_rex_charnode(exp,SQTrue);
203                         }
204                         else{
205                                 first = sqstd_rex_charnode(exp,SQTrue);
206                         }
207                 }
208         }
209         if(first!=-1){
210                 int c = first;
211                 exp->_nodes[chain].next = c;
212                 chain = c;
213                 first = -1;
214         }
215         /* hack? */
216         exp->_nodes[ret].left = exp->_nodes[ret].next;
217         exp->_nodes[ret].next = -1;
218         return ret;
219 }
220
221 static int sqstd_rex_parsenumber(SQRex *exp)
222 {
223         int ret = *exp->_p-'0';
224         int positions = 10;
225         exp->_p++;
226         while(isdigit(*exp->_p)) {
227                 ret = ret*10+(*exp->_p++-'0');
228                 if(positions==1000000000) sqstd_rex_error(exp,_SC("overflow in numeric constant"));
229                 positions *= 10;
230         };
231         return ret;
232 }
233
234 static int sqstd_rex_element(SQRex *exp)
235 {
236         int ret;
237         switch(*exp->_p)
238         {
239         case '(': {
240                 int expr;
241                 exp->_p++;
242                 
243                 
244                 if(*exp->_p =='?') {
245                         exp->_p++;
246                         sqstd_rex_expect(exp,':');
247                         expr = sqstd_rex_newnode(exp,OP_NOCAPEXPR);
248                 }
249                 else
250                         expr = sqstd_rex_newnode(exp,OP_EXPR);
251                 exp->_nodes[expr].left = sqstd_rex_list(exp);
252                 ret = expr;
253                 sqstd_rex_expect(exp,')');
254         }
255                 break;
256         case '[':
257                 exp->_p++;
258                 ret = sqstd_rex_class(exp);
259                 sqstd_rex_expect(exp,']');
260                 break;
261         case SQREX_SYMBOL_END_OF_STRING: exp->_p++; ret = sqstd_rex_newnode(exp,OP_EOL);break;
262         case SQREX_SYMBOL_ANY_CHAR: exp->_p++; ret = sqstd_rex_newnode(exp,OP_DOT);break;
263         default:
264                 ret = sqstd_rex_charnode(exp,SQFalse);
265                 break;
266         }
267         /* scope block */
268         {
269                 int op;
270                 unsigned short p0 = 0, p1 = 0;
271                 switch(*exp->_p){
272                 case SQREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; goto __end;
273                 case SQREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; goto __end;
274                 case SQREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; goto __end;
275                 case '{':{
276                         exp->_p++;
277                         if(!isdigit(*exp->_p)) sqstd_rex_error(exp,_SC("number expected"));
278                         p0 = sqstd_rex_parsenumber(exp);
279                         switch(*exp->_p) {
280                         case '}':
281                                 p1 = p0; exp->_p++;
282                                 goto __end;
283                         case ',':
284                                 exp->_p++;
285                                 p1 = 0xFFFF;
286                                 if(isdigit(*exp->_p)){
287                                         p1 = sqstd_rex_parsenumber(exp);
288                                 }
289                                 sqstd_rex_expect(exp,'}');
290                                 goto __end;
291                         default:
292                                 sqstd_rex_error(exp,_SC(", or } expected"));
293                         }
294                 }
295                 __end: {
296                                 int nnode = sqstd_rex_newnode(exp,OP_GREEDY);
297                                 op = OP_GREEDY;
298                                 exp->_nodes[nnode].left = ret;
299                                 exp->_nodes[nnode].right = ((p0)<<16)|p1;
300                                 ret = nnode;
301                         }
302                 }
303         }
304         if(*exp->_p != SQREX_SYMBOL_BRANCH && *exp->_p != ')' && *exp->_p != SQREX_SYMBOL_GREEDY_ZERO_OR_MORE && *exp->_p != SQREX_SYMBOL_GREEDY_ONE_OR_MORE && *exp->_p != '\0')
305                 exp->_nodes[ret].next = sqstd_rex_element(exp);
306         return ret;
307 }
308
309 static int sqstd_rex_list(SQRex *exp)
310 {
311         int ret=-1,e;
312         if(*exp->_p == SQREX_SYMBOL_BEGINNING_OF_STRING) {
313                 exp->_p++;
314                 ret = sqstd_rex_newnode(exp,OP_BOL);
315         }
316         e = sqstd_rex_element(exp);
317         if(ret != -1) {
318                 exp->_nodes[ret].next = e;
319         }
320         else ret = e;
321
322         if(*exp->_p == SQREX_SYMBOL_BRANCH) {
323                 int temp;
324                 exp->_p++;
325                 temp = sqstd_rex_newnode(exp,OP_OR);
326                 exp->_nodes[temp].left = ret;
327                 exp->_nodes[temp].right = sqstd_rex_list(exp);
328                 ret = temp;
329         }
330         return ret;
331 }
332
333 static SQBool sqstd_rex_matchcclass(int cclass,SQChar c)
334 {
335         switch(cclass) {
336         case 'a': return isalpha(c)?SQTrue:SQFalse;
337         case 'A': return !isalpha(c)?SQTrue:SQFalse;
338         case 'w': return (isalnum(c) || c == '_')?SQTrue:SQFalse;
339         case 'W': return (!isalnum(c) && c != '_')?SQTrue:SQFalse;
340         case 's': return isspace(c)?SQTrue:SQFalse;
341         case 'S': return !isspace(c)?SQTrue:SQFalse;
342         case 'd': return isdigit(c)?SQTrue:SQFalse;
343         case 'D': return !isdigit(c)?SQTrue:SQFalse;
344         case 'x': return isxdigit(c)?SQTrue:SQFalse;
345         case 'X': return !isxdigit(c)?SQTrue:SQFalse;
346         case 'c': return iscntrl(c)?SQTrue:SQFalse;
347         case 'C': return !iscntrl(c)?SQTrue:SQFalse;
348         case 'p': return ispunct(c)?SQTrue:SQFalse;
349         case 'P': return !ispunct(c)?SQTrue:SQFalse;
350         case 'l': return islower(c)?SQTrue:SQFalse;
351         case 'u': return isupper(c)?SQTrue:SQFalse;
352         }
353         return SQFalse; /*cannot happen*/
354 }
355
356 static SQBool sqstd_rex_matchclass(SQRex* exp,SQRexNode *node,SQChar c)
357 {
358         do {
359                 switch(node->type) {
360                         case OP_RANGE:
361                                 if(c >= node->left && c <= node->right) return SQTrue;
362                                 break;
363                         case OP_CCLASS:
364                                 if(sqstd_rex_matchcclass(node->left,c)) return SQTrue;
365                                 break;
366                         default:
367                                 if(c == node->type)return SQTrue;
368                 }
369         } while((node->next != -1) && (node = &exp->_nodes[node->next]));
370         return SQFalse;
371 }
372
373 static const SQChar *sqstd_rex_matchnode(SQRex* exp,SQRexNode *node,const SQChar *str)
374 {
375         SQRexNodeType type = node->type;
376         switch(type) {
377         case OP_GREEDY: {
378                 int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0;
379                 const SQChar *s=str, *good = str;
380                 while((nmaches == 0xFFFF || nmaches < p1) 
381                         && (s = sqstd_rex_matchnode(exp,&exp->_nodes[node->left],s))) {
382                         good=s;
383                         nmaches++;
384                         if(s >= exp->_eol)
385                                 break;
386                 }
387                 if(p0 == p1 && p0 == nmaches) return good;
388                 else if(nmaches >= p0 && p1 == 0xFFFF) return good;
389                 else if(nmaches >= p0 && nmaches <= p1) return good;
390                 return NULL;
391         }
392         case OP_OR: {
393                         const SQChar *asd = str;
394                         SQRexNode *temp=&exp->_nodes[node->left];
395                         while( (asd = sqstd_rex_matchnode(exp,temp,asd)) ) {
396                                 if(temp->next != -1)
397                                         temp = &exp->_nodes[temp->next];
398                                 else
399                                         return asd;
400                         }
401                         asd = str;
402                         temp = &exp->_nodes[node->right];
403                         while( (asd = sqstd_rex_matchnode(exp,temp,asd)) ) {
404                                 if(temp->next != -1)
405                                         temp = &exp->_nodes[temp->next];
406                                 else
407                                         return asd;
408                         }
409                         return NULL;
410                         break;
411         }
412         case OP_EXPR:
413         case OP_NOCAPEXPR:{
414                         SQRexNode *n = &exp->_nodes[node->left];
415                         const SQChar *cur = str;
416                         int capture = -1;
417                         if(node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {
418                                 capture = exp->_currsubexp;
419                                 exp->_matches[capture].begin = cur;
420                                 exp->_currsubexp++;
421                         }
422
423                         do {
424                                 if(!(cur = sqstd_rex_matchnode(exp,n,cur))) {
425                                         if(capture != -1){
426                                                 exp->_matches[capture].begin = 0;
427                                                 exp->_matches[capture].len = 0;
428                                         }
429                                         return NULL;
430                                 }
431                         } while((n->next != -1) && (n = &exp->_nodes[n->next]));
432
433                         if(capture != -1) 
434                                 exp->_matches[capture].len = cur - exp->_matches[capture].begin;
435                         return cur;
436         }                                
437         case OP_WB:
438                 if((str == exp->_bol && !isspace(*str))
439                  || (str == exp->_eol && !isspace(*(str-1)))
440                  || ((!isspace(*str) && isspace(*(str+1))))
441                  || ((isspace(*str) && !isspace(*(str+1)))) ) {
442                         return (node->left == 'b')?str:NULL;
443                 }
444                 return (node->left == 'b')?NULL:str;
445         case OP_BOL:
446                 if(str == exp->_bol) return str;
447                 return NULL;
448         case OP_EOL:
449                 if(str == exp->_eol) return str;
450                 return NULL;
451         case OP_DOT:
452                 *str++;
453                 return str;
454         case OP_NCLASS:
455         case OP_CLASS:
456                 if(sqstd_rex_matchclass(exp,&exp->_nodes[node->left],*str)?(type == OP_CLASS?SQTrue:SQFalse):(type == OP_NCLASS?SQTrue:SQFalse)) {
457                         *str++;
458                         return str;
459                 }
460                 return NULL;
461         case OP_CCLASS:
462                 if(sqstd_rex_matchcclass(node->left,*str)) {
463                         *str++;
464                         return str;
465                 }
466                 return NULL;
467         default: /* char */
468                 if(*str != node->type) return NULL;
469                 *str++;
470                 return str;
471         }
472         return NULL;
473 }
474
475 /* public api */
476 SQRex *sqstd_rex_compile(const SQChar *pattern,const SQChar **error)
477 {
478         SQRex *exp = (SQRex *)sq_malloc(sizeof(SQRex));
479         exp->_p = pattern;
480         exp->_nallocated = (int)scstrlen(pattern) * sizeof(SQChar);
481         exp->_nodes = (SQRexNode *)sq_malloc(exp->_nallocated * sizeof(SQRexNode));
482         exp->_nsize = 0;
483         exp->_matches = 0;
484         exp->_nsubexpr = 0;
485         exp->_first = sqstd_rex_newnode(exp,OP_EXPR);
486         exp->_error = error;
487         exp->_jmpbuf = sq_malloc(sizeof(jmp_buf));
488         if(setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) {
489                 exp->_nodes[exp->_first].left=sqstd_rex_list(exp);
490                 if(*exp->_p!='\0')
491                         sqstd_rex_error(exp,_SC("unexpected character"));
492 #ifdef _DEBUG
493                 {
494                         int nsize,i;
495                         SQRexNode *t;
496                         nsize = exp->_nsize;
497                         t = &exp->_nodes[0];
498                         scprintf(_SC("\n"));
499                         for(i = 0;i < nsize; i++) {
500                                 if(exp->_nodes[i].type>MAX_CHAR)
501                                         scprintf(_SC("[%02d] %10s "),i,g_nnames[exp->_nodes[i].type-MAX_CHAR]);
502                                 else
503                                         scprintf(_SC("[%02d] %10c "),i,exp->_nodes[i].type);
504                                 scprintf(_SC("left %02d right %02d next %02d\n"),exp->_nodes[i].left,exp->_nodes[i].right,exp->_nodes[i].next);
505                         }
506                         scprintf(_SC("\n"));
507                 }
508 #endif
509                 exp->_matches = (SQRexMatch *) sq_malloc(exp->_nsubexpr * sizeof(SQRexMatch));
510                 memset(exp->_matches,0,exp->_nsubexpr * sizeof(SQRexMatch));
511         }
512         else{
513                 sqstd_rex_free(exp);
514                 return NULL;
515         }
516         return exp;
517 }
518
519 void sqstd_rex_free(SQRex *exp)
520 {
521         if(exp) {
522                 if(exp->_nodes) sq_free(exp->_nodes,exp->_nallocated * sizeof(SQRexNode));
523                 if(exp->_jmpbuf) sq_free(exp->_jmpbuf,sizeof(jmp_buf));
524                 if(exp->_matches) sq_free(exp->_matches,exp->_nsubexpr * sizeof(SQRexMatch));
525                 sq_free(exp,sizeof(SQRex));
526         }
527 }
528
529 SQBool sqstd_rex_match(SQRex* exp,const SQChar* text)
530 {
531         const SQChar* res = NULL;
532         exp->_bol = text;
533         exp->_eol = text + scstrlen(text);
534         exp->_currsubexp = 0;
535         res = sqstd_rex_matchnode(exp,exp->_nodes,text);
536         if(res == NULL || res != exp->_eol)
537                 return SQFalse;
538         return SQTrue;
539 }
540
541 SQBool sqstd_rex_searchrange(SQRex* exp,const SQChar* text_begin,const SQChar* text_end,const SQChar** out_begin, const SQChar** out_end)
542 {
543         const SQChar *cur = NULL;
544         int node = exp->_first;
545         if(text_begin >= text_end) return SQFalse;
546         exp->_bol = text_begin;
547         exp->_eol = text_end;
548         do {
549                 cur = text_begin;
550                 while(node != -1) {
551                         exp->_currsubexp = 0;
552                         cur = sqstd_rex_matchnode(exp,&exp->_nodes[node],cur);
553                         if(!cur)
554                                 break;
555                         node = exp->_nodes[node].next;
556                 }
557                 *text_begin++;
558         } while(cur == NULL && text_begin != text_end);
559
560         if(cur == NULL)
561                 return SQFalse;
562
563         --text_begin;
564
565         if(out_begin) *out_begin = text_begin;
566         if(out_end) *out_end = cur;
567         return SQTrue;
568 }
569
570 SQBool sqstd_rex_search(SQRex* exp,const SQChar* text, const SQChar** out_begin, const SQChar** out_end)
571 {
572         return sqstd_rex_searchrange(exp,text,text + scstrlen(text),out_begin,out_end);
573 }
574
575 int sqstd_rex_getsubexpcount(SQRex* exp)
576 {
577         return exp->_nsubexpr;
578 }
579
580 SQBool sqstd_rex_getsubexp(SQRex* exp, int n, SQRexMatch *subexp)
581 {
582         if( n<0 || n >= exp->_nsubexpr) return SQFalse;
583         *subexp = exp->_matches[n];
584         return SQTrue;
585 }
586