OSSIM - Open Source Software Image Map  Version 1.9.0 (20180803)
ossimRegExp.cpp
Go to the documentation of this file.
1 
2 //
3 // Copyright (C) 1991 Texas Instruments Incorporated.
4 //
5 // Permission is granted to any individual or institution to use, copy, modify,
6 // and distribute this software, provided that this complete copyright and
7 // permission notice is maintained, intact, in all copies and supporting
8 // documentation.
9 //
10 // Texas Instruments Incorporated provides this software "as is" without
11 // express or implied warranty.
12 //
13 //
14 // Created: MNF 06/13/89 Initial Design and Implementation
15 // Updated: LGO 08/09/89 Inherit from Generic
16 // Updated: MBN 09/07/89 Added conditional exception handling
17 // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
18 // Updated: DLS 03/22/91 New lite version
19 //
20 // This is the header file for the regular expression class. An object of
21 // this class contains a regular expression, in a special "compiled" format.
22 // This compiled format consists of several slots all kept as the objects
23 // private data. The RegExp class provides a convenient way to represent
24 // regular expressions. It makes it easy to search for the same regular
25 // expression in many different strings without having to compile a string to
26 // regular expression format more than necessary.
27 //
28 // A regular expression allows a programmer to specify complex patterns that
29 // can be searched for and matched against the character string of a String
30 // object. In its simplest case, a regular expression is a sequence of
31 // characters with which you can search for exact character matches. However,
32 // many times you may not know the exact sequence you want to find, or you may
33 // only want to find a match at the beginning or end of a String. The RegExp
34 // object allows specification of such patterns by utilizing the following
35 // regular expression meta-characters (note that more one of these
36 // meta-characters can be used in a single regular expression in order to
37 // create complex search patterns):
38 //
39 // ^ Match at beginning of line
40 // $ Match at end of line
41 // . Match any single character
42 // [ ] Match any one character inside the brackets
43 // [^ ] Match any character NOT inside the brackets
44 // - Match any character in range on either side of dash
45 // * Match preceding pattern zero or more times
46 // + Match preceding pattern one or more times
47 // ? Match preceding pattern zero or once only
48 // () Save a matched expression and use it in a further match.
49 //
50 // There are three constructors for RegExp. One just creates an empty RegExp
51 // object. Another creates a RegExp object and initializes it with a regular
52 // expression that is given in the form of a char*. The third takes a
53 // reference to a RegExp object as an argument and creates an object
54 // initialized with the information from the given RegExp object.
55 //
56 // The find member function finds the first occurence of the regualr
57 // expression of that object in the string given to find as an argument. Find
58 // returns a boolean, and if true, mutates the private data appropriately.
59 // Find sets pointers to the beginning and end of the thing last found, they
60 // are pointers into the actual string that was searched. The start and end
61 // member functions return indicies into the searched string that correspond
62 // to the beginning and end pointers respectively. The compile member
63 // function takes a char* and puts the compiled version of the char* argument
64 // into the object's private data fields. The == and != operators only check
65 // the to see if the compiled regular expression is the same, and the
66 // deep_equal functions also checks to see if the start and end pointers are
67 // the same. The is_valid function returns false if program is set to NULL,
68 // (i.e. there is no valid compiled exression). The set_invalid function sets
69 // the program to NULL (Warning: this deletes the compiled expression). The
70 // following examples may help clarify regular expression usage:
71 //
72 // * The regular expression "^hello" matches a "hello" only at the
73 // beginning of a line. It would match "hello there" but not "hi,
74 // hello there".
75 //
76 // * The regular expression "long$" matches a "long" only at the end
77 // of a line. It would match "so long\0", but not "long ago".
78 //
79 // * The regular expression "t..t..g" will match anything that has a
80 // "t" then any two characters, another "t", any two characters and
81 // then a "g". It will match "testing", or "test again" but would
82 // not match "toasting"
83 //
84 // * The regular expression "[1-9ab]" matches any number one through
85 // nine, and the characters "a" and "b". It would match "hello 1"
86 // or "begin", but would not match "no-match".
87 //
88 // * The regular expression "[^1-9ab]" matches any character that is
89 // not a number one through nine, or an "a" or "b". It would NOT
90 // match "hello 1" or "begin", but would match "no-match".
91 //
92 // * The regular expression "br* " matches something that begins with
93 // a "b", is followed by zero or more "r"s, and ends in a space. It
94 // would match "brrrrr ", and "b ", but would not match "brrh ".
95 //
96 // * The regular expression "br+ " matches something that begins with
97 // a "b", is followed by one or more "r"s, and ends in a space. It
98 // would match "brrrrr ", and "br ", but would not match "b " or
99 // "brrh ".
100 //
101 // * The regular expression "br? " matches something that begins with
102 // a "b", is followed by zero or one "r"s, and ends in a space. It
103 // would match "br ", and "b ", but would not match "brrrr " or
104 // "brrh ".
105 //
106 // * The regular expression "(..p)b" matches something ending with pb
107 // and beginning with whatever the two characters before the first p
108 // encounterd in the line were. It would find "repb" in "rep drepa
109 // qrepb". The regular expression "(..p)a" would find "repa qrepb"
110 // in "rep drepa qrepb"
111 //
112 // * The regular expression "d(..p)" matches something ending with p,
113 // beginning with d, and having two characters in between that are
114 // the same as the two characters before the first p encounterd in
115 // the line. It would match "drepa qrepb" in "rep drepa qrepb".
116 //
117 
118 #include <cstring>
119 #include <cstdio>
120 #include <ossim/base/ossimRegExp.h>
121 #include <iostream>
122 // ossimRegExp -- Copies the given regular expression.
123 
125  regstart(0), // Internal use only
126  reganch(0), // Internal use only
127  regmust(0), // Internal use only
128  regmlen(0), // Internal use only
129  program(0),
130  progsize(0),
131  searchstring(0),
132 
133  // work variables
134  regparse(0),
135  regnpar(0), // () count.
136  regdummy(0),
137  regcode(0), // Code-emit pointer; &regdummy = don't.
138  regsize(0), // Code size.
139  reginput(0), // String-input pointer.
140  regbol(0), // Beginning of input, for ^ check.
141  regstartp(0), // Pointer to startp array.
142  regendp(0) // Ditto for endp.
143 {
144  if(rxp.program)
145  {
146  int ind = 0;
147  this->progsize = rxp.progsize; // Copy regular expression size
148  this->program = new char[this->progsize]; // Allocate storage
149  for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
150  this->program[ind] = rxp.program[ind];
151  this->startp[0] = rxp.startp[0]; // Copy pointers into last
152  this->endp[0] = rxp.endp[0]; // Successful "find" operation
153  this->regmust = rxp.regmust; // Copy field
154  if (rxp.regmust != NULL) {
155  char* dum = rxp.program;
156  ind = 0;
157  while (dum != rxp.regmust) {
158  ++dum;
159  ++ind;
160  }
161  this->regmust = this->program + ind;
162  }
163  this->regstart = rxp.regstart; // Copy starting index
164  this->reganch = rxp.reganch; // Copy remaining private data
165  this->regmlen = rxp.regmlen; // Copy remaining private data
166  }
167 }
168 
169 
170 // operator== -- Returns true if two regular expressions have the same
171 // compiled program for pattern matching.
172 
173 bool ossimRegExp::operator== (const ossimRegExp& rxp) const {
174  if (this != &rxp) { // Same address?
175  ossim_uint32 ind = this->progsize; // Get regular expression size
176  if (ind != rxp.progsize) // If different size regexp
177  return false; // Return failure
178  while(ind-- != 0) // Else while still characters
179  if(this->program[ind] != rxp.program[ind]) // If regexp are different
180  return false; // Return failure
181  }
182  return true; // Else same, return success
183 }
184 
185 
186 // deep_equal -- Returns true if have the same compiled regular expressions
187 // and the same start and end pointers.
188 
189 bool ossimRegExp::deep_equal (const ossimRegExp& rxp) const {
190  ossim_uint32 ind = this->progsize; // Get regular expression size
191  if (ind != rxp.progsize) // If different size regexp
192  return false; // Return failure
193  while(ind-- != 0) // Else while still characters
194  if(this->program[ind] != rxp.program[ind]) // If regexp are different
195  return false; // Return failure
196  return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
197  this->endp[0] == rxp.endp[0]); // Return true
198 }
199 
200 // The remaining code in this file is derived from the regular expression code
201 // whose copyright statement appears below. It has been changed to work
202 // with the class concepts of C++ and COOL.
203 
204 /*
205  * compile and find
206  *
207  * Copyright (c) 1986 by University of Toronto.
208  * Written by Henry Spencer. Not derived from licensed software.
209  *
210  * Permission is granted to anyone to use this software for any
211  * purpose on any computer system, and to redistribute it freely,
212  * subject to the following restrictions:
213  *
214  * 1. The author is not responsible for the consequences of use of
215  * this software, no matter how awful, even if they arise
216  * from defects in it.
217  *
218  * 2. The origin of this software must not be misrepresented, either
219  * by explicit claim or by omission.
220  *
221  * 3. Altered versions must be plainly marked as such, and must not
222  * be misrepresented as being the original software.
223  *
224  * Beware that some of this code is subtly aware of the way operator
225  * precedence is structured in regular expressions. Serious changes in
226  * regular-expression syntax might require a total rethink.
227  */
228 
229 /*
230  * The "internal use only" fields in regexp.h are present to pass info from
231  * compile to execute that permits the execute phase to run lots faster on
232  * simple cases. They are:
233  *
234  * regstart char that must begin a match; '\0' if none obvious
235  * reganch is the match anchored (at beginning-of-line only)?
236  * regmust string (pointer into program) that match must include, or NULL
237  * regmlen length of regmust string
238  *
239  * Regstart and reganch permit very fast decisions on suitable starting points
240  * for a match, cutting down the work a lot. Regmust permits fast rejection
241  * of lines that cannot possibly match. The regmust tests are costly enough
242  * that compile() supplies a regmust only if the r.e. contains something
243  * potentially expensive (at present, the only such thing detected is * or +
244  * at the start of the r.e., which can involve a lot of backup). Regmlen is
245  * supplied because the test in find() needs it and compile() is computing
246  * it anyway.
247  */
248 
249 /*
250  * Structure for regexp "program". This is essentially a linear encoding
251  * of a nondeterministic finite-state machine (aka syntax charts or
252  * "railroad normal form" in parsing technology). Each node is an opcode
253  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
254  * all nodes except BRANCH implement concatenation; a "next" pointer with
255  * a BRANCH on both ends of it is connecting two alternatives. (Here we
256  * have one of the subtle syntax dependencies: an individual BRANCH (as
257  * opposed to a collection of them) is never concatenated with anything
258  * because of operator precedence.) The operand of some types of node is
259  * a literal string; for others, it is a node leading into a sub-FSM. In
260  * particular, the operand of a BRANCH node is the first node of the branch.
261  * (NB this is *not* a tree structure: the tail of the branch connects
262  * to the thing following the set of BRANCHes.) The opcodes are:
263  */
264 
265 // definition number opnd? meaning
266 #define END 0 // no End of program.
267 #define BOL 1 // no Match "" at beginning of line.
268 #define EOL 2 // no Match "" at end of line.
269 #define ANY 3 // no Match any one character.
270 #define ANYOF 4 // str Match any character in this string.
271 #define ANYBUT 5 // str Match any character not in this
272  // string.
273 #define BRANCH 6 // node Match this alternative, or the
274  // next...
275 #define BACK 7 // no Match "", "next" ptr points backward.
276 #define EXACTLY 8 // str Match this string.
277 #define NOTHING 9 // no Match empty string.
278 #define STAR 10 // node Match this (simple) thing 0 or more
279  // times.
280 #define PLUS 11 // node Match this (simple) thing 1 or more
281  // times.
282 #define OPEN 20 // no Mark this point in input as start of
283  // #n.
284 // OPEN+1 is number 1, etc.
285 #define CLOSE 30 // no Analogous to OPEN.
286 
287 /*
288  * Opcode notes:
289  *
290  * BRANCH The set of branches constituting a single choice are hooked
291  * together with their "next" pointers, since precedence prevents
292  * anything being concatenated to any individual branch. The
293  * "next" pointer of the last BRANCH in a choice points to the
294  * thing following the whole choice. This is also where the
295  * final "next" pointer of each individual branch points; each
296  * branch starts with the operand node of a BRANCH node.
297  *
298  * BACK Normal "next" pointers all implicitly point forward; BACK
299  * exists to make loop structures possible.
300  *
301  * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
302  * BRANCH structures using BACK. Simple cases (one character
303  * per match) are implemented with STAR and PLUS for speed
304  * and to minimize recursive plunges.
305  *
306  * OPEN,CLOSE ...are numbered at compile time.
307  */
308 
309 /*
310  * A node is one char of opcode followed by two chars of "next" pointer.
311  * "Next" pointers are stored as two 8-bit pieces, high order first. The
312  * value is a positive offset from the opcode of the node containing it.
313  * An operand, if any, simply follows the node. (Note that much of the
314  * code generation knows about this implicit relationship.)
315  *
316  * Using two bytes for the "next" pointer is vast overkill for most things,
317  * but allows patterns to get big without disasters.
318  */
319 
320 #define OP(p) (*(p))
321 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
322 #define OPERAND(p) ((p) + 3)
323 
324 const unsigned char MAGIC = 0234;
325 /*
326  * Utility definitions.
327  */
328 
329 #define UCHARAT(p) ((const unsigned char*)(p))[0]
330 
331 
332 #define FAIL(m) { regerror(m); return(NULL); }
333 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
334 #define META "^$.[()|?+*\\"
335 
336 
337 /*
338  * Flags to be passed up and down.
339  */
340 #define HASWIDTH 01 // Known never to match null string.
341 #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
342 #define SPSTART 04 // Starts with * or +.
343 #define WORST 0 // Worst case.
344 
345 
346 
348 //
349 // COMPILE AND ASSOCIATED FUNCTIONS
350 //
352 
353 
354 /*
355  * Global work variables for compile().
356  */
357 // static const char* regparse; // Input-scan pointer.
358 // static int regnpar; // () count.
359 // static char regdummy;
360 // static char* regcode; // Code-emit pointer; &regdummy = don't.
361 // static long regsize; // Code size.
362 
363 /*
364  * Forward declarations for compile()'s friends.
365  */
366 // #ifndef static
367 // #define static static
368 // #endif
369 // static char* reg (int, int*);
370 // static char* regbranch (int*);
371 // static char* regpiece (int*);
372 // static char* regatom (int*);
373 // static char* regnode (char);
374 // static const char* regnext (const char*);
375 // static char* regnext (char*);
376 // static void regc (unsigned char);
377 // static void reginsert (char, char*);
378 // static void regtail (char*, const char*);
379 // static void regoptail (char*, const char*);
380 
381 // #ifdef STRCSPN
382 // static int strcspn ();
383 // #endif
384 
385 
386 
387 /*
388  * We can't allocate space until we know how big the compiled form will be,
389  * but we can't compile it (and thus know how big it is) until we've got a
390  * place to put the code. So we cheat: we compile it twice, once with code
391  * generation turned off and size counting turned on, and once "for real".
392  * This also means that we don't allocate space until we are sure that the
393  * thing really will compile successfully, and we never have to move the
394  * code and thus invalidate pointers into it. (Note that it has to be in
395  * one piece because free() must be able to free it all.)
396  *
397  * Beware that the optimization-preparation code in here knows about some
398  * of the structure of the compiled regexp.
399  */
400 
401 
402 // compile -- compile a regular expression into internal code
403 // for later pattern matching.
404 
405 void ossimRegExp::compile (const char* exp) {
406  const char* scan;
407  const char* longest;
408  unsigned long len;
409  int flags;
410 
411  if (exp == NULL) {
412  //RAISE Error, SYM(ossimRegExp), SYM(No_Expr),
413  printf ("ossimRegExp::compile(): No expression supplied.\n");
414  return;
415  }
416 
417  // First pass: determine size, legality.
418  regparse = exp;
419  regnpar = 1;
420  regsize = 0L;
421  regcode = &regdummy;
422  regc(MAGIC);
423  if(!reg(0, &flags))
424  {
425  printf ("ossimRegExp::compile(): Error in compile.\n");
426  return;
427  }
428  this->startp[0] = this->endp[0] = this->searchstring = NULL;
429 
430  // Small enough for pointer-storage convention?
431  if (regsize >= 32767L) { // Probably could be 65535L.
432  //RAISE Error, SYM(ossimRegExp), SYM(Expr_Too_Big),
433  printf ("ossimRegExp::compile(): Expression too big.\n");
434  return;
435  }
436 
437  // Allocate space.
438 //#ifndef WIN32
439  if (this->program != NULL) delete [] this->program;
440 //#endif
441  this->program = new char[regsize];
442  this->progsize = (int) regsize;
443 
444  if (this->program == NULL) {
445  //RAISE Error, SYM(ossimRegExp), SYM(Out_Of_Memory),
446  printf ("ossimRegExp::compile(): Out of memory.\n");
447  return;
448  }
449 
450  // Second pass: emit code.
451  regparse = exp;
452  regnpar = 1;
453  regcode = this->program;
454  regc(MAGIC);
455  reg(0, &flags);
456 
457  // Dig out information for optimizations.
458  this->regstart = '\0'; // Worst-case defaults.
459  this->reganch = 0;
460  this->regmust = NULL;
461  this->regmlen = 0;
462  scan = this->program + 1; // First BRANCH.
463  if (OP(regnext(scan)) == END) { // Only one top-level choice.
464  scan = OPERAND(scan);
465 
466  // Starting-point info.
467  if (OP(scan) == EXACTLY)
468  this->regstart = *OPERAND(scan);
469  else if (OP(scan) == BOL)
470  this->reganch++;
471 
472  //
473  // If there's something expensive in the r.e., find the longest
474  // literal string that must appear and make it the regmust. Resolve
475  // ties in favor of later strings, since the regstart check works
476  // with the beginning of the r.e. and avoiding duplication
477  // strengthens checking. Not a strong reason, but sufficient in the
478  // absence of others.
479  //
480  if (flags & SPSTART) {
481  longest = NULL;
482  len = 0;
483  for (; scan != NULL; scan = regnext(scan))
484  if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
485  longest = OPERAND(scan);
486  len = (unsigned long)strlen(OPERAND(scan));
487  }
488  this->regmust = longest;
489  this->regmlen = len;
490  }
491  }
492 }
493 
494 
495 /*
496  - reg - regular expression, i.e. main body or parenthesized thing
497  *
498  * Caller must absorb opening parenthesis.
499  *
500  * Combining parenthesis handling with the base level of regular expression
501  * is a trifle forced, but the need to tie the tails of the branches to what
502  * follows makes it hard to avoid.
503  */
504 char* ossimRegExp::reg (int paren, int *flagp) {
505  char* ret;
506  char* br;
507  char* ender;
508  int parno =0;
509  int flags;
510 
511  *flagp = HASWIDTH; // Tentatively.
512 
513  // Make an OPEN node, if parenthesized.
514  if (paren) {
515  if (regnpar >= NSUBEXP) {
516  //RAISE Error, SYM(ossimRegExp), SYM(Too_Many_Parens),
517  printf ("ossimRegExp::compile(): Too many parentheses.\n");
518  return 0;
519  }
520  parno = regnpar;
521  regnpar++;
522  ret = regnode(OPEN + parno);
523  }
524  else
525  ret = NULL;
526 
527  // Pick up the branches, linking them together.
528  br = regbranch(&flags);
529  if (br == NULL)
530  return (NULL);
531  if (ret != NULL)
532  regtail(ret, br); // OPEN -> first.
533  else
534  ret = br;
535  if (!(flags & HASWIDTH))
536  *flagp &= ~HASWIDTH;
537  *flagp |= flags & SPSTART;
538  while (*regparse == '|') {
539  regparse++;
540  br = regbranch(&flags);
541  if (br == NULL)
542  return (NULL);
543  regtail(ret, br); // BRANCH -> BRANCH.
544  if (!(flags & HASWIDTH))
545  *flagp &= ~HASWIDTH;
546  *flagp |= flags & SPSTART;
547  }
548 
549  // Make a closing node, and hook it on the end.
550  ender = regnode((paren) ? CLOSE + parno : END);
551  regtail(ret, ender);
552 
553  // Hook the tails of the branches to the closing node.
554  for (br = ret; br != NULL; br = regnext(br))
555  regoptail(br, ender);
556 
557  // Check for proper termination.
558  if (paren && *regparse++ != ')') {
559  //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens),
560  printf ("ossimRegExp::compile(): Unmatched parentheses.\n");
561  return 0;
562  }
563  else if (!paren && *regparse != '\0') {
564  if (*regparse == ')') {
565  //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens),
566  printf ("ossimRegExp::compile(): Unmatched parentheses.\n");
567  return 0;
568  }
569  else {
570  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
571  printf ("ossimRegExp::compile(): Internal error.\n");
572  return 0;
573  }
574  // NOTREACHED
575  }
576  return (ret);
577 }
578 
579 
580 /*
581  - regbranch - one alternative of an | operator
582  *
583  * Implements the concatenation operator.
584  */
585 char* ossimRegExp::regbranch (int *flagp) {
586  char* ret;
587  char* chain;
588  char* latest;
589  int flags;
590 
591  *flagp = WORST; // Tentatively.
592 
593  ret = regnode(BRANCH);
594  chain = NULL;
595  while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
596  latest = regpiece(&flags);
597  if (latest == NULL)
598  return (NULL);
599  *flagp |= flags & HASWIDTH;
600  if (chain == NULL) // First piece.
601  *flagp |= flags & SPSTART;
602  else
603  regtail(chain, latest);
604  chain = latest;
605  }
606  if (chain == NULL) // Loop ran zero times.
607  regnode(NOTHING);
608 
609  return (ret);
610 }
611 
612 
613 /*
614  - regpiece - something followed by possible [*+?]
615  *
616  * Note that the branching code sequences used for ? and the general cases
617  * of * and + are somewhat optimized: they use the same NOTHING node as
618  * both the endmarker for their branch list and the body of the last branch.
619  * It might seem that this node could be dispensed with entirely, but the
620  * endmarker role is not redundant.
621  */
622 char* ossimRegExp::regpiece (int *flagp) {
623  char* ret;
624  char op;
625  char* next;
626  int flags;
627 
628  ret = regatom(&flags);
629  if (ret == NULL)
630  return (NULL);
631 
632  op = *regparse;
633  if (!ISMULT(op)) {
634  *flagp = flags;
635  return (ret);
636  }
637 
638  if (!(flags & HASWIDTH) && op != '?') {
639  //RAISE Error, SYM(ossimRegExp), SYM(Empty_Operand),
640  printf ("ossimRegExp::compile() : *+ operand could be empty.\n");
641  return 0;
642  }
643  *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
644 
645  if (op == '*' && (flags & SIMPLE))
646  reginsert(STAR, ret);
647  else if (op == '*') {
648  // Emit x* as (x&|), where & means "self".
649  reginsert(BRANCH, ret); // Either x
650  regoptail(ret, regnode(BACK)); // and loop
651  regoptail(ret, ret); // back
652  regtail(ret, regnode(BRANCH)); // or
653  regtail(ret, regnode(NOTHING)); // null.
654  }
655  else if (op == '+' && (flags & SIMPLE))
656  reginsert(PLUS, ret);
657  else if (op == '+') {
658  // Emit x+ as x(&|), where & means "self".
659  next = regnode(BRANCH); // Either
660  regtail(ret, next);
661  regtail(regnode(BACK), ret); // loop back
662  regtail(next, regnode(BRANCH)); // or
663  regtail(ret, regnode(NOTHING)); // null.
664  }
665  else if (op == '?') {
666  // Emit x? as (x|)
667  reginsert(BRANCH, ret); // Either x
668  regtail(ret, regnode(BRANCH)); // or
669  next = regnode(NOTHING);// null.
670  regtail(ret, next);
671  regoptail(ret, next);
672  }
673  regparse++;
674  if (ISMULT(*regparse)) {
675  //RAISE Error, SYM(ossimRegExp), SYM(Nested_Operand),
676  printf ("ossimRegExp::compile(): Nested *?+.\n");
677  return 0;
678  }
679  return (ret);
680 }
681 
682 
683 /*
684  - regatom - the lowest level
685  *
686  * Optimization: gobbles an entire sequence of ordinary characters so that
687  * it can turn them into a single node, which is smaller to store and
688  * faster to run. Backslashed characters are exceptions, each becoming a
689  * separate node; the code is simpler that way and it's not worth fixing.
690  */
691 char* ossimRegExp::regatom (int *flagp) {
692  char* ret;
693  int flags;
694 
695  *flagp = WORST; // Tentatively.
696 
697  switch (*regparse++) {
698  case '^':
699  ret = regnode(BOL);
700  break;
701  case '$':
702  ret = regnode(EOL);
703  break;
704  case '.':
705  ret = regnode(ANY);
706  *flagp |= HASWIDTH | SIMPLE;
707  break;
708  case '[':{
709  int rxpclass;
710  int rxpclassend;
711 
712  if (*regparse == '^') { // Complement of range.
713  ret = regnode(ANYBUT);
714  regparse++;
715  }
716  else
717  ret = regnode(ANYOF);
718  if (*regparse == ']' || *regparse == '-')
719  regc(*regparse++);
720  while (*regparse != '\0' && *regparse != ']') {
721  if (*regparse == '-') {
722  regparse++;
723  if (*regparse == ']' || *regparse == '\0')
724  regc('-');
725  else {
726  rxpclass = UCHARAT(regparse - 2) + 1;
727  rxpclassend = UCHARAT(regparse);
728  if (rxpclass > rxpclassend + 1) {
729  //RAISE Error, SYM(ossimRegExp), SYM(Invalid_Range),
730  printf ("ossimRegExp::compile(): Invalid range in [].\n");
731  return 0;
732  }
733  for (; rxpclass <= rxpclassend; rxpclass++)
734  regc(rxpclass);
735  regparse++;
736  }
737  }
738  else
739  regc(*regparse++);
740  }
741  regc('\0');
742  if (*regparse != ']') {
743  //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Bracket),
744  printf ("ossimRegExp::compile(): Unmatched [].\n");
745  return 0;
746  }
747  regparse++;
748  *flagp |= HASWIDTH | SIMPLE;
749  }
750  break;
751  case '(':
752  ret = reg(1, &flags);
753  if (ret == NULL)
754  return (NULL);
755  *flagp |= flags & (HASWIDTH | SPSTART);
756  break;
757  case '\0':
758  case '|':
759  case ')':
760  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
761  printf ("ossimRegExp::compile(): Internal error.\n"); // Never here
762  return 0;
763  case '?':
764  case '+':
765  case '*':
766  //RAISE Error, SYM(ossimRegExp), SYM(No_Operand),
767  printf ("ossimRegExp::compile(): ?+* follows nothing.\n");
768  return 0;
769  case '\\':
770  if (*regparse == '\0') {
771  //RAISE Error, SYM(ossimRegExp), SYM(Trailing_Backslash),
772  printf ("ossimRegExp::compile(): Trailing backslash.\n");
773  return 0;
774  }
775  ret = regnode(EXACTLY);
776  regc(*regparse++);
777  regc('\0');
778  *flagp |= HASWIDTH | SIMPLE;
779  break;
780  default:{
781  int len;
782  char ender;
783 
784  regparse--;
785  len = (int)strcspn(regparse, META);
786  if (len <= 0) {
787  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
788  printf ("ossimRegExp::compile(): Internal error.\n");
789  return 0;
790  }
791  ender = *(regparse + len);
792  if (len > 1 && ISMULT(ender))
793  len--; // Back off clear of ?+* operand.
794  *flagp |= HASWIDTH;
795  if (len == 1)
796  *flagp |= SIMPLE;
797  ret = regnode(EXACTLY);
798  while (len > 0) {
799  regc(*regparse++);
800  len--;
801  }
802  regc('\0');
803  }
804  break;
805  }
806  return (ret);
807 }
808 
809 
810 /*
811  - regnode - emit a node
812  Location.
813  */
814 char* ossimRegExp::regnode (char op) {
815  char* ret;
816  char* ptr;
817 
818  ret = regcode;
819  if (ret == &regdummy) {
820  regsize += 3;
821  return (ret);
822  }
823 
824  ptr = ret;
825  *ptr++ = op;
826  *ptr++ = '\0'; // Null "next" pointer.
827  *ptr++ = '\0';
828  regcode = ptr;
829 
830  return (ret);
831 }
832 
833 
834 /*
835  - regc - emit (if appropriate) a byte of code
836  */
837 void ossimRegExp::regc (unsigned char b) {
838  if (regcode != &regdummy)
839  *regcode++ = b;
840  else
841  regsize++;
842 }
843 
844 
845 /*
846  - reginsert - insert an operator in front of already-emitted operand
847  *
848  * Means relocating the operand.
849  */
850 void ossimRegExp::reginsert (char op, char* opnd) {
851  char* src;
852  char* dst;
853  char* place;
854 
855  if (regcode == &regdummy) {
856  regsize += 3;
857  return;
858  }
859 
860  src = regcode;
861  regcode += 3;
862  dst = regcode;
863  while (src > opnd)
864  *--dst = *--src;
865 
866  place = opnd; // Op node, where operand used to be.
867  *place++ = op;
868  *place++ = '\0';
869  *place++ = '\0';
870 }
871 
872 
873 /*
874  - regtail - set the next-pointer at the end of a node chain
875  */
876 void ossimRegExp::regtail (char* p, const char* val) {
877  char* scan;
878  char* temp;
879  int offset;
880 
881  if (p == &regdummy)
882  return;
883 
884  // Find last node.
885  scan = p;
886  for (;;) {
887  temp = regnext(scan);
888  if (temp == NULL)
889  break;
890  scan = temp;
891  }
892 
893  if (OP(scan) == BACK)
894  offset = (const char*)scan - val;
895  else
896  offset = val - scan;
897  *(scan + 1) = (offset >> 8) & 0377;
898  *(scan + 2) = offset & 0377;
899 }
900 
901 
902 /*
903  - regoptail - regtail on operand of first argument; nop if operandless
904  */
905 void ossimRegExp::regoptail (char* p, const char* val) {
906  // "Operandless" and "op != BRANCH" are synonymous in practice.
907  if (p == NULL || p == &regdummy || OP(p) != BRANCH)
908  return;
909  regtail(OPERAND(p), val);
910 }
911 
912 
913 
915 //
916 // find and friends
917 //
919 
920 
921 /*
922  * Global work variables for find().
923  */
924 // static const char* reginput; // String-input pointer.
925 // static const char* regbol; // Beginning of input, for ^ check.
926 // static const char* *regstartp; // Pointer to startp array.
927 // static const char* *regendp; // Ditto for endp.
928 
929 /*
930  * Forwards.
931  */
932 // static int regtry (const char*, const char* *,
933 // const char* *, const char*);
934 // static int regmatch (const char*);
935 // static int regrepeat (const char*);
936 
937 // #ifdef DEBUG
938 // int regnarrate = 0;
939 // void regdump ();
940 // static char* regprop ();
941 // #endif
942 
943 
944 
945 // find -- Matches the regular expression to the given string.
946 // Returns true if found, and sets start and end indexes accordingly.
947 
948 bool ossimRegExp::find (const char* string) {
949  const char* s = 0;
950 
951  if(!string) return false;
952  this->searchstring = string;
953 
954  // Check validity of program.
955  if (!this->program || UCHARAT(this->program) != MAGIC) {
956  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
957  printf ("ossimRegExp::find(): Compiled regular expression corrupted.\n");
958  return 0;
959  }
960 
961  // If there is a "must appear" string, look for it.
962  if (this->regmust != NULL) {
963  s = string;
964  while ((s = strchr(s, this->regmust[0])) != NULL) {
965  if (strncmp(s, this->regmust, this->regmlen) == 0)
966  break; // Found it.
967  s++;
968  }
969  if (s == NULL) // Not present.
970  return (0);
971  }
972 
973  // Mark beginning of line for ^ .
974  regbol = string;
975 
976  // Simplest case: anchored match need be tried only once.
977  if (this->reganch)
978  return (regtry(string, this->startp, this->endp, this->program));
979 
980  // Messy cases: unanchored match.
981  s = string;
982  if (this->regstart != '\0')
983  // We know what char it must start with.
984  while ((s = strchr(s, this->regstart)) != NULL) {
985  if (regtry(s, this->startp, this->endp, this->program))
986  return (1);
987  s++;
988 
989  }
990  else
991  // We don't -- general case.
992  do {
993  if (regtry(s, this->startp, this->endp, this->program))
994  return (1);
995  } while (*s++ != '\0');
996 
997  // Failure.
998  return (0);
999 }
1000 
1001 
1002 /*
1003  - regtry - try match at specific point
1004  0 failure, 1 success
1005  */
1006 int ossimRegExp::regtry (const char* string, const char* *start,
1007  const char* *end, const char* prog) {
1008  int i;
1009  const char* *sp1;
1010  const char* *ep;
1011 
1012  reginput = string;
1013  regstartp = start;
1014  regendp = end;
1015 
1016  sp1 = start;
1017  ep = end;
1018  for (i = NSUBEXP; i > 0; i--) {
1019  *sp1++ = NULL;
1020  *ep++ = NULL;
1021  }
1022  if (regmatch(prog + 1)) {
1023  start[0] = string;
1024  end[0] = reginput;
1025  return (1);
1026  }
1027  else
1028  return (0);
1029 }
1030 
1031 
1032 /*
1033  - regmatch - main matching routine
1034  *
1035  * Conceptually the strategy is simple: check to see whether the current
1036  * node matches, call self recursively to see whether the rest matches,
1037  * and then act accordingly. In practice we make some effort to avoid
1038  * recursion, in particular by going through "ordinary" nodes (that don't
1039  * need to know whether the rest of the match failed) by a loop instead of
1040  * by recursion.
1041  * 0 failure, 1 success
1042  */
1043 int ossimRegExp::regmatch (const char* prog) {
1044  const char* scan; // Current node.
1045  const char* next; // Next node.
1046 
1047  scan = prog;
1048 
1049  while (scan != NULL) {
1050 
1051  next = regnext(scan);
1052 
1053  switch (OP(scan)) {
1054  case BOL:
1055  if (reginput != regbol)
1056  return (0);
1057  break;
1058  case EOL:
1059  if (*reginput != '\0')
1060  return (0);
1061  break;
1062  case ANY:
1063  if (*reginput == '\0')
1064  return (0);
1065  reginput++;
1066  break;
1067  case EXACTLY:{
1068  int len;
1069  const char* opnd;
1070 
1071  opnd = OPERAND(scan);
1072  // Inline the first character, for speed.
1073  if (*opnd != *reginput)
1074  return (0);
1075  len = (int)strlen(opnd);
1076  if (len > 1 && strncmp(opnd, reginput, len) != 0)
1077  return (0);
1078  reginput += len;
1079  }
1080  break;
1081  case ANYOF:
1082  if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
1083  return (0);
1084  reginput++;
1085  break;
1086  case ANYBUT:
1087  if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
1088  return (0);
1089  reginput++;
1090  break;
1091  case NOTHING:
1092  break;
1093  case BACK:
1094  break;
1095  case OPEN + 1:
1096  case OPEN + 2:
1097  case OPEN + 3:
1098  case OPEN + 4:
1099  case OPEN + 5:
1100  case OPEN + 6:
1101  case OPEN + 7:
1102  case OPEN + 8:
1103  case OPEN + 9:{
1104  int no;
1105  const char* save;
1106 
1107  no = OP(scan) - OPEN;
1108  save = reginput;
1109 
1110  if (regmatch(next)) {
1111 
1112  //
1113  // Don't set startp if some later invocation of the
1114  // same parentheses already has.
1115  //
1116  if (regstartp[no] == NULL)
1117  regstartp[no] = save;
1118  return (1);
1119  }
1120  else
1121  return (0);
1122  }
1123 // break;
1124  case CLOSE + 1:
1125  case CLOSE + 2:
1126  case CLOSE + 3:
1127  case CLOSE + 4:
1128  case CLOSE + 5:
1129  case CLOSE + 6:
1130  case CLOSE + 7:
1131  case CLOSE + 8:
1132  case CLOSE + 9:{
1133  int no;
1134  const char* save;
1135 
1136  no = OP(scan) - CLOSE;
1137  save = reginput;
1138 
1139  if (regmatch(next)) {
1140 
1141  //
1142  // Don't set endp if some later invocation of the
1143  // same parentheses already has.
1144  //
1145  if (regendp[no] == NULL)
1146  regendp[no] = save;
1147  return (1);
1148  }
1149  else
1150  return (0);
1151  }
1152 // break;
1153  case BRANCH:{
1154 
1155  const char* save;
1156 
1157  if (OP(next) != BRANCH) // No choice.
1158  next = OPERAND(scan); // Avoid recursion.
1159  else {
1160  do {
1161  save = reginput;
1162  if (regmatch(OPERAND(scan)))
1163  return (1);
1164  reginput = save;
1165  scan = regnext(scan);
1166  } while (scan != NULL && OP(scan) == BRANCH);
1167  return (0);
1168  // NOTREACHED
1169  }
1170  }
1171  break;
1172  case STAR:
1173  case PLUS:{
1174  char nextch;
1175  int no;
1176  const char* save;
1177  int min_no;
1178 
1179  //
1180  // Lookahead to avoid useless match attempts when we know
1181  // what character comes next.
1182  //
1183  nextch = '\0';
1184  if (OP(next) == EXACTLY)
1185  nextch = *OPERAND(next);
1186  min_no = (OP(scan) == STAR) ? 0 : 1;
1187  save = reginput;
1188  no = regrepeat(OPERAND(scan));
1189  while (no >= min_no) {
1190  // If it could work, try it.
1191  if (nextch == '\0' || *reginput == nextch)
1192  if (regmatch(next))
1193  return (1);
1194  // Couldn't or didn't -- back up.
1195  no--;
1196  reginput = save + no;
1197  }
1198  return (0);
1199  }
1200 // break;
1201  case END:
1202  return (1); // Success!
1203 
1204  default:
1205  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
1206  printf ("ossimRegExp::find(): Internal error -- memory corrupted.\n");
1207  return 0;
1208  }
1209  scan = next;
1210  }
1211 
1212  //
1213  // We get here only if there's trouble -- normally "case END" is the
1214  // terminating point.
1215  //
1216  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
1217  printf ("ossimRegExp::find(): Internal error -- corrupted pointers.\n");
1218  return (0);
1219 }
1220 
1221 
1222 /*
1223  - regrepeat - repeatedly match something simple, report how many
1224  */
1225 int ossimRegExp::regrepeat (const char* p) {
1226  int count = 0;
1227  const char* scan;
1228  const char* opnd;
1229 
1230  scan = reginput;
1231  opnd = OPERAND(p);
1232  switch (OP(p)) {
1233  case ANY:
1234  count = (int)strlen(scan);
1235  scan += count;
1236  break;
1237  case EXACTLY:
1238  while (*opnd == *scan) {
1239  count++;
1240  scan++;
1241  }
1242  break;
1243  case ANYOF:
1244  while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1245  count++;
1246  scan++;
1247  }
1248  break;
1249  case ANYBUT:
1250  while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1251  count++;
1252  scan++;
1253  }
1254  break;
1255  default: // Oh dear. Called inappropriately.
1256  //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
1257  printf ("ossimRegExp::find(): Internal error.\n");
1258  return 0;
1259  }
1260  reginput = scan;
1261  return (count);
1262 }
1263 
1264 
1265 /*
1266  - regnext - dig the "next" pointer out of a node
1267  */
1268 const char* ossimRegExp::regnext (const char* p) {
1269  int offset;
1270 
1271  if (p == &regdummy)
1272  return (NULL);
1273 
1274  offset = NEXT(p);
1275  if (offset == 0)
1276  return (NULL);
1277 
1278  if (OP(p) == BACK)
1279  return (p - offset);
1280  else
1281  return (p + offset);
1282 }
1283 
1284 
1285 char* ossimRegExp::regnext (char* p) {
1286  int offset;
1287 
1288  if (p == &regdummy)
1289  return (NULL);
1290 
1291  offset = NEXT(p);
1292  if (offset == 0)
1293  return (NULL);
1294 
1295  if (OP(p) == BACK)
1296  return (p - offset);
1297  else
1298  return (p + offset);
1299 }
void regtail(char *, const char *)
#define ISMULT(c)
const char ** regstartp
Definition: ossimRegExp.h:118
const char ** regendp
Definition: ossimRegExp.h:119
const char * regbol
Definition: ossimRegExp.h:117
#define UCHARAT(p)
int regtry(const char *, const char **, const char **, const char *)
void regc(unsigned char)
#define PLUS
char * regbranch(int *)
#define STAR
#define NOTHING
int regmatch(const char *)
void regoptail(char *, const char *)
const char * regnext(const char *)
int regrepeat(const char *)
#define EOL
const char * reginput
Definition: ossimRegExp.h:116
#define SIMPLE
const char * startp[NSUBEXP]
Definition: ossimRegExp.h:100
#define ANY
const unsigned char MAGIC
#define BRANCH
bool operator==(const ossimRegExp &) const
#define NEXT(p)
#define BOL
unsigned int ossim_uint32
#define HASWIDTH
const char * regparse
Definition: ossimRegExp.h:111
char * program
Definition: ossimRegExp.h:106
#define OPEN
void compile(const char *)
#define META
#define ANYBUT
ossim_uint32 start() const
Definition: ossimRegExp.h:209
char * regnode(char)
ossim_uint32 end() const
Definition: ossimRegExp.h:217
#define OPERAND(p)
if(yy_init)
char * regpiece(int *)
#define EXACTLY
const char * regmust
Definition: ossimRegExp.h:104
char * regatom(int *)
#define BACK
#define CLOSE
void reginsert(char, char *)
char * regcode
Definition: ossimRegExp.h:114
const char * searchstring
Definition: ossimRegExp.h:108
const int NSUBEXP
Definition: ossimRegExp.h:72
bool deep_equal(const ossimRegExp &) const
#define END
ossim_uint32 progsize
Definition: ossimRegExp.h:107
char * reg(int, int *)
ossim_uint32 regmlen
Definition: ossimRegExp.h:105
const char * endp[NSUBEXP]
Definition: ossimRegExp.h:101
#define WORST
bool find(const char *)
#define OP(p)
#define SPSTART
#define ANYOF