LLVM  14.0.0git
regengine.inc
Go to the documentation of this file.
1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  * The Regents of the University of California. All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  * may be used to endorse or promote products derived from this software
21  * without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)engine.c 8.5 (Berkeley) 3/20/94
36  */
37 
38 /*
39  * The matching engine and friends. This file is #included by regexec.c
40  * after suitable #defines of a variety of macros used herein, so that
41  * different state representations can be used without duplicating masses
42  * of code.
43  */
44 
45 #ifdef SNAMES
46 #define matcher smatcher
47 #define fast sfast
48 #define slow sslow
49 #define dissect sdissect
50 #define backref sbackref
51 #define step sstep
52 #define print sprint
53 #define at sat
54 #define match smat
55 #define nope snope
56 #endif
57 #ifdef LNAMES
58 #define matcher lmatcher
59 #define fast lfast
60 #define slow lslow
61 #define dissect ldissect
62 #define backref lbackref
63 #define step lstep
64 #define print lprint
65 #define at lat
66 #define match lmat
67 #define nope lnope
68 #endif
69 
70 /* another structure passed up and down to avoid zillions of parameters */
71 struct match {
72  struct re_guts *g;
73  int eflags;
74  llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
75  const char *offp; /* offsets work from here */
76  const char *beginp; /* start of string -- virtual NUL precedes */
77  const char *endp; /* end of string -- virtual NUL here */
78  const char *coldp; /* can be no match starting before here */
79  const char **lastpos; /* [nplus+1] */
80  STATEVARS;
81  states st; /* current states */
82  states fresh; /* states for a fresh start */
83  states tmp; /* temporary */
84  states empty; /* empty set of states */
85 };
86 
87 static int matcher(struct re_guts *, const char *, size_t,
88  llvm_regmatch_t[], int);
89 static const char *dissect(struct match *, const char *, const char *, sopno,
90  sopno);
91 static const char *backref(struct match *, const char *, const char *, sopno,
92  sopno, sopno, int);
93 static const char *fast(struct match *, const char *, const char *, sopno, sopno);
94 static const char *slow(struct match *, const char *, const char *, sopno, sopno);
95 static states step(struct re_guts *, sopno, sopno, states, int, states);
96 #define MAX_RECURSION 100
97 #define BOL (OUT+1)
98 #define EOL (BOL+1)
99 #define BOLEOL (BOL+2)
100 #define NOTHING (BOL+3)
101 #define BOW (BOL+4)
102 #define EOW (BOL+5)
103 #define CODEMAX (BOL+5) /* highest code used */
104 #define NONCHAR(c) ((c) > CHAR_MAX)
105 #define NNONCHAR (CODEMAX-CHAR_MAX)
106 #ifdef REDEBUG
107 static void print(struct match *, char *, states, int, FILE *);
108 #endif
109 #ifdef REDEBUG
110 static void at(struct match *, char *, char *, char *, sopno, sopno);
111 #endif
112 #ifdef REDEBUG
113 static char *pchar(int);
114 #endif
115 
116 #ifdef REDEBUG
117 #define SP(t, s, c) print(m, t, s, c, stdout)
118 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
119 #define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
120 static int nope = 0;
121 #else
122 #define SP(t, s, c) /* nothing */
123 #define AT(t, p1, p2, s1, s2) /* nothing */
124 #define NOTE(s) /* nothing */
125 #endif
126 
127 /*
128  - matcher - the actual matching engine
129  */
130 static int /* 0 success, REG_NOMATCH failure */
131 matcher(struct re_guts *g, const char *string, size_t nmatch,
132  llvm_regmatch_t pmatch[],
133  int eflags)
134 {
135  const char *endp;
136  size_t i;
137  struct match mv;
138  struct match *m = &mv;
139  const char *dp;
140  const sopno gf = g->firststate+1; /* +1 for OEND */
141  const sopno gl = g->laststate;
142  const char *start;
143  const char *stop;
144 
145  /* simplify the situation where possible */
146  if (g->cflags&REG_NOSUB)
147  nmatch = 0;
148  if (eflags&REG_STARTEND) {
149  start = string + pmatch[0].rm_so;
150  stop = string + pmatch[0].rm_eo;
151  } else {
152  start = string;
153  stop = start + strlen(start);
154  }
155  if (stop < start)
156  return(REG_INVARG);
157 
158  /* prescreening; this does wonders for this rather slow code */
159  if (g->must != NULL) {
160  for (dp = start; dp < stop; dp++)
161  if (*dp == g->must[0] && stop - dp >= g->mlen &&
162  memcmp(dp, g->must, (size_t)g->mlen) == 0)
163  break;
164  if (dp == stop) /* we didn't find g->must */
165  return(REG_NOMATCH);
166  }
167 
168  /* match struct setup */
169  m->g = g;
170  m->eflags = eflags;
171  m->pmatch = NULL;
172  m->lastpos = NULL;
173  m->offp = string;
174  m->beginp = start;
175  m->endp = stop;
176  STATESETUP(m, 4);
177  SETUP(m->st);
178  SETUP(m->fresh);
179  SETUP(m->tmp);
180  SETUP(m->empty);
181  CLEAR(m->empty);
182 
183  /* this loop does only one repetition except for backrefs */
184  for (;;) {
185  endp = fast(m, start, stop, gf, gl);
186  if (endp == NULL) { /* a miss */
187  free(m->pmatch);
188  free((void*)m->lastpos);
189  STATETEARDOWN(m);
190  return(REG_NOMATCH);
191  }
192  if (nmatch == 0 && !g->backrefs)
193  break; /* no further info needed */
194 
195  /* where? */
196  assert(m->coldp != NULL);
197  for (;;) {
198  NOTE("finding start");
199  endp = slow(m, m->coldp, stop, gf, gl);
200  if (endp != NULL)
201  break;
202  assert(m->coldp < m->endp);
203  m->coldp++;
204  }
205  if (nmatch == 1 && !g->backrefs)
206  break; /* no further info needed */
207 
208  /* oh my, they want the subexpressions... */
209  if (m->pmatch == NULL)
210  m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
211  sizeof(llvm_regmatch_t));
212  if (m->pmatch == NULL) {
213  STATETEARDOWN(m);
214  return(REG_ESPACE);
215  }
216  for (i = 1; i <= m->g->nsub; i++)
217  m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
218  if (!g->backrefs && !(m->eflags&REG_BACKR)) {
219  NOTE("dissecting");
220  dp = dissect(m, m->coldp, endp, gf, gl);
221  } else {
222  if (g->nplus > 0 && m->lastpos == NULL)
223  m->lastpos = (const char **)malloc((g->nplus+1) *
224  sizeof(char *));
225  if (g->nplus > 0 && m->lastpos == NULL) {
226  free(m->pmatch);
227  STATETEARDOWN(m);
228  return(REG_ESPACE);
229  }
230  NOTE("backref dissect");
231  dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
232  }
233  if (dp != NULL)
234  break;
235 
236  /* uh-oh... we couldn't find a subexpression-level match */
237  assert(g->backrefs); /* must be back references doing it */
238  assert(g->nplus == 0 || m->lastpos != NULL);
239  for (;;) {
240  if (dp != NULL || endp <= m->coldp)
241  break; /* defeat */
242  NOTE("backoff");
243  endp = slow(m, m->coldp, endp-1, gf, gl);
244  if (endp == NULL)
245  break; /* defeat */
246  /* try it on a shorter possibility */
247 #ifndef NDEBUG
248  for (i = 1; i <= m->g->nsub; i++) {
249  assert(m->pmatch[i].rm_so == -1);
250  assert(m->pmatch[i].rm_eo == -1);
251  }
252 #endif
253  NOTE("backoff dissect");
254  dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
255  }
256  assert(dp == NULL || dp == endp);
257  if (dp != NULL) /* found a shorter one */
258  break;
259 
260  /* despite initial appearances, there is no match here */
261  NOTE("false alarm");
262  if (m->coldp == stop)
263  break;
264  start = m->coldp + 1; /* recycle starting later */
265  }
266 
267  /* fill in the details if requested */
268  if (nmatch > 0) {
269  pmatch[0].rm_so = m->coldp - m->offp;
270  pmatch[0].rm_eo = endp - m->offp;
271  }
272  if (nmatch > 1) {
273  assert(m->pmatch != NULL);
274  for (i = 1; i < nmatch; i++)
275  if (i <= m->g->nsub)
276  pmatch[i] = m->pmatch[i];
277  else {
278  pmatch[i].rm_so = -1;
279  pmatch[i].rm_eo = -1;
280  }
281  }
282 
283  if (m->pmatch != NULL)
284  free((char *)m->pmatch);
285  if (m->lastpos != NULL)
286  free((char *)m->lastpos);
287  STATETEARDOWN(m);
288  return(0);
289 }
290 
291 /*
292  - dissect - figure out what matched what, no back references
293  */
294 static const char * /* == stop (success) always */
295 dissect(struct match *m, const char *start, const char *stop, sopno startst,
296  sopno stopst)
297 {
298  int i;
299  sopno ss; /* start sop of current subRE */
300  sopno es; /* end sop of current subRE */
301  const char *sp; /* start of string matched by it */
302  const char *stp; /* string matched by it cannot pass here */
303  const char *rest; /* start of rest of string */
304  const char *tail; /* string unmatched by rest of RE */
305  sopno ssub; /* start sop of subsubRE */
306  sopno esub; /* end sop of subsubRE */
307  const char *ssp; /* start of string matched by subsubRE */
308  const char *sep; /* end of string matched by subsubRE */
309  const char *oldssp; /* previous ssp */
310 
311  AT("diss", start, stop, startst, stopst);
312  sp = start;
313  for (ss = startst; ss < stopst; ss = es) {
314  /* identify end of subRE */
315  es = ss;
316  switch (OP(m->g->strip[es])) {
317  case OPLUS_:
318  case OQUEST_:
319  es += OPND(m->g->strip[es]);
320  break;
321  case OCH_:
322  while (OP(m->g->strip[es]) != O_CH)
323  es += OPND(m->g->strip[es]);
324  break;
325  }
326  es++;
327 
328  /* figure out what it matched */
329  switch (OP(m->g->strip[ss])) {
330  case OEND:
331  assert(nope);
332  break;
333  case OCHAR:
334  sp++;
335  break;
336  case OBOL:
337  case OEOL:
338  case OBOW:
339  case OEOW:
340  break;
341  case OANY:
342  case OANYOF:
343  sp++;
344  break;
345  case OBACK_:
346  case O_BACK:
347  assert(nope);
348  break;
349  /* cases where length of match is hard to find */
350  case OQUEST_:
351  stp = stop;
352  for (;;) {
353  /* how long could this one be? */
354  rest = slow(m, sp, stp, ss, es);
355  assert(rest != NULL); /* it did match */
356  /* could the rest match the rest? */
357  tail = slow(m, rest, stop, es, stopst);
358  if (tail == stop)
359  break; /* yes! */
360  /* no -- try a shorter match for this one */
361  stp = rest - 1;
362  assert(stp >= sp); /* it did work */
363  }
364  ssub = ss + 1;
365  esub = es - 1;
366  /* did innards match? */
367  if (slow(m, sp, rest, ssub, esub) != NULL) {
368  const char *dp = dissect(m, sp, rest, ssub, esub);
369  (void)dp; /* avoid warning if assertions off */
370  assert(dp == rest);
371  } else /* no */
372  assert(sp == rest);
373  sp = rest;
374  break;
375  case OPLUS_:
376  stp = stop;
377  for (;;) {
378  /* how long could this one be? */
379  rest = slow(m, sp, stp, ss, es);
380  assert(rest != NULL); /* it did match */
381  /* could the rest match the rest? */
382  tail = slow(m, rest, stop, es, stopst);
383  if (tail == stop)
384  break; /* yes! */
385  /* no -- try a shorter match for this one */
386  stp = rest - 1;
387  assert(stp >= sp); /* it did work */
388  }
389  ssub = ss + 1;
390  esub = es - 1;
391  ssp = sp;
392  oldssp = ssp;
393  for (;;) { /* find last match of innards */
394  sep = slow(m, ssp, rest, ssub, esub);
395  if (sep == NULL || sep == ssp)
396  break; /* failed or matched null */
397  oldssp = ssp; /* on to next try */
398  ssp = sep;
399  }
400  if (sep == NULL) {
401  /* last successful match */
402  sep = ssp;
403  ssp = oldssp;
404  }
405  assert(sep == rest); /* must exhaust substring */
406  assert(slow(m, ssp, sep, ssub, esub) == rest);
407  {
408  const char *dp = dissect(m, ssp, sep, ssub, esub);
409  (void)dp; /* avoid warning if assertions off */
410  assert(dp == sep);
411  }
412  sp = rest;
413  break;
414  case OCH_:
415  stp = stop;
416  for (;;) {
417  /* how long could this one be? */
418  rest = slow(m, sp, stp, ss, es);
419  assert(rest != NULL); /* it did match */
420  /* could the rest match the rest? */
421  tail = slow(m, rest, stop, es, stopst);
422  if (tail == stop)
423  break; /* yes! */
424  /* no -- try a shorter match for this one */
425  stp = rest - 1;
426  assert(stp >= sp); /* it did work */
427  }
428  ssub = ss + 1;
429  esub = ss + OPND(m->g->strip[ss]) - 1;
430  assert(OP(m->g->strip[esub]) == OOR1);
431  for (;;) { /* find first matching branch */
432  if (slow(m, sp, rest, ssub, esub) == rest)
433  break; /* it matched all of it */
434  /* that one missed, try next one */
435  assert(OP(m->g->strip[esub]) == OOR1);
436  esub++;
437  assert(OP(m->g->strip[esub]) == OOR2);
438  ssub = esub + 1;
439  esub += OPND(m->g->strip[esub]);
440  if (OP(m->g->strip[esub]) == OOR2)
441  esub--;
442  else
443  assert(OP(m->g->strip[esub]) == O_CH);
444  }
445  {
446  const char *dp = dissect(m, sp, rest, ssub, esub);
447  (void)dp; /* avoid warning if assertions off */
448  assert(dp == rest);
449  }
450  sp = rest;
451  break;
452  case O_PLUS:
453  case O_QUEST:
454  case OOR1:
455  case OOR2:
456  case O_CH:
457  assert(nope);
458  break;
459  case OLPAREN:
460  i = OPND(m->g->strip[ss]);
461  assert(0 < i && i <= m->g->nsub);
462  m->pmatch[i].rm_so = sp - m->offp;
463  break;
464  case ORPAREN:
465  i = OPND(m->g->strip[ss]);
466  assert(0 < i && i <= m->g->nsub);
467  m->pmatch[i].rm_eo = sp - m->offp;
468  break;
469  default: /* uh oh */
470  assert(nope);
471  break;
472  }
473  }
474 
475  assert(sp == stop);
476  return(sp);
477 }
478 
479 /*
480  - backref - figure out what matched what, figuring in back references
481  */
482 static const char * /* == stop (success) or NULL (failure) */
483 backref(struct match *m, const char *start, const char *stop, sopno startst,
484  sopno stopst, sopno lev, int rec) /* PLUS nesting level */
485 {
486  int i;
487  sopno ss; /* start sop of current subRE */
488  const char *sp; /* start of string matched by it */
489  sopno ssub; /* start sop of subsubRE */
490  sopno esub; /* end sop of subsubRE */
491  const char *ssp; /* start of string matched by subsubRE */
492  const char *dp;
493  size_t len;
494  int hard;
495  sop s;
496  llvm_regoff_t offsave;
497  cset *cs;
498 
499  AT("back", start, stop, startst, stopst);
500  sp = start;
501 
502  /* get as far as we can with easy stuff */
503  hard = 0;
504  for (ss = startst; !hard && ss < stopst; ss++)
505  switch (OP(s = m->g->strip[ss])) {
506  case OCHAR:
507  if (sp == stop || *sp++ != (char)OPND(s))
508  return(NULL);
509  break;
510  case OANY:
511  if (sp == stop)
512  return(NULL);
513  sp++;
514  break;
515  case OANYOF:
516  cs = &m->g->sets[OPND(s)];
517  if (sp == stop || !CHIN(cs, *sp++))
518  return(NULL);
519  break;
520  case OBOL:
521  if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
522  (sp < m->endp && *(sp-1) == '\n' &&
523  (m->g->cflags&REG_NEWLINE)) )
524  { /* yes */ }
525  else
526  return(NULL);
527  break;
528  case OEOL:
529  if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
530  (sp < m->endp && *sp == '\n' &&
531  (m->g->cflags&REG_NEWLINE)) )
532  { /* yes */ }
533  else
534  return(NULL);
535  break;
536  case OBOW:
537  if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
538  (sp < m->endp && *(sp-1) == '\n' &&
539  (m->g->cflags&REG_NEWLINE)) ||
540  (sp > m->beginp &&
541  !ISWORD(*(sp-1))) ) &&
542  (sp < m->endp && ISWORD(*sp)) )
543  { /* yes */ }
544  else
545  return(NULL);
546  break;
547  case OEOW:
548  if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
549  (sp < m->endp && *sp == '\n' &&
550  (m->g->cflags&REG_NEWLINE)) ||
551  (sp < m->endp && !ISWORD(*sp)) ) &&
552  (sp > m->beginp && ISWORD(*(sp-1))) )
553  { /* yes */ }
554  else
555  return(NULL);
556  break;
557  case O_QUEST:
558  break;
559  case OOR1: /* matches null but needs to skip */
560  ss++;
561  s = m->g->strip[ss];
562  do {
563  assert(OP(s) == OOR2);
564  ss += OPND(s);
565  } while (OP(s = m->g->strip[ss]) != O_CH);
566  /* note that the ss++ gets us past the O_CH */
567  break;
568  default: /* have to make a choice */
569  hard = 1;
570  break;
571  }
572  if (!hard) { /* that was it! */
573  if (sp != stop)
574  return(NULL);
575  return(sp);
576  }
577  ss--; /* adjust for the for's final increment */
578 
579  /* the hard stuff */
580  AT("hard", sp, stop, ss, stopst);
581  s = m->g->strip[ss];
582  switch (OP(s)) {
583  case OBACK_: /* the vilest depths */
584  i = OPND(s);
585  assert(0 < i && i <= m->g->nsub);
586  if (m->pmatch[i].rm_eo == -1)
587  return(NULL);
588  assert(m->pmatch[i].rm_so != -1);
589  len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
590  if (len == 0 && rec++ > MAX_RECURSION)
591  return(NULL);
592  assert(stop - m->beginp >= len);
593  if (sp > stop - len)
594  return(NULL); /* not enough left to match */
595  ssp = m->offp + m->pmatch[i].rm_so;
596  if (memcmp(sp, ssp, len) != 0)
597  return(NULL);
598  while (m->g->strip[ss] != SOP(O_BACK, i))
599  ss++;
600  return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
601  break;
602  case OQUEST_: /* to null or not */
603  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
604  if (dp != NULL)
605  return(dp); /* not */
606  return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
607  break;
608  case OPLUS_:
609  assert(m->lastpos != NULL);
610  assert(lev+1 <= m->g->nplus);
611  m->lastpos[lev+1] = sp;
612  return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
613  break;
614  case O_PLUS:
615  if (sp == m->lastpos[lev]) /* last pass matched null */
616  return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
617  /* try another pass */
618  m->lastpos[lev] = sp;
619  dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
620  if (dp == NULL)
621  return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
622  else
623  return(dp);
624  break;
625  case OCH_: /* find the right one, if any */
626  ssub = ss + 1;
627  esub = ss + OPND(s) - 1;
628  assert(OP(m->g->strip[esub]) == OOR1);
629  for (;;) { /* find first matching branch */
630  dp = backref(m, sp, stop, ssub, esub, lev, rec);
631  if (dp != NULL)
632  return(dp);
633  /* that one missed, try next one */
634  if (OP(m->g->strip[esub]) == O_CH)
635  return(NULL); /* there is none */
636  esub++;
637  assert(OP(m->g->strip[esub]) == OOR2);
638  ssub = esub + 1;
639  esub += OPND(m->g->strip[esub]);
640  if (OP(m->g->strip[esub]) == OOR2)
641  esub--;
642  else
643  assert(OP(m->g->strip[esub]) == O_CH);
644  }
645  break;
646  case OLPAREN: /* must undo assignment if rest fails */
647  i = OPND(s);
648  assert(0 < i && i <= m->g->nsub);
649  offsave = m->pmatch[i].rm_so;
650  m->pmatch[i].rm_so = sp - m->offp;
651  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
652  if (dp != NULL)
653  return(dp);
654  m->pmatch[i].rm_so = offsave;
655  return(NULL);
656  break;
657  case ORPAREN: /* must undo assignment if rest fails */
658  i = OPND(s);
659  assert(0 < i && i <= m->g->nsub);
660  offsave = m->pmatch[i].rm_eo;
661  m->pmatch[i].rm_eo = sp - m->offp;
662  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
663  if (dp != NULL)
664  return(dp);
665  m->pmatch[i].rm_eo = offsave;
666  return(NULL);
667  break;
668  default: /* uh oh */
669  assert(nope);
670  break;
671  }
672 
673  /* "can't happen" */
674  assert(nope);
675  /* NOTREACHED */
676  return NULL;
677 }
678 
679 /*
680  - fast - step through the string at top speed
681  */
682 static const char * /* where tentative match ended, or NULL */
683 fast(struct match *m, const char *start, const char *stop, sopno startst,
684  sopno stopst)
685 {
686  states st = m->st;
687  states fresh = m->fresh;
688  states tmp = m->tmp;
689  const char *p = start;
690  int c = (start == m->beginp) ? OUT : *(start-1);
691  int lastc; /* previous c */
692  int flagch;
693  int i;
694  const char *coldp; /* last p after which no match was underway */
695 
696  CLEAR(st);
697  SET1(st, startst);
698  st = step(m->g, startst, stopst, st, NOTHING, st);
699  ASSIGN(fresh, st);
700  SP("start", st, *p);
701  coldp = NULL;
702  for (;;) {
703  /* next character */
704  lastc = c;
705  c = (p == m->endp) ? OUT : *p;
706  if (EQ(st, fresh))
707  coldp = p;
708 
709  /* is there an EOL and/or BOL between lastc and c? */
710  flagch = '\0';
711  i = 0;
712  if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
713  (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
714  flagch = BOL;
715  i = m->g->nbol;
716  }
717  if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
718  (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
719  flagch = (flagch == BOL) ? BOLEOL : EOL;
720  i += m->g->neol;
721  }
722  if (i != 0) {
723  for (; i > 0; i--)
724  st = step(m->g, startst, stopst, st, flagch, st);
725  SP("boleol", st, c);
726  }
727 
728  /* how about a word boundary? */
729  if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
730  (c != OUT && ISWORD(c)) ) {
731  flagch = BOW;
732  }
733  if ( (lastc != OUT && ISWORD(lastc)) &&
734  (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
735  flagch = EOW;
736  }
737  if (flagch == BOW || flagch == EOW) {
738  st = step(m->g, startst, stopst, st, flagch, st);
739  SP("boweow", st, c);
740  }
741 
742  /* are we done? */
743  if (ISSET(st, stopst) || p == stop)
744  break; /* NOTE BREAK OUT */
745 
746  /* no, we must deal with this character */
747  ASSIGN(tmp, st);
748  ASSIGN(st, fresh);
749  assert(c != OUT);
750  st = step(m->g, startst, stopst, tmp, c, st);
751  SP("aft", st, c);
752  assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
753  p++;
754  }
755 
756  assert(coldp != NULL);
757  m->coldp = coldp;
758  if (ISSET(st, stopst))
759  return(p+1);
760  else
761  return(NULL);
762 }
763 
764 /*
765  - slow - step through the string more deliberately
766  */
767 static const char * /* where it ended */
768 slow(struct match *m, const char *start, const char *stop, sopno startst,
769  sopno stopst)
770 {
771  states st = m->st;
772  states empty = m->empty;
773  states tmp = m->tmp;
774  const char *p = start;
775  int c = (start == m->beginp) ? OUT : *(start-1);
776  int lastc; /* previous c */
777  int flagch;
778  int i;
779  const char *matchp; /* last p at which a match ended */
780 
781  AT("slow", start, stop, startst, stopst);
782  CLEAR(st);
783  SET1(st, startst);
784  SP("sstart", st, *p);
785  st = step(m->g, startst, stopst, st, NOTHING, st);
786  matchp = NULL;
787  for (;;) {
788  /* next character */
789  lastc = c;
790  c = (p == m->endp) ? OUT : *p;
791 
792  /* is there an EOL and/or BOL between lastc and c? */
793  flagch = '\0';
794  i = 0;
795  if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
796  (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
797  flagch = BOL;
798  i = m->g->nbol;
799  }
800  if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
801  (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
802  flagch = (flagch == BOL) ? BOLEOL : EOL;
803  i += m->g->neol;
804  }
805  if (i != 0) {
806  for (; i > 0; i--)
807  st = step(m->g, startst, stopst, st, flagch, st);
808  SP("sboleol", st, c);
809  }
810 
811  /* how about a word boundary? */
812  if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
813  (c != OUT && ISWORD(c)) ) {
814  flagch = BOW;
815  }
816  if ( (lastc != OUT && ISWORD(lastc)) &&
817  (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
818  flagch = EOW;
819  }
820  if (flagch == BOW || flagch == EOW) {
821  st = step(m->g, startst, stopst, st, flagch, st);
822  SP("sboweow", st, c);
823  }
824 
825  /* are we done? */
826  if (ISSET(st, stopst))
827  matchp = p;
828  if (EQ(st, empty) || p == stop)
829  break; /* NOTE BREAK OUT */
830 
831  /* no, we must deal with this character */
832  ASSIGN(tmp, st);
833  ASSIGN(st, empty);
834  assert(c != OUT);
835  st = step(m->g, startst, stopst, tmp, c, st);
836  SP("saft", st, c);
837  assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
838  p++;
839  }
840 
841  return(matchp);
842 }
843 
844 
845 /*
846  - step - map set of states reachable before char to set reachable after
847  */
848 static states
849 step(struct re_guts *g,
850  sopno start, /* start state within strip */
851  sopno stop, /* state after stop state within strip */
852  states bef, /* states reachable before */
853  int ch, /* character or NONCHAR code */
854  states aft) /* states already known reachable after */
855 {
856  cset *cs;
857  sop s;
858  sopno pc;
859  onestate here; /* note, macros know this name */
860  sopno look;
861  int i;
862 
863  for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
864  s = g->strip[pc];
865  switch (OP(s)) {
866  case OEND:
867  assert(pc == stop-1);
868  break;
869  case OCHAR:
870  /* only characters can match */
871  assert(!NONCHAR(ch) || ch != (char)OPND(s));
872  if (ch == (char)OPND(s))
873  FWD(aft, bef, 1);
874  break;
875  case OBOL:
876  if (ch == BOL || ch == BOLEOL)
877  FWD(aft, bef, 1);
878  break;
879  case OEOL:
880  if (ch == EOL || ch == BOLEOL)
881  FWD(aft, bef, 1);
882  break;
883  case OBOW:
884  if (ch == BOW)
885  FWD(aft, bef, 1);
886  break;
887  case OEOW:
888  if (ch == EOW)
889  FWD(aft, bef, 1);
890  break;
891  case OANY:
892  if (!NONCHAR(ch))
893  FWD(aft, bef, 1);
894  break;
895  case OANYOF:
896  cs = &g->sets[OPND(s)];
897  if (!NONCHAR(ch) && CHIN(cs, ch))
898  FWD(aft, bef, 1);
899  break;
900  case OBACK_: /* ignored here */
901  case O_BACK:
902  FWD(aft, aft, 1);
903  break;
904  case OPLUS_: /* forward, this is just an empty */
905  FWD(aft, aft, 1);
906  break;
907  case O_PLUS: /* both forward and back */
908  FWD(aft, aft, 1);
909  i = ISSETBACK(aft, OPND(s));
910  BACK(aft, aft, OPND(s));
911  if (!i && ISSETBACK(aft, OPND(s))) {
912  /* oho, must reconsider loop body */
913  pc -= OPND(s) + 1;
914  INIT(here, pc);
915  }
916  break;
917  case OQUEST_: /* two branches, both forward */
918  FWD(aft, aft, 1);
919  FWD(aft, aft, OPND(s));
920  break;
921  case O_QUEST: /* just an empty */
922  FWD(aft, aft, 1);
923  break;
924  case OLPAREN: /* not significant here */
925  case ORPAREN:
926  FWD(aft, aft, 1);
927  break;
928  case OCH_: /* mark the first two branches */
929  FWD(aft, aft, 1);
930  assert(OP(g->strip[pc+OPND(s)]) == OOR2);
931  FWD(aft, aft, OPND(s));
932  break;
933  case OOR1: /* done a branch, find the O_CH */
934  if (ISSTATEIN(aft, here)) {
935  for (look = 1;
936  OP(s = g->strip[pc+look]) != O_CH;
937  look += OPND(s))
938  assert(OP(s) == OOR2);
939  FWD(aft, aft, look);
940  }
941  break;
942  case OOR2: /* propagate OCH_'s marking */
943  FWD(aft, aft, 1);
944  if (OP(g->strip[pc+OPND(s)]) != O_CH) {
945  assert(OP(g->strip[pc+OPND(s)]) == OOR2);
946  FWD(aft, aft, OPND(s));
947  }
948  break;
949  case O_CH: /* just empty */
950  FWD(aft, aft, 1);
951  break;
952  default: /* ooooops... */
953  assert(nope);
954  break;
955  }
956  }
957 
958  return(aft);
959 }
960 
961 #ifdef REDEBUG
962 /*
963  - print - print a set of states
964  */
965 static void
966 print(struct match *m, char *caption, states st, int ch, FILE *d)
967 {
968  struct re_guts *g = m->g;
969  int i;
970  int first = 1;
971 
972  if (!(m->eflags&REG_TRACE))
973  return;
974 
975  (void)fprintf(d, "%s", caption);
976  if (ch != '\0')
977  (void)fprintf(d, " %s", pchar(ch));
978  for (i = 0; i < g->nstates; i++)
979  if (ISSET(st, i)) {
980  (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
981  first = 0;
982  }
983  (void)fprintf(d, "\n");
984 }
985 
986 /*
987  - at - print current situation
988  */
989 static void
990 at(struct match *m, char *title, char *start, char *stop, sopno startst,
991  sopno stopst)
992 {
993  if (!(m->eflags&REG_TRACE))
994  return;
995 
996  (void)printf("%s %s-", title, pchar(*start));
997  (void)printf("%s ", pchar(*stop));
998  (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
999 }
1000 
1001 #ifndef PCHARDONE
1002 #define PCHARDONE /* never again */
1003 /*
1004  - pchar - make a character printable
1005  *
1006  * Is this identical to regchar() over in debug.c? Well, yes. But a
1007  * duplicate here avoids having a debugging-capable regexec.o tied to
1008  * a matching debug.o, and this is convenient. It all disappears in
1009  * the non-debug compilation anyway, so it doesn't matter much.
1010  */
1011 static char * /* -> representation */
1012 pchar(int ch)
1013 {
1014  static char pbuf[10];
1015 
1016  if (isPrint(ch) || ch == ' ')
1017  (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1018  else
1019  (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1020  return(pbuf);
1021 }
1022 #endif
1023 #endif
1024 
1025 #undef matcher
1026 #undef fast
1027 #undef slow
1028 #undef dissect
1029 #undef backref
1030 #undef step
1031 #undef print
1032 #undef at
1033 #undef match
1034 #undef nope
i
i
Definition: README.txt:29
ISSETBACK
#define ISSETBACK(v, n)
Definition: regexec.c:127
STATEVARS
#define STATEVARS
Definition: regexec.c:113
llvm_regmatch_t::rm_so
llvm_regoff_t rm_so
Definition: regex_impl.h:44
print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition: ArchiveWriter.cpp:147
SETUP
#define SETUP(v)
Definition: regexec.c:118
O_PLUS
#define O_PLUS
Definition: regex2.h:87
REG_STARTEND
#define REG_STARTEND
Definition: regex_impl.h:88
REG_NOSUB
#define REG_NOSUB
Definition: regex_impl.h:59
llvm_regmatch_t::rm_eo
llvm_regoff_t rm_eo
Definition: regex_impl.h:45
memcmp
Merge contiguous icmps into a memcmp
Definition: MergeICmps.cpp:888
OANY
#define OANY
Definition: regex2.h:82
at
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional ldr LCPI1_0 ldr ldr tst movne lsr ldr LCPI1_1 and r0 bx lr it saves an instruction and a register It might be profitable to cse MOVi16 if there are lots of bit immediates with the same bottom half Robert Muth started working on an alternate jump table implementation that does not put the tables in line in the text This is more like the llvm default jump table implementation This might be useful sometime Several revisions of patches are on the mailing beginning at
Definition: README.txt:582
g
should just be implemented with a CLZ instruction Since there are other e g
Definition: README.txt:709
llvm::sys::locale::isPrint
bool isPrint(int c)
Definition: Locale.cpp:13
OCH_
#define OCH_
Definition: regex2.h:92
tmp
alloca< 16 x float >, align 16 %tmp2=alloca< 16 x float >, align 16 store< 16 x float > %A,< 16 x float > *%tmp %s=bitcast< 16 x float > *%tmp to i8 *%s2=bitcast< 16 x float > *%tmp2 to i8 *call void @llvm.memcpy.i64(i8 *%s, i8 *%s2, i64 64, i32 16) %R=load< 16 x float > *%tmp2 ret< 16 x float > %R } declare void @llvm.memcpy.i64(i8 *nocapture, i8 *nocapture, i64, i32) nounwind which compiles to:_foo:subl $140, %esp movaps %xmm3, 112(%esp) movaps %xmm2, 96(%esp) movaps %xmm1, 80(%esp) movaps %xmm0, 64(%esp) movl 60(%esp), %eax movl %eax, 124(%esp) movl 56(%esp), %eax movl %eax, 120(%esp) movl 52(%esp), %eax< many many more 32-bit copies > movaps(%esp), %xmm0 movaps 16(%esp), %xmm1 movaps 32(%esp), %xmm2 movaps 48(%esp), %xmm3 addl $140, %esp ret On Nehalem, it may even be cheaper to just use movups when unaligned than to fall back to lower-granularity chunks. Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations:1. ix86_pad_returns inserts a noop before ret instructions if immediately preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. The first one is done for all AMDs, Core2, and "Generic" The second one is done for:Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" Testcase:int x(int a) { return(a &0xf0)> >4 tmp
Definition: README.txt:1347
OBACK_
#define OBACK_
Definition: regex2.h:84
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
OOR2
#define OOR2
Definition: regex2.h:94
INC
#define INC(o)
Definition: regexec.c:121
re_guts
Definition: regex2.h:132
OBOW
#define OBOW
Definition: regex2.h:96
here
A predicate compare being used in a select_cc should have the same peephole applied to it as a predicate compare used by a br_cc There should be no mfcr here
Definition: README_ALTIVEC.txt:147
ORPAREN
#define ORPAREN
Definition: regex2.h:91
llvm_regoff_t
off_t llvm_regoff_t
Definition: regex_impl.h:42
OEOW
#define OEOW
Definition: regex2.h:97
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
sopno
long sopno
Definition: regex2.h:69
OANYOF
#define OANYOF
Definition: regex2.h:83
O_CH
#define O_CH
Definition: regex2.h:95
EQ
#define EQ(a, b)
Definition: regexec.c:112
OUT
#define OUT
Definition: regex2.h:162
SET1
#define SET1(v, n)
Definition: regexec.c:109
states
#define states
Definition: regexec.c:106
REG_TRACE
#define REG_TRACE
Definition: regex_impl.h:89
OCHAR
#define OCHAR
Definition: regex2.h:79
ISSET
#define ISSET(v, n)
Definition: regexec.c:110
sop
unsigned long sop
Definition: regex2.h:68
OLPAREN
#define OLPAREN
Definition: regex2.h:90
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
OBOL
#define OBOL
Definition: regex2.h:80
STATESETUP
#define STATESETUP(m, n)
Definition: regexec.c:114
REG_NOTBOL
#define REG_NOTBOL
Definition: regex_impl.h:86
INIT
#define INIT(o, n)
Definition: regexec.c:120
CHIN
#define CHIN(cs, c)
Definition: regex2.h:121
ASSIGN
#define ASSIGN(d, s)
Definition: regexec.c:111
s
multiplies can be turned into SHL s
Definition: README.txt:370
REG_NEWLINE
#define REG_NEWLINE
Definition: regex_impl.h:60
REG_NOMATCH
#define REG_NOMATCH
Definition: regex_impl.h:66
O_BACK
#define O_BACK
Definition: regex2.h:85
FWD
#define FWD(dst, src, n)
Definition: regexec.c:125
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
CLEAR
#define CLEAR(v)
Definition: regexec.c:107
SOP
#define SOP(op, opnd)
Definition: regex2.h:75
OP
#define OP(n)
Definition: regex2.h:73
O_QUEST
#define O_QUEST
Definition: regex2.h:89
cset
Definition: regex2.h:111
STATETEARDOWN
#define STATETEARDOWN(m)
Definition: regexec.c:117
onestate
#define onestate
Definition: regexec.c:119
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
ISSTATEIN
#define ISSTATEIN(v, o)
Definition: regexec.c:122
ISWORD
#define ISWORD(c)
Definition: regex2.h:163
BACK
#define BACK(dst, src, n)
Definition: regexec.c:126
OEOL
#define OEOL
Definition: regex2.h:81
pc
add sub stmia L5 ldr pc
Definition: README.txt:201
sp
_bar sp
Definition: README.txt:151
REG_ESPACE
#define REG_ESPACE
Definition: regex_impl.h:77
OPND
#define OPND(n)
Definition: regex2.h:74
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:254
REG_NOTEOL
#define REG_NOTEOL
Definition: regex_impl.h:87
llvm_regmatch_t
Definition: regex_impl.h:43
OOR1
#define OOR1
Definition: regex2.h:93
REG_INVARG
#define REG_INVARG
Definition: regex_impl.h:81
OEND
#define OEND
Definition: regex2.h:78
matcher
Code Generation Notes for reduce the size of the ISel matcher
Definition: MSA.txt:5
REG_BACKR
#define REG_BACKR
Definition: regex_impl.h:91
d
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int int d
Definition: README.txt:418
OPLUS_
#define OPLUS_
Definition: regex2.h:86
OQUEST_
#define OQUEST_
Definition: regex2.h:88
SpecialSubKind::string
@ string