LLVM  14.0.0git
regcomp.c
Go to the documentation of this file.
1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  * The Regents of the University of California. All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  * may be used to endorse or promote products derived from this software
21  * without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
36  */
37 
38 #include <sys/types.h>
39 #include <stdint.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <stdlib.h>
45 #include "regex_impl.h"
46 
47 #include "regutils.h"
48 #include "regex2.h"
49 
50 #include "llvm/Config/config.h"
51 #include "llvm/Support/Compiler.h"
52 
53 /* character-class table */
54 static struct cclass {
55  const char *name;
56  const char *chars;
57  const char *multis;
58 } cclasses[] = {
59  { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
60 0123456789", ""} ,
61  { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
62  ""} ,
63  { "blank", " \t", ""} ,
64  { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
65 \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
66  { "digit", "0123456789", ""} ,
67  { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
68 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
69  ""} ,
70  { "lower", "abcdefghijklmnopqrstuvwxyz",
71  ""} ,
72  { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
73 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
74  ""} ,
75  { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
76  ""} ,
77  { "space", "\t\n\v\f\r ", ""} ,
78  { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
79  ""} ,
80  { "xdigit", "0123456789ABCDEFabcdef",
81  ""} ,
82  { NULL, 0, "" }
83 };
84 
85 /* character-name table */
86 static struct cname {
87  const char *name;
88  char code;
89 } cnames[] = {
90  { "NUL", '\0' },
91  { "SOH", '\001' },
92  { "STX", '\002' },
93  { "ETX", '\003' },
94  { "EOT", '\004' },
95  { "ENQ", '\005' },
96  { "ACK", '\006' },
97  { "BEL", '\007' },
98  { "alert", '\007' },
99  { "BS", '\010' },
100  { "backspace", '\b' },
101  { "HT", '\011' },
102  { "tab", '\t' },
103  { "LF", '\012' },
104  { "newline", '\n' },
105  { "VT", '\013' },
106  { "vertical-tab", '\v' },
107  { "FF", '\014' },
108  { "form-feed", '\f' },
109  { "CR", '\015' },
110  { "carriage-return", '\r' },
111  { "SO", '\016' },
112  { "SI", '\017' },
113  { "DLE", '\020' },
114  { "DC1", '\021' },
115  { "DC2", '\022' },
116  { "DC3", '\023' },
117  { "DC4", '\024' },
118  { "NAK", '\025' },
119  { "SYN", '\026' },
120  { "ETB", '\027' },
121  { "CAN", '\030' },
122  { "EM", '\031' },
123  { "SUB", '\032' },
124  { "ESC", '\033' },
125  { "IS4", '\034' },
126  { "FS", '\034' },
127  { "IS3", '\035' },
128  { "GS", '\035' },
129  { "IS2", '\036' },
130  { "RS", '\036' },
131  { "IS1", '\037' },
132  { "US", '\037' },
133  { "space", ' ' },
134  { "exclamation-mark", '!' },
135  { "quotation-mark", '"' },
136  { "number-sign", '#' },
137  { "dollar-sign", '$' },
138  { "percent-sign", '%' },
139  { "ampersand", '&' },
140  { "apostrophe", '\'' },
141  { "left-parenthesis", '(' },
142  { "right-parenthesis", ')' },
143  { "asterisk", '*' },
144  { "plus-sign", '+' },
145  { "comma", ',' },
146  { "hyphen", '-' },
147  { "hyphen-minus", '-' },
148  { "period", '.' },
149  { "full-stop", '.' },
150  { "slash", '/' },
151  { "solidus", '/' },
152  { "zero", '0' },
153  { "one", '1' },
154  { "two", '2' },
155  { "three", '3' },
156  { "four", '4' },
157  { "five", '5' },
158  { "six", '6' },
159  { "seven", '7' },
160  { "eight", '8' },
161  { "nine", '9' },
162  { "colon", ':' },
163  { "semicolon", ';' },
164  { "less-than-sign", '<' },
165  { "equals-sign", '=' },
166  { "greater-than-sign", '>' },
167  { "question-mark", '?' },
168  { "commercial-at", '@' },
169  { "left-square-bracket", '[' },
170  { "backslash", '\\' },
171  { "reverse-solidus", '\\' },
172  { "right-square-bracket", ']' },
173  { "circumflex", '^' },
174  { "circumflex-accent", '^' },
175  { "underscore", '_' },
176  { "low-line", '_' },
177  { "grave-accent", '`' },
178  { "left-brace", '{' },
179  { "left-curly-bracket", '{' },
180  { "vertical-line", '|' },
181  { "right-brace", '}' },
182  { "right-curly-bracket", '}' },
183  { "tilde", '~' },
184  { "DEL", '\177' },
185  { NULL, 0 }
186 };
187 
188 /*
189  * parse structure, passed up and down to avoid global variables and
190  * other clumsinesses
191  */
192 struct parse {
193  char *next; /* next character in RE */
194  char *end; /* end of string (-> NUL normally) */
195  int error; /* has an error been seen? */
196  sop *strip; /* malloced strip */
197  sopno ssize; /* malloced strip size (allocated) */
198  sopno slen; /* malloced strip length (used) */
199  int ncsalloc; /* number of csets allocated */
200  struct re_guts *g;
201 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
202  sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
203  sopno pend[NPAREN]; /* -> ) ([0] unused) */
204 };
205 
206 static void p_ere(struct parse *, int);
207 static void p_ere_exp(struct parse *);
208 static void p_str(struct parse *);
209 static void p_bre(struct parse *, int, int);
210 static int p_simp_re(struct parse *, int);
211 static int p_count(struct parse *);
212 static void p_bracket(struct parse *);
213 static void p_b_term(struct parse *, cset *);
214 static void p_b_cclass(struct parse *, cset *);
215 static void p_b_eclass(struct parse *, cset *);
216 static char p_b_symbol(struct parse *);
217 static char p_b_coll_elem(struct parse *, int);
218 static char othercase(int);
219 static void bothcases(struct parse *, int);
220 static void ordinary(struct parse *, int);
221 static void nonnewline(struct parse *);
222 static void repeat(struct parse *, sopno, int, int);
223 static int seterr(struct parse *, int);
224 static cset *allocset(struct parse *);
225 static void freeset(struct parse *, cset *);
226 static int freezeset(struct parse *, cset *);
227 static int firstch(struct parse *, cset *);
228 static int nch(struct parse *, cset *);
229 static void mcadd(struct parse *, cset *, const char *);
230 static void mcinvert(struct parse *, cset *);
231 static void mccase(struct parse *, cset *);
232 static int isinsets(struct re_guts *, int);
233 static int samesets(struct re_guts *, int, int);
234 static void categorize(struct parse *, struct re_guts *);
235 static sopno dupl(struct parse *, sopno, sopno);
236 static void doemit(struct parse *, sop, size_t);
237 static void doinsert(struct parse *, sop, size_t, sopno);
238 static void dofwd(struct parse *, sopno, sop);
239 static void enlarge(struct parse *, sopno);
240 static void stripsnug(struct parse *, struct re_guts *);
241 static void findmust(struct parse *, struct re_guts *);
242 static sopno pluscount(struct parse *, struct re_guts *);
243 
244 static char nuls[10]; /* place to point scanner in event of error */
245 
246 /*
247  * macros for use with parse structure
248  * BEWARE: these know that the parse structure is named `p' !!!
249  */
250 #define PEEK() (*p->next)
251 #define PEEK2() (*(p->next+1))
252 #define MORE() (p->next < p->end)
253 #define MORE2() (p->next+1 < p->end)
254 #define SEE(c) (MORE() && PEEK() == (c))
255 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
256 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
257 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
258 #define NEXT() (p->next++)
259 #define NEXT2() (p->next += 2)
260 #define NEXTn(n) (p->next += (n))
261 #define GETNEXT() (*p->next++)
262 #define SETERROR(e) seterr(p, (e))
263 #define REQUIRE(co, e) (void)((co) || SETERROR(e))
264 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
265 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
266 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
267 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
268 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
269 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
270 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
271 #define HERE() (p->slen)
272 #define THERE() (p->slen - 1)
273 #define THERETHERE() (p->slen - 2)
274 #define DROP(n) (p->slen -= (n))
275 
276 #ifdef _POSIX2_RE_DUP_MAX
277 #define DUPMAX _POSIX2_RE_DUP_MAX
278 #else
279 #define DUPMAX 255
280 #endif
281 #define INFINITY (DUPMAX + 1)
282 
283 #ifndef NDEBUG
284 static int never = 0; /* for use in asserts; shuts lint up */
285 #else
286 #define never 0 /* some <assert.h>s have bugs too */
287 #endif
288 
289 /*
290  - llvm_regcomp - interface for parser and compilation
291  */
292 int /* 0 success, otherwise REG_something */
293 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
294 {
295  struct parse pa;
296  struct re_guts *g;
297  struct parse *p = &pa;
298  int i;
299  size_t len;
300 #ifdef REDEBUG
301 # define GOODFLAGS(f) (f)
302 #else
303 # define GOODFLAGS(f) ((f)&~REG_DUMP)
304 #endif
305 
306  cflags = GOODFLAGS(cflags);
307  if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
308  return(REG_INVARG);
309 
310  if (cflags&REG_PEND) {
311  if (preg->re_endp < pattern)
312  return(REG_INVARG);
313  len = preg->re_endp - pattern;
314  } else
315  len = strlen((const char *)pattern);
316 
317  /* do the mallocs early so failure handling is easy */
318  g = (struct re_guts *)malloc(sizeof(struct re_guts) +
319  (NC-1)*sizeof(cat_t));
320  if (g == NULL)
321  return(REG_ESPACE);
322  p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
323  p->strip = (sop *)calloc(p->ssize, sizeof(sop));
324  p->slen = 0;
325  if (p->strip == NULL) {
326  free((char *)g);
327  return(REG_ESPACE);
328  }
329 
330  /* set things up */
331  p->g = g;
332  p->next = (char *)pattern; /* convenience; we do not modify it */
333  p->end = p->next + len;
334  p->error = 0;
335  p->ncsalloc = 0;
336  for (i = 0; i < NPAREN; i++) {
337  p->pbegin[i] = 0;
338  p->pend[i] = 0;
339  }
340  g->csetsize = NC;
341  g->sets = NULL;
342  g->setbits = NULL;
343  g->ncsets = 0;
344  g->cflags = cflags;
345  g->iflags = 0;
346  g->nbol = 0;
347  g->neol = 0;
348  g->must = NULL;
349  g->mlen = 0;
350  g->nsub = 0;
351  g->ncategories = 1; /* category 0 is "everything else" */
352  g->categories = &g->catspace[-(CHAR_MIN)];
353  (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
354  g->backrefs = 0;
355 
356  /* do it */
357  EMIT(OEND, 0);
358  g->firststate = THERE();
359  if (cflags&REG_EXTENDED)
360  p_ere(p, OUT);
361  else if (cflags&REG_NOSPEC)
362  p_str(p);
363  else
364  p_bre(p, OUT, OUT);
365  EMIT(OEND, 0);
366  g->laststate = THERE();
367 
368  /* tidy up loose ends and fill things in */
369  categorize(p, g);
370  stripsnug(p, g);
371  findmust(p, g);
372  g->nplus = pluscount(p, g);
373  g->magic = MAGIC2;
374  preg->re_nsub = g->nsub;
375  preg->re_g = g;
376  preg->re_magic = MAGIC1;
377 #ifndef REDEBUG
378  /* not debugging, so can't rely on the assert() in llvm_regexec() */
379  if (g->iflags&REGEX_BAD)
381 #endif
382 
383  /* win or lose, we're done */
384  if (p->error != 0) /* lose */
385  llvm_regfree(preg);
386  return(p->error);
387 }
388 
389 /*
390  - p_ere - ERE parser top level, concatenation and alternation
391  */
392 static void
393 p_ere(struct parse *p, int stop) /* character this ERE should end at */
394 {
395  char c;
396  sopno prevback = 0;
397  sopno prevfwd = 0;
398  sopno conc;
399  int first = 1; /* is this the first alternative? */
400 
401  for (;;) {
402  /* do a bunch of concatenated expressions */
403  conc = HERE();
404  while (MORE() && (c = PEEK()) != '|' && c != stop)
405  p_ere_exp(p);
406  REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
407 
408  if (!EAT('|'))
409  break; /* NOTE BREAK OUT */
410 
411  if (first) {
412  INSERT(OCH_, conc); /* offset is wrong */
413  prevfwd = conc;
414  prevback = conc;
415  first = 0;
416  }
417  ASTERN(OOR1, prevback);
418  prevback = THERE();
419  AHEAD(prevfwd); /* fix previous offset */
420  prevfwd = HERE();
421  EMIT(OOR2, 0); /* offset is very wrong */
422  }
423 
424  if (!first) { /* tail-end fixups */
425  AHEAD(prevfwd);
426  ASTERN(O_CH, prevback);
427  }
428 
429  assert(!MORE() || SEE(stop));
430 }
431 
432 /*
433  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
434  */
435 static void
436 p_ere_exp(struct parse *p)
437 {
438  char c;
439  sopno pos;
440  int count;
441  int count2;
442  int backrefnum;
443  sopno subno;
444  int wascaret = 0;
445 
446  assert(MORE()); /* caller should have ensured this */
447  c = GETNEXT();
448 
449  pos = HERE();
450  switch (c) {
451  case '(':
452  REQUIRE(MORE(), REG_EPAREN);
453  p->g->nsub++;
454  subno = p->g->nsub;
455  if (subno < NPAREN)
456  p->pbegin[subno] = HERE();
457  EMIT(OLPAREN, subno);
458  if (!SEE(')'))
459  p_ere(p, ')');
460  if (subno < NPAREN) {
461  p->pend[subno] = HERE();
462  assert(p->pend[subno] != 0);
463  }
464  EMIT(ORPAREN, subno);
465  MUSTEAT(')', REG_EPAREN);
466  break;
467 #ifndef POSIX_MISTAKE
468  case ')': /* happens only if no current unmatched ( */
469  /*
470  * You may ask, why the ifndef? Because I didn't notice
471  * this until slightly too late for 1003.2, and none of the
472  * other 1003.2 regular-expression reviewers noticed it at
473  * all. So an unmatched ) is legal POSIX, at least until
474  * we can get it fixed.
475  */
477  break;
478 #endif
479  case '^':
480  EMIT(OBOL, 0);
481  p->g->iflags |= USEBOL;
482  p->g->nbol++;
483  wascaret = 1;
484  break;
485  case '$':
486  EMIT(OEOL, 0);
487  p->g->iflags |= USEEOL;
488  p->g->neol++;
489  break;
490  case '|':
492  break;
493  case '*':
494  case '+':
495  case '?':
497  break;
498  case '.':
499  if (p->g->cflags&REG_NEWLINE)
500  nonnewline(p);
501  else
502  EMIT(OANY, 0);
503  break;
504  case '[':
505  p_bracket(p);
506  break;
507  case '\\':
509  c = GETNEXT();
510  if (c >= '1' && c <= '9') {
511  /* \[0-9] is taken to be a back-reference to a previously specified
512  * matching group. backrefnum will hold the number. The matching
513  * group must exist (i.e. if \4 is found there must have been at
514  * least 4 matching groups specified in the pattern previously).
515  */
516  backrefnum = c - '0';
517  if (p->pend[backrefnum] == 0) {
519  break;
520  }
521 
522  /* Make sure everything checks out and emit the sequence
523  * that marks a back-reference to the parse structure.
524  */
525  assert(backrefnum <= p->g->nsub);
526  EMIT(OBACK_, backrefnum);
527  assert(p->pbegin[backrefnum] != 0);
528  assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
529  assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
530  (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
531  EMIT(O_BACK, backrefnum);
532  p->g->backrefs = 1;
533  } else {
534  /* Other chars are simply themselves when escaped with a backslash.
535  */
536  ordinary(p, c);
537  }
538  break;
539  case '{': /* okay as ordinary except if digit follows */
540  REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
542  default:
543  ordinary(p, c);
544  break;
545  }
546 
547  if (!MORE())
548  return;
549  c = PEEK();
550  /* we call { a repetition if followed by a digit */
551  if (!( c == '*' || c == '+' || c == '?' ||
552  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
553  return; /* no repetition, we're done */
554  NEXT();
555 
556  REQUIRE(!wascaret, REG_BADRPT);
557  switch (c) {
558  case '*': /* implemented as +? */
559  /* this case does not require the (y|) trick, noKLUDGE */
560  INSERT(OPLUS_, pos);
561  ASTERN(O_PLUS, pos);
562  INSERT(OQUEST_, pos);
563  ASTERN(O_QUEST, pos);
564  break;
565  case '+':
566  INSERT(OPLUS_, pos);
567  ASTERN(O_PLUS, pos);
568  break;
569  case '?':
570  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
571  INSERT(OCH_, pos); /* offset slightly wrong */
572  ASTERN(OOR1, pos); /* this one's right */
573  AHEAD(pos); /* fix the OCH_ */
574  EMIT(OOR2, 0); /* offset very wrong... */
575  AHEAD(THERE()); /* ...so fix it */
576  ASTERN(O_CH, THERETHERE());
577  break;
578  case '{':
579  count = p_count(p);
580  if (EAT(',')) {
581  if (isdigit((uch)PEEK())) {
582  count2 = p_count(p);
583  REQUIRE(count <= count2, REG_BADBR);
584  } else /* single number with comma */
585  count2 = INFINITY;
586  } else /* just a single number */
587  count2 = count;
588  repeat(p, pos, count, count2);
589  if (!EAT('}')) { /* error heuristics */
590  while (MORE() && PEEK() != '}')
591  NEXT();
592  REQUIRE(MORE(), REG_EBRACE);
594  }
595  break;
596  }
597 
598  if (!MORE())
599  return;
600  c = PEEK();
601  if (!( c == '*' || c == '+' || c == '?' ||
602  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
603  return;
605 }
606 
607 /*
608  - p_str - string (no metacharacters) "parser"
609  */
610 static void
611 p_str(struct parse *p)
612 {
613  REQUIRE(MORE(), REG_EMPTY);
614  while (MORE())
615  ordinary(p, GETNEXT());
616 }
617 
618 /*
619  - p_bre - BRE parser top level, anchoring and concatenation
620  * Giving end1 as OUT essentially eliminates the end1/end2 check.
621  *
622  * This implementation is a bit of a kludge, in that a trailing $ is first
623  * taken as an ordinary character and then revised to be an anchor. The
624  * only undesirable side effect is that '$' gets included as a character
625  * category in such cases. This is fairly harmless; not worth fixing.
626  * The amount of lookahead needed to avoid this kludge is excessive.
627  */
628 static void
629 p_bre(struct parse *p,
630  int end1, /* first terminating character */
631  int end2) /* second terminating character */
632 {
633  sopno start = HERE();
634  int first = 1; /* first subexpression? */
635  int wasdollar = 0;
636 
637  if (EAT('^')) {
638  EMIT(OBOL, 0);
639  p->g->iflags |= USEBOL;
640  p->g->nbol++;
641  }
642  while (MORE() && !SEETWO(end1, end2)) {
643  wasdollar = p_simp_re(p, first);
644  first = 0;
645  }
646  if (wasdollar) { /* oops, that was a trailing anchor */
647  DROP(1);
648  EMIT(OEOL, 0);
649  p->g->iflags |= USEEOL;
650  p->g->neol++;
651  }
652 
653  REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
654 }
655 
656 /*
657  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
658  */
659 static int /* was the simple RE an unbackslashed $? */
660 p_simp_re(struct parse *p,
661  int starordinary) /* is a leading * an ordinary character? */
662 {
663  int c;
664  int count;
665  int count2;
666  sopno pos;
667  int i;
668  sopno subno;
669 # define BACKSL (1<<CHAR_BIT)
670 
671  pos = HERE(); /* repetition op, if any, covers from here */
672 
673  assert(MORE()); /* caller should have ensured this */
674  c = GETNEXT();
675  if (c == '\\') {
677  c = BACKSL | GETNEXT();
678  }
679  switch (c) {
680  case '.':
681  if (p->g->cflags&REG_NEWLINE)
682  nonnewline(p);
683  else
684  EMIT(OANY, 0);
685  break;
686  case '[':
687  p_bracket(p);
688  break;
689  case BACKSL|'{':
691  break;
692  case BACKSL|'(':
693  p->g->nsub++;
694  subno = p->g->nsub;
695  if (subno < NPAREN)
696  p->pbegin[subno] = HERE();
697  EMIT(OLPAREN, subno);
698  /* the MORE here is an error heuristic */
699  if (MORE() && !SEETWO('\\', ')'))
700  p_bre(p, '\\', ')');
701  if (subno < NPAREN) {
702  p->pend[subno] = HERE();
703  assert(p->pend[subno] != 0);
704  }
705  EMIT(ORPAREN, subno);
706  REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
707  break;
708  case BACKSL|')': /* should not get here -- must be user */
709  case BACKSL|'}':
711  break;
712  case BACKSL|'1':
713  case BACKSL|'2':
714  case BACKSL|'3':
715  case BACKSL|'4':
716  case BACKSL|'5':
717  case BACKSL|'6':
718  case BACKSL|'7':
719  case BACKSL|'8':
720  case BACKSL|'9':
721  i = (c&~BACKSL) - '0';
722  assert(i < NPAREN);
723  if (p->pend[i] != 0) {
724  assert(i <= p->g->nsub);
725  EMIT(OBACK_, i);
726  assert(p->pbegin[i] != 0);
727  assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
728  assert(OP(p->strip[p->pend[i]]) == ORPAREN);
729  (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
730  EMIT(O_BACK, i);
731  } else
733  p->g->backrefs = 1;
734  break;
735  case '*':
736  REQUIRE(starordinary, REG_BADRPT);
738  default:
739  ordinary(p, (char)c);
740  break;
741  }
742 
743  if (EAT('*')) { /* implemented as +? */
744  /* this case does not require the (y|) trick, noKLUDGE */
745  INSERT(OPLUS_, pos);
746  ASTERN(O_PLUS, pos);
747  INSERT(OQUEST_, pos);
748  ASTERN(O_QUEST, pos);
749  } else if (EATTWO('\\', '{')) {
750  count = p_count(p);
751  if (EAT(',')) {
752  if (MORE() && isdigit((uch)PEEK())) {
753  count2 = p_count(p);
754  REQUIRE(count <= count2, REG_BADBR);
755  } else /* single number with comma */
756  count2 = INFINITY;
757  } else /* just a single number */
758  count2 = count;
759  repeat(p, pos, count, count2);
760  if (!EATTWO('\\', '}')) { /* error heuristics */
761  while (MORE() && !SEETWO('\\', '}'))
762  NEXT();
763  REQUIRE(MORE(), REG_EBRACE);
765  }
766  } else if (c == '$') /* $ (but not \$) ends it */
767  return(1);
768 
769  return(0);
770 }
771 
772 /*
773  - p_count - parse a repetition count
774  */
775 static int /* the value */
776 p_count(struct parse *p)
777 {
778  int count = 0;
779  int ndigits = 0;
780 
781  while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
782  count = count*10 + (GETNEXT() - '0');
783  ndigits++;
784  }
785 
786  REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
787  return(count);
788 }
789 
790 /*
791  - p_bracket - parse a bracketed character list
792  *
793  * Note a significant property of this code: if the allocset() did SETERROR,
794  * no set operations are done.
795  */
796 static void
797 p_bracket(struct parse *p)
798 {
799  cset *cs;
800  int invert = 0;
801 
802  /* Dept of Truly Sickening Special-Case Kludges */
803  if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
804  EMIT(OBOW, 0);
805  NEXTn(6);
806  return;
807  }
808  if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
809  EMIT(OEOW, 0);
810  NEXTn(6);
811  return;
812  }
813 
814  if ((cs = allocset(p)) == NULL) {
815  /* allocset did set error status in p */
816  return;
817  }
818 
819  if (EAT('^'))
820  invert++; /* make note to invert set at end */
821  if (EAT(']'))
822  CHadd(cs, ']');
823  else if (EAT('-'))
824  CHadd(cs, '-');
825  while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
826  p_b_term(p, cs);
827  if (EAT('-'))
828  CHadd(cs, '-');
829  MUSTEAT(']', REG_EBRACK);
830 
831  if (p->error != 0) { /* don't mess things up further */
832  freeset(p, cs);
833  return;
834  }
835 
836  if (p->g->cflags&REG_ICASE) {
837  int i;
838  int ci;
839 
840  for (i = p->g->csetsize - 1; i >= 0; i--)
841  if (CHIN(cs, i) && isalpha(i)) {
842  ci = othercase(i);
843  if (ci != i)
844  CHadd(cs, ci);
845  }
846  if (cs->multis != NULL)
847  mccase(p, cs);
848  }
849  if (invert) {
850  int i;
851 
852  for (i = p->g->csetsize - 1; i >= 0; i--)
853  if (CHIN(cs, i))
854  CHsub(cs, i);
855  else
856  CHadd(cs, i);
857  if (p->g->cflags&REG_NEWLINE)
858  CHsub(cs, '\n');
859  if (cs->multis != NULL)
860  mcinvert(p, cs);
861  }
862 
863  assert(cs->multis == NULL); /* xxx */
864 
865  if (nch(p, cs) == 1) { /* optimize singleton sets */
866  ordinary(p, firstch(p, cs));
867  freeset(p, cs);
868  } else
869  EMIT(OANYOF, freezeset(p, cs));
870 }
871 
872 /*
873  - p_b_term - parse one term of a bracketed character list
874  */
875 static void
876 p_b_term(struct parse *p, cset *cs)
877 {
878  char c;
879  char start, finish;
880  int i;
881 
882  /* classify what we've got */
883  switch ((MORE()) ? PEEK() : '\0') {
884  case '[':
885  c = (MORE2()) ? PEEK2() : '\0';
886  break;
887  case '-':
889  return; /* NOTE RETURN */
890  break;
891  default:
892  c = '\0';
893  break;
894  }
895 
896  switch (c) {
897  case ':': /* character class */
898  NEXT2();
899  REQUIRE(MORE(), REG_EBRACK);
900  c = PEEK();
901  REQUIRE(c != '-' && c != ']', REG_ECTYPE);
902  p_b_cclass(p, cs);
903  REQUIRE(MORE(), REG_EBRACK);
904  REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
905  break;
906  case '=': /* equivalence class */
907  NEXT2();
908  REQUIRE(MORE(), REG_EBRACK);
909  c = PEEK();
910  REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
911  p_b_eclass(p, cs);
912  REQUIRE(MORE(), REG_EBRACK);
913  REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
914  break;
915  default: /* symbol, ordinary character, or range */
916 /* xxx revision needed for multichar stuff */
917  start = p_b_symbol(p);
918  if (SEE('-') && MORE2() && PEEK2() != ']') {
919  /* range */
920  NEXT();
921  if (EAT('-'))
922  finish = '-';
923  else
924  finish = p_b_symbol(p);
925  } else
926  finish = start;
927 /* xxx what about signed chars here... */
928  REQUIRE(start <= finish, REG_ERANGE);
929  for (i = start; i <= finish; i++)
930  CHadd(cs, i);
931  break;
932  }
933 }
934 
935 /*
936  - p_b_cclass - parse a character-class name and deal with it
937  */
938 static void
939 p_b_cclass(struct parse *p, cset *cs)
940 {
941  char *sp = p->next;
942  struct cclass *cp;
943  size_t len;
944  const char *u;
945  char c;
946 
947  while (MORE() && isalpha((uch)PEEK()))
948  NEXT();
949  len = p->next - sp;
950  for (cp = cclasses; cp->name != NULL; cp++)
951  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
952  break;
953  if (cp->name == NULL) {
954  /* oops, didn't find it */
956  return;
957  }
958 
959  u = cp->chars;
960  while ((c = *u++) != '\0')
961  CHadd(cs, c);
962  for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
963  MCadd(p, cs, u);
964 }
965 
966 /*
967  - p_b_eclass - parse an equivalence-class name and deal with it
968  *
969  * This implementation is incomplete. xxx
970  */
971 static void
972 p_b_eclass(struct parse *p, cset *cs)
973 {
974  char c;
975 
976  c = p_b_coll_elem(p, '=');
977  CHadd(cs, c);
978 }
979 
980 /*
981  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
982  */
983 static char /* value of symbol */
985 {
986  char value;
987 
988  REQUIRE(MORE(), REG_EBRACK);
989  if (!EATTWO('[', '.'))
990  return(GETNEXT());
991 
992  /* collating symbol */
993  value = p_b_coll_elem(p, '.');
994  REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
995  return(value);
996 }
997 
998 /*
999  - p_b_coll_elem - parse a collating-element name and look it up
1000  */
1001 static char /* value of collating element */
1003  int endc) /* name ended by endc,']' */
1004 {
1005  char *sp = p->next;
1006  struct cname *cp;
1007  size_t len;
1008 
1009  while (MORE() && !SEETWO(endc, ']'))
1010  NEXT();
1011  if (!MORE()) {
1013  return(0);
1014  }
1015  len = p->next - sp;
1016  for (cp = cnames; cp->name != NULL; cp++)
1017  if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1018  return(cp->code); /* known name */
1019  if (len == 1)
1020  return(*sp); /* single character */
1021  SETERROR(REG_ECOLLATE); /* neither */
1022  return(0);
1023 }
1024 
1025 /*
1026  - othercase - return the case counterpart of an alphabetic
1027  */
1028 static char /* if no counterpart, return ch */
1029 othercase(int ch)
1030 {
1031  ch = (uch)ch;
1032  assert(isalpha(ch));
1033  if (isupper(ch))
1034  return ((uch)tolower(ch));
1035  else if (islower(ch))
1036  return ((uch)toupper(ch));
1037  else /* peculiar, but could happen */
1038  return(ch);
1039 }
1040 
1041 /*
1042  - bothcases - emit a dualcase version of a two-case character
1043  *
1044  * Boy, is this implementation ever a kludge...
1045  */
1046 static void
1047 bothcases(struct parse *p, int ch)
1048 {
1049  char *oldnext = p->next;
1050  char *oldend = p->end;
1051  char bracket[3];
1052 
1053  ch = (uch)ch;
1054  assert(othercase(ch) != ch); /* p_bracket() would recurse */
1055  p->next = bracket;
1056  p->end = bracket+2;
1057  bracket[0] = ch;
1058  bracket[1] = ']';
1059  bracket[2] = '\0';
1060  p_bracket(p);
1061  assert(p->next == bracket+2);
1062  p->next = oldnext;
1063  p->end = oldend;
1064 }
1065 
1066 /*
1067  - ordinary - emit an ordinary character
1068  */
1069 static void
1070 ordinary(struct parse *p, int ch)
1071 {
1072  cat_t *cap = p->g->categories;
1073 
1074  if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1075  bothcases(p, ch);
1076  else {
1077  EMIT(OCHAR, (uch)ch);
1078  if (cap[ch] == 0)
1079  cap[ch] = p->g->ncategories++;
1080  }
1081 }
1082 
1083 /*
1084  - nonnewline - emit REG_NEWLINE version of OANY
1085  *
1086  * Boy, is this implementation ever a kludge...
1087  */
1088 static void
1090 {
1091  char *oldnext = p->next;
1092  char *oldend = p->end;
1093  char bracket[4];
1094 
1095  p->next = bracket;
1096  p->end = bracket+3;
1097  bracket[0] = '^';
1098  bracket[1] = '\n';
1099  bracket[2] = ']';
1100  bracket[3] = '\0';
1101  p_bracket(p);
1102  assert(p->next == bracket+3);
1103  p->next = oldnext;
1104  p->end = oldend;
1105 }
1106 
1107 /*
1108  - repeat - generate code for a bounded repetition, recursively if needed
1109  */
1110 static void
1111 repeat(struct parse *p,
1112  sopno start, /* operand from here to end of strip */
1113  int from, /* repeated from this number */
1114  int to) /* to this number of times (maybe INFINITY) */
1115 {
1116  sopno finish = HERE();
1117 # define N 2
1118 # define INF 3
1119 # define REP(f, t) ((f)*8 + (t))
1120 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1121  sopno copy;
1122 
1123  if (p->error != 0) /* head off possible runaway recursion */
1124  return;
1125 
1126  assert(from <= to);
1127 
1128  switch (REP(MAP(from), MAP(to))) {
1129  case REP(0, 0): /* must be user doing this */
1130  DROP(finish-start); /* drop the operand */
1131  break;
1132  case REP(0, 1): /* as x{1,1}? */
1133  case REP(0, N): /* as x{1,n}? */
1134  case REP(0, INF): /* as x{1,}? */
1135  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1136  INSERT(OCH_, start); /* offset is wrong... */
1137  repeat(p, start+1, 1, to);
1138  ASTERN(OOR1, start);
1139  AHEAD(start); /* ... fix it */
1140  EMIT(OOR2, 0);
1141  AHEAD(THERE());
1142  ASTERN(O_CH, THERETHERE());
1143  break;
1144  case REP(1, 1): /* trivial case */
1145  /* done */
1146  break;
1147  case REP(1, N): /* as x?x{1,n-1} */
1148  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1149  INSERT(OCH_, start);
1150  ASTERN(OOR1, start);
1151  AHEAD(start);
1152  EMIT(OOR2, 0); /* offset very wrong... */
1153  AHEAD(THERE()); /* ...so fix it */
1154  ASTERN(O_CH, THERETHERE());
1155  copy = dupl(p, start+1, finish+1);
1156  assert(copy == finish+4);
1157  repeat(p, copy, 1, to-1);
1158  break;
1159  case REP(1, INF): /* as x+ */
1160  INSERT(OPLUS_, start);
1161  ASTERN(O_PLUS, start);
1162  break;
1163  case REP(N, N): /* as xx{m-1,n-1} */
1164  copy = dupl(p, start, finish);
1165  repeat(p, copy, from-1, to-1);
1166  break;
1167  case REP(N, INF): /* as xx{n-1,INF} */
1168  copy = dupl(p, start, finish);
1169  repeat(p, copy, from-1, to);
1170  break;
1171  default: /* "can't happen" */
1172  SETERROR(REG_ASSERT); /* just in case */
1173  break;
1174  }
1175 }
1176 
1177 /*
1178  - seterr - set an error condition
1179  */
1180 static int /* useless but makes type checking happy */
1181 seterr(struct parse *p, int e)
1182 {
1183  if (p->error == 0) /* keep earliest error condition */
1184  p->error = e;
1185  p->next = nuls; /* try to bring things to a halt */
1186  p->end = nuls;
1187  return(0); /* make the return value well-defined */
1188 }
1189 
1190 /*
1191  - allocset - allocate a set of characters for []
1192  */
1193 static cset *
1194 allocset(struct parse *p)
1195 {
1196  int no = p->g->ncsets++;
1197  size_t nc;
1198  size_t nbytes;
1199  cset *cs;
1200  size_t css = (size_t)p->g->csetsize;
1201  int i;
1202 
1203  if (no >= p->ncsalloc) { /* need another column of space */
1204  void *ptr;
1205 
1206  p->ncsalloc += CHAR_BIT;
1207  nc = p->ncsalloc;
1208  if (nc > SIZE_MAX / sizeof(cset))
1209  goto nomem;
1210  assert(nc % CHAR_BIT == 0);
1211  nbytes = nc / CHAR_BIT * css;
1212 
1213  ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1214  if (ptr == NULL)
1215  goto nomem;
1216  p->g->sets = ptr;
1217 
1218  ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1219  if (ptr == NULL)
1220  goto nomem;
1221  p->g->setbits = ptr;
1222 
1223  for (i = 0; i < no; i++)
1224  p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1225 
1226  (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1227  }
1228  /* XXX should not happen */
1229  if (p->g->sets == NULL || p->g->setbits == NULL)
1230  goto nomem;
1231 
1232  cs = &p->g->sets[no];
1233  cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1234  cs->mask = 1 << ((no) % CHAR_BIT);
1235  cs->hash = 0;
1236  cs->smultis = 0;
1237  cs->multis = NULL;
1238 
1239  return(cs);
1240 nomem:
1241  free(p->g->sets);
1242  p->g->sets = NULL;
1243  free(p->g->setbits);
1244  p->g->setbits = NULL;
1245 
1247  /* caller's responsibility not to do set ops */
1248  return(NULL);
1249 }
1250 
1251 /*
1252  - freeset - free a now-unused set
1253  */
1254 static void
1255 freeset(struct parse *p, cset *cs)
1256 {
1257  size_t i;
1258  cset *top = &p->g->sets[p->g->ncsets];
1259  size_t css = (size_t)p->g->csetsize;
1260 
1261  for (i = 0; i < css; i++)
1262  CHsub(cs, i);
1263  if (cs == top-1) /* recover only the easy case */
1264  p->g->ncsets--;
1265 }
1266 
1267 /*
1268  - freezeset - final processing on a set of characters
1269  *
1270  * The main task here is merging identical sets. This is usually a waste
1271  * of time (although the hash code minimizes the overhead), but can win
1272  * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1273  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1274  * the same value!
1275  */
1276 static int /* set number */
1277 freezeset(struct parse *p, cset *cs)
1278 {
1279  uch h = cs->hash;
1280  size_t i;
1281  cset *top = &p->g->sets[p->g->ncsets];
1282  cset *cs2;
1283  size_t css = (size_t)p->g->csetsize;
1284 
1285  /* look for an earlier one which is the same */
1286  for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1287  if (cs2->hash == h && cs2 != cs) {
1288  /* maybe */
1289  for (i = 0; i < css; i++)
1290  if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1291  break; /* no */
1292  if (i == css)
1293  break; /* yes */
1294  }
1295 
1296  if (cs2 < top) { /* found one */
1297  freeset(p, cs);
1298  cs = cs2;
1299  }
1300 
1301  return((int)(cs - p->g->sets));
1302 }
1303 
1304 /*
1305  - firstch - return first character in a set (which must have at least one)
1306  */
1307 static int /* character; there is no "none" value */
1308 firstch(struct parse *p, cset *cs)
1309 {
1310  size_t i;
1311  size_t css = (size_t)p->g->csetsize;
1312 
1313  for (i = 0; i < css; i++)
1314  if (CHIN(cs, i))
1315  return((char)i);
1316  assert(never);
1317  return(0); /* arbitrary */
1318 }
1319 
1320 /*
1321  - nch - number of characters in a set
1322  */
1323 static int
1324 nch(struct parse *p, cset *cs)
1325 {
1326  size_t i;
1327  size_t css = (size_t)p->g->csetsize;
1328  int n = 0;
1329 
1330  for (i = 0; i < css; i++)
1331  if (CHIN(cs, i))
1332  n++;
1333  return(n);
1334 }
1335 
1336 /*
1337  - mcadd - add a collating element to a cset
1338  */
1339 static void
1340 mcadd( struct parse *p, cset *cs, const char *cp)
1341 {
1342  size_t oldend = cs->smultis;
1343  void *np;
1344 
1345  cs->smultis += strlen(cp) + 1;
1346  np = realloc(cs->multis, cs->smultis);
1347  if (np == NULL) {
1348  if (cs->multis)
1349  free(cs->multis);
1350  cs->multis = NULL;
1352  return;
1353  }
1354  cs->multis = np;
1355 
1356  llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1357 }
1358 
1359 /*
1360  - mcinvert - invert the list of collating elements in a cset
1361  *
1362  * This would have to know the set of possibilities. Implementation
1363  * is deferred.
1364  */
1365 /* ARGSUSED */
1366 static void
1367 mcinvert(struct parse *p, cset *cs)
1368 {
1369  assert(cs->multis == NULL); /* xxx */
1370 }
1371 
1372 /*
1373  - mccase - add case counterparts of the list of collating elements in a cset
1374  *
1375  * This would have to know the set of possibilities. Implementation
1376  * is deferred.
1377  */
1378 /* ARGSUSED */
1379 static void
1380 mccase(struct parse *p, cset *cs)
1381 {
1382  assert(cs->multis == NULL); /* xxx */
1383 }
1384 
1385 /*
1386  - isinsets - is this character in any sets?
1387  */
1388 static int /* predicate */
1389 isinsets(struct re_guts *g, int c)
1390 {
1391  uch *col;
1392  int i;
1393  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1394  unsigned uc = (uch)c;
1395 
1396  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1397  if (col[uc] != 0)
1398  return(1);
1399  return(0);
1400 }
1401 
1402 /*
1403  - samesets - are these two characters in exactly the same sets?
1404  */
1405 static int /* predicate */
1406 samesets(struct re_guts *g, int c1, int c2)
1407 {
1408  uch *col;
1409  int i;
1410  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1411  unsigned uc1 = (uch)c1;
1412  unsigned uc2 = (uch)c2;
1413 
1414  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1415  if (col[uc1] != col[uc2])
1416  return(0);
1417  return(1);
1418 }
1419 
1420 /*
1421  - categorize - sort out character categories
1422  */
1423 static void
1424 categorize(struct parse *p, struct re_guts *g)
1425 {
1426  cat_t *cats = g->categories;
1427  int c;
1428  int c2;
1429  cat_t cat;
1430 
1431  /* avoid making error situations worse */
1432  if (p->error != 0)
1433  return;
1434 
1435  for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1436  if (cats[c] == 0 && isinsets(g, c)) {
1437  cat = g->ncategories++;
1438  cats[c] = cat;
1439  for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1440  if (cats[c2] == 0 && samesets(g, c, c2))
1441  cats[c2] = cat;
1442  }
1443 }
1444 
1445 /*
1446  - dupl - emit a duplicate of a bunch of sops
1447  */
1448 static sopno /* start of duplicate */
1449 dupl(struct parse *p,
1450  sopno start, /* from here */
1451  sopno finish) /* to this less one */
1452 {
1453  sopno ret = HERE();
1454  sopno len = finish - start;
1455 
1456  assert(finish >= start);
1457  if (len == 0)
1458  return(ret);
1459  enlarge(p, p->ssize + len); /* this many unexpected additions */
1460  assert(p->ssize >= p->slen + len);
1461  (void) memmove((char *)(p->strip + p->slen),
1462  (char *)(p->strip + start), (size_t)len*sizeof(sop));
1463  p->slen += len;
1464  return(ret);
1465 }
1466 
1467 /*
1468  - doemit - emit a strip operator
1469  *
1470  * It might seem better to implement this as a macro with a function as
1471  * hard-case backup, but it's just too big and messy unless there are
1472  * some changes to the data structures. Maybe later.
1473  */
1474 static void
1475 doemit(struct parse *p, sop op, size_t opnd)
1476 {
1477  /* avoid making error situations worse */
1478  if (p->error != 0)
1479  return;
1480 
1481  /* deal with oversize operands ("can't happen", more or less) */
1482  assert(opnd < 1<<OPSHIFT);
1483 
1484  /* deal with undersized strip */
1485  if (p->slen >= p->ssize)
1486  enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1487  assert(p->slen < p->ssize);
1488 
1489  /* finally, it's all reduced to the easy case */
1490  p->strip[p->slen++] = SOP(op, opnd);
1491 }
1492 
1493 /*
1494  - doinsert - insert a sop into the strip
1495  */
1496 static void
1497 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1498 {
1499  sopno sn;
1500  sop s;
1501  int i;
1502 
1503  /* avoid making error situations worse */
1504  if (p->error != 0)
1505  return;
1506 
1507  sn = HERE();
1508  EMIT(op, opnd); /* do checks, ensure space */
1509  assert(HERE() == sn+1);
1510  s = p->strip[sn];
1511 
1512  /* adjust paren pointers */
1513  assert(pos > 0);
1514  for (i = 1; i < NPAREN; i++) {
1515  if (p->pbegin[i] >= pos) {
1516  p->pbegin[i]++;
1517  }
1518  if (p->pend[i] >= pos) {
1519  p->pend[i]++;
1520  }
1521  }
1522 
1523  memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1524  (HERE()-pos-1)*sizeof(sop));
1525  p->strip[pos] = s;
1526 }
1527 
1528 /*
1529  - dofwd - complete a forward reference
1530  */
1531 static void
1532 dofwd(struct parse *p, sopno pos, sop value)
1533 {
1534  /* avoid making error situations worse */
1535  if (p->error != 0)
1536  return;
1537 
1538  assert(value < 1<<OPSHIFT);
1539  p->strip[pos] = OP(p->strip[pos]) | value;
1540 }
1541 
1542 /*
1543  - enlarge - enlarge the strip
1544  */
1545 static void
1547 {
1548  sop *sp;
1549 
1550  if (p->ssize >= size)
1551  return;
1552 
1553  if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1555  return;
1556  }
1557 
1558  sp = (sop *)realloc(p->strip, size*sizeof(sop));
1559  if (sp == NULL) {
1561  return;
1562  }
1563  p->strip = sp;
1564  p->ssize = size;
1565 }
1566 
1567 /*
1568  - stripsnug - compact the strip
1569  */
1570 static void
1571 stripsnug(struct parse *p, struct re_guts *g)
1572 {
1573  g->nstates = p->slen;
1574  if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1575  g->strip = p->strip;
1577  return;
1578  }
1579 
1580  g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1581  if (g->strip == NULL) {
1583  g->strip = p->strip;
1584  }
1585 }
1586 
1587 /*
1588  - findmust - fill in must and mlen with longest mandatory literal string
1589  *
1590  * This algorithm could do fancy things like analyzing the operands of |
1591  * for common subsequences. Someday. This code is simple and finds most
1592  * of the interesting cases.
1593  *
1594  * Note that must and mlen got initialized during setup.
1595  */
1596 static void
1597 findmust(struct parse *p, struct re_guts *g)
1598 {
1599  sop *scan;
1600  sop *start = 0; /* start initialized in the default case, after that */
1601  sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1602  sopno newlen;
1603  sop s;
1604  char *cp;
1605  sopno i;
1606 
1607  /* avoid making error situations worse */
1608  if (p->error != 0)
1609  return;
1610 
1611  /* find the longest OCHAR sequence in strip */
1612  newlen = 0;
1613  scan = g->strip + 1;
1614  do {
1615  s = *scan++;
1616  switch (OP(s)) {
1617  case OCHAR: /* sequence member */
1618  if (newlen == 0) /* new sequence */
1619  newstart = scan - 1;
1620  newlen++;
1621  break;
1622  case OPLUS_: /* things that don't break one */
1623  case OLPAREN:
1624  case ORPAREN:
1625  break;
1626  case OQUEST_: /* things that must be skipped */
1627  case OCH_:
1628  scan--;
1629  do {
1630  scan += OPND(s);
1631  s = *scan;
1632  /* assert() interferes w debug printouts */
1633  if (OP(s) != O_QUEST && OP(s) != O_CH &&
1634  OP(s) != OOR2) {
1635  g->iflags |= REGEX_BAD;
1636  return;
1637  }
1638  } while (OP(s) != O_QUEST && OP(s) != O_CH);
1640  default: /* things that break a sequence */
1641  if (newlen > g->mlen) { /* ends one */
1642  start = newstart;
1643  g->mlen = newlen;
1644  }
1645  newlen = 0;
1646  break;
1647  }
1648  } while (OP(s) != OEND);
1649 
1650  if (g->mlen == 0) /* there isn't one */
1651  return;
1652 
1653  /* turn it into a character string */
1654  g->must = malloc((size_t)g->mlen + 1);
1655  if (g->must == NULL) { /* argh; just forget it */
1656  g->mlen = 0;
1657  return;
1658  }
1659  cp = g->must;
1660  scan = start;
1661  for (i = g->mlen; i > 0; i--) {
1662  while (OP(s = *scan++) != OCHAR)
1663  continue;
1664  assert(cp < g->must + g->mlen);
1665  *cp++ = (char)OPND(s);
1666  }
1667  assert(cp == g->must + g->mlen);
1668  *cp++ = '\0'; /* just on general principles */
1669 }
1670 
1671 /*
1672  - pluscount - count + nesting
1673  */
1674 static sopno /* nesting depth */
1675 pluscount(struct parse *p, struct re_guts *g)
1676 {
1677  sop *scan;
1678  sop s;
1679  sopno plusnest = 0;
1680  sopno maxnest = 0;
1681 
1682  if (p->error != 0)
1683  return(0); /* there may not be an OEND */
1684 
1685  scan = g->strip + 1;
1686  do {
1687  s = *scan++;
1688  switch (OP(s)) {
1689  case OPLUS_:
1690  plusnest++;
1691  break;
1692  case O_PLUS:
1693  if (plusnest > maxnest)
1694  maxnest = plusnest;
1695  plusnest--;
1696  break;
1697  }
1698  } while (OP(s) != OEND);
1699  if (plusnest != 0)
1700  g->iflags |= REGEX_BAD;
1701  return(maxnest);
1702 }
REQUIRE
#define REQUIRE(co, e)
Definition: regcomp.c:263
i
i
Definition: README.txt:29
parse::next
char * next
Definition: regcomp.c:193
BACKSL
#define BACKSL
cset::multis
char * multis
Definition: regex2.h:116
REP
#define REP(f, t)
freezeset
static int freezeset(struct parse *, cset *)
Definition: regcomp.c:1277
scan
static Expected< BitVector > scan(StringRef &S, StringRef Original)
Definition: GlobPattern.cpp:67
mccase
static void mccase(struct parse *, cset *)
Definition: regcomp.c:1380
REG_ECOLLATE
#define REG_ECOLLATE
Definition: regex_impl.h:68
REGEX_BAD
#define REGEX_BAD
Definition: regex2.h:147
NEXT2
#define NEXT2()
Definition: regcomp.c:259
p_b_term
static void p_b_term(struct parse *, cset *)
Definition: regcomp.c:876
nonnewline
static void nonnewline(struct parse *)
Definition: regcomp.c:1089
cclasses
static struct cclass cclasses[]
c2
This might compile to this xmm1 xorps xmm0 movss xmm0 ret Now consider if the code caused xmm1 to get spilled This might produce this xmm1 movaps c2(%esp) ... xorps %xmm0
USEBOL
#define USEBOL
Definition: regex2.h:145
categorize
static void categorize(struct parse *, struct re_guts *)
Definition: regcomp.c:1424
mcinvert
static void mcinvert(struct parse *, cset *)
Definition: regcomp.c:1367
p_simp_re
static int p_simp_re(struct parse *, int)
Definition: regcomp.c:660
llvm_regex::re_endp
const char * re_endp
Definition: regex_impl.h:51
firstch
static int firstch(struct parse *, cset *)
Definition: regcomp.c:1308
regex_impl.h
parse::ncsalloc
int ncsalloc
Definition: regcomp.c:199
EAT
#define EAT(c)
Definition: regcomp.c:256
O_PLUS
#define O_PLUS
Definition: regex2.h:87
DUPMAX
#define DUPMAX
Definition: regcomp.c:279
ASTERN
#define ASTERN(sop, pos)
Definition: regcomp.c:270
allocset
static cset * allocset(struct parse *)
Definition: regcomp.c:1194
op
#define op(i)
parse::g
struct re_guts * g
Definition: regcomp.c:200
seterr
static int seterr(struct parse *, int)
Definition: regcomp.c:1181
samesets
static int samesets(struct re_guts *, int, int)
Definition: regcomp.c:1406
cname::name
const char * name
Definition: regcomp.c:87
USEEOL
#define USEEOL
Definition: regex2.h:146
THERE
#define THERE()
Definition: regcomp.c:272
stripsnug
static void stripsnug(struct parse *, struct re_guts *)
Definition: regcomp.c:1571
EMIT
#define EMIT(op, sopnd)
Definition: regcomp.c:267
to
Should compile to
Definition: README.txt:449
regex2.h
cclass
Definition: regcomp.c:54
findmust
static void findmust(struct parse *, struct re_guts *)
Definition: regcomp.c:1597
OANY
#define OANY
Definition: regex2.h:82
ret
to esp esp setne al movzbw ax esp setg cl movzbw cx cmove cx cl jne LBB1_2 esp ret(also really horrible code on ppc). This is due to the expand code for 64-bit compares. GCC produces multiple branches
GETNEXT
#define GETNEXT()
Definition: regcomp.c:261
llvm_regcomp
int llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
Definition: regcomp.c:293
llvm_regex
Definition: regex_impl.h:48
doemit
static void doemit(struct parse *, sop, size_t)
Definition: regcomp.c:1475
g
should just be implemented with a CLZ instruction Since there are other e g
Definition: README.txt:709
OCH_
#define OCH_
Definition: regex2.h:92
NEXT
#define NEXT()
Definition: regcomp.c:258
REG_ECTYPE
#define REG_ECTYPE
Definition: regex_impl.h:69
OBACK_
#define OBACK_
Definition: regex2.h:84
MAGIC2
#define MAGIC2
Definition: regex2.h:134
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
cat_t
unsigned char cat_t
Definition: regex2.h:127
REG_EBRACK
#define REG_EBRACK
Definition: regex_impl.h:72
OOR2
#define OOR2
Definition: regex2.h:94
size_t
re_guts
Definition: regex2.h:132
nch
static int nch(struct parse *, cset *)
Definition: regcomp.c:1324
CHsub
#define CHsub(cs, c)
Definition: regex2.h:120
REG_ICASE
#define REG_ICASE
Definition: regex_impl.h:58
llvm_regex::re_nsub
size_t re_nsub
Definition: regex_impl.h:50
cset::hash
uch hash
Definition: regex2.h:114
OBOW
#define OBOW
Definition: regex2.h:96
dofwd
static void dofwd(struct parse *, sopno, sop)
Definition: regcomp.c:1532
ORPAREN
#define ORPAREN
Definition: regex2.h:91
bothcases
static void bothcases(struct parse *, int)
Definition: regcomp.c:1047
OEOW
#define OEOW
Definition: regex2.h:97
sopno
long sopno
Definition: regex2.h:69
OANYOF
#define OANYOF
Definition: regex2.h:83
p_ere
static void p_ere(struct parse *, int)
Definition: regcomp.c:393
O_CH
#define O_CH
Definition: regex2.h:95
REG_ERANGE
#define REG_ERANGE
Definition: regex_impl.h:76
NEXTn
#define NEXTn(n)
Definition: regcomp.c:260
OUT
#define OUT
Definition: regex2.h:162
cname::code
char code
Definition: regcomp.c:88
AHEAD
#define AHEAD(pos)
Definition: regcomp.c:269
parse
Definition: regcomp.c:192
OCHAR
#define OCHAR
Definition: regex2.h:79
cname
Definition: regcomp.c:86
REG_EESCAPE
#define REG_EESCAPE
Definition: regex_impl.h:70
HERE
#define HERE()
Definition: regcomp.c:271
NPAREN
#define NPAREN
Definition: regcomp.c:201
sop
unsigned long sop
Definition: regex2.h:68
parse::pend
sopno pend[NPAREN]
Definition: regcomp.c:203
enlarge
static void enlarge(struct parse *, sopno)
Definition: regcomp.c:1546
OLPAREN
#define OLPAREN
Definition: regex2.h:90
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
uch
unsigned char uch
Definition: regutils.h:43
llvm_regex::re_g
struct re_guts * re_g
Definition: regex_impl.h:52
llvm_strlcpy
size_t llvm_strlcpy(char *dst, const char *src, size_t siz)
Definition: regstrlcpy.c:29
SETERROR
#define SETERROR(e)
Definition: regcomp.c:262
cnames
static struct cname cnames[]
INFINITY
#define INFINITY
Definition: regcomp.c:281
OBOL
#define OBOL
Definition: regex2.h:80
llvm_regex::re_magic
int re_magic
Definition: regex_impl.h:49
REG_BADRPT
#define REG_BADRPT
Definition: regex_impl.h:78
parse::slen
sopno slen
Definition: regcomp.c:198
nuls
static char nuls[10]
Definition: regcomp.c:244
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1683
CHIN
#define CHIN(cs, c)
Definition: regex2.h:121
llvm_regfree
void llvm_regfree(llvm_regex_t *)
Definition: regfree.c:50
s
multiplies can be turned into SHL s
Definition: README.txt:370
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
SEETWO
#define SEETWO(a, b)
Definition: regcomp.c:255
REG_NEWLINE
#define REG_NEWLINE
Definition: regex_impl.h:60
parse::strip
sop * strip
Definition: regcomp.c:196
O_BACK
#define O_BACK
Definition: regex2.h:85
size
i< reg-> size
Definition: README.txt:166
REG_EXTENDED
#define REG_EXTENDED
Definition: regex_impl.h:57
p_b_cclass
static void p_b_cclass(struct parse *, cset *)
Definition: regcomp.c:939
mcadd
static void mcadd(struct parse *, cset *, const char *)
Definition: regcomp.c:1340
re_guts::cflags
int cflags
Definition: regex2.h:140
DROP
#define DROP(n)
Definition: regcomp.c:274
CHadd
#define CHadd(cs, c)
Definition: regex2.h:119
REG_ESUBREG
#define REG_ESUBREG
Definition: regex_impl.h:71
GOODFLAGS
#define GOODFLAGS(f)
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
EATTWO
#define EATTWO(a, b)
Definition: regcomp.c:257
doinsert
static void doinsert(struct parse *, sop, size_t, sopno)
Definition: regcomp.c:1497
cset::ptr
uch * ptr
Definition: regex2.h:112
cset::smultis
size_t smultis
Definition: regex2.h:115
SOP
#define SOP(op, opnd)
Definition: regex2.h:75
REG_ASSERT
#define REG_ASSERT
Definition: regex_impl.h:80
OP
#define OP(n)
Definition: regex2.h:73
parse::ssize
sopno ssize
Definition: regcomp.c:197
parse::error
int error
Definition: regcomp.c:195
O_QUEST
#define O_QUEST
Definition: regex2.h:89
cset
Definition: regex2.h:111
MORE2
#define MORE2()
Definition: regcomp.c:253
MAP
#define MAP(n)
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
Compiler.h
SEE
#define SEE(c)
Definition: regcomp.c:254
OEOL
#define OEOL
Definition: regex2.h:81
p_count
static int p_count(struct parse *)
Definition: regcomp.c:776
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:286
for
this could be done in SelectionDAGISel along with other special for
Definition: README.txt:104
freeset
static void freeset(struct parse *, cset *)
Definition: regcomp.c:1255
MORE
#define MORE()
Definition: regcomp.c:252
PEEK2
#define PEEK2()
Definition: regcomp.c:251
cset::mask
uch mask
Definition: regex2.h:113
sp
_bar sp
Definition: README.txt:151
OPSHIFT
#define OPSHIFT
Definition: regex2.h:72
p_bracket
static void p_bracket(struct parse *)
Definition: regcomp.c:797
p_b_symbol
static char p_b_symbol(struct parse *)
Definition: regcomp.c:984
parse::end
char * end
Definition: regcomp.c:194
p_b_eclass
static void p_b_eclass(struct parse *, cset *)
Definition: regcomp.c:972
ordinary
static void ordinary(struct parse *, int)
Definition: regcomp.c:1070
REG_ESPACE
#define REG_ESPACE
Definition: regex_impl.h:77
OPND
#define OPND(n)
Definition: regex2.h:74
cclass::chars
const char * chars
Definition: regcomp.c:56
p_bre
static void p_bre(struct parse *, int, int)
Definition: regcomp.c:629
NC
#define NC
Definition: regutils.h:42
othercase
static char othercase(int)
Definition: regcomp.c:1029
REG_NOSPEC
#define REG_NOSPEC
Definition: regex_impl.h:61
REG_PEND
#define REG_PEND
Definition: regex_impl.h:62
OOR1
#define OOR1
Definition: regex2.h:93
REG_EMPTY
#define REG_EMPTY
Definition: regex_impl.h:79
never
#define never
Definition: regcomp.c:286
cclass::name
const char * name
Definition: regcomp.c:55
cclass::multis
const char * multis
Definition: regcomp.c:57
p_b_coll_elem
static char p_b_coll_elem(struct parse *, int)
Definition: regcomp.c:1002
regutils.h
p_str
static void p_str(struct parse *)
Definition: regcomp.c:611
THERETHERE
#define THERETHERE()
Definition: regcomp.c:273
REG_INVARG
#define REG_INVARG
Definition: regex_impl.h:81
N
#define N
dupl
static sopno dupl(struct parse *, sopno, sopno)
Definition: regcomp.c:1449
repeat
static void repeat(struct parse *, sopno, int, int)
Definition: regcomp.c:1111
OEND
#define OEND
Definition: regex2.h:78
REG_EBRACE
#define REG_EBRACE
Definition: regex_impl.h:74
h
the multiplication has a latency of four as opposed to two cycles for the movl lea variant It appears gcc place string data with linkonce linkage in section coalesced instead of section coalesced Take a look at darwin h
Definition: README.txt:261
INSERT
#define INSERT(op, pos)
Definition: regcomp.c:268
pluscount
static sopno pluscount(struct parse *, struct re_guts *)
Definition: regcomp.c:1675
MCadd
#define MCadd(p, cs, cp)
Definition: regex2.h:122
MUSTEAT
#define MUSTEAT(c, e)
Definition: regcomp.c:265
REG_BADBR
#define REG_BADBR
Definition: regex_impl.h:75
OPLUS_
#define OPLUS_
Definition: regex2.h:86
n
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality for n
Definition: README.txt:685
p_ere_exp
static void p_ere_exp(struct parse *)
Definition: regcomp.c:436
MAGIC1
#define MAGIC1
Definition: regex2.h:47
copy
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned which should consider preserving the returned attribute through to MachineInstrs and extending the MemIntrinsicResults pass to do this optimization on calls too That would also let the WebAssemblyPeephole pass clean up dead defs for such as it does for stores Consider implementing and or getMachineCombinerPatterns Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass When setting multiple variables to the same we currently get code like const It could be done with a smaller encoding like local tee $pop5 local copy
Definition: README.txt:101
isinsets
static int isinsets(struct re_guts *, int)
Definition: regcomp.c:1389
parse::pbegin
sopno pbegin[NPAREN]
Definition: regcomp.c:202
INF
#define INF
PEEK
#define PEEK()
Definition: regcomp.c:250
OQUEST_
#define OQUEST_
Definition: regex2.h:88
REG_EPAREN
#define REG_EPAREN
Definition: regex_impl.h:73