LLVM  15.0.0git
regcomp.c
Go to the documentation of this file.
1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  * The Regents of the University of California. All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  * may be used to endorse or promote products derived from this software
21  * without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
36  */
37 
38 #include <sys/types.h>
39 #include <stdint.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <stdlib.h>
45 #include "regex_impl.h"
46 
47 #include "regutils.h"
48 #include "regex2.h"
49 
50 #include "llvm/Config/config.h"
51 #include "llvm/Support/Compiler.h"
52 
53 /* character-class table */
54 static struct cclass {
55  const char *name;
56  const char *chars;
57  const char *multis;
58 } cclasses[] = {
59  { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
60 0123456789", ""} ,
61  { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
62  ""} ,
63  { "blank", " \t", ""} ,
64  { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
65 \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
66  { "digit", "0123456789", ""} ,
67  { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
68 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
69  ""} ,
70  { "lower", "abcdefghijklmnopqrstuvwxyz",
71  ""} ,
72  { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
73 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
74  ""} ,
75  { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
76  ""} ,
77  { "space", "\t\n\v\f\r ", ""} ,
78  { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
79  ""} ,
80  { "xdigit", "0123456789ABCDEFabcdef",
81  ""} ,
82  { NULL, 0, "" }
83 };
84 
85 /* character-name table */
86 static struct cname {
87  const char *name;
88  char code;
89 } cnames[] = {
90  { "NUL", '\0' },
91  { "SOH", '\001' },
92  { "STX", '\002' },
93  { "ETX", '\003' },
94  { "EOT", '\004' },
95  { "ENQ", '\005' },
96  { "ACK", '\006' },
97  { "BEL", '\007' },
98  { "alert", '\007' },
99  { "BS", '\010' },
100  { "backspace", '\b' },
101  { "HT", '\011' },
102  { "tab", '\t' },
103  { "LF", '\012' },
104  { "newline", '\n' },
105  { "VT", '\013' },
106  { "vertical-tab", '\v' },
107  { "FF", '\014' },
108  { "form-feed", '\f' },
109  { "CR", '\015' },
110  { "carriage-return", '\r' },
111  { "SO", '\016' },
112  { "SI", '\017' },
113  { "DLE", '\020' },
114  { "DC1", '\021' },
115  { "DC2", '\022' },
116  { "DC3", '\023' },
117  { "DC4", '\024' },
118  { "NAK", '\025' },
119  { "SYN", '\026' },
120  { "ETB", '\027' },
121  { "CAN", '\030' },
122  { "EM", '\031' },
123  { "SUB", '\032' },
124  { "ESC", '\033' },
125  { "IS4", '\034' },
126  { "FS", '\034' },
127  { "IS3", '\035' },
128  { "GS", '\035' },
129  { "IS2", '\036' },
130  { "RS", '\036' },
131  { "IS1", '\037' },
132  { "US", '\037' },
133  { "space", ' ' },
134  { "exclamation-mark", '!' },
135  { "quotation-mark", '"' },
136  { "number-sign", '#' },
137  { "dollar-sign", '$' },
138  { "percent-sign", '%' },
139  { "ampersand", '&' },
140  { "apostrophe", '\'' },
141  { "left-parenthesis", '(' },
142  { "right-parenthesis", ')' },
143  { "asterisk", '*' },
144  { "plus-sign", '+' },
145  { "comma", ',' },
146  { "hyphen", '-' },
147  { "hyphen-minus", '-' },
148  { "period", '.' },
149  { "full-stop", '.' },
150  { "slash", '/' },
151  { "solidus", '/' },
152  { "zero", '0' },
153  { "one", '1' },
154  { "two", '2' },
155  { "three", '3' },
156  { "four", '4' },
157  { "five", '5' },
158  { "six", '6' },
159  { "seven", '7' },
160  { "eight", '8' },
161  { "nine", '9' },
162  { "colon", ':' },
163  { "semicolon", ';' },
164  { "less-than-sign", '<' },
165  { "equals-sign", '=' },
166  { "greater-than-sign", '>' },
167  { "question-mark", '?' },
168  { "commercial-at", '@' },
169  { "left-square-bracket", '[' },
170  { "backslash", '\\' },
171  { "reverse-solidus", '\\' },
172  { "right-square-bracket", ']' },
173  { "circumflex", '^' },
174  { "circumflex-accent", '^' },
175  { "underscore", '_' },
176  { "low-line", '_' },
177  { "grave-accent", '`' },
178  { "left-brace", '{' },
179  { "left-curly-bracket", '{' },
180  { "vertical-line", '|' },
181  { "right-brace", '}' },
182  { "right-curly-bracket", '}' },
183  { "tilde", '~' },
184  { "DEL", '\177' },
185  { NULL, 0 }
186 };
187 
188 /*
189  * parse structure, passed up and down to avoid global variables and
190  * other clumsinesses
191  */
192 struct parse {
193  char *next; /* next character in RE */
194  char *end; /* end of string (-> NUL normally) */
195  int error; /* has an error been seen? */
196  sop *strip; /* malloced strip */
197  sopno ssize; /* malloced strip size (allocated) */
198  sopno slen; /* malloced strip length (used) */
199  int ncsalloc; /* number of csets allocated */
200  struct re_guts *g;
201 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
202  sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
203  sopno pend[NPAREN]; /* -> ) ([0] unused) */
204 };
205 
206 static void p_ere(struct parse *, int);
207 static void p_ere_exp(struct parse *);
208 static void p_str(struct parse *);
209 static void p_bre(struct parse *, int, int);
210 static int p_simp_re(struct parse *, int);
211 static int p_count(struct parse *);
212 static void p_bracket(struct parse *);
213 static void p_b_term(struct parse *, cset *);
214 static void p_b_cclass(struct parse *, cset *);
215 static void p_b_eclass(struct parse *, cset *);
216 static char p_b_symbol(struct parse *);
217 static char p_b_coll_elem(struct parse *, int);
218 static char othercase(int);
219 static void bothcases(struct parse *, int);
220 static void ordinary(struct parse *, int);
221 static void nonnewline(struct parse *);
222 static void repeat(struct parse *, sopno, int, int);
223 static int seterr(struct parse *, int);
224 static cset *allocset(struct parse *);
225 static void freeset(struct parse *, cset *);
226 static int freezeset(struct parse *, cset *);
227 static int firstch(struct parse *, cset *);
228 static int nch(struct parse *, cset *);
229 static void mcadd(struct parse *, cset *, const char *);
230 static void mcinvert(struct parse *, cset *);
231 static void mccase(struct parse *, cset *);
232 static int isinsets(struct re_guts *, int);
233 static int samesets(struct re_guts *, int, int);
234 static void categorize(struct parse *, struct re_guts *);
235 static sopno dupl(struct parse *, sopno, sopno);
236 static void doemit(struct parse *, sop, size_t);
237 static void doinsert(struct parse *, sop, size_t, sopno);
238 static void dofwd(struct parse *, sopno, sop);
239 static void enlarge(struct parse *, sopno);
240 static void stripsnug(struct parse *, struct re_guts *);
241 static void findmust(struct parse *, struct re_guts *);
242 static sopno pluscount(struct parse *, struct re_guts *);
243 
244 static char nuls[10]; /* place to point scanner in event of error */
245 
246 /*
247  * macros for use with parse structure
248  * BEWARE: these know that the parse structure is named `p' !!!
249  */
250 #define PEEK() (*p->next)
251 #define PEEK2() (*(p->next+1))
252 #define MORE() (p->end - p->next > 0)
253 #define MORE2() (p->end - p->next > 1)
254 #define SEE(c) (MORE() && PEEK() == (c))
255 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b))
256 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
257 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
258 #define NEXT() (p->next++)
259 #define NEXT2() (p->next += 2)
260 #define NEXTn(n) (p->next += (n))
261 #define GETNEXT() (*p->next++)
262 #define SETERROR(e) seterr(p, (e))
263 #define REQUIRE(co, e) (void)((co) || SETERROR(e))
264 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
265 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
266 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
267 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
268 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
269 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
270 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
271 #define HERE() (p->slen)
272 #define THERE() (p->slen - 1)
273 #define THERETHERE() (p->slen - 2)
274 #define DROP(n) (p->slen -= (n))
275 
276 #ifdef _POSIX2_RE_DUP_MAX
277 #define DUPMAX _POSIX2_RE_DUP_MAX
278 #else
279 #define DUPMAX 255
280 #endif
281 #define INFINITY (DUPMAX + 1)
282 
283 #ifndef NDEBUG
284 static int never = 0; /* for use in asserts; shuts lint up */
285 #else
286 #define never 0 /* some <assert.h>s have bugs too */
287 #endif
288 
289 /*
290  - llvm_regcomp - interface for parser and compilation
291  */
292 int /* 0 success, otherwise REG_something */
293 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
294 {
295  struct parse pa;
296  struct re_guts *g;
297  struct parse *p = &pa;
298  int i;
299  size_t len;
300 #ifdef REDEBUG
301 # define GOODFLAGS(f) (f)
302 #else
303 # define GOODFLAGS(f) ((f)&~REG_DUMP)
304 #endif
305 
306  cflags = GOODFLAGS(cflags);
307  if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
308  return(REG_INVARG);
309 
310  if (cflags&REG_PEND) {
311  if (preg->re_endp < pattern)
312  return(REG_INVARG);
313  len = preg->re_endp - pattern;
314  } else
315  len = strlen((const char *)pattern);
316 
317  /* do the mallocs early so failure handling is easy */
318  g = (struct re_guts *)malloc(sizeof(struct re_guts) +
319  (NC-1)*sizeof(cat_t));
320  if (g == NULL)
321  return(REG_ESPACE);
322  p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
323  p->strip = (sop *)calloc(p->ssize, sizeof(sop));
324  p->slen = 0;
325  if (p->strip == NULL) {
326  free((char *)g);
327  return(REG_ESPACE);
328  }
329 
330  /* set things up */
331  p->g = g;
332  p->next = (char *)pattern; /* convenience; we do not modify it */
333  p->end = p->next + len;
334  p->error = 0;
335  p->ncsalloc = 0;
336  for (i = 0; i < NPAREN; i++) {
337  p->pbegin[i] = 0;
338  p->pend[i] = 0;
339  }
340  g->csetsize = NC;
341  g->sets = NULL;
342  g->setbits = NULL;
343  g->ncsets = 0;
344  g->cflags = cflags;
345  g->iflags = 0;
346  g->nbol = 0;
347  g->neol = 0;
348  g->must = NULL;
349  g->mlen = 0;
350  g->nsub = 0;
351  g->ncategories = 1; /* category 0 is "everything else" */
352  g->categories = &g->catspace[-(CHAR_MIN)];
353  (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
354  g->backrefs = 0;
355 
356  /* do it */
357  EMIT(OEND, 0);
358  g->firststate = THERE();
359  if (cflags&REG_EXTENDED)
360  p_ere(p, OUT);
361  else if (cflags&REG_NOSPEC)
362  p_str(p);
363  else
364  p_bre(p, OUT, OUT);
365  EMIT(OEND, 0);
366  g->laststate = THERE();
367 
368  /* tidy up loose ends and fill things in */
369  categorize(p, g);
370  stripsnug(p, g);
371  findmust(p, g);
372  g->nplus = pluscount(p, g);
373  g->magic = MAGIC2;
374  preg->re_nsub = g->nsub;
375  preg->re_g = g;
376  preg->re_magic = MAGIC1;
377 #ifndef REDEBUG
378  /* not debugging, so can't rely on the assert() in llvm_regexec() */
379  if (g->iflags&REGEX_BAD)
381 #endif
382 
383  /* win or lose, we're done */
384  if (p->error != 0) /* lose */
385  llvm_regfree(preg);
386  return(p->error);
387 }
388 
389 /*
390  - p_ere - ERE parser top level, concatenation and alternation
391  */
392 static void
393 p_ere(struct parse *p, int stop) /* character this ERE should end at */
394 {
395  char c;
396  sopno prevback = 0;
397  sopno prevfwd = 0;
398  sopno conc;
399  int first = 1; /* is this the first alternative? */
400 
401  for (;;) {
402  /* do a bunch of concatenated expressions */
403  conc = HERE();
404  while (MORE() && (c = PEEK()) != '|' && c != stop)
405  p_ere_exp(p);
406  REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
407 
408  if (!EAT('|'))
409  break; /* NOTE BREAK OUT */
410 
411  if (first) {
412  INSERT(OCH_, conc); /* offset is wrong */
413  prevfwd = conc;
414  prevback = conc;
415  first = 0;
416  }
417  ASTERN(OOR1, prevback);
418  prevback = THERE();
419  AHEAD(prevfwd); /* fix previous offset */
420  prevfwd = HERE();
421  EMIT(OOR2, 0); /* offset is very wrong */
422  }
423 
424  if (!first) { /* tail-end fixups */
425  AHEAD(prevfwd);
426  ASTERN(O_CH, prevback);
427  }
428 
429  assert(!MORE() || SEE(stop));
430 }
431 
432 /*
433  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
434  */
435 static void
436 p_ere_exp(struct parse *p)
437 {
438  char c;
439  sopno pos;
440  int count;
441  int count2;
442  int backrefnum;
443  sopno subno;
444  int wascaret = 0;
445 
446  assert(MORE()); /* caller should have ensured this */
447  c = GETNEXT();
448 
449  pos = HERE();
450  switch (c) {
451  case '(':
452  REQUIRE(MORE(), REG_EPAREN);
453  p->g->nsub++;
454  subno = p->g->nsub;
455  if (subno < NPAREN)
456  p->pbegin[subno] = HERE();
457  EMIT(OLPAREN, subno);
458  if (!SEE(')'))
459  p_ere(p, ')');
460  if (subno < NPAREN) {
461  p->pend[subno] = HERE();
462  assert(p->pend[subno] != 0);
463  }
464  EMIT(ORPAREN, subno);
465  MUSTEAT(')', REG_EPAREN);
466  break;
467 #ifndef POSIX_MISTAKE
468  case ')': /* happens only if no current unmatched ( */
469  /*
470  * You may ask, why the ifndef? Because I didn't notice
471  * this until slightly too late for 1003.2, and none of the
472  * other 1003.2 regular-expression reviewers noticed it at
473  * all. So an unmatched ) is legal POSIX, at least until
474  * we can get it fixed.
475  */
477  break;
478 #endif
479  case '^':
480  EMIT(OBOL, 0);
481  p->g->iflags |= USEBOL;
482  p->g->nbol++;
483  wascaret = 1;
484  break;
485  case '$':
486  EMIT(OEOL, 0);
487  p->g->iflags |= USEEOL;
488  p->g->neol++;
489  break;
490  case '|':
492  break;
493  case '*':
494  case '+':
495  case '?':
497  break;
498  case '.':
499  if (p->g->cflags&REG_NEWLINE)
500  nonnewline(p);
501  else
502  EMIT(OANY, 0);
503  break;
504  case '[':
505  p_bracket(p);
506  break;
507  case '\\':
509  c = GETNEXT();
510  if (c >= '1' && c <= '9') {
511  /* \[0-9] is taken to be a back-reference to a previously specified
512  * matching group. backrefnum will hold the number. The matching
513  * group must exist (i.e. if \4 is found there must have been at
514  * least 4 matching groups specified in the pattern previously).
515  */
516  backrefnum = c - '0';
517  if (p->pend[backrefnum] == 0) {
519  break;
520  }
521 
522  /* Make sure everything checks out and emit the sequence
523  * that marks a back-reference to the parse structure.
524  */
525  assert(backrefnum <= p->g->nsub);
526  EMIT(OBACK_, backrefnum);
527  assert(p->pbegin[backrefnum] != 0);
528  assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
529  assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
530  (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
531  EMIT(O_BACK, backrefnum);
532  p->g->backrefs = 1;
533  } else {
534  /* Other chars are simply themselves when escaped with a backslash.
535  */
536  ordinary(p, c);
537  }
538  break;
539  case '{': /* okay as ordinary except if digit follows */
540  REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
542  default:
543  ordinary(p, c);
544  break;
545  }
546 
547  if (!MORE())
548  return;
549  c = PEEK();
550  /* we call { a repetition if followed by a digit */
551  if (!( c == '*' || c == '+' || c == '?' ||
552  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
553  return; /* no repetition, we're done */
554  NEXT();
555 
556  REQUIRE(!wascaret, REG_BADRPT);
557  switch (c) {
558  case '*': /* implemented as +? */
559  /* this case does not require the (y|) trick, noKLUDGE */
560  INSERT(OPLUS_, pos);
561  ASTERN(O_PLUS, pos);
562  INSERT(OQUEST_, pos);
563  ASTERN(O_QUEST, pos);
564  break;
565  case '+':
566  INSERT(OPLUS_, pos);
567  ASTERN(O_PLUS, pos);
568  break;
569  case '?':
570  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
571  INSERT(OCH_, pos); /* offset slightly wrong */
572  ASTERN(OOR1, pos); /* this one's right */
573  AHEAD(pos); /* fix the OCH_ */
574  EMIT(OOR2, 0); /* offset very wrong... */
575  AHEAD(THERE()); /* ...so fix it */
576  ASTERN(O_CH, THERETHERE());
577  break;
578  case '{':
579  count = p_count(p);
580  if (EAT(',')) {
581  if (isdigit((uch)PEEK())) {
582  count2 = p_count(p);
583  REQUIRE(count <= count2, REG_BADBR);
584  } else /* single number with comma */
585  count2 = INFINITY;
586  } else /* just a single number */
587  count2 = count;
588  repeat(p, pos, count, count2);
589  if (!EAT('}')) { /* error heuristics */
590  while (MORE() && PEEK() != '}')
591  NEXT();
592  REQUIRE(MORE(), REG_EBRACE);
594  }
595  break;
596  }
597 
598  if (!MORE())
599  return;
600  c = PEEK();
601  if (!( c == '*' || c == '+' || c == '?' ||
602  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
603  return;
605 }
606 
607 /*
608  - p_str - string (no metacharacters) "parser"
609  */
610 static void
611 p_str(struct parse *p)
612 {
613  REQUIRE(MORE(), REG_EMPTY);
614  while (MORE())
615  ordinary(p, GETNEXT());
616 }
617 
618 /*
619  - p_bre - BRE parser top level, anchoring and concatenation
620  * Giving end1 as OUT essentially eliminates the end1/end2 check.
621  *
622  * This implementation is a bit of a kludge, in that a trailing $ is first
623  * taken as an ordinary character and then revised to be an anchor. The
624  * only undesirable side effect is that '$' gets included as a character
625  * category in such cases. This is fairly harmless; not worth fixing.
626  * The amount of lookahead needed to avoid this kludge is excessive.
627  */
628 static void
629 p_bre(struct parse *p,
630  int end1, /* first terminating character */
631  int end2) /* second terminating character */
632 {
633  sopno start = HERE();
634  int first = 1; /* first subexpression? */
635  int wasdollar = 0;
636 
637  if (EAT('^')) {
638  EMIT(OBOL, 0);
639  p->g->iflags |= USEBOL;
640  p->g->nbol++;
641  }
642  while (MORE() && !SEETWO(end1, end2)) {
643  wasdollar = p_simp_re(p, first);
644  first = 0;
645  }
646  if (wasdollar) { /* oops, that was a trailing anchor */
647  DROP(1);
648  EMIT(OEOL, 0);
649  p->g->iflags |= USEEOL;
650  p->g->neol++;
651  }
652 
653  REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
654 }
655 
656 /*
657  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
658  */
659 static int /* was the simple RE an unbackslashed $? */
660 p_simp_re(struct parse *p,
661  int starordinary) /* is a leading * an ordinary character? */
662 {
663  int c;
664  int count;
665  int count2;
666  sopno pos;
667  int i;
668  sopno subno;
669 # define BACKSL (1<<CHAR_BIT)
670 
671  pos = HERE(); /* repetition op, if any, covers from here */
672 
673  assert(MORE()); /* caller should have ensured this */
674  c = GETNEXT();
675  if (c == '\\') {
677  c = BACKSL | GETNEXT();
678  }
679  switch (c) {
680  case '.':
681  if (p->g->cflags&REG_NEWLINE)
682  nonnewline(p);
683  else
684  EMIT(OANY, 0);
685  break;
686  case '[':
687  p_bracket(p);
688  break;
689  case BACKSL|'{':
691  break;
692  case BACKSL|'(':
693  p->g->nsub++;
694  subno = p->g->nsub;
695  if (subno < NPAREN)
696  p->pbegin[subno] = HERE();
697  EMIT(OLPAREN, subno);
698  /* the MORE here is an error heuristic */
699  if (MORE() && !SEETWO('\\', ')'))
700  p_bre(p, '\\', ')');
701  if (subno < NPAREN) {
702  p->pend[subno] = HERE();
703  assert(p->pend[subno] != 0);
704  }
705  EMIT(ORPAREN, subno);
706  REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
707  break;
708  case BACKSL|')': /* should not get here -- must be user */
709  case BACKSL|'}':
711  break;
712  case BACKSL|'1':
713  case BACKSL|'2':
714  case BACKSL|'3':
715  case BACKSL|'4':
716  case BACKSL|'5':
717  case BACKSL|'6':
718  case BACKSL|'7':
719  case BACKSL|'8':
720  case BACKSL|'9':
721  i = (c&~BACKSL) - '0';
722  assert(i < NPAREN);
723  if (p->pend[i] != 0) {
724  assert(i <= p->g->nsub);
725  EMIT(OBACK_, i);
726  assert(p->pbegin[i] != 0);
727  assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
728  assert(OP(p->strip[p->pend[i]]) == ORPAREN);
729  (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
730  EMIT(O_BACK, i);
731  } else
733  p->g->backrefs = 1;
734  break;
735  case '*':
736  REQUIRE(starordinary, REG_BADRPT);
738  default:
739  ordinary(p, (char)c);
740  break;
741  }
742 
743  if (EAT('*')) { /* implemented as +? */
744  /* this case does not require the (y|) trick, noKLUDGE */
745  INSERT(OPLUS_, pos);
746  ASTERN(O_PLUS, pos);
747  INSERT(OQUEST_, pos);
748  ASTERN(O_QUEST, pos);
749  } else if (EATTWO('\\', '{')) {
750  count = p_count(p);
751  if (EAT(',')) {
752  if (MORE() && isdigit((uch)PEEK())) {
753  count2 = p_count(p);
754  REQUIRE(count <= count2, REG_BADBR);
755  } else /* single number with comma */
756  count2 = INFINITY;
757  } else /* just a single number */
758  count2 = count;
759  repeat(p, pos, count, count2);
760  if (!EATTWO('\\', '}')) { /* error heuristics */
761  while (MORE() && !SEETWO('\\', '}'))
762  NEXT();
763  REQUIRE(MORE(), REG_EBRACE);
765  }
766  } else if (c == '$') /* $ (but not \$) ends it */
767  return(1);
768 
769  return(0);
770 }
771 
772 /*
773  - p_count - parse a repetition count
774  */
775 static int /* the value */
776 p_count(struct parse *p)
777 {
778  int count = 0;
779  int ndigits = 0;
780 
781  while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
782  count = count*10 + (GETNEXT() - '0');
783  ndigits++;
784  }
785 
786  REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
787  return(count);
788 }
789 
790 /*
791  - p_bracket - parse a bracketed character list
792  *
793  * Note a significant property of this code: if the allocset() did SETERROR,
794  * no set operations are done.
795  */
796 static void
797 p_bracket(struct parse *p)
798 {
799  cset *cs;
800  int invert = 0;
801 
802  /* Dept of Truly Sickening Special-Case Kludges */
803  if (p->end - p->next > 5) {
804  if (strncmp(p->next, "[:<:]]", 6) == 0) {
805  EMIT(OBOW, 0);
806  NEXTn(6);
807  return;
808  }
809  if (strncmp(p->next, "[:>:]]", 6) == 0) {
810  EMIT(OEOW, 0);
811  NEXTn(6);
812  return;
813  }
814  }
815 
816  if ((cs = allocset(p)) == NULL) {
817  /* allocset did set error status in p */
818  return;
819  }
820 
821  if (EAT('^'))
822  invert++; /* make note to invert set at end */
823  if (EAT(']'))
824  CHadd(cs, ']');
825  else if (EAT('-'))
826  CHadd(cs, '-');
827  while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
828  p_b_term(p, cs);
829  if (EAT('-'))
830  CHadd(cs, '-');
831  MUSTEAT(']', REG_EBRACK);
832 
833  if (p->error != 0) { /* don't mess things up further */
834  freeset(p, cs);
835  return;
836  }
837 
838  if (p->g->cflags&REG_ICASE) {
839  int i;
840  int ci;
841 
842  for (i = p->g->csetsize - 1; i >= 0; i--)
843  if (CHIN(cs, i) && isalpha(i)) {
844  ci = othercase(i);
845  if (ci != i)
846  CHadd(cs, ci);
847  }
848  if (cs->multis != NULL)
849  mccase(p, cs);
850  }
851  if (invert) {
852  int i;
853 
854  for (i = p->g->csetsize - 1; i >= 0; i--)
855  if (CHIN(cs, i))
856  CHsub(cs, i);
857  else
858  CHadd(cs, i);
859  if (p->g->cflags&REG_NEWLINE)
860  CHsub(cs, '\n');
861  if (cs->multis != NULL)
862  mcinvert(p, cs);
863  }
864 
865  assert(cs->multis == NULL); /* xxx */
866 
867  if (nch(p, cs) == 1) { /* optimize singleton sets */
868  ordinary(p, firstch(p, cs));
869  freeset(p, cs);
870  } else
871  EMIT(OANYOF, freezeset(p, cs));
872 }
873 
874 /*
875  - p_b_term - parse one term of a bracketed character list
876  */
877 static void
878 p_b_term(struct parse *p, cset *cs)
879 {
880  char c;
881  char start, finish;
882  int i;
883 
884  /* classify what we've got */
885  switch ((MORE()) ? PEEK() : '\0') {
886  case '[':
887  c = (MORE2()) ? PEEK2() : '\0';
888  break;
889  case '-':
891  return; /* NOTE RETURN */
892  break;
893  default:
894  c = '\0';
895  break;
896  }
897 
898  switch (c) {
899  case ':': /* character class */
900  NEXT2();
901  REQUIRE(MORE(), REG_EBRACK);
902  c = PEEK();
903  REQUIRE(c != '-' && c != ']', REG_ECTYPE);
904  p_b_cclass(p, cs);
905  REQUIRE(MORE(), REG_EBRACK);
906  REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
907  break;
908  case '=': /* equivalence class */
909  NEXT2();
910  REQUIRE(MORE(), REG_EBRACK);
911  c = PEEK();
912  REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
913  p_b_eclass(p, cs);
914  REQUIRE(MORE(), REG_EBRACK);
915  REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
916  break;
917  default: /* symbol, ordinary character, or range */
918 /* xxx revision needed for multichar stuff */
919  start = p_b_symbol(p);
920  if (SEE('-') && MORE2() && PEEK2() != ']') {
921  /* range */
922  NEXT();
923  if (EAT('-'))
924  finish = '-';
925  else
926  finish = p_b_symbol(p);
927  } else
928  finish = start;
929 /* xxx what about signed chars here... */
930  REQUIRE(start <= finish, REG_ERANGE);
931  for (i = start; i <= finish; i++)
932  CHadd(cs, i);
933  break;
934  }
935 }
936 
937 /*
938  - p_b_cclass - parse a character-class name and deal with it
939  */
940 static void
941 p_b_cclass(struct parse *p, cset *cs)
942 {
943  char *sp = p->next;
944  struct cclass *cp;
945  size_t len;
946  const char *u;
947  char c;
948 
949  while (MORE() && isalpha((uch)PEEK()))
950  NEXT();
951  len = p->next - sp;
952  for (cp = cclasses; cp->name != NULL; cp++)
953  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
954  break;
955  if (cp->name == NULL) {
956  /* oops, didn't find it */
958  return;
959  }
960 
961  u = cp->chars;
962  while ((c = *u++) != '\0')
963  CHadd(cs, c);
964  for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
965  MCadd(p, cs, u);
966 }
967 
968 /*
969  - p_b_eclass - parse an equivalence-class name and deal with it
970  *
971  * This implementation is incomplete. xxx
972  */
973 static void
974 p_b_eclass(struct parse *p, cset *cs)
975 {
976  char c;
977 
978  c = p_b_coll_elem(p, '=');
979  CHadd(cs, c);
980 }
981 
982 /*
983  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
984  */
985 static char /* value of symbol */
987 {
988  char value;
989 
990  REQUIRE(MORE(), REG_EBRACK);
991  if (!EATTWO('[', '.'))
992  return(GETNEXT());
993 
994  /* collating symbol */
995  value = p_b_coll_elem(p, '.');
996  REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
997  return(value);
998 }
999 
1000 /*
1001  - p_b_coll_elem - parse a collating-element name and look it up
1002  */
1003 static char /* value of collating element */
1005  int endc) /* name ended by endc,']' */
1006 {
1007  char *sp = p->next;
1008  struct cname *cp;
1009  size_t len;
1010 
1011  while (MORE() && !SEETWO(endc, ']'))
1012  NEXT();
1013  if (!MORE()) {
1015  return(0);
1016  }
1017  len = p->next - sp;
1018  for (cp = cnames; cp->name != NULL; cp++)
1019  if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1020  return(cp->code); /* known name */
1021  if (len == 1)
1022  return(*sp); /* single character */
1023  SETERROR(REG_ECOLLATE); /* neither */
1024  return(0);
1025 }
1026 
1027 /*
1028  - othercase - return the case counterpart of an alphabetic
1029  */
1030 static char /* if no counterpart, return ch */
1031 othercase(int ch)
1032 {
1033  ch = (uch)ch;
1034  assert(isalpha(ch));
1035  if (isupper(ch))
1036  return ((uch)tolower(ch));
1037  else if (islower(ch))
1038  return ((uch)toupper(ch));
1039  else /* peculiar, but could happen */
1040  return(ch);
1041 }
1042 
1043 /*
1044  - bothcases - emit a dualcase version of a two-case character
1045  *
1046  * Boy, is this implementation ever a kludge...
1047  */
1048 static void
1049 bothcases(struct parse *p, int ch)
1050 {
1051  char *oldnext = p->next;
1052  char *oldend = p->end;
1053  char bracket[3];
1054 
1055  ch = (uch)ch;
1056  assert(othercase(ch) != ch); /* p_bracket() would recurse */
1057  p->next = bracket;
1058  p->end = bracket+2;
1059  bracket[0] = ch;
1060  bracket[1] = ']';
1061  bracket[2] = '\0';
1062  p_bracket(p);
1063  assert(p->next == bracket+2);
1064  p->next = oldnext;
1065  p->end = oldend;
1066 }
1067 
1068 /*
1069  - ordinary - emit an ordinary character
1070  */
1071 static void
1072 ordinary(struct parse *p, int ch)
1073 {
1074  cat_t *cap = p->g->categories;
1075 
1076  if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1077  bothcases(p, ch);
1078  else {
1079  EMIT(OCHAR, (uch)ch);
1080  if (cap[ch] == 0)
1081  cap[ch] = p->g->ncategories++;
1082  }
1083 }
1084 
1085 /*
1086  - nonnewline - emit REG_NEWLINE version of OANY
1087  *
1088  * Boy, is this implementation ever a kludge...
1089  */
1090 static void
1092 {
1093  char *oldnext = p->next;
1094  char *oldend = p->end;
1095  char bracket[4];
1096 
1097  p->next = bracket;
1098  p->end = bracket+3;
1099  bracket[0] = '^';
1100  bracket[1] = '\n';
1101  bracket[2] = ']';
1102  bracket[3] = '\0';
1103  p_bracket(p);
1104  assert(p->next == bracket+3);
1105  p->next = oldnext;
1106  p->end = oldend;
1107 }
1108 
1109 /*
1110  - repeat - generate code for a bounded repetition, recursively if needed
1111  */
1112 static void
1113 repeat(struct parse *p,
1114  sopno start, /* operand from here to end of strip */
1115  int from, /* repeated from this number */
1116  int to) /* to this number of times (maybe INFINITY) */
1117 {
1118  sopno finish = HERE();
1119 # define N 2
1120 # define INF 3
1121 # define REP(f, t) ((f)*8 + (t))
1122 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1123  sopno copy;
1124 
1125  if (p->error != 0) /* head off possible runaway recursion */
1126  return;
1127 
1128  assert(from <= to);
1129 
1130  switch (REP(MAP(from), MAP(to))) {
1131  case REP(0, 0): /* must be user doing this */
1132  DROP(finish-start); /* drop the operand */
1133  break;
1134  case REP(0, 1): /* as x{1,1}? */
1135  case REP(0, N): /* as x{1,n}? */
1136  case REP(0, INF): /* as x{1,}? */
1137  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1138  INSERT(OCH_, start); /* offset is wrong... */
1139  repeat(p, start+1, 1, to);
1140  ASTERN(OOR1, start);
1141  AHEAD(start); /* ... fix it */
1142  EMIT(OOR2, 0);
1143  AHEAD(THERE());
1144  ASTERN(O_CH, THERETHERE());
1145  break;
1146  case REP(1, 1): /* trivial case */
1147  /* done */
1148  break;
1149  case REP(1, N): /* as x?x{1,n-1} */
1150  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1151  INSERT(OCH_, start);
1152  ASTERN(OOR1, start);
1153  AHEAD(start);
1154  EMIT(OOR2, 0); /* offset very wrong... */
1155  AHEAD(THERE()); /* ...so fix it */
1156  ASTERN(O_CH, THERETHERE());
1157  copy = dupl(p, start+1, finish+1);
1158  assert(copy == finish+4);
1159  repeat(p, copy, 1, to-1);
1160  break;
1161  case REP(1, INF): /* as x+ */
1162  INSERT(OPLUS_, start);
1163  ASTERN(O_PLUS, start);
1164  break;
1165  case REP(N, N): /* as xx{m-1,n-1} */
1166  copy = dupl(p, start, finish);
1167  repeat(p, copy, from-1, to-1);
1168  break;
1169  case REP(N, INF): /* as xx{n-1,INF} */
1170  copy = dupl(p, start, finish);
1171  repeat(p, copy, from-1, to);
1172  break;
1173  default: /* "can't happen" */
1174  SETERROR(REG_ASSERT); /* just in case */
1175  break;
1176  }
1177 }
1178 
1179 /*
1180  - seterr - set an error condition
1181  */
1182 static int /* useless but makes type checking happy */
1183 seterr(struct parse *p, int e)
1184 {
1185  if (p->error == 0) /* keep earliest error condition */
1186  p->error = e;
1187  p->next = nuls; /* try to bring things to a halt */
1188  p->end = nuls;
1189  return(0); /* make the return value well-defined */
1190 }
1191 
1192 /*
1193  - allocset - allocate a set of characters for []
1194  */
1195 static cset *
1196 allocset(struct parse *p)
1197 {
1198  int no = p->g->ncsets++;
1199  size_t nc;
1200  size_t nbytes;
1201  cset *cs;
1202  size_t css = (size_t)p->g->csetsize;
1203  int i;
1204 
1205  if (no >= p->ncsalloc) { /* need another column of space */
1206  void *ptr;
1207 
1208  p->ncsalloc += CHAR_BIT;
1209  nc = p->ncsalloc;
1210  if (nc > SIZE_MAX / sizeof(cset))
1211  goto nomem;
1212  assert(nc % CHAR_BIT == 0);
1213  nbytes = nc / CHAR_BIT * css;
1214 
1215  ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1216  if (ptr == NULL)
1217  goto nomem;
1218  p->g->sets = ptr;
1219 
1220  ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1221  if (ptr == NULL)
1222  goto nomem;
1223  p->g->setbits = ptr;
1224 
1225  for (i = 0; i < no; i++)
1226  p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1227 
1228  (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1229  }
1230  /* XXX should not happen */
1231  if (p->g->sets == NULL || p->g->setbits == NULL)
1232  goto nomem;
1233 
1234  cs = &p->g->sets[no];
1235  cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1236  cs->mask = 1 << ((no) % CHAR_BIT);
1237  cs->hash = 0;
1238  cs->smultis = 0;
1239  cs->multis = NULL;
1240 
1241  return(cs);
1242 nomem:
1243  free(p->g->sets);
1244  p->g->sets = NULL;
1245  free(p->g->setbits);
1246  p->g->setbits = NULL;
1247 
1249  /* caller's responsibility not to do set ops */
1250  return(NULL);
1251 }
1252 
1253 /*
1254  - freeset - free a now-unused set
1255  */
1256 static void
1257 freeset(struct parse *p, cset *cs)
1258 {
1259  size_t i;
1260  cset *top = &p->g->sets[p->g->ncsets];
1261  size_t css = (size_t)p->g->csetsize;
1262 
1263  for (i = 0; i < css; i++)
1264  CHsub(cs, i);
1265  if (cs == top-1) /* recover only the easy case */
1266  p->g->ncsets--;
1267 }
1268 
1269 /*
1270  - freezeset - final processing on a set of characters
1271  *
1272  * The main task here is merging identical sets. This is usually a waste
1273  * of time (although the hash code minimizes the overhead), but can win
1274  * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1275  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1276  * the same value!
1277  */
1278 static int /* set number */
1279 freezeset(struct parse *p, cset *cs)
1280 {
1281  uch h = cs->hash;
1282  size_t i;
1283  cset *top = &p->g->sets[p->g->ncsets];
1284  cset *cs2;
1285  size_t css = (size_t)p->g->csetsize;
1286 
1287  /* look for an earlier one which is the same */
1288  for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1289  if (cs2->hash == h && cs2 != cs) {
1290  /* maybe */
1291  for (i = 0; i < css; i++)
1292  if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1293  break; /* no */
1294  if (i == css)
1295  break; /* yes */
1296  }
1297 
1298  if (cs2 < top) { /* found one */
1299  freeset(p, cs);
1300  cs = cs2;
1301  }
1302 
1303  return((int)(cs - p->g->sets));
1304 }
1305 
1306 /*
1307  - firstch - return first character in a set (which must have at least one)
1308  */
1309 static int /* character; there is no "none" value */
1310 firstch(struct parse *p, cset *cs)
1311 {
1312  size_t i;
1313  size_t css = (size_t)p->g->csetsize;
1314 
1315  for (i = 0; i < css; i++)
1316  if (CHIN(cs, i))
1317  return((char)i);
1318  assert(never);
1319  return(0); /* arbitrary */
1320 }
1321 
1322 /*
1323  - nch - number of characters in a set
1324  */
1325 static int
1326 nch(struct parse *p, cset *cs)
1327 {
1328  size_t i;
1329  size_t css = (size_t)p->g->csetsize;
1330  int n = 0;
1331 
1332  for (i = 0; i < css; i++)
1333  if (CHIN(cs, i))
1334  n++;
1335  return(n);
1336 }
1337 
1338 /*
1339  - mcadd - add a collating element to a cset
1340  */
1341 static void
1342 mcadd( struct parse *p, cset *cs, const char *cp)
1343 {
1344  size_t oldend = cs->smultis;
1345  void *np;
1346 
1347  cs->smultis += strlen(cp) + 1;
1348  np = realloc(cs->multis, cs->smultis);
1349  if (np == NULL) {
1350  if (cs->multis)
1351  free(cs->multis);
1352  cs->multis = NULL;
1354  return;
1355  }
1356  cs->multis = np;
1357 
1358  llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1359 }
1360 
1361 /*
1362  - mcinvert - invert the list of collating elements in a cset
1363  *
1364  * This would have to know the set of possibilities. Implementation
1365  * is deferred.
1366  */
1367 /* ARGSUSED */
1368 static void
1369 mcinvert(struct parse *p, cset *cs)
1370 {
1371  assert(cs->multis == NULL); /* xxx */
1372 }
1373 
1374 /*
1375  - mccase - add case counterparts of the list of collating elements in a cset
1376  *
1377  * This would have to know the set of possibilities. Implementation
1378  * is deferred.
1379  */
1380 /* ARGSUSED */
1381 static void
1382 mccase(struct parse *p, cset *cs)
1383 {
1384  assert(cs->multis == NULL); /* xxx */
1385 }
1386 
1387 /*
1388  - isinsets - is this character in any sets?
1389  */
1390 static int /* predicate */
1391 isinsets(struct re_guts *g, int c)
1392 {
1393  uch *col;
1394  int i;
1395  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1396  unsigned uc = (uch)c;
1397 
1398  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1399  if (col[uc] != 0)
1400  return(1);
1401  return(0);
1402 }
1403 
1404 /*
1405  - samesets - are these two characters in exactly the same sets?
1406  */
1407 static int /* predicate */
1408 samesets(struct re_guts *g, int c1, int c2)
1409 {
1410  uch *col;
1411  int i;
1412  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1413  unsigned uc1 = (uch)c1;
1414  unsigned uc2 = (uch)c2;
1415 
1416  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1417  if (col[uc1] != col[uc2])
1418  return(0);
1419  return(1);
1420 }
1421 
1422 /*
1423  - categorize - sort out character categories
1424  */
1425 static void
1426 categorize(struct parse *p, struct re_guts *g)
1427 {
1428  cat_t *cats = g->categories;
1429  int c;
1430  int c2;
1431  cat_t cat;
1432 
1433  /* avoid making error situations worse */
1434  if (p->error != 0)
1435  return;
1436 
1437  for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1438  if (cats[c] == 0 && isinsets(g, c)) {
1439  cat = g->ncategories++;
1440  cats[c] = cat;
1441  for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1442  if (cats[c2] == 0 && samesets(g, c, c2))
1443  cats[c2] = cat;
1444  }
1445 }
1446 
1447 /*
1448  - dupl - emit a duplicate of a bunch of sops
1449  */
1450 static sopno /* start of duplicate */
1451 dupl(struct parse *p,
1452  sopno start, /* from here */
1453  sopno finish) /* to this less one */
1454 {
1455  sopno ret = HERE();
1456  sopno len = finish - start;
1457 
1458  assert(finish >= start);
1459  if (len == 0)
1460  return(ret);
1461  enlarge(p, p->ssize + len); /* this many unexpected additions */
1462  assert(p->ssize >= p->slen + len);
1463  (void) memmove((char *)(p->strip + p->slen),
1464  (char *)(p->strip + start), (size_t)len*sizeof(sop));
1465  p->slen += len;
1466  return(ret);
1467 }
1468 
1469 /*
1470  - doemit - emit a strip operator
1471  *
1472  * It might seem better to implement this as a macro with a function as
1473  * hard-case backup, but it's just too big and messy unless there are
1474  * some changes to the data structures. Maybe later.
1475  */
1476 static void
1477 doemit(struct parse *p, sop op, size_t opnd)
1478 {
1479  /* avoid making error situations worse */
1480  if (p->error != 0)
1481  return;
1482 
1483  /* deal with oversize operands ("can't happen", more or less) */
1484  assert(opnd < 1<<OPSHIFT);
1485 
1486  /* deal with undersized strip */
1487  if (p->slen >= p->ssize)
1488  enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1489  assert(p->slen < p->ssize);
1490 
1491  /* finally, it's all reduced to the easy case */
1492  p->strip[p->slen++] = SOP(op, opnd);
1493 }
1494 
1495 /*
1496  - doinsert - insert a sop into the strip
1497  */
1498 static void
1499 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1500 {
1501  sopno sn;
1502  sop s;
1503  int i;
1504 
1505  /* avoid making error situations worse */
1506  if (p->error != 0)
1507  return;
1508 
1509  sn = HERE();
1510  EMIT(op, opnd); /* do checks, ensure space */
1511  assert(HERE() == sn+1);
1512  s = p->strip[sn];
1513 
1514  /* adjust paren pointers */
1515  assert(pos > 0);
1516  for (i = 1; i < NPAREN; i++) {
1517  if (p->pbegin[i] >= pos) {
1518  p->pbegin[i]++;
1519  }
1520  if (p->pend[i] >= pos) {
1521  p->pend[i]++;
1522  }
1523  }
1524 
1525  memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1526  (HERE()-pos-1)*sizeof(sop));
1527  p->strip[pos] = s;
1528 }
1529 
1530 /*
1531  - dofwd - complete a forward reference
1532  */
1533 static void
1534 dofwd(struct parse *p, sopno pos, sop value)
1535 {
1536  /* avoid making error situations worse */
1537  if (p->error != 0)
1538  return;
1539 
1540  assert(value < 1<<OPSHIFT);
1541  p->strip[pos] = OP(p->strip[pos]) | value;
1542 }
1543 
1544 /*
1545  - enlarge - enlarge the strip
1546  */
1547 static void
1549 {
1550  sop *sp;
1551 
1552  if (p->ssize >= size)
1553  return;
1554 
1555  if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1557  return;
1558  }
1559 
1560  sp = (sop *)realloc(p->strip, size*sizeof(sop));
1561  if (sp == NULL) {
1563  return;
1564  }
1565  p->strip = sp;
1566  p->ssize = size;
1567 }
1568 
1569 /*
1570  - stripsnug - compact the strip
1571  */
1572 static void
1573 stripsnug(struct parse *p, struct re_guts *g)
1574 {
1575  g->nstates = p->slen;
1576  if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1577  g->strip = p->strip;
1579  return;
1580  }
1581 
1582  g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1583  if (g->strip == NULL) {
1585  g->strip = p->strip;
1586  }
1587 }
1588 
1589 /*
1590  - findmust - fill in must and mlen with longest mandatory literal string
1591  *
1592  * This algorithm could do fancy things like analyzing the operands of |
1593  * for common subsequences. Someday. This code is simple and finds most
1594  * of the interesting cases.
1595  *
1596  * Note that must and mlen got initialized during setup.
1597  */
1598 static void
1599 findmust(struct parse *p, struct re_guts *g)
1600 {
1601  sop *scan;
1602  sop *start = 0; /* start initialized in the default case, after that */
1603  sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1604  sopno newlen;
1605  sop s;
1606  char *cp;
1607  sopno i;
1608 
1609  /* avoid making error situations worse */
1610  if (p->error != 0)
1611  return;
1612 
1613  /* find the longest OCHAR sequence in strip */
1614  newlen = 0;
1615  scan = g->strip + 1;
1616  do {
1617  s = *scan++;
1618  switch (OP(s)) {
1619  case OCHAR: /* sequence member */
1620  if (newlen == 0) /* new sequence */
1621  newstart = scan - 1;
1622  newlen++;
1623  break;
1624  case OPLUS_: /* things that don't break one */
1625  case OLPAREN:
1626  case ORPAREN:
1627  break;
1628  case OQUEST_: /* things that must be skipped */
1629  case OCH_:
1630  scan--;
1631  do {
1632  scan += OPND(s);
1633  s = *scan;
1634  /* assert() interferes w debug printouts */
1635  if (OP(s) != O_QUEST && OP(s) != O_CH &&
1636  OP(s) != OOR2) {
1637  g->iflags |= REGEX_BAD;
1638  return;
1639  }
1640  } while (OP(s) != O_QUEST && OP(s) != O_CH);
1642  default: /* things that break a sequence */
1643  if (newlen > g->mlen) { /* ends one */
1644  start = newstart;
1645  g->mlen = newlen;
1646  }
1647  newlen = 0;
1648  break;
1649  }
1650  } while (OP(s) != OEND);
1651 
1652  if (g->mlen == 0) /* there isn't one */
1653  return;
1654 
1655  /* turn it into a character string */
1656  g->must = malloc((size_t)g->mlen + 1);
1657  if (g->must == NULL) { /* argh; just forget it */
1658  g->mlen = 0;
1659  return;
1660  }
1661  cp = g->must;
1662  scan = start;
1663  for (i = g->mlen; i > 0; i--) {
1664  while (OP(s = *scan++) != OCHAR)
1665  continue;
1666  assert(cp < g->must + g->mlen);
1667  *cp++ = (char)OPND(s);
1668  }
1669  assert(cp == g->must + g->mlen);
1670  *cp++ = '\0'; /* just on general principles */
1671 }
1672 
1673 /*
1674  - pluscount - count + nesting
1675  */
1676 static sopno /* nesting depth */
1677 pluscount(struct parse *p, struct re_guts *g)
1678 {
1679  sop *scan;
1680  sop s;
1681  sopno plusnest = 0;
1682  sopno maxnest = 0;
1683 
1684  if (p->error != 0)
1685  return(0); /* there may not be an OEND */
1686 
1687  scan = g->strip + 1;
1688  do {
1689  s = *scan++;
1690  switch (OP(s)) {
1691  case OPLUS_:
1692  plusnest++;
1693  break;
1694  case O_PLUS:
1695  if (plusnest > maxnest)
1696  maxnest = plusnest;
1697  plusnest--;
1698  break;
1699  }
1700  } while (OP(s) != OEND);
1701  if (plusnest != 0)
1702  g->iflags |= REGEX_BAD;
1703  return(maxnest);
1704 }
REQUIRE
#define REQUIRE(co, e)
Definition: regcomp.c:263
i
i
Definition: README.txt:29
parse::next
char * next
Definition: regcomp.c:193
BACKSL
#define BACKSL
cset::multis
char * multis
Definition: regex2.h:116
REP
#define REP(f, t)
freezeset
static int freezeset(struct parse *, cset *)
Definition: regcomp.c:1279
scan
static Expected< BitVector > scan(StringRef &S, StringRef Original)
Definition: GlobPattern.cpp:67
mccase
static void mccase(struct parse *, cset *)
Definition: regcomp.c:1382
REG_ECOLLATE
#define REG_ECOLLATE
Definition: regex_impl.h:68
REGEX_BAD
#define REGEX_BAD
Definition: regex2.h:147
NEXT2
#define NEXT2()
Definition: regcomp.c:259
p_b_term
static void p_b_term(struct parse *, cset *)
Definition: regcomp.c:878
nonnewline
static void nonnewline(struct parse *)
Definition: regcomp.c:1091
cclasses
static struct cclass cclasses[]
c2
This might compile to this xmm1 xorps xmm0 movss xmm0 ret Now consider if the code caused xmm1 to get spilled This might produce this xmm1 movaps c2(%esp) ... xorps %xmm0
USEBOL
#define USEBOL
Definition: regex2.h:145
categorize
static void categorize(struct parse *, struct re_guts *)
Definition: regcomp.c:1426
mcinvert
static void mcinvert(struct parse *, cset *)
Definition: regcomp.c:1369
p_simp_re
static int p_simp_re(struct parse *, int)
Definition: regcomp.c:660
llvm_regex::re_endp
const char * re_endp
Definition: regex_impl.h:51
firstch
static int firstch(struct parse *, cset *)
Definition: regcomp.c:1310
regex_impl.h
parse::ncsalloc
int ncsalloc
Definition: regcomp.c:199
EAT
#define EAT(c)
Definition: regcomp.c:256
O_PLUS
#define O_PLUS
Definition: regex2.h:87
DUPMAX
#define DUPMAX
Definition: regcomp.c:279
ASTERN
#define ASTERN(sop, pos)
Definition: regcomp.c:270
allocset
static cset * allocset(struct parse *)
Definition: regcomp.c:1196
op
#define op(i)
parse::g
struct re_guts * g
Definition: regcomp.c:200
seterr
static int seterr(struct parse *, int)
Definition: regcomp.c:1183
samesets
static int samesets(struct re_guts *, int, int)
Definition: regcomp.c:1408
cname::name
const char * name
Definition: regcomp.c:87
USEEOL
#define USEEOL
Definition: regex2.h:146
THERE
#define THERE()
Definition: regcomp.c:272
stripsnug
static void stripsnug(struct parse *, struct re_guts *)
Definition: regcomp.c:1573
EMIT
#define EMIT(op, sopnd)
Definition: regcomp.c:267
to
Should compile to
Definition: README.txt:449
regex2.h
cclass
Definition: regcomp.c:54
findmust
static void findmust(struct parse *, struct re_guts *)
Definition: regcomp.c:1599
OANY
#define OANY
Definition: regex2.h:82
ret
to esp esp setne al movzbw ax esp setg cl movzbw cx cmove cx cl jne LBB1_2 esp ret(also really horrible code on ppc). This is due to the expand code for 64-bit compares. GCC produces multiple branches
GETNEXT
#define GETNEXT()
Definition: regcomp.c:261
llvm_regcomp
int llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
Definition: regcomp.c:293
llvm_regex
Definition: regex_impl.h:48
doemit
static void doemit(struct parse *, sop, size_t)
Definition: regcomp.c:1477
g
should just be implemented with a CLZ instruction Since there are other e g
Definition: README.txt:709
OCH_
#define OCH_
Definition: regex2.h:92
NEXT
#define NEXT()
Definition: regcomp.c:258
REG_ECTYPE
#define REG_ECTYPE
Definition: regex_impl.h:69
OBACK_
#define OBACK_
Definition: regex2.h:84
MAGIC2
#define MAGIC2
Definition: regex2.h:134
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
cat_t
unsigned char cat_t
Definition: regex2.h:127
REG_EBRACK
#define REG_EBRACK
Definition: regex_impl.h:72
OOR2
#define OOR2
Definition: regex2.h:94
size_t
re_guts
Definition: regex2.h:132
nch
static int nch(struct parse *, cset *)
Definition: regcomp.c:1326
CHsub
#define CHsub(cs, c)
Definition: regex2.h:120
REG_ICASE
#define REG_ICASE
Definition: regex_impl.h:58
llvm_regex::re_nsub
size_t re_nsub
Definition: regex_impl.h:50
cset::hash
uch hash
Definition: regex2.h:114
OBOW
#define OBOW
Definition: regex2.h:96
dofwd
static void dofwd(struct parse *, sopno, sop)
Definition: regcomp.c:1534
ORPAREN
#define ORPAREN
Definition: regex2.h:91
bothcases
static void bothcases(struct parse *, int)
Definition: regcomp.c:1049
OEOW
#define OEOW
Definition: regex2.h:97
sopno
long sopno
Definition: regex2.h:69
OANYOF
#define OANYOF
Definition: regex2.h:83
p_ere
static void p_ere(struct parse *, int)
Definition: regcomp.c:393
O_CH
#define O_CH
Definition: regex2.h:95
REG_ERANGE
#define REG_ERANGE
Definition: regex_impl.h:76
NEXTn
#define NEXTn(n)
Definition: regcomp.c:260
OUT
#define OUT
Definition: regex2.h:162
cname::code
char code
Definition: regcomp.c:88
AHEAD
#define AHEAD(pos)
Definition: regcomp.c:269
parse
Definition: regcomp.c:192
OCHAR
#define OCHAR
Definition: regex2.h:79
cname
Definition: regcomp.c:86
REG_EESCAPE
#define REG_EESCAPE
Definition: regex_impl.h:70
HERE
#define HERE()
Definition: regcomp.c:271
NPAREN
#define NPAREN
Definition: regcomp.c:201
sop
unsigned long sop
Definition: regex2.h:68
parse::pend
sopno pend[NPAREN]
Definition: regcomp.c:203
enlarge
static void enlarge(struct parse *, sopno)
Definition: regcomp.c:1548
OLPAREN
#define OLPAREN
Definition: regex2.h:90
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
uch
unsigned char uch
Definition: regutils.h:43
llvm_regex::re_g
struct re_guts * re_g
Definition: regex_impl.h:52
llvm_strlcpy
size_t llvm_strlcpy(char *dst, const char *src, size_t siz)
Definition: regstrlcpy.c:29
SETERROR
#define SETERROR(e)
Definition: regcomp.c:262
cnames
static struct cname cnames[]
INFINITY
#define INFINITY
Definition: regcomp.c:281
OBOL
#define OBOL
Definition: regex2.h:80
llvm_regex::re_magic
int re_magic
Definition: regex_impl.h:49
REG_BADRPT
#define REG_BADRPT
Definition: regex_impl.h:78
parse::slen
sopno slen
Definition: regcomp.c:198
nuls
static char nuls[10]
Definition: regcomp.c:244
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1699
CHIN
#define CHIN(cs, c)
Definition: regex2.h:121
llvm_regfree
void llvm_regfree(llvm_regex_t *)
Definition: regfree.c:50
s
multiplies can be turned into SHL s
Definition: README.txt:370
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
SEETWO
#define SEETWO(a, b)
Definition: regcomp.c:255
REG_NEWLINE
#define REG_NEWLINE
Definition: regex_impl.h:60
parse::strip
sop * strip
Definition: regcomp.c:196
O_BACK
#define O_BACK
Definition: regex2.h:85
size
i< reg-> size
Definition: README.txt:166
REG_EXTENDED
#define REG_EXTENDED
Definition: regex_impl.h:57
p_b_cclass
static void p_b_cclass(struct parse *, cset *)
Definition: regcomp.c:941
mcadd
static void mcadd(struct parse *, cset *, const char *)
Definition: regcomp.c:1342
re_guts::cflags
int cflags
Definition: regex2.h:140
DROP
#define DROP(n)
Definition: regcomp.c:274
CHadd
#define CHadd(cs, c)
Definition: regex2.h:119
REG_ESUBREG
#define REG_ESUBREG
Definition: regex_impl.h:71
GOODFLAGS
#define GOODFLAGS(f)
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
EATTWO
#define EATTWO(a, b)
Definition: regcomp.c:257
doinsert
static void doinsert(struct parse *, sop, size_t, sopno)
Definition: regcomp.c:1499
cset::ptr
uch * ptr
Definition: regex2.h:112
cset::smultis
size_t smultis
Definition: regex2.h:115
SOP
#define SOP(op, opnd)
Definition: regex2.h:75
REG_ASSERT
#define REG_ASSERT
Definition: regex_impl.h:80
OP
#define OP(n)
Definition: regex2.h:73
parse::ssize
sopno ssize
Definition: regcomp.c:197
parse::error
int error
Definition: regcomp.c:195
O_QUEST
#define O_QUEST
Definition: regex2.h:89
cset
Definition: regex2.h:111
MORE2
#define MORE2()
Definition: regcomp.c:253
MAP
#define MAP(n)
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
Compiler.h
SEE
#define SEE(c)
Definition: regcomp.c:254
OEOL
#define OEOL
Definition: regex2.h:81
p_count
static int p_count(struct parse *)
Definition: regcomp.c:776
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
for
this could be done in SelectionDAGISel along with other special for
Definition: README.txt:104
freeset
static void freeset(struct parse *, cset *)
Definition: regcomp.c:1257
MORE
#define MORE()
Definition: regcomp.c:252
PEEK2
#define PEEK2()
Definition: regcomp.c:251
cset::mask
uch mask
Definition: regex2.h:113
sp
_bar sp
Definition: README.txt:151
OPSHIFT
#define OPSHIFT
Definition: regex2.h:72
p_bracket
static void p_bracket(struct parse *)
Definition: regcomp.c:797
p_b_symbol
static char p_b_symbol(struct parse *)
Definition: regcomp.c:986
parse::end
char * end
Definition: regcomp.c:194
p_b_eclass
static void p_b_eclass(struct parse *, cset *)
Definition: regcomp.c:974
ordinary
static void ordinary(struct parse *, int)
Definition: regcomp.c:1072
REG_ESPACE
#define REG_ESPACE
Definition: regex_impl.h:77
OPND
#define OPND(n)
Definition: regex2.h:74
cclass::chars
const char * chars
Definition: regcomp.c:56
p_bre
static void p_bre(struct parse *, int, int)
Definition: regcomp.c:629
NC
#define NC
Definition: regutils.h:42
othercase
static char othercase(int)
Definition: regcomp.c:1031
REG_NOSPEC
#define REG_NOSPEC
Definition: regex_impl.h:61
REG_PEND
#define REG_PEND
Definition: regex_impl.h:62
OOR1
#define OOR1
Definition: regex2.h:93
REG_EMPTY
#define REG_EMPTY
Definition: regex_impl.h:79
never
#define never
Definition: regcomp.c:286
cclass::name
const char * name
Definition: regcomp.c:55
cclass::multis
const char * multis
Definition: regcomp.c:57
p_b_coll_elem
static char p_b_coll_elem(struct parse *, int)
Definition: regcomp.c:1004
regutils.h
p_str
static void p_str(struct parse *)
Definition: regcomp.c:611
THERETHERE
#define THERETHERE()
Definition: regcomp.c:273
REG_INVARG
#define REG_INVARG
Definition: regex_impl.h:81
N
#define N
dupl
static sopno dupl(struct parse *, sopno, sopno)
Definition: regcomp.c:1451
repeat
static void repeat(struct parse *, sopno, int, int)
Definition: regcomp.c:1113
OEND
#define OEND
Definition: regex2.h:78
REG_EBRACE
#define REG_EBRACE
Definition: regex_impl.h:74
h
the multiplication has a latency of four as opposed to two cycles for the movl lea variant It appears gcc place string data with linkonce linkage in section coalesced instead of section coalesced Take a look at darwin h
Definition: README.txt:261
INSERT
#define INSERT(op, pos)
Definition: regcomp.c:268
pluscount
static sopno pluscount(struct parse *, struct re_guts *)
Definition: regcomp.c:1677
MCadd
#define MCadd(p, cs, cp)
Definition: regex2.h:122
MUSTEAT
#define MUSTEAT(c, e)
Definition: regcomp.c:265
REG_BADBR
#define REG_BADBR
Definition: regex_impl.h:75
OPLUS_
#define OPLUS_
Definition: regex2.h:86
n
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality for n
Definition: README.txt:685
p_ere_exp
static void p_ere_exp(struct parse *)
Definition: regcomp.c:436
MAGIC1
#define MAGIC1
Definition: regex2.h:47
copy
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned which should consider preserving the returned attribute through to MachineInstrs and extending the MemIntrinsicResults pass to do this optimization on calls too That would also let the WebAssemblyPeephole pass clean up dead defs for such as it does for stores Consider implementing and or getMachineCombinerPatterns Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass When setting multiple variables to the same we currently get code like const It could be done with a smaller encoding like local tee $pop5 local copy
Definition: README.txt:101
isinsets
static int isinsets(struct re_guts *, int)
Definition: regcomp.c:1391
parse::pbegin
sopno pbegin[NPAREN]
Definition: regcomp.c:202
INF
#define INF
PEEK
#define PEEK()
Definition: regcomp.c:250
OQUEST_
#define OQUEST_
Definition: regex2.h:88
REG_EPAREN
#define REG_EPAREN
Definition: regex_impl.h:73