Process Hacker
pcre_compile.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2010 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #define HAVE_CONFIG_H
46 #ifdef HAVE_CONFIG_H
47 #include "config.h"
48 #endif
49 
50 #define NLBLOCK cd /* Block containing newline information */
51 #define PSSTART start_pattern /* Field containing processed string start */
52 #define PSEND end_pattern /* Field containing processed string end */
53 
54 #include "pcre_internal.h"
55 
56 
57 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
58 also used by pcretest. PCRE_DEBUG is not defined when building a production
59 library. */
60 
61 #ifdef PCRE_DEBUG
62 #include "pcre_printint.src"
63 #endif
64 
65 
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a (eight bits per byte, least significant bit first). Both
arguments and the whole expansion are parenthesized so that an expression
argument such as SETBIT(map, base + 1) selects the right byte and the right
bit. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
69 
70 /* Maximum length value to check against when making sure that the integer that
71 holds the compiled pattern length does not overflow. We make it a bit less than
72 INT_MAX to allow for adding in group terminating bytes, so that we don't have
73 to check them every time. */
74 
75 #define OFLOW_MAX (INT_MAX - 20)
76 
77 
78 /*************************************************
79 * Code parameters and static tables *
80 *************************************************/
81 
82 /* This value specifies the size of stack workspace that is used during the
83 first pre-compile phase that determines how much memory is required. The regex
84 is partly compiled into this space, but the compiled parts are discarded as
85 soon as they can be, so that hopefully there will never be an overrun. The code
86 does, however, check for an overrun. The largest amount I've seen used is 218,
87 so this number is very generous.
88 
89 The same workspace is used during the second, actual compile phase for
90 remembering forward references to groups so that they can be filled in at the
91 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
92 is 4 there is plenty of room. */
93 
94 #define COMPILE_WORK_SIZE (4096)
95 
96 /* The overrun tests check for a slightly smaller size so that they detect the
97 overrun before it actually does run off the end of the data block. */
98 
99 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
100 
101 
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106 
107 #ifndef EBCDIC
108 
109 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
110 in UTF-8 mode. */
111 
112 static const short int escapes[] = {
113  0, 0,
114  0, 0,
115  0, 0,
116  0, 0,
117  0, 0,
122  -ESC_B, -ESC_C,
123  -ESC_D, -ESC_E,
124  0, -ESC_G,
125  -ESC_H, 0,
126  0, -ESC_K,
127  0, 0,
128  -ESC_N, 0,
129  -ESC_P, -ESC_Q,
130  -ESC_R, -ESC_S,
131  0, 0,
132  -ESC_V, -ESC_W,
133  -ESC_X, 0,
138  -ESC_b, 0,
139  -ESC_d, ESC_e,
140  ESC_f, 0,
141  -ESC_h, 0,
142  0, -ESC_k,
143  0, 0,
144  ESC_n, 0,
145  -ESC_p, 0,
146  ESC_r, -ESC_s,
147  ESC_tee, 0,
148  -ESC_v, -ESC_w,
149  0, 0,
150  -ESC_z
151 };
152 
153 #else
154 
155 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
156 
157 static const short int escapes[] = {
158 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
159 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
160 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
161 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
162 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
163 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
164 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
165 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
166 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
167 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
168 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
169 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
170 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
171 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
172 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
173 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
174 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
175 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
176 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
177 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
178 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
179 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
180 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
181 };
182 #endif
183 
184 
185 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
186 searched linearly. Put all the names into a single string, in order to reduce
187 the number of relocations when a shared library is dynamically linked. The
188 string is built from string macros so that it works in UTF-8 mode on EBCDIC
189 platforms. */
190 
/* Description of one verb: the length of its name and the opcodes to use
without and with an argument. */

struct verbitem {
  int len;      /* Length of verb name */
  int op;       /* Op when no arg, or -1 if arg mandatory */
  int op_arg;   /* Op when arg present, or -1 if not allowed */
};

typedef struct verbitem verbitem;
196 
197 static const char verbnames[] =
198  "\0" /* Empty name is a shorthand for MARK */
202  STRING_F0
206  STRING_THEN;
207 
208 static const verbitem verbs[] = {
209  { 0, -1, OP_MARK },
210  { 4, -1, OP_MARK },
211  { 6, OP_ACCEPT, -1 },
212  { 6, OP_COMMIT, -1 },
213  { 1, OP_FAIL, -1 },
214  { 4, OP_FAIL, -1 },
215  { 5, OP_PRUNE, OP_PRUNE_ARG },
216  { 4, OP_SKIP, OP_SKIP_ARG },
217  { 4, OP_THEN, OP_THEN_ARG }
218 };
219 
220 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
221 
222 
223 /* Tables of names of POSIX character classes and their lengths. The names are
224 now all in a single string, to reduce the number of relocations when a shared
225 library is dynamically loaded. The list of lengths is terminated by a zero
226 length entry. The first three must be alpha, lower, upper, as this is assumed
227 for handling case independence. */
228 
229 static const char posix_names[] =
234 
235 static const uschar posix_name_lengths[] = {
236  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
237 
238 /* Table of class bit maps for each POSIX class. Each class is formed from a
239 base map, with an optional addition or removal of another map. Then, for some
240 classes, there is some additional tweaking: for [:blank:] the vertical space
241 characters are removed, and for [:alpha:] and [:alnum:] the underscore
242 character is removed. The triples in the table consist of the base map offset,
243 second map offset or -1 if no second map, and a non-negative value for map
244 addition or a negative value for map subtraction (if there are two maps). The
245 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
246 remove vertical space characters, 2 => remove underscore. */
247 
248 static const int posix_class_maps[] = {
249  cbit_word, cbit_digit, -2, /* alpha */
250  cbit_lower, -1, 0, /* lower */
251  cbit_upper, -1, 0, /* upper */
252  cbit_word, -1, 2, /* alnum - word without underscore */
253  cbit_print, cbit_cntrl, 0, /* ascii */
254  cbit_space, -1, 1, /* blank - a GNU extension */
255  cbit_cntrl, -1, 0, /* cntrl */
256  cbit_digit, -1, 0, /* digit */
257  cbit_graph, -1, 0, /* graph */
258  cbit_print, -1, 0, /* print */
259  cbit_punct, -1, 0, /* punct */
260  cbit_space, -1, 0, /* space */
261  cbit_word, -1, 0, /* word - a Perl extension */
262  cbit_xdigit,-1, 0 /* xdigit */
263 };
264 
265 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
266 substitutes must be in the order of the names, defined above, and there are
267 both positive and negative cases. NULL means no substitute. */
268 
#ifdef SUPPORT_UCP

/* Property-based replacement text for the escapes named in the comments. */

static const uschar *substitutes[] = {
  (uschar *)"\\P{Nd}",      /* \D */
  (uschar *)"\\p{Nd}",      /* \d */
  (uschar *)"\\P{Xsp}",     /* \S */       /* NOTE: Xsp is Perl space */
  (uschar *)"\\p{Xsp}",     /* \s */
  (uschar *)"\\P{Xwd}",     /* \W */
  (uschar *)"\\p{Xwd}"      /* \w */
};

/* Replacement text for the POSIX classes: first the fourteen positive cases,
then the fourteen negated ones. NULL means that there is no substitute. */

static const uschar *posix_substitutes[] = {

  /* Positive cases */

  (uschar *)"\\p{L}",       /* alpha */
  (uschar *)"\\p{Ll}",      /* lower */
  (uschar *)"\\p{Lu}",      /* upper */
  (uschar *)"\\p{Xan}",     /* alnum */
  NULL,                     /* ascii */
  (uschar *)"\\h",          /* blank */
  NULL,                     /* cntrl */
  (uschar *)"\\p{Nd}",      /* digit */
  NULL,                     /* graph */
  NULL,                     /* print */
  NULL,                     /* punct */
  (uschar *)"\\p{Xps}",     /* space */    /* NOTE: Xps is POSIX space */
  (uschar *)"\\p{Xwd}",     /* word */
  NULL,                     /* xdigit */

  /* Negated cases */

  (uschar *)"\\P{L}",       /* ^alpha */
  (uschar *)"\\P{Ll}",      /* ^lower */
  (uschar *)"\\P{Lu}",      /* ^upper */
  (uschar *)"\\P{Xan}",     /* ^alnum */
  NULL,                     /* ^ascii */
  (uschar *)"\\H",          /* ^blank */
  NULL,                     /* ^cntrl */
  (uschar *)"\\P{Nd}",      /* ^digit */
  NULL,                     /* ^graph */
  NULL,                     /* ^print */
  NULL,                     /* ^punct */
  (uschar *)"\\P{Xps}",     /* ^space */   /* NOTE: Xps is POSIX space */
  (uschar *)"\\P{Xwd}",     /* ^word */
  NULL                      /* ^xdigit */
};

/* Number of entries in posix_substitutes[] */

#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))

#endif  /* SUPPORT_UCP */
312 
/* STRING(a) turns its argument into a string literal exactly as written;
XSTRING(s) macro-expands its argument first and then stringifies the result. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
315 
316 /* The texts of compile-time error messages. These are "char *" because they
317 are passed to the outside world. Do not ever re-use any error number, because
318 they are documented. Always add a new error instead. Messages marked DEAD below
319 are no longer used. This used to be a table of strings, but in order to reduce
320 the number of relocations needed when a shared library is loaded dynamically,
321 it is now one long string. We cannot use a table of offsets, because the
322 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
323 simply count through to the one we want - this isn't a performance issue
324 because these strings are used only when there is a compilation error.
325 
326 Each substring ends with \0 to insert a null character. This includes the final
327 substring, so that the whole string ends with \0\0, which can be detected when
328 counting through. */
329 
330 static const char error_texts[] =
331  "no error\0"
332  "\\ at end of pattern\0"
333  "\\c at end of pattern\0"
334  "unrecognized character follows \\\0"
335  "numbers out of order in {} quantifier\0"
336  /* 5 */
337  "number too big in {} quantifier\0"
338  "missing terminating ] for character class\0"
339  "invalid escape sequence in character class\0"
340  "range out of order in character class\0"
341  "nothing to repeat\0"
342  /* 10 */
343  "operand of unlimited repeat could match the empty string\0"
344  "internal error: unexpected repeat\0"
345  "unrecognized character after (? or (?-\0"
346  "POSIX named classes are supported only within a class\0"
347  "missing )\0"
348  /* 15 */
349  "reference to non-existent subpattern\0"
350  "erroffset passed as NULL\0"
351  "unknown option bit(s) set\0"
352  "missing ) after comment\0"
353  "parentheses nested too deeply\0"
354  /* 20 */
355  "regular expression is too large\0"
356  "failed to get memory\0"
357  "unmatched parentheses\0"
358  "internal error: code overflow\0"
359  "unrecognized character after (?<\0"
360  /* 25 */
361  "lookbehind assertion is not fixed length\0"
362  "malformed number or name after (?(\0"
363  "conditional group contains more than two branches\0"
364  "assertion expected after (?(\0"
365  "(?R or (?[+-]digits must be followed by )\0"
366  /* 30 */
367  "unknown POSIX class name\0"
368  "POSIX collating elements are not supported\0"
369  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
370  "spare error\0"
371  "character value in \\x{...} sequence is too large\0"
372  /* 35 */
373  "invalid condition (?(0)\0"
374  "\\C not allowed in lookbehind assertion\0"
375  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
376  "number after (?C is > 255\0"
377  "closing ) for (?C expected\0"
378  /* 40 */
379  "recursive call could loop indefinitely\0"
380  "unrecognized character after (?P\0"
381  "syntax error in subpattern name (missing terminator)\0"
382  "two named subpatterns have the same name\0"
383  "invalid UTF-8 string\0"
384  /* 45 */
385  "support for \\P, \\p, and \\X has not been compiled\0"
386  "malformed \\P or \\p sequence\0"
387  "unknown property name after \\P or \\p\0"
388  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
389  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
390  /* 50 */
391  "repeated subpattern is too long\0"
392  "octal value is greater than \\377 (not in UTF-8 mode)\0"
393  "internal error: overran compiling workspace\0"
394  "internal error: previously-checked referenced subpattern not found\0"
395  "DEFINE group contains more than one branch\0"
396  /* 55 */
397  "repeating a DEFINE group is not allowed\0"
398  "inconsistent NEWLINE options\0"
399  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
400  "a numbered reference must not be zero\0"
401  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
402  /* 60 */
403  "(*VERB) not recognized\0"
404  "number is too big\0"
405  "subpattern name expected\0"
406  "digit expected after (?+\0"
407  "] is an invalid data character in JavaScript compatibility mode\0"
408  /* 65 */
409  "different names for subpatterns of the same number are not allowed\0"
410  "(*MARK) must have an argument\0"
411  "this version of PCRE is not compiled with PCRE_UCP support\0"
412  "\\c must be followed by an ASCII character\0"
413  ;
414 
415 /* Table to identify digits and hex digits. This is used when compiling
416 patterns. Note that the tables in chartables are dependent on the locale, and
417 may mark arbitrary characters as digits - but the PCRE compiling code expects
418 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
419 a private table here. It costs 256 bytes, but it is a lot faster than doing
420 character value tests (at least in some simple cases I timed), and in some
421 applications one wants PCRE to compile efficiently as well as match
422 efficiently.
423 
424 For convenience, we use the same bit definitions as in chartables:
425 
426  0x04 decimal digit
427  0x08 hexadecimal digit
428 
429 Then we can use ctype_digit and ctype_xdigit in the code. */
430 
431 #ifndef EBCDIC
432 
433 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
434 UTF-8 mode. */
435 
/* Sixteen entries per row; the comment at the end of each row gives the
range of code points that it covers. 0x0c marks the characters 0-9 (decimal
digit and hexadecimal digit); 0x08 marks a-f and A-F (hexadecimal digit
only). */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00-0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10-0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20-0x2f: space - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30-0x3f: 0 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x40-0x4f: @ - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x50-0x5f: P - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x60-0x6f: ` - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70-0x7f: p - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xc0-0xcf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xd0-0xdf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xe0-0xef */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xf0-0xff */
  };
470 
471 #else
472 
473 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
474 
/* Sixteen entries per row; the comment at the end of each row gives the
range of code points that it covers. The entries set to 0x08 are at 0x81-0x86
and 0xc1-0xc6; the entries set to 0x0c are at 0xf0-0xf9. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00-0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10-0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20-0x2f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30-0x3f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x40-0x4f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x50-0x5f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x60-0x6f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70-0x7f */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f: a - f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xc0-0xcf: A - F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xd0-0xdf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xe0-0xef */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xf0-0xff: 0 - 9 */
  };
509 
/* Eight entries per row; the comment at the start of each row is the
hexadecimal code point of its first entry. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
/* 00 */  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
/* 08 */  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
/* 10 */  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
/* 18 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 20 */  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
/* 28 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 30 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 38 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 40 */  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 48 */  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
/* 50 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 58 */  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
/* 60 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 68 */  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
/* 70 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 78 */  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* 80 */  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
/* 88 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* 90 */  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
/* 98 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* A0 */  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
/* A8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* B0 */  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* B8 */  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
/* C0 */  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
/* C8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* D0 */  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
/* D8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* E0 */  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
/* E8 */  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
/* F0 */  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
/* F8 */  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
};
543 #endif
544 
545 
546 /* Definition to allow mutual recursion */
547 
548 static BOOL
549  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
550  int *, int *, branch_chain *, compile_data *, int *);
551 
552 
553 
554 /*************************************************
555 * Find an error text *
556 *************************************************/
557 
558 /* The error texts are now all in one long string, to save on relocations. As
559 some of the text is of unknown length, we can't use a table of offsets.
560 Instead, just count through the strings. This is not a performance issue
561 because it happens only when there has been a compilation error.
562 
563 Argument: the error number
564 Returns: pointer to the error string
565 */
566 
567 static const char *
568 find_error_text(int n)
569 {
570 const char *s = error_texts;
571 for (; n > 0; n--)
572  {
573  while (*s++ != 0) {};
574  if (*s == 0) return "Error text not found (please report)";
575  }
576 return s;
577 }
578 
579 
580 /*************************************************
581 * Handle escapes *
582 *************************************************/
583 
584 /* This function is called when a \ has been encountered. It either returns a
585 positive value for a simple escape such as \n, or a negative value which
586 encodes one of the more complicated things such as \d. A backreference to group
587 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
588 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
589 ptr is pointing at the \. On exit, it is on the final character of the escape
590 sequence.
591 
592 Arguments:
593  ptrptr points to the pattern position pointer
594  errorcodeptr points to the errorcode variable
595  bracount number of previous extracting brackets
596  options the options bits
597  isclass TRUE if inside a character class
598 
599 Returns: zero or positive => a data character
600  negative => a special escape sequence
601  on error, errorcodeptr is set
602 */
603 
604 static int
605 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
606  int options, BOOL isclass)
607 {
608 BOOL utf8 = (options & PCRE_UTF8) != 0;
609 const uschar *ptr = *ptrptr + 1;
610 int c, i;
611 
612 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
613 ptr--; /* Set pointer back to the last byte */
614 
615 /* If backslash is at the end of the pattern, it's an error. */
616 
617 if (c == 0) *errorcodeptr = ERR1;
618 
619 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
620 in a table. A non-zero result is something that can be returned immediately.
621 Otherwise further processing may be required. */
622 
623 #ifndef EBCDIC /* ASCII/UTF-8 coding */
624 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
625 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
626 
627 #else /* EBCDIC coding */
628 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
629 else if ((i = escapes[c - 0x48]) != 0) c = i;
630 #endif
631 
632 /* Escapes that need further processing, or are illegal. */
633 
634 else
635  {
636  const uschar *oldptr;
637  BOOL braced, negated;
638 
639  switch (c)
640  {
641  /* A number of Perl escapes are not handled by PCRE. We give an explicit
642  error. */
643 
644  case CHAR_l:
645  case CHAR_L:
646  case CHAR_u:
647  case CHAR_U:
648  *errorcodeptr = ERR37;
649  break;
650 
651  /* \g must be followed by one of a number of specific things:
652 
653  (1) A number, either plain or braced. If positive, it is an absolute
654  backreference. If negative, it is a relative backreference. This is a Perl
655  5.10 feature.
656 
657  (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
658  is part of Perl's movement towards a unified syntax for back references. As
659  this is synonymous with \k{name}, we fudge it up by pretending it really
660  was \k.
661 
662  (3) For Oniguruma compatibility we also support \g followed by a name or a
663  number either in angle brackets or in single quotes. However, these are
664  (possibly recursive) subroutine calls, _not_ backreferences. Just return
665  the -ESC_g code (cf \k). */
666 
667  case CHAR_g:
668  if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
669  {
670  c = -ESC_g;
671  break;
672  }
673 
674  /* Handle the Perl-compatible cases */
675 
676  if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
677  {
678  const uschar *p;
679  for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
680  if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
681  if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
682  {
683  c = -ESC_k;
684  break;
685  }
686  braced = TRUE;
687  ptr++;
688  }
689  else braced = FALSE;
690 
691  if (ptr[1] == CHAR_MINUS)
692  {
693  negated = TRUE;
694  ptr++;
695  }
696  else negated = FALSE;
697 
698  c = 0;
699  while ((digitab[ptr[1]] & ctype_digit) != 0)
700  c = c * 10 + *(++ptr) - CHAR_0;
701 
702  if (c < 0) /* Integer overflow */
703  {
704  *errorcodeptr = ERR61;
705  break;
706  }
707 
708  if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
709  {
710  *errorcodeptr = ERR57;
711  break;
712  }
713 
714  if (c == 0)
715  {
716  *errorcodeptr = ERR58;
717  break;
718  }
719 
720  if (negated)
721  {
722  if (c > bracount)
723  {
724  *errorcodeptr = ERR15;
725  break;
726  }
727  c = bracount - (c - 1);
728  }
729 
730  c = -(ESC_REF + c);
731  break;
732 
733  /* The handling of escape sequences consisting of a string of digits
734  starting with one that is not zero is not straightforward. By experiment,
735  the way Perl works seems to be as follows:
736 
737  Outside a character class, the digits are read as a decimal number. If the
738  number is less than 10, or if there are that many previous extracting
739  left brackets, then it is a back reference. Otherwise, up to three octal
740  digits are read to form an escaped byte. Thus \123 is likely to be octal
741  123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
742  value is greater than 377, the least significant 8 bits are taken. Inside a
743  character class, \ followed by a digit is always an octal number. */
744 
745  case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
746  case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
747 
748  if (!isclass)
749  {
750  oldptr = ptr;
751  c -= CHAR_0;
752  while ((digitab[ptr[1]] & ctype_digit) != 0)
753  c = c * 10 + *(++ptr) - CHAR_0;
754  if (c < 0) /* Integer overflow */
755  {
756  *errorcodeptr = ERR61;
757  break;
758  }
759  if (c < 10 || c <= bracount)
760  {
761  c = -(ESC_REF + c);
762  break;
763  }
764  ptr = oldptr; /* Put the pointer back and fall through */
765  }
766 
767  /* Handle an octal number following \. If the first digit is 8 or 9, Perl
768  generates a binary zero byte and treats the digit as a following literal.
769  Thus we have to pull back the pointer by one. */
770 
771  if ((c = *ptr) >= CHAR_8)
772  {
773  ptr--;
774  c = 0;
775  break;
776  }
777 
778  /* \0 always starts an octal number, but we may drop through to here with a
779  larger first octal digit. The original code used just to take the least
780  significant 8 bits of octal numbers (I think this is what early Perls used
781  to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
782  than 3 octal digits. */
783 
784  case CHAR_0:
785  c -= CHAR_0;
786  while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
787  c = c * 8 + *(++ptr) - CHAR_0;
788  if (!utf8 && c > 255) *errorcodeptr = ERR51;
789  break;
790 
791  /* \x is complicated. \x{ddd} is a character number which can be greater
792  than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
793  treated as a data character. */
794 
795  case CHAR_x:
796  if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
797  {
798  const uschar *pt = ptr + 2;
799  int count = 0;
800 
801  c = 0;
802  while ((digitab[*pt] & ctype_xdigit) != 0)
803  {
804  register int cc = *pt++;
805  if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
806  count++;
807 
808 #ifndef EBCDIC /* ASCII/UTF-8 coding */
809  if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
810  c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
811 #else /* EBCDIC coding */
812  if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
813  c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
814 #endif
815  }
816 
817  if (*pt == CHAR_RIGHT_CURLY_BRACKET)
818  {
819  if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
820  ptr = pt;
821  break;
822  }
823 
824  /* If the sequence of hex digits does not end with '}', then we don't
825  recognize this construct; fall through to the normal \x handling. */
826  }
827 
828  /* Read just a single-byte hex-defined char */
829 
830  c = 0;
831  while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
832  {
833  int cc; /* Some compilers don't like */
834  cc = *(++ptr); /* ++ in initializers */
835 #ifndef EBCDIC /* ASCII/UTF-8 coding */
836  if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
837  c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
838 #else /* EBCDIC coding */
839  if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
840  c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
841 #endif
842  }
843  break;
844 
845  /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
846  An error is given if the byte following \c is not an ASCII character. This
847  coding is ASCII-specific, but then the whole concept of \cx is
848  ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
849 
850  case CHAR_c:
851  c = *(++ptr);
852  if (c == 0)
853  {
854  *errorcodeptr = ERR2;
855  break;
856  }
857 #ifndef EBCDIC /* ASCII/UTF-8 coding */
858  if (c > 127) /* Excludes all non-ASCII in either mode */
859  {
860  *errorcodeptr = ERR68;
861  break;
862  }
863  if (c >= CHAR_a && c <= CHAR_z) c -= 32;
864  c ^= 0x40;
865 #else /* EBCDIC coding */
866  if (c >= CHAR_a && c <= CHAR_z) c += 64;
867  c ^= 0xC0;
868 #endif
869  break;
870 
871  /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
872  other alphanumeric following \ is an error if PCRE_EXTRA was set;
873  otherwise, for Perl compatibility, it is a literal. This code looks a bit
874  odd, but there used to be some cases other than the default, and there may
875  be again in future, so I haven't "optimized" it. */
876 
877  default:
878  if ((options & PCRE_EXTRA) != 0) switch(c)
879  {
880  default:
881  *errorcodeptr = ERR3;
882  break;
883  }
884  break;
885  }
886  }
887 
888 /* Perl supports \N{name} for character names, as well as plain \N for "not
889 newline". PCRE does not support \N{name}. */
890 
891 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
892  *errorcodeptr = ERR37;
893 
894 /* If PCRE_UCP is set, we change the values for \d etc. */
895 
896 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
897  c -= (ESC_DU - ESC_D);
898 
899 /* Set the pointer to the final character before returning. */
900 
901 *ptrptr = ptr;
902 return c;
903 }
904 
905 
906 
907 #ifdef SUPPORT_UCP
908 /*************************************************
909 * Handle \P and \p *
910 *************************************************/
911 
912 /* This function is called after \P or \p has been encountered, provided that
913 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
914 pointing at the P or p. On exit, it is pointing at the final character of the
915 escape sequence.
916 
917 Argument:
918  ptrptr points to the pattern position pointer
919  negptr points to a boolean that is set TRUE for negation else FALSE
920  dptr points to an int that is set to the detailed property value
921  errorcodeptr points to the error code variable
922 
923 Returns: type value from ucp_type_table, or -1 for an invalid type
924 */
925 
926 static int
927 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
928 {
929 int c, i, bot, top;
930 const uschar *ptr = *ptrptr;
931 char name[32];
932 
933 c = *(++ptr);
934 if (c == 0) goto ERROR_RETURN;
935 
936 *negptr = FALSE;
937 
938 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
939 negation. */
940 
941 if (c == CHAR_LEFT_CURLY_BRACKET)
942  {
943  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
944  {
945  *negptr = TRUE;
946  ptr++;
947  }
948  for (i = 0; i < (int)sizeof(name) - 1; i++)
949  {
950  c = *(++ptr);
951  if (c == 0) goto ERROR_RETURN;
952  if (c == CHAR_RIGHT_CURLY_BRACKET) break;
953  name[i] = c;
954  }
955  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
956  name[i] = 0;
957  }
958 
959 /* Otherwise there is just one following character */
960 
961 else
962  {
963  name[0] = c;
964  name[1] = 0;
965  }
966 
967 *ptrptr = ptr;
968 
969 /* Search for a recognized property name using binary chop */
970 
971 bot = 0;
972 top = _pcre_utt_size;
973 
974 while (bot < top)
975  {
976  i = (bot + top) >> 1;
977  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
978  if (c == 0)
979  {
980  *dptr = _pcre_utt[i].value;
981  return _pcre_utt[i].type;
982  }
983  if (c > 0) bot = i + 1; else top = i;
984  }
985 
986 *errorcodeptr = ERR47;
987 *ptrptr = ptr;
988 return -1;
989 
990 ERROR_RETURN:
991 *errorcodeptr = ERR46;
992 *ptrptr = ptr;
993 return -1;
994 }
995 #endif
996 
997 
998 
999 
1000 /*************************************************
1001 * Check for counted repeat *
1002 *************************************************/
1003 
1004 /* This function is called when a '{' is encountered in a place where it might
1005 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1006 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1007 where the ddds are digits.
1008 
1009 Arguments:
1010  p pointer to the first char after '{'
1011 
1012 Returns: TRUE or FALSE
1013 */
1014 
1015 static BOOL
1016 is_counted_repeat(const uschar *p)
1017 {
1018 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019 while ((digitab[*p] & ctype_digit) != 0) p++;
1020 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1021 
1022 if (*p++ != CHAR_COMMA) return FALSE;
1023 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1024 
1025 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1026 while ((digitab[*p] & ctype_digit) != 0) p++;
1027 
1028 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1029 }
1030 
1031 
1032 
1033 /*************************************************
1034 * Read repeat counts *
1035 *************************************************/
1036 
1037 /* Read an item of the form {n,m} and return the values. This is called only
1038 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1039 so the syntax is guaranteed to be correct, but we need to check the values.
1040 
1041 Arguments:
1042  p pointer to first char after '{'
1043  minp pointer to int for min
1044  maxp pointer to int for max
1045  returned as -1 if no max
1046  errorcodeptr points to error code variable
1047 
1048 Returns: pointer to '}' on success;
1049  current ptr on error, with errorcodeptr set non-zero
1050 */
1051 
1052 static const uschar *
1053 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1054 {
1055 int min = 0;
1056 int max = -1;
1057 
1058 /* Read the minimum value and do a paranoid check: a negative value indicates
1059 an integer overflow. */
1060 
1061 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1062 if (min < 0 || min > 65535)
1063  {
1064  *errorcodeptr = ERR5;
1065  return p;
1066  }
1067 
1068 /* Read the maximum value if there is one, and again do a paranoid on its size.
1069 Also, max must not be less than min. */
1070 
1071 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1072  {
1073  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1074  {
1075  max = 0;
1076  while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1077  if (max < 0 || max > 65535)
1078  {
1079  *errorcodeptr = ERR5;
1080  return p;
1081  }
1082  if (max < min)
1083  {
1084  *errorcodeptr = ERR4;
1085  return p;
1086  }
1087  }
1088  }
1089 
1090 /* Fill in the required variables, and pass back the pointer to the terminating
1091 '}'. */
1092 
1093 *minp = min;
1094 *maxp = max;
1095 return p;
1096 }
1097 
1098 
1099 
1100 /*************************************************
1101 * Subroutine for finding forward reference *
1102 *************************************************/
1103 
1104 /* This recursive function is called only from find_parens() below. The
1105 top-level call starts at the beginning of the pattern. All other calls must
1106 start at a parenthesis. It scans along a pattern's text looking for capturing
1107 subpatterns, and counting them. If it finds a named pattern that matches the
1108 name it is given, it returns its number. Alternatively, if the name is NULL, it
1109 returns when it reaches a given numbered subpattern. Recursion is used to keep
1110 track of subpatterns that reset the capturing group numbers - the (?| feature.
1111 
1112 This function was originally called only from the second pass, in which we know
1113 that if (?< or (?' or (?P< is encountered, the name will be correctly
1114 terminated because that is checked in the first pass. There is now one call to
1115 this function in the first pass, to check for a recursive back reference by
1116 name (so that we can make the whole group atomic). In this case, we need check
1117 only up to the current position in the pattern, and that is still OK because
1118 any previous occurrences will have been checked. To make this work, the test
1119 for "end of pattern" is a check against cd->end_pattern in the main loop,
1120 instead of looking for a binary zero. This means that the special first-pass
1121 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1122 processing items within the loop are OK, because afterwards the main loop will
1123 terminate.)
1124 
1125 Arguments:
1126  ptrptr address of the current character pointer (updated)
1127  cd compile background data
1128  name name to seek, or NULL if seeking a numbered subpattern
1129  lorn name length, or subpattern number if name is NULL
1130  xmode TRUE if we are in /x mode
1131  utf8 TRUE if we are in UTF-8 mode
1132  count pointer to the current capturing subpattern number (updated)
1133 
1134 Returns: the number of the named subpattern, or -1 if not found
1135 */
1136 
1137 static int
1138 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1139  BOOL xmode, BOOL utf8, int *count)
1140 {
1141 uschar *ptr = *ptrptr;
1142 int start_count = *count;
1143 int hwm_count = start_count;
1144 BOOL dup_parens = FALSE;
1145 
1146 /* If the first character is a parenthesis, check on the type of group we are
1147 dealing with. The very first call may not start with a parenthesis. */
1148 
1149 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1150  {
1151  /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1152 
1153  if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1154 
1155  /* Handle a normal, unnamed capturing parenthesis. */
1156 
1157  else if (ptr[1] != CHAR_QUESTION_MARK)
1158  {
1159  *count += 1;
1160  if (name == NULL && *count == lorn) return *count;
1161  ptr++;
1162  }
1163 
1164  /* All cases now have (? at the start. Remember when we are in a group
1165  where the parenthesis numbers are duplicated. */
1166 
1167  else if (ptr[2] == CHAR_VERTICAL_LINE)
1168  {
1169  ptr += 3;
1170  dup_parens = TRUE;
1171  }
1172 
1173  /* Handle comments; all characters are allowed until a ket is reached. */
1174 
1175  else if (ptr[2] == CHAR_NUMBER_SIGN)
1176  {
1177  for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1178  goto FAIL_EXIT;
1179  }
1180 
1181  /* Handle a condition. If it is an assertion, just carry on so that it
1182  is processed as normal. If not, skip to the closing parenthesis of the
1183  condition (there can't be any nested parens). */
1184 
1185  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1186  {
1187  ptr += 2;
1188  if (ptr[1] != CHAR_QUESTION_MARK)
1189  {
1190  while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1191  if (*ptr != 0) ptr++;
1192  }
1193  }
1194 
1195  /* Start with (? but not a condition. */
1196 
1197  else
1198  {
1199  ptr += 2;
1200  if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1201 
1202  /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1203 
1204  if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1205  ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1206  {
1207  int term;
1208  const uschar *thisname;
1209  *count += 1;
1210  if (name == NULL && *count == lorn) return *count;
1211  term = *ptr++;
1212  if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1213  thisname = ptr;
1214  while (*ptr != term) ptr++;
1215  if (name != NULL && lorn == ptr - thisname &&
1216  strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1217  return *count;
1218  term++;
1219  }
1220  }
1221  }
1222 
1223 /* Past any initial parenthesis handling, scan for parentheses or vertical
1224 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1225 first-pass call when this value is temporarily adjusted to stop at the current
1226 position. So DO NOT change this to a test for binary zero. */
1227 
1228 for (; ptr < cd->end_pattern; ptr++)
1229  {
1230  /* Skip over backslashed characters and also entire \Q...\E */
1231 
1232  if (*ptr == CHAR_BACKSLASH)
1233  {
1234  if (*(++ptr) == 0) goto FAIL_EXIT;
1235  if (*ptr == CHAR_Q) for (;;)
1236  {
1237  while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1238  if (*ptr == 0) goto FAIL_EXIT;
1239  if (*(++ptr) == CHAR_E) break;
1240  }
1241  continue;
1242  }
1243 
1244  /* Skip over character classes; this logic must be similar to the way they
1245  are handled for real. If the first character is '^', skip it. Also, if the
1246  first few characters (either before or after ^) are \Q\E or \E we skip them
1247  too. This makes for compatibility with Perl. Note the use of STR macros to
1248  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1249 
1250  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1251  {
1252  BOOL negate_class = FALSE;
1253  for (;;)
1254  {
1255  if (ptr[1] == CHAR_BACKSLASH)
1256  {
1257  if (ptr[2] == CHAR_E)
1258  ptr+= 2;
1259  else if (strncmp((const char *)ptr+2,
1260  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1261  ptr += 4;
1262  else
1263  break;
1264  }
1265  else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1266  {
1267  negate_class = TRUE;
1268  ptr++;
1269  }
1270  else break;
1271  }
1272 
1273  /* If the next character is ']', it is a data character that must be
1274  skipped, except in JavaScript compatibility mode. */
1275 
1276  if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1278  ptr++;
1279 
1280  while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1281  {
1282  if (*ptr == 0) return -1;
1283  if (*ptr == CHAR_BACKSLASH)
1284  {
1285  if (*(++ptr) == 0) goto FAIL_EXIT;
1286  if (*ptr == CHAR_Q) for (;;)
1287  {
1288  while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1289  if (*ptr == 0) goto FAIL_EXIT;
1290  if (*(++ptr) == CHAR_E) break;
1291  }
1292  continue;
1293  }
1294  }
1295  continue;
1296  }
1297 
1298  /* Skip comments in /x mode */
1299 
1300  if (xmode && *ptr == CHAR_NUMBER_SIGN)
1301  {
1302  ptr++;
1303  while (*ptr != 0)
1304  {
1305  if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1306  ptr++;
1307 #ifdef SUPPORT_UTF8
1308  if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1309 #endif
1310  }
1311  if (*ptr == 0) goto FAIL_EXIT;
1312  continue;
1313  }
1314 
1315  /* Check for the special metacharacters */
1316 
1317  if (*ptr == CHAR_LEFT_PARENTHESIS)
1318  {
1319  int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1320  if (rc > 0) return rc;
1321  if (*ptr == 0) goto FAIL_EXIT;
1322  }
1323 
1324  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1325  {
1326  if (dup_parens && *count < hwm_count) *count = hwm_count;
1327  goto FAIL_EXIT;
1328  }
1329 
1330  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1331  {
1332  if (*count > hwm_count) hwm_count = *count;
1333  *count = start_count;
1334  }
1335  }
1336 
1337 FAIL_EXIT:
1338 *ptrptr = ptr;
1339 return -1;
1340 }
1341 
1342 
1343 
1344 
1345 /*************************************************
1346 * Find forward referenced subpattern *
1347 *************************************************/
1348 
1349 /* This function scans along a pattern's text looking for capturing
1350 subpatterns, and counting them. If it finds a named pattern that matches the
1351 name it is given, it returns its number. Alternatively, if the name is NULL, it
1352 returns when it reaches a given numbered subpattern. This is used for forward
1353 references to subpatterns. We used to be able to start this scan from the
1354 current compiling point, using the current count value from cd->bracount, and
1355 do it all in a single loop, but the addition of the possibility of duplicate
1356 subpattern numbers means that we have to scan from the very start, in order to
1357 take account of such duplicates, and to use a recursive function to keep track
1358 of the different types of group.
1359 
1360 Arguments:
1361  cd compile background data
1362  name name to seek, or NULL if seeking a numbered subpattern
1363  lorn name length, or subpattern number if name is NULL
1364  xmode TRUE if we are in /x mode
1365  utf8 TRUE if we are in UTF-8 mode
1366 
1367 Returns: the number of the found subpattern, or -1 if not found
1368 */
1369 
1370 static int
1371 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1372  BOOL utf8)
1373 {
1374 uschar *ptr = (uschar *)cd->start_pattern;
1375 int count = 0;
1376 int rc;
1377 
1378 /* If the pattern does not start with an opening parenthesis, the first call
1379 to find_parens_sub() will scan right to the end (if necessary). However, if it
1380 does start with a parenthesis, find_parens_sub() will return when it hits the
1381 matching closing parens. That is why we have to have a loop. */
1382 
1383 for (;;)
1384  {
1385  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1386  if (rc > 0 || *ptr++ == 0) break;
1387  }
1388 
1389 return rc;
1390 }
1391 
1392 
1393 
1394 
1395 /*************************************************
1396 * Find first significant op code *
1397 *************************************************/
1398 
1399 /* This is called by several functions that scan a compiled expression looking
1400 for a fixed first character, or an anchoring op code etc. It skips over things
1401 that do not influence this. For some calls, a change of option is important.
1402 For some calls, it makes sense to skip negative forward and all backward
1403 assertions, and also the \b assertion; for others it does not.
1404 
1405 Arguments:
1406  code pointer to the start of the group
1407  options pointer to external options
1408  optbit the option bit whose changing is significant, or
1409  zero if none are
1410  skipassert TRUE if certain assertions are to be skipped
1411 
1412 Returns: pointer to the first significant opcode
1413 */
1414 
1415 static const uschar*
1416 first_significant_code(const uschar *code, int *options, int optbit,
1417  BOOL skipassert)
1418 {
1419 for (;;)
1420  {
1421  switch ((int)*code)
1422  {
1423  case OP_OPT:
1424  if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1425  *options = (int)code[1];
1426  code += 2;
1427  break;
1428 
1429  case OP_ASSERT_NOT:
1430  case OP_ASSERTBACK:
1431  case OP_ASSERTBACK_NOT:
1432  if (!skipassert) return code;
1433  do code += GET(code, 1); while (*code == OP_ALT);
1434  code += _pcre_OP_lengths[*code];
1435  break;
1436 
1437  case OP_WORD_BOUNDARY:
1438  case OP_NOT_WORD_BOUNDARY:
1439  if (!skipassert) return code;
1440  /* Fall through */
1441 
1442  case OP_CALLOUT:
1443  case OP_CREF:
1444  case OP_NCREF:
1445  case OP_RREF:
1446  case OP_NRREF:
1447  case OP_DEF:
1448  code += _pcre_OP_lengths[*code];
1449  break;
1450 
1451  default:
1452  return code;
1453  }
1454  }
1455 /* Control never reaches here */
1456 }
1457 
1458 
1459 
1460 
1461 /*************************************************
1462 * Find the fixed length of a branch *
1463 *************************************************/
1464 
1465 /* Scan a branch and compute the fixed length of subject that will match it,
1466 if the length is fixed. This is needed for dealing with backward assertions.
1467 In UTF8 mode, the result is in characters rather than bytes. The branch is
1468 temporarily terminated with OP_END when this function is called.
1469 
1470 This function is called when a backward assertion is encountered, so that if it
1471 fails, the error message can point to the correct place in the pattern.
1472 However, we cannot do this when the assertion contains subroutine calls,
1473 because they can be forward references. We solve this by remembering this case
1474 and doing the check at the end; a flag specifies which mode we are running in.
1475 
1476 Arguments:
1477  code points to the start of the pattern (the bracket)
1478  options the compiling options
1479  atend TRUE if called when the pattern is complete
1480  cd the "compile data" structure
1481 
1482 Returns: the fixed length,
1483  or -1 if there is no fixed length,
1484  or -2 if \C was encountered
1485  or -3 if an OP_RECURSE item was encountered and atend is FALSE
1486 */
1487 
1488 static int
1489 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1490 {
1491 int length = -1;
1492 
1493 register int branchlength = 0;
1494 register uschar *cc = code + 1 + LINK_SIZE;
1495 
1496 /* Scan along the opcodes for this branch. If we get to the end of the
1497 branch, check the length against that of the other branches. */
1498 
1499 for (;;)
1500  {
1501  int d;
1502  uschar *ce, *cs;
1503  register int op = *cc;
1504  switch (op)
1505  {
1506  case OP_CBRA:
1507  case OP_BRA:
1508  case OP_ONCE:
1509  case OP_COND:
1510  d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1511  if (d < 0) return d;
1512  branchlength += d;
1513  do cc += GET(cc, 1); while (*cc == OP_ALT);
1514  cc += 1 + LINK_SIZE;
1515  break;
1516 
1517  /* Reached end of a branch; if it's a ket it is the end of a nested
1518  call. If it's ALT it is an alternation in a nested call. If it is
1519  END it's the end of the outer call. All can be handled by the same code. */
1520 
1521  case OP_ALT:
1522  case OP_KET:
1523  case OP_KETRMAX:
1524  case OP_KETRMIN:
1525  case OP_END:
1526  if (length < 0) length = branchlength;
1527  else if (length != branchlength) return -1;
1528  if (*cc != OP_ALT) return length;
1529  cc += 1 + LINK_SIZE;
1530  branchlength = 0;
1531  break;
1532 
1533  /* A true recursion implies not fixed length, but a subroutine call may
1534  be OK. If the subroutine is a forward reference, we can't deal with
1535  it until the end of the pattern, so return -3. */
1536 
1537  case OP_RECURSE:
1538  if (!atend) return -3;
1539  cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1540  do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1541  if (cc > cs && cc < ce) return -1; /* Recursion */
1542  d = find_fixedlength(cs + 2, options, atend, cd);
1543  if (d < 0) return d;
1544  branchlength += d;
1545  cc += 1 + LINK_SIZE;
1546  break;
1547 
1548  /* Skip over assertive subpatterns */
1549 
1550  case OP_ASSERT:
1551  case OP_ASSERT_NOT:
1552  case OP_ASSERTBACK:
1553  case OP_ASSERTBACK_NOT:
1554  do cc += GET(cc, 1); while (*cc == OP_ALT);
1555  /* Fall through */
1556 
1557  /* Skip over things that don't match chars */
1558 
1559  case OP_REVERSE:
1560  case OP_CREF:
1561  case OP_NCREF:
1562  case OP_RREF:
1563  case OP_NRREF:
1564  case OP_DEF:
1565  case OP_OPT:
1566  case OP_CALLOUT:
1567  case OP_SOD:
1568  case OP_SOM:
1569  case OP_SET_SOM:
1570  case OP_EOD:
1571  case OP_EODN:
1572  case OP_CIRC:
1573  case OP_DOLL:
1574  case OP_NOT_WORD_BOUNDARY:
1575  case OP_WORD_BOUNDARY:
1576  cc += _pcre_OP_lengths[*cc];
1577  break;
1578 
1579  /* Handle literal characters */
1580 
1581  case OP_CHAR:
1582  case OP_CHARNC:
1583  case OP_NOT:
1584  branchlength++;
1585  cc += 2;
1586 #ifdef SUPPORT_UTF8
1587  if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1588  cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1589 #endif
1590  break;
1591 
1592  /* Handle exact repetitions. The count is already in characters, but we
1593  need to skip over a multibyte character in UTF8 mode. */
1594 
1595  case OP_EXACT:
1596  branchlength += GET2(cc,1);
1597  cc += 4;
1598 #ifdef SUPPORT_UTF8
1599  if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1600  cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1601 #endif
1602  break;
1603 
1604  case OP_TYPEEXACT:
1605  branchlength += GET2(cc,1);
1606  if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1607  cc += 4;
1608  break;
1609 
1610  /* Handle single-char matchers */
1611 
1612  case OP_PROP:
1613  case OP_NOTPROP:
1614  cc += 2;
1615  /* Fall through */
1616 
1617  case OP_NOT_DIGIT:
1618  case OP_DIGIT:
1619  case OP_NOT_WHITESPACE:
1620  case OP_WHITESPACE:
1621  case OP_NOT_WORDCHAR:
1622  case OP_WORDCHAR:
1623  case OP_ANY:
1624  case OP_ALLANY:
1625  branchlength++;
1626  cc++;
1627  break;
1628 
1629  /* The single-byte matcher isn't allowed */
1630 
1631  case OP_ANYBYTE:
1632  return -2;
1633 
1634  /* Check a class for variable quantification */
1635 
1636 #ifdef SUPPORT_UTF8
1637  case OP_XCLASS:
1638  cc += GET(cc, 1) - 33;
1639  /* Fall through */
1640 #endif
1641 
1642  case OP_CLASS:
1643  case OP_NCLASS:
1644  cc += 33;
1645 
1646  switch (*cc)
1647  {
1648  case OP_CRSTAR:
1649  case OP_CRMINSTAR:
1650  case OP_CRQUERY:
1651  case OP_CRMINQUERY:
1652  return -1;
1653 
1654  case OP_CRRANGE:
1655  case OP_CRMINRANGE:
1656  if (GET2(cc,1) != GET2(cc,3)) return -1;
1657  branchlength += GET2(cc,1);
1658  cc += 5;
1659  break;
1660 
1661  default:
1662  branchlength++;
1663  }
1664  break;
1665 
1666  /* Anything else is variable length */
1667 
1668  default:
1669  return -1;
1670  }
1671  }
1672 /* Control never gets here */
1673 }
1674 
1675 
1676 
1677 
1678 /*************************************************
1679 * Scan compiled regex for specific bracket *
1680 *************************************************/
1681 
1682 /* This little function scans through a compiled pattern until it finds a
1683 capturing bracket with the given number, or, if the number is negative, an
1684 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1685 so that it can be called from pcre_study() when finding the minimum matching
1686 length.
1687 
1688 Arguments:
1689  code points to start of expression
1690  utf8 TRUE in UTF-8 mode
1691  number the required bracket number or negative to find a lookbehind
1692 
1693 Returns: pointer to the opcode for the bracket, or NULL if not found
1694 */
1695 
1696 const uschar *
1697 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1698 {
1699 for (;;)
1700  {
1701  register int c = *code;
1702  if (c == OP_END) return NULL;
1703 
1704  /* XCLASS is used for classes that cannot be represented just by a bit
1705  map. This includes negated single high-valued characters. The length in
1706  the table is zero; the actual length is stored in the compiled code. */
1707 
1708  if (c == OP_XCLASS) code += GET(code, 1);
1709 
1710  /* Handle recursion */
1711 
1712  else if (c == OP_REVERSE)
1713  {
1714  if (number < 0) return (uschar *)code;
1715  code += _pcre_OP_lengths[c];
1716  }
1717 
1718  /* Handle capturing bracket */
1719 
1720  else if (c == OP_CBRA)
1721  {
1722  int n = GET2(code, 1+LINK_SIZE);
1723  if (n == number) return (uschar *)code;
1724  code += _pcre_OP_lengths[c];
1725  }
1726 
1727  /* Otherwise, we can get the item's length from the table, except that for
1728  repeated character types, we have to test for \p and \P, which have an extra
1729  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1730  must add in its length. */
1731 
1732  else
1733  {
1734  switch(c)
1735  {
1736  case OP_TYPESTAR:
1737  case OP_TYPEMINSTAR:
1738  case OP_TYPEPLUS:
1739  case OP_TYPEMINPLUS:
1740  case OP_TYPEQUERY:
1741  case OP_TYPEMINQUERY:
1742  case OP_TYPEPOSSTAR:
1743  case OP_TYPEPOSPLUS:
1744  case OP_TYPEPOSQUERY:
1745  if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1746  break;
1747 
1748  case OP_TYPEUPTO:
1749  case OP_TYPEMINUPTO:
1750  case OP_TYPEEXACT:
1751  case OP_TYPEPOSUPTO:
1752  if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1753  break;
1754 
1755  case OP_MARK:
1756  case OP_PRUNE_ARG:
1757  case OP_SKIP_ARG:
1758  code += code[1];
1759  break;
1760 
1761  case OP_THEN_ARG:
1762  code += code[1+LINK_SIZE];
1763  break;
1764  }
1765 
1766  /* Add in the fixed length from the table */
1767 
1768  code += _pcre_OP_lengths[c];
1769 
1770  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1771  a multi-byte character. The length in the table is a minimum, so we have to
1772  arrange to skip the extra bytes. */
1773 
1774 #ifdef SUPPORT_UTF8
1775  if (utf8) switch(c)
1776  {
1777  case OP_CHAR:
1778  case OP_CHARNC:
1779  case OP_EXACT:
1780  case OP_UPTO:
1781  case OP_MINUPTO:
1782  case OP_POSUPTO:
1783  case OP_STAR:
1784  case OP_MINSTAR:
1785  case OP_POSSTAR:
1786  case OP_PLUS:
1787  case OP_MINPLUS:
1788  case OP_POSPLUS:
1789  case OP_QUERY:
1790  case OP_MINQUERY:
1791  case OP_POSQUERY:
1792  if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1793  break;
1794  }
1795 #else
1796  (void)(utf8); /* Keep compiler happy by referencing function argument */
1797 #endif
1798  }
1799  }
1800 }
1801 
1802 
1803 
1804 /*************************************************
1805 * Scan compiled regex for recursion reference *
1806 *************************************************/
1807 
1808 /* This little function scans through a compiled pattern until it finds an
1809 instance of OP_RECURSE.
1810 
1811 Arguments:
1812  code points to start of expression
1813  utf8 TRUE in UTF-8 mode
1814 
1815 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1816 */
1817 
1818 static const uschar *
1819 find_recurse(const uschar *code, BOOL utf8)
1820 {
1821 for (;;)
1822  {
1823  register int c = *code;
1824  if (c == OP_END) return NULL;
1825  if (c == OP_RECURSE) return code;
1826 
1827  /* XCLASS is used for classes that cannot be represented just by a bit
1828  map. This includes negated single high-valued characters. The length in
1829  the table is zero; the actual length is stored in the compiled code. */
1830 
1831  if (c == OP_XCLASS) code += GET(code, 1);
1832 
1833  /* Otherwise, we can get the item's length from the table, except that for
1834  repeated character types, we have to test for \p and \P, which have an extra
1835  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1836  must add in its length. */
1837 
1838  else
1839  {
1840  switch(c)
1841  {
1842  case OP_TYPESTAR:
1843  case OP_TYPEMINSTAR:
1844  case OP_TYPEPLUS:
1845  case OP_TYPEMINPLUS:
1846  case OP_TYPEQUERY:
1847  case OP_TYPEMINQUERY:
1848  case OP_TYPEPOSSTAR:
1849  case OP_TYPEPOSPLUS:
1850  case OP_TYPEPOSQUERY:
1851  if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1852  break;
1853 
1854  case OP_TYPEPOSUPTO:
1855  case OP_TYPEUPTO:
1856  case OP_TYPEMINUPTO:
1857  case OP_TYPEEXACT:
1858  if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1859  break;
1860 
1861  case OP_MARK:
1862  case OP_PRUNE_ARG:
1863  case OP_SKIP_ARG:
1864  code += code[1];
1865  break;
1866 
1867  case OP_THEN_ARG:
1868  code += code[1+LINK_SIZE];
1869  break;
1870  }
1871 
1872  /* Add in the fixed length from the table */
1873 
1874  code += _pcre_OP_lengths[c];
1875 
1876  /* In UTF-8 mode, opcodes that are followed by a character may be followed
1877  by a multi-byte character. The length in the table is a minimum, so we have
1878  to arrange to skip the extra bytes. */
1879 
1880 #ifdef SUPPORT_UTF8
1881  if (utf8) switch(c)
1882  {
1883  case OP_CHAR:
1884  case OP_CHARNC:
1885  case OP_EXACT:
1886  case OP_UPTO:
1887  case OP_MINUPTO:
1888  case OP_POSUPTO:
1889  case OP_STAR:
1890  case OP_MINSTAR:
1891  case OP_POSSTAR:
1892  case OP_PLUS:
1893  case OP_MINPLUS:
1894  case OP_POSPLUS:
1895  case OP_QUERY:
1896  case OP_MINQUERY:
1897  case OP_POSQUERY:
1898  if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1899  break;
1900  }
1901 #else
1902  (void)(utf8); /* Keep compiler happy by referencing function argument */
1903 #endif
1904  }
1905  }
1906 }
1907 
1908 
1909 
1910 /*************************************************
1911 * Scan compiled branch for non-emptiness *
1912 *************************************************/
1913 
1914 /* This function scans through a branch of a compiled pattern to see whether it
1915 can match the empty string or not. It is called from could_be_empty()
1916 below and from compile_branch() when checking for an unlimited repeat of a
1917 group that can match nothing. Note that first_significant_code() skips over
1918 backward and negative forward assertions when its final argument is TRUE. If we
1919 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1920 bracket whose current branch will already have been scanned.
1921 
1922 Arguments:
1923  code points to start of search
1924  endcode points to where to stop
1925  utf8 TRUE if in UTF8 mode
1926  cd contains pointers to tables etc.
1927 
1928 Returns: TRUE if what is matched could be empty
1929 */
1930 
1931 static BOOL
1932 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1933  compile_data *cd)
1934 {
/* c holds the opcode currently being examined. The loop's increment
expression uses _pcre_OP_lengths[c] to step over the fixed-length part of that
item, so any path that moves "code" by hand and then does "continue" must leave
c describing the item that "code" now points at. */
1935 register int c;
/* The initial expression skips the opcode that "code" points at on entry
before scanning starts; scanning stops when endcode is reached. */
1936 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1937  code < endcode;
1938  code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1939  {
1940  const uschar *ccode;
1941 
1942  c = *code;
1943 
1944  /* Skip over forward assertions; the other assertions are skipped by
1945  first_significant_code() with a TRUE final argument. */
1946 
1947  if (c == OP_ASSERT)
1948  {
 /* Follow the link at offset 1 through every OP_ALT alternative; the loop
 stops at the first item that is not OP_ALT. c is reloaded so that the loop
 increment steps over that item. */
1949  do code += GET(code, 1); while (*code == OP_ALT);
1950  c = *code;
1951  continue;
1952  }
1953 
1954  /* Groups with zero repeats can of course be empty; skip them. */
1955 
1956  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1957  {
 /* Step over the zero-repeat opcode itself, then over all alternatives of
 the group that follows it. */
1958  code += _pcre_OP_lengths[c];
1959  do code += GET(code, 1); while (*code == OP_ALT);
1960  c = *code;
1961  continue;
1962  }
1963 
1964  /* For a recursion/subroutine call, if its end has been reached, which
1965  implies a subroutine call, we can scan it. */
1966 
1967  if (c == OP_RECURSE)
1968  {
1969  BOOL empty_branch = FALSE;
 /* The RECURSE operand is an offset from cd->start_code to the called
 group. A zero link in that group is treated as "not yet closed". */
1970  const uschar *scode = cd->start_code + GET(code, 1);
1971  if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
 /* One empty alternative in the called group is enough; stop at the first. */
1972  do
1973  {
1974  if (could_be_empty_branch(scode, endcode, utf8, cd))
1975  {
1976  empty_branch = TRUE;
1977  break;
1978  }
1979  scode += GET(scode, 1);
1980  }
1981  while (*scode == OP_ALT);
1982  if (!empty_branch) return FALSE; /* All branches are non-empty */
 /* "code" and c are unchanged here (c is still OP_RECURSE), so the loop
 increment steps over the RECURSE item itself. */
1983  continue;
1984  }
1985 
1986  /* For other groups, scan the branches. */
1987 
1988  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1989  {
1990  BOOL empty_branch;
1991  if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1992 
1993  /* If a conditional group has only one branch, there is a second, implied,
1994  empty branch, so just skip over the conditional, because it could be empty.
1995  Otherwise, scan the individual branches of the group. */
1996 
1997  if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1998  code += GET(code, 1);
1999  else
2000  {
2001  empty_branch = FALSE;
 /* Unlike the RECURSE case, this loop always runs to the end of the
 alternation so that "code" finishes on the item after the last branch;
 once an empty branch has been seen the recursive scan is not repeated. */
2002  do
2003  {
2004  if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2005  empty_branch = TRUE;
2006  code += GET(code, 1);
2007  }
2008  while (*code == OP_ALT);
2009  if (!empty_branch) return FALSE; /* All branches are non-empty */
2010  }
2011 
 /* Reload c so the loop increment steps over the item now at "code". */
2012  c = *code;
2013  continue;
2014  }
2015 
2016  /* Handle the other opcodes */
2017 
2018  switch (c)
2019  {
2020  /* Check for quantifiers after a class. XCLASS is used for classes that
2021  cannot be represented just by a bit map. This includes negated single
2022  high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2023  actual length is stored in the compiled code, so we must update "code"
2024  here. */
2025 
2026 #ifdef SUPPORT_UTF8
2027  case OP_XCLASS:
2028  ccode = code += GET(code, 1);
2029  goto CHECK_CLASS_REPEAT;
2030 #endif
2031 
2032  case OP_CLASS:
2033  case OP_NCLASS:
 /* ccode is set to the byte that may hold a repeat opcode for the class.
 NOTE(review): 33 is presumably the opcode byte plus a 32-byte bit map -
 confirm against the class item layout. */
2034  ccode = code + 33;
2035 
2036 #ifdef SUPPORT_UTF8
2037  CHECK_CLASS_REPEAT:
2038 #endif
2039 
2040  switch (*ccode)
2041  {
2042  case OP_CRSTAR: /* These could be empty; continue */
2043  case OP_CRMINSTAR:
2044  case OP_CRQUERY:
2045  case OP_CRMINQUERY:
2046  break;
2047 
2048  default: /* Non-repeat => class must match */
2049  case OP_CRPLUS: /* These repeats aren't empty */
2050  case OP_CRMINPLUS:
2051  return FALSE;
2052 
2053  case OP_CRRANGE:
2054  case OP_CRMINRANGE:
2055  if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2056  break;
2057  }
2058  break;
2059 
2060  /* Opcodes that must match a character */
2061 
2062  case OP_PROP:
2063  case OP_NOTPROP:
2064  case OP_EXTUNI:
2065  case OP_NOT_DIGIT:
2066  case OP_DIGIT:
2067  case OP_NOT_WHITESPACE:
2068  case OP_WHITESPACE:
2069  case OP_NOT_WORDCHAR:
2070  case OP_WORDCHAR:
2071  case OP_ANY:
2072  case OP_ALLANY:
2073  case OP_ANYBYTE:
2074  case OP_CHAR:
2075  case OP_CHARNC:
2076  case OP_NOT:
2077  case OP_PLUS:
2078  case OP_MINPLUS:
2079  case OP_POSPLUS:
2080  case OP_EXACT:
2081  case OP_NOTPLUS:
2082  case OP_NOTMINPLUS:
2083  case OP_NOTPOSPLUS:
2084  case OP_NOTEXACT:
2085  case OP_TYPEPLUS:
2086  case OP_TYPEMINPLUS:
2087  case OP_TYPEPOSPLUS:
2088  case OP_TYPEEXACT:
2089  return FALSE;
2090 
2091  /* These are going to continue, as they may be empty, but we have to
2092  fudge the length for the \p and \P cases. */
2093 
2094  case OP_TYPESTAR:
2095  case OP_TYPEMINSTAR:
2096  case OP_TYPEPOSSTAR:
2097  case OP_TYPEQUERY:
2098  case OP_TYPEMINQUERY:
2099  case OP_TYPEPOSQUERY:
 /* The repeated type is at code[1]; a property type is followed by two
 extra bytes that the table length does not include. */
2100  if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2101  break;
2102 
2103  /* Same for these */
2104 
2105  case OP_TYPEUPTO:
2106  case OP_TYPEMINUPTO:
2107  case OP_TYPEPOSUPTO:
 /* Here the repeated type is at code[3], after the bytes that precede it
 in these items. */
2108  if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2109  break;
2110 
2111  /* End of branch */
2112 
2113  case OP_KET:
2114  case OP_KETRMAX:
2115  case OP_KETRMIN:
2116  case OP_ALT:
2117  return TRUE;
2118 
2119  /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2120  MINUPTO, and POSUPTO may be followed by a multibyte character */
2121 
2122 #ifdef SUPPORT_UTF8
2123  case OP_STAR:
2124  case OP_MINSTAR:
2125  case OP_POSSTAR:
2126  case OP_QUERY:
2127  case OP_MINQUERY:
2128  case OP_POSQUERY:
 /* A lead byte >= 0xc0 starts a multibyte character; _pcre_utf8_table4
 supplies the number of additional bytes to skip. */
2129  if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2130  break;
2131 
2132  case OP_UPTO:
2133  case OP_MINUPTO:
2134  case OP_POSUPTO:
2135  if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2136  break;
2137 #endif
2138 
2139  /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2140  string. */
2141 
2142  case OP_MARK:
2143  case OP_PRUNE_ARG:
2144  case OP_SKIP_ARG:
2145  code += code[1];
2146  break;
2147 
2148  case OP_THEN_ARG:
2149  code += code[1+LINK_SIZE];
2150  break;
2151 
2152  /* None of the remaining opcodes are required to match a character. */
2153 
2154  default:
2155  break;
2156  }
2157  }
2158 
/* endcode was reached without finding an item that must match a character. */
2159 return TRUE;
2160 }
2161 
2162 
2163 
2164 /*************************************************
2165 * Scan compiled regex for non-emptiness *
2166 *************************************************/
2167 
2168 /* This function is called to check for left recursive calls. We want to check
2169 the current branch of the current pattern to see if it could match the empty
2170 string. If it could, we must look outwards for branches at other levels,
2171 stopping when we pass beyond the bracket which is the subject of the recursion.
2172 
2173 Arguments:
2174  code points to start of the recursion
2175  endcode points to where to stop (current RECURSE item)
2176  bcptr points to the chain of current (unclosed) branch starts
2177  utf8 TRUE if in UTF-8 mode
2178  cd pointers to tables etc
2179 
2180 Returns: TRUE if what is matched could be empty
2181 */
2182 
2183 static BOOL
2184 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2185  BOOL utf8, compile_data *cd)
2186 {
2187 while (bcptr != NULL && bcptr->current_branch >= code)
2188  {
2189  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2190  return FALSE;
2191  bcptr = bcptr->outer;
2192  }
2193 return TRUE;
2194 }
2195 
2196 
2197 
2198 /*************************************************
2199 * Check for POSIX class syntax *
2200 *************************************************/
2201 
2202 /* This function is called when the sequence "[:" or "[." or "[=" is
2203 encountered in a character class. It checks whether this is followed by a
2204 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2205 reach an unescaped ']' without the special preceding character, return FALSE.
2206 
2207 Originally, this function only recognized a sequence of letters between the
2208 terminators, but it seems that Perl recognizes any sequence of characters,
2209 though of course unknown POSIX names are subsequently rejected. Perl gives an
2210 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2211 didn't consider this to be a POSIX class. Likewise for [:1234:].
2212 
2213 The problem in trying to be exactly like Perl is in the handling of escapes. We
2214 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2215 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2216 below handles the special case of \], but does not try to do any other escape
2217 processing. This makes it different from Perl for cases such as [:l\ower:]
2218 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2219 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2220 I think.
2221 
2222 Arguments:
2223  ptr pointer to the initial [
2224  endptr where to return the end pointer
2225 
2226 Returns: TRUE or FALSE
2227 */
2228 
2229 static BOOL
2230 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2231 {
2232 int terminator; /* Don't combine these lines; the Solaris cc */
2233 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2234 for (++ptr; *ptr != 0; ptr++)
2235  {
2236  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2237  {
2238  if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2239  if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2240  {
2241  *endptr = ptr;
2242  return TRUE;
2243  }
2244  }
2245  }
2246 return FALSE;
2247 }
2248 
2249 
2250 
2251 
2252 /*************************************************
2253 * Check POSIX class name *
2254 *************************************************/
2255 
2256 /* This function is called to check the name given in a POSIX-style class entry
2257 such as [:alnum:].
2258 
2259 Arguments:
2260  ptr points to the first letter
2261  len the length of the name
2262 
2263 Returns: a value representing the name, or -1 if unknown
2264 */
2265 
2266 static int
2267 check_posix_name(const uschar *ptr, int len)
2268 {
2269 const char *pn = posix_names;
2270 register int yield = 0;
2271 while (posix_name_lengths[yield] != 0)
2272  {
2273  if (len == posix_name_lengths[yield] &&
2274  strncmp((const char *)ptr, pn, len) == 0) return yield;
2275  pn += posix_name_lengths[yield] + 1;
2276  yield++;
2277  }
2278 return -1;
2279 }
2280 
2281 
2282 /*************************************************
2283 * Adjust OP_RECURSE items in repeated group *
2284 *************************************************/
2285 
2286 /* OP_RECURSE items contain an offset from the start of the regex to the group
2287 that is referenced. This means that groups can be replicated for fixed
2288 repetition simply by copying (because the recursion is allowed to refer to
2289 earlier groups that are outside the current group). However, when a group is
2290 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2291 inserted before it, after it has been compiled. This means that any OP_RECURSE
2292 items within it that refer to the group itself or any contained groups have to
2293 have their offsets adjusted. That is one of the jobs of this function. Before it
2294 is called, the partially compiled regex must be temporarily terminated with
2295 OP_END.
2296 
2297 This function has been extended with the possibility of forward references for
2298 recursions and subroutine calls. It must also check the list of such references
2299 for the group we are dealing with. If it finds that one of the recursions in
2300 the current group is on this list, it adjusts the offset in the list, not the
2301 value in the reference (which is a group number).
2302 
2303 Arguments:
2304  group points to the start of the group
2305  adjust the amount by which the group is to be moved
2306  utf8 TRUE in UTF-8 mode
2307  cd contains pointers to tables etc.
2308  save_hwm the hwm forward reference pointer at the start of the group
2309 
2310 Returns: nothing
2311 */
2312 
2313 static void
2314 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2315  uschar *save_hwm)
2316 {
2317 uschar *ptr = group;
2318 
2319 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2320  {
2321  int offset;
2322  uschar *hc;
2323 
2324  /* See if this recursion is on the forward reference list. If so, adjust the
2325  reference. */
2326 
2327  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2328  {
2329  offset = GET(hc, 0);
2330  if (cd->start_code + offset == ptr + 1)
2331  {
2332  PUT(hc, 0, offset + adjust);
2333  break;
2334  }
2335  }
2336 
2337  /* Otherwise, adjust the recursion offset if it's after the start of this
2338  group. */
2339 
2340  if (hc >= cd->hwm)
2341  {
2342  offset = GET(ptr, 1);
2343  if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2344  }
2345 
2346  ptr += 1 + LINK_SIZE;
2347  }
2348 }
2349 
2350 
2351 
2352 /*************************************************
2353 * Insert an automatic callout point *
2354 *************************************************/
2355 
2356 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2357 callout points before each pattern item.
2358 
2359 Arguments:
2360  code current code pointer
2361  ptr current pattern pointer
2362  cd pointers to tables etc
2363 
2364 Returns: new code pointer
2365 */
2366 
2367 static uschar *
2368 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2369 {
2370 *code++ = OP_CALLOUT;
2371 *code++ = 255;
2372 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2373 PUT(code, LINK_SIZE, 0); /* Default length */
2374 return code + 2*LINK_SIZE;
2375 }
2376 
2377 
2378 
2379 /*************************************************
2380 * Complete a callout item *
2381 *************************************************/
2382 
2383 /* A callout item contains the length of the next item in the pattern, which
2384 we can't fill in till after we have reached the relevant point. This is used
2385 for both automatic and manual callouts.
2386 
2387 Arguments:
2388  previous_callout points to previous callout item
2389  ptr current pattern pointer
2390  cd pointers to tables etc
2391 
2392 Returns: nothing
2393 */
2394 
2395 static void
2396 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2397 {
2398 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2399 PUT(previous_callout, 2 + LINK_SIZE, length);
2400 }
2401 
2402 
2403 
2404 #ifdef SUPPORT_UCP
2405 /*************************************************
2406 * Get othercase range *
2407 *************************************************/
2408 
2409 /* This function is passed the start and end of a class range, in UTF-8 mode
2410 with UCP support. It searches up the characters, looking for internal ranges of
2411 characters in the "other" case. Each call returns the next one, updating the
2412 start address.
2413 
2414 Arguments:
2415  cptr points to starting character value; updated
2416  d end value
2417  ocptr where to put start of othercase range
2418  odptr where to put end of othercase range
2419 
2420 Yield: TRUE when range returned; FALSE when no more
2421 */
2422 
2423 static BOOL
2424 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2425  unsigned int *odptr)
2426 {
2427 unsigned int c, othercase, next;
2428 
2429 for (c = *cptr; c <= d; c++)
2430  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2431 
2432 if (c > d) return FALSE;
2433 
2434 *ocptr = othercase;
2435 next = othercase + 1;
2436 
2437 for (++c; c <= d; c++)
2438  {
2439  if (UCD_OTHERCASE(c) != next) break;
2440  next++;
2441  }
2442 
2443 *odptr = next - 1;
2444 *cptr = c;
2445 
2446 return TRUE;
2447 }
2448 
2449 
2450 
2451 /*************************************************
2452 * Check a character and a property *
2453 *************************************************/
2454 
2455 /* This function is called by check_auto_possessive() when a property item
2456 is adjacent to a fixed character.
2457 
2458 Arguments:
2459  c the character
2460  ptype the property type
2461  pdata the data for the type
2462  negated TRUE if it's a negated property (\P or \p{^)
2463 
2464 Returns: TRUE if auto-possessifying is OK
2465 */
2466 
2467 static BOOL
2468 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2469 {
2470 const ucd_record *prop = GET_UCD(c);
2471 switch(ptype)
2472  {
2473  case PT_LAMP:
2474  return (prop->chartype == ucp_Lu ||
2475  prop->chartype == ucp_Ll ||
2476  prop->chartype == ucp_Lt) == negated;
2477 
2478  case PT_GC:
2479  return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2480 
2481  case PT_PC:
2482  return (pdata == prop->chartype) == negated;
2483 
2484  case PT_SC:
2485  return (pdata == prop->script) == negated;
2486 
2487  /* These are specials */
2488 
2489  case PT_ALNUM:
2490  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2491  _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2492 
2493  case PT_SPACE: /* Perl space */
2494  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2495  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2496  == negated;
2497 
2498  case PT_PXSPACE: /* POSIX space */
2499  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2500  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2501  c == CHAR_FF || c == CHAR_CR)
2502  == negated;
2503 
2504  case PT_WORD:
2505  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2506  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2507  c == CHAR_UNDERSCORE) == negated;
2508  }
2509 return FALSE;
2510 }
2511 #endif /* SUPPORT_UCP */
2512 
2513 
2514 
2515 /*************************************************
2516 * Check if auto-possessifying is possible *
2517 *************************************************/
2518 
2519 /* This function is called for unlimited repeats of certain items, to see
2520 whether the next thing could possibly match the repeated item. If not, it makes
2521 sense to automatically possessify the repeated item.
2522 
2523 Arguments:
2524  previous pointer to the repeated opcode
2525  utf8 TRUE in UTF-8 mode
2526  ptr next character in pattern
2527  options options bits
2528  cd contains pointers to tables etc.
2529 
2530 Returns: TRUE if possessifying is wanted
2531 */
2532 
2533 static BOOL
2534 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2535  int options, compile_data *cd)
2536 {
/* c receives the character operand of the repeated item when it has one;
"next" receives the value of the following pattern item. op_code is the
repeated opcode; after this initialization "previous" points at the byte that
follows the opcode, which is where the code below reads the opcode's data. */
2537 int c, next;
2538 int op_code = *previous++;
2539 
2540 /* Skip whitespace and comments in extended mode */
2541 
2542 if ((options & PCRE_EXTENDED) != 0)
2543  {
2544  for (;;)
2545  {
2546  while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2547  if (*ptr == CHAR_NUMBER_SIGN)
2548  {
 /* A '#' comment runs to the next newline (which is skipped as well) or
 to the terminating zero of the pattern. */
2549  ptr++;
2550  while (*ptr != 0)
2551  {
2552  if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2553  ptr++;
2554 #ifdef SUPPORT_UTF8
 /* Step over UTF-8 continuation bytes (10xxxxxx). */
2555  if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2556 #endif
2557  }
2558  }
2559  else break;
2560  }
2561  }
2562 
2563 /* If the next item is one that we can handle, get its value. A non-negative
2564 value is a character, a negative value is an escape value. */
2565 
2566 if (*ptr == CHAR_BACKSLASH)
2567  {
2568  int temperrorcode = 0;
2569  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2570  if (temperrorcode != 0) return FALSE;
2571  ptr++; /* Point after the escape sequence */
2572  }
2573 
2574 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2575  {
2576 #ifdef SUPPORT_UTF8
2577  if (utf8) { GETCHARINC(next, ptr); } else
2578 #endif
2579  next = *ptr++;
2580  }
2581 
/* Anything else is a metacharacter that is not handled here. */
2582 else return FALSE;
2583 
2584 /* Skip whitespace and comments in extended mode */
2585 
2586 if ((options & PCRE_EXTENDED) != 0)
2587  {
2588  for (;;)
2589  {
2590  while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2591  if (*ptr == CHAR_NUMBER_SIGN)
2592  {
2593  ptr++;
2594  while (*ptr != 0)
2595  {
2596  if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2597  ptr++;
2598 #ifdef SUPPORT_UTF8
2599  if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2600 #endif
2601  }
2602  }
2603  else break;
2604  }
2605  }
2606 
2607 /* If the next thing is itself optional, we have to give up. */
2608 
/* "Optional" here means followed by '*', '?', or a quantifier that starts
with the three characters "{0,". */
2609 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2610  strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2611  return FALSE;
2612 
2613 /* Now compare the next item with the previous opcode. First, handle cases when
2614 the next item is a character. */
2615 
/* Every case of this switch, including the default, returns; the code after
it is therefore reached only when next is negative. */
2616 if (next >= 0) switch(op_code)
2617  {
2618  case OP_CHAR:
2619 #ifdef SUPPORT_UTF8
2620  GETCHARTEST(c, previous);
2621 #else
2622  c = *previous;
2623 #endif
2624  return c != next;
2625 
2626  /* For CHARNC (caseless character) we must check the other case. If we have
2627  Unicode property support, we can use it to test the other case of
2628  high-valued characters. */
2629 
2630  case OP_CHARNC:
2631 #ifdef SUPPORT_UTF8
2632  GETCHARTEST(c, previous);
2633 #else
2634  c = *previous;
2635 #endif
2636  if (c == next) return FALSE;
2637 #ifdef SUPPORT_UTF8
2638  if (utf8)
2639  {
2640  unsigned int othercase;
 /* Below 128 the other case comes from the cd->fcc table; above that it
 needs UCP support, and without it NOTACHAR is used as the other case. */
2641  if (next < 128) othercase = cd->fcc[next]; else
2642 #ifdef SUPPORT_UCP
2643  othercase = UCD_OTHERCASE((unsigned int)next);
2644 #else
2645  othercase = NOTACHAR;
2646 #endif
2647  return (unsigned int)c != othercase;
2648  }
2649  else
2650 #endif /* SUPPORT_UTF8 */
2651  return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2652 
2653  /* For OP_NOT, its data is always a single-byte character. */
2654 
2655  case OP_NOT:
 /* The sense is the reverse of OP_CHAR: TRUE when the next character is
 the excluded one (or, caselessly, its other case). */
2656  if ((c = *previous) == next) return TRUE;
2657  if ((options & PCRE_CASELESS) == 0) return FALSE;
2658 #ifdef SUPPORT_UTF8
2659  if (utf8)
2660  {
2661  unsigned int othercase;
2662  if (next < 128) othercase = cd->fcc[next]; else
2663 #ifdef SUPPORT_UCP
2664  othercase = UCD_OTHERCASE(next);
2665 #else
2666  othercase = NOTACHAR;
2667 #endif
2668  return (unsigned int)c == othercase;
2669  }
2670  else
2671 #endif /* SUPPORT_UTF8 */
2672  return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2673 
2674  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2675  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2676 
 /* In the six cases below cd->ctypes is consulted only for values up to
 127; a higher value is treated as not having the digit/space/word type. */
2677  case OP_DIGIT:
2678  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2679 
2680  case OP_NOT_DIGIT:
2681  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2682 
2683  case OP_WHITESPACE:
2684  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2685 
2686  case OP_NOT_WHITESPACE:
2687  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2688 
2689  case OP_WORDCHAR:
2690  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2691 
2692  case OP_NOT_WORDCHAR:
2693  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2694 
2695  case OP_HSPACE:
2696  case OP_NOT_HSPACE:
 /* The listed code points are the ones treated as horizontal space. */
2697  switch(next)
2698  {
2699  case 0x09:
2700  case 0x20:
2701  case 0xa0:
2702  case 0x1680:
2703  case 0x180e:
2704  case 0x2000:
2705  case 0x2001:
2706  case 0x2002:
2707  case 0x2003:
2708  case 0x2004:
2709  case 0x2005:
2710  case 0x2006:
2711  case 0x2007:
2712  case 0x2008:
2713  case 0x2009:
2714  case 0x200A:
2715  case 0x202f:
2716  case 0x205f:
2717  case 0x3000:
2718  return op_code == OP_NOT_HSPACE;
2719  default:
2720  return op_code != OP_NOT_HSPACE;
2721  }
2722 
 /* OP_ANYNL shares the vertical space list with OP_VSPACE. */
2723  case OP_ANYNL:
2724  case OP_VSPACE:
2725  case OP_NOT_VSPACE:
2726  switch(next)
2727  {
2728  case 0x0a:
2729  case 0x0b:
2730  case 0x0c:
2731  case 0x0d:
2732  case 0x85:
2733  case 0x2028:
2734  case 0x2029:
2735  return op_code == OP_NOT_VSPACE;
2736  default:
2737  return op_code != OP_NOT_VSPACE;
2738  }
2739 
2740 #ifdef SUPPORT_UCP
 /* previous[0] and previous[1] are the two data bytes after the opcode;
 they are passed on as the property type and its data. */
2741  case OP_PROP:
2742  return check_char_prop(next, previous[0], previous[1], FALSE);
2743 
2744  case OP_NOTPROP:
2745  return check_char_prop(next, previous[0], previous[1], TRUE);
2746 #endif
2747 
2748  default:
2749  return FALSE;
2750  }
2751 
2752 
2753 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2754 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2755 generated only when PCRE_UCP is *not* set, that is, when only ASCII
2756 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2757 replaced by OP_PROP codes when PCRE_UCP is set. */
2758 
2759 switch(op_code)
2760  {
2761  case OP_CHAR:
2762  case OP_CHARNC:
2763 #ifdef SUPPORT_UTF8
2764  GETCHARTEST(c, previous);
2765 #else
2766  c = *previous;
2767 #endif
 /* next is negative here, so -next is the escape value to switch on. */
2768  switch(-next)
2769  {
2770  case ESC_d:
2771  return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2772 
2773  case ESC_D:
2774  return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2775 
2776  case ESC_s:
2777  return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2778 
2779  case ESC_S:
2780  return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2781 
2782  case ESC_w:
2783  return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2784 
2785  case ESC_W:
2786  return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2787 
2788  case ESC_h:
2789  case ESC_H:
2790  switch(c)
2791  {
2792  case 0x09:
2793  case 0x20:
2794  case 0xa0:
2795  case 0x1680:
2796  case 0x180e:
2797  case 0x2000:
2798  case 0x2001:
2799  case 0x2002:
2800  case 0x2003:
2801  case 0x2004:
2802  case 0x2005:
2803  case 0x2006:
2804  case 0x2007:
2805  case 0x2008:
2806  case 0x2009:
2807  case 0x200A:
2808  case 0x202f:
2809  case 0x205f:
2810  case 0x3000:
2811  return -next != ESC_h;
2812  default:
2813  return -next == ESC_h;
2814  }
2815 
2816  case ESC_v:
2817  case ESC_V:
2818  switch(c)
2819  {
2820  case 0x0a:
2821  case 0x0b:
2822  case 0x0c:
2823  case 0x0d:
2824  case 0x85:
2825  case 0x2028:
2826  case 0x2029:
2827  return -next != ESC_v;
2828  default:
2829  return -next == ESC_v;
2830  }
2831 
2832  /* When PCRE_UCP is set, these values get generated for \d etc. Find
2833  their substitutions and process them. The result will always be either
2834  -ESC_p or -ESC_P. Then fall through to process those values. */
2835 
2836 #ifdef SUPPORT_UCP
2837  case ESC_du:
2838  case ESC_DU:
2839  case ESC_wu:
2840  case ESC_WU:
2841  case ESC_su:
2842  case ESC_SU:
2843  {
2844  int temperrorcode = 0;
 /* substitutes[] is indexed relative to ESC_DU. From here on ptr refers
 to the substitute text, no longer to the pattern being compiled. */
2845  ptr = substitutes[-next - ESC_DU];
2846  next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2847  if (temperrorcode != 0) return FALSE;
2848  ptr++; /* For compatibility */
2849  }
2850  /* Fall through */
2851 
2852  case ESC_p:
2853  case ESC_P:
2854  {
2855  int ptype, pdata, errorcodeptr;
2856  BOOL negated;
2857 
2858  ptr--; /* Make ptr point at the p or P */
2859  ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2860  if (ptype < 0) return FALSE;
2861  ptr++; /* Point past the final curly ket */
2862 
2863  /* If the property item is optional, we have to give up. (When generated
2864  from \d etc by PCRE_UCP, this test will have been applied much earlier,
2865  to the original \d etc. At this point, ptr will point to a zero byte.) */
2866 
2867  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2868  strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2869  return FALSE;
2870 
2871  /* Do the property check. */
2872 
 /* The negation passed on is \P combined with the "negated" flag from
 get_ucp(): when both are set they cancel out. */
2873  return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2874  }
2875 #endif
2876 
2877  default:
2878  return FALSE;
2879  }
2880 
2881  /* In principle, support for Unicode properties should be integrated here as
2882  well. It means re-organizing the above code so as to get hold of the property
2883  values before switching on the op-code. However, I wonder how many patterns
2884  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2885  these op-codes are never generated.) */
2886 
 /* In each remaining case the repeated item is a character type and the next
 item is an escape; TRUE is returned only for the escapes listed. */
2887  case OP_DIGIT:
2888  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2889  next == -ESC_h || next == -ESC_v || next == -ESC_R;
2890 
2891  case OP_NOT_DIGIT:
2892  return next == -ESC_d;
2893 
2894  case OP_WHITESPACE:
2895  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2896 
2897  case OP_NOT_WHITESPACE:
2898  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2899 
2900  case OP_HSPACE:
2901  return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2902  next == -ESC_w || next == -ESC_v || next == -ESC_R;
2903 
2904  case OP_NOT_HSPACE:
2905  return next == -ESC_h;
2906 
2907  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2908  case OP_ANYNL:
2909  case OP_VSPACE:
2910  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2911 
2912  case OP_NOT_VSPACE:
2913  return next == -ESC_v || next == -ESC_R;
2914 
2915  case OP_WORDCHAR:
2916  return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2917  next == -ESC_v || next == -ESC_R;
2918 
2919  case OP_NOT_WORDCHAR:
2920  return next == -ESC_w || next == -ESC_d;
2921 
2922  default:
2923  return FALSE;
2924  }
2925 
2926 /* Control does not reach here */
2927 }
2928 
2929 
2930 
2931 /*************************************************
2932 * Compile one branch *
2933 *************************************************/
2934 
2935 /* Scan the pattern, compiling it into a vector. If the options are
2936 changed during the branch, the pointer is used to change the external options
2937 bits. This function is used during the pre-compile phase when we are trying
2938 to find out the amount of memory needed, as well as during the real compile
2939 phase. The value of lengthptr distinguishes the two phases.
2940 
2941 Arguments:
2942  optionsptr pointer to the option bits
2943  codeptr points to the pointer to the current code point
2944  ptrptr points to the current pattern pointer
2945  errorcodeptr points to error code variable
2946  firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2947  reqbyteptr set to the last literal character required, else < 0
2948  bcptr points to current branch chain
2949  cd contains pointers to tables etc.
2950  lengthptr NULL during the real compile phase
2951  points to length accumulator during pre-compile phase
2952 
2953 Returns: TRUE on success
2954  FALSE, with *errorcodeptr set non-zero on error
2955 */
2956 
2957 static BOOL
2958 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2959  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2960  compile_data *cd, int *lengthptr)
2961 {
2962 int repeat_type, op_type;
2963 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2964 int bravalue = 0;
2965 int greedy_default, greedy_non_default;
2966 int firstbyte, reqbyte;
2967 int zeroreqbyte, zerofirstbyte;
2968 int req_caseopt, reqvary, tempreqvary;
2969 int options = *optionsptr;
2970 int after_manual_callout = 0;
2971 int length_prevgroup = 0;
2972 register int c;
2973 register uschar *code = *codeptr;
2974 uschar *last_code = code;
2975 uschar *orig_code = code;
2976 uschar *tempcode;
2977 BOOL inescq = FALSE;
2978 BOOL groupsetfirstbyte = FALSE;
2979 const uschar *ptr = *ptrptr;
2980 const uschar *tempptr;
2981 const uschar *nestptr = NULL;
2982 uschar *previous = NULL;
2983 uschar *previous_callout = NULL;
2984 uschar *save_hwm = NULL;
2985 uschar classbits[32];
2986 
2987 #ifdef SUPPORT_UTF8
2988 BOOL class_utf8;
2989 BOOL utf8 = (options & PCRE_UTF8) != 0;
2990 uschar *class_utf8data;
2991 uschar *class_utf8data_base;
2992 uschar utf8_char[6];
2993 #else
2994 BOOL utf8 = FALSE;
2995 uschar *utf8_char = NULL;
2996 #endif
2997 
2998 #ifdef PCRE_DEBUG
2999 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3000 #endif
3001 
3002 /* Set up the default and non-default settings for greediness */
3003 
3004 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3005 greedy_non_default = greedy_default ^ 1;
3006 
3007 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3008 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3009 matches a non-fixed char first char; reqbyte just remains unset if we never
3010 find one.
3011 
3012 When we hit a repeat whose minimum is zero, we may have to adjust these values
3013 to take the zero repeat into account. This is implemented by setting them to
3014 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3015 item types that can be repeated set these backoff variables appropriately. */
3016 
3017 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3018 
3019 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3020 according to the current setting of the caseless flag. REQ_CASELESS is a bit
3021 value > 255. It is added into the firstbyte or reqbyte variables to record the
3022 case status of the value. This is used only for ASCII characters. */
3023 
3024 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3025 
3026 /* Switch on next character until the end of the branch */
3027 
3028 for (;; ptr++)
3029  {
3030  BOOL negate_class;
3031  BOOL should_flip_negation;
3032  BOOL possessive_quantifier;
3033  BOOL is_quantifier;
3034  BOOL is_recurse;
3035  BOOL reset_bracount;
3036  int class_charcount;
3037  int class_lastchar;
3038  int newoptions;
3039  int recno;
3040  int refsign;
3041  int skipbytes;
3042  int subreqbyte;
3043  int subfirstbyte;
3044  int terminator;
3045  int mclength;
3046  uschar mcbuffer[8];
3047 
3048  /* Get next byte in the pattern */
3049 
3050  c = *ptr;
3051 
3052  /* If we are at the end of a nested substitution, revert to the outer level
3053  string. Nesting only happens one level deep. */
3054 
3055  if (c == 0 && nestptr != NULL)
3056  {
3057  ptr = nestptr;
3058  nestptr = NULL;
3059  c = *ptr;
3060  }
3061 
3062  /* If we are in the pre-compile phase, accumulate the length used for the
3063  previous cycle of this loop. */
3064 
3065  if (lengthptr != NULL)
3066  {
3067 #ifdef PCRE_DEBUG
3068  if (code > cd->hwm) cd->hwm = code; /* High water info */
3069 #endif
3070  if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3071  {
3072  *errorcodeptr = ERR52;
3073  goto FAILED;
3074  }
3075 
3076  /* There is at least one situation where code goes backwards: this is the
3077  case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3078  the class is simply eliminated. However, it is created first, so we have to
3079  allow memory for it. Therefore, don't ever reduce the length at this point.
3080  */
3081 
3082  if (code < last_code) code = last_code;
3083 
3084  /* Paranoid check for integer overflow */
3085 
3086  if (OFLOW_MAX - *lengthptr < code - last_code)
3087  {
3088  *errorcodeptr = ERR20;
3089  goto FAILED;
3090  }
3091 
3092  *lengthptr += (int)(code - last_code);
3093  DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3094 
3095  /* If "previous" is set and it is not at the start of the work space, move
3096  it back to there, in order to avoid filling up the work space. Otherwise,
3097  if "previous" is NULL, reset the current code pointer to the start. */
3098 
3099  if (previous != NULL)
3100  {
3101  if (previous > orig_code)
3102  {
3103  memmove(orig_code, previous, code - previous);
3104  code -= previous - orig_code;
3105  previous = orig_code;
3106  }
3107  }
3108  else code = orig_code;
3109 
3110  /* Remember where this code item starts so we can pick up the length
3111  next time round. */
3112 
3113  last_code = code;
3114  }
3115 
3116  /* In the real compile phase, just check the workspace used by the forward
3117  reference list. */
3118 
3119  else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3120  {
3121  *errorcodeptr = ERR52;
3122  goto FAILED;
3123  }
3124 
3125  /* If in \Q...\E, check for the end; if not, we have a literal */
3126 
3127  if (inescq && c != 0)
3128  {
3129  if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3130  {
3131  inescq = FALSE;
3132  ptr++;
3133  continue;
3134  }
3135  else
3136  {
3137  if (previous_callout != NULL)
3138  {
3139  if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3140  complete_callout(previous_callout, ptr, cd);
3141  previous_callout = NULL;
3142  }
3143  if ((options & PCRE_AUTO_CALLOUT) != 0)
3144  {
3145  previous_callout = code;
3146  code = auto_callout(code, ptr, cd);
3147  }
3148  goto NORMAL_CHAR;
3149  }
3150  }
3151 
3152  /* Fill in length of a previous callout, except when the next thing is
3153  a quantifier. */
3154 
3155  is_quantifier =
3156  c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3157  (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3158 
3159  if (!is_quantifier && previous_callout != NULL &&
3160  after_manual_callout-- <= 0)
3161  {
3162  if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3163  complete_callout(previous_callout, ptr, cd);
3164  previous_callout = NULL;
3165  }
3166 
3167  /* In extended mode, skip white space and comments */
3168 
3169  if ((options & PCRE_EXTENDED) != 0)
3170  {
3171  if ((cd->ctypes[c] & ctype_space) != 0) continue;
3172  if (c == CHAR_NUMBER_SIGN)
3173  {
3174  ptr++;
3175  while (*ptr != 0)
3176  {
3177  if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3178  ptr++;
3179 #ifdef SUPPORT_UTF8
3180  if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3181 #endif
3182  }
3183  if (*ptr != 0) continue;
3184 
3185  /* Else fall through to handle end of string */
3186  c = 0;
3187  }
3188  }
3189 
3190  /* No auto callout for quantifiers. */
3191 
3192  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3193  {
3194  previous_callout = code;
3195  code = auto_callout(code, ptr, cd);
3196  }
3197 
3198  switch(c)
3199  {
3200  /* ===================================================================*/
3201  case 0: /* The branch terminates at string end */
3202  case CHAR_VERTICAL_LINE: /* or | or ) */
3204  *firstbyteptr = firstbyte;
3205  *reqbyteptr = reqbyte;
3206  *codeptr = code;
3207  *ptrptr = ptr;
3208  if (lengthptr != NULL)
3209  {
3210  if (OFLOW_MAX - *lengthptr < code - last_code)
3211  {
3212  *errorcodeptr = ERR20;
3213  goto FAILED;
3214  }
3215  *lengthptr += (int)(code - last_code); /* To include callout length */
3216  DPRINTF((">> end branch\n"));
3217  }
3218  return TRUE;
3219 
3220 
3221  /* ===================================================================*/
3222  /* Handle single-character metacharacters. In multiline mode, ^ disables
3223  the setting of any following char as a first character. */
3224 
3226  if ((options & PCRE_MULTILINE) != 0)
3227  {
3228  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3229  }
3230  previous = NULL;
3231  *code++ = OP_CIRC;
3232  break;
3233 
3234  case CHAR_DOLLAR_SIGN:
3235  previous = NULL;
3236  *code++ = OP_DOLL;
3237  break;
3238 
3239  /* There can never be a first char if '.' is first, whatever happens about
3240  repeats. The value of reqbyte doesn't change either. */
3241 
3242  case CHAR_DOT:
3243  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3244  zerofirstbyte = firstbyte;
3245  zeroreqbyte = reqbyte;
3246  previous = code;
3247  *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3248  break;
3249 
3250 
3251  /* ===================================================================*/
3252  /* Character classes. If the included characters are all < 256, we build a
3253  32-byte bitmap of the permitted characters, except in the special case
3254  where there is only one such character. For negated classes, we build the
3255  map as usual, then invert it at the end. However, we use a different opcode
3256  so that data characters > 255 can be handled correctly.
3257 
3258  If the class contains characters outside the 0-255 range, a different
3259  opcode is compiled. It may optionally have a bit map for characters < 256,
3260  but those above are explicitly listed afterwards. A flag byte tells
3261  whether the bitmap is present, and whether this is a negated class or not.
3262 
3263  In JavaScript compatibility mode, an isolated ']' causes an error. In
3264  default (Perl) mode, it is treated as a data character. */
3265 
3267  if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3268  {
3269  *errorcodeptr = ERR64;
3270  goto FAILED;
3271  }
3272  goto NORMAL_CHAR;
3273 
3275  previous = code;
3276 
3277  /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3278  they are encountered at the top level, so we'll do that too. */
3279 
3280  if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3281  ptr[1] == CHAR_EQUALS_SIGN) &&
3282  check_posix_syntax(ptr, &tempptr))
3283  {
3284  *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3285  goto FAILED;
3286  }
3287 
3288  /* If the first character is '^', set the negation flag and skip it. Also,
3289  if the first few characters (either before or after ^) are \Q\E or \E we
3290  skip them too. This makes for compatibility with Perl. */
3291 
3292  negate_class = FALSE;
3293  for (;;)
3294  {
3295  c = *(++ptr);
3296  if (c == CHAR_BACKSLASH)
3297  {
3298  if (ptr[1] == CHAR_E)
3299  ptr++;
3300  else if (strncmp((const char *)ptr+1,
3301  STR_Q STR_BACKSLASH STR_E, 3) == 0)
3302  ptr += 3;
3303  else
3304  break;
3305  }
3306  else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3307  negate_class = TRUE;
3308  else break;
3309  }
3310 
3311  /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3312  an initial ']' is taken as a data character -- the code below handles
3313  that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3314  [^] must match any character, so generate OP_ALLANY. */
3315 
3316  if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3318  {
3319  *code++ = negate_class? OP_ALLANY : OP_FAIL;
3320  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3321  zerofirstbyte = firstbyte;
3322  break;
3323  }
3324 
3325  /* If a class contains a negative special such as \S, we need to flip the
3326  negation flag at the end, so that support for characters > 255 works
3327  correctly (they are all included in the class). */
3328 
3329  should_flip_negation = FALSE;
3330 
3331  /* Keep a count of chars with values < 256 so that we can optimize the case
3332  of just a single character (as long as it's < 256). However, for higher
3333  valued UTF-8 characters, we don't yet do any optimization. */
3334 
3335  class_charcount = 0;
3336  class_lastchar = -1;
3337 
3338  /* Initialize the 32-char bit map to all zeros. We build the map in a
3339  temporary bit of memory, in case the class contains only 1 character (less
3340  than 256), because in that case the compiled code doesn't use the bit map.
3341  */
3342 
3343  memset(classbits, 0, 32 * sizeof(uschar));
3344 
3345 #ifdef SUPPORT_UTF8
3346  class_utf8 = FALSE; /* No chars >= 256 */
3347  class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3348  class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3349 #endif
3350 
3351  /* Process characters until ] is reached. By writing this as a "do" it
3352  means that an initial ] is taken as a data character. At the start of the
3353  loop, c contains the first byte of the character. */
3354 
3355  if (c != 0) do
3356  {
3357  const uschar *oldptr;
3358 
3359 #ifdef SUPPORT_UTF8
3360  if (utf8 && c > 127)
3361  { /* Braces are required because the */
3362  GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3363  }
3364 
3365  /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3366  data and reset the pointer. This is so that very large classes that
3367  contain a zillion UTF-8 characters no longer overwrite the work space
3368  (which is on the stack). */
3369 
3370  if (lengthptr != NULL)
3371  {
3372  *lengthptr += class_utf8data - class_utf8data_base;
3373  class_utf8data = class_utf8data_base;
3374  }
3375 
3376 #endif
3377 
3378  /* Inside \Q...\E everything is literal except \E */
3379 
3380  if (inescq)
3381  {
3382  if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3383  {
3384  inescq = FALSE; /* Reset literal state */
3385  ptr++; /* Skip the 'E' */
3386  continue; /* Carry on with next */
3387  }
3388  goto CHECK_RANGE; /* Could be range if \E follows */
3389  }
3390 
3391  /* Handle POSIX class names. Perl allows a negation extension of the
3392  form [:^name:]. A square bracket that doesn't match the syntax is
3393  treated as a literal. We also recognize the POSIX constructions
3394  [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3395  5.6 and 5.8 do. */
3396 
3397  if (c == CHAR_LEFT_SQUARE_BRACKET &&
3398  (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3399  ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3400  {
3401  BOOL local_negate = FALSE;
3402  int posix_class, taboffset, tabopt;
3403  register const uschar *cbits = cd->cbits;
3404  uschar pbits[32];
3405 
3406  if (ptr[1] != CHAR_COLON)
3407  {
3408  *errorcodeptr = ERR31;
3409  goto FAILED;
3410  }
3411 
3412  ptr += 2;
3413  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3414  {
3415  local_negate = TRUE;
3416  should_flip_negation = TRUE; /* Note negative special */
3417  ptr++;
3418  }
3419 
3420  posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3421  if (posix_class < 0)
3422  {
3423  *errorcodeptr = ERR30;
3424  goto FAILED;
3425  }
3426 
3427  /* If matching is caseless, upper and lower are converted to
3428  alpha. This relies on the fact that the class table starts with
3429  alpha, lower, upper as the first 3 entries. */
3430 
3431  if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3432  posix_class = 0;
3433 
3434  /* When PCRE_UCP is set, some of the POSIX classes are converted to
3435  different escape sequences that use Unicode properties. */
3436 
3437 #ifdef SUPPORT_UCP
3438  if ((options & PCRE_UCP) != 0)
3439  {
3440  int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3441  if (posix_substitutes[pc] != NULL)
3442  {
3443  nestptr = tempptr + 1;
3444  ptr = posix_substitutes[pc] - 1;
3445  continue;
3446  }
3447  }
3448 #endif
3449  /* In the non-UCP case, we build the bit map for the POSIX class in a
3450  chunk of local store because we may be adding and subtracting from it,
3451  and we don't want to subtract bits that may be in the main map already.
3452  At the end we or the result into the bit map that is being built. */
3453 
3454  posix_class *= 3;
3455 
3456  /* Copy in the first table (always present) */
3457 
3458  memcpy(pbits, cbits + posix_class_maps[posix_class],
3459  32 * sizeof(uschar));
3460 
3461  /* If there is a second table, add or remove it as required. */
3462 
3463  taboffset = posix_class_maps[posix_class + 1];
3464  tabopt = posix_class_maps[posix_class + 2];
3465 
3466  if (taboffset >= 0)
3467  {
3468  if (tabopt >= 0)
3469  for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3470  else
3471  for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3472  }
3473 
3474  /* Now see if we need to remove any special characters. An option
3475  value of 1 removes vertical space and 2 removes underscore. */
3476 
3477  if (tabopt < 0) tabopt = -tabopt;
3478  if (tabopt == 1) pbits[1] &= ~0x3c;
3479  else if (tabopt == 2) pbits[11] &= 0x7f;
3480 
3481  /* Add the POSIX table or its complement into the main table that is
3482  being built and we are done. */
3483 
3484  if (local_negate)
3485  for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3486  else
3487  for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3488 
3489  ptr = tempptr + 1;
3490  class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3491  continue; /* End of POSIX syntax handling */
3492  }
3493 
3494  /* Backslash may introduce a single character, or it may introduce one
3495  of the specials, which just set a flag. The sequence \b is a special
3496  case. Inside a class (and only there) it is treated as backspace. We
3497  assume that other escapes have more than one character in them, so set
3498  class_charcount bigger than one. Unrecognized escapes fall through and
3499  are either treated as literal characters (by default), or are faulted if
3500  PCRE_EXTRA is set. */
3501 
3502  if (c == CHAR_BACKSLASH)
3503  {
3504  c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3505  if (*errorcodeptr != 0) goto FAILED;
3506 
3507  if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3508  else if (-c == ESC_Q) /* Handle start of quoted string */
3509  {
3510  if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3511  {
3512  ptr += 2; /* avoid empty string */
3513  }
3514  else inescq = TRUE;
3515  continue;
3516  }
3517  else if (-c == ESC_E) continue; /* Ignore orphan \E */
3518 
3519  if (c < 0)
3520  {
3521  register const uschar *cbits = cd->cbits;
3522  class_charcount += 2; /* Greater than 1 is what matters */
3523 
3524  switch (-c)
3525  {
3526 #ifdef SUPPORT_UCP
3527  case ESC_du: /* These are the values given for \d etc */
3528  case ESC_DU: /* when PCRE_UCP is set. We replace the */
3529  case ESC_wu: /* escape sequence with an appropriate \p */
3530  case ESC_WU: /* or \P to test Unicode properties instead */
3531  case ESC_su: /* of the default ASCII testing. */
3532  case ESC_SU:
3533  nestptr = ptr;
3534  ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3535  class_charcount -= 2; /* Undo! */
3536  continue;
3537 #endif
3538  case ESC_d:
3539  for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3540  continue;
3541 
3542  case ESC_D:
3543  should_flip_negation = TRUE;
3544  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3545  continue;
3546 
3547  case ESC_w:
3548  for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3549  continue;
3550 
3551  case ESC_W:
3552  should_flip_negation = TRUE;
3553  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3554  continue;
3555 
3556  /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3557  if it was previously set by something earlier in the character
3558  class. */
3559 
3560  case ESC_s:
3561  classbits[0] |= cbits[cbit_space];
3562  classbits[1] |= cbits[cbit_space+1] & ~0x08;
3563  for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3564  continue;
3565 
3566  case ESC_S:
3567  should_flip_negation = TRUE;
3568  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3569  classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3570  continue;
3571 
3572  case ESC_h:
3573  SETBIT(classbits, 0x09); /* HT */
3574  SETBIT(classbits, 0x20); /* SPACE */
3575  SETBIT(classbits, 0xa0); /* NBSP */
3576 #ifdef SUPPORT_UTF8
3577  if (utf8)
3578  {
3579  class_utf8 = TRUE;
3580  *class_utf8data++ = XCL_SINGLE;
3581  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3582  *class_utf8data++ = XCL_SINGLE;
3583  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3584  *class_utf8data++ = XCL_RANGE;
3585  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3586  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3587  *class_utf8data++ = XCL_SINGLE;
3588  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3589  *class_utf8data++ = XCL_SINGLE;
3590  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3591  *class_utf8data++ = XCL_SINGLE;
3592  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3593  }
3594 #endif
3595  continue;
3596 
3597  case ESC_H:
3598  for (c = 0; c < 32; c++)
3599  {
3600  int x = 0xff;
3601  switch (c)
3602  {
3603  case 0x09/8: x ^= 1 << (0x09%8); break;
3604  case 0x20/8: x ^= 1 << (0x20%8); break;
3605  case 0xa0/8: x ^= 1 << (0xa0%8); break;
3606  default: break;
3607  }
3608  classbits[c] |= x;
3609  }
3610 
3611 #ifdef SUPPORT_UTF8
3612  if (utf8)
3613  {
3614  class_utf8 = TRUE;
3615  *class_utf8data++ = XCL_RANGE;
3616  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3617  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3618  *class_utf8data++ = XCL_RANGE;
3619  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3620  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3621  *class_utf8data++ = XCL_RANGE;
3622  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3623  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3624  *class_utf8data++ = XCL_RANGE;
3625  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3626  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3627  *class_utf8data++ = XCL_RANGE;
3628  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3629  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3630  *class_utf8data++ = XCL_RANGE;
3631  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3632  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3633  *class_utf8data++ = XCL_RANGE;
3634  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3635  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3636  }
3637 #endif
3638  continue;
3639 
3640  case ESC_v:
3641  SETBIT(classbits, 0x0a); /* LF */
3642  SETBIT(classbits, 0x0b); /* VT */
3643  SETBIT(classbits, 0x0c); /* FF */
3644  SETBIT(classbits, 0x0d); /* CR */
3645  SETBIT(classbits, 0x85); /* NEL */
3646 #ifdef SUPPORT_UTF8
3647  if (utf8)
3648  {
3649  class_utf8 = TRUE;
3650  *class_utf8data++ = XCL_RANGE;
3651  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3652  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3653  }
3654 #endif
3655  continue;
3656 
3657  case ESC_V:
3658  for (c = 0; c < 32; c++)
3659  {
3660  int x = 0xff;
3661  switch (c)
3662  {
3663  case 0x0a/8: x ^= 1 << (0x0a%8);
3664  x ^= 1 << (0x0b%8);
3665  x ^= 1 << (0x0c%8);
3666  x ^= 1 << (0x0d%8);
3667  break;
3668  case 0x85/8: x ^= 1 << (0x85%8); break;
3669  default: break;
3670  }
3671  classbits[c] |= x;
3672  }
3673 
3674 #ifdef SUPPORT_UTF8
3675  if (utf8)
3676  {
3677  class_utf8 = TRUE;
3678  *class_utf8data++ = XCL_RANGE;
3679  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3680  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3681  *class_utf8data++ = XCL_RANGE;
3682  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3683  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3684  }
3685 #endif
3686  continue;
3687 
3688 #ifdef SUPPORT_UCP
3689  case ESC_p:
3690  case ESC_P:
3691  {
3692  BOOL negated;
3693  int pdata;
3694  int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3695  if (ptype < 0) goto FAILED;
3696  class_utf8 = TRUE;
3697  *class_utf8data++ = ((-c == ESC_p) != negated)?
3699  *class_utf8data++ = ptype;
3700  *class_utf8data++ = pdata;
3701  class_charcount -= 2; /* Not a < 256 character */
3702  continue;
3703  }
3704 #endif
3705  /* Unrecognized escapes are faulted if PCRE is running in its
3706  strict mode. By default, for compatibility with Perl, they are
3707  treated as literals. */
3708 
3709  default:
3710  if ((options & PCRE_EXTRA) != 0)
3711  {
3712  *errorcodeptr = ERR7;
3713  goto FAILED;
3714  }
3715  class_charcount -= 2; /* Undo the default count from above */
3716  c = *ptr; /* Get the final character and fall through */
3717  break;
3718  }
3719  }
3720 
3721  /* Fall through if we have a single character (c >= 0). This may be
3722  greater than 256 in UTF-8 mode. */
3723 
3724  } /* End of backslash handling */
3725 
3726  /* A single character may be followed by '-' to form a range. However,
3727  Perl does not permit ']' to be the end of the range. A '-' character
3728  at the end is treated as a literal. Perl ignores orphaned \E sequences
3729  entirely. The code for handling \Q and \E is messy. */
3730 
3731  CHECK_RANGE:
3732  while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3733  {
3734  inescq = FALSE;
3735  ptr += 2;
3736  }
3737 
3738  oldptr = ptr;
3739 
3740  /* Remember \r or \n */
3741 
3742  if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3743 
3744  /* Check for range */
3745 
3746  if (!inescq && ptr[1] == CHAR_MINUS)
3747  {
3748  int d;
3749  ptr += 2;
3750  while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3751 
3752  /* If we hit \Q (not followed by \E) at this point, go into escaped
3753  mode. */
3754 
3755  while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3756  {
3757  ptr += 2;
3758  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3759  { ptr += 2; continue; }
3760  inescq = TRUE;
3761  break;
3762  }
3763 
3764  if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3765  {
3766  ptr = oldptr;
3767  goto LONE_SINGLE_CHARACTER;
3768  }
3769 
3770 #ifdef SUPPORT_UTF8
3771  if (utf8)
3772  { /* Braces are required because the */
3773  GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3774  }
3775  else
3776 #endif
3777  d = *ptr; /* Not UTF-8 mode */
3778 
3779  /* The second part of a range can be a single-character escape, but
3780  not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3781  in such circumstances. */
3782 
3783  if (!inescq && d == CHAR_BACKSLASH)
3784  {
3785  d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3786  if (*errorcodeptr != 0) goto FAILED;
3787 
3788  /* \b is backspace; any other special means the '-' was literal */
3789 
3790  if (d < 0)
3791  {
3792  if (d == -ESC_b) d = CHAR_BS; else
3793  {
3794  ptr = oldptr;
3795  goto LONE_SINGLE_CHARACTER; /* A few lines below */
3796  }
3797  }
3798  }
3799 
3800  /* Check that the two values are in the correct order. Optimize
3801  one-character ranges */
3802 
3803  if (d < c)
3804  {
3805  *errorcodeptr = ERR8;
3806  goto FAILED;
3807  }
3808 
3809  if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3810 
3811  /* Remember \r or \n */
3812 
3813  if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3814 
3815  /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3816  matching, we have to use an XCLASS with extra data items. Caseless
3817  matching for characters > 127 is available only if UCP support is
3818  available. */
3819 
3820 #ifdef SUPPORT_UTF8
3821  if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3822  {
3823  class_utf8 = TRUE;
3824 
3825  /* With UCP support, we can find the other case equivalents of
3826  the relevant characters. There may be several ranges. Optimize how
3827  they fit with the basic range. */
3828 
3829 #ifdef SUPPORT_UCP
3830  if ((options & PCRE_CASELESS) != 0)
3831  {
3832  unsigned int occ, ocd;
3833  unsigned int cc = c;
3834  unsigned int origd = d;
3835  while (get_othercase_range(&cc, origd, &occ, &ocd))
3836  {
3837  if (occ >= (unsigned int)c &&
3838  ocd <= (unsigned int)d)
3839  continue; /* Skip embedded ranges */
3840 
3841  if (occ < (unsigned int)c &&
3842  ocd >= (unsigned int)c - 1) /* Extend the basic range */
3843  { /* if there is overlap, */
3844  c = occ; /* noting that if occ < c */
3845  continue; /* we can't have ocd > d */
3846  } /* because a subrange is */
3847  if (ocd > (unsigned int)d &&
3848  occ <= (unsigned int)d + 1) /* always shorter than */
3849  { /* the basic range. */
3850  d = ocd;
3851  continue;
3852  }
3853 
3854  if (occ == ocd)
3855  {
3856  *class_utf8data++ = XCL_SINGLE;
3857  }
3858  else
3859  {
3860  *class_utf8data++ = XCL_RANGE;
3861  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3862  }
3863  class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3864  }
3865  }
3866 #endif /* SUPPORT_UCP */
3867 
3868  /* Now record the original range, possibly modified for UCP caseless
3869  overlapping ranges. */
3870 
3871  *class_utf8data++ = XCL_RANGE;
3872  class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3873  class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3874 
3875  /* With UCP support, we are done. Without UCP support, there is no
3876  caseless matching for UTF-8 characters > 127; we can use the bit map
3877  for the smaller ones. */
3878 
3879 #ifdef SUPPORT_UCP
3880  continue; /* With next character in the class */
3881 #else
3882  if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3883 
3884  /* Adjust upper limit and fall through to set up the map */
3885 
3886  d = 127;
3887 
3888 #endif /* SUPPORT_UCP */
3889  }
3890 #endif /* SUPPORT_UTF8 */
3891 
3892  /* We use the bit map for all cases when not in UTF-8 mode; else
3893  ranges that lie entirely within 0-127 when there is UCP support; else
3894  for partial ranges without UCP support. */
3895 
3896  class_charcount += d - c + 1;
3897  class_lastchar = d;
3898 
3899  /* We can save a bit of time by skipping this in the pre-compile. */
3900 
3901  if (lengthptr == NULL) for (; c <= d; c++)
3902  {
3903  classbits[c/8] |= (1 << (c&7));
3904  if ((options & PCRE_CASELESS) != 0)
3905  {
3906  int uc = cd->fcc[c]; /* flip case */
3907  classbits[uc/8] |= (1 << (uc&7));
3908  }
3909  }
3910 
3911  continue; /* Go get the next char in the class */
3912  }
3913 
3914  /* Handle a lone single character - we can get here for a normal
3915  non-escape char, or after \ that introduces a single character or for an
3916  apparent range that isn't. */
3917 
3918  LONE_SINGLE_CHARACTER:
3919 
3920  /* Handle a character that cannot go in the bit map */
3921 
3922 #ifdef SUPPORT_UTF8
3923  if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3924  {
3925  class_utf8 = TRUE;
3926  *class_utf8data++ = XCL_SINGLE;
3927  class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3928 
3929 #ifdef SUPPORT_UCP
3930  if ((options & PCRE_CASELESS) != 0)
3931  {
3932  unsigned int othercase;
3933  if ((othercase = UCD_OTHERCASE(c)) != c)
3934  {
3935  *class_utf8data++ = XCL_SINGLE;
3936  class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3937  }
3938  }
3939 #endif /* SUPPORT_UCP */
3940 
3941  }
3942  else
3943 #endif /* SUPPORT_UTF8 */
3944 
3945  /* Handle a single-byte character */
3946  {
3947  classbits[c/8] |= (1 << (c&7));
3948  if ((options & PCRE_CASELESS) != 0)
3949  {
3950  c = cd->fcc[c]; /* flip case */
3951  classbits[c/8] |= (1 << (c&7));
3952  }
3953  class_charcount++;
3954  class_lastchar = c;
3955  }
3956  }
3957 
3958  /* Loop until ']' reached. This "while" is the end of the "do" far above.
3959  If we are at the end of an internal nested string, revert to the outer
3960  string. */
3961 
3962  while (((c = *(++ptr)) != 0 ||
3963  (nestptr != NULL &&
3964  (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3965  (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3966 
3967  /* Check for missing terminating ']' */
3968 
3969  if (c == 0)
3970  {
3971  *errorcodeptr = ERR6;
3972  goto FAILED;
3973  }
3974 
3975  /* If class_charcount is 1, we saw precisely one character whose value is
3976  less than 256. As long as there were no characters >= 128 and there was no
3977  use of \p or \P, in other words, no use of any XCLASS features, we can
3978  optimize.
3979 
3980  In UTF-8 mode, we can optimize the negative case only if there were no
3981  characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3982  operate on single-bytes only. This is an historical hangover. Maybe one day
3983  we can tidy these opcodes to handle multi-byte characters.
3984 
3985  The optimization throws away the bit map. We turn the item into a
3986  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3987  that OP_NOT does not support multibyte characters. In the positive case, it
3988  can cause firstbyte to be set. Otherwise, there can be no first char if
3989  this item is first, whatever repeat count may follow. In the case of
3990  reqbyte, save the previous value for reinstating. */
3991 
3992 #ifdef SUPPORT_UTF8
3993  if (class_charcount == 1 && !class_utf8 &&
3994  (!utf8 || !negate_class || class_lastchar < 128))
3995 #else
3996  if (class_charcount == 1)
3997 #endif
3998  {
3999  zeroreqbyte = reqbyte;
4000 
4001  /* The OP_NOT opcode works on one-byte characters only. */
4002 
4003  if (negate_class)
4004  {
4005  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4006  zerofirstbyte = firstbyte;
4007  *code++ = OP_NOT;
4008  *code++ = class_lastchar;
4009  break;
4010  }
4011 
4012  /* For a single, positive character, get the value into mcbuffer, and
4013  then we can handle this with the normal one-character code. */
4014 
4015 #ifdef SUPPORT_UTF8
4016  if (utf8 && class_lastchar > 127)
4017  mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
4018  else
4019 #endif
4020  {
4021  mcbuffer[0] = class_lastchar;
4022  mclength = 1;
4023  }
4024  goto ONE_CHAR;
4025  } /* End of 1-char optimization */
4026 
4027  /* The general case - not the one-char optimization. If this is the first
4028  thing in the branch, there can be no first char setting, whatever the
4029  repeat count. Any reqbyte setting must remain unchanged after any kind of
4030  repeat. */
4031 
4032  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4033  zerofirstbyte = firstbyte;
4034  zeroreqbyte = reqbyte;
4035 
4036  /* If there are characters with values > 255, we have to compile an
4037  extended class, with its own opcode, unless there was a negated special
4038  such as \S in the class, and PCRE_UCP is not set, because in that case all
4039  characters > 255 are in the class, so any that were explicitly given as
4040  well can be ignored. If (when there are explicit characters > 255 that must
4041  be listed) there are no characters < 256, we can omit the bitmap in the
4042  actual compiled code. */
4043 
4044 #ifdef SUPPORT_UTF8
4045  if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4046  {
4047  *class_utf8data++ = XCL_END; /* Marks the end of extra data */
4048  *code++ = OP_XCLASS;
4049  code += LINK_SIZE;
4050  *code = negate_class? XCL_NOT : 0;
4051 
4052  /* If the map is required, move up the extra data to make room for it;
4053  otherwise just move the code pointer to the end of the extra data. */
4054 
4055  if (class_charcount > 0)
4056  {
4057  *code++ |= XCL_MAP;
4058  memmove(code + 32, code, class_utf8data - code);
4059  memcpy(code, classbits, 32);
4060  code = class_utf8data + 32;
4061  }
4062  else code = class_utf8data;
4063 
4064  /* Now fill in the complete length of the item */
4065 
4066  PUT(previous, 1, code - previous);
4067  break; /* End of class handling */
4068  }
4069 #endif
4070 
4071  /* If there are no characters > 255, or they are all to be included or
4072  excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4073  whole class was negated and whether there were negative specials such as \S
4074  (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4075  negating it if necessary. */
4076 
4077  *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4078  if (negate_class)
4079  {
4080  if (lengthptr == NULL) /* Save time in the pre-compile phase */
4081  for (c = 0; c < 32; c++) code[c] = ~classbits[c];
4082  }
4083  else
4084  {
4085  memcpy(code, classbits, 32);
4086  }
4087  code += 32;
4088  break;
4089 
4090 
4091  /* ===================================================================*/
4092  /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4093  has been tested above. */
4094 
4096  if (!is_quantifier) goto NORMAL_CHAR;
4097  ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4098  if (*errorcodeptr != 0) goto FAILED;
4099  goto REPEAT;
4100 
4101  case CHAR_ASTERISK:
4102  repeat_min = 0;
4103  repeat_max = -1;
4104  goto REPEAT;
4105 
4106  case CHAR_PLUS:
4107  repeat_min = 1;
4108  repeat_max = -1;
4109  goto REPEAT;
4110 
4111  case CHAR_QUESTION_MARK:
4112  repeat_min = 0;
4113  repeat_max = 1;
4114 
4115  REPEAT:
4116  if (previous == NULL)
4117  {
4118  *errorcodeptr = ERR9;
4119  goto FAILED;
4120  }
4121 
4122  if (repeat_min == 0)
4123  {
4124  firstbyte = zerofirstbyte; /* Adjust for zero repeat */
4125  reqbyte = zeroreqbyte; /* Ditto */
4126  }
4127 
4128  /* Remember whether this is a variable length repeat */
4129 
4130  reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4131 
4132  op_type = 0; /* Default single-char op codes */
4133  possessive_quantifier = FALSE; /* Default not possessive quantifier */
4134 
4135  /* Save start of previous item, in case we have to move it up to make space
4136  for an inserted OP_ONCE for the additional '+' extension. */
4137 
4138  tempcode = previous;
4139 
4140  /* If the next character is '+', we have a possessive quantifier. This
4141  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4142  If the next character is '?' this is a minimizing repeat, by default,
4143  but if PCRE_UNGREEDY is set, it works the other way round. We change the
4144  repeat type to the non-default. */
4145 
4146  if (ptr[1] == CHAR_PLUS)
4147  {
4148  repeat_type = 0; /* Force greedy */
4149  possessive_quantifier = TRUE;
4150  ptr++;
4151  }
4152  else if (ptr[1] == CHAR_QUESTION_MARK)
4153  {
4154  repeat_type = greedy_non_default;
4155  ptr++;
4156  }
4157  else repeat_type = greedy_default;
4158 
4159  /* If previous was a character match, abolish the item and generate a
4160  repeat item instead. If a char item has a minimum of more than one, ensure
4161  that it is set in reqbyte - it might not be if a sequence such as x{3} is
4162  the first thing in a branch because the x will have gone into firstbyte
4163  instead. */
4164 
4165  if (*previous == OP_CHAR || *previous == OP_CHARNC)
4166  {
4167  /* Deal with UTF-8 characters that take up more than one byte. It's
4168  easier to write this out separately than try to macrify it. Use c to
4169  hold the length of the character in bytes, plus 0x80 to flag that it's a
4170  length rather than a small character. */
4171 
4172 #ifdef SUPPORT_UTF8
4173  if (utf8 && (code[-1] & 0x80) != 0)
4174  {
4175  uschar *lastchar = code - 1;
4176  while((*lastchar & 0xc0) == 0x80) lastchar--;
4177  c = code - lastchar; /* Length of UTF-8 character */
4178  memcpy(utf8_char, lastchar, c); /* Save the char */
4179  c |= 0x80; /* Flag c as a length */
4180  }
4181  else
4182 #endif
4183 
4184  /* Handle the case of a single byte - either with no UTF8 support, or
4185  with UTF-8 disabled, or for a UTF-8 character < 128. */
4186 
4187  {
4188  c = code[-1];
4189  if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
4190  }
4191 
4192  /* If the repetition is unlimited, it pays to see if the next thing on
4193  the line is something that cannot possibly match this character. If so,
4194  automatically possessifying this item gains some performance in the case
4195  where the match fails. */
4196 
4197  if (!possessive_quantifier &&
4198  repeat_max < 0 &&
4199  check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4200  {
4201  repeat_type = 0; /* Force greedy */
4202  possessive_quantifier = TRUE;
4203  }
4204 
4205  goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4206  }
4207 
4208  /* If previous was a single negated character ([^a] or similar), we use
4209  one of the special opcodes, replacing it. The code is shared with single-
4210  character repeats by setting op_type to add a suitable offset into
4211  repeat_type. We can also test for auto-possessification. OP_NOT is
4212  currently used only for single-byte chars. */
4213 
4214  else if (*previous == OP_NOT)
4215  {
4216  op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
4217  c = previous[1];
4218  if (!possessive_quantifier &&
4219  repeat_max < 0 &&
4220  check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4221  {
4222  repeat_type = 0; /* Force greedy */
4223  possessive_quantifier = TRUE;
4224  }
4225  goto OUTPUT_SINGLE_REPEAT;
4226  }
4227 
4228  /* If previous was a character type match (\d or similar), abolish it and
4229  create a suitable repeat item. The code is shared with single-character
4230  repeats by setting op_type to add a suitable offset into repeat_type. Note
4231  that the Unicode property types will be present only when SUPPORT_UCP is
4232  defined, but we don't wrap the little bits of code here because it just
4233  makes it horribly messy. */
4234 
4235  else if (*previous < OP_EODN)
4236  {
4237  uschar *oldcode;
4238  int prop_type, prop_value;
4239  op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4240  c = *previous;
4241 
4242  if (!possessive_quantifier &&
4243  repeat_max < 0 &&
4244  check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4245  {
4246  repeat_type = 0; /* Force greedy */
4247  possessive_quantifier = TRUE;
4248  }
4249 
4250  OUTPUT_SINGLE_REPEAT:
4251  if (*previous == OP_PROP || *previous == OP_NOTPROP)
4252  {
4253  prop_type = previous[1];
4254  prop_value = previous[2];
4255  }
4256  else prop_type = prop_value = -1;
4257 
4258  oldcode = code;
4259  code = previous; /* Usually overwrite previous item */
4260 
4261  /* If the maximum is zero then the minimum must also be zero; Perl allows
4262  this case, so we do too - by simply omitting the item altogether. */
4263 
4264  if (repeat_max == 0) goto END_REPEAT;
4265 
4266  /*--------------------------------------------------------------------*/
4267  /* This code is obsolete from release 8.00; the restriction was finally
4268  removed: */
4269 
4270  /* All real repeats make it impossible to handle partial matching (maybe
4271  one day we will be able to remove this restriction). */
4272 
4273  /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4274  /*--------------------------------------------------------------------*/
4275 
4276  /* Combine the op_type with the repeat_type */
4277 
4278  repeat_type += op_type;
4279 
4280  /* A minimum of zero is handled either as the special case * or ?, or as
4281  an UPTO, with the maximum given. */
4282 
4283  if (repeat_min == 0)
4284  {
4285  if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4286  else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4287  else
4288  {
4289  *code++ = OP_UPTO + repeat_type;
4290  PUT2INC(code, 0, repeat_max);
4291  }
4292  }
4293 
4294  /* A repeat minimum of 1 is optimized into some special cases. If the
4295  maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4296  left in place and, if the maximum is greater than 1, we use OP_UPTO with
4297  one less than the maximum. */
4298 
4299  else if (repeat_min == 1)
4300  {
4301  if (repeat_max == -1)
4302  *code++ = OP_PLUS + repeat_type;
4303  else
4304  {
4305  code = oldcode; /* leave previous item in place */
4306  if (repeat_max == 1) goto END_REPEAT;
4307  *code++ = OP_UPTO + repeat_type;
4308  PUT2INC(code, 0, repeat_max - 1);
4309  }
4310  }
4311 
4312  /* The case {n,n} is just an EXACT, while the general case {n,m} is
4313  handled as an EXACT followed by an UPTO. */
4314 
4315  else
4316  {
4317  *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4318  PUT2INC(code, 0, repeat_min);
4319 
4320  /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4321  we have to insert the character for the previous code. For a repeated
4322  Unicode property match, there are two extra bytes that define the
4323  required property. In UTF-8 mode, long characters have their length in
4324  c, with the 0x80 bit as a flag. */
4325 
4326  if (repeat_max < 0)
4327  {
4328 #ifdef SUPPORT_UTF8
4329  if (utf8 && c >= 128)
4330  {
4331  memcpy(code, utf8_char, c & 7);
4332  code += c & 7;
4333  }
4334  else
4335 #endif
4336  {
4337  *code++ = c;
4338  if (prop_type >= 0)
4339  {
4340  *code++ = prop_type;
4341  *code++ = prop_value;
4342  }
4343  }
4344  *code++ = OP_STAR + repeat_type;
4345  }
4346 
4347  /* Else insert an UPTO if the max is greater than the min, again
4348  preceded by the character, for the previously inserted code. If the
4349  UPTO is just for 1 instance, we can use QUERY instead. */
4350 
4351  else if (repeat_max != repeat_min)
4352  {
4353 #ifdef SUPPORT_UTF8
4354  if (utf8 && c >= 128)
4355  {
4356  memcpy(code, utf8_char, c & 7);
4357  code += c & 7;
4358  }
4359  else
4360 #endif
4361  *code++ = c;
4362  if (prop_type >= 0)
4363  {
4364  *code++ = prop_type;
4365  *code++ = prop_value;
4366  }
4367  repeat_max -= repeat_min;
4368 
4369  if (repeat_max == 1)
4370  {
4371  *code++ = OP_QUERY + repeat_type;
4372  }
4373  else
4374  {
4375  *code++ = OP_UPTO + repeat_type;
4376  PUT2INC(code, 0, repeat_max);
4377  }
4378  }
4379  }
4380 
4381  /* The character or character type itself comes last in all cases. */
4382 
4383 #ifdef SUPPORT_UTF8
4384  if (utf8 && c >= 128)
4385  {
4386  memcpy(code, utf8_char, c & 7);
4387  code += c & 7;
4388  }
4389  else
4390 #endif
4391  *code++ = c;
4392 
4393  /* For a repeated Unicode property match, there are two extra bytes that
4394  define the required property. */
4395 
4396 #ifdef SUPPORT_UCP
4397  if (prop_type >= 0)
4398  {
4399  *code++ = prop_type;
4400  *code++ = prop_value;
4401  }
4402 #endif
4403  }
4404 
4405  /* If previous was a character class or a back reference, we put the repeat
4406  stuff after it, but just skip the item if the repeat was {0,0}. */
4407 
4408  else if (*previous == OP_CLASS ||
4409  *previous == OP_NCLASS ||
4410 #ifdef SUPPORT_UTF8
4411  *previous == OP_XCLASS ||
4412 #endif
4413  *previous == OP_REF)
4414  {
4415  if (repeat_max == 0)
4416  {
4417  code = previous;
4418  goto END_REPEAT;
4419  }
4420 
4421  /*--------------------------------------------------------------------*/
4422  /* This code is obsolete from release 8.00; the restriction was finally
4423  removed: */
4424 
4425  /* All real repeats make it impossible to handle partial matching (maybe
4426  one day we will be able to remove this restriction). */
4427 
4428  /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4429  /*--------------------------------------------------------------------*/
4430 
4431  if (repeat_min == 0 && repeat_max == -1)
4432  *code++ = OP_CRSTAR + repeat_type;
4433  else if (repeat_min == 1 && repeat_max == -1)
4434  *code++ = OP_CRPLUS + repeat_type;
4435  else if (repeat_min == 0 && repeat_max == 1)
4436  *code++ = OP_CRQUERY + repeat_type;
4437  else
4438  {
4439  *code++ = OP_CRRANGE + repeat_type;
4440  PUT2INC(code, 0, repeat_min);
4441  if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4442  PUT2INC(code, 0, repeat_max);
4443  }
4444  }
4445 
4446  /* If previous was a bracket group, we may have to replicate it in certain
4447  cases. */
4448 
4449  else if (*previous == OP_BRA || *previous == OP_CBRA ||
4450  *previous == OP_ONCE || *previous == OP_COND)
4451  {
4452  register int i;
4453  int ketoffset = 0;
4454  int len = (int)(code - previous);
4455  uschar *bralink = NULL;
4456 
4457  /* Repeating a DEFINE group is pointless */
4458 
4459  if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4460  {
4461  *errorcodeptr = ERR55;
4462  goto FAILED;
4463  }
4464 
4465  /* If the maximum repeat count is unlimited, find the end of the bracket
4466  by scanning through from the start, and compute the offset back to it
4467  from the current code pointer. There may be an OP_OPT setting following
4468  the final KET, so we can't find the end just by going back from the code
4469  pointer. */
4470 
4471  if (repeat_max == -1)
4472  {
4473  register uschar *ket = previous;
4474  do ket += GET(ket, 1); while (*ket != OP_KET);
4475  ketoffset = (int)(code - ket);
4476  }
4477 
4478  /* The case of a zero minimum is special because of the need to stick
4479  OP_BRAZERO in front of it, and because the group appears once in the
4480  data, whereas in other cases it appears the minimum number of times. For
4481  this reason, it is simplest to treat this case separately, as otherwise
4482  the code gets far too messy. There are several special subcases when the
4483  minimum is zero. */
4484 
4485  if (repeat_min == 0)
4486  {
4487  /* If the maximum is also zero, we used to just omit the group from the
4488  output altogether, like this:
4489 
4490  ** if (repeat_max == 0)
4491  ** {
4492  ** code = previous;
4493  ** goto END_REPEAT;
4494  ** }
4495 
4496  However, that fails when a group is referenced as a subroutine from
4497  elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4498  so that it is skipped on execution. As we don't have a list of which
4499  groups are referenced, we cannot do this selectively.
4500 
4501  If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4502  and do no more at this point. However, we do need to adjust any
4503  OP_RECURSE calls inside the group that refer to the group itself or any
4504  internal or forward referenced group, because the offset is from the
4505  start of the whole regex. Temporarily terminate the pattern while doing
4506  this. */
4507 
4508  if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4509  {
4510  *code = OP_END;
4511  adjust_recurse(previous, 1, utf8, cd, save_hwm);
4512  memmove(previous+1, previous, len);
4513  code++;
4514  if (repeat_max == 0)
4515  {
4516  *previous++ = OP_SKIPZERO;
4517  goto END_REPEAT;
4518  }
4519  *previous++ = OP_BRAZERO + repeat_type;
4520  }
4521 
4522  /* If the maximum is greater than 1 and limited, we have to replicate
4523  in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4524  The first one has to be handled carefully because it's the original
4525  copy, which has to be moved up. The remainder can be handled by code
4526  that is common with the non-zero minimum case below. We have to
4527  adjust the value of repeat_max, since one less copy is required. Once
4528  again, we may have to adjust any OP_RECURSE calls inside the group. */
4529 
4530  else
4531  {
4532  int offset;
4533  *code = OP_END;
4534  adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4535  memmove(previous + 2 + LINK_SIZE, previous, len);
4536  code += 2 + LINK_SIZE;
4537  *previous++ = OP_BRAZERO + repeat_type;
4538  *previous++ = OP_BRA;
4539 
4540  /* We chain together the bracket offset fields that have to be
4541  filled in later when the ends of the brackets are reached. */
4542 
4543  offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4544  bralink = previous;
4545  PUTINC(previous, 0, offset);
4546  }
4547 
4548  repeat_max--;
4549  }
4550 
4551  /* If the minimum is greater than zero, replicate the group as many
4552  times as necessary, and adjust the maximum to the number of subsequent
4553  copies that we need. If we set a first char from the group, and didn't
4554  set a required char, copy the latter from the former. If there are any
4555  forward reference subroutine calls in the group, there will be entries on
4556  the workspace list; replicate these with an appropriate increment. */
4557 
4558  else
4559  {
4560  if (repeat_min > 1)
4561  {
4562  /* In the pre-compile phase, we don't actually do the replication. We
4563  just adjust the length as if we had. Do some paranoid checks for
4564  potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4565  integer type when available, otherwise double. */
4566 
4567  if (lengthptr != NULL)
4568  {
4569  int delta = (repeat_min - 1)*length_prevgroup;
4570  if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4571  (INT64_OR_DOUBLE)length_prevgroup >
4572  (INT64_OR_DOUBLE)INT_MAX ||
4573  OFLOW_MAX - *lengthptr < delta)
4574  {
4575  *errorcodeptr = ERR20;
4576  goto FAILED;
4577  }
4578  *lengthptr += delta;
4579  }
4580 
4581  /* This is compiling for real */
4582 
4583  else
4584  {
4585  if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4586  for (i = 1; i < repeat_min; i++)
4587  {
4588  uschar *hc;
4589  uschar *this_hwm = cd->hwm;
4590  memcpy(code, previous, len);
4591  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4592  {
4593  PUT(cd->hwm, 0, GET(hc, 0) + len);
4594  cd->hwm += LINK_SIZE;
4595  }
4596  save_hwm = this_hwm;
4597  code += len;
4598  }
4599  }
4600  }
4601 
4602  if (repeat_max > 0) repeat_max -= repeat_min;
4603  }
4604 
4605  /* This code is common to both the zero and non-zero minimum cases. If
4606  the maximum is limited, it replicates the group in a nested fashion,
4607  remembering the bracket starts on a stack. In the case of a zero minimum,
4608  the first one was set up above. In all cases the repeat_max now specifies
4609  the number of additional copies needed. Again, we must remember to
4610  replicate entries on the forward reference list. */
4611 
4612  if (repeat_max >= 0)
4613  {
4614  /* In the pre-compile phase, we don't actually do the replication. We
4615  just adjust the length as if we had. For each repetition we must add 1
4616  to the length for BRAZERO and for all but the last repetition we must
4617  add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4618  paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4619  a 64-bit integer type when available, otherwise double. */
4620 
4621  if (lengthptr != NULL && repeat_max > 0)
4622  {
4623  int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4624  2 - 2*LINK_SIZE; /* Last one doesn't nest */
4625  if ((INT64_OR_DOUBLE)repeat_max *
4626  (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4627  > (INT64_OR_DOUBLE)INT_MAX ||
4628  OFLOW_MAX - *lengthptr < delta)
4629  {
4630  *errorcodeptr = ERR20;
4631  goto FAILED;
4632  }
4633  *lengthptr += delta;
4634  }
4635 
4636  /* This is compiling for real */
4637 
4638  else for (i = repeat_max - 1; i >= 0; i--)
4639  {
4640  uschar *hc;
4641  uschar *this_hwm = cd->hwm;
4642 
4643  *code++ = OP_BRAZERO + repeat_type;
4644 
4645  /* All but the final copy start a new nesting, maintaining the
4646  chain of brackets outstanding. */
4647 
4648  if (i != 0)
4649  {
4650  int offset;
4651  *code++ = OP_BRA;
4652  offset = (bralink == NULL)? 0 : (int)(code - bralink);
4653  bralink = code;
4654  PUTINC(code, 0, offset);
4655  }
4656 
4657  memcpy(code, previous, len);
4658  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4659  {
4660  PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4661  cd->hwm += LINK_SIZE;
4662  }
4663  save_hwm = this_hwm;
4664  code += len;
4665  }
4666 
4667  /* Now chain through the pending brackets, and fill in their length
4668  fields (which are holding the chain links pro tem). */
4669 
4670  while (bralink != NULL)
4671  {
4672  int oldlinkoffset;
4673  int offset = (int)(code - bralink + 1);
4674  uschar *bra = code - offset;
4675  oldlinkoffset = GET(bra, 1);
4676  bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4677  *code++ = OP_KET;
4678  PUTINC(code, 0, offset);
4679  PUT(bra, 1, offset);
4680  }
4681  }
4682 
4683  /* If the maximum is unlimited, set a repeater in the final copy. We
4684  can't just offset backwards from the current code point, because we
4685  don't know if there's been an options resetting after the ket. The
4686  correct offset was computed above.
4687 
4688  Then, when we are doing the actual compile phase, check to see whether
4689  this group is a non-atomic one that could match an empty string. If so,
4690  convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4691  that runtime checking can be done. [This check is also applied to
4692  atomic groups at runtime, but in a different way.] */
4693 
4694  else
4695  {
4696  uschar *ketcode = code - ketoffset;
4697  uschar *bracode = ketcode - GET(ketcode, 1);
4698  *ketcode = OP_KETRMAX + repeat_type;
4699  if (lengthptr == NULL && *bracode != OP_ONCE)
4700  {
4701  uschar *scode = bracode;
4702  do
4703  {
4704  if (could_be_empty_branch(scode, ketcode, utf8, cd))
4705  {
4706  *bracode += OP_SBRA - OP_BRA;
4707  break;
4708  }
4709  scode += GET(scode, 1);
4710  }
4711  while (*scode == OP_ALT);
4712  }
4713  }
4714  }
4715 
4716  /* If previous is OP_FAIL, it was generated by an empty class [] in
4717  JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4718  by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4719  error above. We can just ignore the repeat in JS case. */
4720 
4721  else if (*previous == OP_FAIL) goto END_REPEAT;
4722 
4723  /* Else there's some kind of shambles */
4724 
4725  else
4726  {
4727  *errorcodeptr = ERR11;
4728  goto FAILED;
4729  }
4730 
4731  /* If the character following a repeat is '+', or if certain optimization
4732  tests above succeeded, possessive_quantifier is TRUE. For some of the
4733  simpler opcodes, there is a special alternative opcode for this. For
4734  anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4735  The '+' notation is just syntactic sugar, taken from Sun's Java package,
4736  but the special opcodes can optimize it a bit. The repeated item starts at
4737  tempcode, not at previous, which might be the first part of a string whose
4738  (former) last char we repeated.
4739 
4740  Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4741  an 'upto' may follow. We skip over an 'exact' item, and then test the
4742  length of what remains before proceeding. */
4743 
4744  if (possessive_quantifier)
4745  {
4746  int len;
4747 
4748  if (*tempcode == OP_TYPEEXACT)
4749  tempcode += _pcre_OP_lengths[*tempcode] +
4750  ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4751 
4752  else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4753  {
4754  tempcode += _pcre_OP_lengths[*tempcode];
4755 #ifdef SUPPORT_UTF8
4756  if (utf8 && tempcode[-1] >= 0xc0)
4757  tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4758 #endif
4759  }
4760 
4761  len = (int)(code - tempcode);
4762  if (len > 0) switch (*tempcode)
4763  {
4764  case OP_STAR: *tempcode = OP_POSSTAR; break;
4765  case OP_PLUS: *tempcode = OP_POSPLUS; break;
4766  case OP_QUERY: *tempcode = OP_POSQUERY; break;
4767  case OP_UPTO: *tempcode = OP_POSUPTO; break;
4768 
4769  case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4770  case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4771  case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4772  case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4773 
4774  case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4775  case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4776  case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4777  case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4778 
4779  /* Because we are moving code along, we must ensure that any
4780  pending recursive references are updated. */
4781 
4782  default:
4783  *code = OP_END;
4784  adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4785  memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4786  code += 1 + LINK_SIZE;
4787  len += 1 + LINK_SIZE;
4788  tempcode[0] = OP_ONCE;
4789  *code++ = OP_KET;
4790  PUTINC(code, 0, len);
4791  PUT(tempcode, 1, len);
4792  break;
4793  }
4794  }
4795 
4796  /* In all cases we no longer have a previous item. We also set the
4797  "follows varying string" flag for subsequently encountered reqbytes if
4798  it isn't already set and we have just passed a varying length item. */
4799 
4800  END_REPEAT:
4801  previous = NULL;
4802  cd->req_varyopt |= reqvary;
4803  break;
4804 
4805 
4806  /* ===================================================================*/
4807  /* Start of nested parenthesized sub-expression, or comment or lookahead or
4808  lookbehind or option setting or condition or all the other extended
4809  parenthesis forms. */
4810 
4811  case CHAR_LEFT_PARENTHESIS:
4812  newoptions = options;
4813  skipbytes = 0;
4814  bravalue = OP_CBRA;
4815  save_hwm = cd->hwm;
4816  reset_bracount = FALSE;
4817 
4818  /* First deal with various "verbs" that can be introduced by '*'. */
4819 
4820  if (*(++ptr) == CHAR_ASTERISK &&
4821  ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4822  {
4823  int i, namelen;
4824  int arglen = 0;
4825  const char *vn = verbnames;
4826  const uschar *name = ptr + 1;
4827  const uschar *arg = NULL;
4828  previous = NULL;
4829  while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4830  namelen = (int)(ptr - name);
4831 
4832  if (*ptr == CHAR_COLON)
4833  {
4834  arg = ++ptr;
4835  while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4836  || *ptr == '_') ptr++;
4837  arglen = (int)(ptr - arg);
4838  }
4839 
4840  if (*ptr != CHAR_RIGHT_PARENTHESIS)
4841  {
4842  *errorcodeptr = ERR60;
4843  goto FAILED;
4844  }
4845 
4846  /* Scan the table of verb names */
4847 
4848  for (i = 0; i < verbcount; i++)
4849  {
4850  if (namelen == verbs[i].len &&
4851  strncmp((char *)name, vn, namelen) == 0)
4852  {
4853  /* Check for open captures before ACCEPT */
4854 
4855  if (verbs[i].op == OP_ACCEPT)
4856  {
4857  open_capitem *oc;
4858  cd->had_accept = TRUE;
4859  for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4860  {
4861  *code++ = OP_CLOSE;
4862  PUT2INC(code, 0, oc->number);
4863  }
4864  }
4865 
4866  /* Handle the cases with/without an argument */
4867 
4868  if (arglen == 0)
4869  {
4870  if (verbs[i].op < 0) /* Argument is mandatory */
4871  {
4872  *errorcodeptr = ERR66;
4873  goto FAILED;
4874  }
4875  *code = verbs[i].op;
4876  if (*code++ == OP_THEN)
4877  {
4878  PUT(code, 0, (int)(code - bcptr->current_branch - 1));
4879  code += LINK_SIZE;
4880  }
4881  }
4882 
4883  else
4884  {
4885  if (verbs[i].op_arg < 0) /* Argument is forbidden */
4886  {
4887  *errorcodeptr = ERR59;
4888  goto FAILED;
4889  }
4890  *code = verbs[i].op_arg;
4891  if (*code++ == OP_THEN_ARG)
4892  {
4893  PUT(code, 0, (int)(code - bcptr->current_branch - 1));
4894  code += LINK_SIZE;
4895  }
4896  *code++ = arglen;
4897  memcpy(code, arg, arglen);
4898  code += arglen;
4899  *code++ = 0;
4900  }
4901 
4902  break; /* Found verb, exit loop */
4903  }
4904 
4905  vn += verbs[i].len + 1;
4906  }
4907 
4908  if (i < verbcount) continue; /* Successfully handled a verb */
4909  *errorcodeptr = ERR60; /* Verb not recognized */
4910  goto FAILED;
4911  }
4912 
4913  /* Deal with the extended parentheses; all are introduced by '?', and the
4914  appearance of any of them means that this is not a capturing group. */
4915 
4916  else if (*ptr == CHAR_QUESTION_MARK)
4917  {
4918  int i, set, unset, namelen;
4919  int *optset;
4920  const uschar *name;
4921  uschar *slot;
4922 
4923  switch (*(++ptr))
4924  {
4925  case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4926  ptr++;
4927  while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4928  if (*ptr == 0)
4929  {
4930  *errorcodeptr = ERR18;
4931  goto FAILED;
4932  }
4933  continue;
4934 
4935 
4936  /* ------------------------------------------------------------ */
4937  case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4938  reset_bracount = TRUE;
4939  /* Fall through */
4940 
4941  /* ------------------------------------------------------------ */
4942  case CHAR_COLON: /* Non-capturing bracket */
4943  bravalue = OP_BRA;
4944  ptr++;
4945  break;
4946 
4947 
4948  /* ------------------------------------------------------------ */
4949  case CHAR_LEFT_PARENTHESIS:
4950  bravalue = OP_COND; /* Conditional group */
4951 
4952  /* A condition can be an assertion, a number (referring to a numbered
4953  group), a name (referring to a named group), or 'R', referring to
4954  recursion. R<digits> and R&name are also permitted for recursion tests.
4955 
4956  There are several syntaxes for testing a named group: (?(name)) is used
4957  by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4958 
4959  There are two unfortunate ambiguities, caused by history. (a) 'R' can
4960  be the recursive thing or the name 'R' (and similarly for 'R' followed
4961  by digits), and (b) a number could be a name that consists of digits.
4962  In both cases, we look for a name first; if not found, we try the other
4963  cases. */
4964 
4965  /* For conditions that are assertions, check the syntax, and then exit
4966  the switch. This will take control down to where bracketed groups,
4967  including assertions, are processed. */
4968 
4969  if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4970  ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4971  break;
4972 
4973  /* Most other conditions use OP_CREF (a couple change to OP_RREF
4974  below), and all need to skip 3 bytes at the start of the group. */
4975 
4976  code[1+LINK_SIZE] = OP_CREF;
4977  skipbytes = 3;
4978  refsign = -1;
4979 
4980  /* Check for a test for recursion in a named group. */
4981 
4982  if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4983  {
4984  terminator = -1;
4985  ptr += 2;
4986  code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4987  }
4988 
4989  /* Check for a test for a named group's having been set, using the Perl
4990  syntax (?(<name>) or (?('name') */
4991 
4992  else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4993  {
4994  terminator = CHAR_GREATER_THAN_SIGN;
4995  ptr++;
4996  }
4997  else if (ptr[1] == CHAR_APOSTROPHE)
4998  {
4999  terminator = CHAR_APOSTROPHE;
5000  ptr++;
5001  }
5002  else
5003  {
5004  terminator = 0;
5005  if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5006  }
5007 
5008  /* We now expect to read a name; anything else is an error */
5009 
5010  if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
5011  {
5012  ptr += 1; /* To get the right offset */
5013  *errorcodeptr = ERR28;
5014  goto FAILED;
5015  }
5016 
5017  /* Read the name, but also get it as a number if it's all digits */
5018 
5019  recno = 0;
5020  name = ++ptr;
5021  while ((cd->ctypes[*ptr] & ctype_word) != 0)
5022  {
5023  if (recno >= 0)
5024  recno = ((digitab[*ptr] & ctype_digit) != 0)?
5025  recno * 10 + *ptr - CHAR_0 : -1;
5026  ptr++;
5027  }
5028  namelen = (int)(ptr - name);
5029 
5030  if ((terminator > 0 && *ptr++ != terminator) ||
5031  *ptr++ != CHAR_RIGHT_PARENTHESIS)
5032  {
5033  ptr--; /* Error offset */
5034  *errorcodeptr = ERR26;
5035  goto FAILED;
5036  }
5037 
5038  /* Do no further checking in the pre-compile phase. */
5039 
5040  if (lengthptr != NULL) break;
5041 
5042  /* In the real compile we do the work of looking for the actual
5043  reference. If the string started with "+" or "-" we require the rest to
5044  be digits, in which case recno will be set. */
5045 
5046  if (refsign > 0)
5047  {
5048  if (recno <= 0)
5049  {
5050  *errorcodeptr = ERR58;
5051  goto FAILED;
5052  }
5053  recno = (refsign == CHAR_MINUS)?
5054  cd->bracount - recno + 1 : recno +cd->bracount;
5055  if (recno <= 0 || recno > cd->final_bracount)
5056  {
5057  *errorcodeptr = ERR15;
5058  goto FAILED;
5059  }
5060  PUT2(code, 2+LINK_SIZE, recno);
5061  break;
5062  }
5063 
5064  /* Otherwise (did not start with "+" or "-"), start by looking for the
5065  name. If we find a name, add one to the opcode to change OP_CREF or
5066  OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5067  except they record that the reference was originally to a name. The
5068  information is used to check duplicate names. */
5069 
5070  slot = cd->name_table;
5071  for (i = 0; i < cd->names_found; i++)
5072  {
5073  if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
5074  slot += cd->name_entry_size;
5075  }
5076 
5077  /* Found a previous named subpattern */
5078 
5079  if (i < cd->names_found)
5080  {
5081  recno = GET2(slot, 0);
5082  PUT2(code, 2+LINK_SIZE, recno);
5083  code[1+LINK_SIZE]++;
5084  }
5085 
5086  /* Search the pattern for a forward reference */
5087 
5088  else if ((i = find_parens(cd, name, namelen,
5089  (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5090  {
5091  PUT2(code, 2+LINK_SIZE, i);
5092  code[1+LINK_SIZE]++;
5093  }
5094 
5095  /* If terminator == 0 it means that the name followed directly after
5096  the opening parenthesis [e.g. (?(abc)...] and in this case there are
5097  some further alternatives to try. For the cases where terminator != 0
5098  [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5099  now checked all the possibilities, so give an error. */
5100 
5101  else if (terminator != 0)
5102  {
5103  *errorcodeptr = ERR15;
5104  goto FAILED;
5105  }
5106 
5107  /* Check for (?(R) for recursion. Allow digits after R to specify a
5108  specific group number. */
5109 
5110  else if (*name == CHAR_R)
5111  {
5112  recno = 0;
5113  for (i = 1; i < namelen; i++)
5114  {
5115  if ((digitab[name[i]] & ctype_digit) == 0)
5116  {
5117  *errorcodeptr = ERR15;
5118  goto FAILED;
5119  }
5120  recno = recno * 10 + name[i] - CHAR_0;
5121  }
5122  if (recno == 0) recno = RREF_ANY;
5123  code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5124  PUT2(code, 2+LINK_SIZE, recno);
5125  }
5126 
5127  /* Similarly, check for the (?(DEFINE) "condition", which is always
5128  false. */
5129 
5130  else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
5131  {
5132  code[1+LINK_SIZE] = OP_DEF;
5133  skipbytes = 1;
5134  }
5135 
5136  /* Check for the "name" actually being a subpattern number. We are
5137  in the second pass here, so final_bracount is set. */
5138 
5139  else if (recno > 0 && recno <= cd->final_bracount)
5140  {
5141  PUT2(code, 2+LINK_SIZE, recno);
5142  }
5143 
5144  /* Either an unidentified subpattern, or a reference to (?(0) */
5145 
5146  else
5147  {
5148  *errorcodeptr = (recno == 0)? ERR35: ERR15;
5149  goto FAILED;
5150  }
5151  break;
5152 
5153 
5154  /* ------------------------------------------------------------ */
5155  case CHAR_EQUALS_SIGN: /* Positive lookahead */
5156  bravalue = OP_ASSERT;
5157  ptr++;
5158  break;
5159 
5160 
5161  /* ------------------------------------------------------------ */
5162  case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5163  ptr++;
5164  if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5165  {
5166  *code++ = OP_FAIL;
5167  previous = NULL;
5168  continue;
5169  }
5170  bravalue = OP_ASSERT_NOT;
5171  break;
5172 
5173 
5174  /* ------------------------------------------------------------ */
5175  case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5176  switch (ptr[1])
5177  {
5178  case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5179  bravalue = OP_ASSERTBACK;
5180  ptr += 2;
5181  break;
5182 
5183  case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5184  bravalue = OP_ASSERTBACK_NOT;
5185  ptr += 2;
5186  break;
5187 
5188  default: /* Could be name define, else bad */
5189  if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
5190  ptr++; /* Correct offset for error */
5191  *errorcodeptr = ERR24;
5192  goto FAILED;
5193  }
5194  break;
5195 
5196 
5197  /* ------------------------------------------------------------ */
5198  case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5199  bravalue = OP_ONCE;
5200  ptr++;
5201  break;
5202 
5203 
5204  /* ------------------------------------------------------------ */
5205  case CHAR_C: /* Callout - may be followed by digits; */
5206  previous_callout = code; /* Save for later completion */
5207  after_manual_callout = 1; /* Skip one item before completing */
5208  *code++ = OP_CALLOUT;
5209  {
5210  int n = 0;
5211  while ((digitab[*(++ptr)] & ctype_digit) != 0)
5212  n = n * 10 + *ptr - CHAR_0;
5213  if (*ptr != CHAR_RIGHT_PARENTHESIS)
5214  {
5215  *errorcodeptr = ERR39;
5216  goto FAILED;
5217  }
5218  if (n > 255)
5219  {
5220  *errorcodeptr = ERR38;
5221  goto FAILED;
5222  }
5223  *code++ = n;
5224  PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5225  PUT(code, LINK_SIZE, 0); /* Default length */
5226  code += 2 * LINK_SIZE;
5227  }
5228  previous = NULL;
5229  continue;
5230 
5231 
5232  /* ------------------------------------------------------------ */
5233  case CHAR_P: /* Python-style named subpattern handling */
5234  if (*(++ptr) == CHAR_EQUALS_SIGN ||
5235  *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5236  {
5237  is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5238  terminator = CHAR_RIGHT_PARENTHESIS;
5239  goto NAMED_REF_OR_RECURSE;
5240  }
5241  else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5242  {
5243  *errorcodeptr = ERR41;
5244  goto FAILED;
5245  }
5246  /* Fall through to handle (?P< as (?< is handled */
5247 
5248 
5249  /* ------------------------------------------------------------ */
5250  DEFINE_NAME: /* Come here from (?< handling */
5251  case CHAR_APOSTROPHE:
5252  {
5253  terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5255  name = ++ptr;
5256 
5257  while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5258  namelen = (int)(ptr - name);
5259 
5260  /* In the pre-compile phase, just do a syntax check. */
5261 
5262  if (lengthptr != NULL)
5263  {
5264  if (*ptr != terminator)
5265  {
5266  *errorcodeptr = ERR42;
5267  goto FAILED;
5268  }
5269  if (cd->names_found >= MAX_NAME_COUNT)
5270  {
5271  *errorcodeptr = ERR49;
5272  goto FAILED;
5273  }
5274  if (namelen + 3 > cd->name_entry_size)
5275  {
5276  cd->name_entry_size = namelen + 3;
5277  if (namelen > MAX_NAME_SIZE)
5278  {
5279  *errorcodeptr = ERR48;
5280  goto FAILED;
5281  }
5282  }
5283  }
5284 
5285  /* In the real compile, create the entry in the table, maintaining
5286  alphabetical order. Duplicate names for different numbers are
5287  permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5288  number are always OK. (An existing number can be re-used if (?|
5289  appears in the pattern.) In either event, a duplicate name results in
5290  a duplicate entry in the table, even if the number is the same. This
5291  is because the number of names, and hence the table size, is computed
5292  in the pre-compile, and it affects various numbers and pointers which
5293  would all have to be modified, and the compiled code moved down, if
5294  duplicates with the same number were omitted from the table. This
5295  doesn't seem worth the hassle. However, *different* names for the
5296  same number are not permitted. */
5297 
5298  else
5299  {
5300  BOOL dupname = FALSE;
5301  slot = cd->name_table;
5302 
5303  for (i = 0; i < cd->names_found; i++)
5304  {
5305  int crc = memcmp(name, slot+2, namelen);
5306  if (crc == 0)
5307  {
5308  if (slot[2+namelen] == 0)
5309  {
5310  if (GET2(slot, 0) != cd->bracount + 1 &&
5311  (options & PCRE_DUPNAMES) == 0)
5312  {
5313  *errorcodeptr = ERR43;
5314  goto FAILED;
5315  }
5316  else dupname = TRUE;
5317  }
5318  else crc = -1; /* Current name is a substring */
5319  }
5320 
5321  /* Make space in the table and break the loop for an earlier
5322  name. For a duplicate or later name, carry on. We do this for
5323  duplicates so that in the simple case (when (?| is not used) they
5324  are in order of their numbers. */
5325 
5326  if (crc < 0)
5327  {
5328  memmove(slot + cd->name_entry_size, slot,
5329  (cd->names_found - i) * cd->name_entry_size);
5330  break;
5331  }
5332 
5333  /* Continue the loop for a later or duplicate name */
5334 
5335  slot += cd->name_entry_size;
5336  }
5337 
5338  /* For non-duplicate names, check for a duplicate number before
5339  adding the new name. */
5340 
5341  if (!dupname)
5342  {
5343  uschar *cslot = cd->name_table;
5344  for (i = 0; i < cd->names_found; i++)
5345  {
5346  if (cslot != slot)
5347  {
5348  if (GET2(cslot, 0) == cd->bracount + 1)
5349  {
5350  *errorcodeptr = ERR65;
5351  goto FAILED;
5352  }
5353  }
5354  else i--;
5355  cslot += cd->name_entry_size;
5356  }
5357  }
5358 
5359  PUT2(slot, 0, cd->bracount + 1);
5360  memcpy(slot + 2, name, namelen);
5361  slot[2+namelen] = 0;
5362  }
5363  }
5364 
5365  /* In both pre-compile and compile, count the number of names we've
5366  encountered. */
5367 
5368  cd->names_found++;
5369  ptr++; /* Move past > or ' */
5370  goto NUMBERED_GROUP;
5371 
5372 
5373  /* ------------------------------------------------------------ */
5374  case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5375  terminator = CHAR_RIGHT_PARENTHESIS;
5376  is_recurse = TRUE;
5377  /* Fall through */
5378 
5379  /* We come here from the Python syntax above that handles both
5380  references (?P=name) and recursion (?P>name), as well as falling
5381  through from the Perl recursion syntax (?&name). We also come here from
5382  the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5383  .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5384 
5385  NAMED_REF_OR_RECURSE:
5386  name = ++ptr;
5387  while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5388  namelen = (int)(ptr - name);
5389 
5390  /* In the pre-compile phase, do a syntax check. We used to just set
5391  a dummy reference number, because it was not used in the first pass.
5392  However, with the change of recursive back references to be atomic,
5393  we have to look for the number so that this state can be identified, as
5394  otherwise the incorrect length is computed. If it's not a backwards
5395  reference, the dummy number will do. */
5396 
5397  if (lengthptr != NULL)
5398  {
5399  const uschar *temp;
5400 
5401  if (namelen == 0)
5402  {
5403  *errorcodeptr = ERR62;
5404  goto FAILED;
5405  }
5406  if (*ptr != terminator)
5407  {
5408  *errorcodeptr = ERR42;
5409  goto FAILED;
5410  }
5411  if (namelen > MAX_NAME_SIZE)
5412  {
5413  *errorcodeptr = ERR48;
5414  goto FAILED;
5415  }
5416 
5417  /* The name table does not exist in the first pass, so we cannot
5418  do a simple search as in the code below. Instead, we have to scan the
5419  pattern to find the number. It is important that we scan it only as
5420  far as we have got because the syntax of named subpatterns has not
5421  been checked for the rest of the pattern, and find_parens() assumes
5422  correct syntax. In any case, it's a waste of resources to scan
5423  further. We stop the scan at the current point by temporarily
5424  adjusting the value of cd->end_pattern. */
5425 
5426  temp = cd->end_pattern;
5427  cd->end_pattern = ptr;
5428  recno = find_parens(cd, name, namelen,
5429  (options & PCRE_EXTENDED) != 0, utf8);
5430  cd->end_pattern = temp;
5431  if (recno < 0) recno = 0; /* Forward ref; set dummy number */
5432  }
5433 
5434  /* In the real compile, seek the name in the table. We check the name
5435  first, and then check that we have reached the end of the name in the
5436  table. That way, if the name is longer than any in the table,
5437  the comparison will fail without reading beyond the table entry. */
5438 
5439  else
5440  {
5441  slot = cd->name_table;
5442  for (i = 0; i < cd->names_found; i++)
5443  {
5444  if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5445  slot[2+namelen] == 0)
5446  break;
5447  slot += cd->name_entry_size;
5448  }
5449 
5450  if (i < cd->names_found) /* Back reference */
5451  {
5452  recno = GET2(slot, 0);
5453  }
5454  else if ((recno = /* Forward back reference */
5455  find_parens(cd, name, namelen,
5456  (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5457  {
5458  *errorcodeptr = ERR15;
5459  goto FAILED;
5460  }
5461  }
5462 
5463  /* In both phases, we can now go to the code that handles numerical
5464  recursion or backreferences. */
5465 
5466  if (is_recurse) goto HANDLE_RECURSION;
5467  else goto HANDLE_REFERENCE;
5468 
5469 
5470  /* ------------------------------------------------------------ */
5471  case CHAR_R: /* Recursion */
5472  ptr++; /* Same as (?0) */
5473  /* Fall through */
5474 
5475 
5476  /* ------------------------------------------------------------ */
5477  case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5478  case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5479  case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5480  {
5481  const uschar *called;
5482  terminator = CHAR_RIGHT_PARENTHESIS;
5483 
5484  /* Come here from the \g<...> and \g'...' code (Oniguruma
5485  compatibility). However, the syntax has been checked to ensure that
5486  the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5487  be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5488  ever be taken. */
5489 
5490  HANDLE_NUMERICAL_RECURSION:
5491 
5492  if ((refsign = *ptr) == CHAR_PLUS)
5493  {
5494  ptr++;
5495  if ((digitab[*ptr] & ctype_digit) == 0)
5496  {
5497  *errorcodeptr = ERR63;
5498  goto FAILED;
5499  }
5500  }
5501  else if (refsign == CHAR_MINUS)
5502  {
5503  if ((digitab[ptr[1]] & ctype_digit) == 0)
5504  goto OTHER_CHAR_AFTER_QUERY;
5505  ptr++;
5506  }
5507 
5508  recno = 0;
5509  while((digitab[*ptr] & ctype_digit) != 0)
5510  recno = recno * 10 + *ptr++ - CHAR_0;
5511 
5512  if (*ptr != terminator)
5513  {
5514  *errorcodeptr = ERR29;
5515  goto FAILED;
5516  }
5517 
5518  if (refsign == CHAR_MINUS)
5519  {
5520  if (recno == 0)
5521  {
5522  *errorcodeptr = ERR58;
5523  goto FAILED;
5524  }
5525  recno = cd->bracount - recno + 1;
5526  if (recno <= 0)
5527  {
5528  *errorcodeptr = ERR15;
5529  goto FAILED;
5530  }
5531  }
5532  else if (refsign == CHAR_PLUS)
5533  {
5534  if (recno == 0)
5535  {
5536  *errorcodeptr = ERR58;
5537  goto FAILED;
5538  }
5539  recno += cd->bracount;
5540  }
5541 
5542  /* Come here from code above that handles a named recursion */
5543 
5544  HANDLE_RECURSION:
5545 
5546  previous = code;
5547  called = cd->start_code;
5548 
5549  /* When we are actually compiling, find the bracket that is being
5550  referenced. Temporarily end the regex in case it doesn't exist before
5551  this point. If we end up with a forward reference, first check that
5552  the bracket does occur later so we can give the error (and position)
5553  now. Then remember this forward reference in the workspace so it can
5554  be filled in at the end. */
5555 
5556  if (lengthptr == NULL)
5557  {
5558  *code = OP_END;
5559  if (recno != 0)
5560  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5561 
5562  /* Forward reference */
5563 
5564  if (called == NULL)
5565  {
5566  if (find_parens(cd, NULL, recno,
5567  (options & PCRE_EXTENDED) != 0, utf8) < 0)
5568  {
5569  *errorcodeptr = ERR15;
5570  goto FAILED;
5571  }
5572 
5573  /* Fudge the value of "called" so that when it is inserted as an
5574  offset below, what is actually inserted is the reference number
5575  of the group. */
5576 
5577  called = cd->start_code + recno;
5578  PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
5579  }
5580 
5581  /* If not a forward reference, and the subpattern is still open,
5582  this is a recursive call. We check to see if this is a left
5583  recursion that could loop for ever, and diagnose that case. */
5584 
5585  else if (GET(called, 1) == 0 &&
5586  could_be_empty(called, code, bcptr, utf8, cd))
5587  {
5588  *errorcodeptr = ERR40;
5589  goto FAILED;
5590  }
5591  }
5592 
5593  /* Insert the recursion/subroutine item, automatically wrapped inside
5594  "once" brackets. Set up a "previous group" length so that a
5595  subsequent quantifier will work. */
5596 
5597  *code = OP_ONCE;
5598  PUT(code, 1, 2 + 2*LINK_SIZE);
5599  code += 1 + LINK_SIZE;
5600 
5601  *code = OP_RECURSE;
5602  PUT(code, 1, (int)(called - cd->start_code));
5603  code += 1 + LINK_SIZE;
5604 
5605  *code = OP_KET;
5606  PUT(code, 1, 2 + 2*LINK_SIZE);
5607  code += 1 + LINK_SIZE;
5608 
5609  length_prevgroup = 3 + 3*LINK_SIZE;
5610  }
5611 
5612  /* Can't determine a first byte now */
5613 
5614  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5615  continue;
5616 
5617 
5618  /* ------------------------------------------------------------ */
5619  default: /* Other characters: check option setting */
5620  OTHER_CHAR_AFTER_QUERY:
5621  set = unset = 0;
5622  optset = &set;
5623 
5624  while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5625  {
5626  switch (*ptr++)
5627  {
5628  case CHAR_MINUS: optset = &unset; break;
5629 
5630  case CHAR_J: /* Record that it changed in the external options */
5631  *optset |= PCRE_DUPNAMES;
5633  break;
5634 
5635  case CHAR_i: *optset |= PCRE_CASELESS; break;
5636  case CHAR_m: *optset |= PCRE_MULTILINE; break;
5637  case CHAR_s: *optset |= PCRE_DOTALL; break;
5638  case CHAR_x: *optset |= PCRE_EXTENDED; break;
5639  case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5640  case CHAR_X: *optset |= PCRE_EXTRA; break;
5641 
5642  default: *errorcodeptr = ERR12;
5643  ptr--; /* Correct the offset */
5644  goto FAILED;
5645  }
5646  }
5647 
5648  /* Set up the changed option bits, but don't change anything yet. */
5649 
5650  newoptions = (options | set) & (~unset);
5651 
5652  /* If the options ended with ')' this is not the start of a nested
5653  group with option changes, so the options change at this level. If this
5654  item is right at the start of the pattern, the options can be
5655  abstracted and made external in the pre-compile phase, and ignored in
5656  the compile phase. This can be helpful when matching -- for instance in
5657  caseless checking of required bytes.
5658 
5659  If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5660  definitely *not* at the start of the pattern because something has been
5661  compiled. In the pre-compile phase, however, the code pointer can have
5662  that value after the start, because it gets reset as code is discarded
5663  during the pre-compile. However, this can happen only at top level - if
5664  we are within parentheses, the starting BRA will still be present. At
5665  any parenthesis level, the length value can be used to test if anything
5666  has been compiled at that level. Thus, a test for both these conditions
5667  is necessary to ensure we correctly detect the start of the pattern in
5668  both phases.
5669 
5670  If we are not at the pattern start, compile code to change the ims
5671  options if this setting actually changes any of them, and reset the
5672  greedy defaults and the case value for firstbyte and reqbyte. */
5673 
5674  if (*ptr == CHAR_RIGHT_PARENTHESIS)
5675  {
5676  if (code == cd->start_code + 1 + LINK_SIZE &&
5677  (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5678  {
5679  cd->external_options = newoptions;
5680  }
5681  else
5682  {
5683  if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5684  {
5685  *code++ = OP_OPT;
5686  *code++ = newoptions & PCRE_IMS;
5687  }
5688  greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5689  greedy_non_default = greedy_default ^ 1;
5690  req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5691  }
5692 
5693  /* Change options at this level, and pass them back for use
5694  in subsequent branches. When not at the start of the pattern, this
5695  information is also necessary so that a resetting item can be
5696  compiled at the end of a group (if we are in a group). */
5697 
5698  *optionsptr = options = newoptions;
5699  previous = NULL; /* This item can't be repeated */
5700  continue; /* It is complete */
5701  }
5702 
5703  /* If the options ended with ':' we are heading into a nested group
5704  with possible change of options. Such groups are non-capturing and are
5705  not assertions of any kind. All we need to do is skip over the ':';
5706  the newoptions value is handled below. */
5707 
5708  bravalue = OP_BRA;
5709  ptr++;
5710  } /* End of switch for character following (? */
5711  } /* End of (? handling */
5712 
5713  /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5714  is set, all unadorned brackets become non-capturing and behave like (?:...)
5715  brackets. */
5716 
5717  else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5718  {
5719  bravalue = OP_BRA;
5720  }
5721 
5722  /* Else we have a capturing group. */
5723 
5724  else
5725  {
5726  NUMBERED_GROUP:
5727  cd->bracount += 1;
5728  PUT2(code, 1+LINK_SIZE, cd->bracount);
5729  skipbytes = 2;
5730  }
5731 
5732  /* Process nested bracketed regex. Assertions may not be repeated, but
5733  other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5734  non-register variable in order to be able to pass its address because some
5735  compilers complain otherwise. Pass in a new setting for the ims options if
5736  they have changed. */
5737 
5738  previous = (bravalue >= OP_ONCE)? code : NULL;
5739  *code = bravalue;
5740  tempcode = code;
5741  tempreqvary = cd->req_varyopt; /* Save value before bracket */
5742  length_prevgroup = 0; /* Initialize for pre-compile phase */
5743 
5744  if (!compile_regex(
5745  newoptions, /* The complete new option state */
5746  options & PCRE_IMS, /* The previous ims option state */
5747  &tempcode, /* Where to put code (updated) */
5748  &ptr, /* Input pointer (updated) */
5749  errorcodeptr, /* Where to put an error message */
5750  (bravalue == OP_ASSERTBACK ||
5751  bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5752  reset_bracount, /* True if (?| group */
5753  skipbytes, /* Skip over bracket number */
5754  &subfirstbyte, /* For possible first char */
5755  &subreqbyte, /* For possible last char */
5756  bcptr, /* Current branch chain */
5757  cd, /* Tables block */
5758  (lengthptr == NULL)? NULL : /* Actual compile phase */
5759  &length_prevgroup /* Pre-compile phase */
5760  ))
5761  goto FAILED;
5762 
5763  /* At the end of compiling, code is still pointing to the start of the
5764  group, while tempcode has been updated to point past the end of the group
5765  and any option resetting that may follow it. The pattern pointer (ptr)
5766  is on the bracket. */
5767 
5768  /* If this is a conditional bracket, check that there are no more than
5769  two branches in the group, or just one if it's a DEFINE group. We do this
5770  in the real compile phase, not in the pre-pass, where the whole group may
5771  not be available. */
5772 
5773  if (bravalue == OP_COND && lengthptr == NULL)
5774  {
5775  uschar *tc = code;
5776  int condcount = 0;
5777 
5778  do {
5779  condcount++;
5780  tc += GET(tc,1);
5781  }
5782  while (*tc != OP_KET);
5783 
5784  /* A DEFINE group is never obeyed inline (the "condition" is always
5785  false). It must have only one branch. */
5786 
5787  if (code[LINK_SIZE+1] == OP_DEF)
5788  {
5789  if (condcount > 1)
5790  {
5791  *errorcodeptr = ERR54;
5792  goto FAILED;
5793  }
5794  bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5795  }
5796 
5797  /* A "normal" conditional group. If there is just one branch, we must not
5798  make use of its firstbyte or reqbyte, because this is equivalent to an
5799  empty second branch. */
5800 
5801  else
5802  {
5803  if (condcount > 2)
5804  {
5805  *errorcodeptr = ERR27;
5806  goto FAILED;
5807  }
5808  if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5809  }
5810  }
5811 
5812  /* Error if hit end of pattern */
5813 
5814  if (*ptr != CHAR_RIGHT_PARENTHESIS)
5815  {
5816  *errorcodeptr = ERR14;
5817  goto FAILED;
5818  }
5819 
5820  /* In the pre-compile phase, update the length by the length of the group,
5821  less the brackets at either end. Then reduce the compiled code to just a
5822  set of non-capturing brackets so that it doesn't use much memory if it is
5823  duplicated by a quantifier.*/
5824 
5825  if (lengthptr != NULL)
5826  {
5827  if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5828  {
5829  *errorcodeptr = ERR20;
5830  goto FAILED;
5831  }
5832  *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5833  *code++ = OP_BRA;
5834  PUTINC(code, 0, 1 + LINK_SIZE);
5835  *code++ = OP_KET;
5836  PUTINC(code, 0, 1 + LINK_SIZE);
5837  break; /* No need to waste time with special character handling */
5838  }
5839 
5840  /* Otherwise update the main code pointer to the end of the group. */
5841 
5842  code = tempcode;
5843 
5844  /* For a DEFINE group, required and first character settings are not
5845  relevant. */
5846 
5847  if (bravalue == OP_DEF) break;
5848 
5849  /* Handle updating of the required and first characters for other types of
5850  group. Update for normal brackets of all kinds, and conditions with two
5851  branches (see code above). If the bracket is followed by a quantifier with
5852  zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5853  zerofirstbyte outside the main loop so that they can be accessed for the
5854  back off. */
5855 
5856  zeroreqbyte = reqbyte;
5857  zerofirstbyte = firstbyte;
5858  groupsetfirstbyte = FALSE;
5859 
5860  if (bravalue >= OP_ONCE)
5861  {
5862  /* If we have not yet set a firstbyte in this branch, take it from the
5863  subpattern, remembering that it was set here so that a repeat of more
5864  than one can replicate it as reqbyte if necessary. If the subpattern has
5865  no firstbyte, set "none" for the whole branch. In both cases, a zero
5866  repeat forces firstbyte to "none". */
5867 
5868  if (firstbyte == REQ_UNSET)
5869  {
5870  if (subfirstbyte >= 0)
5871  {
5872  firstbyte = subfirstbyte;
5873  groupsetfirstbyte = TRUE;
5874  }
5875  else firstbyte = REQ_NONE;
5876  zerofirstbyte = REQ_NONE;
5877  }
5878 
5879  /* If firstbyte was previously set, convert the subpattern's firstbyte
5880  into reqbyte if there wasn't one, using the vary flag that was in
5881  existence beforehand. */
5882 
5883  else if (subfirstbyte >= 0 && subreqbyte < 0)
5884  subreqbyte = subfirstbyte | tempreqvary;
5885 
5886  /* If the subpattern set a required byte (or set a first byte that isn't
5887  really the first byte - see above), set it. */
5888 
5889  if (subreqbyte >= 0) reqbyte = subreqbyte;
5890  }
5891 
5892  /* For a forward assertion, we take the reqbyte, if set. This can be
5893  helpful if the pattern that follows the assertion doesn't set a different
5894  char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5895  for an assertion, however because it leads to incorrect effect for patterns
5896  such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5897  of a firstbyte. This is overcome by a scan at the end if there's no
5898  firstbyte, looking for an asserted first char. */
5899 
5900  else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5901  break; /* End of processing '(' */
5902 
5903 
5904  /* ===================================================================*/
5905  /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5906  are arranged to be the negation of the corresponding OP_values in the
5907  default case when PCRE_UCP is not set. For the back references, the values
5908  are ESC_REF plus the reference number. Only back references and those types
5909  that consume a character may be repeated. We can test for values between
5910  ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5911  ever created. */
5912 
5913  case CHAR_BACKSLASH:
5914  tempptr = ptr;
5915  c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5916  if (*errorcodeptr != 0) goto FAILED;
5917 
5918  if (c < 0)
5919  {
5920  if (-c == ESC_Q) /* Handle start of quoted string */
5921  {
5922  if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5923  ptr += 2; /* avoid empty string */
5924  else inescq = TRUE;
5925  continue;
5926  }
5927 
5928  if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5929 
5930  /* For metasequences that actually match a character, we disable the
5931  setting of a first character if it hasn't already been set. */
5932 
5933  if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5934  firstbyte = REQ_NONE;
5935 
5936  /* Set values to reset to if this is followed by a zero repeat. */
5937 
5938  zerofirstbyte = firstbyte;
5939  zeroreqbyte = reqbyte;
5940 
5941  /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5942  is a subroutine call by number (Oniguruma syntax). In fact, the value
5943  -ESC_g is returned only for these cases. So we don't need to check for <
5944  or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5945  -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5946  that is a synonym for a named back reference). */
5947 
5948  if (-c == ESC_g)
5949  {
5950  const uschar *p;
5951  save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5952  terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5954 
5955  /* These two statements stop the compiler from warning about possibly
5956  unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5957  fact, because we actually check for a number below, the paths that
5958  would actually be in error are never taken. */
5959 
5960  skipbytes = 0;
5961  reset_bracount = FALSE;
5962 
5963  /* Test for a name */
5964 
5965  if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5966  {
5967  BOOL isnumber = TRUE;
5968  for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5969  {
5970  if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5971  if ((cd->ctypes[*p] & ctype_word) == 0) break;
5972  }
5973  if (*p != terminator)
5974  {
5975  *errorcodeptr = ERR57;
5976  break;
5977  }
5978  if (isnumber)
5979  {
5980  ptr++;
5981  goto HANDLE_NUMERICAL_RECURSION;
5982  }
5983  is_recurse = TRUE;
5984  goto NAMED_REF_OR_RECURSE;
5985  }
5986 
5987  /* Test a signed number in angle brackets or quotes. */
5988 
5989  p = ptr + 2;
5990  while ((digitab[*p] & ctype_digit) != 0) p++;
5991  if (*p != terminator)
5992  {
5993  *errorcodeptr = ERR57;
5994  break;
5995  }
5996  ptr++;
5997  goto HANDLE_NUMERICAL_RECURSION;
5998  }
5999 
6000  /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6001  We also support \k{name} (.NET syntax) */
6002 
6003  if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
6004  ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
6005  {
6006  is_recurse = FALSE;
6007  terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6010  goto NAMED_REF_OR_RECURSE;
6011  }
6012 
6013  /* Back references are handled specially; must disable firstbyte if
6014  not set to cope with cases like (?=(\w+))\1: which would otherwise set
6015  ':' later. */
6016 
6017  if (-c >= ESC_REF)
6018  {
6019  open_capitem *oc;
6020  recno = -c - ESC_REF;
6021 
6022  HANDLE_REFERENCE: /* Come here from named backref handling */
6023  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
6024  previous = code;
6025  *code++ = OP_REF;
6026  PUT2INC(code, 0, recno);
6027  cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6028  if (recno > cd->top_backref) cd->top_backref = recno;
6029 
6030  /* Check to see if this back reference is recursive, that is, it
6031  is inside the group that it references. A flag is set so that the
6032  group can be made atomic. */
6033 
6034  for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6035  {
6036  if (oc->number == recno)
6037  {
6038  oc->flag = TRUE;
6039  break;
6040  }
6041  }
6042  }
6043 
6044  /* So are Unicode property matches, if supported. */
6045 
6046 #ifdef SUPPORT_UCP
6047  else if (-c == ESC_P || -c == ESC_p)
6048  {
6049  BOOL negated;
6050  int pdata;
6051  int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6052  if (ptype < 0) goto FAILED;
6053  previous = code;
6054  *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6055  *code++ = ptype;
6056  *code++ = pdata;
6057  }
6058 #else
6059 
6060  /* If Unicode properties are not supported, \X, \P, and \p are not
6061  allowed. */
6062 
6063  else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6064  {
6065  *errorcodeptr = ERR45;
6066  goto FAILED;
6067  }
6068 #endif
6069 
6070  /* For the rest (including \X when Unicode properties are supported), we
6071  can obtain the OP value by negating the escape value in the default
6072  situation when PCRE_UCP is not set. When it *is* set, we substitute
6073  Unicode property tests. */
6074 
6075  else
6076  {
6077 #ifdef SUPPORT_UCP
6078  if (-c >= ESC_DU && -c <= ESC_wu)
6079  {
6080  nestptr = ptr + 1; /* Where to resume */
6081  ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
6082  }
6083  else
6084 #endif
6085  {
6086  previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6087  *code++ = -c;
6088  }
6089  }
6090  continue;
6091  }
6092 
6093  /* We have a data character whose value is in c. In UTF-8 mode it may have
6094  a value > 127. We set its representation in the length/buffer, and then
6095  handle it as a data character. */
6096 
6097 #ifdef SUPPORT_UTF8
6098  if (utf8 && c > 127)
6099  mclength = _pcre_ord2utf8(c, mcbuffer);
6100  else
6101 #endif
6102 
6103  {
6104  mcbuffer[0] = c;
6105  mclength = 1;
6106  }
6107  goto ONE_CHAR;
6108 
6109 
6110  /* ===================================================================*/
6111  /* Handle a literal character. It is guaranteed not to be whitespace or #
6112  when the extended flag is set. If we are in UTF-8 mode, it may be a
6113  multi-byte literal character. */
6114 
6115  default:
6116  NORMAL_CHAR:
6117  mclength = 1;
6118  mcbuffer[0] = c;
6119 
6120 #ifdef SUPPORT_UTF8
6121  if (utf8 && c >= 0xc0)
6122  {
6123  while ((ptr[1] & 0xc0) == 0x80)
6124  mcbuffer[mclength++] = *(++ptr);
6125  }
6126 #endif
6127 
6128  /* At this point we have the character's bytes in mcbuffer, and the length
6129  in mclength. When not in UTF-8 mode, the length is always 1. */
6130 
6131  ONE_CHAR:
6132  previous = code;
6133  *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
6134  for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6135 
6136  /* Remember if \r or \n were seen */
6137 
6138  if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6140 
6141  /* Set the first and required bytes appropriately. If no previous first
6142  byte, set it from this character, but revert to none on a zero repeat.
6143  Otherwise, leave the firstbyte value alone, and don't change it on a zero
6144  repeat. */
6145 
6146  if (firstbyte == REQ_UNSET)
6147  {
6148  zerofirstbyte = REQ_NONE;
6149  zeroreqbyte = reqbyte;
6150 
6151  /* If the character is more than one byte long, we can set firstbyte
6152  only if it is not to be matched caselessly. */
6153 
6154  if (mclength == 1 || req_caseopt == 0)
6155  {
6156  firstbyte = mcbuffer[0] | req_caseopt;
6157  if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
6158  }
6159  else firstbyte = reqbyte = REQ_NONE;
6160  }
6161 
6162  /* firstbyte was previously set; we can set reqbyte only if the length is
6163  1 or the matching is caseful. */
6164 
6165  else
6166  {
6167  zerofirstbyte = firstbyte;
6168  zeroreqbyte = reqbyte;
6169  if (mclength == 1 || req_caseopt == 0)
6170  reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
6171  }
6172 
6173  break; /* End of literal character handling */
6174  }
6175  } /* end of big loop */
6176 
6177 
6178 /* Control never reaches here by falling through, only by a goto for all the
6179 error states. Pass back the position in the pattern so that it can be displayed
6180 to the user for diagnosing the error. */
6181 
6182 FAILED:
6183 *ptrptr = ptr;
6184 return FALSE;
6185 }
6186 
6187 
6188 
6189 
6190 /*************************************************
6191 * Compile sequence of alternatives *
6192 *************************************************/
6193 
6194 /* On entry, ptr is pointing past the bracket character, but on return it
6195 points to the closing bracket, or vertical bar, or end of string. The code
6196 variable is pointing at the byte into which the BRA operator has been stored.
6197 If the ims options are changed at the start (for a (?ims: group)) or during any
6198 branch, we need to insert an OP_OPT item at the start of every following branch
6199 to ensure they get set correctly at run time, and also pass the new options
6200 into every subsequent branch compile.
6201 
6202 This function is used during the pre-compile phase when we are trying to find
6203 out the amount of memory needed, as well as during the real compile phase. The
6204 value of lengthptr distinguishes the two phases.
6205 
6206 Arguments:
6207  options option bits, including any changes for this subpattern
6208  oldims previous settings of ims option bits
6209  codeptr -> the address of the current code pointer
6210  ptrptr -> the address of the current pattern pointer
6211  errorcodeptr -> pointer to error code variable
6212  lookbehind TRUE if this is a lookbehind assertion
6213  reset_bracount TRUE to reset the count for each branch
6214  skipbytes skip this many bytes at start (for brackets and OP_COND)
6215  firstbyteptr place to put the first required character, or a negative number
6216  reqbyteptr place to put the last required character, or a negative number
6217  bcptr pointer to the chain of currently open branches
6218  cd points to the data block with tables pointers etc.
6219  lengthptr NULL during the real compile phase
6220  points to length accumulator during pre-compile phase
6221 
6222 Returns: TRUE on success
6223 */
6224 
6225 static BOOL
6226 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
6227  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6228  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6229  int *lengthptr)
6230 {
6231 const uschar *ptr = *ptrptr;
6232 uschar *code = *codeptr;
6233 uschar *last_branch = code;
6234 uschar *start_bracket = code;
6235 uschar *reverse_count = NULL;
6236 open_capitem capitem;
6237 int capnumber = 0;
6238 int firstbyte, reqbyte;
6239 int branchfirstbyte, branchreqbyte;
6240 int length;
6241 int orig_bracount;
6242 int max_bracount;
6243 int old_external_options = cd->external_options;
6244 branch_chain bc;
6245 
6246 bc.outer = bcptr;
6247 bc.current_branch = code;
6248 
6249 firstbyte = reqbyte = REQ_UNSET;
6250 
6251 /* Accumulate the length for use in the pre-compile phase. Start with the
6252 length of the BRA and KET and any extra bytes that are required at the
6253 beginning. We accumulate in a local variable to save frequent testing of
6254 lengthptr for NULL. We cannot do this by looking at the value of code at the
6255 start and end of each alternative, because compiled items are discarded during
6256 the pre-compile phase so that the work space is not exceeded. */
6257 
6258 length = 2 + 2*LINK_SIZE + skipbytes;
6259 
6260 /* WARNING: If the above line is changed for any reason, you must also change
6261 the code that abstracts option settings at the start of the pattern and makes
6262 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
6263 pre-compile phase to find out whether anything has yet been compiled or not. */
6264 
6265 /* If this is a capturing subpattern, add to the chain of open capturing items
6266 so that we can detect them if (*ACCEPT) is encountered. This is also used to
6267 detect groups that contain recursive back references to themselves. */
6268 
6269 if (*code == OP_CBRA)
6270  {
6271  capnumber = GET2(code, 1 + LINK_SIZE);
6272  capitem.number = capnumber;
6273  capitem.next = cd->open_caps;
6274  capitem.flag = FALSE;
6275  cd->open_caps = &capitem;
6276  }
6277 
6278 /* Offset is set zero to mark that this bracket is still open */
6279 
6280 PUT(code, 1, 0);
6281 code += 1 + LINK_SIZE + skipbytes;
6282 
6283 /* Loop for each alternative branch */
6284 
6285 orig_bracount = max_bracount = cd->bracount;
6286 for (;;)
6287  {
6288  /* For a (?| group, reset the capturing bracket count so that each branch
6289  uses the same numbers. */
6290 
6291  if (reset_bracount) cd->bracount = orig_bracount;
6292 
6293  /* Handle a change of ims options at the start of the branch */
6294 
6295  if ((options & PCRE_IMS) != oldims)
6296  {
6297  *code++ = OP_OPT;
6298  *code++ = options & PCRE_IMS;
6299  length += 2;
6300  }
6301 
6302  /* Set up dummy OP_REVERSE if lookbehind assertion */
6303 
6304