Process Hacker
pcre_dfa_exec.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8 
9  Written by Philip Hazel
10  Copyright (c) 1997-2010 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16  * Redistributions of source code must retain the above copyright notice,
17  this list of conditions and the following disclaimer.
18 
19  * Redistributions in binary form must reproduce the above copyright
20  notice, this list of conditions and the following disclaimer in the
21  documentation and/or other materials provided with the distribution.
22 
23  * Neither the name of the University of Cambridge nor the names of its
24  contributors may be used to endorse or promote products derived from
25  this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 
76 #define HAVE_CONFIG_H
77 #ifdef HAVE_CONFIG_H
78 #include "config.h"
79 #endif
80 
81 #define NLBLOCK md /* Block containing newline information */
82 #define PSSTART start_subject /* Field containing processed string start */
83 #define PSEND end_subject /* Field containing processed string end */
84 
85 #include "pcre_internal.h"
86 
87 
88 /* For use to indent debugging output */
89 
90 #define SP " "
91 
92 
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96 
97 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
98 into others, under special conditions. A gap of 20 between the blocks should be
99 enough. The resulting opcodes don't have to be less than 256 because they are
100 never stored, so we push them well clear of the normal opcodes. */
101 
102 #define OP_PROP_EXTRA 300
103 #define OP_EXTUNI_EXTRA 320
104 #define OP_ANYNL_EXTRA 340
105 #define OP_HSPACE_EXTRA 360
106 #define OP_VSPACE_EXTRA 380
107 
108 
109 /* This table identifies those opcodes that are followed immediately by a
110 character that is to be tested in some way. This makes it possible to
111 centralize the loading of these characters. In the case of Type * etc, the
112 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
113 small value. Non-zero values in the table are the offsets from the opcode where
114 the character is to be found. ***NOTE*** If the start of this table is
115 modified, the three tables that follow must also be modified. */
116 
117 static const uschar coptable[] = {
118  0, /* End */
119  0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
120  0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
121  0, 0, 0, /* Any, AllAny, Anybyte */
122  0, 0, /* \P, \p */
123  0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
124  0, /* \X */
125  0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
126  1, /* Char */
127  1, /* Charnc */
128  1, /* not */
129  /* Positive single-char repeats */
130  1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131  3, 3, 3, /* upto, minupto, exact */
132  1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133  /* Negative single-char repeats - only for chars < 256 */
134  1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
135  3, 3, 3, /* NOT upto, minupto, exact */
136  1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
137  /* Positive type repeats */
138  1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
139  3, 3, 3, /* Type upto, minupto, exact */
140  1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
141  /* Character class & ref repeats */
142  0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
143  0, 0, /* CRRANGE, CRMINRANGE */
144  0, /* CLASS */
145  0, /* NCLASS */
146  0, /* XCLASS - variable length */
147  0, /* REF */
148  0, /* RECURSE */
149  0, /* CALLOUT */
150  0, /* Alt */
151  0, /* Ket */
152  0, /* KetRmax */
153  0, /* KetRmin */
154  0, /* Assert */
155  0, /* Assert not */
156  0, /* Assert behind */
157  0, /* Assert behind not */
158  0, /* Reverse */
159  0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
160  0, 0, 0, /* SBRA, SCBRA, SCOND */
161  0, 0, /* CREF, NCREF */
162  0, 0, /* RREF, NRREF */
163  0, /* DEF */
164  0, 0, /* BRAZERO, BRAMINZERO */
165  0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
166  0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
167  0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
168 };
169 
170 /* This table identifies those opcodes that inspect a character. It is used to
171 remember the fact that a character could have been inspected when the end of
172 the subject is reached. ***NOTE*** If the start of this table is modified, the
173 two tables that follow must also be modified. */
174 
175 static const uschar poptable[] = {
176  0, /* End */
177  0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
178  1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
179  1, 1, 1, /* Any, AllAny, Anybyte */
180  1, 1, /* \P, \p */
181  1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
182  1, /* \X */
183  0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
184  1, /* Char */
185  1, /* Charnc */
186  1, /* not */
187  /* Positive single-char repeats */
188  1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
189  1, 1, 1, /* upto, minupto, exact */
190  1, 1, 1, 1, /* *+, ++, ?+, upto+ */
191  /* Negative single-char repeats - only for chars < 256 */
192  1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
193  1, 1, 1, /* NOT upto, minupto, exact */
194  1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
195  /* Positive type repeats */
196  1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
197  1, 1, 1, /* Type upto, minupto, exact */
198  1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
199  /* Character class & ref repeats */
200  1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
201  1, 1, /* CRRANGE, CRMINRANGE */
202  1, /* CLASS */
203  1, /* NCLASS */
204  1, /* XCLASS - variable length */
205  0, /* REF */
206  0, /* RECURSE */
207  0, /* CALLOUT */
208  0, /* Alt */
209  0, /* Ket */
210  0, /* KetRmax */
211  0, /* KetRmin */
212  0, /* Assert */
213  0, /* Assert not */
214  0, /* Assert behind */
215  0, /* Assert behind not */
216  0, /* Reverse */
217  0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
218  0, 0, 0, /* SBRA, SCBRA, SCOND */
219  0, 0, /* CREF, NCREF */
220  0, 0, /* RREF, NRREF */
221  0, /* DEF */
222  0, 0, /* BRAZERO, BRAMINZERO */
223  0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
224  0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
225  0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
226 };
227 
228 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
229 and \w */
230 
231 static const uschar toptable1[] = {
232  0, 0, 0, 0, 0, 0,
236  0, 0 /* OP_ANY, OP_ALLANY */
237 };
238 
239 static const uschar toptable2[] = {
240  0, 0, 0, 0, 0, 0,
241  ctype_digit, 0,
242  ctype_space, 0,
243  ctype_word, 0,
244  1, 1 /* OP_ANY, OP_ALLANY */
245 };
246 
247 
248 /* Structure for holding data about a particular state, which is in effect the
249 current data for an active path through the match tree. It must consist
250 entirely of ints because the working vector we are passed, and which we put
251 these structures in, is a vector of ints. */
252 
/* Record describing one active path through the match tree. Every member is
a plain int because arrays of these records are laid directly over the int
workspace vector supplied by the caller. */

struct stateblock {
  int offset;                     /* Opcode offset, relative to the start of the code */
  int count;                      /* Repeat counter */
  int ims;                        /* ims flag bits belonging to this state */
  int data;                       /* Extra value, used by only some states */
};

typedef struct stateblock stateblock;

/* Number of workspace ints occupied by one stateblock. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
261 
262 
263 #ifdef PCRE_DEBUG
264 /*************************************************
265 * Print character string *
266 *************************************************/
267 
268 /* Character string printing function for debugging.
269 
270 Arguments:
271  p points to string
272  length number of bytes
273  f where to print
274 
275 Returns: nothing
276 */
277 
/* Debugging aid: write a byte string to a stream, showing every byte that
isprint() rejects as a \xNN hexadecimal escape.

Arguments:
  p           points to the first byte
  length      number of bytes to write; nothing is written if it is <= 0
  f           destination stream

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
int i;
for (i = 0; i < length; i++)
  {
  const int ch = p[i];
  fprintf(f, isprint(ch)? "%c" : "\\x%02x", ch);
  }
}
290 #endif
291 
292 
293 
294 /*************************************************
295 * Execute a Regular Expression - DFA engine *
296 *************************************************/
297 
298 /* This internal function applies a compiled pattern to a subject string,
299 starting at a given point, using a DFA engine. This function is called from the
300 external one, possibly multiple times if the pattern is not anchored. The
301 function calls itself recursively for some kinds of subpattern.
302 
303 Arguments:
304  md the match_data block with fixed information
305  this_start_code the opening bracket of this subexpression's code
306  current_subject where we currently are in the subject string
307  start_offset start offset in the subject string
308  offsets vector to contain the matching string offsets
309  offsetcount size of same
310  workspace vector of workspace
311  wscount size of same
312  ims the current ims flags
313  rlevel function call recursion level
314  recursing regex recursive call level
315 
316 Returns: > 0 => number of match offset pairs placed in offsets
317  = 0 => offsets overflowed; longest matches are present
318  -1 => failed to match
319  < -1 => some kind of unexpected problem
320 
321 The following macros are used for adding states to the two state vectors (one
322 for the current character, one for the following character). */
323 
/* Append a state to the active list: x is the opcode offset, y the repeat
count, and the ims field is taken from the enclosing function's current ims.
Uses active_count, wscount, next_active_state, ims and rlevel from the
enclosing function, and makes that function return PCRE_ERROR_DFA_WSSIZE when
the list is already full. Must be used as a statement ending in a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
334 
/* Append a state to the active list, also filling its data field: x is the
opcode offset, y the repeat count, z the data value; the ims field is taken
from the enclosing function's current ims. Uses active_count, wscount,
next_active_state, ims and rlevel from the enclosing function, and makes that
function return PCRE_ERROR_DFA_WSSIZE when the list is already full. Must be
used as a statement ending in a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
346 
/* Append a state to the new list (the states for the following character):
x is the opcode offset, y the repeat count, and the ims field is taken from
the enclosing function's current ims. Uses new_count, wscount,
next_new_state, ims and rlevel from the enclosing function, and makes that
function return PCRE_ERROR_DFA_WSSIZE when the list is already full. Must be
used as a statement ending in a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
357 
/* Append a state to the new list (the states for the following character),
also filling its data field: x is the opcode offset, y the repeat count, z
the data value; the ims field is taken from the enclosing function's current
ims. Uses new_count, wscount, next_new_state, ims and rlevel from the
enclosing function, and makes that function return PCRE_ERROR_DFA_WSSIZE when
the list is already full. Must be used as a statement ending in a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
369 
370 /* And now, here is the code */
371 
372 static int
373 internal_dfa_exec(
374  dfa_match_data *md,
375  const uschar *this_start_code,
376  const uschar *current_subject,
377  int start_offset,
378  int *offsets,
379  int offsetcount,
380  int *workspace,
381  int wscount,
382  int ims,
383  int rlevel,
384  int recursing)
385 {
386 stateblock *active_states, *new_states, *temp_states;
387 stateblock *next_active_state, *next_new_state;
388 
389 const uschar *ctypes, *lcc, *fcc;
390 const uschar *ptr;
391 const uschar *end_code, *first_op;
392 
393 int active_count, new_count, match_count;
394 
395 /* Some fields in the md block are frequently referenced, so we load them into
396 independent variables in the hope that this will perform better. */
397 
398 const uschar *start_subject = md->start_subject;
399 const uschar *end_subject = md->end_subject;
400 const uschar *start_code = md->start_code;
401 
402 #ifdef SUPPORT_UTF8
403 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
404 #else
405 BOOL utf8 = FALSE;
406 #endif
407 
408 rlevel++;
409 offsetcount &= (-2);
410 
411 wscount -= 2;
412 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
413  (2 * INTS_PER_STATEBLOCK);
414 
415 DPRINTF(("\n%.*s---------------------\n"
416  "%.*sCall to internal_dfa_exec f=%d r=%d\n",
417  rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
418 
419 ctypes = md->tables + ctypes_offset;
420 lcc = md->tables + lcc_offset;
421 fcc = md->tables + fcc_offset;
422 
423 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
424 
425 active_states = (stateblock *)(workspace + 2);
426 next_new_state = new_states = active_states + wscount;
427 new_count = 0;
428 
429 first_op = this_start_code + 1 + LINK_SIZE +
430  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
431 
432 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
433 the alternative states onto the list, and find out where the end is. This
434 makes it possible to use this function recursively, when we want to stop at a
435 matching internal ket rather than at the end.
436 
437 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
438 a backward assertion. In that case, we have to find out the maximum amount to
439 move back, and set up each alternative appropriately. */
440 
441 if (*first_op == OP_REVERSE)
442  {
443  int max_back = 0;
444  int gone_back;
445 
446  end_code = this_start_code;
447  do
448  {
449  int back = GET(end_code, 2+LINK_SIZE);
450  if (back > max_back) max_back = back;
451  end_code += GET(end_code, 1);
452  }
453  while (*end_code == OP_ALT);
454 
455  /* If we can't go back the amount required for the longest lookbehind
456  pattern, go back as far as we can; some alternatives may still be viable. */
457 
458 #ifdef SUPPORT_UTF8
459  /* In character mode we have to step back character by character */
460 
461  if (utf8)
462  {
463  for (gone_back = 0; gone_back < max_back; gone_back++)
464  {
465  if (current_subject <= start_subject) break;
466  current_subject--;
467  while (current_subject > start_subject &&
468  (*current_subject & 0xc0) == 0x80)
469  current_subject--;
470  }
471  }
472  else
473 #endif
474 
475  /* In byte-mode we can do this quickly. */
476 
477  {
478  gone_back = (current_subject - max_back < start_subject)?
479  (int)(current_subject - start_subject) : max_back;
480  current_subject -= gone_back;
481  }
482 
483  /* Save the earliest consulted character */
484 
485  if (current_subject < md->start_used_ptr)
486  md->start_used_ptr = current_subject;
487 
488  /* Now we can process the individual branches. */
489 
490  end_code = this_start_code;
491  do
492  {
493  int back = GET(end_code, 2+LINK_SIZE);
494  if (back <= gone_back)
495  {
496  int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
497  ADD_NEW_DATA(-bstate, 0, gone_back - back);
498  }
499  end_code += GET(end_code, 1);
500  }
501  while (*end_code == OP_ALT);
502  }
503 
504 /* This is the code for a "normal" subpattern (not a backward assertion). The
505 start of a whole pattern is always one of these. If we are at the top level,
506 we may be asked to restart matching from the same point that we reached for a
507 previous partial match. We still have to scan through the top-level branches to
508 find the end state. */
509 
510 else
511  {
512  end_code = this_start_code;
513 
514  /* Restarting */
515 
516  if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
517  {
518  do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
519  new_count = workspace[1];
520  if (!workspace[0])
521  memcpy(new_states, active_states, new_count * sizeof(stateblock));
522  }
523 
524  /* Not restarting */
525 
526  else
527  {
528  int length = 1 + LINK_SIZE +
529  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
530  do
531  {
532  ADD_NEW((int)(end_code - start_code + length), 0);
533  end_code += GET(end_code, 1);
534  length = 1 + LINK_SIZE;
535  }
536  while (*end_code == OP_ALT);
537  }
538  }
539 
540 workspace[0] = 0; /* Bit indicating which vector is current */
541 
542 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
543 
544 /* Loop for scanning the subject */
545 
546 ptr = current_subject;
547 for (;;)
548  {
549  int i, j;
550  int clen, dlen;
551  unsigned int c, d;
552  int forced_fail = 0;
553  BOOL could_continue = FALSE;
554 
555  /* Make the new state list into the active state list and empty the
556  new state list. */
557 
558  temp_states = active_states;
559  active_states = new_states;
560  new_states = temp_states;
561  active_count = new_count;
562  new_count = 0;
563 
564  workspace[0] ^= 1; /* Remember for the restarting feature */
565  workspace[1] = active_count;
566 
567 #ifdef PCRE_DEBUG
568  printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
569  pchars((uschar *)ptr, strlen((char *)ptr), stdout);
570  printf("\"\n");
571 
572  printf("%.*sActive states: ", rlevel*2-2, SP);
573  for (i = 0; i < active_count; i++)
574  printf("%d/%d ", active_states[i].offset, active_states[i].count);
575  printf("\n");
576 #endif
577 
578  /* Set the pointers for adding new states */
579 
580  next_active_state = active_states + active_count;
581  next_new_state = new_states;
582 
583  /* Load the current character from the subject outside the loop, as many
584  different states may want to look at it, and we assume that at least one
585  will. */
586 
587  if (ptr < end_subject)
588  {
589  clen = 1; /* Number of bytes in the character */
590 #ifdef SUPPORT_UTF8
591  if (utf8) { GETCHARLEN(c, ptr, clen); } else
592 #endif /* SUPPORT_UTF8 */
593  c = *ptr;
594  }
595  else
596  {
597  clen = 0; /* This indicates the end of the subject */
598  c = NOTACHAR; /* This value should never actually be used */
599  }
600 
601  /* Scan up the active states and act on each one. The result of an action
602  may be to add more states to the currently active list (e.g. on hitting a
603  parenthesis) or it may be to put states on the new list, for considering
604  when we move the character pointer on. */
605 
606  for (i = 0; i < active_count; i++)
607  {
608  stateblock *current_state = active_states + i;
609  const uschar *code;
610  int state_offset = current_state->offset;
611  int count, codevalue, rrc;
612 
613 #ifdef PCRE_DEBUG
614  printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
615  if (clen == 0) printf("EOL\n");
616  else if (c > 32 && c < 127) printf("'%c'\n", c);
617  else printf("0x%02x\n", c);
618 #endif
619 
620  /* This variable is referred to implicitly in the ADD_xxx macros. */
621 
622  ims = current_state->ims;
623 
624  /* A negative offset is a special case meaning "hold off going to this
625  (negated) state until the number of characters in the data field have
626  been skipped". */
627 
628  if (state_offset < 0)
629  {
630  if (current_state->data > 0)
631  {
632  DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
633  ADD_NEW_DATA(state_offset, current_state->count,
634  current_state->data - 1);
635  continue;
636  }
637  else
638  {
639  current_state->offset = state_offset = -state_offset;
640  }
641  }
642 
643  /* Check for a duplicate state with the same count, and skip if found.
644  See the note at the head of this module about the possibility of improving
645  performance here. */
646 
647  for (j = 0; j < i; j++)
648  {
649  if (active_states[j].offset == state_offset &&
650  active_states[j].count == current_state->count)
651  {
652  DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
653  goto NEXT_ACTIVE_STATE;
654  }
655  }
656 
657  /* The state offset is the offset to the opcode */
658 
659  code = start_code + state_offset;
660  codevalue = *code;
661 
662  /* If this opcode inspects a character, but we are at the end of the
663  subject, remember the fact for use when testing for a partial match. */
664 
665  if (clen == 0 && poptable[codevalue] != 0)
666  could_continue = TRUE;
667 
668  /* If this opcode is followed by an inline character, load it. It is
669  tempting to test for the presence of a subject character here, but that
670  is wrong, because sometimes zero repetitions of the subject are
671  permitted.
672 
673  We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
674  argument that is not a data character - but is always one byte long. We
675  have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
676  this case. To keep the other cases fast, convert these ones to new opcodes.
677  */
678 
679  if (coptable[codevalue] > 0)
680  {
681  dlen = 1;
682 #ifdef SUPPORT_UTF8
683  if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
684 #endif /* SUPPORT_UTF8 */
685  d = code[coptable[codevalue]];
686  if (codevalue >= OP_TYPESTAR)
687  {
688  switch(d)
689  {
690  case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
691  case OP_NOTPROP:
692  case OP_PROP: codevalue += OP_PROP_EXTRA; break;
693  case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
694  case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
695  case OP_NOT_HSPACE:
696  case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
697  case OP_NOT_VSPACE:
698  case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
699  default: break;
700  }
701  }
702  }
703  else
704  {
705  dlen = 0; /* Not strictly necessary, but compilers moan */
706  d = NOTACHAR; /* if these variables are not set. */
707  }
708 
709 
710  /* Now process the individual opcodes */
711 
712  switch (codevalue)
713  {
714 /* ========================================================================== */
715  /* These cases are never obeyed. This is a fudge that causes a compile-
716  time error if the vectors coptable or poptable, which are indexed by
717  opcode, are not the correct length. It seems to be the only way to do
718  such a check at compile time, as the sizeof() operator does not work
719  in the C preprocessor. */
720 
721  case OP_TABLE_LENGTH:
722  case OP_TABLE_LENGTH +
723  ((sizeof(coptable) == OP_TABLE_LENGTH) &&
724  (sizeof(poptable) == OP_TABLE_LENGTH)):
725  break;
726 
727 /* ========================================================================== */
728  /* Reached a closing bracket. If not at the end of the pattern, carry
729  on with the next opcode. Otherwise, unless we have an empty string and
730  PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
731  start of the subject, save the match data, shifting up all previous
732  matches so we always have the longest first. */
733 
734  case OP_KET:
735  case OP_KETRMIN:
736  case OP_KETRMAX:
737  if (code != end_code)
738  {
739  ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
740  if (codevalue != OP_KET)
741  {
742  ADD_ACTIVE(state_offset - GET(code, 1), 0);
743  }
744  }
745  else
746  {
747  if (ptr > current_subject ||
748  ((md->moptions & PCRE_NOTEMPTY) == 0 &&
749  ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
750  current_subject > start_subject + md->start_offset)))
751  {
752  if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
753  else if (match_count > 0 && ++match_count * 2 >= offsetcount)
754  match_count = 0;
755  count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
756  if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
757  if (offsetcount >= 2)
758  {
759  offsets[0] = (int)(current_subject - start_subject);
760  offsets[1] = (int)(ptr - start_subject);
761  DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
762  offsets[1] - offsets[0], current_subject));
763  }
764  if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
765  {
766  DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
767  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
768  match_count, rlevel*2-2, SP));
769  return match_count;
770  }
771  }
772  }
773  break;
774 
775 /* ========================================================================== */
776  /* These opcodes add to the current list of states without looking
777  at the current character. */
778 
779  /*-----------------------------------------------------------------*/
780  case OP_ALT:
781  do { code += GET(code, 1); } while (*code == OP_ALT);
782  ADD_ACTIVE((int)(code - start_code), 0);
783  break;
784 
785  /*-----------------------------------------------------------------*/
786  case OP_BRA:
787  case OP_SBRA:
788  do
789  {
790  ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
791  code += GET(code, 1);
792  }
793  while (*code == OP_ALT);
794  break;
795 
796  /*-----------------------------------------------------------------*/
797  case OP_CBRA:
798  case OP_SCBRA:
799  ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
800  code += GET(code, 1);
801  while (*code == OP_ALT)
802  {
803  ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
804  code += GET(code, 1);
805  }
806  break;
807 
808  /*-----------------------------------------------------------------*/
809  case OP_BRAZERO:
810  case OP_BRAMINZERO:
811  ADD_ACTIVE(state_offset + 1, 0);
812  code += 1 + GET(code, 2);
813  while (*code == OP_ALT) code += GET(code, 1);
814  ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815  break;
816 
817  /*-----------------------------------------------------------------*/
818  case OP_SKIPZERO:
819  code += 1 + GET(code, 2);
820  while (*code == OP_ALT) code += GET(code, 1);
821  ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
822  break;
823 
824  /*-----------------------------------------------------------------*/
825  case OP_CIRC:
826  if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
827  ((ims & PCRE_MULTILINE) != 0 &&
828  ptr != end_subject &&
829  WAS_NEWLINE(ptr)))
830  { ADD_ACTIVE(state_offset + 1, 0); }
831  break;
832 
833  /*-----------------------------------------------------------------*/
834  case OP_EOD:
835  if (ptr >= end_subject)
836  {
837  if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
838  could_continue = TRUE;
839  else { ADD_ACTIVE(state_offset + 1, 0); }
840  }
841  break;
842 
843  /*-----------------------------------------------------------------*/
844  case OP_OPT:
845  ims = code[1];
846  ADD_ACTIVE(state_offset + 2, 0);
847  break;
848 
849  /*-----------------------------------------------------------------*/
850  case OP_SOD:
851  if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
852  break;
853 
854  /*-----------------------------------------------------------------*/
855  case OP_SOM:
856  if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
857  break;
858 
859 
860 /* ========================================================================== */
861  /* These opcodes inspect the next subject character, and sometimes
862  the previous one as well, but do not have an argument. The variable
863  clen contains the length of the current character and is zero if we are
864  at the end of the subject. */
865 
866  /*-----------------------------------------------------------------*/
867  case OP_ANY:
868  if (clen > 0 && !IS_NEWLINE(ptr))
869  { ADD_NEW(state_offset + 1, 0); }
870  break;
871 
872  /*-----------------------------------------------------------------*/
873  case OP_ALLANY:
874  if (clen > 0)
875  { ADD_NEW(state_offset + 1, 0); }
876  break;
877 
878  /*-----------------------------------------------------------------*/
879  case OP_EODN:
880  if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
881  could_continue = TRUE;
882  else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
883  { ADD_ACTIVE(state_offset + 1, 0); }
884  break;
885 
886  /*-----------------------------------------------------------------*/
887  case OP_DOLL:
888  if ((md->moptions & PCRE_NOTEOL) == 0)
889  {
890  if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
891  could_continue = TRUE;
892  else if (clen == 0 ||
893  ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
894  ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
895  ))
896  { ADD_ACTIVE(state_offset + 1, 0); }
897  }
898  else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
899  { ADD_ACTIVE(state_offset + 1, 0); }
900  break;
901 
902  /*-----------------------------------------------------------------*/
903 
904  case OP_DIGIT:
905  case OP_WHITESPACE:
906  case OP_WORDCHAR:
907  if (clen > 0 && c < 256 &&
908  ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
909  { ADD_NEW(state_offset + 1, 0); }
910  break;
911 
912  /*-----------------------------------------------------------------*/
913  case OP_NOT_DIGIT:
914  case OP_NOT_WHITESPACE:
915  case OP_NOT_WORDCHAR:
916  if (clen > 0 && (c >= 256 ||
917  ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
918  { ADD_NEW(state_offset + 1, 0); }
919  break;
920 
921  /*-----------------------------------------------------------------*/
922  case OP_WORD_BOUNDARY:
924  {
925  int left_word, right_word;
926 
927  if (ptr > start_subject)
928  {
929  const uschar *temp = ptr - 1;
930  if (temp < md->start_used_ptr) md->start_used_ptr = temp;
931 #ifdef SUPPORT_UTF8
932  if (utf8) BACKCHAR(temp);
933 #endif
934  GETCHARTEST(d, temp);
935 #ifdef SUPPORT_UCP
936  if ((md->poptions & PCRE_UCP) != 0)
937  {
938  if (d == '_') left_word = TRUE; else
939  {
940  int cat = UCD_CATEGORY(d);
941  left_word = (cat == ucp_L || cat == ucp_N);
942  }
943  }
944  else
945 #endif
946  left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
947  }
948  else left_word = FALSE;
949 
950  if (clen > 0)
951  {
952 #ifdef SUPPORT_UCP
953  if ((md->poptions & PCRE_UCP) != 0)
954  {
955  if (c == '_') right_word = TRUE; else
956  {
957  int cat = UCD_CATEGORY(c);
958  right_word = (cat == ucp_L || cat == ucp_N);
959  }
960  }
961  else
962 #endif
963  right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
964  }
965  else right_word = FALSE;
966 
967  if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
968  { ADD_ACTIVE(state_offset + 1, 0); }
969  }
970  break;
971 
972 
973  /*-----------------------------------------------------------------*/
974  /* Check the next character by Unicode property. We will get here only
975  if the support is in the binary; otherwise a compile-time error occurs.
976  */
977 
978 #ifdef SUPPORT_UCP
979  case OP_PROP:
980  case OP_NOTPROP:
981  if (clen > 0)
982  {
983  BOOL OK;
984  const ucd_record * prop = GET_UCD(c);
985  switch(code[1])
986  {
987  case PT_ANY:
988  OK = TRUE;
989  break;
990 
991  case PT_LAMP:
992  OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
993  prop->chartype == ucp_Lt;
994  break;
995 
996  case PT_GC:
997  OK = _pcre_ucp_gentype[prop->chartype] == code[2];
998  break;
999 
1000  case PT_PC:
1001  OK = prop->chartype == code[2];
1002  break;
1003 
1004  case PT_SC:
1005  OK = prop->script == code[2];
1006  break;
1007 
1008  /* These are specials for combination cases. */
1009 
1010  case PT_ALNUM:
1011  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1012  _pcre_ucp_gentype[prop->chartype] == ucp_N;
1013  break;
1014 
1015  case PT_SPACE: /* Perl space */
1016  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1017  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1018  break;
1019 
1020  case PT_PXSPACE: /* POSIX space */
1021  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1022  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1023  c == CHAR_FF || c == CHAR_CR;
1024  break;
1025 
1026  case PT_WORD:
1027  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1028  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1029  c == CHAR_UNDERSCORE;
1030  break;
1031 
1032  /* Should never occur, but keep compilers from grumbling. */
1033 
1034  default:
1035  OK = codevalue != OP_PROP;
1036  break;
1037  }
1038 
1039  if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1040  }
1041  break;
1042 #endif
1043 
1044 
1045 
1046 /* ========================================================================== */
1047  /* These opcodes likewise inspect the subject character, but have an
1048  argument that is not a data character. It is one of these opcodes:
1049  OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1050  OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1051 
1052  case OP_TYPEPLUS:
1053  case OP_TYPEMINPLUS:
1054  case OP_TYPEPOSPLUS:
1055  count = current_state->count; /* Already matched */
1056  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1057  if (clen > 0)
1058  {
1059  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1060  (c < 256 &&
1061  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1062  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1063  {
1064  if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1065  {
1066  active_count--; /* Remove non-match possibility */
1067  next_active_state--;
1068  }
1069  count++;
1070  ADD_NEW(state_offset, count);
1071  }
1072  }
1073  break;
1074 
1075  /*-----------------------------------------------------------------*/
1076  case OP_TYPEQUERY:
1077  case OP_TYPEMINQUERY:
1078  case OP_TYPEPOSQUERY:
1079  ADD_ACTIVE(state_offset + 2, 0);
1080  if (clen > 0)
1081  {
1082  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1083  (c < 256 &&
1084  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1085  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1086  {
1087  if (codevalue == OP_TYPEPOSQUERY)
1088  {
1089  active_count--; /* Remove non-match possibility */
1090  next_active_state--;
1091  }
1092  ADD_NEW(state_offset + 2, 0);
1093  }
1094  }
1095  break;
1096 
1097  /*-----------------------------------------------------------------*/
1098  case OP_TYPESTAR:
1099  case OP_TYPEMINSTAR:
1100  case OP_TYPEPOSSTAR:
1101  ADD_ACTIVE(state_offset + 2, 0);
1102  if (clen > 0)
1103  {
1104  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1105  (c < 256 &&
1106  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1107  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1108  {
1109  if (codevalue == OP_TYPEPOSSTAR)
1110  {
1111  active_count--; /* Remove non-match possibility */
1112  next_active_state--;
1113  }
1114  ADD_NEW(state_offset, 0);
1115  }
1116  }
1117  break;
1118 
1119  /*-----------------------------------------------------------------*/
1120  case OP_TYPEEXACT:
1121  count = current_state->count; /* Number already matched */
1122  if (clen > 0)
1123  {
1124  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1125  (c < 256 &&
1126  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1127  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1128  {
1129  if (++count >= GET2(code, 1))
1130  { ADD_NEW(state_offset + 4, 0); }
1131  else
1132  { ADD_NEW(state_offset, count); }
1133  }
1134  }
1135  break;
1136 
1137  /*-----------------------------------------------------------------*/
1138  case OP_TYPEUPTO:
1139  case OP_TYPEMINUPTO:
1140  case OP_TYPEPOSUPTO:
1141  ADD_ACTIVE(state_offset + 4, 0);
1142  count = current_state->count; /* Number already matched */
1143  if (clen > 0)
1144  {
1145  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1146  (c < 256 &&
1147  (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1148  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1149  {
1150  if (codevalue == OP_TYPEPOSUPTO)
1151  {
1152  active_count--; /* Remove non-match possibility */
1153  next_active_state--;
1154  }
1155  if (++count >= GET2(code, 1))
1156  { ADD_NEW(state_offset + 4, 0); }
1157  else
1158  { ADD_NEW(state_offset, count); }
1159  }
1160  }
1161  break;
1162 
1163 /* ========================================================================== */
1164  /* These are virtual opcodes that are used when something like
1165  OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1166  argument. It keeps the code above fast for the other cases. The argument
1167  is in the d variable. */
1168 
1169 #ifdef SUPPORT_UCP
1170  case OP_PROP_EXTRA + OP_TYPEPLUS:
1173  count = current_state->count; /* Already matched */
1174  if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1175  if (clen > 0)
1176  {
1177  BOOL OK;
1178  const ucd_record * prop = GET_UCD(c);
1179  switch(code[2])
1180  {
1181  case PT_ANY:
1182  OK = TRUE;
1183  break;
1184 
1185  case PT_LAMP:
1186  OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1187  prop->chartype == ucp_Lt;
1188  break;
1189 
1190  case PT_GC:
1191  OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1192  break;
1193 
1194  case PT_PC:
1195  OK = prop->chartype == code[3];
1196  break;
1197 
1198  case PT_SC:
1199  OK = prop->script == code[3];
1200  break;
1201 
1202  /* These are specials for combination cases. */
1203 
1204  case PT_ALNUM:
1205  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1206  _pcre_ucp_gentype[prop->chartype] == ucp_N;
1207  break;
1208 
1209  case PT_SPACE: /* Perl space */
1210  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1211  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1212  break;
1213 
1214  case PT_PXSPACE: /* POSIX space */
1215  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1216  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1217  c == CHAR_FF || c == CHAR_CR;
1218  break;
1219 
1220  case PT_WORD:
1221  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1222  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1223  c == CHAR_UNDERSCORE;
1224  break;
1225 
1226  /* Should never occur, but keep compilers from grumbling. */
1227 
1228  default:
1229  OK = codevalue != OP_PROP;
1230  break;
1231  }
1232 
1233  if (OK == (d == OP_PROP))
1234  {
1235  if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1236  {
1237  active_count--; /* Remove non-match possibility */
1238  next_active_state--;
1239  }
1240  count++;
1241  ADD_NEW(state_offset, count);
1242  }
1243  }
1244  break;
1245 
1246  /*-----------------------------------------------------------------*/
1250  count = current_state->count; /* Already matched */
1251  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1252  if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1253  {
1254  const uschar *nptr = ptr + clen;
1255  int ncount = 0;
1256  if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1257  {
1258  active_count--; /* Remove non-match possibility */
1259  next_active_state--;
1260  }
1261  while (nptr < end_subject)
1262  {
1263  int nd;
1264  int ndlen = 1;
1265  GETCHARLEN(nd, nptr, ndlen);
1266  if (UCD_CATEGORY(nd) != ucp_M) break;
1267  ncount++;
1268  nptr += ndlen;
1269  }
1270  count++;
1271  ADD_NEW_DATA(-state_offset, count, ncount);
1272  }
1273  break;
1274 #endif
1275 
1276  /*-----------------------------------------------------------------*/
1277  case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1280  count = current_state->count; /* Already matched */
1281  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282  if (clen > 0)
1283  {
1284  int ncount = 0;
1285  switch (c)
1286  {
1287  case 0x000b:
1288  case 0x000c:
1289  case 0x0085:
1290  case 0x2028:
1291  case 0x2029:
1292  if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1293  goto ANYNL01;
1294 
1295  case 0x000d:
1296  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1297  /* Fall through */
1298 
1299  ANYNL01:
1300  case 0x000a:
1301  if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1302  {
1303  active_count--; /* Remove non-match possibility */
1304  next_active_state--;
1305  }
1306  count++;
1307  ADD_NEW_DATA(-state_offset, count, ncount);
1308  break;
1309 
1310  default:
1311  break;
1312  }
1313  }
1314  break;
1315 
1316  /*-----------------------------------------------------------------*/
1320  count = current_state->count; /* Already matched */
1321  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1322  if (clen > 0)
1323  {
1324  BOOL OK;
1325  switch (c)
1326  {
1327  case 0x000a:
1328  case 0x000b:
1329  case 0x000c:
1330  case 0x000d:
1331  case 0x0085:
1332  case 0x2028:
1333  case 0x2029:
1334  OK = TRUE;
1335  break;
1336 
1337  default:
1338  OK = FALSE;
1339  break;
1340  }
1341 
1342  if (OK == (d == OP_VSPACE))
1343  {
1344  if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1345  {
1346  active_count--; /* Remove non-match possibility */
1347  next_active_state--;
1348  }
1349  count++;
1350  ADD_NEW_DATA(-state_offset, count, 0);
1351  }
1352  }
1353  break;
1354 
1355  /*-----------------------------------------------------------------*/
1359  count = current_state->count; /* Already matched */
1360  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1361  if (clen > 0)
1362  {
1363  BOOL OK;
1364  switch (c)
1365  {
1366  case 0x09: /* HT */
1367  case 0x20: /* SPACE */
1368  case 0xa0: /* NBSP */
1369  case 0x1680: /* OGHAM SPACE MARK */
1370  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1371  case 0x2000: /* EN QUAD */
1372  case 0x2001: /* EM QUAD */
1373  case 0x2002: /* EN SPACE */
1374  case 0x2003: /* EM SPACE */
1375  case 0x2004: /* THREE-PER-EM SPACE */
1376  case 0x2005: /* FOUR-PER-EM SPACE */
1377  case 0x2006: /* SIX-PER-EM SPACE */
1378  case 0x2007: /* FIGURE SPACE */
1379  case 0x2008: /* PUNCTUATION SPACE */
1380  case 0x2009: /* THIN SPACE */
1381  case 0x200A: /* HAIR SPACE */
1382  case 0x202f: /* NARROW NO-BREAK SPACE */
1383  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1384  case 0x3000: /* IDEOGRAPHIC SPACE */
1385  OK = TRUE;
1386  break;
1387 
1388  default:
1389  OK = FALSE;
1390  break;
1391  }
1392 
1393  if (OK == (d == OP_HSPACE))
1394  {
1395  if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1396  {
1397  active_count--; /* Remove non-match possibility */
1398  next_active_state--;
1399  }
1400  count++;
1401  ADD_NEW_DATA(-state_offset, count, 0);
1402  }
1403  }
1404  break;
1405 
1406  /*-----------------------------------------------------------------*/
1407 #ifdef SUPPORT_UCP
1408  case OP_PROP_EXTRA + OP_TYPEQUERY:
1411  count = 4;
1412  goto QS1;
1413 
1414  case OP_PROP_EXTRA + OP_TYPESTAR:
1417  count = 0;
1418 
1419  QS1:
1420 
1421  ADD_ACTIVE(state_offset + 4, 0);
1422  if (clen > 0)
1423  {
1424  BOOL OK;
1425  const ucd_record * prop = GET_UCD(c);
1426  switch(code[2])
1427  {
1428  case PT_ANY:
1429  OK = TRUE;
1430  break;
1431 
1432  case PT_LAMP:
1433  OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1434  prop->chartype == ucp_Lt;
1435  break;
1436 
1437  case PT_GC:
1438  OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1439  break;
1440 
1441  case PT_PC:
1442  OK = prop->chartype == code[3];
1443  break;
1444 
1445  case PT_SC:
1446  OK = prop->script == code[3];
1447  break;
1448 
1449  /* These are specials for combination cases. */
1450 
1451  case PT_ALNUM:
1452  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1453  _pcre_ucp_gentype[prop->chartype] == ucp_N;
1454  break;
1455 
1456  case PT_SPACE: /* Perl space */
1457  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1458  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1459  break;
1460 
1461  case PT_PXSPACE: /* POSIX space */
1462  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1463  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1464  c == CHAR_FF || c == CHAR_CR;
1465  break;
1466 
1467  case PT_WORD:
1468  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1469  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1470  c == CHAR_UNDERSCORE;
1471  break;
1472 
1473  /* Should never occur, but keep compilers from grumbling. */
1474 
1475  default:
1476  OK = codevalue != OP_PROP;
1477  break;
1478  }
1479 
1480  if (OK == (d == OP_PROP))
1481  {
1482  if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1483  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1484  {
1485  active_count--; /* Remove non-match possibility */
1486  next_active_state--;
1487  }
1488  ADD_NEW(state_offset + count, 0);
1489  }
1490  }
1491  break;
1492 
1493  /*-----------------------------------------------------------------*/
1497  count = 2;
1498  goto QS2;
1499 
1503  count = 0;
1504 
1505  QS2:
1506 
1507  ADD_ACTIVE(state_offset + 2, 0);
1508  if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1509  {
1510  const uschar *nptr = ptr + clen;
1511  int ncount = 0;
1512  if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1513  codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1514  {
1515  active_count--; /* Remove non-match possibility */
1516  next_active_state--;
1517  }
1518  while (nptr < end_subject)
1519  {
1520  int nd;
1521  int ndlen = 1;
1522  GETCHARLEN(nd, nptr, ndlen);
1523  if (UCD_CATEGORY(nd) != ucp_M) break;
1524  ncount++;
1525  nptr += ndlen;
1526  }
1527  ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1528  }
1529  break;
1530 #endif
1531 
1532  /*-----------------------------------------------------------------*/
1536  count = 2;
1537  goto QS3;
1538 
1539  case OP_ANYNL_EXTRA + OP_TYPESTAR:
1542  count = 0;
1543 
1544  QS3:
1545  ADD_ACTIVE(state_offset + 2, 0);
1546  if (clen > 0)
1547  {
1548  int ncount = 0;
1549  switch (c)
1550  {
1551  case 0x000b:
1552  case 0x000c:
1553  case 0x0085:
1554  case 0x2028:
1555  case 0x2029:
1556  if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1557  goto ANYNL02;
1558 
1559  case 0x000d:
1560  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1561  /* Fall through */
1562 
1563  ANYNL02:
1564  case 0x000a:
1565  if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1566  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1567  {
1568  active_count--; /* Remove non-match possibility */
1569  next_active_state--;
1570  }
1571  ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1572  break;
1573 
1574  default:
1575  break;
1576  }
1577  }
1578  break;
1579 
1580  /*-----------------------------------------------------------------*/
1584  count = 2;
1585  goto QS4;
1586 
1590  count = 0;
1591 
1592  QS4:
1593  ADD_ACTIVE(state_offset + 2, 0);
1594  if (clen > 0)
1595  {
1596  BOOL OK;
1597  switch (c)
1598  {
1599  case 0x000a:
1600  case 0x000b:
1601  case 0x000c:
1602  case 0x000d:
1603  case 0x0085:
1604  case 0x2028:
1605  case 0x2029:
1606  OK = TRUE;
1607  break;
1608 
1609  default:
1610  OK = FALSE;
1611  break;
1612  }
1613  if (OK == (d == OP_VSPACE))
1614  {
1615  if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1616  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1617  {
1618  active_count--; /* Remove non-match possibility */
1619  next_active_state--;
1620  }
1621  ADD_NEW_DATA(-(state_offset + count), 0, 0);
1622  }
1623  }
1624  break;
1625 
1626  /*-----------------------------------------------------------------*/
1630  count = 2;
1631  goto QS5;
1632 
1636  count = 0;
1637 
1638  QS5:
1639  ADD_ACTIVE(state_offset + 2, 0);
1640  if (clen > 0)
1641  {
1642  BOOL OK;
1643  switch (c)
1644  {
1645  case 0x09: /* HT */
1646  case 0x20: /* SPACE */
1647  case 0xa0: /* NBSP */
1648  case 0x1680: /* OGHAM SPACE MARK */
1649  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1650  case 0x2000: /* EN QUAD */
1651  case 0x2001: /* EM QUAD */
1652  case 0x2002: /* EN SPACE */
1653  case 0x2003: /* EM SPACE */
1654  case 0x2004: /* THREE-PER-EM SPACE */
1655  case 0x2005: /* FOUR-PER-EM SPACE */
1656  case 0x2006: /* SIX-PER-EM SPACE */
1657  case 0x2007: /* FIGURE SPACE */
1658  case 0x2008: /* PUNCTUATION SPACE */
1659  case 0x2009: /* THIN SPACE */
1660  case 0x200A: /* HAIR SPACE */
1661  case 0x202f: /* NARROW NO-BREAK SPACE */
1662  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1663  case 0x3000: /* IDEOGRAPHIC SPACE */
1664  OK = TRUE;
1665  break;
1666 
1667  default:
1668  OK = FALSE;
1669  break;
1670  }
1671 
1672  if (OK == (d == OP_HSPACE))
1673  {
1674  if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1675  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1676  {
1677  active_count--; /* Remove non-match possibility */
1678  next_active_state--;
1679  }
1680  ADD_NEW_DATA(-(state_offset + count), 0, 0);
1681  }
1682  }
1683  break;
1684 
1685  /*-----------------------------------------------------------------*/
1686 #ifdef SUPPORT_UCP
1687  case OP_PROP_EXTRA + OP_TYPEEXACT:
1688  case OP_PROP_EXTRA + OP_TYPEUPTO:
1691  if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1692  { ADD_ACTIVE(state_offset + 6, 0); }
1693  count = current_state->count; /* Number already matched */
1694  if (clen > 0)
1695  {
1696  BOOL OK;
1697  const ucd_record * prop = GET_UCD(c);
1698  switch(code[4])
1699  {
1700  case PT_ANY:
1701  OK = TRUE;
1702  break;
1703 
1704  case PT_LAMP:
1705  OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1706  prop->chartype == ucp_Lt;
1707  break;
1708 
1709  case PT_GC:
1710  OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1711  break;
1712 
1713  case PT_PC:
1714  OK = prop->chartype == code[5];
1715  break;
1716 
1717  case PT_SC:
1718  OK = prop->script == code[5];
1719  break;
1720 
1721  /* These are specials for combination cases. */
1722 
1723  case PT_ALNUM:
1724  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1725  _pcre_ucp_gentype[prop->chartype] == ucp_N;
1726  break;
1727 
1728  case PT_SPACE: /* Perl space */
1729  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1730  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1731  break;
1732 
1733  case PT_PXSPACE: /* POSIX space */
1734  OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1735  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1736  c == CHAR_FF || c == CHAR_CR;
1737  break;
1738 
1739  case PT_WORD:
1740  OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1741  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1742  c == CHAR_UNDERSCORE;
1743  break;
1744 
1745  /* Should never occur, but keep compilers from grumbling. */
1746 
1747  default:
1748  OK = codevalue != OP_PROP;
1749  break;
1750  }
1751 
1752  if (OK == (d == OP_PROP))
1753  {
1754  if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1755  {
1756  active_count--; /* Remove non-match possibility */
1757  next_active_state--;
1758  }
1759  if (++count >= GET2(code, 1))
1760  { ADD_NEW(state_offset + 6, 0); }
1761  else
1762  { ADD_NEW(state_offset, count); }
1763  }
1764  }
1765  break;
1766 
1767  /*-----------------------------------------------------------------*/
1772  if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1773  { ADD_ACTIVE(state_offset + 4, 0); }
1774  count = current_state->count; /* Number already matched */
1775  if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1776  {
1777  const uschar *nptr = ptr + clen;
1778  int ncount = 0;
1779  if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1780  {
1781  active_count--; /* Remove non-match possibility */
1782  next_active_state--;
1783  }
1784  while (nptr < end_subject)
1785  {
1786  int nd;
1787  int ndlen = 1;
1788  GETCHARLEN(nd, nptr, ndlen);
1789  if (UCD_CATEGORY(nd) != ucp_M) break;
1790  ncount++;
1791  nptr += ndlen;
1792  }
1793  if (++count >= GET2(code, 1))
1794  { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1795  else
1796  { ADD_NEW_DATA(-state_offset, count, ncount); }
1797  }
1798  break;
1799 #endif
1800 
1801  /*-----------------------------------------------------------------*/
1803  case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1806  if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1807  { ADD_ACTIVE(state_offset + 4, 0); }
1808  count = current_state->count; /* Number already matched */
1809  if (clen > 0)
1810  {
1811  int ncount = 0;
1812  switch (c)
1813  {
1814  case 0x000b:
1815  case 0x000c:
1816  case 0x0085:
1817  case 0x2028:
1818  case 0x2029:
1819  if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1820  goto ANYNL03;
1821 
1822  case 0x000d:
1823  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1824  /* Fall through */
1825 
1826  ANYNL03:
1827  case 0x000a:
1828  if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1829  {
1830  active_count--; /* Remove non-match possibility */
1831  next_active_state--;
1832  }
1833  if (++count >= GET2(code, 1))
1834  { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1835  else
1836  { ADD_NEW_DATA(-state_offset, count, ncount); }
1837  break;
1838 
1839  default:
1840  break;
1841  }
1842  }
1843  break;
1844 
1845  /*-----------------------------------------------------------------*/
1850  if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1851  { ADD_ACTIVE(state_offset + 4, 0); }
1852  count = current_state->count; /* Number already matched */
1853  if (clen > 0)
1854  {
1855  BOOL OK;
1856  switch (c)
1857  {
1858  case 0x000a:
1859  case 0x000b:
1860  case 0x000c:
1861  case 0x000d:
1862  case 0x0085:
1863  case 0x2028:
1864  case 0x2029:
1865  OK = TRUE;
1866  break;
1867 
1868  default:
1869  OK = FALSE;
1870  }
1871 
1872  if (OK == (d == OP_VSPACE))
1873  {
1874  if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1875  {
1876  active_count--; /* Remove non-match possibility */
1877  next_active_state--;
1878  }
1879  if (++count >= GET2(code, 1))
1880  { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1881  else
1882  { ADD_NEW_DATA(-state_offset, count, 0); }
1883  }
1884  }
1885  break;
1886 
1887  /*-----------------------------------------------------------------*/
1892  if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1893  { ADD_ACTIVE(state_offset + 4, 0); }
1894  count = current_state->count; /* Number already matched */
1895  if (clen > 0)
1896  {
1897  BOOL OK;
1898  switch (c)
1899  {
1900  case 0x09: /* HT */
1901  case 0x20: /* SPACE */
1902  case 0xa0: /* NBSP */
1903  case 0x1680: /* OGHAM SPACE MARK */
1904  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1905  case 0x2000: /* EN QUAD */
1906  case 0x2001: /* EM QUAD */
1907  case 0x2002: /* EN SPACE */
1908  case 0x2003: /* EM SPACE */
1909  case 0x2004: /* THREE-PER-EM SPACE */
1910  case 0x2005: /* FOUR-PER-EM SPACE */
1911  case 0x2006: /* SIX-PER-EM SPACE */
1912  case 0x2007: /* FIGURE SPACE */
1913  case 0x2008: /* PUNCTUATION SPACE */
1914  case 0x2009: /* THIN SPACE */
1915  case 0x200A: /* HAIR SPACE */
1916  case 0x202f: /* NARROW NO-BREAK SPACE */
1917  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1918  case 0x3000: /* IDEOGRAPHIC SPACE */
1919  OK = TRUE;
1920  break;
1921 
1922  default:
1923  OK = FALSE;
1924  break;
1925  }
1926 
1927  if (OK == (d == OP_HSPACE))
1928  {
1929  if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1930  {
1931  active_count--; /* Remove non-match possibility */
1932  next_active_state--;
1933  }
1934  if (++count >= GET2(code, 1))
1935  { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1936  else
1937  { ADD_NEW_DATA(-state_offset, count, 0); }
1938  }
1939  }
1940  break;
1941 
1942 /* ========================================================================== */
1943  /* These opcodes are followed by a character that is usually compared
1944  to the current subject character; it is loaded into d. We still get
1945  here even if there is no subject character, because in some cases zero
1946  repetitions are permitted. */
1947 
1948  /*-----------------------------------------------------------------*/
1949  case OP_CHAR:
1950  if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1951  break;
1952 
1953  /*-----------------------------------------------------------------*/
1954  case OP_CHARNC:
1955  if (clen == 0) break;
1956 
1957 #ifdef SUPPORT_UTF8
1958  if (utf8)
1959  {
1960  if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1961  {
1962  unsigned int othercase;
1963  if (c < 128) othercase = fcc[c]; else
1964 
1965  /* If we have Unicode property support, we can use it to test the
1966  other case of the character. */
1967 
1968 #ifdef SUPPORT_UCP
1969  othercase = UCD_OTHERCASE(c);
1970 #else
1971  othercase = NOTACHAR;
1972 #endif
1973 
1974  if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1975  }
1976  }
1977  else
1978 #endif /* SUPPORT_UTF8 */
1979 
1980  /* Non-UTF-8 mode */
1981  {
1982  if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1983  }
1984  break;
1985 
1986 
1987 #ifdef SUPPORT_UCP
1988  /*-----------------------------------------------------------------*/
1989  /* This is a tricky one because it can match more than one character.
1990  Find out how many characters to skip, and then set up a negative state
1991  to wait for them to pass before continuing. */
1992 
1993  case OP_EXTUNI:
1994  if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1995  {
1996  const uschar *nptr = ptr + clen;
1997  int ncount = 0;
1998  while (nptr < end_subject)
1999  {
2000  int nclen = 1;
2001  GETCHARLEN(c, nptr, nclen);
2002  if (UCD_CATEGORY(c) != ucp_M) break;
2003  ncount++;
2004  nptr += nclen;
2005  }
2006  ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2007  }
2008  break;
2009 #endif
2010 
2011  /*-----------------------------------------------------------------*/
2012  /* This is tricky like EXTUNI because it too can match more than one
2013  character (when CR is followed by LF). In this case, set up a negative
2014  state to wait for one character to pass before continuing. */
2015 
2016  case OP_ANYNL:
2017  if (clen > 0) switch(c)
2018  {
2019  case 0x000b:
2020  case 0x000c:
2021  case 0x0085:
2022  case 0x2028:
2023  case 0x2029:
2024  if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2025 
2026  case 0x000a:
2027  ADD_NEW(state_offset + 1, 0);
2028  break;
2029 
2030  case 0x000d:
2031  if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2032  {
2033  ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2034  }
2035  else
2036  {
2037  ADD_NEW(state_offset + 1, 0);
2038  }
2039  break;
2040  }
2041  break;
2042 
2043  /*-----------------------------------------------------------------*/
2044  case OP_NOT_VSPACE:
2045  if (clen > 0) switch(c)
2046  {
2047  case 0x000a:
2048  case 0x000b:
2049  case 0x000c:
2050  case 0x000d:
2051  case 0x0085:
2052  case 0x2028:
2053  case 0x2029:
2054  break;
2055 
2056  default:
2057  ADD_NEW(state_offset + 1, 0);
2058  break;
2059  }
2060  break;
2061 
2062  /*-----------------------------------------------------------------*/
2063  case OP_VSPACE:
2064  if (clen > 0) switch(c)
2065  {
2066  case 0x000a:
2067  case 0x000b:
2068  case 0x000c:
2069  case 0x000d:
2070  case 0x0085:
2071  case 0x2028:
2072  case 0x2029:
2073  ADD_NEW(state_offset + 1, 0);
2074  break;
2075 
2076  default: break;
2077  }
2078  break;
2079 
2080  /*-----------------------------------------------------------------*/
2081  case OP_NOT_HSPACE:
2082  if (clen > 0) switch(c)
2083  {
2084  case 0x09: /* HT */
2085  case 0x20: /* SPACE */
2086  case 0xa0: /* NBSP */
2087  case 0x1680: /* OGHAM SPACE MARK */
2088  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2089  case 0x2000: /* EN QUAD */
2090  case 0x2001: /* EM QUAD */
2091  case 0x2002: /* EN SPACE */
2092  case 0x2003: /* EM SPACE */
2093  case 0x2004: /* THREE-PER-EM SPACE */
2094  case 0x2005: /* FOUR-PER-EM SPACE */
2095  case 0x2006: /* SIX-PER-EM SPACE */
2096  case 0x2007: /* FIGURE SPACE */
2097  case 0x2008: /* PUNCTUATION SPACE */
2098  case 0x2009: /* THIN SPACE */
2099  case 0x200A: /* HAIR SPACE */
2100  case 0x202f: /* NARROW NO-BREAK SPACE */
2101  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2102  case 0x3000: /* IDEOGRAPHIC SPACE */
2103  break;
2104 
2105  default:
2106  ADD_NEW(state_offset + 1, 0);
2107  break;
2108  }
2109  break;
2110 
2111  /*-----------------------------------------------------------------*/
2112  case OP_HSPACE:
2113  if (clen > 0) switch(c)
2114  {
2115  case 0x09: /* HT */
2116  case 0x20: /* SPACE */
2117  case 0xa0: /* NBSP */
2118  case 0x1680: /* OGHAM SPACE MARK */
2119  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2120  case 0x2000: /* EN QUAD */
2121  case 0x2001: /* EM QUAD */
2122  case 0x2002: /* EN SPACE */
2123  case 0x2003: /* EM SPACE */
2124  case 0x2004: /* THREE-PER-EM SPACE */
2125  case 0x2005: /* FOUR-PER-EM SPACE */
2126  case 0x2006: /* SIX-PER-EM SPACE */
2127  case 0x2007: /* FIGURE SPACE */
2128  case 0x2008: /* PUNCTUATION SPACE */
2129  case 0x2009: /* THIN SPACE */
2130  case 0x200A: /* HAIR SPACE */
2131  case 0x202f: /* NARROW NO-BREAK SPACE */
2132  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2133  case 0x3000: /* IDEOGRAPHIC SPACE */
2134  ADD_NEW(state_offset + 1, 0);
2135  break;
2136  }
2137  break;
2138 
2139  /*-----------------------------------------------------------------*/
2140  /* Match a negated single character. This is only used for one-byte
2141  characters, that is, we know that d < 256. The character we are
2142  checking (c) can be multibyte. */
2143 
2144  case OP_NOT:
2145  if (clen > 0)
2146  {
2147  unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2148  if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2149  }
2150  break;
2151 
2152  /*-----------------------------------------------------------------*/
2153  case OP_PLUS:
2154  case OP_MINPLUS:
2155  case OP_POSPLUS:
2156  case OP_NOTPLUS:
2157  case OP_NOTMINPLUS:
2158  case OP_NOTPOSPLUS:
2159  count = current_state->count; /* Already matched */
2160  if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2161  if (clen > 0)
2162  {
2163  unsigned int otherd = NOTACHAR;
2164  if ((ims & PCRE_CASELESS) != 0)
2165  {
2166 #ifdef SUPPORT_UTF8
2167  if (utf8 && d >= 128)
2168  {
2169 #ifdef SUPPORT_UCP
2170  otherd = UCD_OTHERCASE(d);
2171 #endif /* SUPPORT_UCP */
2172  }
2173  else
2174 #endif /* SUPPORT_UTF8 */
2175  otherd = fcc[d];
2176  }
2177  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2178  {
2179  if (count > 0 &&
2180  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2181  {
2182  active_count--; /* Remove non-match possibility */
2183  next_active_state--;
2184  }
2185  count++;
2186  ADD_NEW(state_offset, count);
2187  }
2188  }
2189  break;
2190 
2191  /*-----------------------------------------------------------------*/
2192  case OP_QUERY:
2193  case OP_MINQUERY:
2194  case OP_POSQUERY:
2195  case OP_NOTQUERY:
2196  case OP_NOTMINQUERY:
2197  case OP_NOTPOSQUERY:
2198  ADD_ACTIVE(state_offset + dlen + 1, 0);
2199  if (clen > 0)
2200  {
2201  unsigned int otherd = NOTACHAR;
2202  if ((ims & PCRE_CASELESS) != 0)
2203  {
2204 #ifdef SUPPORT_UTF8
2205  if (utf8 && d >= 128)
2206  {
2207 #ifdef SUPPORT_UCP
2208  otherd = UCD_OTHERCASE(d);
2209 #endif /* SUPPORT_UCP */
2210  }
2211  else
2212 #endif /* SUPPORT_UTF8 */
2213  otherd = fcc[d];
2214  }
2215  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2216  {
2217  if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2218  {
2219  active_count--; /* Remove non-match possibility */
2220  next_active_state--;
2221  }
2222  ADD_NEW(state_offset + dlen + 1, 0);
2223  }
2224  }
2225  break;
2226 
2227  /*-----------------------------------------------------------------*/
2228  case OP_STAR:
2229  case OP_MINSTAR:
2230  case OP_POSSTAR:
2231  case OP_NOTSTAR:
2232  case OP_NOTMINSTAR:
2233  case OP_NOTPOSSTAR:
2234  ADD_ACTIVE(state_offset + dlen + 1, 0);
2235  if (clen > 0)
2236  {
2237  unsigned int otherd = NOTACHAR;
2238  if ((ims & PCRE_CASELESS) != 0)
2239  {
2240 #ifdef SUPPORT_UTF8
2241  if (utf8 && d >= 128)
2242  {
2243 #ifdef SUPPORT_UCP
2244  otherd = UCD_OTHERCASE(d);
2245 #endif /* SUPPORT_UCP */
2246  }
2247  else
2248 #endif /* SUPPORT_UTF8 */
2249  otherd = fcc[d];
2250  }
2251  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2252  {
2253  if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2254  {
2255  active_count--; /* Remove non-match possibility */
2256  next_active_state--;
2257  }
2258  ADD_NEW(state_offset, 0);
2259  }
2260  }
2261  break;
2262 
2263  /*-----------------------------------------------------------------*/
2264  case OP_EXACT:
2265  case OP_NOTEXACT:
2266  count = current_state->count; /* Number already matched */
2267  if (clen > 0)
2268  {
2269  unsigned int otherd = NOTACHAR;
2270  if ((ims & PCRE_CASELESS) != 0)
2271  {
2272 #ifdef SUPPORT_UTF8
2273  if (utf8 && d >= 128)
2274  {
2275 #ifdef SUPPORT_UCP
2276  otherd = UCD_OTHERCASE(d);
2277 #endif /* SUPPORT_UCP */
2278  }
2279  else
2280 #endif /* SUPPORT_UTF8 */
2281  otherd = fcc[d];
2282  }
2283  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2284  {
2285  if (++count >= GET2(code, 1))
2286  { ADD_NEW(state_offset + dlen + 3, 0); }
2287  else
2288  { ADD_NEW(state_offset, count); }
2289  }
2290  }
2291  break;
2292 
2293  /*-----------------------------------------------------------------*/
2294  case OP_UPTO:
2295  case OP_MINUPTO:
2296  case OP_POSUPTO:
2297  case OP_NOTUPTO:
2298  case OP_NOTMINUPTO:
2299  case OP_NOTPOSUPTO:
2300  ADD_ACTIVE(state_offset + dlen + 3, 0);
2301  count = current_state->count; /* Number already matched */
2302  if (clen > 0)
2303  {
2304  unsigned int otherd = NOTACHAR;
2305  if ((ims & PCRE_CASELESS) != 0)
2306  {
2307 #ifdef SUPPORT_UTF8
2308  if (utf8 && d >= 128)
2309  {
2310 #ifdef SUPPORT_UCP
2311  otherd = UCD_OTHERCASE(d);
2312 #endif /* SUPPORT_UCP */
2313  }
2314  else
2315 #endif /* SUPPORT_UTF8 */
2316  otherd = fcc[d];
2317  }
2318  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2319  {
2320  if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2321  {
2322  active_count--; /* Remove non-match possibility */
2323  next_active_state--;
2324  }
2325  if (++count >= GET2(code, 1))
2326  { ADD_NEW(state_offset + dlen + 3, 0); }
2327  else
2328  { ADD_NEW(state_offset, count); }
2329  }
2330  }
2331  break;
2332 
2333 
2334 /* ========================================================================== */
2335  /* These are the class-handling opcodes */
2336 
2337  case OP_CLASS:
2338  case OP_NCLASS:
2339  case OP_XCLASS:
2340  {
2341  BOOL isinclass = FALSE;
2342  int next_state_offset;
2343  const uschar *ecode;
2344 
2345  /* For a simple class, there is always just a 32-byte table, and we
2346  can set isinclass from it. */
2347 
2348  if (codevalue != OP_XCLASS)
2349  {
2350  ecode = code + 33;
2351  if (clen > 0)
2352  {
2353  isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2354  ((code[1 + c/8] & (1 << (c&7))) != 0);
2355  }
2356  }
2357 
2358  /* An extended class may have a table or a list of single characters,
2359  ranges, or both, and it may be positive or negative. There's a
2360  function that sorts all this out. */
2361 
2362  else
2363  {
2364  ecode = code + GET(code, 1);
2365  if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2366  }
2367 
2368  /* At this point, isinclass is set for all kinds of class, and ecode
2369  points to the byte after the end of the class. If there is a
2370  quantifier, this is where it will be. */
2371 
2372  next_state_offset = (int)(ecode - start_code);
2373 
2374  switch (*ecode)
2375  {
2376  case OP_CRSTAR:
2377  case OP_CRMINSTAR:
2378  ADD_ACTIVE(next_state_offset + 1, 0);
2379  if (isinclass) { ADD_NEW(state_offset, 0); }
2380  break;
2381 
2382  case OP_CRPLUS:
2383  case OP_CRMINPLUS:
2384  count = current_state->count; /* Already matched */
2385  if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2386  if (isinclass) { count++; ADD_NEW(state_offset, count); }
2387  break;
2388 
2389  case OP_CRQUERY:
2390  case OP_CRMINQUERY:
2391  ADD_ACTIVE(next_state_offset + 1, 0);
2392  if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2393  break;
2394 
2395  case OP_CRRANGE:
2396  case OP_CRMINRANGE:
2397  count = current_state->count; /* Already matched */
2398  if (count >= GET2(ecode, 1))
2399  { ADD_ACTIVE(next_state_offset + 5, 0); }
2400  if (isinclass)
2401  {
2402  int max = GET2(ecode, 3);
2403  if (++count >= max && max != 0) /* Max 0 => no limit */
2404  { ADD_NEW(next_state_offset + 5, 0); }
2405  else
2406  { ADD_NEW(state_offset, count); }
2407  }
2408  break;
2409 
2410  default:
2411  if (isinclass) { ADD_NEW(next_state_offset, 0); }
2412  break;
2413  }
2414  }
2415  break;
2416 
2417 /* ========================================================================== */
2418  /* These are the opcodes for fancy brackets of various kinds. We have
2419  to use recursion in order to handle them. The "always failing" assertion
2420  (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2421  though the other "backtracking verbs" are not supported. */
2422 
2423  case OP_FAIL:
2424  forced_fail++; /* Count FAILs for multiple states */
2425  break;
2426 
2427  case OP_ASSERT:
2428  case OP_ASSERT_NOT:
2429  case OP_ASSERTBACK:
2430  case OP_ASSERTBACK_NOT:
2431  {
2432  int rc;
2433  int local_offsets[2];
2434  int local_workspace[1000];
2435  const uschar *endasscode = code + GET(code, 1);
2436 
2437  while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2438 
2439  rc = internal_dfa_exec(
2440  md, /* static match data */
2441  code, /* this subexpression's code */
2442  ptr, /* where we currently are */
2443  (int)(ptr - start_subject), /* start offset */
2444  local_offsets, /* offset vector */
2445  sizeof(local_offsets)/sizeof(int), /* size of same */
2446  local_workspace, /* workspace vector */
2447  sizeof(local_workspace)/sizeof(int), /* size of same */
2448  ims, /* the current ims flags */
2449  rlevel, /* function recursion level */
2450  recursing); /* pass on regex recursion */
2451 
2452  if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2453  if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2454  { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2455  }
2456  break;
2457 
2458  /*-----------------------------------------------------------------*/
2459  case OP_COND:
2460  case OP_SCOND:
2461  {
2462  int local_offsets[1000];
2463  int local_workspace[1000];
2464  int codelink = GET(code, 1);
2465  int condcode;
2466 
2467  /* Because of the way auto-callout works during compile, a callout item
2468  is inserted between OP_COND and an assertion condition. This does not
2469  happen for the other conditions. */
2470 
2471  if (code[LINK_SIZE+1] == OP_CALLOUT)
2472  {
2473  rrc = 0;
2474  if (pcre_callout != NULL)
2475  {
2476  pcre_callout_block cb;
2477  cb.version = 1; /* Version 1 of the callout block */
2478  cb.callout_number = code[LINK_SIZE+2];
2479  cb.offset_vector = offsets;
2480  cb.subject = (PCRE_SPTR)start_subject;
2481  cb.subject_length = (int)(end_subject - start_subject);
2482  cb.start_match = (int)(current_subject - start_subject);
2483  cb.current_position = (int)(ptr - start_subject);
2484  cb.pattern_position = GET(code, LINK_SIZE + 3);
2485  cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2486  cb.capture_top = 1;
2487  cb.capture_last = -1;
2488  cb.callout_data = md->callout_data;
2489  if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2490  }
2491  if (rrc > 0) break; /* Fail this thread */
2492  code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2493  }
2494 
2495  condcode = code[LINK_SIZE+1];
2496 
2497  /* Back reference conditions are not supported */
2498 
2499  if (condcode == OP_CREF || condcode == OP_NCREF)
2500  return PCRE_ERROR_DFA_UCOND;
2501 
2502  /* The DEFINE condition is always false */
2503 
2504  if (condcode == OP_DEF)
2505  { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2506 
2507  /* The only supported version of OP_RREF is for the value RREF_ANY,
2508  which means "test if in any recursion". We can't test for specifically
2509  recursed groups. */
2510 
2511  else if (condcode == OP_RREF || condcode == OP_NRREF)
2512  {
2513  int value = GET2(code, LINK_SIZE+2);
2514  if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2515  if (recursing > 0)
2516  { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2517  else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2518  }
2519 
2520  /* Otherwise, the condition is an assertion */
2521 
2522  else
2523  {
2524  int rc;
2525  const uschar *asscode = code + LINK_SIZE + 1;
2526  const uschar *endasscode = asscode + GET(asscode, 1);
2527 
2528  while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2529 
2530  rc = internal_dfa_exec(
2531  md, /* fixed match data */
2532  asscode, /* this subexpression's code */
2533  ptr, /* where we currently are */
2534  (int)(ptr - start_subject), /* start offset */
2535  local_offsets, /* offset vector */
2536  sizeof(local_offsets)/sizeof(int), /* size of same */
2537  local_workspace, /* workspace vector */
2538  sizeof(local_workspace)/sizeof(int), /* size of same */
2539  ims, /* the current ims flags */
2540  rlevel, /* function recursion level */
2541  recursing); /* pass on regex recursion */
2542 
2543  if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2544  if ((rc >= 0) ==
2545  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2546  { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2547  else
2548  { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2549  }
2550  }
2551  break;
2552 
2553  /*-----------------------------------------------------------------*/
2554  case OP_RECURSE:
2555  {
2556  int local_offsets[1000];
2557  int local_workspace[1000];
2558  int rc;
2559 
2560  DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2561  recursing + 1));
2562 
2563  rc = internal_dfa_exec(
2564  md, /* fixed match data */
2565  start_code + GET(code, 1), /* this subexpression's code */
2566  ptr, /* where we currently are */
2567  (int)(ptr - start_subject), /* start offset */
2568  local_offsets, /* offset vector */
2569  sizeof(local_offsets)/sizeof(int), /* size of same */
2570  local_workspace, /* workspace vector */
2571  sizeof(local_workspace)/sizeof(int), /* size of same */
2572  ims, /* the current ims flags */
2573  rlevel, /* function recursion level */
2574  recursing + 1); /* regex recurse level */
2575 
2576  DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2577  recursing + 1, rc));
2578 
2579  /* Ran out of internal offsets */
2580 
2581  if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2582 
2583  /* For each successful matched substring, set up the next state with a
2584  count of characters to skip before trying it. Note that the count is in
2585  characters, not bytes. */
2586 
2587  if (rc > 0)
2588  {
2589  for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2590  {
2591  const uschar *p = start_subject + local_offsets[rc];
2592  const uschar *pp = start_subject + local_offsets[rc+1];
2593  int charcount = local_offsets[rc+1] - local_offsets[rc];
2594  while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2595  if (charcount > 0)
2596  {
2597  ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2598  }
2599  else
2600  {
2601  ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2602  }
2603  }
2604  }
2605  else if (rc != PCRE_ERROR_NOMATCH) return rc;
2606  }
2607  break;
2608 
2609  /*-----------------------------------------------------------------*/
2610  case OP_ONCE:
2611  {
2612  int local_offsets[2];
2613  int local_workspace[1000];
2614 
2615  int rc = internal_dfa_exec(
2616  md, /* fixed match data */
2617  code, /* this subexpression's code */
2618  ptr, /* where we currently are */
2619  (int)(ptr - start_subject), /* start offset */
2620  local_offsets, /* offset vector */
2621  sizeof(local_offsets)/sizeof(int), /* size of same */
2622  local_workspace, /* workspace vector */
2623  sizeof(local_workspace)/sizeof(int), /* size of same */
2624  ims, /* the current ims flags */
2625  rlevel, /* function recursion level */
2626  recursing); /* pass on regex recursion */
2627 
2628  if (rc >= 0)
2629  {
2630  const uschar *end_subpattern = code;
2631  int charcount = local_offsets[1] - local_offsets[0];
2632  int next_state_offset, repeat_state_offset;
2633 
2634  do { end_subpattern += GET(end_subpattern, 1); }
2635  while (*end_subpattern == OP_ALT);
2636  next_state_offset =
2637  (int)(end_subpattern - start_code + LINK_SIZE + 1);
2638 
2639  /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2640  arrange for the repeat state also to be added to the relevant list.
2641  Calculate the offset, or set -1 for no repeat. */
2642 
2643  repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2644  *end_subpattern == OP_KETRMIN)?
2645  (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2646 
2647  /* If we have matched an empty string, add the next state at the
2648  current character pointer. This is important so that the duplicate
2649  checking kicks in, which is what breaks infinite loops that match an
2650  empty string. */
2651 
2652  if (charcount == 0)
2653  {
2654  ADD_ACTIVE(next_state_offset, 0);
2655  }
2656 
2657  /* Optimization: if there are no more active states, and there
2658  are no new states yet set up, then skip over the subject string
2659  right here, to save looping. Otherwise, set up the new state to swing
2660  into action when the end of the substring is reached. */
2661 
2662  else if (i + 1 >= active_count && new_count == 0)
2663  {
2664  ptr += charcount;
2665  clen = 0;
2666  ADD_NEW(next_state_offset, 0);
2667 
2668  /* If we are adding a repeat state at the new character position,
2669  we must fudge things so that it is the only current state.
2670  Otherwise, it might be a duplicate of one we processed before, and
2671  that would cause it to be skipped. */
2672 
2673  if (repeat_state_offset >= 0)
2674  {
2675  next_active_state = active_states;
2676  active_count = 0;
2677  i = -1;
2678  ADD_ACTIVE(repeat_state_offset, 0);
2679  }
2680  }
2681  else
2682  {
2683  const uschar *p = start_subject + local_offsets[0];
2684  const uschar *pp = start_subject + local_offsets[1];
2685  while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2686  ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2687  if (repeat_state_offset >= 0)
2688  { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2689  }
2690 
2691  }
2692  else if (rc != PCRE_ERROR_NOMATCH) return rc;
2693  }
2694  break;
2695 
2696 
2697 /* ========================================================================== */
2698  /* Handle callouts */
2699 
2700  case OP_CALLOUT:
2701  rrc = 0;
2702  if (pcre_callout != NULL)
2703  {
2704  pcre_callout_block cb;
2705  cb.version = 1; /* Version 1 of the callout block */
2706  cb.callout_number = code[1];
2707  cb.offset_vector = offsets;
2708  cb.subject = (PCRE_SPTR)start_subject;
2709  cb.subject_length = (int)(end_subject - start_subject);
2710  cb.start_match = (int)(current_subject - start_subject);
2711  cb.current_position = (int)(ptr - start_subject);
2712  cb.pattern_position = GET(code, 2);
2713  cb.next_item_length = GET(code, 2 + LINK_SIZE);
2714  cb.capture_top = 1;
2715  cb.capture_last = -1;
2716  cb.callout_data = md->callout_data;
2717  if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2718  }
2719  if (rrc == 0)
2720  { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2721  break;
2722 
2723 
2724 /* ========================================================================== */
2725  default: /* Unsupported opcode */
2726  return PCRE_ERROR_DFA_UITEM;
2727  }
2728 
2729  NEXT_ACTIVE_STATE: continue;
2730 
2731  } /* End of loop scanning active states */
2732 
2733  /* We have finished the processing at the current subject character. If no
2734  new states have been set for the next character, we have found all the
2735  matches that we are going to find. If we are at the top level and partial
2736  matching has been requested, check for appropriate conditions.
2737 
2738  The "forced_fail" variable counts the number of (*F) encountered for the
2739  character. If it is equal to the original active_count (saved in
2740  workspace[1]) it means that (*F) was found on every active state. In this
2741  case we don't want to give a partial match.
2742 
2743  The "could_continue" variable is true if a state could have continued but
2744  for the fact that the end of the subject was reached. */
2745 
2746  if (new_count <= 0)
2747  {
2748  if (rlevel == 1 && /* Top level, and */
2749  could_continue && /* Some could go on */
2750  forced_fail != workspace[1] && /* Not all forced fail & */
2751  ( /* either... */
2752  (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2753  || /* or... */
2754  ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2755  match_count < 0) /* no matches */
2756  ) && /* And... */
2757  ptr >= end_subject && /* Reached end of subject */
2758  ptr > md->start_used_ptr) /* Inspected non-empty string */
2759  {
2760  if (offsetcount >= 2)
2761  {
2762  offsets[0] = (int)(md->start_used_ptr - start_subject);
2763  offsets[1] = (int)(end_subject - start_subject);
2764  }
2765  match_count = PCRE_ERROR_PARTIAL;
2766  }
2767 
2768  DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2769  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2770  rlevel*2-2, SP));
2771  break; /* In effect, "return", but see the comment below */
2772  }
2773 
2774  /* One or more states are active for the next character. */
2775 
2776  ptr += clen; /* Advance to next subject character */
2777  } /* Loop to move along the subject string */
2778 
2779 /* Control gets here from "break" a few lines above. We do it this way because
2780 if we use "return" above, we have compiler trouble. Some compilers warn if
2781 there's nothing here because they think the function doesn't return a value. On
2782 the other hand, if we put a dummy statement here, some more clever compilers
2783 complain that it can't be reached. Sigh. */
2784 
2785 return match_count;
2786 }
2787 
2788 
2789 
2790 
2791 /*************************************************
2792 * Execute a Regular Expression - DFA engine *
2793 *************************************************/
2794 
2795 /* This external function applies a compiled re to a subject string using a DFA
2796 engine. This function calls the internal function multiple times if the pattern
2797 is not anchored.
2798 
2799 Arguments:
2800  argument_re points to the compiled expression
2801  extra_data points to extra data or is NULL
2802  subject points to the subject string
2803  length length of subject string (may contain binary zeros)
2804  start_offset where to start in the subject string
2805  options option bits
2806  offsets vector of match offsets
2807  offsetcount size of same
2808  workspace workspace vector
2809  wscount size of same
2810 
2811 Returns: > 0 => number of match offset pairs placed in offsets
2812  = 0 => offsets overflowed; longest matches are present
2813  -1 => failed to match
2814  < -1 => some kind of unexpected problem
2815 */
2816 
2818 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2819  const char *subject, int length, int start_offset, int options, int *offsets,
2820  int offsetcount, int *workspace, int wscount)
2821 {
2822 real_pcre *re = (real_pcre *)argument_re;
2823 dfa_match_data match_block;
2824 dfa_match_data *md = &match_block;
2825 BOOL utf8, anchored, startline, firstline;
2826 const uschar *current_subject, *end_subject, *lcc;
2827 
2828 pcre_study_data internal_study;
2829 const pcre_study_data *study = NULL;
2830 real_pcre internal_re;
2831 
2832 const uschar *req_byte_ptr;
2833 const uschar *start_bits = NULL;
2834 BOOL first_byte_caseless = FALSE;
2835 BOOL req_byte_caseless = FALSE;
2836 int first_byte = -1;
2837 int req_byte = -1;
2838 int req_byte2 = -1;
2839 int newline;
2840 
2841 /* Plausibility checks */
2842 
2843 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2844 if (re == NULL || subject == NULL || workspace == NULL ||
2845  (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2846 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2847 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2848 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
2849 
2850 /* We need to find the pointer to any study data before we test for byte
2851 flipping, so we scan the extra_data block first. This may set two fields in the
2852 match block, so we must initialize them beforehand. However, the other fields
2853 in the match block must not be set until after the byte flipping. */
2854 
2855 md->tables = re->tables;
2856 md->callout_data = NULL;
2857 
2858 if (extra_data != NULL)
2859  {
2860  unsigned int flags = extra_data->flags;
2861  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2862  study = (const pcre_study_data *)extra_data->study_data;
2863  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2864  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2865  return PCRE_ERROR_DFA_UMLIMIT;
2866  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2867  md->callout_data = extra_data->callout_data;
2868  if ((flags & PCRE_EXTRA_TABLES) != 0)
2869  md->tables = extra_data->tables;
2870  }
2871 
2872 /* Check that the first field in the block is the magic number. If it is not,
2873 test for a regex that was compiled on a host of opposite endianness. If this is
2874 the case, flipped values are put in internal_re and internal_study if there was
2875 study data too. */
2876 
2877 if (re->magic_number != MAGIC_NUMBER)
2878  {
2879  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2880  if (re == NULL) return PCRE_ERROR_BADMAGIC;
2881  if (study != NULL) study = &internal_study;
2882  }
2883 
2884 /* Set some local values */
2885 
2886 current_subject = (const unsigned char *)subject + start_offset;
2887 end_subject = (const unsigned char *)subject + length;
2888 req_byte_ptr = current_subject - 1;
2889 
2890 #ifdef SUPPORT_UTF8
2891 utf8 = (re->options & PCRE_UTF8) != 0;
2892 #else
2893 utf8 = FALSE;
2894 #endif
2895 
2896 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2897  (re->options & PCRE_ANCHORED) != 0;
2898 
2899 /* The remaining fixed data for passing around. */
2900 
2901 md->start_code = (const uschar *)argument_re +
2903 md->start_subject = (const unsigned char *)subject;
2904 md->end_subject = end_subject;
2905 md->start_offset = start_offset;
2906 md->moptions = options;
2907 md->poptions = re->options;
2908 
2909 /* If the BSR option is not set at match time, copy what was set
2910 at compile time. */
2911 
2912 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2913  {
2914  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2916 #ifdef BSR_ANYCRLF
2917  else md->moptions |= PCRE_BSR_ANYCRLF;
2918 #endif
2919  }
2920 
2921 /* Handle different types of newline. The three bits give eight cases. If
2922 nothing is set at run time, whatever was used at compile time applies. */
2923 
2924 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2925  PCRE_NEWLINE_BITS)
2926  {
2927  case 0: newline = NEWLINE; break; /* Compile-time default */
2928  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2929  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2930  case PCRE_NEWLINE_CR+
2931  PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2932  case PCRE_NEWLINE_ANY: newline = -1; break;
2933  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2934  default: return PCRE_ERROR_BADNEWLINE;
2935  }
2936 
2937 if (newline == -2)
2938  {
2939  md->nltype = NLTYPE_ANYCRLF;
2940  }
2941 else if (newline < 0)
2942  {
2943  md->nltype = NLTYPE_ANY;
2944  }
2945 else
2946  {
2947  md->nltype = NLTYPE_FIXED;
2948  if (newline > 255)
2949  {
2950  md->nllen = 2;
2951  md->nl[0] = (newline >> 8) & 255;
2952  md->nl[1] = newline & 255;
2953  }
2954  else
2955  {
2956  md->nllen = 1;
2957  md->nl[0] = newline;
2958  }
2959  }
2960 
2961 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2962 back the character offset. */
2963 
2964 #ifdef SUPPORT_UTF8
2965 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2966  {
2967  int tb;
2968  if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
2969  return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
2971  if (start_offset > 0 && start_offset < length)
2972  {
2973  tb = ((USPTR)subject)[start_offset] & 0xc0;
2974  if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
2975  }
2976  }
2977 #endif
2978 
2979 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2980 is a feature that makes it possible to save compiled regex and re-use them
2981 in other programs later. */
2982 
2983 if (md->tables == NULL) md->tables = _pcre_default_tables;
2984 
2985 /* The lower casing table and the "must be at the start of a line" flag are
2986 used in a loop when finding where to start. */
2987 
2988 lcc = md->tables + lcc_offset;
2989 startline = (re->flags & PCRE_STARTLINE) != 0;
2990 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2991 
2992 /* Set up the first character to match, if available. The first_byte value is
2993 never set for an anchored regular expression, but the anchoring may be forced
2994 at run time, so we have to test for anchoring. The first char may be unset for
2995 an unanchored pattern, of course. If there's no first char and the pattern was
2996 studied, there may be a bitmap of possible first characters. */
2997 
2998 if (!anchored)
2999  {
3000  if ((re->flags & PCRE_FIRSTSET) != 0)
3001  {
3002  first_byte = re->first_byte & 255;
3003  if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3004  first_byte = lcc[first_byte];
3005  }
3006  else
3007  {
3008  if (!startline && study != NULL &&
3009  (study->flags & PCRE_STUDY_MAPPED) != 0)
3010  start_bits = study->start_bits;
3011  }
3012  }
3013 
3014 /* For anchored or unanchored matches, there may be a "last known required
3015 character" set. */
3016 
3017 if ((re->flags & PCRE_REQCHSET) != 0)
3018  {
3019  req_byte = re->req_byte & 255;
3020  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3021  req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3022  }
3023 
3024 /* Call the main matching function, looping for a non-anchored regex after a
3025 failed match. If not restarting, perform certain optimizations at the start of
3026 a match. */
3027 
3028 for (;;)
3029  {
3030  int rc;
3031 
3032  if ((options & PCRE_DFA_RESTART) == 0)
3033  {
3034  const uschar *save_end_subject = end_subject;
3035 
3036  /* If firstline is TRUE, the start of the match is constrained to the first
3037  line of a multiline string. Implement this by temporarily adjusting
3038  end_subject so that we stop scanning at a newline. If the match fails at
3039  the newline, later code breaks this loop. */
3040 
3041  if (firstline)
3042  {
3043  USPTR t = current_subject;
3044 #ifdef SUPPORT_UTF8
3045  if (utf8)
3046  {
3047  while (t < md->end_subject && !IS_NEWLINE(t))
3048  {
3049  t++;
3050  while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3051  }
3052  }
3053  else
3054 #endif
3055  while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3056  end_subject = t;
3057  }
3058 
3059  /* There are some optimizations that avoid running the match if a known
3060  starting point is not found. However, there is an option that disables
3061  these, for testing and for ensuring that all callouts do actually occur.
3062  The option can be set in the regex by (*NO_START_OPT) or passed in
3063  match-time options. */
3064 
3065  if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3066  {
3067  /* Advance to a known first byte. */
3068 
3069  if (first_byte >= 0)
3070  {
3071  if (first_byte_caseless)
3072  while (current_subject < end_subject &&
3073  lcc[*current_subject] != first_byte)
3074  current_subject++;
3075  else
3076  while (current_subject < end_subject &&
3077  *current_subject != first_byte)
3078  current_subject++;
3079  }
3080 
3081  /* Or to just after a linebreak for a multiline match if possible */
3082 
3083  else if (startline)
3084  {
3085  if (current_subject > md->start_subject + start_offset)
3086  {
3087 #ifdef SUPPORT_UTF8
3088  if (utf8)
3089  {
3090  while (current_subject < end_subject &&
3091  !WAS_NEWLINE(current_subject))
3092  {
3093  current_subject++;
3094  while(current_subject < end_subject &&
3095  (*current_subject & 0xc0) == 0x80)
3096  current_subject++;
3097  }
3098  }
3099  else
3100 #endif
3101  while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3102  current_subject++;
3103 
3104  /* If we have just passed a CR and the newline option is ANY or
3105  ANYCRLF, and we are now at a LF, advance the match position by one
3106  more character. */
3107 
3108  if (current_subject[-1] == CHAR_CR &&
3109  (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3110  current_subject < end_subject &&
3111  *current_subject == CHAR_NL)
3112  current_subject++;
3113  }
3114  }
3115 
3116  /* Or to a non-unique first char after study */
3117 
3118  else if (start_bits != NULL)
3119  {
3120  while (current_subject < end_subject)
3121  {
3122  register unsigned int c = *current_subject;
3123  if ((start_bits[c/8] & (1 << (c&7))) == 0)
3124  {
3125  current_subject++;
3126 #ifdef SUPPORT_UTF8
3127  if (utf8)
3128  while(current_subject < end_subject &&
3129  (*current_subject & 0xc0) == 0x80) current_subject++;
3130 #endif
3131  }
3132  else break;
3133  }
3134  }
3135  }
3136 
3137  /* Restore fudged end_subject */
3138 
3139  end_subject = save_end_subject;
3140 
3141  /* The following two optimizations are disabled for partial matching or if
3142  disabling is explicitly requested (and of course, by the test above, this
3143  code is not obeyed when restarting after a partial match). */
3144 
3145  if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3146  (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3147  {
3148  /* If the pattern was studied, a minimum subject length may be set. This
3149  is a lower bound; no actual string of that length may actually match the
3150  pattern. Although the value is, strictly, in characters, we treat it as
3151  bytes to avoid spending too much time in this optimization. */
3152 
3153  if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3154  (pcre_uint32)(end_subject - current_subject) < study->minlength)
3155  return PCRE_ERROR_NOMATCH;
3156 
3157  /* If req_byte is set, we know that that character must appear in the
3158  subject for the match to succeed. If the first character is set, req_byte
3159  must be later in the subject; otherwise the test starts at the match
3160  point. This optimization can save a huge amount of work in patterns with
3161  nested unlimited repeats that aren't going to match. Writing separate
3162  code for cased/caseless versions makes it go faster, as does using an
3163  autoincrement and backing off on a match.
3164 
3165  HOWEVER: when the subject string is very, very long, searching to its end
3166  can take a long time, and give bad performance on quite ordinary
3167  patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3168  string... so we don't do this when the string is sufficiently long. */
3169 
3170  if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3171  {
3172  register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3173 
3174  /* We don't need to repeat the search if we haven't yet reached the
3175  place we found it at last time. */
3176 
3177  if (p > req_byte_ptr)
3178  {
3179  if (req_byte_caseless)
3180  {
3181  while (p < end_subject)
3182  {
3183  register int pp = *p++;
3184  if (pp == req_byte || pp == req_byte2) { p--; break; }
3185  }
3186  }
3187  else
3188  {
3189  while (p < end_subject)
3190  {
3191  if (*p++ == req_byte) { p--; break; }
3192  }
3193  }
3194 
3195  /* If we can't find the required character, break the matching loop,
3196  which will cause a return of PCRE_ERROR_NOMATCH. */
3197 
3198  if (p >= end_subject) break;
3199 
3200  /* If we have found the required character, save the point where we
3201  found it, so that we don't search again next time round the loop if
3202  the start hasn't passed this character yet. */
3203 
3204  req_byte_ptr = p;
3205  }
3206  }
3207  }
3208  } /* End of optimizations that are done when not restarting */
3209 
3210  /* OK, now we can do the business */
3211 
3212  md->start_used_ptr = current_subject;
3213 
3214  rc = internal_dfa_exec(
3215  md, /* fixed match data */
3216  md->start_code, /* this subexpression's code */
3217  current_subject, /* where we currently are */
3218  start_offset, /* start offset in subject */
3219  offsets, /* offset vector */
3220  offsetcount, /* size of same */
3221  workspace, /* workspace vector */
3222  wscount, /* size of same */
3223  re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3224  0, /* function recurse level */
3225  0); /* regex recurse level */
3226 
3227  /* Anything other than "no match" means we are done, always; otherwise, carry
3228  on only if not anchored. */
3229 
3230  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3231 
3232  /* Advance to the next subject character unless we are at the end of a line
3233  and firstline is set. */
3234 
3235  if (firstline && IS_NEWLINE(current_subject)) break;
3236  current_subject++;
3237  if (utf8)
3238  {
3239  while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3240  current_subject++;
3241  }
3242  if (current_subject > end_subject) break;
3243 
3244  /* If we have just passed a CR and we are now at a LF, and the pattern does
3245  not contain any explicit matches for \r or \n, and the newline option is CRLF
3246  or ANY or ANYCRLF, advance the match position by one more character. */
3247 
3248  if (current_subject[-1] == CHAR_CR &&
3249  current_subject < end_subject &&
3250  *current_subject == CHAR_NL &&
3251  (re->flags & PCRE_HASCRORLF) == 0 &&
3252  (md->nltype == NLTYPE_ANY ||
3253  md->nltype == NLTYPE_ANYCRLF ||
3254  md->nllen == 2))
3255  current_subject++;
3256 
3257  } /* "Bumpalong" loop */
3258 
3259 return PCRE_ERROR_NOMATCH;
3260 }
3261 
3262 /* End of pcre_dfa_exec.c */