regc_nfa.c 97.6 KB
Newer Older
1 2 3 4
/*
 * NFA utilities.
 * This file is #included by regcomp.c.
 *
Bruce Momjian's avatar
Bruce Momjian committed
5
 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
Bruce Momjian's avatar
Bruce Momjian committed
6
 *
7 8 9
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results.  The author
Bruce Momjian's avatar
Bruce Momjian committed
10 11
 * thanks all of them.
 *
12 13 14 15
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
Bruce Momjian's avatar
Bruce Momjian committed
16
 *
17 18
 * I'd appreciate being given credit for this package in the documentation
 * of software which uses it, but that is not a requirement.
Bruce Momjian's avatar
Bruce Momjian committed
19
 *
20 21 22 23 24 25 26 27 28 29 30
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
31
 * src/backend/regex/regc_nfa.c
32 33 34 35 36 37 38
 *
 *
 * One or two things that technically ought to be in here
 * are actually in color.c, thanks to some incestuous relationships in
 * the color chains.
 */

Bruce Momjian's avatar
Bruce Momjian committed
39 40
#define NISERR()	VISERR(nfa->v)
#define NERR(e)		VERR(nfa->v, (e))
41 42 43 44 45


/*
 * newnfa - set up an NFA
 */
Bruce Momjian's avatar
Bruce Momjian committed
46
static struct nfa *				/* the NFA, or NULL */
47 48 49
newnfa(struct vars *v,
	   struct colormap *cm,
	   struct nfa *parent)		/* NULL if primary NFA */
50 51 52
{
	struct nfa *nfa;

Bruce Momjian's avatar
Bruce Momjian committed
53
	nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54
	if (nfa == NULL)
55 56
	{
		ERR(REG_ESPACE);
57
		return NULL;
58
	}
59

60
	/* Make the NFA minimally valid, so freenfa() will behave sanely */
61 62
	nfa->states = NULL;
	nfa->slast = NULL;
63 64 65 66 67 68
	nfa->freestates = NULL;
	nfa->freearcs = NULL;
	nfa->lastsb = NULL;
	nfa->lastab = NULL;
	nfa->lastsbused = 0;
	nfa->lastabused = 0;
69 70 71 72 73
	nfa->nstates = 0;
	nfa->cm = cm;
	nfa->v = v;
	nfa->bos[0] = nfa->bos[1] = COLORLESS;
	nfa->eos[0] = nfa->eos[1] = COLORLESS;
74 75
	nfa->flags = 0;
	nfa->minmatchall = nfa->maxmatchall = -1;
76
	nfa->parent = parent;		/* Precedes newfstate so parent is valid. */
77 78

	/* Create required infrastructure */
79
	nfa->post = newfstate(nfa, '@');	/* number 0 */
Tom Lane's avatar
Tom Lane committed
80
	nfa->pre = newfstate(nfa, '>'); /* number 1 */
Bruce Momjian's avatar
Bruce Momjian committed
81
	nfa->init = newstate(nfa);	/* may become invalid later */
82
	nfa->final = newstate(nfa);
Bruce Momjian's avatar
Bruce Momjian committed
83 84
	if (ISERR())
	{
85 86 87 88 89 90 91 92 93 94
		freenfa(nfa);
		return NULL;
	}
	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
	newarc(nfa, '^', 1, nfa->pre, nfa->init);
	newarc(nfa, '^', 0, nfa->pre, nfa->init);
	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
	newarc(nfa, '$', 1, nfa->final, nfa->post);
	newarc(nfa, '$', 0, nfa->final, nfa->post);

Bruce Momjian's avatar
Bruce Momjian committed
95 96
	if (ISERR())
	{
97 98 99 100 101 102 103 104 105 106
		freenfa(nfa);
		return NULL;
	}
	return nfa;
}

/*
 * freenfa - free an entire NFA
 */
static void
107
freenfa(struct nfa *nfa)
108
{
109 110 111 112
	struct statebatch *sb;
	struct statebatch *sbnext;
	struct arcbatch *ab;
	struct arcbatch *abnext;
113

114
	for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
Bruce Momjian's avatar
Bruce Momjian committed
115
	{
116 117 118
		sbnext = sb->next;
		nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
		FREE(sb);
119
	}
120 121
	nfa->lastsb = NULL;
	for (ab = nfa->lastab; ab != NULL; ab = abnext)
Bruce Momjian's avatar
Bruce Momjian committed
122
	{
123 124 125
		abnext = ab->next;
		nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
		FREE(ab);
126
	}
127
	nfa->lastab = NULL;
128 129 130 131 132 133 134 135

	nfa->nstates = -1;
	FREE(nfa);
}

/*
 * newstate - allocate an NFA state, with zero flag value
 */
Bruce Momjian's avatar
Bruce Momjian committed
136
static struct state *			/* NULL on error */
137
newstate(struct nfa *nfa)
138 139 140
{
	struct state *s;

141 142 143
	/*
	 * This is a handy place to check for operation cancel during regex
	 * compilation, since no code path will go very long without making a new
144
	 * state or arc.
145 146 147 148 149 150 151
	 */
	if (CANCEL_REQUESTED(nfa->v->re))
	{
		NERR(REG_CANCEL);
		return NULL;
	}

152 153 154 155 156 157 158 159
	/* first, recycle anything that's on the freelist */
	if (nfa->freestates != NULL)
	{
		s = nfa->freestates;
		nfa->freestates = s->next;
	}
	/* otherwise, is there anything left in the last statebatch? */
	else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
Bruce Momjian's avatar
Bruce Momjian committed
160
	{
161
		s = &nfa->lastsb->s[nfa->lastsbused++];
Bruce Momjian's avatar
Bruce Momjian committed
162
	}
163
	/* otherwise, need to allocate a new statebatch */
Bruce Momjian's avatar
Bruce Momjian committed
164 165
	else
	{
166 167 168
		struct statebatch *newSb;
		size_t		nstates;

169 170 171 172 173
		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
		{
			NERR(REG_ETOOBIG);
			return NULL;
		}
174 175 176 177 178
		nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
		if (nstates > MAXSBSIZE)
			nstates = MAXSBSIZE;
		newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
		if (newSb == NULL)
Bruce Momjian's avatar
Bruce Momjian committed
179
		{
180 181 182
			NERR(REG_ESPACE);
			return NULL;
		}
183 184 185 186 187 188
		nfa->v->spaceused += STATEBATCHSIZE(nstates);
		newSb->nstates = nstates;
		newSb->next = nfa->lastsb;
		nfa->lastsb = newSb;
		nfa->lastsbused = 1;
		s = &newSb->s[0];
189 190 191 192 193 194 195 196 197 198 199 200 201
	}

	assert(nfa->nstates >= 0);
	s->no = nfa->nstates++;
	s->flag = 0;
	if (nfa->states == NULL)
		nfa->states = s;
	s->nins = 0;
	s->ins = NULL;
	s->nouts = 0;
	s->outs = NULL;
	s->tmp = NULL;
	s->next = NULL;
Bruce Momjian's avatar
Bruce Momjian committed
202 203
	if (nfa->slast != NULL)
	{
204 205 206 207 208 209 210 211 212 213 214
		assert(nfa->slast->next == NULL);
		nfa->slast->next = s;
	}
	s->prev = nfa->slast;
	nfa->slast = s;
	return s;
}

/*
 * newfstate - allocate an NFA state with a specified flag value
 */
Bruce Momjian's avatar
Bruce Momjian committed
215
static struct state *			/* NULL on error */
216
newfstate(struct nfa *nfa, int flag)
217 218 219 220 221
{
	struct state *s;

	s = newstate(nfa);
	if (s != NULL)
Bruce Momjian's avatar
Bruce Momjian committed
222
		s->flag = (char) flag;
223 224 225 226 227 228 229
	return s;
}

/*
 * dropstate - delete a state's inarcs and outarcs and free it
 */
static void
230 231
dropstate(struct nfa *nfa,
		  struct state *s)
232 233 234 235 236 237 238 239 240 241 242 243 244 245
{
	struct arc *a;

	while ((a = s->ins) != NULL)
		freearc(nfa, a);
	while ((a = s->outs) != NULL)
		freearc(nfa, a);
	freestate(nfa, s);
}

/*
 * freestate - free a state, which has no in-arcs or out-arcs
 */
static void
246 247
freestate(struct nfa *nfa,
		  struct state *s)
248 249 250 251 252 253 254 255
{
	assert(s != NULL);
	assert(s->nins == 0 && s->nouts == 0);

	s->no = FREESTATE;
	s->flag = 0;
	if (s->next != NULL)
		s->next->prev = s->prev;
Bruce Momjian's avatar
Bruce Momjian committed
256 257
	else
	{
258 259 260 261 262
		assert(s == nfa->slast);
		nfa->slast = s->prev;
	}
	if (s->prev != NULL)
		s->prev->next = s->next;
Bruce Momjian's avatar
Bruce Momjian committed
263 264
	else
	{
265 266 267 268
		assert(s == nfa->states);
		nfa->states = s->next;
	}
	s->prev = NULL;
269 270
	s->next = nfa->freestates;	/* don't delete it, put it on the free list */
	nfa->freestates = s;
271 272 273 274
}

/*
 * newarc - set up a new arc within an NFA
275 276 277
 *
 * This function checks to make sure that no duplicate arcs are created.
 * In general we never want duplicates.
278 279 280 281 282
 *
 * However: in principle, a RAINBOW arc is redundant with any plain arc
 * (unless that arc is for a pseudocolor).  But we don't try to recognize
 * that redundancy, either here or in allied operations such as moveins().
 * The pseudocolor consideration makes that more costly than it seems worth.
283 284
 */
static void
285
newarc(struct nfa *nfa,
286
	   int t,
287
	   color co,
288 289
	   struct state *from,
	   struct state *to)
290 291 292 293 294
{
	struct arc *a;

	assert(from != NULL && to != NULL);

295 296 297 298 299 300 301 302 303 304 305
	/*
	 * This is a handy place to check for operation cancel during regex
	 * compilation, since no code path will go very long without making a new
	 * state or arc.
	 */
	if (CANCEL_REQUESTED(nfa->v->re))
	{
		NERR(REG_CANCEL);
		return;
	}

306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
	/* check for duplicate arc, using whichever chain is shorter */
	if (from->nouts <= to->nins)
	{
		for (a = from->outs; a != NULL; a = a->outchain)
			if (a->to == to && a->co == co && a->type == t)
				return;
	}
	else
	{
		for (a = to->ins; a != NULL; a = a->inchain)
			if (a->from == from && a->co == co && a->type == t)
				return;
	}

	/* no dup, so create the arc */
	createarc(nfa, t, co, from, to);
}

/*
 * createarc - create a new arc within an NFA
 *
 * This function must *only* be used after verifying that there is no existing
 * identical arc (same type/color/from/to).
 */
static void
331
createarc(struct nfa *nfa,
332
		  int t,
333
		  color co,
334 335
		  struct state *from,
		  struct state *to)
336 337
{
	struct arc *a;
338

339
	a = allocarc(nfa);
340 341 342 343 344
	if (NISERR())
		return;
	assert(a != NULL);

	a->type = t;
345
	a->co = co;
346 347 348 349
	a->to = to;
	a->from = from;

	/*
350 351 352
	 * Put the new arc on the beginning, not the end, of the chains; it's
	 * simpler here, and freearc() is the same cost either way.  See also the
	 * logic in moveins() and its cohorts, as well as fixempties().
353 354
	 */
	a->inchain = to->ins;
355 356 357
	a->inchainRev = NULL;
	if (to->ins)
		to->ins->inchainRev = a;
358 359
	to->ins = a;
	a->outchain = from->outs;
360 361 362
	a->outchainRev = NULL;
	if (from->outs)
		from->outs->outchainRev = a;
363 364 365 366 367 368 369 370 371 372
	from->outs = a;

	from->nouts++;
	to->nins++;

	if (COLORED(a) && nfa->parent == NULL)
		colorchain(nfa->cm, a);
}

/*
373
 * allocarc - allocate a new arc within an NFA
374
 */
Bruce Momjian's avatar
Bruce Momjian committed
375
static struct arc *				/* NULL for failure */
376
allocarc(struct nfa *nfa)
377 378 379
{
	struct arc *a;

380 381
	/* first, recycle anything that's on the freelist */
	if (nfa->freearcs != NULL)
Bruce Momjian's avatar
Bruce Momjian committed
382
	{
383 384
		a = nfa->freearcs;
		nfa->freearcs = a->freechain;
385
	}
386 387 388 389 390 391 392
	/* otherwise, is there anything left in the last arcbatch? */
	else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
	{
		a = &nfa->lastab->a[nfa->lastabused++];
	}
	/* otherwise, need to allocate a new arcbatch */
	else
Bruce Momjian's avatar
Bruce Momjian committed
393
	{
394
		struct arcbatch *newAb;
395
		size_t		narcs;
396

397 398 399 400 401
		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
		{
			NERR(REG_ETOOBIG);
			return NULL;
		}
402 403 404 405
		narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
		if (narcs > MAXABSIZE)
			narcs = MAXABSIZE;
		newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
406
		if (newAb == NULL)
Bruce Momjian's avatar
Bruce Momjian committed
407
		{
408 409 410
			NERR(REG_ESPACE);
			return NULL;
		}
411 412 413 414 415 416
		nfa->v->spaceused += ARCBATCHSIZE(narcs);
		newAb->narcs = narcs;
		newAb->next = nfa->lastab;
		nfa->lastab = newAb;
		nfa->lastabused = 1;
		a = &newAb->a[0];
417 418 419 420 421 422 423 424 425
	}

	return a;
}

/*
 * freearc - free an arc
 */
static void
426 427
freearc(struct nfa *nfa,
		struct arc *victim)
428 429 430
{
	struct state *from = victim->from;
	struct state *to = victim->to;
431
	struct arc *predecessor;
432 433 434 435 436 437 438 439 440

	assert(victim->type != 0);

	/* take it off color chain if necessary */
	if (COLORED(victim) && nfa->parent == NULL)
		uncolorchain(nfa->cm, victim);

	/* take it off source's out-chain */
	assert(from != NULL);
441 442 443 444
	predecessor = victim->outchainRev;
	if (predecessor == NULL)
	{
		assert(from->outs == victim);
445
		from->outs = victim->outchain;
446
	}
Bruce Momjian's avatar
Bruce Momjian committed
447 448
	else
	{
449 450 451 452 453 454 455
		assert(predecessor->outchain == victim);
		predecessor->outchain = victim->outchain;
	}
	if (victim->outchain != NULL)
	{
		assert(victim->outchain->outchainRev == victim);
		victim->outchain->outchainRev = predecessor;
456 457 458 459 460
	}
	from->nouts--;

	/* take it off target's in-chain */
	assert(to != NULL);
461 462 463 464
	predecessor = victim->inchainRev;
	if (predecessor == NULL)
	{
		assert(to->ins == victim);
465
		to->ins = victim->inchain;
466
	}
Bruce Momjian's avatar
Bruce Momjian committed
467 468
	else
	{
469 470 471 472 473 474 475
		assert(predecessor->inchain == victim);
		predecessor->inchain = victim->inchain;
	}
	if (victim->inchain != NULL)
	{
		assert(victim->inchain->inchainRev == victim);
		victim->inchain->inchainRev = predecessor;
476 477 478
	}
	to->nins--;

479
	/* clean up and place on NFA's free list */
480 481 482 483
	victim->type = 0;
	victim->from = NULL;		/* precautions... */
	victim->to = NULL;
	victim->inchain = NULL;
484
	victim->inchainRev = NULL;
485
	victim->outchain = NULL;
486
	victim->outchainRev = NULL;
487 488
	victim->freechain = nfa->freearcs;
	nfa->freearcs = victim;
489 490
}

491
/*
492
 * changearcsource - flip an arc to have a different from state
493 494
 *
 * Caller must have verified that there is no pre-existing duplicate arc.
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
 */
static void
changearcsource(struct arc *a, struct state *newfrom)
{
	struct state *oldfrom = a->from;
	struct arc *predecessor;

	assert(oldfrom != newfrom);

	/* take it off old source's out-chain */
	assert(oldfrom != NULL);
	predecessor = a->outchainRev;
	if (predecessor == NULL)
	{
		assert(oldfrom->outs == a);
		oldfrom->outs = a->outchain;
	}
	else
	{
		assert(predecessor->outchain == a);
		predecessor->outchain = a->outchain;
	}
	if (a->outchain != NULL)
	{
		assert(a->outchain->outchainRev == a);
		a->outchain->outchainRev = predecessor;
	}
	oldfrom->nouts--;

	a->from = newfrom;

	/* prepend it to new source's out-chain */
	a->outchain = newfrom->outs;
	a->outchainRev = NULL;
	if (newfrom->outs)
		newfrom->outs->outchainRev = a;
	newfrom->outs = a;
	newfrom->nouts++;
}

/*
 * changearctarget - flip an arc to have a different to state
537
 *
538
 * Caller must have verified that there is no pre-existing duplicate arc.
539 540
 */
static void
541
changearctarget(struct arc *a, struct state *newto)
542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
{
	struct state *oldto = a->to;
	struct arc *predecessor;

	assert(oldto != newto);

	/* take it off old target's in-chain */
	assert(oldto != NULL);
	predecessor = a->inchainRev;
	if (predecessor == NULL)
	{
		assert(oldto->ins == a);
		oldto->ins = a->inchain;
	}
	else
	{
		assert(predecessor->inchain == a);
		predecessor->inchain = a->inchain;
	}
	if (a->inchain != NULL)
	{
		assert(a->inchain->inchainRev == a);
		a->inchain->inchainRev = predecessor;
	}
	oldto->nins--;

	a->to = newto;

	/* prepend it to new target's in-chain */
	a->inchain = newto->ins;
	a->inchainRev = NULL;
	if (newto->ins)
		newto->ins->inchainRev = a;
	newto->ins = a;
	newto->nins++;
}

579 580 581 582
/*
 * hasnonemptyout - Does state have a non-EMPTY out arc?
 */
static int
583
hasnonemptyout(struct state *s)
584 585 586 587 588 589 590 591 592 593 594
{
	struct arc *a;

	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (a->type != EMPTY)
			return 1;
	}
	return 0;
}

595 596 597 598 599
/*
 * findarc - find arc, if any, from given source with given type and color
 * If there is more than one such arc, the result is random.
 */
static struct arc *
600
findarc(struct state *s,
601
		int type,
602
		color co)
603 604 605 606 607 608 609 610 611 612 613 614 615
{
	struct arc *a;

	for (a = s->outs; a != NULL; a = a->outchain)
		if (a->type == type && a->co == co)
			return a;
	return NULL;
}

/*
 * cparc - allocate a new arc within an NFA, copying details from old one
 */
static void
616 617 618 619
cparc(struct nfa *nfa,
	  struct arc *oa,
	  struct state *from,
	  struct state *to)
620 621 622 623
{
	newarc(nfa, oa->type, oa->co, from, to);
}

624 625 626 627
/*
 * sortins - sort the in arcs of a state by from/color/type
 */
static void
628 629
sortins(struct nfa *nfa,
		struct state *s)
630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
{
	struct arc **sortarray;
	struct arc *a;
	int			n = s->nins;
	int			i;

	if (n <= 1)
		return;					/* nothing to do */
	/* make an array of arc pointers ... */
	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
	if (sortarray == NULL)
	{
		NERR(REG_ESPACE);
		return;
	}
	i = 0;
	for (a = s->ins; a != NULL; a = a->inchain)
		sortarray[i++] = a;
	assert(i == n);
	/* ... sort the array */
	qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
	/* ... and rebuild arc list in order */
	/* it seems worth special-casing first and last items to simplify loop */
	a = sortarray[0];
	s->ins = a;
	a->inchain = sortarray[1];
	a->inchainRev = NULL;
	for (i = 1; i < n - 1; i++)
	{
		a = sortarray[i];
		a->inchain = sortarray[i + 1];
		a->inchainRev = sortarray[i - 1];
	}
	a = sortarray[i];
	a->inchain = NULL;
	a->inchainRev = sortarray[i - 1];
	FREE(sortarray);
}

static int
sortins_cmp(const void *a, const void *b)
{
672 673
	const struct arc *aa = *((const struct arc *const *) a);
	const struct arc *bb = *((const struct arc *const *) b);
674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694

	/* we check the fields in the order they are most likely to be different */
	if (aa->from->no < bb->from->no)
		return -1;
	if (aa->from->no > bb->from->no)
		return 1;
	if (aa->co < bb->co)
		return -1;
	if (aa->co > bb->co)
		return 1;
	if (aa->type < bb->type)
		return -1;
	if (aa->type > bb->type)
		return 1;
	return 0;
}

/*
 * sortouts - sort the out arcs of a state by to/color/type
 */
static void
695 696
sortouts(struct nfa *nfa,
		 struct state *s)
697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
{
	struct arc **sortarray;
	struct arc *a;
	int			n = s->nouts;
	int			i;

	if (n <= 1)
		return;					/* nothing to do */
	/* make an array of arc pointers ... */
	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
	if (sortarray == NULL)
	{
		NERR(REG_ESPACE);
		return;
	}
	i = 0;
	for (a = s->outs; a != NULL; a = a->outchain)
		sortarray[i++] = a;
	assert(i == n);
	/* ... sort the array */
	qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
	/* ... and rebuild arc list in order */
	/* it seems worth special-casing first and last items to simplify loop */
	a = sortarray[0];
	s->outs = a;
	a->outchain = sortarray[1];
	a->outchainRev = NULL;
	for (i = 1; i < n - 1; i++)
	{
		a = sortarray[i];
		a->outchain = sortarray[i + 1];
		a->outchainRev = sortarray[i - 1];
	}
	a = sortarray[i];
	a->outchain = NULL;
	a->outchainRev = sortarray[i - 1];
	FREE(sortarray);
}

static int
sortouts_cmp(const void *a, const void *b)
{
739 740
	const struct arc *aa = *((const struct arc *const *) a);
	const struct arc *bb = *((const struct arc *const *) b);
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768

	/* we check the fields in the order they are most likely to be different */
	if (aa->to->no < bb->to->no)
		return -1;
	if (aa->to->no > bb->to->no)
		return 1;
	if (aa->co < bb->co)
		return -1;
	if (aa->co > bb->co)
		return 1;
	if (aa->type < bb->type)
		return -1;
	if (aa->type > bb->type)
		return 1;
	return 0;
}

/*
 * Common decision logic about whether to use arc-by-arc operations or
 * sort/merge.  If there's just a few source arcs we cannot recoup the
 * cost of sorting the destination arc list, no matter how large it is.
 * Otherwise, limit the number of arc-by-arc comparisons to about 1000
 * (a somewhat arbitrary choice, but the breakeven point would probably
 * be machine dependent anyway).
 */
#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
	((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))

769 770 771 772
/*
 * moveins - move all in arcs of a state to another state
 *
 * You might think this could be done better by just updating the
773
 * existing arcs, and you would be right if it weren't for the need
774 775
 * for duplicate suppression, which makes it easier to just make new
 * ones to exploit the suppression built into newarc.
776 777 778 779
 *
 * However, if we have a whole lot of arcs to deal with, retail duplicate
 * checks become too slow.  In that case we proceed by sorting and merging
 * the arc lists, and then we can indeed just update the arcs in-place.
780 781
 */
static void
782 783 784
moveins(struct nfa *nfa,
		struct state *oldState,
		struct state *newState)
785
{
786
	assert(oldState != newState);
787

788
	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
Bruce Momjian's avatar
Bruce Momjian committed
789
	{
790 791 792 793 794 795 796 797
		/* With not too many arcs, just do them one at a time */
		struct arc *a;

		while ((a = oldState->ins) != NULL)
		{
			cparc(nfa, a, a->from, newState);
			freearc(nfa, a);
		}
798
	}
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
	else
	{
		/*
		 * With many arcs, use a sort-merge approach.  Note changearctarget()
		 * will put the arc onto the front of newState's chain, so it does not
		 * break our walk through the sorted part of the chain.
		 */
		struct arc *oa;
		struct arc *na;

		/*
		 * Because we bypass newarc() in this code path, we'd better include a
		 * cancel check.
		 */
		if (CANCEL_REQUESTED(nfa->v->re))
		{
			NERR(REG_CANCEL);
			return;
		}

		sortins(nfa, oldState);
		sortins(nfa, newState);
		if (NISERR())
			return;				/* might have failed to sort */
		oa = oldState->ins;
		na = newState->ins;
		while (oa != NULL && na != NULL)
		{
			struct arc *a = oa;

			switch (sortins_cmp(&oa, &na))
			{
				case -1:
					/* newState does not have anything matching oa */
					oa = oa->inchain;

					/*
					 * Rather than doing createarc+freearc, we can just unlink
					 * and relink the existing arc struct.
					 */
					changearctarget(a, newState);
					break;
				case 0:
					/* match, advance in both lists */
					oa = oa->inchain;
					na = na->inchain;
					/* ... and drop duplicate arc from oldState */
					freearc(nfa, a);
					break;
				case +1:
					/* advance only na; oa might have a match later */
					na = na->inchain;
					break;
				default:
					assert(NOTREACHED);
			}
		}
		while (oa != NULL)
		{
			/* newState does not have anything matching oa */
			struct arc *a = oa;

			oa = oa->inchain;
			changearctarget(a, newState);
		}
	}

866 867
	assert(oldState->nins == 0);
	assert(oldState->ins == NULL);
868 869 870
}

/*
871
 * copyins - copy in arcs of a state to another state
872 873
 */
static void
874 875 876
copyins(struct nfa *nfa,
		struct state *oldState,
		struct state *newState)
877
{
878
	assert(oldState != newState);
879

880
	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
881
	{
882 883 884 885
		/* With not too many arcs, just do them one at a time */
		struct arc *a;

		for (a = oldState->ins; a != NULL; a = a->inchain)
886
			cparc(nfa, a, a->from, newState);
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955
	}
	else
	{
		/*
		 * With many arcs, use a sort-merge approach.  Note that createarc()
		 * will put new arcs onto the front of newState's chain, so it does
		 * not break our walk through the sorted part of the chain.
		 */
		struct arc *oa;
		struct arc *na;

		/*
		 * Because we bypass newarc() in this code path, we'd better include a
		 * cancel check.
		 */
		if (CANCEL_REQUESTED(nfa->v->re))
		{
			NERR(REG_CANCEL);
			return;
		}

		sortins(nfa, oldState);
		sortins(nfa, newState);
		if (NISERR())
			return;				/* might have failed to sort */
		oa = oldState->ins;
		na = newState->ins;
		while (oa != NULL && na != NULL)
		{
			struct arc *a = oa;

			switch (sortins_cmp(&oa, &na))
			{
				case -1:
					/* newState does not have anything matching oa */
					oa = oa->inchain;
					createarc(nfa, a->type, a->co, a->from, newState);
					break;
				case 0:
					/* match, advance in both lists */
					oa = oa->inchain;
					na = na->inchain;
					break;
				case +1:
					/* advance only na; oa might have a match later */
					na = na->inchain;
					break;
				default:
					assert(NOTREACHED);
			}
		}
		while (oa != NULL)
		{
			/* newState does not have anything matching oa */
			struct arc *a = oa;

			oa = oa->inchain;
			createarc(nfa, a->type, a->co, a->from, newState);
		}
	}
}

/*
 * mergeins - merge a list of inarcs into a state
 *
 * This is much like copyins, but the source arcs are listed in an array,
 * and are not guaranteed unique.  It's okay to clobber the array contents.
 */
static void
956 957 958
mergeins(struct nfa *nfa,
		 struct state *s,
		 struct arc **arcarray,
959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045
		 int arccount)
{
	struct arc *na;
	int			i;
	int			j;

	if (arccount <= 0)
		return;

	/*
	 * Because we bypass newarc() in this code path, we'd better include a
	 * cancel check.
	 */
	if (CANCEL_REQUESTED(nfa->v->re))
	{
		NERR(REG_CANCEL);
		return;
	}

	/* Sort existing inarcs as well as proposed new ones */
	sortins(nfa, s);
	if (NISERR())
		return;					/* might have failed to sort */

	qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);

	/*
	 * arcarray very likely includes dups, so we must eliminate them.  (This
	 * could be folded into the next loop, but it's not worth the trouble.)
	 */
	j = 0;
	for (i = 1; i < arccount; i++)
	{
		switch (sortins_cmp(&arcarray[j], &arcarray[i]))
		{
			case -1:
				/* non-dup */
				arcarray[++j] = arcarray[i];
				break;
			case 0:
				/* dup */
				break;
			default:
				/* trouble */
				assert(NOTREACHED);
		}
	}
	arccount = j + 1;

	/*
	 * Now merge into s' inchain.  Note that createarc() will put new arcs
	 * onto the front of s's chain, so it does not break our walk through the
	 * sorted part of the chain.
	 */
	i = 0;
	na = s->ins;
	while (i < arccount && na != NULL)
	{
		struct arc *a = arcarray[i];

		switch (sortins_cmp(&a, &na))
		{
			case -1:
				/* s does not have anything matching a */
				createarc(nfa, a->type, a->co, a->from, s);
				i++;
				break;
			case 0:
				/* match, advance in both lists */
				i++;
				na = na->inchain;
				break;
			case +1:
				/* advance only na; array might have a match later */
				na = na->inchain;
				break;
			default:
				assert(NOTREACHED);
		}
	}
	while (i < arccount)
	{
		/* s does not have anything matching a */
		struct arc *a = arcarray[i];

		createarc(nfa, a->type, a->co, a->from, s);
		i++;
1046
	}
1047 1048 1049 1050
}

/*
 * moveouts - move all out arcs of a state to another state
1051 1052
 *
 * See comments for moveins()
1053 1054
 */
static void
1055 1056 1057
moveouts(struct nfa *nfa,
		 struct state *oldState,
		 struct state *newState)
1058
{
1059
	assert(oldState != newState);
1060

1061
	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
Bruce Momjian's avatar
Bruce Momjian committed
1062
	{
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
		/* With not too many arcs, just do them one at a time */
		struct arc *a;

		while ((a = oldState->outs) != NULL)
		{
			cparc(nfa, a, newState, a->to);
			freearc(nfa, a);
		}
	}
	else
	{
		/*
1075 1076 1077
		 * With many arcs, use a sort-merge approach.  Note changearcsource()
		 * will put the arc onto the front of newState's chain, so it does not
		 * break our walk through the sorted part of the chain.
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
		 */
		struct arc *oa;
		struct arc *na;

		/*
		 * Because we bypass newarc() in this code path, we'd better include a
		 * cancel check.
		 */
		if (CANCEL_REQUESTED(nfa->v->re))
		{
			NERR(REG_CANCEL);
			return;
		}

		sortouts(nfa, oldState);
		sortouts(nfa, newState);
		if (NISERR())
			return;				/* might have failed to sort */
		oa = oldState->outs;
		na = newState->outs;
		while (oa != NULL && na != NULL)
		{
			struct arc *a = oa;

			switch (sortouts_cmp(&oa, &na))
			{
				case -1:
					/* newState does not have anything matching oa */
					oa = oa->outchain;
1107 1108 1109 1110 1111 1112

					/*
					 * Rather than doing createarc+freearc, we can just unlink
					 * and relink the existing arc struct.
					 */
					changearcsource(a, newState);
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
					break;
				case 0:
					/* match, advance in both lists */
					oa = oa->outchain;
					na = na->outchain;
					/* ... and drop duplicate arc from oldState */
					freearc(nfa, a);
					break;
				case +1:
					/* advance only na; oa might have a match later */
					na = na->outchain;
					break;
				default:
					assert(NOTREACHED);
			}
		}
		while (oa != NULL)
		{
			/* newState does not have anything matching oa */
			struct arc *a = oa;

			oa = oa->outchain;
1135
			changearcsource(a, newState);
1136
		}
1137
	}
1138 1139 1140

	assert(oldState->nouts == 0);
	assert(oldState->outs == NULL);
1141 1142 1143
}

/*
1144
 * copyouts - copy out arcs of a state to another state
1145 1146
 */
static void
1147 1148 1149
copyouts(struct nfa *nfa,
		 struct state *oldState,
		 struct state *newState)
1150
{
1151
	assert(oldState != newState);
1152

1153
	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1154
	{
1155 1156 1157 1158
		/* With not too many arcs, just do them one at a time */
		struct arc *a;

		for (a = oldState->outs; a != NULL; a = a->outchain)
1159
			cparc(nfa, a, newState, a->to);
1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
	}
	else
	{
		/*
		 * With many arcs, use a sort-merge approach.  Note that createarc()
		 * will put new arcs onto the front of newState's chain, so it does
		 * not break our walk through the sorted part of the chain.
		 */
		struct arc *oa;
		struct arc *na;

		/*
		 * Because we bypass newarc() in this code path, we'd better include a
		 * cancel check.
		 */
		if (CANCEL_REQUESTED(nfa->v->re))
		{
			NERR(REG_CANCEL);
			return;
		}

		sortouts(nfa, oldState);
		sortouts(nfa, newState);
		if (NISERR())
			return;				/* might have failed to sort */
		oa = oldState->outs;
		na = newState->outs;
		while (oa != NULL && na != NULL)
		{
			struct arc *a = oa;

			switch (sortouts_cmp(&oa, &na))
			{
				case -1:
					/* newState does not have anything matching oa */
					oa = oa->outchain;
					createarc(nfa, a->type, a->co, newState, a->to);
					break;
				case 0:
					/* match, advance in both lists */
					oa = oa->outchain;
					na = na->outchain;
					break;
				case +1:
					/* advance only na; oa might have a match later */
					na = na->outchain;
					break;
				default:
					assert(NOTREACHED);
			}
		}
		while (oa != NULL)
		{
			/* newState does not have anything matching oa */
			struct arc *a = oa;

			oa = oa->outchain;
			createarc(nfa, a->type, a->co, newState, a->to);
		}
1219
	}
1220 1221 1222 1223
}

/*
 * cloneouts - copy out arcs of a state to another state pair, modifying type
1224 1225 1226
 *
 * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
 * the same interpretation of "co".  It wouldn't be sensible with LACONs.
1227 1228
 */
static void
1229 1230 1231 1232
cloneouts(struct nfa *nfa,
		  struct state *old,
		  struct state *from,
		  struct state *to,
1233 1234 1235 1236 1237
		  int type)
{
	struct arc *a;

	assert(old != from);
1238
	assert(type == AHEAD || type == BEHIND);
1239 1240

	for (a = old->outs; a != NULL; a = a->outchain)
1241 1242
	{
		assert(a->type == PLAIN);
1243
		newarc(nfa, type, a->co, from, to);
1244
	}
1245 1246 1247 1248 1249 1250 1251 1252 1253
}

/*
 * delsub - delete a sub-NFA, updating subre pointers if necessary
 *
 * This uses a recursive traversal of the sub-NFA, marking already-seen
 * states using their tmp pointer.
 */
static void
1254 1255 1256
delsub(struct nfa *nfa,
	   struct state *lp,		/* the sub-NFA goes from here... */
	   struct state *rp)		/* ...to here, *not* inclusive */
1257 1258 1259
{
	assert(lp != rp);

Bruce Momjian's avatar
Bruce Momjian committed
1260
	rp->tmp = rp;				/* mark end */
1261 1262

	deltraverse(nfa, lp, lp);
1263 1264
	if (NISERR())
		return;					/* asserts might not hold after failure */
1265
	assert(lp->nouts == 0 && rp->nins == 0);	/* did the job */
Bruce Momjian's avatar
Bruce Momjian committed
1266
	assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1267

Bruce Momjian's avatar
Bruce Momjian committed
1268 1269
	rp->tmp = NULL;				/* unmark end */
	lp->tmp = NULL;				/* and begin, marked by deltraverse */
1270 1271 1272 1273 1274 1275 1276
}

/*
 * deltraverse - the recursive heart of delsub
 * This routine's basic job is to destroy all out-arcs of the state.
 */
static void
1277 1278 1279
deltraverse(struct nfa *nfa,
			struct state *leftend,
			struct state *s)
1280 1281 1282 1283
{
	struct arc *a;
	struct state *to;

1284 1285 1286 1287 1288 1289 1290
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

1291
	if (s->nouts == 0)
Bruce Momjian's avatar
Bruce Momjian committed
1292
		return;					/* nothing to do */
1293
	if (s->tmp != NULL)
Bruce Momjian's avatar
Bruce Momjian committed
1294
		return;					/* already in progress */
1295

Bruce Momjian's avatar
Bruce Momjian committed
1296
	s->tmp = s;					/* mark as in progress */
1297

Bruce Momjian's avatar
Bruce Momjian committed
1298 1299
	while ((a = s->outs) != NULL)
	{
1300 1301
		to = a->to;
		deltraverse(nfa, leftend, to);
1302 1303
		if (NISERR())
			return;				/* asserts might not hold after failure */
1304 1305
		assert(to->nouts == 0 || to->tmp != NULL);
		freearc(nfa, a);
Bruce Momjian's avatar
Bruce Momjian committed
1306 1307
		if (to->nins == 0 && to->tmp == NULL)
		{
1308 1309 1310 1311 1312
			assert(to->nouts == 0);
			freestate(nfa, to);
		}
	}

Bruce Momjian's avatar
Bruce Momjian committed
1313
	assert(s->no != FREESTATE); /* we're still here */
Tom Lane's avatar
Tom Lane committed
1314
	assert(s == leftend || s->nins != 0);	/* and still reachable */
1315 1316
	assert(s->nouts == 0);		/* but have no outarcs */

Bruce Momjian's avatar
Bruce Momjian committed
1317
	s->tmp = NULL;				/* we're done here */
1318 1319 1320 1321 1322 1323 1324 1325 1326 1327
}

/*
 * dupnfa - duplicate sub-NFA
 *
 * Another recursive traversal, this time using tmp to point to duplicates
 * as well as mark already-seen states.  (You knew there was a reason why
 * it's a state pointer, didn't you? :-))
 */
static void
1328 1329 1330 1331 1332
dupnfa(struct nfa *nfa,
	   struct state *start,		/* duplicate of subNFA starting here */
	   struct state *stop,		/* and stopping here */
	   struct state *from,		/* stringing duplicate from here */
	   struct state *to)		/* to here */
1333
{
Bruce Momjian's avatar
Bruce Momjian committed
1334 1335
	if (start == stop)
	{
1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351
		newarc(nfa, EMPTY, 0, from, to);
		return;
	}

	stop->tmp = to;
	duptraverse(nfa, start, from);
	/* done, except for clearing out the tmp pointers */

	stop->tmp = NULL;
	cleartraverse(nfa, start);
}

/*
 * duptraverse - recursive heart of dupnfa
 */
static void
1352 1353 1354
duptraverse(struct nfa *nfa,
			struct state *s,
			struct state *stmp) /* s's duplicate, or NULL */
1355 1356 1357
{
	struct arc *a;

1358 1359 1360 1361 1362 1363 1364
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

1365
	if (s->tmp != NULL)
Bruce Momjian's avatar
Bruce Momjian committed
1366
		return;					/* already done */
1367 1368

	s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
Bruce Momjian's avatar
Bruce Momjian committed
1369 1370
	if (s->tmp == NULL)
	{
1371 1372 1373 1374
		assert(NISERR());
		return;
	}

Bruce Momjian's avatar
Bruce Momjian committed
1375 1376 1377
	for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
	{
		duptraverse(nfa, a->to, (struct state *) NULL);
1378 1379
		if (NISERR())
			break;
1380 1381 1382 1383 1384
		assert(a->to->tmp != NULL);
		cparc(nfa, a, s->tmp, a->to->tmp);
	}
}

1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455
/*
 * removeconstraints - remove any constraints in an NFA
 *
 * Constraint arcs are replaced by empty arcs, essentially treating all
 * constraints as automatically satisfied.
 */
static void
removeconstraints(struct nfa *nfa,
				  struct state *start,	/* process subNFA starting here */
				  struct state *stop)	/* and stopping here */
{
	if (start == stop)
		return;

	stop->tmp = stop;
	removetraverse(nfa, start);
	/* done, except for clearing out the tmp pointers */

	stop->tmp = NULL;
	cleartraverse(nfa, start);
}

/*
 * removetraverse - recursive heart of removeconstraints
 */
static void
removetraverse(struct nfa *nfa,
			   struct state *s)
{
	struct arc *a;
	struct arc *oa;

	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

	if (s->tmp != NULL)
		return;					/* already done */

	s->tmp = s;
	for (a = s->outs; a != NULL && !NISERR(); a = oa)
	{
		removetraverse(nfa, a->to);
		if (NISERR())
			break;
		oa = a->outchain;
		switch (a->type)
		{
			case PLAIN:
			case EMPTY:
				/* nothing to do */
				break;
			case AHEAD:
			case BEHIND:
			case '^':
			case '$':
			case LACON:
				/* replace it */
				newarc(nfa, EMPTY, 0, s, a->to);
				freearc(nfa, a);
				break;
			default:
				NERR(REG_ASSERT);
				break;
		}
	}
}

1456 1457 1458 1459
/*
 * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
 */
static void
1460 1461
cleartraverse(struct nfa *nfa,
			  struct state *s)
1462 1463 1464
{
	struct arc *a;

1465 1466 1467 1468 1469 1470 1471
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

1472 1473 1474 1475 1476 1477 1478 1479
	if (s->tmp == NULL)
		return;
	s->tmp = NULL;

	for (a = s->outs; a != NULL; a = a->outchain)
		cleartraverse(nfa, a->to);
}

1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496
/*
 * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
 *
 * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
 * of a set of colors), return a state whose outarc list contains only PLAIN
 * arcs of those color(s).  Otherwise return NULL.
 *
 * This is used before optimizing the NFA, so there may be EMPTY arcs, which
 * we should ignore; the possibility of an EMPTY is why the result state could
 * be different from s1.
 *
 * It's worth troubling to handle multiple parallel PLAIN arcs here because a
 * bracket construct such as [abc] might yield either one or several parallel
 * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
 * that implementation detail not create user-visible performance differences.
 */
static struct state *
1497
single_color_transition(struct state *s1, struct state *s2)
1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522
{
	struct arc *a;

	/* Ignore leading EMPTY arc, if any */
	if (s1->nouts == 1 && s1->outs->type == EMPTY)
		s1 = s1->outs->to;
	/* Likewise for any trailing EMPTY arc */
	if (s2->nins == 1 && s2->ins->type == EMPTY)
		s2 = s2->ins->from;
	/* Perhaps we could have a single-state loop in between, if so reject */
	if (s1 == s2)
		return NULL;
	/* s1 must have at least one outarc... */
	if (s1->outs == NULL)
		return NULL;
	/* ... and they must all be PLAIN arcs to s2 */
	for (a = s1->outs; a != NULL; a = a->outchain)
	{
		if (a->type != PLAIN || a->to != s2)
			return NULL;
	}
	/* OK, return s1 as the possessor of the relevant outarcs */
	return s1;
}

1523 1524 1525 1526
/*
 * specialcolors - fill in special colors for an NFA
 */
static void
1527
specialcolors(struct nfa *nfa)
1528 1529
{
	/* false colors for BOS, BOL, EOS, EOL */
Bruce Momjian's avatar
Bruce Momjian committed
1530 1531
	if (nfa->parent == NULL)
	{
1532 1533 1534 1535
		nfa->bos[0] = pseudocolor(nfa->cm);
		nfa->bos[1] = pseudocolor(nfa->cm);
		nfa->eos[0] = pseudocolor(nfa->cm);
		nfa->eos[1] = pseudocolor(nfa->cm);
Bruce Momjian's avatar
Bruce Momjian committed
1536 1537 1538
	}
	else
	{
1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551
		assert(nfa->parent->bos[0] != COLORLESS);
		nfa->bos[0] = nfa->parent->bos[0];
		assert(nfa->parent->bos[1] != COLORLESS);
		nfa->bos[1] = nfa->parent->bos[1];
		assert(nfa->parent->eos[0] != COLORLESS);
		nfa->eos[0] = nfa->parent->eos[0];
		assert(nfa->parent->eos[1] != COLORLESS);
		nfa->eos[1] = nfa->parent->eos[1];
	}
}

/*
 * optimize - optimize an NFA
1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563
 *
 * The main goal of this function is not so much "optimization" (though it
 * does try to get rid of useless NFA states) as reducing the NFA to a form
 * the regex executor can handle.  The executor, and indeed the cNFA format
 * that is its input, can only handle PLAIN and LACON arcs.  The output of
 * the regex parser also includes EMPTY (do-nothing) arcs, as well as
 * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
 * We first get rid of EMPTY arcs and then deal with the constraint arcs.
 * The hardest part of either job is to get rid of circular loops of the
 * target arc type.  We would have to do that in any case, though, as such a
 * loop would otherwise allow the executor to cycle through the loop endlessly
 * without making any progress in the input string.
1564
 */
Bruce Momjian's avatar
Bruce Momjian committed
1565
static long						/* re_info bits */
1566
optimize(struct nfa *nfa,
1567 1568 1569
		 FILE *f)				/* for debug output; NULL none */
{
#ifdef REG_DEBUG
Bruce Momjian's avatar
Bruce Momjian committed
1570
	int			verbose = (f != NULL) ? 1 : 0;
1571 1572 1573 1574

	if (verbose)
		fprintf(f, "\ninitial cleanup:\n");
#endif
Bruce Momjian's avatar
Bruce Momjian committed
1575
	cleanup(nfa);				/* may simplify situation */
1576 1577 1578 1579 1580 1581
#ifdef REG_DEBUG
	if (verbose)
		dumpnfa(nfa, f);
	if (verbose)
		fprintf(f, "\nempties:\n");
#endif
Bruce Momjian's avatar
Bruce Momjian committed
1582
	fixempties(nfa, f);			/* get rid of EMPTY arcs */
1583 1584 1585 1586
#ifdef REG_DEBUG
	if (verbose)
		fprintf(f, "\nconstraints:\n");
#endif
1587
	fixconstraintloops(nfa, f); /* get rid of constraint loops */
Bruce Momjian's avatar
Bruce Momjian committed
1588 1589
	pullback(nfa, f);			/* pull back constraints backward */
	pushfwd(nfa, f);			/* push fwd constraints forward */
1590 1591 1592 1593
#ifdef REG_DEBUG
	if (verbose)
		fprintf(f, "\nfinal cleanup:\n");
#endif
Bruce Momjian's avatar
Bruce Momjian committed
1594
	cleanup(nfa);				/* final tidying */
1595 1596 1597 1598
#ifdef REG_DEBUG
	if (verbose)
		dumpnfa(nfa, f);
#endif
Bruce Momjian's avatar
Bruce Momjian committed
1599
	return analyze(nfa);		/* and analysis */
1600 1601 1602
}

/*
1603
 * pullback - pull back constraints backward to eliminate them
1604 1605
 */
static void
1606
pullback(struct nfa *nfa,
Bruce Momjian's avatar
Bruce Momjian committed
1607
		 FILE *f)				/* for debug output; NULL none */
1608 1609 1610 1611 1612
{
	struct state *s;
	struct state *nexts;
	struct arc *a;
	struct arc *nexta;
1613
	struct state *intermediates;
Bruce Momjian's avatar
Bruce Momjian committed
1614
	int			progress;
1615 1616

	/* find and pull until there are no more */
Bruce Momjian's avatar
Bruce Momjian committed
1617 1618
	do
	{
1619
		progress = 0;
Bruce Momjian's avatar
Bruce Momjian committed
1620 1621
		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
		{
1622
			nexts = s->next;
1623
			intermediates = NULL;
Bruce Momjian's avatar
Bruce Momjian committed
1624 1625
			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
			{
1626 1627
				nexta = a->outchain;
				if (a->type == '^' || a->type == BEHIND)
1628
					if (pull(nfa, a, &intermediates))
1629 1630
						progress = 1;
			}
1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641
			/* clear tmp fields of intermediate states created here */
			while (intermediates != NULL)
			{
				struct state *ns = intermediates->tmp;

				intermediates->tmp = NULL;
				intermediates = ns;
			}
			/* if s is now useless, get rid of it */
			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
				dropstate(nfa, s);
1642 1643 1644 1645 1646 1647 1648
		}
		if (progress && f != NULL)
			dumpnfa(nfa, f);
	} while (progress && !NISERR());
	if (NISERR())
		return;

1649 1650 1651 1652 1653 1654
	/*
	 * Any ^ constraints we were able to pull to the start state can now be
	 * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
	 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
	 * that here (compact() will fail if so).
	 */
Bruce Momjian's avatar
Bruce Momjian committed
1655 1656
	for (a = nfa->pre->outs; a != NULL; a = nexta)
	{
1657
		nexta = a->outchain;
Bruce Momjian's avatar
Bruce Momjian committed
1658 1659
		if (a->type == '^')
		{
1660 1661 1662 1663 1664 1665 1666 1667 1668
			assert(a->co == 0 || a->co == 1);
			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
			freearc(nfa, a);
		}
	}
}

/*
 * pull - pull a back constraint backward past its source state
1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683
 *
 * Returns 1 if successful (which it always is unless the source is the
 * start state or we have an internal error), 0 if nothing happened.
 *
 * A significant property of this function is that it deletes no pre-existing
 * states, and no outarcs of the constraint's from state other than the given
 * constraint arc.  This makes the loops in pullback() safe, at the cost that
 * we may leave useless states behind.  Therefore, we leave it to pullback()
 * to delete such states.
 *
 * If the from state has multiple back-constraint outarcs, and/or multiple
 * compatible constraint inarcs, we only need to create one new intermediate
 * state per combination of predecessor and successor states.  *intermediates
 * points to a list of such intermediate states for this from state (chained
 * through their tmp fields).
1684
 */
1685
static int
1686 1687 1688
pull(struct nfa *nfa,
	 struct arc *con,
	 struct state **intermediates)
1689 1690 1691 1692 1693 1694 1695
{
	struct state *from = con->from;
	struct state *to = con->to;
	struct arc *a;
	struct arc *nexta;
	struct state *s;

1696
	assert(from != to);			/* should have gotten rid of this earlier */
Bruce Momjian's avatar
Bruce Momjian committed
1697
	if (from->flag)				/* can't pull back beyond start */
1698
		return 0;
Bruce Momjian's avatar
Bruce Momjian committed
1699 1700
	if (from->nins == 0)
	{							/* unreachable */
1701 1702 1703 1704
		freearc(nfa, con);
		return 1;
	}

1705 1706 1707 1708 1709
	/*
	 * First, clone from state if necessary to avoid other outarcs.  This may
	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
	 * clone state again at the bottom.
	 */
Bruce Momjian's avatar
Bruce Momjian committed
1710 1711
	if (from->nouts > 1)
	{
1712 1713 1714
		s = newstate(nfa);
		if (NISERR())
			return 0;
1715
		copyins(nfa, from, s);	/* duplicate inarcs */
Bruce Momjian's avatar
Bruce Momjian committed
1716
		cparc(nfa, con, s, to); /* move constraint arc */
1717
		freearc(nfa, con);
1718 1719
		if (NISERR())
			return 0;
1720 1721 1722 1723 1724 1725
		from = s;
		con = from->outs;
	}
	assert(from->nouts == 1);

	/* propagate the constraint into the from state's inarcs */
1726
	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
Bruce Momjian's avatar
Bruce Momjian committed
1727
	{
1728
		nexta = a->inchain;
1729
		switch (combine(nfa, con, a))
Bruce Momjian's avatar
Bruce Momjian committed
1730 1731 1732 1733 1734 1735 1736
		{
			case INCOMPATIBLE:	/* destroy the arc */
				freearc(nfa, a);
				break;
			case SATISFIED:		/* no action needed */
				break;
			case COMPATIBLE:	/* swap the two arcs, more or less */
1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
				/* need an intermediate state, but might have one already */
				for (s = *intermediates; s != NULL; s = s->tmp)
				{
					assert(s->nins > 0 && s->nouts > 0);
					if (s->ins->from == a->from && s->outs->to == to)
						break;
				}
				if (s == NULL)
				{
					s = newstate(nfa);
					if (NISERR())
						return 0;
					s->tmp = *intermediates;
					*intermediates = s;
				}
Bruce Momjian's avatar
Bruce Momjian committed
1752
				cparc(nfa, con, a->from, s);
1753
				cparc(nfa, a, s, to);
Bruce Momjian's avatar
Bruce Momjian committed
1754 1755
				freearc(nfa, a);
				break;
1756 1757 1758 1759
			case REPLACEARC:	/* replace arc's color */
				newarc(nfa, a->type, con->co, a->from, to);
				freearc(nfa, a);
				break;
Bruce Momjian's avatar
Bruce Momjian committed
1760 1761 1762
			default:
				assert(NOTREACHED);
				break;
1763 1764 1765 1766 1767
		}
	}

	/* remaining inarcs, if any, incorporate the constraint */
	moveins(nfa, from, to);
1768 1769
	freearc(nfa, con);
	/* from state is now useless, but we leave it to pullback() to clean up */
1770 1771 1772 1773
	return 1;
}

/*
1774
 * pushfwd - push forward constraints forward to eliminate them
1775 1776
 */
static void
1777
pushfwd(struct nfa *nfa,
Bruce Momjian's avatar
Bruce Momjian committed
1778
		FILE *f)				/* for debug output; NULL none */
1779 1780 1781 1782 1783
{
	struct state *s;
	struct state *nexts;
	struct arc *a;
	struct arc *nexta;
1784
	struct state *intermediates;
Bruce Momjian's avatar
Bruce Momjian committed
1785
	int			progress;
1786 1787

	/* find and push until there are no more */
Bruce Momjian's avatar
Bruce Momjian committed
1788 1789
	do
	{
1790
		progress = 0;
Bruce Momjian's avatar
Bruce Momjian committed
1791 1792
		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
		{
1793
			nexts = s->next;
1794
			intermediates = NULL;
Bruce Momjian's avatar
Bruce Momjian committed
1795 1796
			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
			{
1797 1798
				nexta = a->inchain;
				if (a->type == '$' || a->type == AHEAD)
1799
					if (push(nfa, a, &intermediates))
1800 1801
						progress = 1;
			}
1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
			/* clear tmp fields of intermediate states created here */
			while (intermediates != NULL)
			{
				struct state *ns = intermediates->tmp;

				intermediates->tmp = NULL;
				intermediates = ns;
			}
			/* if s is now useless, get rid of it */
			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
				dropstate(nfa, s);
1813 1814 1815 1816 1817 1818 1819
		}
		if (progress && f != NULL)
			dumpnfa(nfa, f);
	} while (progress && !NISERR());
	if (NISERR())
		return;

1820 1821 1822 1823 1824 1825
	/*
	 * Any $ constraints we were able to push to the post state can now be
	 * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
	 * be no other $ or AHEAD arcs left in the NFA, though we do not check
	 * that here (compact() will fail if so).
	 */
Bruce Momjian's avatar
Bruce Momjian committed
1826 1827
	for (a = nfa->post->ins; a != NULL; a = nexta)
	{
1828
		nexta = a->inchain;
Bruce Momjian's avatar
Bruce Momjian committed
1829 1830
		if (a->type == '$')
		{
1831 1832 1833 1834 1835 1836 1837 1838 1839
			assert(a->co == 0 || a->co == 1);
			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
			freearc(nfa, a);
		}
	}
}

/*
 * push - push a forward constraint forward past its destination state
1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854
 *
 * Returns 1 if successful (which it always is unless the destination is the
 * post state or we have an internal error), 0 if nothing happened.
 *
 * A significant property of this function is that it deletes no pre-existing
 * states, and no inarcs of the constraint's to state other than the given
 * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
 * we may leave useless states behind.  Therefore, we leave it to pushfwd()
 * to delete such states.
 *
 * If the to state has multiple forward-constraint inarcs, and/or multiple
 * compatible constraint outarcs, we only need to create one new intermediate
 * state per combination of predecessor and successor states.  *intermediates
 * points to a list of such intermediate states for this to state (chained
 * through their tmp fields).
1855
 */
1856
static int
1857 1858 1859
push(struct nfa *nfa,
	 struct arc *con,
	 struct state **intermediates)
1860 1861 1862 1863 1864 1865 1866
{
	struct state *from = con->from;
	struct state *to = con->to;
	struct arc *a;
	struct arc *nexta;
	struct state *s;

1867
	assert(to != from);			/* should have gotten rid of this earlier */
Bruce Momjian's avatar
Bruce Momjian committed
1868
	if (to->flag)				/* can't push forward beyond end */
1869
		return 0;
Bruce Momjian's avatar
Bruce Momjian committed
1870 1871
	if (to->nouts == 0)
	{							/* dead end */
1872 1873 1874 1875
		freearc(nfa, con);
		return 1;
	}

1876 1877 1878 1879 1880
	/*
	 * First, clone to state if necessary to avoid other inarcs.  This may
	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
	 * clone state again at the bottom.
	 */
Bruce Momjian's avatar
Bruce Momjian committed
1881 1882
	if (to->nins > 1)
	{
1883 1884 1885
		s = newstate(nfa);
		if (NISERR())
			return 0;
1886
		copyouts(nfa, to, s);	/* duplicate outarcs */
Tom Lane's avatar
Tom Lane committed
1887
		cparc(nfa, con, from, s);	/* move constraint arc */
1888
		freearc(nfa, con);
1889 1890
		if (NISERR())
			return 0;
1891 1892 1893 1894 1895 1896
		to = s;
		con = to->ins;
	}
	assert(to->nins == 1);

	/* propagate the constraint into the to state's outarcs */
1897
	for (a = to->outs; a != NULL && !NISERR(); a = nexta)
Bruce Momjian's avatar
Bruce Momjian committed
1898
	{
1899
		nexta = a->outchain;
1900
		switch (combine(nfa, con, a))
Bruce Momjian's avatar
Bruce Momjian committed
1901 1902 1903 1904 1905 1906 1907
		{
			case INCOMPATIBLE:	/* destroy the arc */
				freearc(nfa, a);
				break;
			case SATISFIED:		/* no action needed */
				break;
			case COMPATIBLE:	/* swap the two arcs, more or less */
1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923
				/* need an intermediate state, but might have one already */
				for (s = *intermediates; s != NULL; s = s->tmp)
				{
					assert(s->nins > 0 && s->nouts > 0);
					if (s->ins->from == from && s->outs->to == a->to)
						break;
				}
				if (s == NULL)
				{
					s = newstate(nfa);
					if (NISERR())
						return 0;
					s->tmp = *intermediates;
					*intermediates = s;
				}
				cparc(nfa, con, s, a->to);
Bruce Momjian's avatar
Bruce Momjian committed
1924 1925 1926
				cparc(nfa, a, from, s);
				freearc(nfa, a);
				break;
1927 1928 1929 1930
			case REPLACEARC:	/* replace arc's color */
				newarc(nfa, a->type, con->co, from, a->to);
				freearc(nfa, a);
				break;
Bruce Momjian's avatar
Bruce Momjian committed
1931 1932 1933
			default:
				assert(NOTREACHED);
				break;
1934 1935 1936 1937 1938
		}
	}

	/* remaining outarcs, if any, incorporate the constraint */
	moveouts(nfa, to, from);
1939 1940
	freearc(nfa, con);
	/* to state is now useless, but we leave it to pushfwd() to clean up */
1941 1942 1943 1944 1945 1946
	return 1;
}

/*
 * combine - constraint lands on an arc, what happens?
 *
Bruce Momjian's avatar
Bruce Momjian committed
1947 1948 1949
 * #def INCOMPATIBLE	1	// destroys arc
 * #def SATISFIED		2	// constraint satisfied
 * #def COMPATIBLE		3	// compatible but not satisfied yet
1950
 * #def REPLACEARC		4	// replace arc's color with constraint color
1951 1952
 */
static int
1953 1954
combine(struct nfa *nfa,
		struct arc *con,
1955
		struct arc *a)
1956
{
Bruce Momjian's avatar
Bruce Momjian committed
1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968
#define  CA(ct,at)	 (((ct)<<CHAR_BIT) | (at))

	switch (CA(con->type, a->type))
	{
		case CA('^', PLAIN):	/* newlines are handled separately */
		case CA('$', PLAIN):
			return INCOMPATIBLE;
			break;
		case CA(AHEAD, PLAIN):	/* color constraints meet colors */
		case CA(BEHIND, PLAIN):
			if (con->co == a->co)
				return SATISFIED;
1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982
			if (con->co == RAINBOW)
			{
				/* con is satisfied unless arc's color is a pseudocolor */
				if (!(nfa->cm->cd[a->co].flags & PSEUDO))
					return SATISFIED;
			}
			else if (a->co == RAINBOW)
			{
				/* con is incompatible if it's for a pseudocolor */
				if (nfa->cm->cd[con->co].flags & PSEUDO)
					return INCOMPATIBLE;
				/* otherwise, constraint constrains arc to be only its color */
				return REPLACEARC;
			}
Bruce Momjian's avatar
Bruce Momjian committed
1983 1984 1985 1986
			return INCOMPATIBLE;
			break;
		case CA('^', '^'):		/* collision, similar constraints */
		case CA('$', '$'):
1987 1988 1989 1990 1991
			if (con->co == a->co)	/* true duplication */
				return SATISFIED;
			return INCOMPATIBLE;
			break;
		case CA(AHEAD, AHEAD):	/* collision, similar constraints */
Bruce Momjian's avatar
Bruce Momjian committed
1992
		case CA(BEHIND, BEHIND):
Tom Lane's avatar
Tom Lane committed
1993
			if (con->co == a->co)	/* true duplication */
Bruce Momjian's avatar
Bruce Momjian committed
1994
				return SATISFIED;
1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008
			if (con->co == RAINBOW)
			{
				/* con is satisfied unless arc's color is a pseudocolor */
				if (!(nfa->cm->cd[a->co].flags & PSEUDO))
					return SATISFIED;
			}
			else if (a->co == RAINBOW)
			{
				/* con is incompatible if it's for a pseudocolor */
				if (nfa->cm->cd[con->co].flags & PSEUDO)
					return INCOMPATIBLE;
				/* otherwise, constraint constrains arc to be only its color */
				return REPLACEARC;
			}
Bruce Momjian's avatar
Bruce Momjian committed
2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030
			return INCOMPATIBLE;
			break;
		case CA('^', BEHIND):	/* collision, dissimilar constraints */
		case CA(BEHIND, '^'):
		case CA('$', AHEAD):
		case CA(AHEAD, '$'):
			return INCOMPATIBLE;
			break;
		case CA('^', '$'):		/* constraints passing each other */
		case CA('^', AHEAD):
		case CA(BEHIND, '$'):
		case CA(BEHIND, AHEAD):
		case CA('$', '^'):
		case CA('$', BEHIND):
		case CA(AHEAD, '^'):
		case CA(AHEAD, BEHIND):
		case CA('^', LACON):
		case CA(BEHIND, LACON):
		case CA('$', LACON):
		case CA(AHEAD, LACON):
			return COMPATIBLE;
			break;
2031 2032 2033 2034 2035 2036 2037 2038 2039
	}
	assert(NOTREACHED);
	return INCOMPATIBLE;		/* for benefit of blind compilers */
}

/*
 * fixempties - get rid of EMPTY arcs
 */
static void
2040
fixempties(struct nfa *nfa,
Bruce Momjian's avatar
Bruce Momjian committed
2041
		   FILE *f)				/* for debug output; NULL none */
2042 2043
{
	struct state *s;
2044
	struct state *s2;
2045 2046 2047
	struct state *nexts;
	struct arc *a;
	struct arc *nexta;
2048 2049 2050 2051 2052 2053
	int			totalinarcs;
	struct arc **inarcsorig;
	struct arc **arcarray;
	int			arccount;
	int			prevnins;
	int			nskip;
2054

2055 2056 2057 2058 2059 2060
	/*
	 * First, get rid of any states whose sole out-arc is an EMPTY, since
	 * they're basically just aliases for their successor.  The parsing
	 * algorithm creates enough of these that it's worth special-casing this.
	 */
	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
Bruce Momjian's avatar
Bruce Momjian committed
2061
	{
2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093
		nexts = s->next;
		if (s->flag || s->nouts != 1)
			continue;
		a = s->outs;
		assert(a != NULL && a->outchain == NULL);
		if (a->type != EMPTY)
			continue;
		if (s != a->to)
			moveins(nfa, s, a->to);
		dropstate(nfa, s);
	}

	/*
	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
	 * it into its predecessor.
	 */
	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
	{
		nexts = s->next;
		/* while we're at it, ensure tmp fields are clear for next step */
		assert(s->tmp == NULL);
		if (s->flag || s->nins != 1)
			continue;
		a = s->ins;
		assert(a != NULL && a->inchain == NULL);
		if (a->type != EMPTY)
			continue;
		if (s != a->from)
			moveouts(nfa, s, a->from);
		dropstate(nfa, s);
	}

2094 2095 2096
	if (NISERR())
		return;

2097
	/*
2098 2099
	 * For each remaining NFA state, find all other states from which it is
	 * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
2100 2101
	 * that eliminate the need for each such chain.
	 *
2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
	 * We could replace a chain of EMPTY arcs that leads from a "from" state
	 * to a "to" state either by pushing non-EMPTY arcs forward (linking
	 * directly from "from"'s predecessors to "to") or by pulling them back
	 * (linking directly from "from" to "to"'s successors).  We choose to
	 * always do the former; this choice is somewhat arbitrary, but the
	 * approach below requires that we uniformly do one or the other.
	 *
	 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
	 * approach the size of the NFA).  All of the intermediate states must
	 * have additional inarcs and outarcs, else they'd have been removed by
	 * the steps above.  Assuming their inarcs are mostly not empties, we will
	 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
	 * state in the chain must be duplicated to lead to all its successor
	 * states as well.  So there is no hope of doing less than O(N^2) work;
	 * however, we should endeavor to keep the big-O cost from being even
	 * worse than that, which it can easily become without care.  In
	 * particular, suppose we were to copy all S1's inarcs forward to S2, and
	 * then also to S3, and then later we consider pushing S2's inarcs forward
	 * to S3.  If we include the arcs already copied from S1 in that, we'd be
	 * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
	 * and its cohorts would get rid of the extra arcs, but not without cost.)
	 *
	 * We can avoid this cost by treating only arcs that existed at the start
	 * of this phase as candidates to be pushed forward.  To identify those,
	 * we remember the first inarc each state had to start with.  We rely on
	 * the fact that newarc() and friends put new arcs on the front of their
	 * to-states' inchains, and that this phase never deletes arcs, so that
	 * the original arcs must be the last arcs in their to-states' inchains.
	 *
	 * So the process here is that, for each state in the NFA, we gather up
	 * all non-EMPTY inarcs of states that can reach the target state via
	 * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
	 * target state's inchain.  (We can safely use sort-merge for this as long
	 * as we update each state's original-arcs pointer after we add arcs to
	 * it; the sort step of mergeins probably changed the order of the old
	 * arcs.)
	 *
	 * Another refinement worth making is that, because we only add non-EMPTY
	 * arcs during this phase, and all added arcs have the same from-state as
	 * the non-EMPTY arc they were cloned from, we know ahead of time that any
	 * states having only EMPTY outarcs will be useless for lack of outarcs
	 * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
	 * they had none to start with.)  So we need not bother to update the
	 * inchains of such states at all.
	 */

	/* Remember the states' first original inarcs */
	/* ... and while at it, count how many old inarcs there are altogether */
	inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
	if (inarcsorig == NULL)
	{
		NERR(REG_ESPACE);
		return;
	}
	totalinarcs = 0;
	for (s = nfa->states; s != NULL; s = s->next)
	{
		inarcsorig[s->no] = s->ins;
		totalinarcs += s->nins;
	}

	/*
	 * Create a workspace for accumulating the inarcs to be added to the
	 * current target state.  totalinarcs is probably a considerable
	 * overestimate of the space needed, but the NFA is unlikely to be large
	 * enough at this point to make it worth being smarter.
2168
	 */
2169 2170 2171 2172 2173 2174 2175 2176 2177
	arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
	if (arcarray == NULL)
	{
		NERR(REG_ESPACE);
		FREE(inarcsorig);
		return;
	}

	/* And iterate over the target states */
2178 2179
	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
	{
2180 2181 2182 2183 2184 2185 2186
		/* Ignore target states without non-EMPTY outarcs, per note above */
		if (!s->flag && !hasnonemptyout(s))
			continue;

		/* Find predecessor states and accumulate their original inarcs */
		arccount = 0;
		for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
Bruce Momjian's avatar
Bruce Momjian committed
2187
		{
2188 2189 2190 2191 2192 2193
			/* Add s2's original inarcs to arcarray[], but ignore empties */
			for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
			{
				if (a->type != EMPTY)
					arcarray[arccount++] = a;
			}
2194 2195 2196 2197

			/* Reset the tmp fields as we walk back */
			nexts = s2->tmp;
			s2->tmp = NULL;
2198
		}
2199
		s->tmp = NULL;
2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213
		assert(arccount <= totalinarcs);

		/* Remember how many original inarcs this state has */
		prevnins = s->nins;

		/* Add non-duplicate inarcs to target state */
		mergeins(nfa, s, arcarray, arccount);

		/* Now we must update the state's inarcsorig pointer */
		nskip = s->nins - prevnins;
		a = s->ins;
		while (nskip-- > 0)
			a = a->inchain;
		inarcsorig[s->no] = a;
2214 2215
	}

2216 2217 2218
	FREE(arcarray);
	FREE(inarcsorig);

2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235
	if (NISERR())
		return;

	/*
	 * Now remove all the EMPTY arcs, since we don't need them anymore.
	 */
	for (s = nfa->states; s != NULL; s = s->next)
	{
		for (a = s->outs; a != NULL; a = nexta)
		{
			nexta = a->outchain;
			if (a->type == EMPTY)
				freearc(nfa, a);
		}
	}

	/*
Bruce Momjian's avatar
Bruce Momjian committed
2236
	 * And remove any states that have become useless.  (This cleanup is not
2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248
	 * very thorough, and would be even less so if we tried to combine it with
	 * the previous step; but cleanup() will take care of anything we miss.)
	 */
	for (s = nfa->states; s != NULL; s = nexts)
	{
		nexts = s->next;
		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
			dropstate(nfa, s);
	}

	if (f != NULL)
		dumpnfa(nfa, f);
2249 2250 2251
}

/*
2252
 * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2253 2254 2255 2256
 *
 * The return value is the last such state found.  Its tmp field links back
 * to the next-to-last such state, and so on back to s, so that all these
 * states can be located without searching the whole NFA.
2257
 *
2258 2259 2260 2261
 * Since this is only used in fixempties(), we pass in the inarcsorig[] array
 * maintained by that function.  This lets us skip over all new inarcs, which
 * are certainly not EMPTY arcs.
 *
2262 2263
 * The maximum recursion depth here is equal to the length of the longest
 * loop-free chain of EMPTY arcs, which is surely no more than the size of
2264
 * the NFA ... but that could still be enough to cause trouble.
2265
 */
2266
static struct state *
2267 2268 2269 2270
emptyreachable(struct nfa *nfa,
			   struct state *s,
			   struct state *lastfound,
			   struct arc **inarcsorig)
2271
{
2272
	struct arc *a;
2273

2274 2275 2276 2277 2278 2279 2280
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return lastfound;
	}

2281 2282
	s->tmp = lastfound;
	lastfound = s;
2283
	for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2284
	{
2285 2286
		if (a->type == EMPTY && a->from->tmp == NULL)
			lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2287
	}
2288 2289 2290
	return lastfound;
}

2291 2292 2293 2294
/*
 * isconstraintarc - detect whether an arc is of a constraint type
 */
static inline int
2295
isconstraintarc(struct arc *a)
2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312
{
	switch (a->type)
	{
		case '^':
		case '$':
		case BEHIND:
		case AHEAD:
		case LACON:
			return 1;
	}
	return 0;
}

/*
 * hasconstraintout - does state have a constraint out arc?
 */
static int
2313
hasconstraintout(struct state *s)
2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333
{
	struct arc *a;

	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (isconstraintarc(a))
			return 1;
	}
	return 0;
}

/*
 * fixconstraintloops - get rid of loops containing only constraint arcs
 *
 * A loop of states that contains only constraint arcs is useless, since
 * passing around the loop represents no forward progress.  Moreover, it
 * would cause infinite looping in pullback/pushfwd, so we need to get rid
 * of such loops before doing that.
 */
static void
2334
fixconstraintloops(struct nfa *nfa,
2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432
				   FILE *f)		/* for debug output; NULL none */
{
	struct state *s;
	struct state *nexts;
	struct arc *a;
	struct arc *nexta;
	int			hasconstraints;

	/*
	 * In the trivial case of a state that loops to itself, we can just drop
	 * the constraint arc altogether.  This is worth special-casing because
	 * such loops are far more common than loops containing multiple states.
	 * While we're at it, note whether any constraint arcs survive.
	 */
	hasconstraints = 0;
	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
	{
		nexts = s->next;
		/* while we're at it, ensure tmp fields are clear for next step */
		assert(s->tmp == NULL);
		for (a = s->outs; a != NULL && !NISERR(); a = nexta)
		{
			nexta = a->outchain;
			if (isconstraintarc(a))
			{
				if (a->to == s)
					freearc(nfa, a);
				else
					hasconstraints = 1;
			}
		}
		/* If we removed all the outarcs, the state is useless. */
		if (s->nouts == 0 && !s->flag)
			dropstate(nfa, s);
	}

	/* Nothing to do if no remaining constraint arcs */
	if (NISERR() || !hasconstraints)
		return;

	/*
	 * Starting from each remaining NFA state, search outwards for a
	 * constraint loop.  If we find a loop, break the loop, then start the
	 * search over.  (We could possibly retain some state from the first scan,
	 * but it would complicate things greatly, and multi-state constraint
	 * loops are rare enough that it's not worth optimizing the case.)
	 */
restart:
	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
	{
		if (findconstraintloop(nfa, s))
			goto restart;
	}

	if (NISERR())
		return;

	/*
	 * Now remove any states that have become useless.  (This cleanup is not
	 * very thorough, and would be even less so if we tried to combine it with
	 * the previous step; but cleanup() will take care of anything we miss.)
	 *
	 * Because findconstraintloop intentionally doesn't reset all tmp fields,
	 * we have to clear them after it's done.  This is a convenient place to
	 * do that, too.
	 */
	for (s = nfa->states; s != NULL; s = nexts)
	{
		nexts = s->next;
		s->tmp = NULL;
		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
			dropstate(nfa, s);
	}

	if (f != NULL)
		dumpnfa(nfa, f);
}

/*
 * findconstraintloop - recursively find a loop of constraint arcs
 *
 * If we find a loop, break it by calling breakconstraintloop(), then
 * return 1; otherwise return 0.
 *
 * State tmp fields are guaranteed all NULL on a success return, because
 * breakconstraintloop does that.  After a failure return, any state that
 * is known not to be part of a loop is marked with s->tmp == s; this allows
 * us not to have to re-prove that fact on later calls.  (This convention is
 * workable because we already eliminated single-state loops.)
 *
 * Note that the found loop doesn't necessarily include the first state we
 * are called on.  Any loop reachable from that state will do.
 *
 * The maximum recursion depth here is one more than the length of the longest
 * loop-free chain of constraint arcs, which is surely no more than the size
 * of the NFA ... but that could still be enough to cause trouble.
 */
static int
2433
findconstraintloop(struct nfa *nfa, struct state *s)
2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521
{
	struct arc *a;

	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return 1;				/* to exit as quickly as possible */
	}

	if (s->tmp != NULL)
	{
		/* Already proven uninteresting? */
		if (s->tmp == s)
			return 0;
		/* Found a loop involving s */
		breakconstraintloop(nfa, s);
		/* The tmp fields have been cleaned up by breakconstraintloop */
		return 1;
	}
	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (isconstraintarc(a))
		{
			struct state *sto = a->to;

			assert(sto != s);
			s->tmp = sto;
			if (findconstraintloop(nfa, sto))
				return 1;
		}
	}

	/*
	 * If we get here, no constraint loop exists leading out from s.  Mark it
	 * with s->tmp == s so we need not rediscover that fact again later.
	 */
	s->tmp = s;
	return 0;
}

/*
 * breakconstraintloop - break a loop of constraint arcs
 *
 * sinitial is any one member state of the loop.  Each loop member's tmp
 * field links to its successor within the loop.  (Note that this function
 * will reset all the tmp fields to NULL.)
 *
 * We can break the loop by, for any one state S1 in the loop, cloning its
 * loop successor state S2 (and possibly following states), and then moving
 * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
 * copy any non-constraint outarcs of S2.  Constraint outarcs should be
 * dropped if they point back to S1, else they need to be copied as arcs to
 * similarly cloned states S3, S4, etc.  In general, each cloned state copies
 * non-constraint outarcs, drops constraint outarcs that would lead to itself
 * or any earlier cloned state, and sends other constraint outarcs to newly
 * cloned states.  No cloned state will have any inarcs that aren't constraint
 * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
 * constraint back-arcs since they would not take us to any state we've not
 * already been in; therefore, no new constraint loop is created.  In this way
 * we generate a modified NFA that can still represent every useful state
 * sequence, but not sequences that represent state loops with no consumption
 * of input data.  Note that the set of cloned states will certainly include
 * all of the loop member states other than S1, and it may also include
 * non-loop states that are reachable from S2 via constraint arcs.  This is
 * important because there is no guarantee that findconstraintloop found a
 * maximal loop (and searching for one would be NP-hard, so don't try).
 * Frequently the "non-loop states" are actually part of a larger loop that
 * we didn't notice, and indeed there may be several overlapping loops.
 * This technique ensures convergence in such cases, while considering only
 * the originally-found loop does not.
 *
 * If there is only one S1->S2 constraint arc, then that constraint is
 * certainly satisfied when we enter any of the clone states.  This means that
 * in the common case where many of the constraint arcs are identically
 * labeled, we can merge together clone states linked by a similarly-labeled
 * constraint: if we can get to the first one we can certainly get to the
 * second, so there's no need to distinguish.  This greatly reduces the number
 * of new states needed, so we preferentially break the given loop at a state
 * pair where this is true.
 *
 * Furthermore, it's fairly common to find that a cloned successor state has
 * no outarcs, especially if we're a bit aggressive about removing unnecessary
 * outarcs.  If that happens, then there is simply not any interesting state
 * that can be reached through the predecessor's loop arcs, which means we can
 * break the loop just by removing those loop arcs, with no new states added.
 */
static void
2522
breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667
{
	struct state *s;
	struct state *shead;
	struct state *stail;
	struct state *sclone;
	struct state *nexts;
	struct arc *refarc;
	struct arc *a;
	struct arc *nexta;

	/*
	 * Start by identifying which loop step we want to break at.
	 * Preferentially this is one with only one constraint arc.  (XXX are
	 * there any other secondary heuristics we want to use here?)  Set refarc
	 * to point to the selected lone constraint arc, if there is one.
	 */
	refarc = NULL;
	s = sinitial;
	do
	{
		nexts = s->tmp;
		assert(nexts != s);		/* should not see any one-element loops */
		if (refarc == NULL)
		{
			int			narcs = 0;

			for (a = s->outs; a != NULL; a = a->outchain)
			{
				if (a->to == nexts && isconstraintarc(a))
				{
					refarc = a;
					narcs++;
				}
			}
			assert(narcs > 0);
			if (narcs > 1)
				refarc = NULL;	/* multiple constraint arcs here, no good */
		}
		s = nexts;
	} while (s != sinitial);

	if (refarc)
	{
		/* break at the refarc */
		shead = refarc->from;
		stail = refarc->to;
		assert(stail == shead->tmp);
	}
	else
	{
		/* for lack of a better idea, break after sinitial */
		shead = sinitial;
		stail = sinitial->tmp;
	}

	/*
	 * Reset the tmp fields so that we can use them for local storage in
	 * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
	 * going to abandon its search anyway.)
	 */
	for (s = nfa->states; s != NULL; s = s->next)
		s->tmp = NULL;

	/*
	 * Recursively build clone state(s) as needed.
	 */
	sclone = newstate(nfa);
	if (sclone == NULL)
	{
		assert(NISERR());
		return;
	}

	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
						 NULL, NULL, nfa->nstates);

	if (NISERR())
		return;

	/*
	 * It's possible that sclone has no outarcs at all, in which case it's
	 * useless.  (We don't try extremely hard to get rid of useless states
	 * here, but this is an easy and fairly common case.)
	 */
	if (sclone->nouts == 0)
	{
		freestate(nfa, sclone);
		sclone = NULL;
	}

	/*
	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
	 * if we discovered we don't need sclone.
	 */
	for (a = shead->outs; a != NULL; a = nexta)
	{
		nexta = a->outchain;
		if (a->to == stail && isconstraintarc(a))
		{
			if (sclone)
				cparc(nfa, a, shead, sclone);
			freearc(nfa, a);
			if (NISERR())
				break;
		}
	}
}

/*
 * clonesuccessorstates - create a tree of constraint-arc successor states
 *
 * ssource is the state to be cloned, and sclone is the state to copy its
 * outarcs into.  sclone's inarcs, if any, should already be set up.
 *
 * spredecessor is the original predecessor state that we are trying to build
 * successors for (it may not be the immediate predecessor of ssource).
 * refarc, if not NULL, is the original constraint arc that is known to have
 * been traversed out of spredecessor to reach the successor(s).
 *
 * For each cloned successor state, we transiently create a "donemap" that is
 * a boolean array showing which source states we've already visited for this
 * clone state.  This prevents infinite recursion as well as useless repeat
 * visits to the same state subtree (which can add up fast, since typical NFAs
 * have multiple redundant arc pathways).  Each donemap is a char array
 * indexed by state number.  The donemaps are all of the same size "nstates",
 * which is nfa->nstates as of the start of the recursion.  This is enough to
 * have entries for all pre-existing states, but *not* entries for clone
 * states created during the recursion.  That's okay since we have no need to
 * mark those.
 *
 * curdonemap is NULL when recursing to a new sclone state, or sclone's
 * donemap when we are recursing without having created a new state (which we
 * do when we decide we can merge a successor state into the current clone
 * state).  outerdonemap is NULL at the top level and otherwise the parent
 * clone state's donemap.
 *
 * The successor states we create and fill here form a strict tree structure,
 * with each state having exactly one predecessor, except that the toplevel
 * state has no inarcs as yet (breakconstraintloop will add its inarcs from
 * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
 * to the root, plus refarc if any, to identify the set of constraints already
 * known valid at the current point.  This allows us to avoid generating extra
 * successor states.
 */
static void
2668 2669 2670 2671 2672
clonesuccessorstates(struct nfa *nfa,
					 struct state *ssource,
					 struct state *sclone,
					 struct state *spredecessor,
					 struct arc *refarc,
2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895
					 char *curdonemap,
					 char *outerdonemap,
					 int nstates)
{
	char	   *donemap;
	struct arc *a;

	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

	/* If this state hasn't already got a donemap, create one */
	donemap = curdonemap;
	if (donemap == NULL)
	{
		donemap = (char *) MALLOC(nstates * sizeof(char));
		if (donemap == NULL)
		{
			NERR(REG_ESPACE);
			return;
		}

		if (outerdonemap != NULL)
		{
			/*
			 * Not at outermost recursion level, so copy the outer level's
			 * donemap; this ensures that we see states in process of being
			 * visited at outer levels, or already merged into predecessor
			 * states, as ones we shouldn't traverse back to.
			 */
			memcpy(donemap, outerdonemap, nstates * sizeof(char));
		}
		else
		{
			/* At outermost level, only spredecessor is off-limits */
			memset(donemap, 0, nstates * sizeof(char));
			assert(spredecessor->no < nstates);
			donemap[spredecessor->no] = 1;
		}
	}

	/* Mark ssource as visited in the donemap */
	assert(ssource->no < nstates);
	assert(donemap[ssource->no] == 0);
	donemap[ssource->no] = 1;

	/*
	 * We proceed by first cloning all of ssource's outarcs, creating new
	 * clone states as needed but not doing more with them than that.  Then in
	 * a second pass, recurse to process the child clone states.  This allows
	 * us to have only one child clone state per reachable source state, even
	 * when there are multiple outarcs leading to the same state.  Also, when
	 * we do visit a child state, its set of inarcs is known exactly, which
	 * makes it safe to apply the constraint-is-already-checked optimization.
	 * Also, this ensures that we've merged all the states we can into the
	 * current clone before we recurse to any children, thus possibly saving
	 * them from making extra images of those states.
	 *
	 * While this function runs, child clone states of the current state are
	 * marked by setting their tmp fields to point to the original state they
	 * were cloned from.  This makes it possible to detect multiple outarcs
	 * leading to the same state, and also makes it easy to distinguish clone
	 * states from original states (which will have tmp == NULL).
	 */
	for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
	{
		struct state *sto = a->to;

		/*
		 * We do not consider cloning successor states that have no constraint
		 * outarcs; just link to them as-is.  They cannot be part of a
		 * constraint loop so there is no need to make copies.  In particular,
		 * this rule keeps us from trying to clone the post state, which would
		 * be a bad idea.
		 */
		if (isconstraintarc(a) && hasconstraintout(sto))
		{
			struct state *prevclone;
			int			canmerge;
			struct arc *a2;

			/*
			 * Back-link constraint arcs must not be followed.  Nor is there a
			 * need to revisit states previously merged into this clone.
			 */
			assert(sto->no < nstates);
			if (donemap[sto->no] != 0)
				continue;

			/*
			 * Check whether we already have a child clone state for this
			 * source state.
			 */
			prevclone = NULL;
			for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
			{
				if (a2->to->tmp == sto)
				{
					prevclone = a2->to;
					break;
				}
			}

			/*
			 * If this arc is labeled the same as refarc, or the same as any
			 * arc we must have traversed to get to sclone, then no additional
			 * constraints need to be met to get to sto, so we should just
			 * merge its outarcs into sclone.
			 */
			if (refarc && a->type == refarc->type && a->co == refarc->co)
				canmerge = 1;
			else
			{
				struct state *s;

				canmerge = 0;
				for (s = sclone; s->ins; s = s->ins->from)
				{
					if (s->nins == 1 &&
						a->type == s->ins->type && a->co == s->ins->co)
					{
						canmerge = 1;
						break;
					}
				}
			}

			if (canmerge)
			{
				/*
				 * We can merge into sclone.  If we previously made a child
				 * clone state, drop it; there's no need to visit it.  (This
				 * can happen if ssource has multiple pathways to sto, and we
				 * only just now found one that is provably a no-op.)
				 */
				if (prevclone)
					dropstate(nfa, prevclone);	/* kills our outarc, too */

				/* Recurse to merge sto's outarcs into sclone */
				clonesuccessorstates(nfa,
									 sto,
									 sclone,
									 spredecessor,
									 refarc,
									 donemap,
									 outerdonemap,
									 nstates);
				/* sto should now be marked as previously visited */
				assert(NISERR() || donemap[sto->no] == 1);
			}
			else if (prevclone)
			{
				/*
				 * We already have a clone state for this successor, so just
				 * make another arc to it.
				 */
				cparc(nfa, a, sclone, prevclone);
			}
			else
			{
				/*
				 * We need to create a new successor clone state.
				 */
				struct state *stoclone;

				stoclone = newstate(nfa);
				if (stoclone == NULL)
				{
					assert(NISERR());
					break;
				}
				/* Mark it as to what it's a clone of */
				stoclone->tmp = sto;
				/* ... and add the outarc leading to it */
				cparc(nfa, a, sclone, stoclone);
			}
		}
		else
		{
			/*
			 * Non-constraint outarcs just get copied to sclone, as do outarcs
			 * leading to states with no constraint outarc.
			 */
			cparc(nfa, a, sclone, sto);
		}
	}

	/*
	 * If we are at outer level for this clone state, recurse to all its child
	 * clone states, clearing their tmp fields as we go.  (If we're not
	 * outermost for sclone, leave this to be done by the outer call level.)
	 * Note that if we have multiple outarcs leading to the same clone state,
	 * it will only be recursed-to once.
	 */
	if (curdonemap == NULL)
	{
		for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
		{
			struct state *stoclone = a->to;
			struct state *sto = stoclone->tmp;

			if (sto != NULL)
			{
				stoclone->tmp = NULL;
				clonesuccessorstates(nfa,
									 sto,
									 stoclone,
									 spredecessor,
									 refarc,
									 NULL,
									 donemap,
									 nstates);
			}
		}

		/* Don't forget to free sclone's donemap when done with it */
		FREE(donemap);
	}
}

2896 2897 2898 2899
/*
 * cleanup - clean up NFA after optimizations
 */
static void
2900
cleanup(struct nfa *nfa)
2901 2902 2903
{
	struct state *s;
	struct state *nexts;
Bruce Momjian's avatar
Bruce Momjian committed
2904
	int			n;
2905

2906 2907 2908
	if (NISERR())
		return;

2909 2910
	/* clear out unreachable or dead-end states */
	/* use pre to mark reachable, then post to mark can-reach-post */
Bruce Momjian's avatar
Bruce Momjian committed
2911
	markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2912
	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2913
	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
Bruce Momjian's avatar
Bruce Momjian committed
2914
	{
2915 2916 2917 2918
		nexts = s->next;
		if (s->tmp != nfa->post && !s->flag)
			dropstate(nfa, s);
	}
2919
	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2920
	cleartraverse(nfa, nfa->pre);
2921
	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934
	/* the nins==0 (final unreachable) case will be caught later */

	/* renumber surviving states */
	n = 0;
	for (s = nfa->states; s != NULL; s = s->next)
		s->no = n++;
	nfa->nstates = n;
}

/*
 * markreachable - recursive marking of reachable states
 */
static void
2935 2936
markreachable(struct nfa *nfa,
			  struct state *s,
Tom Lane's avatar
Tom Lane committed
2937 2938
			  struct state *okay,	/* consider only states with this mark */
			  struct state *mark)	/* the value to mark with */
2939 2940 2941
{
	struct arc *a;

2942 2943 2944 2945 2946 2947 2948
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960
	if (s->tmp != okay)
		return;
	s->tmp = mark;

	for (a = s->outs; a != NULL; a = a->outchain)
		markreachable(nfa, a->to, okay, mark);
}

/*
 * markcanreach - recursive marking of states which can reach here
 */
static void
2961 2962 2963 2964
markcanreach(struct nfa *nfa,
			 struct state *s,
			 struct state *okay,	/* consider only states with this mark */
			 struct state *mark)	/* the value to mark with */
2965 2966 2967
{
	struct arc *a;

2968 2969 2970 2971 2972 2973 2974
	/* Since this is recursive, it could be driven to stack overflow */
	if (STACK_TOO_DEEP(nfa->v->re))
	{
		NERR(REG_ETOOBIG);
		return;
	}

2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985
	if (s->tmp != okay)
		return;
	s->tmp = mark;

	for (a = s->ins; a != NULL; a = a->inchain)
		markcanreach(nfa, a->from, okay, mark);
}

/*
 * analyze - ascertain potentially-useful facts about an optimized NFA
 */
Bruce Momjian's avatar
Bruce Momjian committed
2986
static long						/* re_info bits to be ORed in */
2987
analyze(struct nfa *nfa)
2988 2989 2990 2991
{
	struct arc *a;
	struct arc *aa;

2992 2993 2994
	if (NISERR())
		return 0;

2995
	/* Detect whether NFA can't match anything */
2996 2997
	if (nfa->pre->outs == NULL)
		return REG_UIMPOSSIBLE;
2998 2999 3000 3001 3002

	/* Detect whether NFA matches all strings (possibly with length bounds) */
	checkmatchall(nfa);

	/* Detect whether NFA can possibly match a zero-length string */
3003 3004 3005 3006 3007 3008 3009
	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
		for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
			if (aa->to == nfa->post)
				return REG_UEMPTYMATCH;
	return 0;
}

3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034
/*
 * checkmatchall - does the NFA represent no more than a string length test?
 *
 * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
 * to begin with) and set the MATCHALL bit in nfa->flags.
 *
 * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
 * for pseudocolors (i.e., BOS/BOL/EOS/EOL).  We must be able to reach the
 * post state via RAINBOW arcs, and if there are any loops in the graph, they
 * must be loop-to-self arcs, ensuring that each loop iteration consumes
 * exactly one character.  (Longer loops are problematic because they create
 * non-consecutive possible match lengths; we have no good way to represent
 * that situation for lengths beyond the DUPINF limit.)
 *
 * Pseudocolor arcs complicate things a little.  We know that they can only
 * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
 * EOS/EOL).  There, they must exactly replicate the parallel RAINBOW arcs,
 * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
 * and BOL outarcs to state 2, and no others.  Missing or extra pseudocolor
 * arcs can occur, meaning that the NFA involves some constraint on the
 * adjacent characters, which makes it not a matchall NFA.
 */
static void
checkmatchall(struct nfa *nfa)
{
3035 3036 3037
	bool	  **haspaths;
	struct state *s;
	int			i;
3038 3039

	/*
3040 3041 3042 3043 3044 3045 3046
	 * If there are too many states, don't bother trying to detect matchall.
	 * This limit serves to bound the time and memory we could consume below.
	 * Note that even if the graph is all-RAINBOW, if there are significantly
	 * more than DUPINF states then it's likely that there are paths of length
	 * more than DUPINF, which would force us to fail anyhow.  In practice,
	 * plausible ways of writing a matchall regex with maximum finite path
	 * length K tend not to have very many more than K states.
3047
	 */
3048 3049
	if (nfa->nstates > DUPINF * 2)
		return;
3050 3051

	/*
3052 3053 3054
	 * First, scan all the states to verify that only RAINBOW arcs appear,
	 * plus pseudocolor arcs adjacent to the pre and post states.  This lets
	 * us quickly eliminate most cases that aren't matchall NFAs.
3055
	 */
3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088
	for (s = nfa->states; s != NULL; s = s->next)
	{
		struct arc *a;

		for (a = s->outs; a != NULL; a = a->outchain)
		{
			if (a->type != PLAIN)
				return;			/* any LACONs make it non-matchall */
			if (a->co != RAINBOW)
			{
				if (nfa->cm->cd[a->co].flags & PSEUDO)
				{
					/*
					 * Pseudocolor arc: verify it's in a valid place (this
					 * seems quite unlikely to fail, but let's be sure).
					 */
					if (s == nfa->pre &&
						(a->co == nfa->bos[0] || a->co == nfa->bos[1]))
						 /* okay BOS/BOL arc */ ;
					else if (a->to == nfa->post &&
							 (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
						 /* okay EOS/EOL arc */ ;
					else
						return; /* unexpected pseudocolor arc */
					/* We'll check these arcs some more below. */
				}
				else
					return;		/* any other color makes it non-matchall */
			}
		}
		/* Also, assert that the tmp fields are available for use. */
		assert(s->tmp == NULL);
	}
3089 3090

	/*
3091 3092 3093 3094 3095
	 * The next cheapest check we can make is to verify that the BOS/BOL
	 * outarcs of the pre state reach the same states as its RAINBOW outarcs.
	 * If they don't, the NFA expresses some constraints on the character
	 * before the matched string, making it non-matchall.  Likewise, the
	 * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
3096 3097 3098
	 */
	if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
		!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
3099 3100
		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
3101 3102 3103
		return;

	/*
3104 3105 3106
	 * Initialize an array of path-length arrays, in which
	 * checkmatchall_recurse will return per-state results.  This lets us
	 * memo-ize the recursive search and avoid exponential time consumption.
3107
	 */
3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121
	haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
	if (haspaths == NULL)
		return;					/* fail quietly */
	memset(haspaths, 0, nfa->nstates * sizeof(bool *));

	/*
	 * Recursively search the graph for all-RAINBOW paths to the "post" state,
	 * starting at the "pre" state, and computing the lengths of the paths.
	 * (Given the preceding checks, there should be at least one such path.
	 * However we could get back a false result anyway, in case there are
	 * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
	 * failures such as ENOMEM.)
	 */
	if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
3122
	{
3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175
		/* The useful result is the path length array for the pre state */
		bool	   *haspath = haspaths[nfa->pre->no];
		int			minmatch,
					maxmatch,
					morematch;

		assert(haspath != NULL);

		/*
		 * haspath[] now represents the set of possible path lengths; but we
		 * want to reduce that to a min and max value, because it doesn't seem
		 * worth complicating regexec.c to deal with nonconsecutive possible
		 * match lengths.  Find min and max of first run of lengths, then
		 * verify there are no nonconsecutive lengths.
		 */
		for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
		{
			if (haspath[minmatch])
				break;
		}
		assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
		for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
		{
			if (!haspath[maxmatch + 1])
				break;
		}
		for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
		{
			if (haspath[morematch])
			{
				haspath = NULL; /* fail, there are nonconsecutive lengths */
				break;
			}
		}

		if (haspath != NULL)
		{
			/*
			 * Success, so record the info.  Here we have a fine point: the
			 * path length from the pre state includes the pre-to-initial
			 * transition, so it's one more than the actually matched string
			 * length.  (We avoided counting the final-to-post transition
			 * within checkmatchall_recurse, but not this one.)  This is why
			 * checkmatchall_recurse allows one more level of path length than
			 * might seem necessary.  This decrement also takes care of
			 * converting checkmatchall_recurse's definition of "infinity" as
			 * "DUPINF+1" to our normal representation as "DUPINF".
			 */
			assert(minmatch > 0);	/* else pre and post states were adjacent */
			nfa->minmatchall = minmatch - 1;
			nfa->maxmatchall = maxmatch - 1;
			nfa->flags |= MATCHALL;
		}
3176
	}
3177 3178 3179

	/* Clean up */
	for (i = 0; i < nfa->nstates; i++)
3180
	{
3181 3182
		if (haspaths[i] != NULL)
			FREE(haspaths[i]);
3183
	}
3184
	FREE(haspaths);
3185 3186 3187 3188 3189
}

/*
 * checkmatchall_recurse - recursive search for checkmatchall
 *
3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208
 * s is the state to be examined in this recursion level.
 * haspaths[] is an array of per-state exit path length arrays.
 *
 * We return true if the search was performed successfully, false if
 * we had to fail because of multi-state loops or other internal reasons.
 * (Because "dead" states that can't reach the post state have been
 * eliminated, and we already verified that only RAINBOW and matching
 * pseudocolor arcs exist, every state should have RAINBOW path(s) to
 * the post state.  Hence we take a false result from recursive calls
 * as meaning that we'd better fail altogether, not just that that
 * particular state can't reach the post state.)
 *
 * On success, we store a malloc'd result array in haspaths[s->no],
 * showing the possible path lengths from s to the post state.
 * Each state's haspath[] array is of length DUPINF+2.  The entries from
 * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
 * from this state to the string end.  haspath[DUPINF+1] is true if all
 * path lengths >= DUPINF+1 are possible.  (Situations that cannot be
 * represented under these rules cause failure.)
3209
 *
3210
 * checkmatchall is responsible for eventually freeing the haspath[] arrays.
3211 3212
 */
static bool
3213
checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
3214 3215
{
	bool		result = false;
3216 3217
	bool		foundloop = false;
	bool	   *haspath;
3218 3219 3220 3221 3222 3223 3224 3225 3226
	struct arc *a;

	/*
	 * Since this is recursive, it could be driven to stack overflow.  But we
	 * need not treat that as a hard failure; just deem the NFA non-matchall.
	 */
	if (STACK_TOO_DEEP(nfa->v->re))
		return false;

3227 3228
	/* In case the search takes a long time, check for cancel */
	if (CANCEL_REQUESTED(nfa->v->re))
3229
	{
3230 3231
		NERR(REG_CANCEL);
		return false;
3232 3233
	}

3234 3235 3236 3237 3238 3239 3240
	/* Create a haspath array for this state */
	haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
	if (haspath == NULL)
		return false;			/* again, treat as non-matchall */
	memset(haspath, 0, (DUPINF + 2) * sizeof(bool));

	/* Mark this state as being visited */
3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251
	assert(s->tmp == NULL);
	s->tmp = s;

	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (a->co != RAINBOW)
			continue;			/* ignore pseudocolor arcs */
		if (a->to == nfa->post)
		{
			/* We found an all-RAINBOW path to the post state */
			result = true;
3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277

			/*
			 * Mark this state as being zero steps away from the string end
			 * (the transition to the post state isn't counted).
			 */
			haspath[0] = true;
		}
		else if (a->to == s)
		{
			/* We found a cycle of length 1, which we'll deal with below. */
			foundloop = true;
		}
		else if (a->to->tmp != NULL)
		{
			/* It's busy, so we found a cycle of length > 1, so fail. */
			result = false;
			break;
		}
		else
		{
			/* Consider paths forward through this to-state. */
			bool	   *nexthaspath;
			int			i;

			/* If to-state was not already visited, recurse */
			if (haspaths[a->to->no] == NULL)
3278
			{
3279 3280 3281 3282
				result = checkmatchall_recurse(nfa, a->to, haspaths);
				/* Fail if any recursive path fails */
				if (!result)
					break;
3283
			}
3284
			else
3285
			{
3286 3287 3288 3289 3290 3291
				/* The previous visit must have found path(s) to the end */
				result = true;
			}
			assert(a->to->tmp == NULL);
			nexthaspath = haspaths[a->to->no];
			assert(nexthaspath != NULL);
3292

3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306
			/*
			 * Now, for every path of length i from a->to to the string end,
			 * there is a path of length i + 1 from s to the string end.
			 */
			if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
			{
				/*
				 * a->to has a path of length exactly DUPINF, but not longer;
				 * or it has paths of all lengths > DUPINF but not one of
				 * exactly that length.  In either case, we cannot represent
				 * the possible path lengths from s correctly, so fail.
				 */
				result = false;
				break;
3307
			}
3308 3309 3310 3311 3312
			/* Merge knowledge of these path lengths into what we have */
			for (i = 0; i < DUPINF; i++)
				haspath[i + 1] |= nexthaspath[i];
			/* Infinity + 1 is still infinity */
			haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
3313
		}
3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326
	}

	if (result && foundloop)
	{
		/*
		 * If there is a length-1 loop at this state, then find the shortest
		 * known path length to the end.  The loop means that every larger
		 * path length is possible, too.  (It doesn't matter whether any of
		 * the longer lengths were already known possible.)
		 */
		int			i;

		for (i = 0; i <= DUPINF; i++)
3327
		{
3328
			if (haspath[i])
3329 3330
				break;
		}
3331 3332
		for (i++; i <= DUPINF + 1; i++)
			haspath[i] = true;
3333 3334
	}

3335 3336 3337 3338 3339 3340
	/* Report out the completed path length map */
	assert(s->no < nfa->nstates);
	assert(haspaths[s->no] == NULL);
	haspaths[s->no] = haspath;

	/* Mark state no longer busy */
3341
	s->tmp = NULL;
3342

3343 3344 3345 3346 3347 3348
	return result;
}

/*
 * check_out_colors_match - subroutine for checkmatchall
 *
3349 3350 3351 3352
 * Check whether the set of states reachable from s by arcs of color co1
 * is equivalent to the set reachable by arcs of color co2.
 * checkmatchall already verified that all of the NFA's arcs are PLAIN,
 * so we need not examine arc types here.
3353 3354 3355 3356
 */
static bool
check_out_colors_match(struct state *s, color co1, color co2)
{
3357 3358
	bool		result = true;
	struct arc *a;
3359

3360 3361 3362 3363 3364 3365 3366 3367 3368
	/*
	 * To do this in linear time, we assume that the NFA contains no duplicate
	 * arcs.  Run through the out-arcs, marking states reachable by arcs of
	 * color co1.  Run through again, un-marking states reachable by arcs of
	 * color co2; if we see a not-marked state, we know this co2 arc is
	 * unmatched.  Then run through again, checking for still-marked states,
	 * and in any case leaving all the tmp fields reset to NULL.
	 */
	for (a = s->outs; a != NULL; a = a->outchain)
3369
	{
3370
		if (a->co == co1)
3371
		{
3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383
			assert(a->to->tmp == NULL);
			a->to->tmp = a->to;
		}
	}
	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (a->co == co2)
		{
			if (a->to->tmp != NULL)
				a->to->tmp = NULL;
			else
				result = false; /* unmatched co2 arc */
3384 3385
		}
	}
3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397
	for (a = s->outs; a != NULL; a = a->outchain)
	{
		if (a->co == co1)
		{
			if (a->to->tmp != NULL)
			{
				result = false; /* unmatched co1 arc */
				a->to->tmp = NULL;
			}
		}
	}
	return result;
3398 3399 3400 3401 3402
}

/*
 * check_in_colors_match - subroutine for checkmatchall
 *
3403 3404 3405 3406
 * Check whether the set of states that can reach s by arcs of color co1
 * is equivalent to the set that can reach s by arcs of color co2.
 * checkmatchall already verified that all of the NFA's arcs are PLAIN,
 * so we need not examine arc types here.
3407 3408 3409 3410
 */
static bool
check_in_colors_match(struct state *s, color co1, color co2)
{
3411 3412
	bool		result = true;
	struct arc *a;
3413

3414 3415 3416 3417 3418
	/*
	 * Identical algorithm to check_out_colors_match, except examine the
	 * from-states of s' inarcs.
	 */
	for (a = s->ins; a != NULL; a = a->inchain)
3419
	{
3420
		if (a->co == co1)
3421
		{
3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433
			assert(a->from->tmp == NULL);
			a->from->tmp = a->from;
		}
	}
	for (a = s->ins; a != NULL; a = a->inchain)
	{
		if (a->co == co2)
		{
			if (a->from->tmp != NULL)
				a->from->tmp = NULL;
			else
				result = false; /* unmatched co2 arc */
3434 3435
		}
	}
3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447
	for (a = s->ins; a != NULL; a = a->inchain)
	{
		if (a->co == co1)
		{
			if (a->from->tmp != NULL)
			{
				result = false; /* unmatched co1 arc */
				a->from->tmp = NULL;
			}
		}
	}
	return result;
3448 3449
}

3450
/*
3451
 * compact - construct the compact representation of an NFA
3452 3453
 */
static void
3454 3455
compact(struct nfa *nfa,
		struct cnfa *cnfa)
3456 3457 3458
{
	struct state *s;
	struct arc *a;
Bruce Momjian's avatar
Bruce Momjian committed
3459 3460
	size_t		nstates;
	size_t		narcs;
3461 3462 3463
	struct carc *ca;
	struct carc *first;

Bruce Momjian's avatar
Bruce Momjian committed
3464
	assert(!NISERR());
3465 3466 3467

	nstates = 0;
	narcs = 0;
Bruce Momjian's avatar
Bruce Momjian committed
3468 3469
	for (s = nfa->states; s != NULL; s = s->next)
	{
3470
		nstates++;
Bruce Momjian's avatar
Bruce Momjian committed
3471
		narcs += s->nouts + 1;	/* need one extra for endmarker */
3472 3473
	}

3474
	cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
Bruce Momjian's avatar
Bruce Momjian committed
3475 3476
	cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
	cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
3477
	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
Bruce Momjian's avatar
Bruce Momjian committed
3478
	{
3479 3480
		if (cnfa->stflags != NULL)
			FREE(cnfa->stflags);
3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495
		if (cnfa->states != NULL)
			FREE(cnfa->states);
		if (cnfa->arcs != NULL)
			FREE(cnfa->arcs);
		NERR(REG_ESPACE);
		return;
	}
	cnfa->nstates = nstates;
	cnfa->pre = nfa->pre->no;
	cnfa->post = nfa->post->no;
	cnfa->bos[0] = nfa->bos[0];
	cnfa->bos[1] = nfa->bos[1];
	cnfa->eos[0] = nfa->eos[0];
	cnfa->eos[1] = nfa->eos[1];
	cnfa->ncolors = maxcolor(nfa->cm) + 1;
3496 3497 3498
	cnfa->flags = nfa->flags;
	cnfa->minmatchall = nfa->minmatchall;
	cnfa->maxmatchall = nfa->maxmatchall;
3499 3500

	ca = cnfa->arcs;
Bruce Momjian's avatar
Bruce Momjian committed
3501 3502 3503
	for (s = nfa->states; s != NULL; s = s->next)
	{
		assert((size_t) s->no < nstates);
3504
		cnfa->stflags[s->no] = 0;
3505 3506 3507
		cnfa->states[s->no] = ca;
		first = ca;
		for (a = s->outs; a != NULL; a = a->outchain)
Bruce Momjian's avatar
Bruce Momjian committed
3508 3509 3510 3511 3512 3513 3514 3515 3516
			switch (a->type)
			{
				case PLAIN:
					ca->co = a->co;
					ca->to = a->to->no;
					ca++;
					break;
				case LACON:
					assert(s->no != cnfa->pre);
3517
					assert(a->co >= 0);
Bruce Momjian's avatar
Bruce Momjian committed
3518 3519 3520 3521 3522 3523
					ca->co = (color) (cnfa->ncolors + a->co);
					ca->to = a->to->no;
					ca++;
					cnfa->flags |= HASLACONS;
					break;
				default:
3524
					NERR(REG_ASSERT);
3525
					return;
3526
			}
3527
		carcsort(first, ca - first);
3528 3529 3530 3531 3532 3533 3534 3535 3536
		ca->co = COLORLESS;
		ca->to = 0;
		ca++;
	}
	assert(ca == &cnfa->arcs[narcs]);
	assert(cnfa->nstates != 0);

	/* mark no-progress states */
	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3537 3538
		cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
3539 3540 3541 3542 3543 3544
}

/*
 * carcsort - sort compacted-NFA arcs by color
 */
static void
3545
carcsort(struct carc *first, size_t n)
3546
{
3547 3548 3549
	if (n > 1)
		qsort(first, n, sizeof(struct carc), carc_cmp);
}
3550

3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565
static int
carc_cmp(const void *a, const void *b)
{
	const struct carc *aa = (const struct carc *) a;
	const struct carc *bb = (const struct carc *) b;

	if (aa->co < bb->co)
		return -1;
	if (aa->co > bb->co)
		return +1;
	if (aa->to < bb->to)
		return -1;
	if (aa->to > bb->to)
		return +1;
	return 0;
3566 3567 3568 3569 3570 3571
}

/*
 * freecnfa - free a compacted NFA
 */
static void
3572
freecnfa(struct cnfa *cnfa)
3573
{
3574
	assert(!NULLCNFA(*cnfa));	/* not empty already */
3575
	FREE(cnfa->stflags);
3576 3577
	FREE(cnfa->states);
	FREE(cnfa->arcs);
3578
	ZAPCNFA(*cnfa);
3579 3580 3581 3582 3583 3584
}

/*
 * dumpnfa - dump an NFA in human-readable form
 */
static void
3585
dumpnfa(struct nfa *nfa,
3586 3587 3588 3589
		FILE *f)
{
#ifdef REG_DEBUG
	struct state *s;
3590 3591
	int			nstates = 0;
	int			narcs = 0;
3592 3593 3594

	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
	if (nfa->bos[0] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3595
		fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3596
	if (nfa->bos[1] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3597
		fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3598
	if (nfa->eos[0] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3599
		fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3600
	if (nfa->eos[1] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3601
		fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3602 3603 3604
	if (nfa->flags & HASLACONS)
		fprintf(f, ", haslacons");
	if (nfa->flags & MATCHALL)
3605 3606 3607 3608 3609 3610 3611
	{
		fprintf(f, ", minmatchall %d", nfa->minmatchall);
		if (nfa->maxmatchall == DUPINF)
			fprintf(f, ", maxmatchall inf");
		else
			fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
	}
3612 3613
	fprintf(f, "\n");
	for (s = nfa->states; s != NULL; s = s->next)
3614
	{
3615
		dumpstate(s, f);
3616 3617 3618 3619
		nstates++;
		narcs += s->nouts;
	}
	fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3620 3621 3622 3623 3624 3625
	if (nfa->parent == NULL)
		dumpcolors(nfa->cm, f);
	fflush(f);
#endif
}

Bruce Momjian's avatar
Bruce Momjian committed
3626
#ifdef REG_DEBUG				/* subordinates of dumpnfa */
3627 3628 3629 3630 3631

/*
 * dumpstate - dump an NFA state in human-readable form
 */
static void
3632
dumpstate(struct state *s,
3633 3634 3635 3636 3637
		  FILE *f)
{
	struct arc *a;

	fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
Bruce Momjian's avatar
Bruce Momjian committed
3638
			(s->flag) ? s->flag : '.');
3639 3640 3641 3642 3643 3644
	if (s->prev != NULL && s->prev->next != s)
		fprintf(f, "\tstate chain bad\n");
	if (s->nouts == 0)
		fprintf(f, "\tno out arcs\n");
	else
		dumparcs(s, f);
Bruce Momjian's avatar
Bruce Momjian committed
3645 3646
	for (a = s->ins; a != NULL; a = a->inchain)
	{
3647 3648 3649 3650
		if (a->to != s)
			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
					a->from->no, a->to->no, s->no);
	}
3651
	fflush(f);
3652 3653 3654 3655 3656 3657
}

/*
 * dumparcs - dump out-arcs in human-readable form
 */
static void
3658
dumparcs(struct state *s,
3659 3660
		 FILE *f)
{
Bruce Momjian's avatar
Bruce Momjian committed
3661
	int			pos;
3662
	struct arc *a;
3663

3664 3665 3666 3667 3668 3669 3670
	/* printing oldest arcs first is usually clearer */
	a = s->outs;
	assert(a != NULL);
	while (a->outchain != NULL)
		a = a->outchain;
	pos = 1;
	do
Bruce Momjian's avatar
Bruce Momjian committed
3671
	{
3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682
		dumparc(a, s, f);
		if (pos == 5)
		{
			fprintf(f, "\n");
			pos = 1;
		}
		else
			pos++;
		a = a->outchainRev;
	} while (a != NULL);
	if (pos != 1)
3683 3684 3685 3686 3687 3688 3689
		fprintf(f, "\n");
}

/*
 * dumparc - dump one outarc in readable form, including prefixing tab
 */
static void
3690 3691
dumparc(struct arc *a,
		struct state *s,
3692 3693 3694 3695 3696
		FILE *f)
{
	struct arc *aa;

	fprintf(f, "\t");
Bruce Momjian's avatar
Bruce Momjian committed
3697 3698 3699
	switch (a->type)
	{
		case PLAIN:
3700 3701 3702 3703
			if (a->co == RAINBOW)
				fprintf(f, "[*]");
			else
				fprintf(f, "[%ld]", (long) a->co);
Bruce Momjian's avatar
Bruce Momjian committed
3704 3705
			break;
		case AHEAD:
3706 3707 3708 3709
			if (a->co == RAINBOW)
				fprintf(f, ">*>");
			else
				fprintf(f, ">%ld>", (long) a->co);
Bruce Momjian's avatar
Bruce Momjian committed
3710 3711
			break;
		case BEHIND:
3712 3713 3714 3715
			if (a->co == RAINBOW)
				fprintf(f, "<*<");
			else
				fprintf(f, "<%ld<", (long) a->co);
Bruce Momjian's avatar
Bruce Momjian committed
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728
			break;
		case LACON:
			fprintf(f, ":%ld:", (long) a->co);
			break;
		case '^':
		case '$':
			fprintf(f, "%c%d", a->type, (int) a->co);
			break;
		case EMPTY:
			break;
		default:
			fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
			break;
3729 3730 3731
	}
	if (a->from != s)
		fprintf(f, "?%d?", a->from->no);
3732 3733
	for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
		if (aa == a)
Bruce Momjian's avatar
Bruce Momjian committed
3734
			break;				/* NOTE BREAK OUT */
3735 3736
	if (aa == NULL)
		fprintf(f, "?!?");		/* missing from out-chain */
3737
	fprintf(f, "->");
Bruce Momjian's avatar
Bruce Momjian committed
3738 3739
	if (a->to == NULL)
	{
3740 3741 3742 3743 3744 3745
		fprintf(f, "NULL");
		return;
	}
	fprintf(f, "%d", a->to->no);
	for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
		if (aa == a)
Bruce Momjian's avatar
Bruce Momjian committed
3746
			break;				/* NOTE BREAK OUT */
3747
	if (aa == NULL)
Bruce Momjian's avatar
Bruce Momjian committed
3748
		fprintf(f, "?!?");		/* missing from in-chain */
3749
}
Tom Lane's avatar
Tom Lane committed
3750
#endif							/* REG_DEBUG */
3751 3752 3753 3754 3755 3756

/*
 * dumpcnfa - dump a compacted NFA in human-readable form
 */
#ifdef REG_DEBUG
static void
3757
dumpcnfa(struct cnfa *cnfa,
3758 3759
		 FILE *f)
{
Bruce Momjian's avatar
Bruce Momjian committed
3760
	int			st;
3761 3762 3763

	fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
	if (cnfa->bos[0] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3764
		fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3765
	if (cnfa->bos[1] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3766
		fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3767
	if (cnfa->eos[0] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3768
		fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3769
	if (cnfa->eos[1] != COLORLESS)
Bruce Momjian's avatar
Bruce Momjian committed
3770 3771
		fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
	if (cnfa->flags & HASLACONS)
3772
		fprintf(f, ", haslacons");
3773
	if (cnfa->flags & MATCHALL)
3774 3775 3776 3777 3778 3779 3780
	{
		fprintf(f, ", minmatchall %d", cnfa->minmatchall);
		if (cnfa->maxmatchall == DUPINF)
			fprintf(f, ", maxmatchall inf");
		else
			fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
	}
3781 3782
	fprintf(f, "\n");
	for (st = 0; st < cnfa->nstates; st++)
3783
		dumpcstate(st, cnfa, f);
3784 3785 3786 3787
	fflush(f);
}
#endif

Bruce Momjian's avatar
Bruce Momjian committed
3788
#ifdef REG_DEBUG				/* subordinates of dumpcnfa */
3789 3790 3791 3792 3793 3794

/*
 * dumpcstate - dump a compacted-NFA state in human-readable form
 */
static void
dumpcstate(int st,
3795
		   struct cnfa *cnfa,
3796 3797
		   FILE *f)
{
Bruce Momjian's avatar
Bruce Momjian committed
3798
	struct carc *ca;
Bruce Momjian's avatar
Bruce Momjian committed
3799
	int			pos;
3800

3801
	fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3802
	pos = 1;
3803
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
Bruce Momjian's avatar
Bruce Momjian committed
3804
	{
3805 3806 3807
		if (ca->co == RAINBOW)
			fprintf(f, "\t[*]->%d", ca->to);
		else if (ca->co < cnfa->ncolors)
3808
			fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3809
		else
3810
			fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
Bruce Momjian's avatar
Bruce Momjian committed
3811 3812
		if (pos == 5)
		{
3813 3814
			fprintf(f, "\n");
			pos = 1;
Bruce Momjian's avatar
Bruce Momjian committed
3815 3816
		}
		else
3817 3818
			pos++;
	}
3819
	if (ca == cnfa->states[st] || pos != 1)
3820 3821 3822 3823
		fprintf(f, "\n");
	fflush(f);
}

Tom Lane's avatar
Tom Lane committed
3824
#endif							/* REG_DEBUG */