Cleanup for some problems in tsearch patch:

- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files
- bunch of comments added, typos fixed, and other cleanup

Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstoplist too.

Heikki Linnakangas

Cleanup for some problems in tsearch patch:
- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files
- bunch of comments added, typos fixed, and other cleanup

Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstoplist too.

Heikki Linnakangas
7351b5fa · Tom Lane · b918bf86 · 7351b5fa · 7351b5fa · 7351b5fa
Commit 7351b5fa authored Aug 25, 2007 by Tom Lane
14 changed files
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -6,7 +6,7 @@
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -192,7 +192,6 @@ dsnowball_init(PG_FUNCTION_ARGS)
 	ListCell   *l;

 	d = (DictSnowball *) palloc0(sizeof(DictSnowball));
-	d->stoplist.wordop = recode_and_lowerstr;

 	foreach(l, dictoptions)
 	{
@@ -204,8 +203,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist);
-			sortstoplist(&d->stoplist);
+			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 			stoploaded = true;
 		}
 		else if (pg_strcasecmp("Language", defel->defname) == 0)

--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -39,7 +39,6 @@ dispell_init(PG_FUNCTION_ARGS)
 	ListCell   *l;

 	d = (DictISpell *) palloc0(sizeof(DictISpell));
-	d->stoplist.wordop = recode_and_lowerstr;

 	foreach(l, dictoptions)
 	{
@@ -73,8 +72,7 @@ dispell_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &(d->stoplist));
-			sortstoplist(&(d->stoplist));
+			readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
 			stoploaded = true;
 		}
 		else

--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -23,19 +23,17 @@
 typedef struct
 {
 	StopList	stoplist;
-} DictExample;
+} DictSimple;


 Datum
 dsimple_init(PG_FUNCTION_ARGS)
 {
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
-	DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
+	DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
 	bool		stoploaded = false;
 	ListCell   *l;

-	d->stoplist.wordop = recode_and_lowerstr;
-
 	foreach(l, dictoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);
@@ -46,8 +44,7 @@ dsimple_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist);
-			sortstoplist(&d->stoplist);
+			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 			stoploaded = true;
 		}
 		else
@@ -65,16 +62,16 @@ dsimple_init(PG_FUNCTION_ARGS)
 Datum
 dsimple_lexize(PG_FUNCTION_ARGS)
 {
-	DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
+	DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32	   len = PG_GETARG_INT32(2);
-	char	   *txt = lowerstr_with_len(in, len);
+	char	   *txt;
 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

+	txt = lowerstr_with_len(in, len);
+
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
-	{
 		pfree(txt);
-	}
 	else
 		res[0].lexeme = txt;


--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.2 2007/08/22 04:13:15 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -20,9 +20,6 @@
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"

-
-#define SYNBUFLEN	4096
-
 typedef struct
 {
 	char	   *in;
@@ -31,23 +28,34 @@ typedef struct

 typedef struct
 {
-	int			len;
+	int			len;	/* length of syn array */
 	Syn		   *syn;
 } DictSyn;

+/*
+ * Finds the next whitespace-delimited word within the 'in' string.
+ * Returns a pointer to the first character of the word, and a pointer
+ * to the next byte after the last character in the word (in *end).
+ */
 static char *
 findwrd(char *in, char **end)
 {
 	char	   *start;

-	*end = NULL;
+	/* Skip leading spaces */
 	while (*in && t_isspace(in))
 		in += pg_mblen(in);

+	/* Return NULL on empty lines */
 	if (*in == '\0')
+	{
+		*end = NULL;
 		return NULL;
+	}
+
 	start = in;

+	/* Find end of word */
 	while (*in && !t_isspace(in))
 		in += pg_mblen(in);

@@ -70,12 +78,11 @@ dsynonym_init(PG_FUNCTION_ARGS)
 	ListCell   *l;
 	char	   *filename = NULL;
 	FILE	   *fin;
-	char		buf[SYNBUFLEN];
 	char	   *starti,
 			   *starto,
 			   *end = NULL;
 	int			cur = 0;
-	int			slen;
+	char	   *line = NULL;

 	foreach(l, dictoptions)
 	{
@@ -105,10 +112,33 @@ dsynonym_init(PG_FUNCTION_ARGS)

 	d = (DictSyn *) palloc0(sizeof(DictSyn));

-	while (fgets(buf, SYNBUFLEN, fin))
+	while ((line = t_readline(fin)) != NULL)
 	{
-		slen = strlen(buf);
-		pg_verifymbstr(buf, slen, false);
+		starti = findwrd(line, &end);
+		if (!starti)
+		{
+			/* Empty line */
+			goto skipline;
+		}
+		*end = '\0';
+		if (end >= line + strlen(line))
+		{
+			/* A line with only one word. Ignore silently. */
+			goto skipline;
+		}
+
+		starto = findwrd(end + 1, &end);
+		if (!starto)
+		{
+			/* A line with only one word. Ignore silently. */
+			goto skipline;
+		}
+		*end = '\0';
+
+		/* starti now points to the first word, and starto to the second
+		 * word on the line, with a \0 terminator at the end of both words.
+		 */
+
 		if (cur == d->len)
 		{
 			if (d->len == 0)
@@ -123,36 +153,19 @@ dsynonym_init(PG_FUNCTION_ARGS)
 			}
 		}

-		starti = findwrd(buf, &end);
-		if (!starti)
-			continue;
-		*end = '\0';
-		if (end >= buf + slen)
-			continue;
-
-		starto = findwrd(end + 1, &end);
-		if (!starto)
-			continue;
-		*end = '\0';
-
-		d->syn[cur].in = recode_and_lowerstr(starti);
-		d->syn[cur].out = recode_and_lowerstr(starto);
-		if (!(d->syn[cur].in && d->syn[cur].out))
-		{
-			FreeFile(fin);
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-		}
+		d->syn[cur].in = lowerstr(starti);
+		d->syn[cur].out = lowerstr(starto);

 		cur++;
+
+	skipline:
+		pfree(line);
 	}

 	FreeFile(fin);

 	d->len = cur;
-	if (cur > 1)
-		qsort(d->syn, d->len, sizeof(Syn), compareSyn);
+	qsort(d->syn, d->len, sizeof(Syn), compareSyn);

 	PG_RETURN_POINTER(d);
 }
@@ -179,8 +192,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
 	if (!found)
 		PG_RETURN_POINTER(NULL);

-	res = palloc(sizeof(TSLexeme) * 2);
-	memset(res, 0, sizeof(TSLexeme) * 2);
+	res = palloc0(sizeof(TSLexeme) * 2);
 	res[0].lexeme = pstrdup(found->out);

 	PG_RETURN_POINTER(res);

--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -170,10 +170,10 @@ static void
 thesaurusRead(char *filename, DictThesaurus * d)
 {
 	FILE	   *fh;
-	char		str[BUFSIZ];
 	int			lineno = 0;
 	uint16		idsubst = 0;
 	bool		useasis = false;
+	char	   *line;

 	filename = get_tsearch_config_filename(filename, "ths");
 	fh = AllocateFile(filename, "r");
@@ -183,27 +183,28 @@ thesaurusRead(char *filename, DictThesaurus * d)
 				 errmsg("could not open thesaurus file \"%s\": %m",
 						filename)));

-	while (fgets(str, sizeof(str), fh))
+	while ((line = t_readline(fh)) != NULL)
 	{
-		char	   *ptr,
-				   *recoded;
+		char	   *ptr;
 		int			state = TR_WAITLEX;
 		char	   *beginwrd = NULL;
 		uint16		posinsubst = 0;
 		uint16		nwrd = 0;

-		ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-											 GetDatabaseEncoding(), PG_UTF8);
-		if (recoded == NULL)
-			elog(ERROR, "encoding conversion failed");
-
 		lineno++;

-		/* is it comment ? */
-		while (t_isspace(ptr))
+		ptr = line;
+
+		/* is it a comment? */
+		while (*ptr && t_isspace(ptr))
 			ptr += pg_mblen(ptr);
-		if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
+
+		if (t_iseq(ptr, '#') || *ptr == '\0' ||
+			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+		{
+			pfree(line);
 			continue;
+		}

 		while (*ptr)
 		{
@@ -301,8 +302,7 @@ thesaurusRead(char *filename, DictThesaurus * d)
 							lineno, filename)));
 		}

-		if (recoded != str)
-			pfree(recoded);
+		pfree(line);
 	}

 	d->nsubst = idsubst;

--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -125,28 +125,47 @@ _t_isprint(const char *ptr)
 }
 #endif   /* TS_USE_WIDE */

+
 /*
- * Convert C-string from UTF8 to server encoding and
- * lower it
+ * Read the next line from a tsearch data file (expected to be in UTF-8), and
+ * convert it to database encoding if needed. The returned string is palloc'd.
+ * NULL return means EOF.
 */
 char *
-recode_and_lowerstr(char *str)
+t_readline(FILE *fp)
 {
-	char	   *recoded;
-	char	   *ret;
+	int len;
+	char *recoded;
+	char buf[4096];		/* lines must not be longer than this */
+	
+	if (fgets(buf, sizeof(buf), fp) == NULL)
+		return NULL;

-	recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-											 PG_UTF8, GetDatabaseEncoding());
+	len = strlen(buf);

-	if (recoded == NULL)
-		elog(ERROR, "encoding conversion failed");
+	/* Make sure the input is valid UTF-8 */
+	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);

-	ret = lowerstr(recoded);
+	/* And convert */
+	recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
+												 len,
+												 PG_UTF8,
+												 GetDatabaseEncoding());

-	if (recoded != str)
-		pfree(recoded);
+	if (recoded == NULL)		/* should not happen */
+		elog(ERROR, "encoding conversion failed");
+
+	if (recoded == buf)
+	{
+		/*
+		 * conversion didn't pstrdup, so we must.
+		 * We can use the length of the original string, because
+		 * no conversion was done.
+		 */
+		recoded = pnstrdup(recoded, len);
+	}

-	return ret;
+	return recoded;
 }

 char *
@@ -155,6 +174,9 @@ lowerstr(char *str)
 	return lowerstr_with_len(str, strlen(str));
 }

+/*
+ * Returned string is palloc'd
+ */
 char *
 lowerstr_with_len(char *str, int len)
 {

--- a/src/backend/tsearch/ts_parse.c
+++ b/src/backend/tsearch/ts_parse.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -308,7 +308,7 @@ LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
 			{
 				/*
 				 * Dictionary normalizes lexemes, so we remove from stack all
-				 * used lexemes , return to basic mode and redo end of stack
+				 * used lexemes, return to basic mode and redo end of stack
 				 * (if it exists)
 				 */
 				if (res)
@@ -427,14 +427,14 @@ parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
 * Headline framework
 */
 static void
-hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
+hladdword(HeadlineParsedText * prs, char *buf, int4 buflen, int type)
 {
 	while (prs->curwords >= prs->lenwords)
 	{
 		prs->lenwords *= 2;
-		prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
 	}
-	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord));
+	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
 	prs->words[prs->curwords].type = (uint8) type;
 	prs->words[prs->curwords].len = buflen;
 	prs->words[prs->curwords].word = palloc(buflen);
@@ -443,16 +443,16 @@ hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
 }

 static void
-hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
+hlfinditem(HeadlineParsedText * prs, TSQuery query, char *buf, int buflen)
 {
 	int			i;
 	QueryItem  *item = GETQUERY(query);
-	HeadlineWord *word;
+	HeadlineWordEntry *word;

 	while (prs->curwords + query->size >= prs->lenwords)
 	{
 		prs->lenwords *= 2;
-		prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
 	}

 	word = &(prs->words[prs->curwords - 1]);
@@ -462,7 +462,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
 		{
 			if (word->item)
 			{
-				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
+				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
 				prs->words[prs->curwords].item = item;
 				prs->words[prs->curwords].repeated = 1;
 				prs->curwords++;
@@ -475,7 +475,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
 }

 static void
-addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
+addHLParsedLex(HeadlineParsedText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
 {
 	ParsedLex  *tmplexs;
 	TSLexeme   *ptr;
@@ -511,7 +511,7 @@ addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * n
 }

 void
-hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
+hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen)
 {
 	int			type,
 				lenlemm;
@@ -571,12 +571,12 @@ hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen
 }

 text *
-generatHeadline(HeadlineText * prs)
+generateHeadline(HeadlineParsedText * prs)
 {
 	text	   *out;
 	int			len = 128;
 	char	   *ptr;
-	HeadlineWord *wrd = prs->words;
+	HeadlineWordEntry *wrd = prs->words;

 	out = (text *) palloc(len);
 	ptr = ((char *) out) + VARHDRSZ;

--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -63,21 +63,29 @@ get_tsearch_config_filename(const char *basename,
 	return result;
 }

-#define STOPBUFLEN	4096
+static int
+comparestr(const void *a, const void *b)
+{
+	return strcmp(*(char **) a, *(char **) b);
+}

+/*
+ * Reads a stopword file. Each word is run through 'wordop'
+ * function, if given.  wordop may either modify the input in-place,
+ * or palloc a new version.
+ */
 void
-readstoplist(char *in, StopList * s)
+readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
 {
 	char	  **stop = NULL;

 	s->len = 0;
-	if (in && *in)
+	if (fname && *fname)
 	{
-		char	   *filename = get_tsearch_config_filename(in, "stop");
+		char	   *filename = get_tsearch_config_filename(fname, "stop");
 		FILE	   *hin;
-		char		buf[STOPBUFLEN];
+		char	   *line;
 		int			reallen = 0;
-		int			line = 0;

 		if ((hin = AllocateFile(filename, "r")) == NULL)
 			ereport(ERROR,
@@ -85,65 +93,56 @@ readstoplist(char *in, StopList * s)
 					 errmsg("could not open stopword file \"%s\": %m",
 							filename)));

-		while (fgets(buf, STOPBUFLEN, hin))
+		while ((line = t_readline(hin)) != NULL)
 		{
-			char	   *pbuf = buf;
+			char *pbuf = line;

-			line++;
-			while (*pbuf && !isspace(*pbuf))
+			/* Trim trailing space */
+			while (*pbuf && !t_isspace(pbuf))
 				pbuf++;
 			*pbuf = '\0';

-			if (*buf == '\0')
-				continue;
-
-			if (!pg_verifymbstr(buf, strlen(buf), true))
+			/* Skip empty lines */
+			if (*line == '\0')
 			{
-				FreeFile(hin);
-				ereport(ERROR,
-						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-						 errmsg("invalid multibyte encoding at line %d in file \"%s\"",
-								line, filename)));
+				pfree(line);
+				continue;
 			}

 			if (s->len >= reallen)
 			{
 				if (reallen == 0)
 				{
-					reallen = 16;
+					reallen = 64;
 					stop = (char **) palloc(sizeof(char *) * reallen);
 				}
 				else
 				{
 					reallen *= 2;
-					stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen);
+					stop = (char **) repalloc((void *) stop,
+											  sizeof(char *) * reallen);
 				}
 			}

-
-			if (s->wordop)
-				stop[s->len] = s->wordop(buf);
+			if (wordop)
+			{
+				stop[s->len] = wordop(line);
+				if (stop[s->len] != line)
+					pfree(line);
+			}
 			else
-				stop[s->len] = pstrdup(buf);
+				stop[s->len] = line;

 			(s->len)++;
 		}
+
 		FreeFile(hin);
 		pfree(filename);
 	}

 	s->stop = stop;
-}

-static int
-comparestr(const void *a, const void *b)
-{
-	return strcmp(*(char **) a, *(char **) b);
-}
-
-void
-sortstoplist(StopList * s)
-{
+	/* Sort to allow binary searching */
 	if (s->stop && s->len > 0)
 		qsort(s->stop, s->len, sizeof(char *), comparestr);
 }

--- a/src/backend/tsearch/wparser.c
+++ b/src/backend/tsearch/wparser.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.2 2007/08/22 01:39:45 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -300,7 +300,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
 	text	   *in = PG_GETARG_TEXT_P(1);
 	TSQuery		query = PG_GETARG_TSQUERY(2);
 	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
-	HeadlineText prs;
+	HeadlineParsedText prs;
 	List	   *prsoptions;
 	text	   *out;
 	TSConfigCacheEntry *cfg;
@@ -309,9 +309,9 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
 	cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
 	prsobj = lookup_ts_parser_cache(cfg->prsId);

-	memset(&prs, 0, sizeof(HeadlineText));
+	memset(&prs, 0, sizeof(HeadlineParsedText));
 	prs.lenwords = 32;
-	prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);

 	hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);

@@ -325,7 +325,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
 				  PointerGetDatum(prsoptions),
 				  PointerGetDatum(query));

-	out = generatHeadline(&prs);
+	out = generateHeadline(&prs);

 	PG_FREE_IF_COPY(in, 1);
 	PG_FREE_IF_COPY(query, 2);

--- a/src/include/tsearch/dicts/spell.h
+++ b/src/include/tsearch/dicts/spell.h
@@ -6,7 +6,7 @@
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -18,12 +18,17 @@
 #include "tsearch/dicts/regis.h"
 #include "tsearch/ts_public.h"

+/*
+ * Max length of a flag name. Names longer than this will be truncated
+ * to the maximum. 
+ */
+#define MAXFLAGLEN 16
+
 struct SPNode;

 typedef struct
 {
-	uint32
-				val:8,
+	uint32		val:8,
 				isword:1,
 				compoundflag:4,
 				affix:19;
@@ -54,22 +59,25 @@ typedef struct spell_struct
 {
 	union
 	{
-		char		flag[16];
+		/*
+		 * flag is filled in by NIImportDictionary. After NISortDictionary,
+		 * d is valid and flag is invalid. 
+		 */
+		char		flag[MAXFLAGLEN];
 		struct
 		{
 			int			affix;
 			int			len;
 		}			d;
 	}			p;
-	char		word[1];
+	char		word[1]; /* variable length, null-terminated */
 } SPELL;

 #define SPELLHDRSZ	(offsetof(SPELL, word))

 typedef struct aff_struct
 {
-	uint32
-				flag:8,
+	uint32		flag:8,
 				type:1,
 				flagflags:7,
 				issimple:1,
@@ -85,11 +93,16 @@ typedef struct aff_struct
 } AFFIX;

 /*
- * affixes use deictinary flags too
+ * affixes use dictionary flags too
 */
 #define FF_COMPOUNDPERMITFLAG	0x10
 #define FF_COMPOUNDFORBIDFLAG	0x20
 #define FF_CROSSPRODUCT			0x40
+
+/*
+ * Don't change the order of these. Initialization sorts by these,
+ * and expects prefixes to come first after sorting.
+ */
 #define FF_SUFFIX				1
 #define FF_PREFIX				0

@@ -97,8 +110,7 @@ struct AffixNode;

 typedef struct
 {
-	uint32
-				val:8,
+	uint32		val:8,
 				naff:24;
 	AFFIX	  **aff;
 	struct AffixNode *node;
@@ -126,9 +138,13 @@ typedef struct
 	int			naffixes;
 	AFFIX	   *Affix;

-	int			nspell;
-	int			mspell;
+	/*
+	 * Temporary array of all words in the dict file. Only used during 
+	 * initialization
+	 */
 	SPELL	  **Spell;
+	int			nspell; /* number of valid entries in Spell array */
+	int			mspell; /* allocated length of Spell array */

 	AffixNode  *Suffix;
 	AffixNode  *Prefix;

--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -5,7 +5,7 @@
 *
 * Copyright (c) 1998-2007, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -38,11 +38,11 @@

 #ifdef TS_USE_WIDE

-size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+extern size_t char2wchar(wchar_t *to, const char *from, size_t len);

 #ifdef WIN32

-size_t		wchar2char(char *to, const wchar_t *from, size_t len);
+extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
 #else							/* WIN32 */

 /* correct wcstombs */
@@ -81,8 +81,8 @@ extern int	_t_isprint(const char *ptr);
 #define COPYCHAR(d,s)	TOUCHAR(d) = TOUCHAR(s)
 #endif

-char	   *lowerstr(char *str);
-char	   *lowerstr_with_len(char *str, int len);
-char	   *recode_and_lowerstr(char *str);
+extern char *lowerstr(char *str);
+extern char *lowerstr_with_len(char *str, int len);
+extern char *t_readline(FILE *fp);

 #endif   /* __TSLOCALE_H__ */
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -6,7 +6,7 @@
 *
 * Copyright (c) 1998-2007, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.2 2007/08/22 01:39:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.3 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -71,12 +71,11 @@ typedef struct
 {
 	int			len;
 	char	  **stop;
-	char	   *(*wordop) (char *);
 } StopList;

-extern void sortstoplist(StopList * s);
-extern void readstoplist(char *in, StopList * s);
-extern bool searchstoplist(StopList * s, char *key);
+extern void readstoplist(const char *fname, StopList *s,
+						 char *(*wordop) (char *));
+extern bool searchstoplist(StopList *s, char *key);

 /*
 * Interface with dictionaries
@@ -102,9 +101,8 @@ typedef struct
 #define TSL_ADDPOS		0x01

 /*
- * Struct for supporting complex dictionaries like
- * thesaurus, pointer to is an 4-th argument for
- * dictlexize method
+ * Struct for supporting complex dictionaries like thesaurus.
+ * 4th argument for dictlexize method is a pointer to this
 */
 typedef struct
 {

--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -5,7 +5,7 @@
 *
 * Copyright (c) 1998-2007, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -13,6 +13,7 @@
 #define _PG_TS_UTILS_H_

 #include "tsearch/ts_type.h"
+#include "tsearch/ts_public.h"

 /*
 * Common parse definitions for tsvector and tsquery
@@ -38,7 +39,8 @@ typedef struct

 extern bool gettoken_tsvector(TSVectorParseState *state);

-struct ParseQueryNode;
+struct ParseQueryNode;			/* private in backend/utils/adt/tsquery.c */
+
 typedef struct
 {
 	char	   *buffer;			/* entire string we are scanning */
@@ -46,7 +48,7 @@ typedef struct
 	int4		state;
 	int4		count;

-	/* reverse polish notation in list (for temprorary usage) */
+	/* reverse polish notation in list (for temporary usage) */
 	struct ParseQueryNode *str;

 	/* number in str */
@@ -102,36 +104,12 @@ extern void parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen);
 * headline framework, flow in common to generate:
 *	1 parse text with hlparsetext
 *	2 parser-specific function to find part
- *	3 generatHeadline to generate result text
+ *	3 generateHeadline to generate result text
 */

-typedef struct
-{
-	uint32		selected:1,
-				in:1,
-				replace:1,
-				repeated:1,
-				unused:4,
-				type:8,
-				len:16;
-	char	   *word;
-	QueryItem  *item;
-} HeadlineWord;
-
-typedef struct
-{
-	HeadlineWord *words;
-	int4		lenwords;
-	int4		curwords;
-	char	   *startsel;
-	char	   *stopsel;
-	int2		startsellen;
-	int2		stopsellen;
-} HeadlineText;
-
-extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
+extern void hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query,
 			char *buf, int4 buflen);
-extern text *generatHeadline(HeadlineText * prs);
+extern text *generateHeadline(HeadlineParsedText * prs);

 /*
 * token/node types for parsing