dict_thesaurus.c

/*-------------------------------------------------------------------------
 *
 * dict_thesaurus.c
 *		Thesaurus dictionary: phrase to phrase substitution
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "storage/fd.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


/*
 * Temporay we use TSLexeme.flags for inner use...
 */
#define DT_USEASIS		0x1000

typedef struct LexemeInfo
{
	uint16		idsubst;		/* entry's number in DictThesaurus->subst */
	uint16		posinsubst;		/* pos info in entry */
	uint16		tnvariant;		/* total num lexemes in one variant */
	struct LexemeInfo *nextentry;
	struct LexemeInfo *nextvariant;
} LexemeInfo;

typedef struct
{
	char	   *lexeme;
	LexemeInfo *entries;
} TheLexeme;

typedef struct
{
	uint16		lastlexeme;		/* number lexemes to substitute */
	uint16		reslen;
	TSLexeme   *res;			/* prepared substituted result */
} TheSubstitute;

typedef struct
{
	/* subdictionary to normalize lexemes */
	Oid			subdictOid;
	TSDictionaryCacheEntry *subdict;

	/* Array to search lexeme by exact match */
	TheLexeme  *wrds;
	int			nwrds;
	int			ntwrds;

	/*
	 * Storage of substituted result, n-th element is for n-th expression
	 */
	TheSubstitute *subst;
	int			nsubst;
} DictThesaurus;


static void
newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
{
	TheLexeme  *ptr;

	if (d->nwrds >= d->ntwrds)
	{
		if (d->ntwrds == 0)
		{
			d->ntwrds = 16;
			d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
		}
		else
		{
			d->ntwrds *= 2;
			d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
		}
	}

	ptr = d->wrds + d->nwrds;
	d->nwrds++;

	ptr->lexeme = palloc(e - b + 1);

	memcpy(ptr->lexeme, b, e - b);
	ptr->lexeme[e - b] = '\0';

	ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));

	ptr->entries->nextentry = NULL;
	ptr->entries->idsubst = idsubst;
	ptr->entries->posinsubst = posinsubst;
}

static void
addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
{
	static int	nres = 0;
	static int	ntres = 0;
	TheSubstitute *ptr;

	if (nwrd == 0)
	{
		nres = ntres = 0;

		if (idsubst >= d->nsubst)
		{
			if (d->nsubst == 0)
			{
				d->nsubst = 16;
				d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
			}
			else
			{
				d->nsubst *= 2;
				d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
			}
		}
	}

	ptr = d->subst + idsubst;

	ptr->lastlexeme = posinsubst - 1;

	if (nres + 1 >= ntres)
	{
		if (ntres == 0)
		{
			ntres = 2;
			ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
		}
		else
		{
			ntres *= 2;
			ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
		}

	}

	ptr->res[nres].lexeme = palloc(e - b + 1);
	memcpy(ptr->res[nres].lexeme, b, e - b);
	ptr->res[nres].lexeme[e - b] = '\0';

	ptr->res[nres].nvariant = nwrd;
	if (useasis)
		ptr->res[nres].flags = DT_USEASIS;
	else
		ptr->res[nres].flags = 0;

	ptr->res[++nres].lexeme = NULL;
}

#define TR_WAITLEX	1
#define TR_INLEX	2
#define TR_WAITSUBS 3
#define TR_INSUBS	4

static void
thesaurusRead(char *filename, DictThesaurus * d)
{
	FILE	   *fh;
	int			lineno = 0;
	uint16		idsubst = 0;
	bool		useasis = false;
	char	   *line;

	filename = get_tsearch_config_filename(filename, "ths");
	fh = AllocateFile(filename, "r");
	if (!fh)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	while ((line = t_readline(fh)) != NULL)
	{
		char	   *ptr;
		int			state = TR_WAITLEX;
		char	   *beginwrd = NULL;
		uint16		posinsubst = 0;
		uint16		nwrd = 0;

		lineno++;

		ptr = line;

		/* is it a comment? */
		while (*ptr && t_isspace(ptr))
			ptr += pg_mblen(ptr);

		if (t_iseq(ptr, '#') || *ptr == '\0' ||
			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			pfree(line);
			continue;
		}

		while (*ptr)
		{
			if (state == TR_WAITLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					if (posinsubst == 0)
					{
						FreeFile(fh);
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected delimiter at line %d of thesaurus file \"%s\"",
										lineno, filename)));
					}
					state = TR_WAITSUBS;
				}
				else if (!t_isspace(ptr))
				{
					beginwrd = ptr;
					state = TR_INLEX;
				}
			}
			else if (state == TR_INLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITSUBS;
				}
				else if (t_isspace(ptr))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITLEX;
				}
			}
			else if (state == TR_WAITSUBS)
			{
				if (t_iseq(ptr, '*'))
				{
					useasis = true;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (t_iseq(ptr, '\\'))
				{
					useasis = false;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (!t_isspace(ptr))
				{
					useasis = false;
					beginwrd = ptr;
					state = TR_INSUBS;
				}
			}
			else if (state == TR_INSUBS)
			{
				if (t_isspace(ptr))
				{
					if (ptr == beginwrd)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
										lineno, filename)));
					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
					state = TR_WAITSUBS;
				}
			}
			else
				elog(ERROR, "unrecognized thesaurus state: %d", state);

			ptr += pg_mblen(ptr);
		}

		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
								lineno, filename)));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		if (!(nwrd && posinsubst))
		{
			FreeFile(fh);
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line at line %d of thesaurus file \"%s\"",
							lineno, filename)));
		}

		pfree(line);
	}

	d->nsubst = idsubst;

	FreeFile(fh);
}

static TheLexeme *
addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant)
{

	if (*nnw >= *tnm)
	{
		*tnm *= 2;
		newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
	}

	newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));

	if (lexeme && lexeme->lexeme)
	{
		newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
		newwrds[*nnw].entries->tnvariant = tnvariant;
	}
	else
	{
		newwrds[*nnw].lexeme = NULL;
		newwrds[*nnw].entries->tnvariant = 1;
	}

	newwrds[*nnw].entries->idsubst = src->idsubst;
	newwrds[*nnw].entries->posinsubst = src->posinsubst;

	newwrds[*nnw].entries->nextentry = NULL;

	(*nnw)++;
	return newwrds;
}

static int
cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b)
{
	if (a == NULL || b == NULL)
		return 0;

	if (a->idsubst == b->idsubst)
	{
		if (a->posinsubst == b->posinsubst)
		{
			if (a->tnvariant == b->tnvariant)
				return 0;

			return (a->tnvariant > b->tnvariant) ? 1 : -1;
		}

		return (a->posinsubst > b->posinsubst) ? 1 : -1;
	}

	return (a->idsubst > b->idsubst) ? 1 : -1;
}

static int
cmpLexeme(TheLexeme * a, TheLexeme * b)
{
	if (a->lexeme == NULL)
	{
		if (b->lexeme == NULL)
			return 0;
		else
			return 1;
	}
	else if (b->lexeme == NULL)
		return -1;

	return strcmp(a->lexeme, b->lexeme);
}

static int
cmpLexemeQ(const void *a, const void *b)
{
	return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
}

static int
cmpTheLexeme(const void *a, const void *b)
{
	TheLexeme  *la = (TheLexeme *) a;
	TheLexeme  *lb = (TheLexeme *) b;
	int			res;

	if ((res = cmpLexeme(la, lb)) != 0)
		return res;

	return -cmpLexemeInfo(la->entries, lb->entries);
}

static void
compileTheLexeme(DictThesaurus * d)
{
	int			i,
				nnw = 0,
				tnm = 16;
	TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
			   *ptrwrds;

	for (i = 0; i < d->nwrds; i++)
	{
		TSLexeme   *ptr;

		ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
									   PointerGetDatum(d->subdict->dictData),
										  PointerGetDatum(d->wrds[i].lexeme),
									Int32GetDatum(strlen(d->wrds[i].lexeme)),
													 PointerGetDatum(NULL)));

		if (!ptr)
			elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
				 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
		else if (!(ptr->lexeme))
		{
			elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
				 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);

			newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
		}
		else
		{
			while (ptr->lexeme)
			{
				TSLexeme   *remptr = ptr + 1;
				int			tnvar = 1;
				int			curvar = ptr->nvariant;

				/* compute n words in one variant */
				while (remptr->lexeme)
				{
					if (remptr->nvariant != (remptr - 1)->nvariant)
						break;
					tnvar++;
					remptr++;
				}

				remptr = ptr;
				while (remptr->lexeme && remptr->nvariant == curvar)
				{
					newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
					remptr++;
				}

				ptr = remptr;
			}
		}

		pfree(d->wrds[i].lexeme);
		pfree(d->wrds[i].entries);
	}

	pfree(d->wrds);
	d->wrds = newwrds;
	d->nwrds = nnw;
	d->ntwrds = tnm;

	if (d->nwrds > 1)
	{
		qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);

		/* uniq */
		newwrds = d->wrds;
		ptrwrds = d->wrds + 1;
		while (ptrwrds - d->wrds < d->nwrds)
		{
			if (cmpLexeme(ptrwrds, newwrds) == 0)
			{
				if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
				{
					ptrwrds->entries->nextentry = newwrds->entries;
					newwrds->entries = ptrwrds->entries;
				}
				else
					pfree(ptrwrds->entries);

				if (ptrwrds->lexeme)
					pfree(ptrwrds->lexeme);
			}
			else
			{
				newwrds++;
				*newwrds = *ptrwrds;
			}

			ptrwrds++;
		}

		d->nwrds = newwrds - d->wrds + 1;
		d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
	}
}

static void
compileTheSubstitute(DictThesaurus * d)
{
	int			i;

	for (i = 0; i < d->nsubst; i++)
	{
		TSLexeme   *rem = d->subst[i].res,
				   *outptr,
				   *inptr;
		int			n = 2;

		outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
		outptr->lexeme = NULL;
		inptr = rem;

		while (inptr && inptr->lexeme)
		{
			TSLexeme   *lexized,
						tmplex[2];

			if (inptr->flags & DT_USEASIS)
			{					/* do not lexize */
				tmplex[0] = *inptr;
				tmplex[0].flags = 0;
				tmplex[1].lexeme = NULL;
				lexized = tmplex;
			}
			else
			{
				lexized = (TSLexeme *) DatumGetPointer(
													   FunctionCall4(
													   &(d->subdict->lexize),
									   PointerGetDatum(d->subdict->dictData),
											  PointerGetDatum(inptr->lexeme),
										Int32GetDatum(strlen(inptr->lexeme)),
														PointerGetDatum(NULL)
																	 )
					);
			}

			if (lexized && lexized->lexeme)
			{
				int			toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;

				while (lexized->lexeme)
				{
					if (outptr - d->subst[i].res + 1 >= n)
					{
						int			diff = outptr - d->subst[i].res;

						n *= 2;
						d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
						outptr = d->subst[i].res + diff;
					}

					*outptr = *lexized;
					outptr->lexeme = pstrdup(lexized->lexeme);

					outptr++;
					lexized++;
				}

				if (toset > 0)
					d->subst[i].res[toset].flags |= TSL_ADDPOS;
			}
			else if (lexized)
			{
				elog(NOTICE, "thesaurus word \"%s\" in substitution is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1);
			}
			else
			{
				elog(ERROR, "thesaurus word \"%s\" in substitution isn't recognized (rule %d)", inptr->lexeme, i + 1);
			}

			if (inptr->lexeme)
				pfree(inptr->lexeme);
			inptr++;
		}

		if (outptr == d->subst[i].res)
			elog(ERROR, "all words in thesaurus substitution are stop words (rule %d)", i + 1);

		d->subst[i].reslen = outptr - d->subst[i].res;

		pfree(rem);
	}
}

Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictThesaurus *d;
	char	   *subdictname = NULL;
	bool		fileloaded = false;
	ListCell   *l;

	d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));

	foreach(l, dictoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);

		if (pg_strcasecmp("DictFile", defel->defname) == 0)
		{
			if (fileloaded)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple DictFile parameters")));
			thesaurusRead(defGetString(defel), d);
			fileloaded = true;
		}
		else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
		{
			if (subdictname)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple Dictionary parameters")));
			subdictname = pstrdup(defGetString(defel));
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Thesaurus parameter: \"%s\"",
							defel->defname)));
		}
	}

	if (!fileloaded)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing DictFile parameter")));
	if (!subdictname)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Dictionary parameter")));

	d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
	d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

	compileTheLexeme(d);
	compileTheSubstitute(d);

	PG_RETURN_POINTER(d);
}

static LexemeInfo *
findTheLexeme(DictThesaurus * d, char *lexeme)
{
	TheLexeme	key, *res;

	if (d->nwrds == 0)
		return NULL;

	key.lexeme = lexeme;
	key.entries = NULL;

	res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

	if (res == NULL)
		return NULL;
	return res->entries;
}

static bool
matchIdSubst(LexemeInfo * stored, uint16 idsubst)
{
	bool		res = true;

	if (stored)
	{
		res = false;

		for (; stored; stored = stored->nextvariant)
			if (stored->idsubst == idsubst)
			{
				res = true;
				break;
			}
	}

	return res;
}

static LexemeInfo *
findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn)
{
	for (;;)
	{
		int			i;
		LexemeInfo *ptr = newin[0];

		for (i = 0; i < newn; i++)
		{
			while (newin[i] && newin[i]->idsubst < ptr->idsubst)
				newin[i] = newin[i]->nextentry;

			if (newin[i] == NULL)
				return in;

			if (newin[i]->idsubst > ptr->idsubst)
			{
				ptr = newin[i];
				i = -1;
				continue;
			}

			while (newin[i]->idsubst == ptr->idsubst)
			{
				if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
				{
					ptr = newin[i];
					break;
				}

				newin[i] = newin[i]->nextentry;
				if (newin[i] == NULL)
					return in;
			}

			if (newin[i]->idsubst != ptr->idsubst)
			{
				ptr = newin[i];
				i = -1;
				continue;
			}
		}

		if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
		{						/* found */

			ptr->nextvariant = in;
			in = ptr;
		}

		/* step forward */
		for (i = 0; i < newn; i++)
			newin[i] = newin[i]->nextentry;
	}

	return NULL;
}

static TSLexeme *
copyTSLexeme(TheSubstitute * ts)
{
	TSLexeme   *res;
	uint16		i;

	res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
	for (i = 0; i < ts->reslen; i++)
	{
		res[i] = ts->res[i];
		res[i].lexeme = pstrdup(ts->res[i].lexeme);
	}

	res[ts->reslen].lexeme = NULL;

	return res;
}

static TSLexeme *
checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres)
{
	*moreres = false;
	while (info)
	{
		Assert(info->idsubst < d->nsubst);
		if (info->nextvariant)
			*moreres = true;
		if (d->subst[info->idsubst].lastlexeme == curpos)
			return copyTSLexeme(d->subst + info->idsubst);
		info = info->nextvariant;
	}

	return NULL;
}

Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
	DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
	DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
	TSLexeme   *res = NULL;
	LexemeInfo *stored,
			   *info = NULL;
	uint16		curpos = 0;
	bool		moreres = false;

	if (PG_NARGS() < 4 || dstate == NULL)
		elog(ERROR, "forbidden call of thesaurus or nested call");

	if (dstate->isend)
		PG_RETURN_POINTER(NULL);
	stored = (LexemeInfo *) dstate->private;

	if (stored)
		curpos = stored->posinsubst + 1;

	if (!d->subdict->isvalid)
		d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
									   PointerGetDatum(d->subdict->dictData),
													 PG_GETARG_DATUM(1),
													 PG_GETARG_DATUM(2),
													 PointerGetDatum(NULL)));

	if (res && res->lexeme)
	{
		TSLexeme   *ptr = res,
				   *basevar;

		while (ptr->lexeme)
		{
			uint16		nv = ptr->nvariant;
			uint16		i,
						nlex = 0;
			LexemeInfo **infos;

			basevar = ptr;
			while (ptr->lexeme && nv == ptr->nvariant)
			{
				nlex++;
				ptr++;
			}

			infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
			for (i = 0; i < nlex; i++)
				if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
					break;

			if (i < nlex)
			{
				/* no chance to find */
				pfree(infos);
				continue;
			}

			info = findVariant(info, stored, curpos, infos, nlex);
		}
	}
	else if (res)
	{							/* stop-word */
		LexemeInfo *infos = findTheLexeme(d, NULL);

		info = findVariant(NULL, stored, curpos, &infos, 1);
	}
	else
	{
		info = NULL;			/* word isn't recognized */
	}

	dstate->private = (void *) info;

	if (!info)
	{
		dstate->getnext = false;
		PG_RETURN_POINTER(NULL);
	}

	if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
	{
		dstate->getnext = moreres;
		PG_RETURN_POINTER(res);
	}

	dstate->getnext = true;

	PG_RETURN_POINTER(NULL);
}