path: root/src/backend/tsearch/ts_parse.c
author  Tom Lane <tgl@sss.pgh.pa.us>  2007-08-21 01:11:32 +0000
committer  Tom Lane <tgl@sss.pgh.pa.us>  2007-08-21 01:11:32 +0000
commit  140d4ebcb46e17cdb1be43892ed797e5e060c8ef (patch)
tree  f99d209dbe5e40dcb434c3841e0c8b4ff383f453 /src/backend/tsearch/ts_parse.c
parent  4e94d1f952c3ce5670ceae3c12b55e344503a701 (diff)
download  postgresql-140d4ebcb46e17cdb1be43892ed797e5e060c8ef.tar.gz
Tsearch2 functionality migrates to core. The bulk of this work is by
Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done.
Diffstat (limited to 'src/backend/tsearch/ts_parse.c')
-rw-r--r--  src/backend/tsearch/ts_parse.c  626
1 file changed, 626 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000000..f286a61fb0
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,626 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ * main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_utils.h"
+
+#define IGNORE_LONGLEXEME 1
+
+/*
+ * Lexize subsystem
+ */
+
+typedef struct ParsedLex
+{
+ int type;
+ char *lemm;
+ int lenlemm;
+ bool resfollow;
+ struct ParsedLex *next;
+} ParsedLex;
+
+typedef struct ListParsedLex
+{
+ ParsedLex *head;
+ ParsedLex *tail;
+} ListParsedLex;
+
+typedef struct
+{
+ TSConfigCacheEntry *cfg;
+ Oid curDictId;
+ int posDict;
+ DictSubState dictState;
+ ParsedLex *curSub;
+ ListParsedLex towork; /* current list to work on */
+ ListParsedLex waste; /* list of lexemes that are already lexized */
+
+ /*
+ * fields to store the last variant to lexize (basically, for thesaurus or
+ * similar dictionaries, which want several lexemes)
+ */
+
+ ParsedLex *lastRes;
+ TSLexeme *tmpRes;
+} LexizeData;
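+
+/*
+ * Illustrative call-pattern sketch, not part of the original patch: this is
+ * how parsetext() and hlparsetext() below drive the Lexize subsystem; the
+ * loop shown here is only a schematic summary of that code.
+ *
+ *   LexizeData ldata;
+ *   TSLexeme *norms;
+ *
+ *   LexizeInit(&ldata, lookup_ts_config_cache(cfgId));
+ *   while (the parser returns a token of some type)
+ *   {
+ *       LexizeAddLemm(&ldata, type, lemm, lenlemm);
+ *       while ((norms = LexizeExec(&ldata, NULL)) != NULL)
+ *           ... consume the zero-terminated TSLexeme array, then pfree it ...
+ *   }
+ */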
+
+static void
+LexizeInit(LexizeData * ld, TSConfigCacheEntry * cfg)
+{
+ ld->cfg = cfg;
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+ ld->waste.head = ld->waste.tail = NULL;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
+{
+ if (list->tail)
+ {
+ list->tail->next = newpl;
+ list->tail = newpl;
+ }
+ else
+ list->head = list->tail = newpl;
+ newpl->next = NULL;
+}
+
+static ParsedLex *
+LPLRemoveHead(ListParsedLex * list)
+{
+ ParsedLex *res = list->head;
+
+ if (list->head)
+ list->head = list->head->next;
+
+ if (list->head == NULL)
+ list->tail = NULL;
+
+ return res;
+}
+
+static void
+LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
+{
+ /* allocate a new list node for this token */
+ ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+ newpl->type = type;
+ newpl->lemm = lemm;
+ newpl->lenlemm = lenlemm;
+ LPLAddTail(&ld->towork, newpl);
+ ld->curSub = ld->towork.tail;
+}
+
+static void
+RemoveHead(LexizeData * ld)
+{
+ LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+
+ ld->posDict = 0;
+}
+
+static void
+setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+ if (correspondLexem)
+ {
+ *correspondLexem = ld->waste.head;
+ }
+ else
+ {
+ ParsedLex *tmp,
+ *ptr = ld->waste.head;
+
+ while (ptr)
+ {
+ tmp = ptr->next;
+ pfree(ptr);
+ ptr = tmp;
+ }
+ }
+ ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData * ld, ParsedLex * stop)
+{
+ bool go = true;
+
+ while (ld->towork.head && go)
+ {
+ if (ld->towork.head == stop)
+ {
+ ld->curSub = stop->next;
+ go = false;
+ }
+ RemoveHead(ld);
+ }
+}
+
+static void
+setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
+{
+ if (ld->tmpRes)
+ {
+ TSLexeme *ptr;
+
+ for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
+ pfree(ptr->lexeme);
+ pfree(ld->tmpRes);
+ }
+ ld->tmpRes = res;
+ ld->lastRes = lex;
+}
+
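+/*
+ * LexizeExec -- lexize the pending input in ld->towork.
+ *
+ * Returns a palloc'd, zero-terminated TSLexeme array as soon as some
+ * dictionary produces output for the token(s) at the head of the list, or
+ * NULL when the pending input is exhausted or a multiword dictionary is
+ * still waiting for more lexemes.  If correspondLexem is not NULL, the
+ * consumed ParsedLex list is handed back through it instead of being freed
+ * (the headline code relies on this).
+ */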
+static TSLexeme *
+LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+ int i;
+ ListDictionary *map;
+ TSDictionaryCacheEntry *dict;
+ TSLexeme *res;
+
+ if (ld->curDictId == InvalidOid)
+ {
+ /*
+ * usual mode: the dictionary wants only one word, but keep in
+ * mind that we still have to work through the whole stack
+ */
+
+ while (ld->towork.head)
+ {
+ ParsedLex *curVal = ld->towork.head;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ RemoveHead(ld);
+ continue;
+ }
+
+ for (i = ld->posDict; i < map->len; i++)
+ {
+ dict = lookup_ts_dictionary_cache(map->dictIds[i]);
+
+ ld->dictState.isend = ld->dictState.getnext = false;
+ ld->dictState.private = NULL;
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /*
+ * the dictionary wants the next word, so set up and store the
+ * current position and go to multiword mode
+ */
+
+ ld->curDictId = DatumGetObjectId(map->dictIds[i]);
+ ld->posDict = i + 1;
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ return LexizeExec(ld, correspondLexem);
+ }
+
+ if (!res) /* dictionary doesn't know this lexeme */
+ continue;
+
+ RemoveHead(ld);
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ RemoveHead(ld);
+ }
+ }
+ else
+ { /* curDictId is valid */
+ dict = lookup_ts_dictionary_cache(ld->curDictId);
+
+ /*
+ * Dictionary ld->curDictId asks us about the following words
+ */
+
+ while (ld->curSub)
+ {
+ ParsedLex *curVal = ld->curSub;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type != 0)
+ {
+ bool dictExists = false;
+
+ if (curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ ld->curSub = curVal->next;
+ continue;
+ }
+
+ /*
+ * We should be sure that the current type of lexeme is recognized
+ * by our dictionary: we just check whether it exists in the
+ * list of dictionaries.
+ */
+ for (i = 0; i < map->len && !dictExists; i++)
+ if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
+ dictExists = true;
+
+ if (!dictExists)
+ {
+ /*
+ * Dictionary can't work with the current type of lexeme;
+ * return to basic mode and redo all stored lexemes
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ ld->dictState.isend = (curVal->type == 0) ? true : false;
+ ld->dictState.getnext = false;
+
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /* Dictionary wants one more */
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ continue;
+ }
+
+ if (res || ld->tmpRes)
+ {
+ /*
+ * The dictionary normalized the lexemes, so we remove all used
+ * lexemes from the stack, return to basic mode and redo the end
+ * of the stack (if any)
+ */
+ if (res)
+ {
+ moveToWaste(ld, ld->curSub);
+ }
+ else
+ {
+ res = ld->tmpRes;
+ moveToWaste(ld, ld->lastRes);
+ }
+
+ /* reset to initial state */
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ /*
+ * The dictionary doesn't want the next lexeme and didn't recognize
+ * anything; redo from ld->towork.head
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ setCorrLex(ld, correspondLexem);
+ return NULL;
+}
+
+/*
+ * Parse string and lexize words
+ */
+void
+parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
+{
+ int type,
+ lenlemm;
+ char *lemm = NULL;
+ LexizeData ldata;
+ TSLexeme *norms;
+ TSConfigCacheEntry *cfg;
+ TSParserCacheEntry *prsobj;
+ void *prsdata;
+
+ cfg = lookup_ts_config_cache(cfgId);
+ prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+ prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
+ PointerGetDatum(buf),
+ Int32GetDatum(buflen)));
+
+ LexizeInit(&ldata, cfg);
+
+ do
+ {
+ type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+ PointerGetDatum(prsdata),
+ PointerGetDatum(&lemm),
+ PointerGetDatum(&lenlemm)));
+
+ if (type > 0 && lenlemm >= MAXSTRLEN)
+ {
+#ifdef IGNORE_LONGLEXEME
+ ereport(NOTICE,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed"),
+ errdetail("Words longer than %d characters are ignored.",
+ MAXSTRLEN)));
+ continue;
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed")));
+#endif
+ }
+
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+ while ((norms = LexizeExec(&ldata, NULL)) != NULL)
+ {
+ TSLexeme *ptr = norms;
+
+ prs->pos++; /* set pos */
+
+ while (ptr->lexeme)
+ {
+ if (prs->curwords == prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
+ }
+
+ if (ptr->flags & TSL_ADDPOS)
+ prs->pos++;
+ prs->words[prs->curwords].len = strlen(ptr->lexeme);
+ prs->words[prs->curwords].word = ptr->lexeme;
+ prs->words[prs->curwords].nvariant = ptr->nvariant;
+ prs->words[prs->curwords].alen = 0;
+ prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
+ ptr++;
+ prs->curwords++;
+ }
+ pfree(norms);
+ }
+ } while (type > 0);
+
+ FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
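+
+/*
+ * Illustrative sketch, not part of the original patch: a hypothetical caller
+ * of parsetext().  The ParsedText fields touched here (lenwords, curwords,
+ * pos, words) are the ones this file maintains; the in-core callers live in
+ * other files of this commit.  Note that prs->words must be preallocated
+ * with a nonzero lenwords, since parsetext() only grows it with repalloc().
+ *
+ *   ParsedText prs;
+ *
+ *   prs.lenwords = 32;
+ *   prs.curwords = 0;
+ *   prs.pos = 0;
+ *   prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+ *   parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
+ *   ... prs.words[0 .. prs.curwords - 1] now hold the normalized lexemes ...
+ */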
+
+/*
+ * Headline framework
+ */
+static void
+hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
+{
+ while (prs->curwords >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+ }
+ memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord));
+ prs->words[prs->curwords].type = (uint8) type;
+ prs->words[prs->curwords].len = buflen;
+ prs->words[prs->curwords].word = palloc(buflen);
+ memcpy(prs->words[prs->curwords].word, buf, buflen);
+ prs->curwords++;
+}
+
+static void
+hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
+{
+ int i;
+ QueryItem *item = GETQUERY(query);
+ HeadlineWord *word;
+
+ while (prs->curwords + query->size >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+ }
+
+ word = &(prs->words[prs->curwords - 1]);
+ for (i = 0; i < query->size; i++)
+ {
+ if (item->type == VAL && item->length == buflen && strncmp(GETOPERAND(query) + item->distance, buf, buflen) == 0)
+ {
+ if (word->item)
+ {
+ memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
+ prs->words[prs->curwords].item = item;
+ prs->words[prs->curwords].repeated = 1;
+ prs->curwords++;
+ }
+ else
+ word->item = item;
+ }
+ item++;
+ }
+}
+
+static void
+addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
+{
+ ParsedLex *tmplexs;
+ TSLexeme *ptr;
+
+ while (lexs)
+ {
+
+ if (lexs->type > 0)
+ hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+ ptr = norms;
+ while (ptr && ptr->lexeme)
+ {
+ hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+ ptr++;
+ }
+
+ tmplexs = lexs->next;
+ pfree(lexs);
+ lexs = tmplexs;
+ }
+
+ if (norms)
+ {
+ ptr = norms;
+ while (ptr->lexeme)
+ {
+ pfree(ptr->lexeme);
+ ptr++;
+ }
+ pfree(norms);
+ }
+}
+
+void
+hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
+{
+ int type,
+ lenlemm;
+ char *lemm = NULL;
+ LexizeData ldata;
+ TSLexeme *norms;
+ ParsedLex *lexs;
+ TSConfigCacheEntry *cfg;
+ TSParserCacheEntry *prsobj;
+ void *prsdata;
+
+ cfg = lookup_ts_config_cache(cfgId);
+ prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+ prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+ PointerGetDatum(buf),
+ Int32GetDatum(buflen)));
+
+ LexizeInit(&ldata, cfg);
+
+ do
+ {
+ type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+ PointerGetDatum(prsdata),
+ PointerGetDatum(&lemm),
+ PointerGetDatum(&lenlemm)));
+
+ if (type > 0 && lenlemm >= MAXSTRLEN)
+ {
+#ifdef IGNORE_LONGLEXEME
+ ereport(NOTICE,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed"),
+ errdetail("Words longer than %d characters are ignored.",
+ MAXSTRLEN)));
+ continue;
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed")));
+#endif
+ }
+
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+ do
+ {
+ if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+ addHLParsedLex(prs, query, lexs, norms);
+ else
+ addHLParsedLex(prs, query, lexs, NULL);
+ } while (norms);
+
+ } while (type > 0);
+
+ FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
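+/*
+ * Assemble the headline text from prs->words: words marked "in" are copied
+ * to the output (wrapped in startsel/stopsel when "selected", or replaced by
+ * a space when "replace"), while non-repeated words that don't make it into
+ * the headline are freed here.
+ */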
+text *
+generatHeadline(HeadlineText * prs)
+{
+ text *out;
+ int len = 128;
+ char *ptr;
+ HeadlineWord *wrd = prs->words;
+
+ out = (text *) palloc(len);
+ ptr = ((char *) out) + VARHDRSZ;
+
+ while (wrd - prs->words < prs->curwords)
+ {
+ while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
+ {
+ int dist = ptr - ((char *) out);
+
+ len *= 2;
+ out = (text *) repalloc(out, len);
+ ptr = ((char *) out) + dist;
+ }
+
+ if (wrd->in && !wrd->repeated)
+ {
+ if (wrd->replace)
+ {
+ *ptr = ' ';
+ ptr++;
+ }
+ else
+ {
+ if (wrd->selected)
+ {
+ memcpy(ptr, prs->startsel, prs->startsellen);
+ ptr += prs->startsellen;
+ }
+ memcpy(ptr, wrd->word, wrd->len);
+ ptr += wrd->len;
+ if (wrd->selected)
+ {
+ memcpy(ptr, prs->stopsel, prs->stopsellen);
+ ptr += prs->stopsellen;
+ }
+ }
+ }
+ else if (!wrd->repeated)
+ pfree(wrd->word);
+
+ wrd++;
+ }
+
+ SET_VARSIZE(out, ptr - ((char *) out));
+ return out;
+}
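+
+/*
+ * Illustrative sketch, not part of the original patch: how the headline
+ * pieces above are meant to fit together.  The caller shown here is
+ * hypothetical; in the real code the equivalent work is done by the headline
+ * support machinery, and something (normally the parser's headline method)
+ * must mark prs.words[i].in / .selected / .replace before generatHeadline()
+ * runs.
+ *
+ *   HeadlineText prs;
+ *   text *out;
+ *
+ *   memset(&prs, 0, sizeof(HeadlineText));
+ *   prs.lenwords = 32;
+ *   prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
+ *   hlparsetext(cfgId, &prs, query, VARDATA(doc), VARSIZE(doc) - VARHDRSZ);
+ *   ... mark the words to show, e.g. prs.words[i].in = prs.words[i].selected = 1 ...
+ *   prs.startsel = "<b>";  prs.startsellen = strlen(prs.startsel);
+ *   prs.stopsel = "</b>";  prs.stopsellen = strlen(prs.stopsel);
+ *   out = generatHeadline(&prs);
+ */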