path: root/src/backend/tsearch/ts_parse.c
author  Tom Lane <tgl@sss.pgh.pa.us>  2007-08-21 01:11:32 +0000
committer  Tom Lane <tgl@sss.pgh.pa.us>  2007-08-21 01:11:32 +0000
commit  140d4ebcb46e17cdb1be43892ed797e5e060c8ef (patch)
tree  f99d209dbe5e40dcb434c3841e0c8b4ff383f453 /src/backend/tsearch/ts_parse.c
parent  4e94d1f952c3ce5670ceae3c12b55e344503a701 (diff)
download  postgresql-140d4ebcb46e17cdb1be43892ed797e5e060c8ef.tar.gz
Tsearch2 functionality migrates to core. The bulk of this work is by
Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done.
Diffstat (limited to 'src/backend/tsearch/ts_parse.c')
-rw-r--r--  src/backend/tsearch/ts_parse.c  626
1 file changed, 626 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000000..f286a61fb0
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,626 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ * main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_utils.h"
+
+#define IGNORE_LONGLEXEME 1
+
+/*
+ * Lexize subsystem
+ */
+
+typedef struct ParsedLex
+{
+ int type;
+ char *lemm;
+ int lenlemm;
+ bool resfollow;
+ struct ParsedLex *next;
+} ParsedLex;
+
+typedef struct ListParsedLex
+{
+ ParsedLex *head;
+ ParsedLex *tail;
+} ListParsedLex;
+
+typedef struct
+{
+ TSConfigCacheEntry *cfg;
+ Oid curDictId;
+ int posDict;
+ DictSubState dictState;
+ ParsedLex *curSub;
+ ListParsedLex towork; /* current list to work on */
+ ListParsedLex waste; /* list of lexemes that are already lexized */
+
+ /*
+ * fields to store the last variant to lexize (basically, for thesaurus or
+ * similar dictionaries, which want several lexemes)
+ */
+
+ ParsedLex *lastRes;
+ TSLexeme *tmpRes;
+} LexizeData;
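+
+/*
+ * Illustrative call-pattern sketch, not part of the original patch: this is
+ * how parsetext() and hlparsetext() below drive the Lexize subsystem; the
+ * loop shown here is only a schematic summary of that code.
+ *
+ *   LexizeData ldata;
+ *   TSLexeme *norms;
+ *
+ *   LexizeInit(&ldata, lookup_ts_config_cache(cfgId));
+ *   while (the parser returns a token of some type)
+ *   {
+ *       LexizeAddLemm(&ldata, type, lemm, lenlemm);
+ *       while ((norms = LexizeExec(&ldata, NULL)) != NULL)
+ *           ... consume the zero-terminated TSLexeme array, then pfree it ...
+ *   }
+ */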
+
+static void
+LexizeInit(LexizeData * ld, TSConfigCacheEntry * cfg)
+{
+ ld->cfg = cfg;
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+ ld->waste.head = ld->waste.tail = NULL;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
+{
+ if (list->tail)
+ {
+ list->tail->next = newpl;
+ list->tail = newpl;
+ }
+ else
+ list->head = list->tail = newpl;
+ newpl->next = NULL;
+}
+
+static ParsedLex *
+LPLRemoveHead(ListParsedLex * list)
+{
+ ParsedLex *res = list->head;
+
+ if (list->head)
+ list->head = list->head->next;
+
+ if (list->head == NULL)
+ list->tail = NULL;
+
+ return res;
+}
+
+static void
+LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
+{
+ /* allocate a new list node for this token */
+ ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+ newpl->type = type;
+ newpl->lemm = lemm;
+ newpl->lenlemm = lenlemm;
+ LPLAddTail(&ld->towork, newpl);
+ ld->curSub = ld->towork.tail;
+}
+
+static void
+RemoveHead(LexizeData * ld)
+{
+ LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+
+ ld->posDict = 0;
+}
+
+static void
+setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+ if (correspondLexem)
+ {
+ *correspondLexem = ld->waste.head;
+ }
+ else
+ {
+ ParsedLex *tmp,
+ *ptr = ld->waste.head;
+
+ while (ptr)
+ {
+ tmp = ptr->next;
+ pfree(ptr);
+ ptr = tmp;
+ }
+ }
+ ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData * ld, ParsedLex * stop)
+{
+ bool go = true;
+
+ while (ld->towork.head && go)
+ {
+ if (ld->towork.head == stop)
+ {
+ ld->curSub = stop->next;
+ go = false;
+ }
+ RemoveHead(ld);
+ }
+}
+
+static void
+setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
+{
+ if (ld->tmpRes)
+ {
+ TSLexeme *ptr;
+
+ for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
+ pfree(ptr->lexeme);
+ pfree(ld->tmpRes);
+ }
+ ld->tmpRes = res;
+ ld->lastRes = lex;
+}
+
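+/*
+ * LexizeExec -- lexize the pending input in ld->towork.
+ *
+ * Returns a palloc'd, zero-terminated TSLexeme array as soon as some
+ * dictionary produces output for the token(s) at the head of the list, or
+ * NULL when the pending input is exhausted or a multiword dictionary is
+ * still waiting for more lexemes.  If correspondLexem is not NULL, the
+ * consumed ParsedLex list is handed back through it instead of being freed
+ * (the headline code relies on this).
+ */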
+static TSLexeme *
+LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+ int i;
+ ListDictionary *map;
+ TSDictionaryCacheEntry *dict;
+ TSLexeme *res;
+
+ if (ld->curDictId == InvalidOid)
+ {
+ /*
+ * usual mode: the dictionary wants only one word, but keep in
+ * mind that we still have to work through the whole stack
+ */
+
+ while (ld->towork.head)
+ {
+ ParsedLex *curVal = ld->towork.head;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ RemoveHead(ld);
+ continue;
+ }
+
+ for (i = ld->posDict; i < map->len; i++)
+ {
+ dict = lookup_ts_dictionary_cache(map->dictIds[i]);
+
+ ld->dictState.isend = ld->dictState.getnext = false;
+ ld->dictState.private = NULL;
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /*
+ * the dictionary wants the next word, so set up and store the
+ * current position and go to multiword mode
+ */
+
+ ld->curDictId = DatumGetObjectId(map->dictIds[i]);
+ ld->posDict = i + 1;
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ return LexizeExec(ld, correspondLexem);
+ }
+
+ if (!res) /* dictionary doesn't know this lexeme */
+ continue;
+
+ RemoveHead(ld);
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ RemoveHead(ld);
+ }
+ }
+ else
+ { /* curDictId is valid */
+ dict = lookup_ts_dictionary_cache(ld->curDictId);
+
+ /*
+ * Dictionary ld->curDictId asks us about the following words
+ */
+
+ while (ld->curSub)
+ {
+ ParsedLex *curVal = ld->curSub;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type != 0)
+ {
+ bool dictExists = false;
+
+ if (curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ ld->curSub = curVal->next;
+ continue;
+ }
+
+ /*
+ * We should be sure that the current type of lexeme is recognized
+ * by our dictionary: we just check whether it exists in the
+ * list of dictionaries.
+ */
+ for (i = 0; i < map->len && !dictExists; i++)
+ if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
+ dictExists = true;
+
+ if (!dictExists)
+ {
+ /*
+ * Dictionary can't work with the current type of lexeme;
+ * return to basic mode and redo all stored lexemes
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ ld->dictState.isend = (curVal->type == 0) ? true : false;
+ ld->dictState.getnext = false;
+
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /* Dictionary wants one more */
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ continue;
+ }
+
+ if (res || ld->tmpRes)
+ {
+ /*
+ * The dictionary normalized the lexemes, so we remove all used
+ * lexemes from the stack, return to basic mode and redo the end
+ * of the stack (if any)
+ */
+ if (res)
+ {
+ moveToWaste(ld, ld->curSub);
+ }
+ else
+ {
+ res = ld->tmpRes;
+ moveToWaste(ld, ld->lastRes);
+ }
+
+ /* reset to initial state */
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ /*
+ * The dictionary doesn't want the next lexeme and didn't recognize
+ * anything; redo from ld->towork.head
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ setCorrLex(ld, correspondLexem);
+ return NULL;
+}
+
+/*
+ * Parse string and lexize words
+ */
+void
+parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
+{
+ int type,
+ lenlemm;
+ char *lemm = NULL;
+ LexizeData ldata;
+ TSLexeme *norms;
+ TSConfigCacheEntry *cfg;
+ TSParserCacheEntry *prsobj;
+ void *prsdata;
+
+ cfg = lookup_ts_config_cache(cfgId);
+ prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+ prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
+ PointerGetDatum(buf),
+ Int32GetDatum(buflen)));
+
+ LexizeInit(&ldata, cfg);
+
+ do
+ {
+ type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+ PointerGetDatum(prsdata),
+ PointerGetDatum(&lemm),
+ PointerGetDatum(&lenlemm)));
+
+ if (type > 0 && lenlemm >= MAXSTRLEN)
+ {
+#ifdef IGNORE_LONGLEXEME
+ ereport(NOTICE,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed"),
+ errdetail("Words longer than %d characters are ignored.",
+ MAXSTRLEN)));
+ continue;
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed")));
+#endif
+ }
+
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+ while ((norms = LexizeExec(&ldata, NULL)) != NULL)
+ {
+ TSLexeme *ptr = norms;
+
+ prs->pos++; /* set pos */
+
+ while (ptr->lexeme)
+ {
+ if (prs->curwords == prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
+ }
+
+ if (ptr->flags & TSL_ADDPOS)
+ prs->pos++;
+ prs->words[prs->curwords].len = strlen(ptr->lexeme);
+ prs->words[prs->curwords].word = ptr->lexeme;
+ prs->words[prs->curwords].nvariant = ptr->nvariant;
+ prs->words[prs->curwords].alen = 0;
+ prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
+ ptr++;
+ prs->curwords++;
+ }
+ pfree(norms);
+ }
+ } while (type > 0);
+
+ FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
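+
+/*
+ * Illustrative sketch, not part of the original patch: a hypothetical caller
+ * of parsetext().  The ParsedText fields touched here (lenwords, curwords,
+ * pos, words) are the ones this file maintains; the in-core callers live in
+ * other files of this commit.  Note that prs->words must be preallocated
+ * with a nonzero lenwords, since parsetext() only grows it with repalloc().
+ *
+ *   ParsedText prs;
+ *
+ *   prs.lenwords = 32;
+ *   prs.curwords = 0;
+ *   prs.pos = 0;
+ *   prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+ *   parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
+ *   ... prs.words[0 .. prs.curwords - 1] now hold the normalized lexemes ...
+ */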
+
+/*
+ * Headline framework
+ */
+static void
+hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
+{
+ while (prs->curwords >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+ }
+ memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord));
+ prs->words[prs->curwords].type = (uint8) type;
+ prs->words[prs->curwords].len = buflen;
+ prs->words[prs->curwords].word = palloc(buflen);
+ memcpy(prs->words[prs->curwords].word, buf, buflen);
+ prs->curwords++;
+}
+
+static void
+hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
+{
+ int i;
+ QueryItem *item = GETQUERY(query);
+ HeadlineWord *word;
+
+ while (prs->curwords + query->size >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+ }
+
+ word = &(prs->words[prs->curwords - 1]);
+ for (i = 0; i < query->size; i++)
+ {
+ if (item->type == VAL && item->length == buflen && strncmp(GETOPERAND(query) + item->distance, buf, buflen) == 0)
+ {
+ if (word->item)
+ {
+ memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
+ prs->words[prs->curwords].item = item;
+ prs->words[prs->curwords].repeated = 1;
+ prs->curwords++;
+ }
+ else
+ word->item = item;
+ }
+ item++;
+ }
+}
+
+static void
+addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
+{
+ ParsedLex *tmplexs;
+ TSLexeme *ptr;
+
+ while (lexs)
+ {
+
+ if (lexs->type > 0)
+ hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+ ptr = norms;
+ while (ptr && ptr->lexeme)
+ {
+ hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+ ptr++;
+ }
+
+ tmplexs = lexs->next;
+ pfree(lexs);
+ lexs = tmplexs;
+ }
+
+ if (norms)
+ {
+ ptr = norms;
+ while (ptr->lexeme)
+ {
+ pfree(ptr->lexeme);
+ ptr++;
+ }
+ pfree(norms);
+ }
+}
+
+void
+hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
+{
+ int type,
+ lenlemm;
+ char *lemm = NULL;
+ LexizeData ldata;
+ TSLexeme *norms;
+ ParsedLex *lexs;
+ TSConfigCacheEntry *cfg;
+ TSParserCacheEntry *prsobj;
+ void *prsdata;
+
+ cfg = lookup_ts_config_cache(cfgId);
+ prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+ prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+ PointerGetDatum(buf),
+ Int32GetDatum(buflen)));
+
+ LexizeInit(&ldata, cfg);
+
+ do
+ {
+ type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+ PointerGetDatum(prsdata),
+ PointerGetDatum(&lemm),
+ PointerGetDatum(&lenlemm)));
+
+ if (type > 0 && lenlemm >= MAXSTRLEN)
+ {
+#ifdef IGNORE_LONGLEXEME
+ ereport(NOTICE,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed"),
+ errdetail("Words longer than %d characters are ignored.",
+ MAXSTRLEN)));
+ continue;
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("word is too long to be indexed")));
+#endif
+ }
+
+ LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+ do
+ {
+ if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+ addHLParsedLex(prs, query, lexs, norms);
+ else
+ addHLParsedLex(prs, query, lexs, NULL);
+ } while (norms);
+
+ } while (type > 0);
+
+ FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
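+/*
+ * Assemble the headline text from prs->words: words marked "in" are copied
+ * to the output (wrapped in startsel/stopsel when "selected", or replaced by
+ * a space when "replace"), while non-repeated words that don't make it into
+ * the headline are freed here.
+ */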
+text *
+generatHeadline(HeadlineText * prs)
+{
+ text *out;
+ int len = 128;
+ char *ptr;
+ HeadlineWord *wrd = prs->words;
+
+ out = (text *) palloc(len);
+ ptr = ((char *) out) + VARHDRSZ;
+
+ while (wrd - prs->words < prs->curwords)
+ {
+ while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
+ {
+ int dist = ptr - ((char *) out);
+
+ len *= 2;
+ out = (text *) repalloc(out, len);
+ ptr = ((char *) out) + dist;
+ }
+
+ if (wrd->in && !wrd->repeated)
+ {
+ if (wrd->replace)
+ {
+ *ptr = ' ';
+ ptr++;
+ }
+ else
+ {
+ if (wrd->selected)
+ {
+ memcpy(ptr, prs->startsel, prs->startsellen);
+ ptr += prs->startsellen;
+ }
+ memcpy(ptr, wrd->word, wrd->len);
+ ptr += wrd->len;
+ if (wrd->selected)
+ {
+ memcpy(ptr, prs->stopsel, prs->stopsellen);
+ ptr += prs->stopsellen;
+ }
+ }
+ }
+ else if (!wrd->repeated)
+ pfree(wrd->word);
+
+ wrd++;
+ }
+
+ SET_VARSIZE(out, ptr - ((char *) out));
+ return out;
+}
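+
+/*
+ * Illustrative sketch, not part of the original patch: how the headline
+ * pieces above are meant to fit together.  The caller shown here is
+ * hypothetical; in the real code the equivalent work is done by the headline
+ * support machinery, and something (normally the parser's headline method)
+ * must mark prs.words[i].in / .selected / .replace before generatHeadline()
+ * runs.
+ *
+ *   HeadlineText prs;
+ *   text *out;
+ *
+ *   memset(&prs, 0, sizeof(HeadlineText));
+ *   prs.lenwords = 32;
+ *   prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
+ *   hlparsetext(cfgId, &prs, query, VARDATA(doc), VARSIZE(doc) - VARHDRSZ);
+ *   ... mark the words to show, e.g. prs.words[i].in = prs.words[i].selected = 1 ...
+ *   prs.startsel = "<b>";  prs.startsellen = strlen(prs.startsel);
+ *   prs.stopsel = "</b>";  prs.stopsellen = strlen(prs.stopsel);
+ *   out = generatHeadline(&prs);
+ */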