summaryrefslogtreecommitdiff
path: root/src/backend/tsearch/ts_parse.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/tsearch/ts_parse.c')
-rw-r--r--src/backend/tsearch/ts_parse.c626
1 files changed, 626 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000000..f286a61fb0
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,626 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ * main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_utils.h"
+
+#define IGNORE_LONGLEXEME 1
+
+/*
+ * Lexize subsystem
+ */
+/*
+ * ParsedLex: one token handed to the lexize subsystem, kept as a node of a
+ * singly-linked list (see ListParsedLex) until it has been processed.
+ */
+typedef struct ParsedLex
+{
+ int type; /* token type; indexes cfg->map, 0 gets special handling in LexizeExec() */
+ char *lemm; /* token text; the pointer is stored as given, not copied */
+ int lenlemm; /* length that was passed along with lemm */
+ bool resfollow; /* NOTE(review): not referenced by the code visible in this file */
+ struct ParsedLex *next; /* following node, NULL at the tail of a list */
+} ParsedLex;
+/*
+ * ListParsedLex: head and tail pointers of a singly-linked ParsedLex list.
+ * Both pointers are NULL while the list is empty.
+ */
+typedef struct ListParsedLex
+{
+ ParsedLex *head; /* first node; LPLRemoveHead() takes nodes from here */
+ ParsedLex *tail; /* last node; LPLAddTail() appends after it */
+} ListParsedLex;
+/*
+ * LexizeData: working state of one lexize run. It is set up by LexizeInit()
+ * and then driven by LexizeAddLemm() and LexizeExec().
+ */
+typedef struct
+{
+ TSConfigCacheEntry *cfg; /* configuration whose map / lenmap are consulted */
+ Oid curDictId; /* InvalidOid, or the dictionary that asked for more tokens */
+ int posDict; /* index in the map's dictIds[] to start from for the head token */
+ DictSubState dictState; /* isend / getnext / private, passed to each lexize call */
+ ParsedLex *curSub; /* next token to offer to curDictId in multiword mode */
+ ListParsedLex towork; /* tokens still to be worked on */
+ ListParsedLex waste; /* tokens that were already lexized or skipped */
+
+ /*
+ * Fields to store the last result of a dictionary that wants several
+ * lexemes (basically, thesaurus or similar).
+ */
+
+ ParsedLex *lastRes; /* token for which tmpRes was returned */
+ TSLexeme *tmpRes; /* latest non-NULL result returned while getnext was set */
+} LexizeData;
+
+/*
+ * LexizeInit
+ *      Put a LexizeData into its empty starting state for configuration cfg.
+ *
+ * Both token lists are emptied, no multiword dictionary is active and no
+ * temporary result is pending.  ld->dictState is not touched here.
+ */
+static void
+LexizeInit(LexizeData * ld, TSConfigCacheEntry * cfg)
+{
+    /* single-word mode, starting with the first mapped dictionary */
+    ld->curDictId = InvalidOid;
+    ld->posDict = 0;
+    ld->curSub = NULL;
+
+    /* both token lists start out empty */
+    ld->towork.head = NULL;
+    ld->towork.tail = NULL;
+    ld->waste.head = NULL;
+    ld->waste.tail = NULL;
+
+    /* no saved multiword result yet */
+    ld->tmpRes = NULL;
+    ld->lastRes = NULL;
+
+    ld->cfg = cfg;
+}
+
+/*
+ * LPLAddTail
+ *      Append newpl to the end of list; newpl becomes the new tail and its
+ *      next pointer is reset to NULL.
+ */
+static void
+LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
+{
+    if (list->tail == NULL)
+        list->head = newpl;     /* list was empty: node is also the head */
+    else
+        list->tail->next = newpl;
+
+    list->tail = newpl;
+    newpl->next = NULL;
+}
+
+/*
+ * LPLRemoveHead
+ *      Unlink and return the first node of list, or NULL if list is empty.
+ *
+ * The returned node's next pointer is left as it was.
+ */
+static ParsedLex *
+LPLRemoveHead(ListParsedLex * list)
+{
+    ParsedLex  *first = list->head;
+
+    if (first != NULL)
+        list->head = first->next;
+
+    /* keep tail consistent once the list has run empty */
+    if (list->head == NULL)
+        list->tail = NULL;
+
+    return first;
+}
+
+/*
+ * LexizeAddLemm
+ *      Queue one parser token at the tail of ld->towork.
+ *
+ * type, lemm and lenlemm are stored as given; the text that lemm points to
+ * is not copied.  ld->curSub is set to the newly queued node.
+ *
+ * The node is allocated exactly once.  An earlier version called palloc()
+ * twice and overwrote the first pointer, leaking one ParsedLex per token.
+ */
+static void
+LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
+{
+    ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+    newpl->type = type;
+    newpl->lemm = lemm;
+    newpl->lenlemm = lenlemm;
+    newpl->resfollow = false;   /* palloc() does not zero memory */
+    LPLAddTail(&ld->towork, newpl);
+    ld->curSub = ld->towork.tail;
+}
+
+/*
+ * RemoveHead
+ *      Move the first token of ld->towork to the end of ld->waste and make
+ *      the next head token start again with the first mapped dictionary.
+ */
+static void
+RemoveHead(LexizeData * ld)
+{
+    ParsedLex  *done = LPLRemoveHead(&ld->towork);
+
+    LPLAddTail(&ld->waste, done);
+    ld->posDict = 0;
+}
+
+/*
+ * setCorrLex
+ *      Dispose of the ld->waste list.
+ *
+ * If correspondLexem is not NULL the list head is handed over through it
+ * (the nodes are not freed here); otherwise every node is pfree'd.  In both
+ * cases ld->waste is left empty.
+ */
+static void
+setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+    ParsedLex  *node = ld->waste.head;
+
+    if (correspondLexem != NULL)
+        *correspondLexem = node;
+    else
+    {
+        while (node != NULL)
+        {
+            ParsedLex  *following = node->next;
+
+            pfree(node);
+            node = following;
+        }
+    }
+
+    ld->waste.head = NULL;
+    ld->waste.tail = NULL;
+}
+
+/*
+ * moveToWaste
+ *      Move tokens from the front of ld->towork to ld->waste, up to and
+ *      including stop.
+ *
+ * When stop is reached, ld->curSub is set to the token that followed it.
+ * If stop is not found, the whole towork list is moved.
+ */
+static void
+moveToWaste(LexizeData * ld, ParsedLex * stop)
+{
+    while (ld->towork.head != NULL)
+    {
+        bool        reached = (ld->towork.head == stop);
+
+        /* read stop->next before RemoveHead() relinks the node */
+        if (reached)
+            ld->curSub = stop->next;
+
+        RemoveHead(ld);
+
+        if (reached)
+            break;
+    }
+}
+
+/*
+ * setNewTmpRes
+ *      Remember res as the pending result and lex as the token it was
+ *      returned for, releasing any previously remembered result first.
+ *
+ * The old result array is walked up to its terminating entry (lexeme ==
+ * NULL); each lexeme string and then the array itself are pfree'd.
+ */
+static void
+setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
+{
+    TSLexeme   *old = ld->tmpRes;
+
+    if (old != NULL)
+    {
+        TSLexeme   *cur = old;
+
+        while (cur->lexeme)
+        {
+            pfree(cur->lexeme);
+            cur++;
+        }
+        pfree(old);
+    }
+
+    ld->lastRes = lex;
+    ld->tmpRes = res;
+}
+/*
+ * LexizeExec
+ *
+ * Run the dictionaries over the tokens queued in ld->towork and return the
+ * next array of lexemes produced by a dictionary, or NULL when no result can
+ * be produced from the tokens queued so far.
+ *
+ * There are two modes, selected by ld->curDictId:
+ *   - InvalidOid: the head token is offered to the dictionaries mapped to
+ *     its type, starting at index ld->posDict; the first non-NULL result is
+ *     returned.
+ *   - otherwise: the dictionary with that OID set getnext on an earlier call
+ *     and is fed the following tokens (ld->curSub) until it stops asking.
+ *
+ * Every non-recursive return goes through setCorrLex(): if correspondLexem
+ * is not NULL it receives the list of tokens collected in ld->waste,
+ * otherwise those tokens are freed.
+ */
+static TSLexeme *
+LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
+{
+ int i;
+ ListDictionary *map;
+ TSDictionaryCacheEntry *dict;
+ TSLexeme *res;
+
+ if (ld->curDictId == InvalidOid)
+ {
+ /*
+ * usual mode: dictionary wants only one word, but we should keep in
+ * mind that we should go through all stack
+ */
+
+ while (ld->towork.head)
+ {
+ ParsedLex *curVal = ld->towork.head;
+
+ map = ld->cfg->map + curVal->type; /* map->len is read only after the lenmap check below */
+
+ if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ RemoveHead(ld);
+ continue;
+ }
+
+ for (i = ld->posDict; i < map->len; i++) /* posDict > 0 only when resuming after multiword mode */
+ {
+ dict = lookup_ts_dictionary_cache(map->dictIds[i]);
+
+ ld->dictState.isend = ld->dictState.getnext = false;
+ ld->dictState.private = NULL;
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /*
+ * dictionary wants next word, so setup and store current
+ * position and go to multiword mode.  posDict points past
+ * this dictionary, so a later fallback to the usual mode
+ * continues with the dictionaries after it.
+ */
+
+ ld->curDictId = DatumGetObjectId(map->dictIds[i]);
+ ld->posDict = i + 1;
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ return LexizeExec(ld, correspondLexem);
+ }
+
+ if (!res) /* dictionary doesn't know this lexeme */
+ continue;
+
+ RemoveHead(ld);
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ RemoveHead(ld); /* no mapped dictionary returned a result for this token */
+ }
+ }
+ else
+ { /* curDictId is valid */
+ dict = lookup_ts_dictionary_cache(ld->curDictId);
+
+ /*
+ * Dictionary ld->curDictId asks us about following words
+ */
+
+ while (ld->curSub)
+ {
+ ParsedLex *curVal = ld->curSub;
+
+ map = ld->cfg->map + curVal->type;
+
+ if (curVal->type != 0)
+ {
+ bool dictExists = false;
+
+ if (curVal->type >= ld->cfg->lenmap || map->len == 0)
+ {
+ /* skip this type of lexeme */
+ ld->curSub = curVal->next;
+ continue;
+ }
+
+ /*
+ * We should be sure that the current type of lexeme is
+ * recognized by our dictionary: we just check whether it
+ * exists in the list of dictionaries for this type.
+ */
+ for (i = 0; i < map->len && !dictExists; i++)
+ if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
+ dictExists = true;
+
+ if (!dictExists)
+ {
+ /*
+ * Dictionary can't work with current type of lexeme,
+ * return to basic mode and redo all stored lexemes
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ ld->dictState.isend = (curVal->type == 0) ? true : false; /* a type-0 token is passed on with isend set */
+ ld->dictState.getnext = false;
+
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(
+ &(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(curVal->lemm),
+ Int32GetDatum(curVal->lenlemm),
+ PointerGetDatum(&ld->dictState)
+ ));
+
+ if (ld->dictState.getnext)
+ {
+ /* Dictionary wants one more */
+ ld->curSub = curVal->next;
+ if (res)
+ setNewTmpRes(ld, curVal, res);
+ continue;
+ }
+
+ if (res || ld->tmpRes)
+ {
+ /*
+ * Dictionary normalizes lexemes, so we remove from stack all
+ * used lexemes, return to basic mode and redo end of stack
+ * (if it exists)
+ */
+ if (res)
+ {
+ moveToWaste(ld, ld->curSub); /* curSub is still curVal here */
+ }
+ else
+ {
+ res = ld->tmpRes; /* fall back to the result saved earlier */
+ moveToWaste(ld, ld->lastRes);
+ }
+
+ /* reset to initial state */
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+ setCorrLex(ld, correspondLexem);
+ return res;
+ }
+
+ /*
+ * Dict doesn't want the next lexeme and didn't recognize anything,
+ * redo from ld->towork.head
+ */
+ ld->curDictId = InvalidOid;
+ return LexizeExec(ld, correspondLexem);
+ }
+ }
+
+ setCorrLex(ld, correspondLexem);
+ return NULL;
+}
+
+/*
+ * parsetext
+ *      Parse buf (buflen bytes) with the parser of configuration cfgId,
+ *      lexize the tokens and append the resulting lexemes to prs->words.
+ *
+ * prs->pos is advanced once for every result returned by LexizeExec() and
+ * once more for each lexeme flagged TSL_ADDPOS.  The lexeme strings are
+ * stored in prs->words without copying; only the result array is freed.
+ * Tokens of MAXSTRLEN or more are reported and, with IGNORE_LONGLEXEME
+ * defined, skipped.
+ */
+void
+parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
+{
+    TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
+    TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
+    void       *prsdata;
+    LexizeData  ldata;
+    char       *lemm = NULL;
+    int         lenlemm;
+    int         type;
+
+    prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
+                                                     PointerGetDatum(buf),
+                                                     Int32GetDatum(buflen)));
+
+    LexizeInit(&ldata, cfg);
+
+    do
+    {
+        TSLexeme   *norms;
+
+        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+                                           PointerGetDatum(prsdata),
+                                           PointerGetDatum(&lemm),
+                                           PointerGetDatum(&lenlemm)));
+
+        if (type > 0 && lenlemm >= MAXSTRLEN)
+        {
+#ifdef IGNORE_LONGLEXEME
+            ereport(NOTICE,
+                    (errcode(ERRCODE_SYNTAX_ERROR),
+                     errmsg("word is too long to be indexed"),
+                     errdetail("Words longer than %d characters are ignored.",
+                               MAXSTRLEN)));
+            continue;
+#else
+            ereport(ERROR,
+                    (errcode(ERRCODE_SYNTAX_ERROR),
+                     errmsg("word is too long to be indexed")));
+#endif
+        }
+
+        LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+        /* drain every result that is available for the tokens queued so far */
+        for (;;)
+        {
+            TSLexeme   *cur;
+
+            norms = LexizeExec(&ldata, NULL);
+            if (norms == NULL)
+                break;
+
+            prs->pos++;         /* set pos */
+
+            for (cur = norms; cur->lexeme; cur++)
+            {
+                ParsedWord *slot;
+
+                if (prs->curwords == prs->lenwords)
+                {
+                    prs->lenwords *= 2;
+                    prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
+                }
+
+                if (cur->flags & TSL_ADDPOS)
+                    prs->pos++;
+
+                /* take the slot address only after a possible repalloc */
+                slot = &(prs->words[prs->curwords]);
+                slot->len = strlen(cur->lexeme);
+                slot->word = cur->lexeme;
+                slot->nvariant = cur->nvariant;
+                slot->alen = 0;
+                slot->pos.pos = LIMITPOS(prs->pos);
+
+                prs->curwords++;
+            }
+            pfree(norms);
+        }
+    } while (type > 0);
+
+    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
+/*
+ * Headline framework
+ */
+/*
+ * hladdword
+ *      Append one word to prs->words, growing the array as needed.
+ *
+ * The new entry is zeroed, then given the token type (truncated to uint8),
+ * the length buflen and a freshly palloc'd copy of the buflen bytes at buf.
+ */
+static void
+hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
+{
+    HeadlineWord *slot;
+
+    while (prs->curwords >= prs->lenwords)
+    {
+        prs->lenwords *= 2;
+        prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+    }
+
+    /* take the slot address only after the array has stopped moving */
+    slot = &(prs->words[prs->curwords]);
+    memset(slot, 0, sizeof(HeadlineWord));
+    slot->type = (uint8) type;
+    slot->len = buflen;
+    slot->word = palloc(buflen);
+    memcpy(slot->word, buf, buflen);
+
+    prs->curwords++;
+}
+
+/*
+ * hlfinditem
+ *      Link the most recently added headline word to every VAL item of
+ *      query whose operand equals the buflen bytes at buf.
+ *
+ * The first matching item is stored in the word itself.  For each further
+ * match a copy of that word is appended to prs->words, pointing at the
+ * additional item and flagged as repeated.  The array is grown up front so
+ * that query->size extra entries fit.
+ */
+static void
+hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
+{
+    QueryItem  *item = GETQUERY(query);
+    HeadlineWord *last;
+    int         n;
+
+    while (prs->curwords + query->size >= prs->lenwords)
+    {
+        prs->lenwords *= 2;
+        prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+    }
+
+    /* the word the matches belong to; stays fixed while copies are added */
+    last = &(prs->words[prs->curwords - 1]);
+
+    for (n = 0; n < query->size; n++, item++)
+    {
+        HeadlineWord *dup;
+
+        if (item->type != VAL || item->length != buflen)
+            continue;
+        if (strncmp(GETOPERAND(query) + item->distance, buf, buflen) != 0)
+            continue;
+
+        if (!last->item)
+        {
+            last->item = item;
+            continue;
+        }
+
+        dup = &(prs->words[prs->curwords]);
+        memcpy(dup, last, sizeof(HeadlineWord));
+        dup->item = item;
+        dup->repeated = 1;
+        prs->curwords++;
+    }
+}
+
+/*
+ * addHLParsedLex
+ *      Add the tokens in list lexs to the headline and link them to the
+ *      query items matching the lexemes in norms.
+ *
+ * Each token with type > 0 is appended with hladdword(); then, for every
+ * token, hlfinditem() is called once per lexeme in norms.  The token nodes
+ * are freed as they are consumed, and finally the lexeme strings and the
+ * norms array are freed too.  norms may be NULL.
+ */
+static void
+addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
+{
+    TSLexeme   *cur;
+
+    while (lexs != NULL)
+    {
+        ParsedLex  *following;
+
+        if (lexs->type > 0)
+            hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+        if (norms != NULL)
+        {
+            for (cur = norms; cur->lexeme; cur++)
+                hlfinditem(prs, query, cur->lexeme, strlen(cur->lexeme));
+        }
+
+        following = lexs->next;
+        pfree(lexs);
+        lexs = following;
+    }
+
+    if (norms == NULL)
+        return;
+
+    for (cur = norms; cur->lexeme; cur++)
+        pfree(cur->lexeme);
+    pfree(norms);
+}
+
+/*
+ * hlparsetext
+ *      Parse buf (buflen bytes) with the parser of configuration cfgId and
+ *      fill prs with headline words linked to the items of query.
+ *
+ * After each token is queued, LexizeExec() is called repeatedly; the tokens
+ * it reports and its result (possibly NULL) are passed to addHLParsedLex()
+ * every time, and the inner loop ends after the first NULL result.
+ * Tokens of MAXSTRLEN or more are reported and, with IGNORE_LONGLEXEME
+ * defined, skipped.
+ */
+void
+hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
+{
+    TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
+    TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
+    void       *prsdata;
+    LexizeData  ldata;
+    char       *lemm = NULL;
+    int         lenlemm;
+    int         type;
+
+    prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+                                                     PointerGetDatum(buf),
+                                                     Int32GetDatum(buflen)));
+
+    LexizeInit(&ldata, cfg);
+
+    do
+    {
+        TSLexeme   *norms;
+        ParsedLex  *lexs;
+
+        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+                                           PointerGetDatum(prsdata),
+                                           PointerGetDatum(&lemm),
+                                           PointerGetDatum(&lenlemm)));
+
+        if (type > 0 && lenlemm >= MAXSTRLEN)
+        {
+#ifdef IGNORE_LONGLEXEME
+            ereport(NOTICE,
+                    (errcode(ERRCODE_SYNTAX_ERROR),
+                     errmsg("word is too long to be indexed"),
+                     errdetail("Words longer than %d characters are ignored.",
+                               MAXSTRLEN)));
+            continue;
+#else
+            ereport(ERROR,
+                    (errcode(ERRCODE_SYNTAX_ERROR),
+                     errmsg("word is too long to be indexed")));
+#endif
+        }
+
+        LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+        /* the consumed tokens are handed over even when norms is NULL */
+        do
+        {
+            norms = LexizeExec(&ldata, &lexs);
+            addHLParsedLex(prs, query, lexs, norms);
+        } while (norms != NULL);
+
+    } while (type > 0);
+
+    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
+/*
+ * generatHeadline
+ *      Build the headline text from the words collected in prs.
+ *
+ * Words flagged repeated are ignored.  Of the others, those with "in" set
+ * are emitted: a word flagged replace becomes a single space, any other
+ * word is copied verbatim and, if selected, wrapped in prs->startsel and
+ * prs->stopsel.  Words without "in" set have their text freed instead.
+ * The output buffer starts at 128 bytes and is doubled as needed; the
+ * result's varlena size is set from the bytes actually written.
+ */
+text *
+generatHeadline(HeadlineText * prs)
+{
+    int         len = 128;
+    text       *out = (text *) palloc(len);
+    char       *ptr = ((char *) out) + VARHDRSZ;
+    HeadlineWord *wrd;
+
+    for (wrd = prs->words; wrd - prs->words < prs->curwords; wrd++)
+    {
+        /* make room for the word plus both selection markers */
+        while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
+        {
+            int         dist = ptr - ((char *) out);
+
+            len *= 2;
+            out = (text *) repalloc(out, len);
+            ptr = ((char *) out) + dist;
+        }
+
+        if (wrd->repeated)
+            continue;
+
+        if (!wrd->in)
+        {
+            pfree(wrd->word);
+            continue;
+        }
+
+        if (wrd->replace)
+        {
+            *ptr = ' ';
+            ptr++;
+            continue;
+        }
+
+        if (wrd->selected)
+        {
+            memcpy(ptr, prs->startsel, prs->startsellen);
+            ptr += prs->startsellen;
+        }
+
+        memcpy(ptr, wrd->word, wrd->len);
+        ptr += wrd->len;
+
+        if (wrd->selected)
+        {
+            memcpy(ptr, prs->stopsel, prs->stopsellen);
+            ptr += prs->stopsellen;
+        }
+    }
+
+    SET_VARSIZE(out, ptr - ((char *) out));
+    return out;
+}