Saturday, May 31, 2008

[HACKERS] phrase search

Index: src/backend/utils/adt/Makefile
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/utils/adt/Makefile,v
retrieving revision 1.69
diff -u -r1.69 Makefile
--- src/backend/utils/adt/Makefile 19 Feb 2008 10:30:08 -0000 1.69
+++ src/backend/utils/adt/Makefile 31 May 2008 19:57:34 -0000
@@ -29,7 +29,7 @@
tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
tsvector.o tsvector_op.o tsvector_parser.o \
- txid.o uuid.o xml.o
+ txid.o uuid.o xml.o phrase_search.o

like.o: like.c like_match.c

Index: src/backend/utils/adt/phrase_search.c
===================================================================
RCS file: src/backend/utils/adt/phrase_search.c
diff -N src/backend/utils/adt/phrase_search.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/backend/utils/adt/phrase_search.c 31 May 2008 19:56:59 -0000
@@ -0,0 +1,167 @@
+#include "postgres.h"
+
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"
+
+#include "fmgr.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+PG_FUNCTION_INFO_V1(is_phrase_present);
+Datum is_phrase_present(PG_FUNCTION_ARGS);
+
+typedef struct {
+ WordEntryPosVector *posVector;
+ int4 posInPhrase;
+ int4 curpos;
+} PhraseInfo;
+
+static int
+WordCompareVectorEntry(char *eval, WordEntry *ptr, ParsedWord *prsdword)
+{
+ if (ptr->len == prsdword->len)
+ return strncmp(
+ eval + ptr->pos,
+ prsdword->word,
+ prsdword->len);
+
+ return (ptr->len > prsdword->len) ? 1 : -1;
+}
+
+/*
+ * Returns a pointer to a WordEntry from tsvector t corresponding to prsdword.
+ * Returns NULL if not found.
+ */
+static WordEntry *
+find_wordentry_prsdword(TSVector t, ParsedWord *prsdword)
+{
+ WordEntry *StopLow = ARRPTR(t);
+ WordEntry *StopHigh = (WordEntry *) STRPTR(t);
+ WordEntry *StopMiddle;
+ int difference;
+
+ /* Loop invariant: StopLow <= item < StopHigh */
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ difference = WordCompareVectorEntry(STRPTR(t), StopMiddle, prsdword);
+ if (difference == 0)
+ return StopMiddle;
+ else if (difference < 0)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+
+ return NULL;
+}
+
+
+static int4
+check_and_advance(int4 i, PhraseInfo *phraseInfo)
+{
+ WordEntryPosVector *posvector1, *posvector2;
+ int4 diff;
+
+ posvector1 = phraseInfo[i].posVector;
+ posvector2 = phraseInfo[i+1].posVector;
+
+ diff = phraseInfo[i+1].posInPhrase - phraseInfo[i].posInPhrase;
+ while (posvector2->pos[phraseInfo[i+1].curpos] - posvector1->pos[phraseInfo[i].curpos] < diff)
+ if (phraseInfo[i+1].curpos >= posvector2->npos - 1)
+ return 2;
+ else
+ phraseInfo[i+1].curpos += 1;
+
+ if (posvector2->pos[phraseInfo[i+1].curpos] - posvector1->pos[phraseInfo[i].curpos] == diff)
+ return 1;
+ else
+ return 0;
+}
+
+int4
+initialize_phraseinfo(ParsedText *prs, TSVector t, PhraseInfo *phraseInfo)
+{
+ WordEntry *entry;
+ int4 i;
+
+ for (i = 0; i < prs->curwords; i++)
+ {
+ phraseInfo[i].posInPhrase = prs->words[i].pos.pos;
+ entry = find_wordentry_prsdword(t, &(prs->words[i]));
+ if (entry == NULL)
+ return 0;
+ else
+ phraseInfo[i].posVector = _POSVECPTR(t, entry);
+ }
+ return 1;
+}
+Datum
+is_phrase_present(PG_FUNCTION_ARGS)
+{
+ ParsedText prs;
+ int4 numwords, i, retval, found = 0;
+ PhraseInfo *phraseInfo;
+ text *phrase = PG_GETARG_TEXT_P(0);
+ TSVector t = PG_GETARG_TSVECTOR(1);
+ Oid cfgId = getTSCurrentConfig(true);
+
+ prs.lenwords = (VARSIZE(phrase) - VARHDRSZ) / 6; /* just estimation of * word's number */
+ if (prs.lenwords == 0)
+ prs.lenwords = 2;
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc0(sizeof(ParsedWord) * prs.lenwords);
+
+ parsetext(cfgId, &prs, VARDATA(phrase), VARSIZE(phrase) - VARHDRSZ);
+
+ // allocate & initialize
+ numwords = prs.curwords;
+ phraseInfo = palloc0(numwords * sizeof(PhraseInfo));
+
+
+ if (numwords > 0 && initialize_phraseinfo(&prs, t, phraseInfo))
+ {
+ // if there is only one word and we are able to initialize
+ // the phraseInfo then it is a success
+ if (numwords == 1)
+ found = 1;
+ while (!found)
+ {
+ retval = check_and_advance(0, phraseInfo);
+ if (retval == 2)
+ break;
+ else if (retval == 1)
+ {
+ for (i = 1; i < numwords - 1; i++)
+ {
+ retval = check_and_advance(i, phraseInfo);
+ if (retval == 2 || retval == 0)
+ break;
+ }
+ if (i >= numwords - 1)
+ // found a phrase
+ found = 1;
+ else if (retval == 2)
+ // no chance of finding a phrase
+ break;
+ }
+ // now move the pos of 0th phrase
+ if (phraseInfo[0].curpos >= phraseInfo[0].posVector->npos- 1)
+ // no chance of finding a phrase
+ break;
+ else
+ phraseInfo[0].curpos += 1;
+ }
+ }
+ pfree(phraseInfo);
+ pfree(prs.words);
+ // prepare to leave
+ PG_FREE_IF_COPY(phrase, 0);
+ PG_FREE_IF_COPY(t, 1);
+ // return
+ PG_RETURN_INT32(found);
+}
I have attached a patch for phrase search with respect to the cvs head.
Basically it takes a a phrase (text) and a TSVector. It checks if the
relative positions of lexeme in the phrase are same as in their
positions in TSVector.

If the configuration for text search is "simple", then this will produce
exact phrase search. Otherwise the stopwords in a phrase will be ignored
and the words in a phrase will only be matched with the stemmed lexeme.

For my application I am using this as a separate shared object. I do not
know how to expose this function from the core. Can someone explain how
to do this?

I saw this discussion on phrase search and I am not sure what other
functionality is required.

http://archives.postgresql.org/pgsql-general/2008-02/msg01170.php

-Sushant.

No comments: