PostgreSQL: Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Index: src/backend/tsearch/dict.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/dict.c,v
retrieving revision 1.5
diff -u -r1.5 dict.c
--- src/backend/tsearch/dict.c 25 Mar 2008 22:42:43 -0000 1.5
+++ src/backend/tsearch/dict.c 30 May 2008 23:20:57 -0000
@@ -16,6 +16,7 @@
#include "catalog/pg_type.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_public.h"
#include "utils/builtins.h"

Index: src/backend/tsearch/to_tsany.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/to_tsany.c,v
retrieving revision 1.12
diff -u -r1.12 to_tsany.c
--- src/backend/tsearch/to_tsany.c 16 May 2008 16:31:01 -0000 1.12
+++ src/backend/tsearch/to_tsany.c 31 May 2008 08:43:27 -0000
@@ -15,6 +15,7 @@

#include "catalog/namespace.h"
#include "tsearch/ts_cache.h"
+#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/syscache.h"
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.8
diff -u -r1.8 ts_parse.c
--- src/backend/tsearch/ts_parse.c 16 May 2008 16:31:01 -0000 1.8
+++ src/backend/tsearch/ts_parse.c 31 May 2008 08:09:32 -0000
@@ -485,7 +485,11 @@
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
ParsedLex *tmplexs;
+ ParsedText *prstxt;
TSLexeme *ptr;
+ int startpos, endpos, pos;
+
+ startpos = prs->curwords;

while (lexs)
{
@@ -505,6 +509,29 @@
lexs = tmplexs;
}

+ /* update the corresponding ParsedText */
+ pos = startpos;
+ endpos = prs->curwords;
+ prstxt = &(prs->prstxt);
+ for (ptr = norms; ptr && ptr->lexeme; ptr++)
+ {
+ while (prstxt->curwords >= prstxt->lenwords)
+ {
+ prstxt->lenwords *= 2;
+ prstxt->words = (ParsedWord *) repalloc((void *) prstxt->words, prstxt->lenwords * sizeof(ParsedWord));
+ }
+ if (ptr->flags & TSL_ADDPOS)
+ pos++;
+
+ prstxt->words[prstxt->curwords].len = strlen(ptr->lexeme);
+ prstxt->words[prstxt->curwords].word = palloc( prstxt->words[prstxt->curwords].len);
+ memcpy(prstxt->words[prstxt->curwords].word, ptr->lexeme, prstxt->words[prstxt->curwords].len);
+ prstxt->words[prstxt->curwords].nvariant = ptr->nvariant;
+ prstxt->words[prstxt->curwords].flags = ptr->flags & TSL_PREFIX;
+ prstxt->words[prstxt->curwords].alen = 0;
+ prstxt->words[prstxt->curwords].pos.pos = pos;
+ prstxt->curwords++;
+ }
if (norms)
{
ptr = norms;
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.9
diff -u -r1.9 wparser.c
--- src/backend/tsearch/wparser.c 12 May 2008 00:00:50 -0000 1.9
+++ src/backend/tsearch/wparser.c 31 May 2008 08:44:01 -0000
@@ -317,6 +317,9 @@
prs.lenwords = 32;
prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);

+ prs.prstxt.lenwords = 32;
+ prs.prstxt.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.prstxt.lenwords);
+
hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);

if (opt)
@@ -335,6 +338,11 @@
PG_FREE_IF_COPY(query, 2);
if (opt)
PG_FREE_IF_COPY(opt, 3);
+
+
+ /* prs.prstxt.words are all freed up by make_tsvector itself
+ * so don't need to free it now */
+
pfree(prs.words);
pfree(prs.startsel);
pfree(prs.stopsel);
Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.14
diff -u -r1.14 wparser_def.c
--- src/backend/tsearch/wparser_def.c 1 Jan 2008 19:45:52 -0000 1.14
+++ src/backend/tsearch/wparser_def.c 31 May 2008 15:14:16 -0000
@@ -1684,18 +1684,176 @@
return false;
}

-Datum
-prsd_headline(PG_FUNCTION_ARGS)
+/*
+ * Overwrite word slot idx of prs with the cover separator text and set its
+ * "in" flag, unless the slot already holds exactly that text (presumably
+ * because a neighbouring fragment has already put a separator there).
+ */
+static void
+put_cover_separator(HeadlineParsedText *prs, int idx, const char *sep, int seplen)
+{
+    HeadlineWordEntry *w = &(prs->words[idx]);
+
+    /* compare the whole word, not just a prefix: "." is not a separator */
+    if (w->len == seplen && memcmp(w->word, sep, seplen) == 0)
+        return;
+
+    w->word = repalloc(w->word, sizeof(char) * seplen);
+    memcpy(w->word, sep, seplen);
+    w->len = seplen;
+    w->in = 1;
+}
+
+/*
+ * mark_fragment
+ *
+ * Mark the words startpos..endpos (inclusive indexes into prs->words) as one
+ * headline fragment, then put a "..." separator into the word slot directly
+ * before and directly after the fragment when such a slot exists.
+ *
+ * highlight selects which token types get their "replace" flag set:
+ * HLIDIGNORE types when it is 0, XMLHLIDIGNORE types otherwise.
+ *
+ * Note that a separator overwrites the text of the neighbouring word slot
+ * in place.
+ */
+static void
+mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
+{
+    const char *coversep = "...";
+    int         coverlen = strlen(coversep);
+    int         i;
+
+    /* never walk outside the parsed word array */
+    if (startpos < 0)
+        startpos = 0;
+    if (endpos >= prs->curwords)
+        endpos = prs->curwords - 1;
+    if (startpos > endpos)
+        return;
+
+    for (i = startpos; i <= endpos; i++)
+    {
+        if (prs->words[i].item)
+            prs->words[i].selected = 1;
+
+        if (highlight == 0)
+        {
+            if (HLIDIGNORE(prs->words[i].type))
+                prs->words[i].replace = 1;
+        }
+        else
+        {
+            if (XMLHLIDIGNORE(prs->words[i].type))
+                prs->words[i].replace = 1;
+        }
+
+        prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
+    }
+
+    /* leading separator: only if a word slot precedes the fragment */
+    if (startpos > 0)
+        put_cover_separator(prs, startpos - 1, coversep, coverlen);
+
+    /*
+     * Trailing separator: only if a word slot follows the fragment.  This
+     * must look at (and resize) words[endpos + 1], including its length.
+     */
+    if (endpos + 1 < prs->curwords)
+        put_cover_separator(prs, endpos + 1, coversep, coverlen);
+}
+
+/*
+ * mark_hl_fragments
+ *
+ * Fragment-based headline generator.  Builds a tsvector from the norms
+ * collected in prs->prstxt, enumerates every cover of the query with
+ * Cover(), picks up to num_fragments of the shortest non-overlapping covers,
+ * resizes each one towards maxcoversize words and marks it with
+ * mark_fragment().
+ *
+ * prs            parsed headline text; its words are marked in place
+ * query          the query whose covers are searched
+ * highlight      passed through to mark_fragment()
+ * num_fragments  maximum number of fragments to mark
+ * maxcoversize   target size of a fragment, in words
+ *
+ * Cover positions are used directly as indexes into prs->words; this relies
+ * on the positions stored in prs->prstxt being word indexes.
+ */
+static void
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
+                  int num_fragments, int maxcoversize)
+{
+    DocRepresentation *doc;
+    Extention   ext;
+    QueryRepresentation qr;
+    CoverPos   *covers;
+    TSVector    t;
+    int4        doclen;
+    int4        numcovers = 0;
+    int4        maxcovers = 32;
+    int4        coverlen;
+    int4        startpos;
+    int4        endpos;
+    int4        i;
+    int4        f;
+
+    t = make_tsvector(&(prs->prstxt));
+
+    qr.query = query;
+    qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
+
+    /* start generating covers for the query */
+    doc = get_docrep(t, &qr, &doclen);
+    if (!doc)
+    {
+        /* cannot do anything; release what was allocated so far */
+        pfree(qr.operandexist);
+        if (t)
+            pfree(t);
+        return;
+    }
+
+    /* collect all covers, growing the array as needed */
+    covers = palloc(maxcovers * sizeof(CoverPos));
+    MemSet(&ext, 0, sizeof(Extention));
+    while (Cover(doc, doclen, &qr, &ext))
+    {
+        if (numcovers >= maxcovers)
+        {
+            maxcovers *= 2;
+            covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
+        }
+        covers[numcovers].startpos = ext.p;
+        covers[numcovers].endpos = ext.q;
+        covers[numcovers].in = 0;
+        covers[numcovers].excluded = 0;
+        numcovers++;
+    }
+
+    /* the tsvector and the document representation are no longer needed */
+    if (t)
+        pfree(t);
+    pfree(doc);
+
+    /* choose the best (shortest) covers, one per requested fragment */
+    for (f = 0; f < num_fragments; f++)
+    {
+        int4        best = -1;
+        int4        bestlen = 0;
+
+        for (i = 0; i < numcovers; i++)
+        {
+            if (covers[i].in || covers[i].excluded)
+                continue;
+            coverlen = covers[i].endpos - covers[i].startpos + 1;
+            if (best < 0 || coverlen < bestlen)
+            {
+                best = i;
+                bestlen = coverlen;
+            }
+        }
+        if (best < 0)
+            break;              /* no usable cover left */
+
+        covers[best].in = 1;
+
+        /*
+         * Adjust the size of the cover:
+         *   if maxcoversize > len, stretch it by (maxcoversize - len) / 2
+         *   words on each side;
+         *   if maxcoversize < len, keep startpos and cut the end at
+         *   startpos + maxcoversize (the starting lexeme stays visible).
+         */
+        startpos = covers[best].startpos;
+        endpos = covers[best].endpos;
+        coverlen = endpos - startpos + 1;
+
+        if (maxcoversize > coverlen)
+        {
+            int4        stretch = (maxcoversize - coverlen) / 2;
+
+            startpos -= stretch;
+            endpos += stretch;
+        }
+        else if (maxcoversize < coverlen)
+            endpos = startpos + maxcoversize;
+
+        /* keep the fragment inside the parsed word array */
+        if (startpos < 0)
+            startpos = 0;
+        if (endpos >= prs->curwords)
+            endpos = prs->curwords - 1;
+
+        covers[best].startpos = startpos;
+        covers[best].endpos = endpos;
+
+        /* exclude every other cover whose range intersects this fragment */
+        for (i = 0; i < numcovers; i++)
+        {
+            if (i != best &&
+                covers[i].startpos <= endpos &&
+                covers[i].endpos >= startpos)
+                covers[i].excluded = 1;
+        }
+    }
+
+    /* mark the chosen fragments (covers) */
+    for (i = 0; i < numcovers; i++)
+    {
+        if (covers[i].in)
+            mark_fragment(prs, highlight, covers[i].startpos, covers[i].endpos);
+    }
+
+    pfree(qr.operandexist);
+    pfree(covers);
+}
+static void
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words)
+{
int p = 0,
q = 0;
int bestb = -1,
@@ -1707,56 +1865,9 @@
curlen;

int i;
- int highlight = 0;
- ListCell *l;
-
- /* config */
- prs->startsel = NULL;
- prs->stopsel = NULL;
- foreach(l, prsoptions)
- {
- DefElem *defel = (DefElem *) lfirst(l);
- char *val = defGetString(defel);
-
- if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- max_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- min_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- shortword = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- prs->startsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- prs->stopsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("unrecognized headline parameter: \"%s\"",
- defel->defname)));
- }

if (highlight == 0)
{
- if (min_words >= max_words)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("ShortWord should be >= 0")));
-
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
@@ -1877,6 +1988,82 @@
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}

+}
+
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+ HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ List *prsoptions = (List *) PG_GETARG_POINTER(1);
+ TSQuery query = PG_GETARG_TSQUERY(2);
+
+ /* from opt + start and and tag */
+ int min_words = 15;
+ int max_words = 35;
+ int shortword = 3;
+ int num_fragments = 0;
+ int highlight = 0;
+ ListCell *l;
+
+ /* config */
+ prs->startsel = NULL;
+ prs->stopsel = NULL;
+ foreach(l, prsoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+ char *val = defGetString(defel);
+
+ if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ max_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ min_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ shortword = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "NumFragments") == 0)
+ num_fragments = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ prs->startsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ prs->stopsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ highlight = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized headline parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (highlight == 0)
+ {
+ if (min_words >= max_words)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be less than MaxWords")));
+ if (min_words <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be positive")));
+ if (shortword < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ShortWord should be >= 0")));
+ if (num_fragments < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("NumFragments should be >= 0")));
+
+ if (num_fragments == 0)
+ /* call the default headline generator */
+ mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ else
+ mark_hl_fragments(prs, query, highlight, num_fragments, max_words);
+ }
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
@@ -1886,3 +2073,4 @@

PG_RETURN_POINTER(prs);
}
+
Index: src/backend/utils/adt/tsrank.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/utils/adt/tsrank.c,v
retrieving revision 1.13
diff -u -r1.13 tsrank.c
--- src/backend/utils/adt/tsrank.c 16 May 2008 16:31:01 -0000 1.13
+++ src/backend/utils/adt/tsrank.c 30 May 2008 17:44:01 -0000
@@ -17,6 +17,7 @@

#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
#include "utils/array.h"
#include "miscadmin.h"

@@ -488,14 +489,6 @@
PG_RETURN_FLOAT4(res);
}

-typedef struct
-{
- QueryItem **item;
- int16 nitem;
- uint8 wclass;
- int32 pos;
-} DocRepresentation;
-
static int
compareDocR(const void *va, const void *vb)
{
@@ -507,12 +500,6 @@
return (a->pos > b->pos) ? 1 : -1;
}

-typedef struct
-{
- TSQuery query;
- bool *operandexist;
-} QueryRepresentation;
-
#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true

@@ -524,17 +511,7 @@
return QR_GET_OPERAND_EXISTS(qr, val);
}

-typedef struct
-{
- int pos;
- int p;
- int q;
- DocRepresentation *begin;
- DocRepresentation *end;
-} Extention;
-
-
-static bool
+bool
Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext)
{
DocRepresentation *ptr;
@@ -615,7 +592,7 @@
return Cover(doc, len, qr, ext);
}

-static DocRepresentation *
+DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.9
diff -u -r1.9 ts_public.h
--- src/include/tsearch/ts_public.h 16 May 2008 16:31:02 -0000 1.9
+++ src/include/tsearch/ts_public.h 31 May 2008 15:10:24 -0000
@@ -14,6 +14,7 @@
#define _PG_TS_PUBLIC_H_

#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"

/*
* Parser's framework
@@ -47,6 +48,7 @@

typedef struct
{
+ ParsedText prstxt;
HeadlineWordEntry *words;
int4 lenwords;
int4 curwords;
@@ -55,6 +57,24 @@
int2 startsellen;
int2 stopsellen;
} HeadlineParsedText;
+/*
+ * headline framework, flow in common to generate:
+ * 1 parse text with hlparsetext
+ * 2 parser-specific function to find part
+ * 3 generateHeadline to generate result text
+ */
+
+/*
+ * One candidate headline fragment (a cover of the query), as used by
+ * mark_hl_fragments() in wparser_def.c.
+ */
+typedef struct
+{
+ int4 startpos; /* first position of the cover; may be moved when the fragment is resized */
+ int4 endpos; /* last position of the cover; may be moved when the fragment is resized */
+ int2 in; /* set to 1 once the cover is chosen for the headline */
+ int2 excluded; /* set to 1 when the cover overlaps an already chosen one */
+} CoverPos;
+
+extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
+ char *buf, int4 buflen);
+extern text *generateHeadline(HeadlineParsedText *prs);

/*
* Common useful things for tsearch subsystem
Index: src/include/tsearch/ts_rank.h
===================================================================
RCS file: src/include/tsearch/ts_rank.h
diff -N src/include/tsearch/ts_rank.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/include/tsearch/ts_rank.h 30 May 2008 17:44:01 -0000
@@ -0,0 +1,36 @@
+/*
+ * ts_rank.h
+ *    Cover-search types and routines implemented in
+ *    src/backend/utils/adt/tsrank.c and exported so that the headline
+ *    code can enumerate covers of a query as well.
+ */
+#ifndef _PG_TS_RANK_H_
+#define _PG_TS_RANK_H_
+
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_cache.h"
+
+/*
+ * One entry of the document representation returned by get_docrep();
+ * entries are ordered by pos.
+ */
+typedef struct
+{
+    QueryItem **item;
+    int16       nitem;
+    uint8       wclass;
+    int32       pos;
+} DocRepresentation;
+
+/*
+ * A query together with one flag per query item (query->size entries),
+ * indexed by the item's offset within the query.
+ */
+typedef struct
+{
+    TSQuery     query;
+    bool       *operandexist;
+} QueryRepresentation;
+
+/*
+ * Search state of Cover().  Zero it before the first call; after a
+ * successful call p and q hold the start and end position of the cover
+ * that was found.
+ */
+typedef struct
+{
+    int         pos;
+    int         p;
+    int         q;
+    DocRepresentation *begin;
+    DocRepresentation *end;
+} Extention;
+
+extern bool Cover(DocRepresentation *doc, int len,
+                  QueryRepresentation *qr, Extention *ext);
+
+extern DocRepresentation *get_docrep(TSVector txt, QueryRepresentation *qr,
+                                     int *doclen);
+
+#endif   /* _PG_TS_RANK_H_ */
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.15
diff -u -r1.15 ts_utils.h
--- src/include/tsearch/ts_utils.h 16 May 2008 16:31:02 -0000 1.15
+++ src/include/tsearch/ts_utils.h 30 May 2008 23:18:08 -0000
@@ -13,7 +13,7 @@
#define _PG_TS_UTILS_H_

#include "tsearch/ts_type.h"
-#include "tsearch/ts_public.h"
+#include "tsearch/ts_rank.h"
#include "nodes/pg_list.h"

/*
@@ -90,16 +90,6 @@

extern void parsetext(Oid cfgId, ParsedText *prs, char *buf, int4 buflen);

-/*
- * headline framework, flow in common to generate:
- * 1 parse text with hlparsetext
- * 2 parser-specific function to find part
- * 3 generateHeadline to generate result text
- */
-
-extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
- char *buf, int4 buflen);
-extern text *generateHeadline(HeadlineParsedText *prs);

/*
* Common check function for tsvector @@ tsquery
I have attached a new patch against the current CVS head. It produces
a headline for a document and a given query: it identifies fragments
of the text that contain the query and displays them.

DESCRIPTION

HeadlineParsedText contains an array of the actual words, but no
information about the norms. We need an indexed position vector for each
norm, which is what tsvector provides, so that we can quickly evaluate a
number of possible fragments.

So this patch changes HeadlineParsedText to contain the norms
(ParsedText). This field is updated while parsing in hlparsetext. The
position information of the norms corresponds to the position of words
in HeadlineParsedText (not to the norms positions as is the case in
tsvector). This works correctly with the current parser. If you think
there may be issues with other parsers please let me know.

This approach does not change any other interface and fits nicely with
the overall framework.

The norms are converted into tsvector and a number of covers are
generated. The best covers are then chosen to be in the headline. The
covers are separated using a hardcoded coversep. Let me know if you want
to expose this as an option.

Covers that overlap with already chosen covers are excluded.

Some options, such as ShortWord and MinWords, are not honored right
now. MaxWords is used as maxcoversize. Let me know if you would like to
see other options for fragment generation as well.

Let me know any more changes you would like to see.

-Sushant.

On Tue, 2008-05-27 at 13:30 +0400, Teodor Sigaev wrote:
> Hi!
>
> > 1. Why is hlparsetext used to parse the document rather than the
> > parsetext function? Since words to be included in the headline will be
> > marked afterwards, it seems more reasonable to just use the parsetext
> > function.
> > The main difference I see is the use of hlfinditem and marking whether
> > some word is repeated.
> hlparsetext preserves any kind of lexeme - not indexed, spaces etc. parsetext
> doesn't.
> hlparsetext preserves original form of lexemes. parsetext doesn't.
>
> >
> > The reason this is important is that hlparsetext does not seem to be
> > storing word positions which parsetext does. The word positions are
> > important for generating headline with fragments.
> Not needed - hlparsetext preserves the whole text, so the position is simply
> the index in the array.
>
> >
> > 2.
> >> I would prefer the signature ts_headline( [regconfig,] text, tsquery
> >> [,text] )and function should accept 'NumFragments=>N' for default
> >> parser. Another parsers may use another options.
> >
> > Does this mean we want a unified function ts_headline and we trigger the
> > fragments if NumFragments is specified?
>
> Trigger should be inside parser-specific function (pg_ts_parser.prsheadline).
> Another parsers might not recognize that option.
>
> > It seems that introducing a new
> > function which can take configuration OID, or name is complex as there
> > are so many functions handling these issues in wparser.c.
> No, of course - ts_headline takes care about finding configuration and calling
> correct parser.
>
> >
> > If this is true then we need to just add marking of headline words in
> > prsd_headline. Otherwise we will need another prsd_headline_with_covers
> > function.
> Yeah, pg_ts_parser.prsheadline should mark the lexemes too. It can even change
> the array in HeadlineParsedText.
>
> >
> > 3. In many cases people may already have TSVector for a given document
> > (for search operation). Would it be faster to pass TSVector to headline
> > function when compared to computing TSVector each time? If that is the
> > case then should we have an option to pass TSVector to headline
> > function?
> As I mentioned above, tsvector doesn't contain the whole information about the text.
>

PostgreSQL

Saturday, May 31, 2008

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

No comments:

Blog Archive