Saturday, August 2, 2008

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.10
diff -c -r1.10 ts_public.h
*** src/include/tsearch/ts_public.h 18 Jun 2008 18:42:54 -0000 1.10
--- src/include/tsearch/ts_public.h 2 Aug 2008 02:40:27 -0000
***************
*** 52,59 ****
--- 52,61 ----
int4 curwords;
char *startsel;
char *stopsel;
+ char *fragdelim;
int2 startsellen;
int2 stopsellen;
+ int2 fragdelimlen;
} HeadlineParsedText;

/*
Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.15
diff -c -r1.15 wparser_def.c
*** src/backend/tsearch/wparser_def.c 17 Jun 2008 16:09:06 -0000 1.15
--- src/backend/tsearch/wparser_def.c 2 Aug 2008 15:25:46 -0000
***************
*** 1684,1701 ****
return false;
}

! Datum
! prsd_headline(PG_FUNCTION_ARGS)
{
! HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
! List *prsoptions = (List *) PG_GETARG_POINTER(1);
! TSQuery query = PG_GETARG_TSQUERY(2);

! /* from opt + start and and tag */
! int min_words = 15;
! int max_words = 35;
! int shortword = 3;

int p = 0,
q = 0;
int bestb = -1,
--- 1684,1930 ----
return false;
}

! static void
! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
! int i;

! for (i = startpos; i <= endpos; i++)
! {
! if (prs->words[i].item)
! prs->words[i].selected = 1;
! if (highlight == 0)
! {
! if (HLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
! else
! {
! if (XMLHLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
!
! prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
! }
! }
!
! typedef struct
! {
! int4 startpos;
! int4 endpos;
! int4 poslen;
! int4 curlen;
! int2 in;
! int2 excluded;
! } CoverPos;
!
! static void
! get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
! int *curlen, int *poslen, int max_words)
! {
! int i;
! /* Objective: Generate a fragment of words between startpos and endpos
! * such that it has at most max_words and both ends has query words.
! * If the startpos and endpos are the endpoints of the cover and the
! * cover has fewer words than max_words, then this function should
! * just return the cover
! */
! /* first move startpos to an item */
! for(i = *startpos; i <= *endpos; i++)
! {
! *startpos = i;
! if (prs->words[i].item && !prs->words[i].repeated)
! break;
! }
! /* cut endpos to have only max_words */
! *curlen = 0;
! *poslen = 0;
! for(i = *startpos; i <= *endpos && *curlen < max_words; i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! *curlen += 1;
! if (prs->words[i].item && !prs->words[i].repeated)
! *poslen += 1;
! }
! /* if the cover was cut then move back endpos to a query item */
! if (*endpos > i)
! {
! *endpos = i;
! for(i = *endpos; i >= *startpos; i --)
! {
! *endpos = i;
! if (prs->words[i].item && !prs->words[i].repeated)
! break;
! if (!NONWORDTOKEN(prs->words[i].type))
! *curlen -= 1;
! }
! }
! }
!
! static void
! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
! int shortword, int min_words,
! int max_words, int max_fragments)
! {
! int4 poslen, curlen, i, f, num_f = 0;
! int4 stretch, maxstretch, posmarker;
!
! int4 startpos = 0,
! endpos = 0,
! p = 0,
! q = 0;
!
! int4 numcovers = 0,
! maxcovers = 32;
!
! int4 minI, minwords, maxitems;
! CoverPos *covers;
!
! covers = palloc(maxcovers * sizeof(CoverPos));
!
! /* get all covers */
! while (hlCover(prs, query, &p, &q))
! {
! startpos = p;
! endpos = q;
!
! /* Break the cover into smaller fragments such that each fragment
! * has at most max_words. Also ensure that each end of the fragment
! * is a query word. This will allow us to stretch the fragment in
! * either direction
! */
!
! while (startpos <= endpos)
! {
! get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
! if (numcovers >= maxcovers)
! {
! maxcovers *= 2;
! covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
! }
! covers[numcovers].startpos = startpos;
! covers[numcovers].endpos = endpos;
! covers[numcovers].curlen = curlen;
! covers[numcovers].poslen = poslen;
! covers[numcovers].in = 0;
! covers[numcovers].excluded = 0;
! numcovers ++;
! startpos = endpos + 1;
! endpos = q;
! }
! /* move p to generate the next cover */
! p++;
! }
!
! /* choose best covers */
! for (f = 0; f < max_fragments; f++)
! {
! maxitems = 0;
! minwords = 0x7fffffff;
! minI = -1;
! /* Choose the cover that contains max items.
! * In case of tie choose the one with smaller
! * number of words.
! */
! for (i = 0; i < numcovers; i ++)
! {
! if (!covers[i].in && !covers[i].excluded &&
! (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
! && minwords > covers[i].curlen)))
! {
! maxitems = covers[i].poslen;
! minwords = covers[i].curlen;
! minI = i;
! }
! }
! /* if a cover was found mark it */
! if (minI >= 0)
! {
! covers[minI].in = 1;
! /* adjust the size of cover */
! startpos = covers[minI].startpos;
! endpos = covers[minI].endpos;
! curlen = covers[minI].curlen;
! /* stretch the cover if cover size is lower than max_words */
! if (curlen < max_words)
! {
! /* divide the stretch on both sides of cover */
! maxstretch = (max_words - curlen)/2;
! /* first stretch the startpos
! * stop stretching if
! * 1. we hit the beginning of document
! * 2. exceed maxstretch
! * 3. we hit an already marked fragment
! */
! stretch = 0;
! posmarker = startpos;
! for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! {
! curlen ++;
! stretch ++;
! }
! posmarker = i;
! }
! /* cut back startpos till we find a non short token */
! for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen --;
! }
! startpos = i;
! /* now stretch the endpos as much as possible*/
! posmarker = endpos;
! for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen ++;
! posmarker = i;
! }
! /* cut back endpos till we find a non-short token */
! for ( i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen --;
! }
! endpos = i;
! }
! covers[minI].startpos = startpos;
! covers[minI].endpos = endpos;
! covers[minI].curlen = curlen;
! /* Mark the chosen fragments (covers) */
! mark_fragment(prs, highlight, startpos, endpos);
! num_f ++;
! /* exclude overlapping covers */
! for (i = 0; i < numcovers; i ++)
! {
! if (i != minI && ( (covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
! covers[i].excluded = 1;
! }
! }
! else
! break;
! }

+ /* show at least min_words we have not marked anything*/
+ if (num_f <= 0)
+ {
+ startpos = endpos = curlen = 0;
+ for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ endpos = i;
+ }
+ mark_fragment(prs, highlight, startpos, endpos);
+ }
+ pfree(covers);
+ }
+ static void
+ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+ int shortword, int min_words, int max_words)
+ {
int p = 0,
q = 0;
int bestb = -1,
***************
*** 1707,1762 ****
curlen;

int i;
- int highlight = 0;
- ListCell *l;
-
- /* config */
- prs->startsel = NULL;
- prs->stopsel = NULL;
- foreach(l, prsoptions)
- {
- DefElem *defel = (DefElem *) lfirst(l);
- char *val = defGetString(defel);
-
- if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- max_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- min_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- shortword = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- prs->startsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- prs->stopsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("unrecognized headline parameter: \"%s\"",
- defel->defname)));
- }

if (highlight == 0)
{
- if (min_words >= max_words)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("ShortWord should be >= 0")));
-
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
--- 1936,1944 ----
***************
*** 1877,1888 ****
--- 2059,2153 ----
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}

+ }
+
+ Datum
+ prsd_headline(PG_FUNCTION_ARGS)
+ {
+ HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ List *prsoptions = (List *) PG_GETARG_POINTER(1);
+ TSQuery query = PG_GETARG_TSQUERY(2);
+
+ /* from opt + start and and tag */
+ int min_words = 15;
+ int max_words = 35;
+ int shortword = 3;
+ int max_fragments = 0;
+ int highlight = 0;
+ ListCell *l;
+
+ /* config */
+ prs->startsel = NULL;
+ prs->stopsel = NULL;
+ foreach(l, prsoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+ char *val = defGetString(defel);
+
+ if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ max_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ min_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ shortword = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+ max_fragments = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ prs->startsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ prs->stopsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
+ prs->fragdelim = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ highlight = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized headline parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (highlight == 0)
+ {
+ if (min_words >= max_words)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be less than MaxWords")));
+ if (min_words <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be positive")));
+ if (shortword < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ShortWord should be >= 0")));
+ if (max_fragments < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MaxFragments should be >= 0")));
+ }
+
+ if (max_fragments == 0)
+ /* call the default headline generator */
+ mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ else
+ mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
prs->stopsel = pstrdup("</b>");
+ if (!prs->fragdelim)
+ prs->fragdelim = pstrdup(" ... ");
prs->startsellen = strlen(prs->startsel);
prs->stopsellen = strlen(prs->stopsel);
+ prs->fragdelimlen = strlen(prs->fragdelim);

PG_RETURN_POINTER(prs);
}
+
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.8
diff -c -r1.8 ts_parse.c
*** src/backend/tsearch/ts_parse.c 16 May 2008 16:31:01 -0000 1.8
--- src/backend/tsearch/ts_parse.c 2 Aug 2008 04:22:51 -0000
***************
*** 583,590 ****
generateHeadline(HeadlineParsedText *prs)
{
text *out;
- int len = 128;
char *ptr;
HeadlineWordEntry *wrd = prs->words;

out = (text *) palloc(len);
--- 583,593 ----
generateHeadline(HeadlineParsedText *prs)
{
text *out;
char *ptr;
+ int len = 128;
+ int numfragments = 0;
+ int2 infrag = 0;
+
HeadlineWordEntry *wrd = prs->words;

out = (text *) palloc(len);
***************
*** 592,598 ****

while (wrd - prs->words < prs->curwords)
{
! while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);

--- 595,601 ----

while (wrd - prs->words < prs->curwords)
{
! while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);

***************
*** 603,608 ****
--- 606,625 ----

if (wrd->in && !wrd->repeated)
{
+ if (!infrag)
+ {
+
+ /* start of a new fragment */
+ infrag = 1;
+ numfragments ++;
+ /* add a fragment delimitor if this is after the first one */
+ if (numfragments > 1)
+ {
+ memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
+ ptr += prs->fragdelimlen;
+ }
+
+ }
if (wrd->replace)
{
*ptr = ' ';
***************
*** 625,631 ****
--- 642,652 ----
}
}
else if (!wrd->repeated)
+ {
+ if (infrag)
+ infrag = 0;
pfree(wrd->word);
+ }

wrd++;
}
Index: src/test/regress/sql/tsearch.sql
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/test/regress/sql/tsearch.sql,v
retrieving revision 1.9
diff -c -r1.9 tsearch.sql
*** src/test/regress/sql/tsearch.sql 16 May 2008 16:31:02 -0000 1.9
--- src/test/regress/sql/tsearch.sql 2 Aug 2008 15:19:39 -0000
***************
*** 208,213 ****
--- 208,265 ----
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');

+ --Check if headline fragments work
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+
+ --Check if more than one fragments are displayed
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+
+ --Fragments when there all query words are not in the document
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+
+ --FragmentDelimiter option
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
+
--Rewrite sub system

CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
Index: src/test/regress/expected/tsearch.out
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/test/regress/expected/tsearch.out,v
retrieving revision 1.14
diff -c -r1.14 tsearch.out
*** src/test/regress/expected/tsearch.out 16 May 2008 16:31:02 -0000 1.14
--- src/test/regress/expected/tsearch.out 2 Aug 2008 15:27:21 -0000
***************
*** 632,637 ****
--- 632,729 ----
</html>
(1 row)

+ --Check if headline fragments work
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+ ts_headline
+ ------------------------------------
+ after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted <b>Ocean</b>.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop
+ (1 row)
+
+ --Check if more than one fragments are displayed
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+ ts_headline
+ ----------------------------------------------
+ after day, day after day,
+ We <b>stuck</b>, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where ... drop to drink.
+ S. T. <b>Coleridge</b>
+ (1 row)
+
+ --Fragments when there all query words are not in the document
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+ ts_headline
+ ------------------------------------
+
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as
+ (1 row)
+
+ --FragmentDelimiter option
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
+ ts_headline
+ --------------------------------------------
+ after day, day after day,
+ We <b>stuck</b>, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where***drop to drink.
+ S. T. <b>Coleridge</b>
+ (1 row)
+
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none
Index: doc/src/sgml/textsearch.sgml
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/doc/src/sgml/textsearch.sgml,v
retrieving revision 1.44
diff -c -r1.44 textsearch.sgml
*** doc/src/sgml/textsearch.sgml 16 May 2008 16:31:01 -0000 1.44
--- doc/src/sgml/textsearch.sgml 2 Aug 2008 15:12:10 -0000
***************
*** 1100,1105 ****
--- 1100,1128 ----
</listitem>
<listitem>
<para>
+ <literal>MaxFragments</literal>: maximum number of text excerpts
+ or fragments that match the query words. It also triggers a
+ different headline generation function than the default one. This
+ function finds text fragments with as many query words as possible and
+ stretches those fragments around the query words. As a result
+ query words are close to the middle of each fragment and have words on
+ each side. Each fragment will be of at most MaxWords and will not
+ have words of size less than or equal to ShortWord at the start or
+ end of a fragment. If not all query words are found in the document,
+ then a single fragment of MinWords will be displayed.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>FragmentDelimiter</literal>: When more than one fragment is
+ displayed, the fragments will be separated by this delimiter. This
+ option is effective only if MaxFragments is greater than 1 and there is
+ more than one fragment to be displayed. This option has no effect on the
+ default headline generation function.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
<literal>HighlightAll</literal>: Boolean flag; if
<literal>true</literal> the whole document will be highlighted.
</para>
***************
*** 1109,1115 ****
Any unspecified options receive these defaults:

<programlisting>
! StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
</programlisting>
</para>

--- 1132,1138 ----
Any unspecified options receive these defaults:

<programlisting>
! StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
</programlisting>
</para>

-- does it work

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy
query and optionally return them in some order. Most common case: Find
documents containing all query terms and return them in order of their
similarity to the query.', to_tsquery('english', 'documents'),
'MaxFragments=1');
ts_headline
-------------------------------------------------------------------------------
purpose of FTS is to find <b>documents</b>, which satisfy
query and optionally return them in some order. Most common case: Find
<b>documents</b> containing all query terms and return them in order of their
similarity
(1 row)

-- does it respect MinWords

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'nosuchterm'), 'MaxFragments=1,MinWords=5');
ts_headline
-----------------------
The purpose of FTS is

-- does it respect MaxWords

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=8');
ts_headline
------------------------------------------------------------------
find <b>documents</b>, which satisfy query and optionally return

-- does it exclude ShortWord in the end ( "in" is excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
-----------------------------------------------------------------------
find <b>documents</b>, which satisfy query and optionally return them
(1 row)

-- does it exclude ShortWord in the front ( "The" is excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=13');
ts_headline
---------------------------------------------------------------------------------------
purpose of FTS is to find <b>documents</b>, which satisfy query and optionally return

-- when multiple words are used, the cover is shown in middle of the fragment (cover size <= MaxWords)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'optional & order'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
---------------------------------------------------------------------------
query and <b>optionally</b> return them in some <b>order</b>. Most common

-- does it choose the smallest cover (there are three covers between positions (7,17), (17, 22), and (22, 31). The chosen one is (17, 22))

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'order & documents'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
---------------------------------------------------------------------------------
some <b>order</b>. Most common case: Find <b>documents</b> containing all query


-- does it show multiple fragments

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'query & documents'), 'MaxFragments=2,MinWords=5,MaxWords=10');
ts_headline
------------------------------------------------------------------------------------------------------------------------------------------------------------------
find <b>documents</b>, which satisfy <b>query</b> and optionally return them ... common case: Find <b>documents</b> containing all <b>query</b> terms and return
(1 row)

-- does it exclude overlapping covers (even when MaxFragments = 2, the overlapping covers are excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'query & order & documents'), 'MaxFragments=2,MinWords=5,MaxWords=15');
ts_headline
-----------------------------------------------------------------------------------------------------------------
them in some <b>order</b>. Most common case: Find <b>documents</b> containing all <b>query</b> terms and return
(1 row)


-- when cover size is greater than MaxWords, does it break covers into fragments (first with MaxFragments = 1 and then with MaxFragments = 2)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'purpose & similarity'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
-----------------------------------------------------------------
<b>purpose</b> of FTS is to find documents, which satisfy query
(1 row)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'purpose & similarity'), 'MaxFragments=2,MinWords=5,MaxWords=10');
ts_headline
-------------------------------------------------------------------------------------------------------------------
<b>purpose</b> of FTS is to find documents, which satisfy query ... order of their <b>similarity</b> to the query

-- using Oleg's suggestions for testing some boundary cases

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery, 'MaxFragments=1');
ts_headline
-------------------
<b>3</b> <b>1</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery, 'MaxFragments=2');
ts_headline
-------------------------------------------
<b>1</b> 2 <b>3</b> ... <b>3</b> <b>1</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&2'::tsquery, 'MaxFragments=2');
ts_headline
-----------------------------------------
<b>1</b> <b>2</b> ... <b>1</b> <b>2</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','2'::tsquery, 'MaxFragments=2');
ts_headline
-----------------------
<b>2</b> ... <b>2</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&4'::tsquery, 'MaxFragments=2');
ts_headline
---------------------
<b>4</b> 5 <b>1</b>

-- Checking FragmentDelimiter

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','2'::tsquery, 'MaxFragments=2,FragmentDelimiter=***');
ts_headline
---------------------
<b>2</b>***<b>2</b>
(1 row)

Sorry for the delay. Here is the patch with the FragmentDelimiter option.
It requires an extra field in HeadlineParsedText and uses that field
during generateHeadline.

Implementing a notion of fragments in HeadlineParsedText and a separate
function to join them seems more complicated. So for the time being I
just emit a FragmentDelimiter whenever a new fragment (other than the
first one) starts.

The patch also contains the updated regression tests/results and a
new test for the FragmentDelimiter option. It also contains the
documentation for the new options.

I have also attached a separate file that tests different aspects of the
new headline generation function.

Let me know if anything else is needed.

-Sushant.

On Thu, 2008-07-24 at 00:28 +0400, Oleg Bartunov wrote:
> On Wed, 23 Jul 2008, Sushant Sinha wrote:
>
> > I guess it is more readable to add cover separator at the end of a fragment
> > than in the front. Let me know what you think and I can update it.
>
> FragmentsDelimiter should *separate* fragments and that says all.
> Not very difficult algorithmic problem, it's like perl's
> join(FragmentsDelimiter, @array)
>
> >
> > I think the right place for cover separator is in the structure
> > HeadlineParsedText just like startsel and stopsel. This will enable users to
> > specify their own cover separators. But this will require changes to the
> > structure as well as to the generateHeadline function. This option will not
> > also play well with the default headline generation function.
>
> As soon as we introduce FragmentsDelimiter we should make it
> configurable.
>
> >
> > The default MaxWords = 35 seems a bit high for this headline generation
> > function and 20 seems to be more reasonable. Any thoughts?
>
> I think we should not change default value because it could change
> behaviour of existing applications. I'm not sure if it'd be useful and
> possible to define default values in CREATE TEXT SEARCH PARSER
>
> >
> > -Sushant.
> >
> > On Wed, Jul 23, 2008 at 7:44 AM, Oleg Bartunov <oleg@sai.msu.su> wrote:
> >
> >> btw, is it intentional to have '....' in headline ?
> >>
> >> =# select ts_headline('1 2 3 4 5 1 2 3 1','1&4'::tsquery,'MaxFragments=1');
> >> ts_headline
> >> -------------------------
> >> ... <b>4</b> 5 <b>1</b>
> >>
> >>
> >>
> >> Oleg
> >>
> >> On Wed, 23 Jul 2008, Teodor Sigaev wrote:
> >>
> >> Let me know of any other changes that are needed.
> >>>>
> >>>
> >>> Looks like ready to commit, but documentation is needed.
> >>>
> >>>
> >>>
> >> Regards,
> >> Oleg
> >> _____________________________________________________________
> >> Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
> >> Sternberg Astronomical Institute, Moscow University, Russia
> >> Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/<http://www.sai.msu.su/%7Emegera/>
> >> phone: +007(495)939-16-83, +007(495)939-23-83
> >>
> >
>
> Regards,
> Oleg
> _____________________________________________________________
> Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
> Sternberg Astronomical Institute, Moscow University, Russia
> Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
> phone: +007(495)939-16-83, +007(495)939-23-83

No comments: