Thursday, July 17, 2008

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.15
diff -c -r1.15 wparser_def.c
*** src/backend/tsearch/wparser_def.c 17 Jun 2008 16:09:06 -0000 1.15
--- src/backend/tsearch/wparser_def.c 18 Jul 2008 00:07:22 -0000
***************
*** 1684,1701 ****
return false;
}

! Datum
! prsd_headline(PG_FUNCTION_ARGS)
{
! HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
! List *prsoptions = (List *) PG_GETARG_POINTER(1);
! TSQuery query = PG_GETARG_TSQUERY(2);

! /* from opt + start and and tag */
! int min_words = 15;
! int max_words = 35;
! int shortword = 3;

int p = 0,
q = 0;
int bestb = -1,
--- 1684,1941 ----
return false;
}

! static void /* Mark the words in [startpos, endpos] (both inclusive) as part of
! * the headline: query items are flagged as selected, tokens matched by
! * HLIDIGNORE (or XMLHLIDIGNORE when highlight is non-zero) are flagged
! * for replacement, and the word just before the fragment is turned into
! * the "... " separator.
! */
! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
! int i;
! char *coversep = "... "; /* text stored in the word that precedes the fragment */
! int seplen = strlen(coversep); /* separator length, NUL terminator not included */

! for (i = startpos; i <= endpos; i++)
! {
! if (prs->words[i].item)
! prs->words[i].selected = 1;
! if (highlight == 0)
! {
! if (HLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
! else
! {
! if (XMLHLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
!
! prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; /* repeated entries are left out */
! }
! /* add cover separators if needed: the word immediately before the
! * fragment is overwritten with coversep. Its buffer is resized to exactly
! * seplen bytes and no NUL terminator is stored; len carries the size.
! * Nothing is added when the fragment starts at word 0 or the range is
! * empty.
! * NOTE(review): the previous contents of words[startpos-1] are lost, even
! * if that word was marked by an earlier fragment -- confirm this is
! * intended.
! */
! if (startpos > 0 && startpos <= endpos)
! {
!
! prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * seplen);
! prs->words[startpos-1].in = 1;
! prs->words[startpos-1].len = seplen;
! memcpy(prs->words[startpos-1].word, coversep, seplen);
! }
! }
!
! typedef struct /* one candidate headline fragment, as collected by mark_hl_fragments */
! {
! int4 startpos; /* index in prs->words of the first word of the fragment */
! int4 endpos; /* index in prs->words of the last word of the fragment (inclusive) */
! int4 poslen; /* number of non-repeated query items counted in the fragment */
! int4 curlen; /* number of tokens counted as words (NONWORDTOKEN is false) */
! int2 in; /* set to 1 once the fragment has been chosen for the headline */
! int2 excluded; /* set to 1 when the fragment overlaps a chosen fragment */
! } CoverPos;
!
! static void
! get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
! int *curlen, int *poslen, int max_words)
! {
! int i;
! /* Objective: Generate a fragment of words between startpos and endpos
! * such that it has at most max_words and both ends are query words.
! * If the startpos and endpos are the endpoints of the cover and the
! * cover has fewer words than max_words, then this function should
! * just return the cover.
! *
! * startpos, endpos: in/out. On entry the inclusive range of prs->words
! * to search; on return the bounds of the fragment.
! * curlen: out. Number of tokens counted as words (NONWORDTOKEN false).
! * poslen: out. Number of non-repeated query items seen by the forward
! * scan.
! * max_words: limit applied to curlen during the forward scan.
! */
! /* first move startpos to an item; if the range contains no non-repeated
! * item, startpos ends up at endpos */
! for(i = *startpos; i <= *endpos; i++)
! {
! *startpos = i;
! if (prs->words[i].item && !prs->words[i].repeated)
! break;
! }
! /* cut endpos to have only max_words: count words and query items forward
! * from startpos until max_words words were counted or endpos is passed */
! *curlen = 0;
! *poslen = 0;
! for(i = *startpos; i <= *endpos && *curlen < max_words; i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! *curlen += 1;
! if (prs->words[i].item && !prs->words[i].repeated)
! *poslen += 1;
! }
! /* if the cover was cut then move back endpos to a query item, dropping
! * the words that are walked over from curlen.
! * NOTE(review): i is one past the last word counted above, yet the walk
! * back starts at words[i] itself, so curlen/poslen may be off by one
! * relative to the returned range -- confirm.
! */
! if (*endpos > i)
! {
! *endpos = i;
! for(i = *endpos; i >= *startpos; i --)
! {
! *endpos = i;
! if (prs->words[i].item && !prs->words[i].repeated)
! break;
! if (!NONWORDTOKEN(prs->words[i].type))
! *curlen -= 1;
! }
! }
! }
!
! static void /* Build a headline out of at most max_fragments fragments:
! * collect candidate fragments from every cover found by hlCover, then
! * repeatedly pick the best candidate, stretch it towards max_words, mark
! * it with mark_fragment and exclude the candidates that overlap it.
! * When nothing could be marked, the words from the start of the document
! * are marked instead, until min_words word tokens have been counted.
! *
! * shortword: tokens of this length or shorter are trimmed from the
! * stretched ends of a fragment.
! */
! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
! int shortword, int min_words,
! int max_words, int max_fragments)
! {
! int4 poslen, curlen, i, f, num_f = 0; /* num_f: number of fragments marked so far */
! int4 stretch, maxstretch, posmarker;
!
! int4 startpos = 0,
! endpos = 0,
! p = 0,
! q = 0;
!
! int4 numcovers = 0,
! maxcovers = 32; /* initial capacity of covers[]; doubled when full */
!
! int4 minI, minwords, maxitems;
! CoverPos *covers;
!
! covers = palloc(maxcovers * sizeof(CoverPos));
!
! /* get all covers; p and q are passed by address to hlCover and then used
! * as the inclusive bounds of the cover */
! while (hlCover(prs, query, &p, &q))
! {
! startpos = p;
! endpos = q;
!
! /* Break the cover into smaller fragments such that each fragment
! * has at most max_words. Also ensure that each end of the fragment
! * is a query word. This will allow us to stretch the fragment in
! * either direction
! */
!
! while (startpos <= endpos)
! {
! get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
! if (numcovers >= maxcovers)
! {
! maxcovers *= 2;
! covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
! }
! covers[numcovers].startpos = startpos;
! covers[numcovers].endpos = endpos;
! covers[numcovers].curlen = curlen;
! covers[numcovers].poslen = poslen;
! covers[numcovers].in = 0;
! covers[numcovers].excluded = 0;
! numcovers ++;
! startpos = endpos + 1; /* continue right after the fragment just stored */
! endpos = q; /* ... up to the end of the current cover */
! }
! /* move p to generate the next cover */
! p++;
! }

+ /* choose best covers, one per round, for at most max_fragments rounds */
+ for (f = 0; f < max_fragments; f++)
+ {
+ maxitems = 0;
+ minwords = 0x7fffffff;
+ minI = -1;
+ /* Choose the cover that contains max items.
+ * In case of tie choose the one with smaller
+ * number of words.
+ */
+ for (i = 0; i < numcovers; i ++)
+ {
+ if (!covers[i].in && !covers[i].excluded &&
+ (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
+ && minwords > covers[i].curlen)))
+ {
+ maxitems = covers[i].poslen;
+ minwords = covers[i].curlen;
+ minI = i;
+ }
+ }
+ /* if a cover was found mark it */
+ if (minI >= 0)
+ {
+ covers[minI].in = 1;
+ /* adjust the size of cover */
+ startpos = covers[minI].startpos;
+ endpos = covers[minI].endpos;
+ curlen = covers[minI].curlen;
+ /* stretch the cover if cover size is lower than max_words */
+ if (curlen < max_words)
+ {
+ /* divide the stretch on both sides of cover */
+ maxstretch = (max_words - curlen)/2;
+ /* first stretch the startpos
+ * stop stretching if
+ * 1. we hit the beginning of document
+ * 2. exceed maxstretch
+ * 3. we hit an already marked fragment
+ */
+ stretch = 0;
+ posmarker = startpos;
+ for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ {
+ curlen ++;
+ stretch ++;
+ }
+ posmarker = i;
+ }
+ /* cut back startpos till we find a non short token: tokens that match
+ * NOENDTOKEN or whose length is <= shortword are dropped from the
+ * stretched left end */
+ for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen --;
+ }
+ startpos = i;
+ /* now stretch the endpos as much as possible, i.e. until curlen
+ * reaches max_words, the document ends or a marked word is hit */
+ posmarker = endpos;
+ for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen ++;
+ posmarker = i;
+ }
+ /* cut back endpos till we find a non-short token */
+ for ( i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen --;
+ }
+ endpos = i;
+ }
+ covers[minI].startpos = startpos;
+ covers[minI].endpos = endpos;
+ covers[minI].curlen = curlen;
+ /* Mark the chosen fragments (covers) */
+ mark_fragment(prs, highlight, startpos, endpos);
+ num_f ++;
+ /* exclude overlapping covers: a candidate is dropped when its start or
+ * its end lies inside the (stretched) chosen fragment.
+ * NOTE(review): a candidate that starts before and ends after the
+ * chosen fragment passes this test -- confirm whether it should be
+ * excluded as well.
+ */
+ for (i = 0; i < numcovers; i ++)
+ {
+ if (i != minI && ( (covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
+ covers[i].excluded = 1;
+ }
+ }
+ else
+ break;
+ }
+
+ /* show at least min_words words if we have not marked anything.
+ * NOTE(review): when prs->curwords is 0 the loop below does not run and
+ * mark_fragment is still called for the range [0, 0] -- confirm that
+ * words[0] is valid in that case.
+ */
+ if (num_f <= 0)
+ {
+ startpos = endpos = curlen = 0;
+ for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ endpos = i;
+ }
+ mark_fragment(prs, highlight, startpos, endpos);
+ }
+ pfree(covers);
+ }
+ static void
+ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+ int shortword, int min_words, int max_words)
+ {
int p = 0,
q = 0;
int bestb = -1,
***************
*** 1707,1762 ****
curlen;

int i;
- int highlight = 0;
- ListCell *l;
-
- /* config */
- prs->startsel = NULL;
- prs->stopsel = NULL;
- foreach(l, prsoptions)
- {
- DefElem *defel = (DefElem *) lfirst(l);
- char *val = defGetString(defel);
-
- if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- max_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- min_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- shortword = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- prs->startsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- prs->stopsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("unrecognized headline parameter: \"%s\"",
- defel->defname)));
- }

if (highlight == 0)
{
- if (min_words >= max_words)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("ShortWord should be >= 0")));
-
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
--- 1947,1955 ----
***************
*** 1877,1882 ****
--- 2070,2152 ----
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}

+ }
+
+ Datum
+ prsd_headline(PG_FUNCTION_ARGS)
+ {
+ HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ List *prsoptions = (List *) PG_GETARG_POINTER(1);
+ TSQuery query = PG_GETARG_TSQUERY(2);
+
+ /* from opt + start and and tag */
+ int min_words = 15;
+ int max_words = 35;
+ int shortword = 3;
+ int max_fragments = 0;
+ int highlight = 0;
+ ListCell *l;
+
+ /* config */
+ prs->startsel = NULL;
+ prs->stopsel = NULL;
+ foreach(l, prsoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+ char *val = defGetString(defel);
+
+ if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ max_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ min_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ shortword = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+ max_fragments = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ prs->startsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ prs->stopsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ highlight = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized headline parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (highlight == 0)
+ {
+ if (min_words >= max_words)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be less than MaxWords")));
+ if (min_words <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be positive")));
+ if (shortword < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ShortWord should be >= 0")));
+ if (max_fragments < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MaxFragments should be >= 0")));
+ }
+
+ if (max_fragments == 0)
+ /* call the default headline generator */
+ mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ else
+ mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
***************
*** 1886,1888 ****
--- 2156,2159 ----

PG_RETURN_POINTER(prs);
}
+
-- does it work

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy
query and optionally return them in some order. Most common case: Find
documents containing all query terms and return them in order of their
similarity to the query.', to_tsquery('english', 'documents'),
'MaxFragments=1');

ts_headline
----------------------------------------------------------------------
... purpose of FTS is to find <b>documents</b>, which satisfy query and
optionally return them in some order. Most common case: Find <b>documents</b>
containing all query terms and return them in order of their similarity
(1 row)

-- does it respect MinWords

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'nosuchterm'), 'MaxFragments=1,MinWords=5');
ts_headline
-----------------------
The purpose of FTS is


-- does it respect MaxWords

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=8');
ts_headline
----------------------------------------------------------------------
... find <b>documents</b>, which satisfy query and optionally return

-- does it exclude ShortWord in the end ( "in" is excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
---------------------------------------------------------------------------
... find <b>documents</b>, which satisfy query and optionally return them


-- does it exclude ShortWord in the front ( "The" is excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'document'), 'MaxFragments=1,MinWords=5,MaxWords=13');
ts_headline
-------------------------------------------------------------------------------------------
... purpose of FTS is to find <b>documents</b>, which satisfy query and optionally return
(1 row)


-- when multiple words are used, the cover is shown in middle of the fragment (cover size <= MaxWords)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'optional & order'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
-------------------------------------------------------------------------------
... query and <b>optionally</b> return them in some <b>order</b>. Most common


-- does it choose the smallest cover (there are three covers between positions (7,17), (17, 22), and (22, 31). The chosen one is (17, 22))

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'order & documents'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
-------------------------------------------------------------------------------------
... some <b>order</b>. Most common case: Find <b>documents</b> containing all query
(1 row)

-- does it show multiple fragments

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'query & documents'), 'MaxFragments=2,MinWords=5,MaxWords=10');

ts_headline
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
... find <b>documents</b>, which satisfy <b>query</b> and optionally return them... common case: Find <b>documents</b> containing all <b>query</b> terms and return
(1 row)

-- does it exclude overlapping covers (even when MaxFragments = 2, the overlapping covers are excluded)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'query & order & documents'), 'MaxFragments=2,MinWords=5,MaxWords=15');
ts_headline
---------------------------------------------------------------------------------------------------------------------
... them in some <b>order</b>. Most common case: Find <b>documents</b> containing all <b>query</b> terms and return
(1 row)


-- when cover size is greater than MaxWords, does it break covers into fragments (first with MaxFragments = 1 and then with maxFragments = 2)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'purpose & similarity'), 'MaxFragments=1,MinWords=5,MaxWords=10');
ts_headline
---------------------------------------------------------------------
... <b>purpose</b> of FTS is to find documents, which satisfy query
(1 row)

testdb=# select ts_headline('The purpose of FTS is to find documents, which satisfy query and optionally return them in some order. Most common case: Find documents containing all query terms and return them in order of their similarity to the query.', to_tsquery('english', 'purpose & similarity'), 'MaxFragments=2,MinWords=5,MaxWords=10');
ts_headline
----------------------------------------------------------------------------------------------------------------------
... <b>purpose</b> of FTS is to find documents, which satisfy query... order of their <b>similarity</b> to the query
(1 row)

-- using Oleg's suggestions for testing some boundary cases

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery, 'MaxFragments=1');
ts_headline
-----------------------
... <b>3</b> <b>1</b>
(1 row)

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery, 'MaxFragments=2');
ts_headline
------------------------------------------
<b>1</b> 2 <b>3</b>... <b>3</b> <b>1</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&2'::tsquery, 'MaxFragments=2');
ts_headline
----------------------------------------
<b>1</b> <b>2</b>... <b>1</b> <b>2</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','2'::tsquery, 'MaxFragments=2');
ts_headline
--------------------------
... <b>2</b>... <b>2</b>

testdb=# select ts_headline('1 2 3 4 5 1 2 3 1','1&4'::tsquery, 'MaxFragments=2');
ts_headline
-------------------------
... <b>4</b> 5 <b>1</b>
(1 row)

Index: src/test/regress/sql/tsearch.sql
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/test/regress/sql/tsearch.sql,v
retrieving revision 1.9
diff -c -r1.9 tsearch.sql
*** src/test/regress/sql/tsearch.sql 16 May 2008 16:31:02 -0000 1.9
--- src/test/regress/sql/tsearch.sql 18 Jul 2008 00:57:14 -0000
***************
*** 208,213 ****
--- 208,253 ----
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');

+ --Check if headline fragments work
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+
+ --Check if more than one fragments are displayed
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+
+ --Fragments when there all query words are not in the document
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+
+
--Rewrite sub system

CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
Index: src/test/regress/expected/tsearch.out
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/test/regress/expected/tsearch.out,v
retrieving revision 1.14
diff -c -r1.14 tsearch.out
*** src/test/regress/expected/tsearch.out 16 May 2008 16:31:02 -0000 1.14
--- src/test/regress/expected/tsearch.out 18 Jul 2008 00:58:28 -0000
***************
*** 632,637 ****
--- 632,705 ----
</html>
(1 row)

+ --Check if headline fragments work
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+ ts_headline
+ ------------------------------------
+ ... after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted <b>Ocean</b>.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop
+ (1 row)
+
+ --Check if more than one fragments are displayed
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+ ts_headline
+ ---------------------------------------------
+ ... after day, day after day,
+ We <b>stuck</b>, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where... drop to drink.
+ S. T. <b>Coleridge</b>
+ (1 row)
+
+ --Fragments when there all query words are not in the document
+ SELECT ts_headline('english', '
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop to drink.
+ S. T. Coleridge (1772-1834)
+ ', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+ ts_headline
+ ------------------------------------
+
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as
+ (1 row)
+
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none
Fixed some off-by-one errors pointed out by Oleg and errors in excluding
overlapping fragments.

Also adding test queries and updating regression tests.

Let me know of any other changes that are needed.

-Sushant.

On Thu, 2008-07-17 at 03:28 +0400, Oleg Bartunov wrote:
> On Wed, 16 Jul 2008, Sushant Sinha wrote:
>
> > I will add test queries and their results for the corner cases in a
> > separate file. I guess the only thing I am confused about is what should
> > be the behavior of headline generation when Query items have words of
> > size less than ShortWord. I guess the answer is to ignore ShortWord
> > parameter but let me know if the answer is any different.
> >
>
> ShortWord is about the headline text; it doesn't affect words in the query,
> so you can't discard them from the query.
>
> > -Sushant.
> >
> > On Thu, 2008-07-17 at 02:53 +0400, Oleg Bartunov wrote:
> >> Sushant,
> >>
> >> first, please provide simple test queries that demonstrate correct behavior
> >> in the corner cases. This will help reviewers test your patch and
> >> help you make sure your new version is ok. For example:
> >>
> >> =# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery);
> >> ts_headline
> >> ------------------------------------------------------
> >> <b>1</b> 2 <b>3</b> 4 5 <b>1</b> 2 <b>3</b> <b>1</b>
> >>
> >> This select breaks your code:
> >>
> >> =# select ts_headline('1 2 3 4 5 1 2 3 1','1&3'::tsquery,'maxfragments=2');
> >> ts_headline
> >> --------------
> >> ... 2 ...
> >>
> >> and so on ....
> >>
> >>
> >> Oleg
> >> On Tue, 15 Jul 2008, Sushant Sinha wrote:
> >>
> >>> Attached a new patch that:
> >>>
> >>> 1. fixes previous bug
> >>> 2. better handles the case when cover size is greater than the MaxWords.
> >>> Basically it divides a cover greater than MaxWords into fragments of
> >>> MaxWords, resizes each such fragment so that each end of the fragment
> >>> contains a query word and then evaluates best fragments based on number of
> >>> query words in each fragment. In case of tie it picks up the smaller
> >>> fragment. This allows more query words to be shown with multiple fragments
> >>> in case a single cover is larger than the MaxWords.
> >>>
> >>> The resizing of a fragment such that each end is a query word provides room
> >>> for stretching both sides of the fragment. This (hopefully) better presents
> >>> the context in which query words appear in the document. If a cover is
> >>> smaller than MaxWords then the cover is treated as a fragment.
> >>>
> >>> Let me know if you have any more suggestions or anything is not clear.
> >>>
> >>> I have not yet added the regression tests. The regression test suite seemed
> >>> to be only ensuring that the function works. How many tests should I be
> >>> adding? Is there any other place that I need to add different test cases for
> >>> the function?
> >>>
> >>> -Sushant.
> >>>
> >>>
> >>> Nice. But it will be good to resolve following issues:
> >>>> 1) Patch contains mistakes, I didn't investigate or carefully read it. Get
> >>>> http://www.sai.msu.su/~megera/postgres/fts/apod.dump.gz and load in db.
> >>>>
> >>>> Queries
> >>>> # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1')
> >>>> from apod where to_tsvector(body) @@ plainto_tsquery('black hole');
> >>>>
> >>>> and
> >>>>
> >>>> # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1')
> >>>> from apod;
> >>>>
> >>>> crash postgresql :(
> >>>>
> >>>> 2) pls, include in your patch documentation and regression tests.
> >>>>
> >>>>
> >>>>> Another change that I was thinking:
> >>>>>
> >>>>> Right now if cover size > max_words then I just cut the trailing words.
> >>>>> Instead I was thinking that we should split the cover into more
> >>>>> fragments such that each fragment contains a few query words. Then each
> >>>>> fragment will not contain all query words but will show more occurrences
> >>>>> of query words in the headline. I would like to know what your opinion
> >>>>> on this is.
> >>>>>
> >>>>
> >>>> Agreed.
> >>>>
> >>>>
> >>>> --
> >>>> Teodor Sigaev E-mail: teodor@sigaev.ru
> >>>> WWW:
> >>>> http://www.sigaev.ru/
> >>>>
> >>>
> >>
> >> Regards,
> >> Oleg
> >> _____________________________________________________________
> >> Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
> >> Sternberg Astronomical Institute, Moscow University, Russia
> >> Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
> >> phone: +007(495)939-16-83, +007(495)939-23-83
> >
>
> Regards,
> Oleg
> _____________________________________________________________
> Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
> Sternberg Astronomical Institute, Moscow University, Russia
> Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
> phone: +007(495)939-16-83, +007(495)939-23-83

No comments: