Saturday, June 21, 2008

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.14
diff -c -r1.14 wparser_def.c
*** src/backend/tsearch/wparser_def.c 1 Jan 2008 19:45:52 -0000 1.14
--- src/backend/tsearch/wparser_def.c 21 Jun 2008 07:59:02 -0000
***************
*** 1684,1701 ****
return false;
}

! Datum
! prsd_headline(PG_FUNCTION_ARGS)
{
! HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
! List *prsoptions = (List *) PG_GETARG_POINTER(1);
! TSQuery query = PG_GETARG_TSQUERY(2);

! /* from opt + start and and tag */
! int min_words = 15;
! int max_words = 35;
! int shortword = 3;

int p = 0,
q = 0;
int bestb = -1,
--- 1684,1891 ----
return false;
}

! static void
! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
! int i;
! char *coversep = "...";
! int coverlen = strlen(coversep);

! for (i = startpos; i <= endpos; i++)
! {
! if (prs->words[i].item)
! prs->words[i].selected = 1;
! if (highlight == 0)
! {
! if (HLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
! else
! {
! if (XMLHLIDIGNORE(prs->words[i].type))
! prs->words[i].replace = 1;
! }
!
! prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
! }
! /* add cover separators if needed */
! if (startpos > 0 && strncmp(prs->words[startpos-1].word, coversep,
! prs->words[startpos-1].len) != 0)
! {
!
! prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * coverlen);
! prs->words[startpos-1].in = 1;
! prs->words[startpos-1].len = coverlen;
! memcpy(prs->words[startpos-1].word, coversep, coverlen);
! }
! if (endpos-1 < prs->curwords && strncmp(prs->words[startpos-1].word, coversep,
! prs->words[startpos-1].len) != 0)
! {
! prs->words[endpos+1].word = repalloc(prs->words[endpos+1].word, sizeof(char) * coverlen);
! prs->words[endpos+1].in = 1;
! memcpy(prs->words[endpos+1].word, coversep, coverlen);
! }
! }
!
! typedef struct
! {
! int4 startpos;
! int4 endpos;
! int2 in;
! int2 excluded;
! } CoverPos;
!
!
! static void
! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
! int shortword, int min_words,
! int max_words, int max_fragments)
! {
! int4 curlen, coverlen, i, f, num_f;
! int4 stretch, maxstretch;
!
! int4 startpos = 0,
! endpos = 0,
! p = 0,
! q = 0;
!
! int4 numcovers = 0,
! maxcovers = 32;
!
! int4 min, minI = 0;
! CoverPos *covers;
!
! covers = palloc(maxcovers * sizeof(CoverPos));
!
! /* get all covers */
! while (hlCover(prs, query, &p, &q))
! {
! if (numcovers >= maxcovers)
! {
! maxcovers *= 2;
! covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
! }
! covers[numcovers].startpos = p;
! covers[numcovers].endpos = q;
!
! covers[numcovers].in = 0;
! covers[numcovers].excluded = 0;
! numcovers ++;
! /* move p to generate the next cover */
! p++;
! }
!
! /* choose best covers */
! for (f = 0; f < max_fragments; f++)
! {
! min = 0x7fffffff;
! for (i = 0; i < numcovers; i ++)
! {
! coverlen = covers[i].endpos - covers[i].startpos + 1;
! if (!covers[i].in && !covers[i].excluded && min > coverlen)
! {
! min = coverlen;
! minI = i;
! }
! }
! if (min < 0x7fffffff)
! {
! covers[minI].in = 1;
! /* adjust the size of cover
! * if max_words >= len
! * then headline from ext.p - (max_words-len)/2 to ext.q + (maxcoverSize-len) /2
! * if maxcoverSize < len
! * then headline from ext.p to ext.p + maxcoverSize
! */
! startpos = covers[minI].startpos;
! endpos = covers[minI].endpos;
!
! for(curlen = 0, i = startpos; i < endpos && curlen < max_words; i++)
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen++;
! endpos = i;
! if (curlen < max_words)
! {
! /* divide the stretch on both sides of cover */
! maxstretch = (max_words - curlen)/2;
! /* first stretch the startpos */
! stretch = 0;
! for (i = startpos; i >= 0 && stretch < maxstretch; i--)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! {
! curlen ++;
! stretch ++;
! }
! }
! /* cut back startpos till we find a non short token */
! for ( ; i <= startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen --;
! }
! startpos = i;
! /* now stretch the endpos as much as possible*/
! for (i = endpos; i < prs->curwords && curlen < max_words; i++)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen ++;
! }
! /* cut back endpos till we find a non-short token */
! for ( ; i >= endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
! {
! if (!NONWORDTOKEN(prs->words[i].type))
! curlen --;
! }
! endpos = i;
! }
! covers[minI].startpos = startpos;
! covers[minI].endpos = endpos;
!
! /* exclude overlapping covers */
! for (i = 0; i < numcovers; i ++)
! {
! if (i != minI &&
! (covers[i].startpos >= covers[minI].startpos &&
! covers[i].startpos <= covers[minI].endpos))
! covers[i].excluded = 1;
! }
! }
! else
! break;
! }
!
! /* Mark the chosen fragments (covers) */

+ for (num_f = 0, i = 0; i < numcovers; i++)
+ {
+ if (covers[i].in && !covers[i].excluded)
+ {
+ startpos = covers[i].startpos;
+ endpos = covers[i].endpos;
+
+ mark_fragment(prs, highlight, covers[i].startpos, covers[i].endpos);
+ num_f ++;
+ }
+ }
+ /* show at least min_words we have not marked anything*/
+ if (num_f <= 0)
+ {
+ startpos = endpos = curlen = 0;
+ for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ endpos = i;
+ }
+ mark_fragment(prs, highlight, startpos, endpos);
+ }
+ pfree(covers);
+ }
+ static void
+ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+ int shortword, int min_words, int max_words)
+ {
int p = 0,
q = 0;
int bestb = -1,
***************
*** 1707,1762 ****
curlen;

int i;
- int highlight = 0;
- ListCell *l;
-
- /* config */
- prs->startsel = NULL;
- prs->stopsel = NULL;
- foreach(l, prsoptions)
- {
- DefElem *defel = (DefElem *) lfirst(l);
- char *val = defGetString(defel);
-
- if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- max_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- min_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- shortword = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- prs->startsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- prs->stopsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("unrecognized headline parameter: \"%s\"",
- defel->defname)));
- }

if (highlight == 0)
{
- if (min_words >= max_words)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("ShortWord should be >= 0")));
-
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
--- 1897,1905 ----
***************
*** 1877,1882 ****
--- 2020,2101 ----
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}

+ }
+
+ Datum
+ prsd_headline(PG_FUNCTION_ARGS)
+ {
+ HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ List *prsoptions = (List *) PG_GETARG_POINTER(1);
+ TSQuery query = PG_GETARG_TSQUERY(2);
+
+ /* from opt + start and and tag */
+ int min_words = 15;
+ int max_words = 35;
+ int shortword = 3;
+ int max_fragments = 0;
+ int highlight = 0;
+ ListCell *l;
+
+ /* config */
+ prs->startsel = NULL;
+ prs->stopsel = NULL;
+ foreach(l, prsoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+ char *val = defGetString(defel);
+
+ if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ max_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ min_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ shortword = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+ max_fragments = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ prs->startsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ prs->stopsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ highlight = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized headline parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (highlight == 0)
+ {
+ if (min_words >= max_words)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be less than MaxWords")));
+ if (min_words <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be positive")));
+ if (shortword < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ShortWord should be >= 0")));
+ if (max_fragments < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MaxFragments should be >= 0")));
+
+ if (max_fragments == 0)
+ /* call the default headline generator */
+ mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ else
+ mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+ }
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
***************
*** 1886,1888 ****
--- 2105,2108 ----

PG_RETURN_POINTER(prs);
}
+
I have an attached an updated patch with following changes:

1. Respects ShortWord and MinWords
2. Uses hlCover instead of Cover
3. Does not store norm (or lexeme) for headline marking
4. Removes ts_rank.h
5. Earlier it was counting even NONWORDTOKEN in the headline. Now it
only counts the actual words and excludes spaces etc.

I have also changed NumFragments option to MaxFragments as there may not
be enough covers to display NumFragments.

Another change that I was thinking:

Right now if cover size > max_words then I just cut the trailing words.
Instead I was thinking that we should split the cover into more
fragments such that each fragment contains a few query words. Then each
fragment will not contain all query words but will show more occurrences
of query words in the headline. I would like to know what your opinion
on this is.

-Sushant.

On Thu, 2008-06-05 at 20:21 +0400, Teodor Sigaev wrote:
> > A couple of caveats:
> >
> > 1. ts_headline testing was done with current cvs head where as
> > headline_with_fragments was done with postgres 8.3.1.
> > 2. For headline_with_fragments, TSVector for the document was obtained
> > by joining with another table.
> > Are these differences understandable?
>
> That is possible situation because ts_headline has several criterias of 'best'
> covers - length, number of words from query, good words at the begin and at the
> end of headline while your fragment's algorithm takes care only on total number
> of words in all covers. It's not very good, but it's acceptable, I think.
> Headline (and ranking too) hasn't any formal rules to define is it good or bad?
> Just a people's opinions.
>
> Next possible reason: original algorithm had a look on all covers trying to find
> the best one while your algorithm tries to find just the shortest covers to fill
> a headline.
>
> But it's very desirable to use ShortWord - it's not very comfortable for user if
> one option produces unobvious side effect with another one.
> `
>
> > If you think these caveats are the reasons or there is something I am
> > missing, then I can repeat the entire experiments with exactly the same
> > conditions.
>
> Interesting for me test is a comparing hlCover with Cover in your patch, i.e.
> develop a patch which uses hlCover instead of Cover and compare old patch with
> new one.

No comments: