Monday, June 2, 2008

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Index: src/backend/tsearch/dict.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/dict.c,v
retrieving revision 1.5
diff -u -r1.5 dict.c
--- src/backend/tsearch/dict.c 25 Mar 2008 22:42:43 -0000 1.5
+++ src/backend/tsearch/dict.c 30 May 2008 23:20:57 -0000
@@ -16,6 +16,7 @@
#include "catalog/pg_type.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_public.h"
#include "utils/builtins.h"


Index: src/backend/tsearch/to_tsany.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/to_tsany.c,v
retrieving revision 1.12
diff -u -r1.12 to_tsany.c
--- src/backend/tsearch/to_tsany.c 16 May 2008 16:31:01 -0000 1.12
+++ src/backend/tsearch/to_tsany.c 31 May 2008 08:43:27 -0000
@@ -15,6 +15,7 @@

#include "catalog/namespace.h"
#include "tsearch/ts_cache.h"
+#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/syscache.h"
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.8
diff -u -r1.8 ts_parse.c
--- src/backend/tsearch/ts_parse.c 16 May 2008 16:31:01 -0000 1.8
+++ src/backend/tsearch/ts_parse.c 2 Jun 2008 20:10:14 -0000
@@ -446,6 +446,27 @@
memcpy(prs->words[prs->curwords].word, buf, buflen);
prs->curwords++;
}
+static void
+hladdnorm(HeadlineParsedText *prs, char *lexeme, int lexemelen)
+{
+ ParsedText *prstxt = &(prs->prstxt);
+ while (prstxt->curwords >= prstxt->lenwords)
+ {
+ prstxt->lenwords *= 2;
+ prstxt->words = (ParsedWord *) repalloc((void *) prstxt->words, prstxt->lenwords * sizeof(ParsedWord));
+ }
+
+ prstxt->words[prstxt->curwords].len = lexemelen;
+ prstxt->words[prstxt->curwords].word = palloc(lexemelen * sizeof(char));
+ memcpy(prstxt->words[prstxt->curwords].word, lexeme, lexemelen);
+ /*
+ prstxt->words[prstxt->curwords].nvariant = ptr->nvariant;
+ prstxt->words[prstxt->curwords].flags = ptr->flags & TSL_PREFIX;
+ */
+ prstxt->words[prstxt->curwords].alen = 0;
+ prstxt->words[prstxt->curwords].pos.pos = prs->curwords - 1;
+ prstxt->curwords++;
+}

static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
@@ -476,6 +497,9 @@
}
else
word->item = &item->operand;
+ /* update the corresponding ParsedText */
+ hladdnorm(prs, buf, buflen);
+
}
item++;
}
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.9
diff -u -r1.9 wparser.c
--- src/backend/tsearch/wparser.c 12 May 2008 00:00:50 -0000 1.9
+++ src/backend/tsearch/wparser.c 31 May 2008 08:44:01 -0000
@@ -317,6 +317,9 @@
prs.lenwords = 32;
prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);

+ prs.prstxt.lenwords = 32;
+ prs.prstxt.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.prstxt.lenwords);
+
hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);

if (opt)
@@ -335,6 +338,11 @@
PG_FREE_IF_COPY(query, 2);
if (opt)
PG_FREE_IF_COPY(opt, 3);
+
+
+ /* prs.prstxt.words is freed by make_tsvector itself,
+ * so we don't need to free it here */
+
pfree(prs.words);
pfree(prs.startsel);
pfree(prs.stopsel);
Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.14
diff -u -r1.14 wparser_def.c
--- src/backend/tsearch/wparser_def.c 1 Jan 2008 19:45:52 -0000 1.14
+++ src/backend/tsearch/wparser_def.c 2 Jun 2008 20:47:17 -0000
@@ -1684,18 +1684,186 @@
return false;
}

-Datum
-prsd_headline(PG_FUNCTION_ARGS)
+static void
+mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
- HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
- List *prsoptions = (List *) PG_GETARG_POINTER(1);
- TSQuery query = PG_GETARG_TSQUERY(2);
+ int i;
+ char *coversep = "...";
+ int coverlen = strlen(coversep);

- /* from opt + start and end tag */
- int min_words = 15;
- int max_words = 35;
- int shortword = 3;
+ for (i = startpos; i <= endpos; i++)
+ {
+ if (prs->words[i].item)
+ prs->words[i].selected = 1;
+ if (highlight == 0)
+ {
+ if (HLIDIGNORE(prs->words[i].type))
+ prs->words[i].replace = 1;
+ }
+ else
+ {
+ if (XMLHLIDIGNORE(prs->words[i].type))
+ prs->words[i].replace = 1;
+ }
+
+ prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
+ }
+ /* add cover separators if needed */
+ if (startpos > 0 && strncmp(prs->words[startpos-1].word, coversep,
+ prs->words[startpos-1].len) != 0)
+ {
+
+ prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * coverlen);
+ prs->words[startpos-1].in = 1;
+ prs->words[startpos-1].len = coverlen;
+ memcpy(prs->words[startpos-1].word, coversep, coverlen);
+ }
+ if (endpos+1 < prs->curwords && strncmp(prs->words[endpos+1].word, coversep,
+ prs->words[endpos+1].len) != 0)
+ {
+ prs->words[endpos+1].word = repalloc(prs->words[endpos+1].word, sizeof(char) * coverlen);
+ prs->words[endpos+1].in = 1;
+ memcpy(prs->words[endpos+1].word, coversep, coverlen);
+ }
+}
+
+static void
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query,int highlight,
+ int num_fragments, int maxcoversize)
+{
+ DocRepresentation* doc;
+ Extention ext;
+ int4 coverlen, doclen;
+ int4 startpos = 0, endpos = 0;
+ QueryRepresentation qr;
+ int4 i, f, numcovers = 0, maxcovers = 32, maxstretch;
+ int4 min, minI = 0;
+ CoverPos *covers;
+ TSVector t;
+
+ if (prs->prstxt.curwords == 0)
+ {
+ /* no query words found in the document */
+ pfree(prs->prstxt.words);
+ return;
+ }
+ t = make_tsvector(&(prs->prstxt));
+ covers = palloc(maxcovers * sizeof(CoverPos));
+
+ qr.query = query;
+ qr.operandexist = (bool*) palloc0(sizeof(bool) * query->size);
+
+ /* start generating covers for the query */
+ doc = get_docrep(t, &qr, &doclen);
+ if (!doc)
+ {
+ pfree(qr.operandexist);
+ pfree(covers);
+ /* cannot do anything */
+ return;
+ }
+
+ /* get all covers */
+ MemSet(&ext, 0, sizeof(Extention));
+ while (Cover(doc, doclen, &qr, &ext))
+ {
+ if (numcovers >= maxcovers)
+ {
+ maxcovers *= 2;
+ covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
+ }
+ covers[numcovers].startpos = ext.p;
+ covers[numcovers].endpos = ext.q;

+ covers[numcovers].in = 0;
+ covers[numcovers].excluded = 0;
+ numcovers ++;
+ }
+ /* we do not need tsvector any more, free it */
+ if (t)
+ pfree(t);
+
+ /* choose best covers */
+ for (f = 0; f < num_fragments; f++)
+ {
+ /*min = 9999999; XXX - will not display headlines that exceed 9999999 */
+ min = 0x7fffffff;
+ for (i = 0; i < numcovers; i ++)
+ {
+ coverlen = covers[i].endpos - covers[i].startpos + 1;
+ if (!covers[i].in && !covers[i].excluded && min > coverlen)
+ {
+ min = coverlen;
+ minI = i;
+ }
+ }
+ if (min < 0x7fffffff)
+ {
+ covers[minI].in = 1;
+ /* adjust the size of the cover
+ * if maxcoversize >= len
+ * then headline runs from ext.p - (maxcoversize-len)/2 to ext.q + (maxcoversize-len)/2
+ * if maxcoversize < len
+ * then headline runs from ext.p to ext.p + maxcoversize
+ * (this ensures the starting lexeme is in the headline)
+ */
+ /* cut down endpos if it crosses maxWords */
+ startpos = covers[minI].startpos;
+ endpos = covers[minI].endpos;
+ coverlen = endpos - startpos + 1;
+
+ if (maxcoversize > coverlen)
+ {
+ /* stretch it to maxwords */
+ maxstretch = maxcoversize;
+
+ /* divide the stretch on both sides of cover */
+ startpos -= (maxstretch - coverlen)/2;
+ endpos += (maxstretch - coverlen)/2;
+ if (startpos < 0)
+ startpos = 0;
+ /* XXX - do we need to check whether endpos crosses the document
+ * the other function would return if the document ends or the
+ * endpos is reached.
+ * Dropping this check for time being
+ */
+ }
+ else if (maxcoversize < coverlen)
+ endpos = startpos + maxcoversize;
+ covers[minI].startpos = startpos;
+ covers[minI].endpos = endpos;
+
+ /* exclude overlapping covers */
+ for (i = 0; i < numcovers; i ++)
+ {
+ if (i != minI &&
+ (covers[i].startpos >= covers[minI].startpos &&
+ covers[i].startpos <= covers[minI].endpos))
+ covers[i].excluded = 1;
+ }
+ }
+ else
+ break;
+ }
+
+ /* Mark the chosen fragments (covers) */
+
+ for (i = 0; i < numcovers; i++)
+ {
+ if (!covers[i].in)
+ continue;
+
+ startpos = covers[i].startpos;
+ endpos = covers[i].endpos;
+
+ mark_fragment(prs, highlight, covers[i].startpos, covers[i].endpos);
+ }
+ pfree(qr.operandexist);
+ pfree(covers);
+}
+static void
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, int shortword, int min_words, int max_words)
+{
int p = 0,
q = 0;
int bestb = -1,
@@ -1707,56 +1875,9 @@
curlen;

int i;
- int highlight = 0;
- ListCell *l;
-
- /* config */
- prs->startsel = NULL;
- prs->stopsel = NULL;
- foreach(l, prsoptions)
- {
- DefElem *defel = (DefElem *) lfirst(l);
- char *val = defGetString(defel);
-
- if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- max_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- min_words = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- shortword = pg_atoi(val, sizeof(int32), 0);
- else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- prs->startsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- prs->stopsel = pstrdup(val);
- else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- highlight = (pg_strcasecmp(val, "1") == 0 ||
- pg_strcasecmp(val, "on") == 0 ||
- pg_strcasecmp(val, "true") == 0 ||
- pg_strcasecmp(val, "t") == 0 ||
- pg_strcasecmp(val, "y") == 0 ||
- pg_strcasecmp(val, "yes") == 0);
- else
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("unrecognized headline parameter: \"%s\"",
- defel->defname)));
- }

if (highlight == 0)
{
- if (min_words >= max_words)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("ShortWord should be >= 0")));
-
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
@@ -1877,6 +1998,82 @@
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}

+}
+
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+ HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ List *prsoptions = (List *) PG_GETARG_POINTER(1);
+ TSQuery query = PG_GETARG_TSQUERY(2);
+
+ /* from opt + start and end tag */
+ int min_words = 15;
+ int max_words = 35;
+ int shortword = 3;
+ int num_fragments = 0;
+ int highlight = 0;
+ ListCell *l;
+
+ /* config */
+ prs->startsel = NULL;
+ prs->stopsel = NULL;
+ foreach(l, prsoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+ char *val = defGetString(defel);
+
+ if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ max_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ min_words = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ shortword = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "NumFragments") == 0)
+ num_fragments = pg_atoi(val, sizeof(int32), 0);
+ else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ prs->startsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ prs->stopsel = pstrdup(val);
+ else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ highlight = (pg_strcasecmp(val, "1") == 0 ||
+ pg_strcasecmp(val, "on") == 0 ||
+ pg_strcasecmp(val, "true") == 0 ||
+ pg_strcasecmp(val, "t") == 0 ||
+ pg_strcasecmp(val, "y") == 0 ||
+ pg_strcasecmp(val, "yes") == 0);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized headline parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (highlight == 0)
+ {
+ if (min_words >= max_words)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be less than MaxWords")));
+ if (min_words <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("MinWords should be positive")));
+ if (shortword < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ShortWord should be >= 0")));
+ if (num_fragments < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("NumFragments should be >= 0")));
+
+ if (num_fragments == 0)
+ /* call the default headline generator */
+ mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ else
+ mark_hl_fragments(prs, query, highlight, num_fragments, max_words);
+ }
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
@@ -1886,3 +2083,4 @@

PG_RETURN_POINTER(prs);
}
+
Index: src/backend/utils/adt/tsrank.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/utils/adt/tsrank.c,v
retrieving revision 1.13
diff -u -r1.13 tsrank.c
--- src/backend/utils/adt/tsrank.c 16 May 2008 16:31:01 -0000 1.13
+++ src/backend/utils/adt/tsrank.c 2 Jun 2008 20:04:25 -0000
@@ -17,6 +17,7 @@

#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
#include "utils/array.h"
#include "miscadmin.h"

@@ -488,14 +489,6 @@
PG_RETURN_FLOAT4(res);
}

-typedef struct
-{
- QueryItem **item;
- int16 nitem;
- uint8 wclass;
- int32 pos;
-} DocRepresentation;
-
static int
compareDocR(const void *va, const void *vb)
{
@@ -507,12 +500,6 @@
return (a->pos > b->pos) ? 1 : -1;
}

-typedef struct
-{
- TSQuery query;
- bool *operandexist;
-} QueryRepresentation;
-
#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true

@@ -524,17 +511,7 @@
return QR_GET_OPERAND_EXISTS(qr, val);
}

-typedef struct
-{
- int pos;
- int p;
- int q;
- DocRepresentation *begin;
- DocRepresentation *end;
-} Extention;
-
-
-static bool
+bool
Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext)
{
DocRepresentation *ptr;
@@ -615,7 +592,7 @@
return Cover(doc, len, qr, ext);
}

-static DocRepresentation *
+DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.9
diff -u -r1.9 ts_public.h
--- src/include/tsearch/ts_public.h 16 May 2008 16:31:02 -0000 1.9
+++ src/include/tsearch/ts_public.h 31 May 2008 15:10:24 -0000
@@ -14,6 +14,7 @@
#define _PG_TS_PUBLIC_H_

#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"

/*
* Parser's framework
@@ -47,6 +48,7 @@

typedef struct
{
+ ParsedText prstxt;
HeadlineWordEntry *words;
int4 lenwords;
int4 curwords;
@@ -55,6 +57,24 @@
int2 startsellen;
int2 stopsellen;
} HeadlineParsedText;
+/*
+ * headline framework, flow in common to generate:
+ * 1 parse text with hlparsetext
+ * 2 parser-specific function to find part
+ * 3 generateHeadline to generate result text
+ */
+
+typedef struct
+{
+ int4 startpos;
+ int4 endpos;
+ int2 in;
+ int2 excluded;
+} CoverPos;
+
+extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
+ char *buf, int4 buflen);
+extern text *generateHeadline(HeadlineParsedText *prs);

/*
* Common useful things for tsearch subsystem
Index: src/include/tsearch/ts_rank.h
===================================================================
RCS file: src/include/tsearch/ts_rank.h
diff -N src/include/tsearch/ts_rank.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/include/tsearch/ts_rank.h 2 Jun 2008 20:04:25 -0000
@@ -0,0 +1,36 @@
+#ifndef __TSRANK_H__
+#define __TSRANK_H__
+
+#include "ts_type.h"
+#include "ts_cache.h"
+
+typedef struct
+{
+ QueryItem **item;
+ int16 nitem;
+ uint8 wclass;
+ int32 pos;
+} DocRepresentation;
+
+typedef struct
+{
+ TSQuery query;
+ bool *operandexist;
+} QueryRepresentation;
+
+typedef struct
+{
+ int pos;
+ int p;
+ int q;
+ DocRepresentation *begin;
+ DocRepresentation *end;
+} Extention;
+
+bool
+Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext);
+
+DocRepresentation *
+get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen);
+
+#endif /* __TSRANK_H__ */
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.15
diff -u -r1.15 ts_utils.h
--- src/include/tsearch/ts_utils.h 16 May 2008 16:31:02 -0000 1.15
+++ src/include/tsearch/ts_utils.h 30 May 2008 23:18:08 -0000
@@ -13,7 +13,7 @@
#define _PG_TS_UTILS_H_

#include "tsearch/ts_type.h"
-#include "tsearch/ts_public.h"
+#include "tsearch/ts_rank.h"
#include "nodes/pg_list.h"

/*
@@ -90,16 +90,6 @@

extern void parsetext(Oid cfgId, ParsedText *prs, char *buf, int4 buflen);

-/*
- * headline framework, flow in common to generate:
- * 1 parse text with hlparsetext
- * 2 parser-specific function to find part
- * 3 generateHeadline to generate result text
- */
-
-extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
- char *buf, int4 buflen);
-extern text *generateHeadline(HeadlineParsedText *prs);

/*
* Common check function for tsvector @@ tsquery
Efficiency: I realized that we do not need to store all norms; we only
need to store the norms that appear in the query. So I moved the addition
of norms from addHLParsedLex to hlfinditem. This should add very little
memory overhead to the existing headline generation.

If this is still not acceptable for the default headline generation, I
can push it into mark_hl_fragments instead. But I think any headline-marking
function will benefit from having the norms that correspond to the query.

Why do we need norms?

hlCover does exactly what Cover in tsrank does, which is to find a cover
that contains the query. However, hlCover has to walk through words that
do not match the query, whereas Cover operates on position indexes for
just the query words, so it should be faster.
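
To make that concrete, here is a small self-contained toy (plain C, not
PostgreSQL code; the document, query and array sizes are made up) that
builds the kind of compressed representation Cover works on - just the
positions of query-word matches - and shows how few entries a cover
search has to look at compared with walking every word of the document:

    #include <stdio.h>
    #include <string.h>

    /* toy document and query, purely illustrative */
    static const char *doc[] = {
        "the", "quick", "brown", "fox", "jumps", "over", "the",
        "lazy", "dog", "while", "the", "brown", "cat", "sleeps"
    };
    static const char *query[] = { "brown", "cat" };

    int
    main(void)
    {
        int ndoc = sizeof(doc) / sizeof(doc[0]);
        int nquery = sizeof(query) / sizeof(query[0]);
        int positions[32];      /* positions of query-word matches only */
        int npos = 0;
        int i, j;

        /* keep only the positions where some query word occurs;
         * this is the tsvector-like view that Cover scans, while
         * hlCover has to inspect all ndoc words */
        for (i = 0; i < ndoc; i++)
            for (j = 0; j < nquery; j++)
                if (strcmp(doc[i], query[j]) == 0)
                    positions[npos++] = i;

        printf("words hlCover has to inspect: %d\n", ndoc);
        printf("entries Cover has to inspect: %d (", npos);
        for (i = 0; i < npos; i++)
            printf("%s%d", i ? ", " : "", positions[i]);
        printf(")\n");
        return 0;
    }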

The main reason I want this to be fast is that I generate all covers for
a given query, then choose the covers with the smallest length, since
they best explain the relation of the query to the document, and finally
stretch those covers to the specified size.
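
As a sketch of that selection strategy, here is a standalone toy that
mirrors the loop in mark_hl_fragments above without reproducing it
exactly (the cover positions, the ToyCover struct and the parameter
values are all made up for illustration):

    #include <stdio.h>

    typedef struct
    {
        int startpos;
        int endpos;
        int in;         /* chosen for the headline */
        int excluded;   /* overlaps an already chosen cover */
    } ToyCover;

    int
    main(void)
    {
        ToyCover covers[] = {
            { 5, 40, 0, 0 }, { 12, 14, 0, 0 }, { 60, 75, 0, 0 }, { 13, 20, 0, 0 }
        };
        int ncovers = sizeof(covers) / sizeof(covers[0]);
        int num_fragments = 2;
        int max_cover_size = 10;
        int f, i;

        for (f = 0; f < num_fragments; f++)
        {
            int min = 0x7fffffff, mini = -1;

            /* pick the shortest cover that is still available */
            for (i = 0; i < ncovers; i++)
            {
                int len = covers[i].endpos - covers[i].startpos + 1;

                if (!covers[i].in && !covers[i].excluded && len < min)
                {
                    min = len;
                    mini = i;
                }
            }
            if (mini < 0)
                break;          /* no usable cover left */
            covers[mini].in = 1;

            /* stretch a short cover symmetrically, trim a long one */
            if (min < max_cover_size)
            {
                covers[mini].startpos -= (max_cover_size - min) / 2;
                covers[mini].endpos += (max_cover_size - min) / 2;
                if (covers[mini].startpos < 0)
                    covers[mini].startpos = 0;
            }
            else if (min > max_cover_size)
                covers[mini].endpos = covers[mini].startpos + max_cover_size - 1;

            /* exclude covers that now overlap the chosen one */
            for (i = 0; i < ncovers; i++)
                if (i != mini && !covers[i].in &&
                    covers[i].startpos >= covers[mini].startpos &&
                    covers[i].startpos <= covers[mini].endpos)
                    covers[i].excluded = 1;
        }

        for (i = 0; i < ncovers; i++)
            if (covers[i].in)
                printf("fragment: words %d .. %d\n",
                       covers[i].startpos, covers[i].endpos);
        return 0;
    }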

In my understanding, the current headline generation tries to find the
biggest cover to display in the headline. I personally think that such a
cover does not explain the context of a query within a document. We may
differ on this, and that is why we may need both options.

Let me know what you think of this patch, and I will update it to
respect the other options like MinWords and ShortWord.

NumFragments < 2:
I wanted people to use the new headline marker if they specify
NumFragments >= 1. If they do not specify NumFragments, or set it to 0,
the default marker is used. This makes it a somewhat tricky parameter, so
please share any ideas on how the new marker should be triggered.

On another note, I found that make_tsvector crashes if it receives a
ParsedText with curwords = 0. Specifically, uniqueWORD returns curwords
as 1 even when it is given 0 words. I am not sure whether this is the
desired behavior.
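
For what it's worth, that failure mode looks like the usual pattern of a
dedup loop that starts its count at 1. A standalone illustration of the
guard that avoids it (toy code only - this is not the real uniqueWORD
from to_tsany.c, just the general shape of the problem):

    #include <stdio.h>

    /* Deduplicate a sorted array in place and return the new length.
     * Starting the count at 1 is only valid when n > 0; without the
     * guard, a caller that passes n == 0 gets 1 back, which is the
     * kind of mismatch described above for uniqueWORD/make_tsvector. */
    static int
    unique_sorted(int *a, int n)
    {
        int res = 1, i;

        if (n == 0)
            return 0;
        for (i = 1; i < n; i++)
            if (a[i] != a[res - 1])
                a[res++] = a[i];
        return res;
    }

    int
    main(void)
    {
        int v[] = { 1, 1, 2, 3, 3, 3 };

        printf("unique of 6 items: %d\n", unique_sorted(v, 6));
        printf("unique of 0 items: %d\n", unique_sorted(v, 0));
        return 0;
    }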

-Sushant.


On Mon, 2008-06-02 at 18:10 +0400, Teodor Sigaev wrote:
> > I have attached a new patch against the current CVS head. This
> > produces a headline in a document for a given query. Basically it
> > identifies fragments of text that contain the query and displays them.
> The new variant is much better, but...
>
> > HeadlineParsedText contains an array of the actual words but no
> > information about the norms. We need an indexed position vector for
> > each norm so that we can quickly evaluate a number of possible
> > fragments - something that tsvector provides.
>
> Why do you need to store norms? The single purpose of norms is identifying words
> from the query - but that is already done by hlfinditem, which sets
> HeadlineWordEntry->item to the corresponding QueryOperand in the tsquery.
> Look, the headline function is rather expensive, and your patch adds a lot of
> extra work - at least in memory usage. And if the user calls it with
> NumFragments=0, that work is unneeded.
>
> > This approach does not change any other interface and fits nicely with
> > the overall framework.
> Yeah, it's a really big step forward. Thank you. You are very close to
> committing, except: did you find the hlCover() function, which produces a cover
> from the original HeadlineParsedText representation? Is there any reason not to use it?
>
> >
> > The norms are converted into a tsvector and a number of covers are
> > generated. The best covers are then chosen to be in the headline. The
> > covers are separated using a hardcoded coversep. Let me know if you want
> > to expose this as an option.
>
>
> >
> > Covers that overlap with already chosen covers are excluded.
> >
> > Some options like ShortWord and MinWords are not taken care of right
> > now. MaxWords is used as maxcoversize. Let me know if you would like to
> > see other options for fragment generation as well.
> ShortWord, MinWords and MaxWords should keep their meaning, but for each
> fragment, not for the whole headline.
>
>
> >
> > Let me know any more changes you would like to see.
>
> if (num_fragments == 0)
> /* call the default headline generator */
> mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
> else
> mark_hl_fragments(prs, query, highlight, num_fragments, max_words);
>
>
> Suppose, num_fragments < 2?
>
