#include "postgres.h" #include "catalog/pg_type.h" #include "mb/pg_wchar.h" #include "utils/array.h" /* * MARK_SIGN is a sign of end of string and this character should * be regular character. The best candidate of this is a zero byte * which is accepted for any locale used by postgres. But it's impossible * to show it, so we will replace it to another one (MARK_SIGN_SHOW) which * can be noticed well. But we can't use it as mark because it's allowed * to be inside string. */ #define MARK_SIGN '\0' #define MARK_SIGN_SHOW '$' #define WC_BEGIN 0x01 /* should be in begining of string */ #define WC_MIDDLE 0x02 /* should be in middle of string */ #define WC_END 0x04 /* should be in end of string */ PG_MODULE_MAGIC; static text* appendStrToText( text *src, char *str, int32 len, int32 maxlen ) { int32 curlen; if (src == NULL ) { Assert( maxlen >= 0 ); src = (text*)palloc( VARHDRSZ + sizeof(char*) * maxlen ); SET_VARSIZE(src, 0 + VARHDRSZ); } curlen = VARSIZE(src) - VARHDRSZ; if (len>0) memcpy( VARDATA(src) + curlen, str, len ); SET_VARSIZE(src, curlen + len + VARHDRSZ); return src; } static text* appendMarkToText( text *src, int32 maxlen ) { char sign = MARK_SIGN; return appendStrToText( src, &sign, 1, maxlen ); } static text* setFlagOfText( char flag, int32 maxlen ) { char flagstruct[2]; Assert( maxlen > 0 ); /* * Mark text by setting first byte to MARK_SIGN to indicate * that text has flags. It's a safe for non empty string, * because first character can not be a MARK_SIGN (see * gin_extract_permuted() ) */ flagstruct[0] = MARK_SIGN; flagstruct[1] = flag; return appendStrToText(NULL, flagstruct, 2, maxlen ); } PG_FUNCTION_INFO_V1(gin_extract_permuted); Datum gin_extract_permuted(PG_FUNCTION_ARGS); Datum gin_extract_permuted(PG_FUNCTION_ARGS) { text *src = PG_GETARG_TEXT_P(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; int32 srclen = pg_mbstrlen_with_len(VARDATA(src), VARSIZE(src) - VARHDRSZ); *nentries = srclen; if ( srclen == 0 ) { /* * Empty string is encoded by alone MARK_SIGN character */ *nentries = 1; entries = (Datum*) palloc(sizeof(Datum)); entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) ); } else { text *dst; int32 i, offset=0; /* offset to current position in src in bytes */ int32 nbytes = VARSIZE(src) - VARHDRSZ; char *srcptr = VARDATA(src); /* * Permutation: hello will be permuted to hello$, ello$h, llo$he, lo$hel, o$hell. * So, number of entries is equial to number of characters (not a bytes) */ entries = (Datum*)palloc(sizeof(char*) * nbytes ); for(i=0; i 2 && *ptra == MARK_SIGN ) { flag = *(ptra+1); ptra+=2; lena-=2; if ( lenb > 2 && *ptrb == MARK_SIGN ) { /* * If they have different flags then they can not be equal, this * place works only during check of equality of keys * to search */ if ( flag != *(ptrb+1) ) return 1; ptrb+=2; lenb-=2; /* b can not be a product of gin_extract_wildcard for partial match mode */ Assert( partialMatch == false ); } } else if ( lenb > 2 && *ptrb == MARK_SIGN ) { /* b can not be a product of gin_extract_wildcard for partial match mode */ Assert( partialMatch == false ); ptrb+=2; lenb-=2; } if ( lena == 0 ) { if ( partialMatch ) cmp = 0; /* full scan for partialMatch*/ else cmp = (lenb>0) ? -1 : 0; } else { /* * We couldn't use strcmp because of MARK_SIGN */ cmp = memcmp(ptra, ptrb, Min(lena, lenb)); if ( partialMatch ) { if ( cmp == 0 ) { if ( lena > lenb ) { /* * b argument is not beginning with argument a */ cmp = 1; } else if ( flag > 0 && lenb>lena /* be safe */ ) { /* there is some flags to check */ char actualFlag; if ( ptrb[ lenb - 1 ] == MARK_SIGN ) actualFlag = WC_BEGIN; else if ( ptrb[ lena ] == MARK_SIGN ) actualFlag = WC_END; else actualFlag = WC_MIDDLE; if ( (flag & actualFlag) == 0 ) { /* * Prefix are matched but this prefix s not placed as needed. * so we should give a smoke signal to GIN that we don't want * this match, but wish to continue scan */ cmp = -1; } } } else if (cmp < 0) { cmp = 1; /* prevent continue scan */ } } else if ( (cmp == 0) && (lena != lenb) ) { cmp = (lena < lenb) ? -1 : 1; } } return cmp; } PG_FUNCTION_INFO_V1(wildcmp); Datum wildcmp(PG_FUNCTION_ARGS); Datum wildcmp(PG_FUNCTION_ARGS) { text *a = PG_GETARG_TEXT_P(0); text *b = PG_GETARG_TEXT_P(1); int32 cmp; cmp = wildcmp_internal(a, b, false); PG_FREE_IF_COPY(a,0); PG_FREE_IF_COPY(b,1); PG_RETURN_INT32( cmp ); } PG_FUNCTION_INFO_V1(wildcmp_prefix); Datum wildcmp_prefix(PG_FUNCTION_ARGS); Datum wildcmp_prefix(PG_FUNCTION_ARGS) { text *a = PG_GETARG_TEXT_P(0); text *b = PG_GETARG_TEXT_P(1); #ifdef NOT_USED StrategyNumber strategy = PG_GETARG_UINT16(2); #endif int32 cmp; cmp = wildcmp_internal(a, b, true); PG_FREE_IF_COPY(a,0); PG_FREE_IF_COPY(b,1); PG_RETURN_INT32( cmp ); } #ifdef OPTIMIZE_WILDCARD_QUERY typedef struct { Datum entry; int32 len; char flag; } OptItem; /* * Function drops most short search word to speedup * index search by preventing use word which gives * a lot of matches */ static void optimize_wildcard_search( Datum *entries, int32 *nentries ) { int32 maxlen=0; OptItem *items; int i, nitems = *nentries; char *ptr,*p; items = (OptItem*)palloc( sizeof(OptItem) * (*nentries) ); for(i=0;i 2 && *ptr == MARK_SIGN ) { items[i].len-=2; items[i].flag = *(ptr+1); } else { items[i].flag = 0; if ( items[i].len > 1 && (p=strchr(ptr, MARK_SIGN)) != NULL ) { if ( p == ptr + items[i].len -1 ) items[i].flag = WC_BEGIN; else items[i].flag = WC_BEGIN | WC_END; } } if ( items[i].len > maxlen ) maxlen = items[i].len; } *nentries=0; for(i=0;i maxlen ) { entries[ *nentries ] = items[i].entry; (*nentries)++; } } else if ( 2*items[i].len > maxlen ) { /* * use only items with biggest length */ entries[ *nentries ] = items[i].entry; (*nentries)++; } } Assert( *nentries>0 ); } #endif typedef struct { bool iswildcard; int32 len; char *ptr; } WildItem; PG_FUNCTION_INFO_V1(gin_extract_wildcard); Datum gin_extract_wildcard(PG_FUNCTION_ARGS); Datum gin_extract_wildcard(PG_FUNCTION_ARGS) { text *q = PG_GETARG_TEXT_P(0); int32 lenq = VARSIZE(q) - VARHDRSZ; int32 *nentries = (int32 *) PG_GETARG_POINTER(1); #ifdef NOT_USED StrategyNumber strategy = PG_GETARG_UINT16(2); #endif bool *partialmatch, **ptr_partialmatch = (bool**) PG_GETARG_POINTER(3); Pointer **extra = (Pointer**)PG_GETARG_POINTER(4); Datum *entries = NULL; char *qptr = VARDATA(q); int clen, splitqlen = 0, i; WildItem *items; text *entry; bool needRecheck = false; *nentries = 0; if ( lenq == 0 ) { partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool)); *nentries = 1; entries = (Datum*) palloc(sizeof(Datum)); entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) ); PG_RETURN_POINTER(entries); } partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool) * lenq); entries = (Datum*) palloc(sizeof(Datum) * lenq); items=(WildItem*) palloc0( sizeof(WildItem) * lenq ); /* * Parse expression to the list of constant parts and * wildcards */ while( qptr - VARDATA(q) < lenq ) { clen = pg_mblen(qptr); if ( clen==1 && (*qptr == '_' || *qptr == '%' ) ) { if ( splitqlen == 0 ) { items[ splitqlen ].iswildcard = true; splitqlen++; } else if ( items[ splitqlen-1 ].iswildcard == false ) { items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr; items[ splitqlen ].iswildcard = true; splitqlen++; } /* * ignore wildcard, because we don't make difference beetween * %, _ or a combination of its */ } else { if ( splitqlen == 0 || items[ splitqlen-1 ].iswildcard == true ) { items[ splitqlen ].ptr = qptr; splitqlen++; } } qptr += clen; } Assert( splitqlen >= 1 ); if ( items[ splitqlen-1 ].iswildcard == false ) items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr; if ( items[ 0 ].iswildcard == false ) { /* X... */ if ( splitqlen == 1 ) { /* X => X$, exact match */ *nentries = 1; entry = appendStrToText(NULL, items[ 0 ].ptr, items[ 0 ].len, lenq+1); entry = appendMarkToText( entry, -1 ); entries[0] = PointerGetDatum( entry ); } else if ( items[ splitqlen-1 ].iswildcard == false ) { /* X * [X1 * [] ] ] Y => Y$X* [ + X1* [] ] */ *nentries = 1; entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1); entry = appendMarkToText( entry, -1 ); entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1); partialmatch[0] = true; entries[0] = PointerGetDatum( entry ); for(i=1; i 3 /* X1 may be inside X OR Y */ ) needRecheck = true; } else { /* X * [ X1 * [] ] => X*$ [ + X1* [] ] */ entry = setFlagOfText( WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1); *nentries = 1; partialmatch[ 0 ] = true; entries[0] = PointerGetDatum( entry ); for(i=2; i 2 /* we don't remeber an order of Xn */ ) needRecheck = true; } } else { /* *... */ if ( splitqlen == 1 ) { /* any word => full scan */ *nentries = 1; entry = appendStrToText(NULL, "", 0, lenq+1); partialmatch[0] = true; entries[0] = PointerGetDatum( entry ); } else if ( items[ splitqlen-1 ].iswildcard == false ) { /* * [ X1 * [] ] X => X$* [ + X1* [] ] */ *nentries = 1; entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1); entry = appendMarkToText( entry, -1 ); partialmatch[0] = true; entries[0] = PointerGetDatum( entry ); for(i=1; i 2 /* X1 may be inside X */ ) needRecheck = true; } else { /* * X [ * X1 [] ] * => X* [ + X1* [] ] */ for(i=1; i 3 ) { if ( i==1 ) entry = setFlagOfText( WC_MIDDLE | WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); else if ( i == splitqlen-2 ) entry = setFlagOfText( WC_MIDDLE | WC_END, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); else entry = setFlagOfText( WC_MIDDLE, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); } else entry = NULL; entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, lenq+1); partialmatch[ *nentries ] = true; entries[ *nentries ] = PointerGetDatum( entry ); (*nentries)++; if ( splitqlen > 3 /* we don't remeber an order of Xn */ ) needRecheck = true; } } } PG_FREE_IF_COPY(q,0); #ifdef OPTIMIZE_WILDCARD_QUERY if ( *nentries > 1 ) { int saven = *nentries; optimize_wildcard_search( entries, nentries ); if ( saven != *nentries ) needRecheck = true; } #endif if (needRecheck) { /* * Non empty extra signals to consistentFn about * rechecking of result */ *extra = palloc0(sizeof(Pointer) * *nentries); } PG_RETURN_POINTER(entries); } PG_FUNCTION_INFO_V1(gin_consistent_wildcard); Datum gin_consistent_wildcard(PG_FUNCTION_ARGS); Datum gin_consistent_wildcard(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); #ifdef NOT_USED StrategyNumber strategy = PG_GETARG_UINT16(1); text *query = PG_GETARG_TEXT_P(2); #endif int nentries = PG_GETARG_INT32(3); Pointer *extra = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); bool res = true; int i; for (i = 0; res && i < nentries; i++) if (check[i] == false) res = false; *recheck = (extra == NULL) ? false : true; PG_RETURN_BOOL(res); } /* * Mostly debug fuction */ PG_FUNCTION_INFO_V1(permute); Datum permute(PG_FUNCTION_ARGS); Datum permute(PG_FUNCTION_ARGS) { Datum src = PG_GETARG_DATUM(0); int32 nentries = 0; Datum *entries; ArrayType *res; int i; /* * Get permuted values by gin_extract_permuted() */ entries = (Datum*) DatumGetPointer(DirectFunctionCall2( gin_extract_permuted, src, PointerGetDatum(&nentries) )); /* * We need to replace MARK_SIGN to MARK_SIGN_SHOW. * See comments above near definition of MARK_SIGN and MARK_SIGN_SHOW. */ if ( nentries == 1 && VARSIZE(entries[0]) == VARHDRSZ + 1) { *(VARDATA(entries[0])) = MARK_SIGN_SHOW; } else { int32 offset = 0; /* offset of MARK_SIGN */ char *ptr; /* * We scan array from the end because it allows simple calculation * of MARK_SIGN position: on every iteration it's moved one * character to the end. */ for(i=nentries-1;i>=0;i--) { ptr = VARDATA(entries[i]); offset += pg_mblen(ptr); Assert( *(ptr + offset) == MARK_SIGN ); *(ptr + offset) = MARK_SIGN_SHOW; } } res = construct_array( entries, nentries, TEXTOID, -1, false, 'i' ); PG_RETURN_POINTER(res); }