Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
Postgres FD Implementation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Abuhujair Javed
Postgres FD Implementation
Commits
b87b52bf
Commit
b87b52bf
authored
Nov 12, 2008
by
Teodor Sigaev
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Support of multibyte encoding for pg_trgm
parent
e4ffd143
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
161 additions
and
88 deletions
+161
-88
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm.h
+9
-2
contrib/pg_trgm/trgm_gin.c
contrib/pg_trgm/trgm_gin.c
+2
-2
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/trgm_op.c
+150
-84
No files found.
contrib/pg_trgm/trgm.h
View file @
b87b52bf
/*
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.
9 2008/05/17 01:28:21 adunstan
Exp $
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.
10 2008/11/12 13:43:54 teodor
Exp $
*/
#ifndef __TRGM_H__
#define __TRGM_H__
...
...
@@ -31,7 +31,14 @@ typedef char trgm[3];
*(((char*)(a))+2) = *(((char*)(b))+2); \
} while(0);
#define TRGMINT(a) ( (*(((char*)(a))+2)<<16)+(*(((char*)(a))+1)<<8)+*(((char*)(a))+0) )
uint32
trgm2int
(
trgm
*
ptr
);
#ifdef KEEPONLYALNUM
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
#else
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
#endif
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
typedef
struct
{
...
...
contrib/pg_trgm/trgm_gin.c
View file @
b87b52bf
/*
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.
5 2008/07/11 11:56:48
teodor Exp $
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.
6 2008/11/12 13:43:54
teodor Exp $
*/
#include "trgm.h"
...
...
@@ -42,7 +42,7 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
ptr
=
GETARR
(
trg
);
while
(
ptr
-
GETARR
(
trg
)
<
ARRNELEM
(
trg
))
{
item
=
TRGMINT
(
ptr
);
item
=
trgm2int
(
ptr
);
entries
[
i
++
]
=
Int32GetDatum
(
item
);
ptr
++
;
...
...
contrib/pg_trgm/trgm_op.c
View file @
b87b52bf
/*
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.1
0 2008/05/17 01:28:21 adunstan
Exp $
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.1
1 2008/11/12 13:43:54 teodor
Exp $
*/
#include "trgm.h"
#include <ctype.h>
#include "utils/array.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
PG_MODULE_MAGIC
;
...
...
@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4
(
trgm_limit
);
}
#define WORDWAIT 0
#define INWORD 1
static
int
comp_trgm
(
const
void
*
a
,
const
void
*
b
)
{
...
...
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
return
curend
+
1
-
a
;
}
#ifdef KEEPONLYALNUM
#define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
#else
#define iswordchr(c) (!t_isspace(c))
#endif
/*
 * Locate the first word within the first lenstr bytes of str.
 *
 * A word is a maximal run of characters for which iswordchr() is true.
 * The string is walked one character at a time, using pg_mblen() to get
 * the byte width of each character.
 *
 * Returns a pointer to the first byte of the word, or NULL if no word
 * character occurs before the lenstr limit is reached.  On success,
 * *endword is set to the position just past the word and *charlen to the
 * number of characters (not bytes) the word contains.  Neither output is
 * touched when NULL is returned.
 */
static char *
find_word(char *str, int lenstr, char **endword, int *charlen)
{
	char	   *limit = str + lenstr;
	char	   *wordstart = str;
	char	   *cursor;
	int			nchars = 0;

	/* step over leading non-word characters, one whole character at a time */
	for (; wordstart < limit; wordstart += pg_mblen(wordstart))
	{
		if (iswordchr(wordstart))
			break;
	}

	/* ran off the end without meeting a word character */
	if (wordstart >= limit)
		return NULL;

	/* consume the word itself, counting characters as we go */
	cursor = wordstart;
	while (cursor < limit && iswordchr(cursor))
	{
		cursor += pg_mblen(cursor);
		nchars++;
	}

	*endword = cursor;
	*charlen = nchars;

	return wordstart;
}
#ifdef USE_WIDE_UPPER_LOWER
/*
 * Store one trigram, given as bytelen bytes starting at str, into *tptr.
 *
 * When the three characters occupy exactly three bytes they are copied
 * verbatim.  Otherwise the byte sequence is reduced to a CRC32 and three
 * bytes of that checksum stand in for the trigram.
 */
static void
cnt_trigram(trgm *tptr, char *str, int bytelen)
{
	pg_crc32	crc;

	if (bytelen == 3)
	{
		/* fits as-is: plain three-byte copy */
		CPTRGM(tptr, str);
		return;
	}

	INIT_CRC32(crc);
	COMP_CRC32(crc, str, bytelen);
	FIN_CRC32(crc);

	/*
	 * Keep only three bytes of the checksum, in the hope that this is a
	 * good enough hash.  CPTRGM copies the three bytes found at the
	 * address of crc.
	 * NOTE(review): the original comment called these the "upper" bytes;
	 * which bytes of the value they are depends on the platform's byte
	 * order -- confirm whether that matters for stored data.
	 */
	CPTRGM(tptr, &crc);
}
#endif
/*
 * Append the trigrams of one (already padded) word to the output array.
 *
 *	tptr	- next free slot in the trigram array
 *	str		- start of the padded word
 *	bytelen - length of the padded word in bytes
 *	charlen - length of the padded word in characters
 *
 * Returns the slot following the last trigram written; this is tptr
 * itself when the word has fewer than three characters.
 */
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
{
	char	   *ptr = str;

	/* too short to form even a single trigram */
	if (charlen < 3)
		return tptr;

#ifdef USE_WIDE_UPPER_LOWER
	if (pg_database_encoding_max_length() > 1)
	{
		/* byte widths of the three characters in the current window */
		int			len1 = pg_mblen(str);
		int			len2 = pg_mblen(str + len1);
		int			len3 = pg_mblen(str + len1 + len2);

		for (;;)
		{
			int			span = len1 + len2 + len3;

			/* stop once the window would extend past the end of the word */
			if ((ptr - str) + span > bytelen)
				break;

			cnt_trigram(tptr, ptr, span);
			tptr++;

			/* slide the window forward by one character */
			ptr += len1;
			len1 = len2;
			len2 = len3;

			/*
			 * Measure the character entering the window.  After the last
			 * trigram this looks at the byte at str + bytelen, so that
			 * byte has to be readable.
			 * NOTE(review): confirm the caller's buffer guarantees this.
			 */
			len3 = pg_mblen(ptr + len1 + len2);
		}

		return tptr;
	}
#endif

	/* single-byte path: every character is exactly one byte wide */
	Assert(bytelen == charlen);

	/* number of trigrams = strlen - 2 */
	for (; ptr - str < bytelen - 2; ptr++)
	{
		CPTRGM(tptr, ptr);
		tptr++;
	}

	return tptr;
}
TRGM
*
generate_trgm
(
char
*
str
,
int
slen
)
{
TRGM
*
trg
;
char
*
buf
,
*
sptr
,
*
bufptr
;
char
*
buf
;
trgm
*
tptr
;
int
state
=
WORDWAIT
;
int
wl
,
len
;
int
len
,
charlen
,
bytelen
;
char
*
bword
,
*
eword
;
trg
=
(
TRGM
*
)
palloc
(
TRGMHDRSIZE
+
sizeof
(
trgm
)
*
(
slen
/
2
+
1
)
*
3
);
trg
->
flag
=
ARRKEY
;
...
...
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
tptr
=
GETARR
(
trg
);
buf
=
palloc
(
sizeof
(
char
)
*
(
slen
+
4
));
sptr
=
str
;
if
(
LPADDING
>
0
)
{
...
...
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
*
(
buf
+
1
)
=
' '
;
}
bufptr
=
buf
+
LPADDING
;
while
(
sptr
-
str
<
slen
)
{
if
(
state
==
WORDWAIT
)
{
if
(
#ifdef KEEPONLYALNUM
isalnum
((
unsigned
char
)
*
sptr
)
#else
!
isspace
((
unsigned
char
)
*
sptr
)
#endif
)
{
*
bufptr
=
*
sptr
;
/* start put word in buffer */
bufptr
++
;
state
=
INWORD
;
if
(
sptr
-
str
==
slen
-
1
/* last char */
)
goto
gettrg
;
}
}
else
eword
=
str
;
while
(
(
bword
=
find_word
(
eword
,
slen
-
(
eword
-
str
),
&
eword
,
&
charlen
))
!=
NULL
)
{
if
(
#ifdef KEEPONLYALNUM
!
isalnum
((
unsigned
char
)
*
sptr
)
#ifdef IGNORECASE
bword
=
lowerstr_with_len
(
bword
,
eword
-
bword
);
bytelen
=
strlen
(
bword
);
#else
isspace
((
unsigned
char
)
*
sptr
)
bytelen
=
eword
-
bword
;
#endif
)
{
gettrg:
/* word in buffer, so count trigrams */
*
bufptr
=
' '
;
*
(
bufptr
+
1
)
=
' '
;
wl
=
bufptr
-
(
buf
+
LPADDING
)
-
2
+
LPADDING
+
RPADDING
;
if
(
wl
<=
0
)
{
bufptr
=
buf
+
LPADDING
;
state
=
WORDWAIT
;
sptr
++
;
continue
;
}
#ifdef IGNORECASE
do
{
/* lower word */
int
wwl
=
bufptr
-
buf
;
memcpy
(
buf
+
LPADDING
,
bword
,
bytelen
);
bufptr
=
buf
+
LPADDING
;
while
(
bufptr
-
buf
<
wwl
)
{
*
bufptr
=
tolower
((
unsigned
char
)
*
bufptr
);
bufptr
++
;
}
}
while
(
0
);
#ifdef IGNORECASE
pfree
(
bword
);
#endif
bufptr
=
buf
;
/* set trigrams */
while
(
bufptr
-
buf
<
wl
)
{
CPTRGM
(
tptr
,
bufptr
);
bufptr
++
;
tptr
++
;
}
bufptr
=
buf
+
LPADDING
;
state
=
WORDWAIT
;
}
else
{
*
bufptr
=
*
sptr
;
/* put in buffer */
bufptr
++
;
if
(
sptr
-
str
==
slen
-
1
)
goto
gettrg
;
}
}
sptr
++
;
buf
[
LPADDING
+
bytelen
]
=
' '
;
buf
[
LPADDING
+
bytelen
+
1
]
=
' '
;
/*
* count trigrams
*/
tptr
=
make_trigrams
(
tptr
,
buf
,
bytelen
+
LPADDING
+
RPADDING
,
charlen
+
LPADDING
+
RPADDING
);
}
pfree
(
buf
);
...
...
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
return
trg
;
}
/*
 * Pack the three bytes of a trigram into an unsigned 32-bit integer.
 *
 * The bytes are read as unsigned values; the first byte ends up in bits
 * 16..23, the second in bits 8..15 and the third in bits 0..7.  The top
 * eight bits of the result are always zero.
 */
uint32
trgm2int(trgm *ptr)
{
	const unsigned char *bytes = (const unsigned char *) ptr;

	return ((uint32) bytes[0] << 16) |
		((uint32) bytes[1] << 8) |
		(uint32) bytes[2];
}
PG_FUNCTION_INFO_V1
(
show_trgm
);
Datum
show_trgm
(
PG_FUNCTION_ARGS
);
...
...
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
for
(
i
=
0
,
ptr
=
GETARR
(
trg
);
i
<
ARRNELEM
(
trg
);
i
++
,
ptr
++
)
{
text
*
item
=
(
text
*
)
palloc
(
VARHDRSZ
+
3
);
text
*
item
=
(
text
*
)
palloc
(
VARHDRSZ
+
Max
(
12
,
pg_database_encoding_max_length
()
*
3
)
);
if
(
pg_database_encoding_max_length
()
>
1
&&
!
ISPRINTABLETRGM
(
ptr
)
)
{
snprintf
(
VARDATA
(
item
),
12
,
"0x%06x"
,
trgm2int
(
ptr
));
SET_VARSIZE
(
item
,
VARHDRSZ
+
strlen
(
VARDATA
(
item
)));
}
else
{
SET_VARSIZE
(
item
,
VARHDRSZ
+
3
);
CPTRGM
(
VARDATA
(
item
),
ptr
);
}
d
[
i
]
=
PointerGetDatum
(
item
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment