Commit 678d0e23 authored by Peter Eisentraut

Update snowball

Update to snowball tag v2.1.0.  Major changes are new stemmers for
Armenian, Serbian, and Yiddish.
parent b071a311
...@@ -3837,6 +3837,7 @@ Parser: "pg_catalog.default" ...@@ -3837,6 +3837,7 @@ Parser: "pg_catalog.default"
Schema | Name | Description Schema | Name | Description
------------+-----------------+----------------------------------------------------------- ------------+-----------------+-----------------------------------------------------------
pg_catalog | arabic_stem | snowball stemmer for arabic language pg_catalog | arabic_stem | snowball stemmer for arabic language
pg_catalog | armenian_stem | snowball stemmer for armenian language
pg_catalog | basque_stem | snowball stemmer for basque language pg_catalog | basque_stem | snowball stemmer for basque language
pg_catalog | catalan_stem | snowball stemmer for catalan language pg_catalog | catalan_stem | snowball stemmer for catalan language
pg_catalog | danish_stem | snowball stemmer for danish language pg_catalog | danish_stem | snowball stemmer for danish language
...@@ -3857,11 +3858,13 @@ Parser: "pg_catalog.default" ...@@ -3857,11 +3858,13 @@ Parser: "pg_catalog.default"
pg_catalog | portuguese_stem | snowball stemmer for portuguese language pg_catalog | portuguese_stem | snowball stemmer for portuguese language
pg_catalog | romanian_stem | snowball stemmer for romanian language pg_catalog | romanian_stem | snowball stemmer for romanian language
pg_catalog | russian_stem | snowball stemmer for russian language pg_catalog | russian_stem | snowball stemmer for russian language
pg_catalog | serbian_stem | snowball stemmer for serbian language
pg_catalog | simple | simple dictionary: just lower case and check for stopword pg_catalog | simple | simple dictionary: just lower case and check for stopword
pg_catalog | spanish_stem | snowball stemmer for spanish language pg_catalog | spanish_stem | snowball stemmer for spanish language
pg_catalog | swedish_stem | snowball stemmer for swedish language pg_catalog | swedish_stem | snowball stemmer for swedish language
pg_catalog | tamil_stem | snowball stemmer for tamil language pg_catalog | tamil_stem | snowball stemmer for tamil language
pg_catalog | turkish_stem | snowball stemmer for turkish language pg_catalog | turkish_stem | snowball stemmer for turkish language
pg_catalog | yiddish_stem | snowball stemmer for yiddish language
</screen> </screen>
</para> </para>
</listitem> </listitem>
......
...@@ -43,6 +43,7 @@ OBJS += \ ...@@ -43,6 +43,7 @@ OBJS += \
stem_ISO_8859_2_romanian.o \ stem_ISO_8859_2_romanian.o \
stem_KOI8_R_russian.o \ stem_KOI8_R_russian.o \
stem_UTF_8_arabic.o \ stem_UTF_8_arabic.o \
stem_UTF_8_armenian.o \
stem_UTF_8_basque.o \ stem_UTF_8_basque.o \
stem_UTF_8_catalan.o \ stem_UTF_8_catalan.o \
stem_UTF_8_danish.o \ stem_UTF_8_danish.o \
...@@ -64,10 +65,12 @@ OBJS += \ ...@@ -64,10 +65,12 @@ OBJS += \
stem_UTF_8_portuguese.o \ stem_UTF_8_portuguese.o \
stem_UTF_8_romanian.o \ stem_UTF_8_romanian.o \
stem_UTF_8_russian.o \ stem_UTF_8_russian.o \
stem_UTF_8_serbian.o \
stem_UTF_8_spanish.o \ stem_UTF_8_spanish.o \
stem_UTF_8_swedish.o \ stem_UTF_8_swedish.o \
stem_UTF_8_tamil.o \ stem_UTF_8_tamil.o \
stem_UTF_8_turkish.o stem_UTF_8_turkish.o \
stem_UTF_8_yiddish.o
# first column is language name and also name of dictionary for not-all-ASCII # first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words # words, second is name of dictionary for all-ASCII words
...@@ -75,6 +78,7 @@ OBJS += \ ...@@ -75,6 +78,7 @@ OBJS += \
# must come after creation of that language # must come after creation of that language
LANGUAGES= \ LANGUAGES= \
arabic arabic \ arabic arabic \
armenian armenian \
basque basque \ basque basque \
catalan catalan \ catalan catalan \
danish danish \ danish danish \
...@@ -95,10 +99,12 @@ LANGUAGES= \ ...@@ -95,10 +99,12 @@ LANGUAGES= \
portuguese portuguese \ portuguese portuguese \
romanian romanian \ romanian romanian \
russian english \ russian english \
serbian serbian \
spanish spanish \ spanish spanish \
swedish swedish \ swedish swedish \
tamil tamil \ tamil tamil \
turkish turkish turkish turkish \
yiddish yiddish
SQLSCRIPT= snowball_create.sql SQLSCRIPT= snowball_create.sql
......
...@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution ...@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution
because most installations will not have the Snowball compiler available. because most installations will not have the Snowball compiler available.
We are currently synced with the Snowball git commit We are currently synced with the Snowball git commit
c70ed64f9d41c1032fba4e962b054f8e9d489a74 (tag v2.0.0) 4764395431c8f2a0b4fe18b816ab1fc966a45837 (tag v2.1.0)
of 2019-10-02. of 2021-01-21.
To update the PostgreSQL sources from a new Snowball version: To update the PostgreSQL sources from a new Snowball version:
...@@ -59,7 +59,8 @@ do not require any changes. ...@@ -59,7 +59,8 @@ do not require any changes.
4. Check whether any stemmer modules have been added or removed. If so, edit 4. Check whether any stemmer modules have been added or removed. If so, edit
the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the
stemmer_modules[] table in dict_snowball.c. You might also need to change stemmer_modules[] table in dict_snowball.c, as well as the list in the
documentation in textsearch.sgml. You might also need to change
the LANGUAGES list in Makefile and tsearch_config_languages in initdb.c. the LANGUAGES list in Makefile and tsearch_config_languages in initdb.c.
5. The various stopword files in stopwords/ must be downloaded 5. The various stopword files in stopwords/ must be downloaded
......
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h" #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
#include "snowball/libstemmer/stem_KOI8_R_russian.h" #include "snowball/libstemmer/stem_KOI8_R_russian.h"
#include "snowball/libstemmer/stem_UTF_8_arabic.h" #include "snowball/libstemmer/stem_UTF_8_arabic.h"
#include "snowball/libstemmer/stem_UTF_8_armenian.h"
#include "snowball/libstemmer/stem_UTF_8_basque.h" #include "snowball/libstemmer/stem_UTF_8_basque.h"
#include "snowball/libstemmer/stem_UTF_8_catalan.h" #include "snowball/libstemmer/stem_UTF_8_catalan.h"
#include "snowball/libstemmer/stem_UTF_8_danish.h" #include "snowball/libstemmer/stem_UTF_8_danish.h"
...@@ -67,10 +68,12 @@ ...@@ -67,10 +68,12 @@
#include "snowball/libstemmer/stem_UTF_8_portuguese.h" #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
#include "snowball/libstemmer/stem_UTF_8_romanian.h" #include "snowball/libstemmer/stem_UTF_8_romanian.h"
#include "snowball/libstemmer/stem_UTF_8_russian.h" #include "snowball/libstemmer/stem_UTF_8_russian.h"
#include "snowball/libstemmer/stem_UTF_8_serbian.h"
#include "snowball/libstemmer/stem_UTF_8_spanish.h" #include "snowball/libstemmer/stem_UTF_8_spanish.h"
#include "snowball/libstemmer/stem_UTF_8_swedish.h" #include "snowball/libstemmer/stem_UTF_8_swedish.h"
#include "snowball/libstemmer/stem_UTF_8_tamil.h" #include "snowball/libstemmer/stem_UTF_8_tamil.h"
#include "snowball/libstemmer/stem_UTF_8_turkish.h" #include "snowball/libstemmer/stem_UTF_8_turkish.h"
#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
...@@ -117,6 +120,7 @@ static const stemmer_module stemmer_modules[] = ...@@ -117,6 +120,7 @@ static const stemmer_module stemmer_modules[] =
STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2), STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
STEMMER_MODULE(russian, PG_KOI8R, KOI8_R), STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
STEMMER_MODULE(arabic, PG_UTF8, UTF_8), STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
STEMMER_MODULE(basque, PG_UTF8, UTF_8), STEMMER_MODULE(basque, PG_UTF8, UTF_8),
STEMMER_MODULE(catalan, PG_UTF8, UTF_8), STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
STEMMER_MODULE(danish, PG_UTF8, UTF_8), STEMMER_MODULE(danish, PG_UTF8, UTF_8),
...@@ -138,10 +142,12 @@ static const stemmer_module stemmer_modules[] = ...@@ -138,10 +142,12 @@ static const stemmer_module stemmer_modules[] =
STEMMER_MODULE(portuguese, PG_UTF8, UTF_8), STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
STEMMER_MODULE(romanian, PG_UTF8, UTF_8), STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
STEMMER_MODULE(russian, PG_UTF8, UTF_8), STEMMER_MODULE(russian, PG_UTF8, UTF_8),
STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
STEMMER_MODULE(spanish, PG_UTF8, UTF_8), STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
STEMMER_MODULE(swedish, PG_UTF8, UTF_8), STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
STEMMER_MODULE(tamil, PG_UTF8, UTF_8), STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
STEMMER_MODULE(turkish, PG_UTF8, UTF_8), STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
/* /*
* Stemmer with PG_SQL_ASCII encoding should be valid for any server * Stemmer with PG_SQL_ASCII encoding should be valid for any server
......
#include "header.h" #include "header.h"
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) extern struct SN_env * SN_create_env(int S_size, int I_size)
{ {
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
if (z == NULL) return NULL; if (z == NULL) return NULL;
...@@ -25,12 +25,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) ...@@ -25,12 +25,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
if (z->I == NULL) goto error; if (z->I == NULL) goto error;
} }
if (B_size)
{
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
if (z->B == NULL) goto error;
}
return z; return z;
error: error:
SN_close_env(z, S_size); SN_close_env(z, S_size);
...@@ -50,7 +44,6 @@ extern void SN_close_env(struct SN_env * z, int S_size) ...@@ -50,7 +44,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
free(z->S); free(z->S);
} }
free(z->I); free(z->I);
free(z->B);
if (z->p) lose_s(z->p); if (z->p) lose_s(z->p);
free(z); free(z);
} }
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -18,21 +18,20 @@ extern void lose_s(symbol * p) { ...@@ -18,21 +18,20 @@ extern void lose_s(symbol * p) {
} }
/* /*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c.
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new new_p is the new position, or -1 on failure.
position, or -1 on failure.
-- used to implement hop and next in the utf8 case. -- used to implement hop and next in the utf8 case.
*/ */
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { extern int skip_utf8(const symbol * p, int c, int limit, int n) {
int b; int b;
if (n >= 0) { if (n < 0) return -1;
for (; n > 0; n--) { for (; n > 0; n--) {
if (c >= l) return -1; if (c >= limit) return -1;
b = p[c++]; b = p[c++];
if (b >= 0xC0) { /* 1100 0000 */ if (b >= 0xC0) { /* 1100 0000 */
while (c < l) { while (c < limit) {
b = p[c]; b = p[c];
if (b >= 0xC0 || b < 0x80) break; if (b >= 0xC0 || b < 0x80) break;
/* break unless b is 10------ */ /* break unless b is 10------ */
...@@ -40,19 +39,30 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { ...@@ -40,19 +39,30 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
} }
} }
} }
} else { return c;
for (; n < 0; n++) { }
if (c <= lb) return -1;
/*
new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1
new_p is the new position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
int b;
if (n < 0) return -1;
for (; n > 0; n--) {
if (c <= limit) return -1;
b = p[--c]; b = p[--c];
if (b >= 0x80) { /* 1000 0000 */ if (b >= 0x80) { /* 1000 0000 */
while (c > lb) { while (c > limit) {
b = p[c]; b = p[c];
if (b >= 0xC0) break; /* 1100 0000 */ if (b >= 0xC0) break; /* 1100 0000 */
c--; c--;
} }
} }
} }
}
return c; return c;
} }
...@@ -76,7 +86,7 @@ static int get_utf8(const symbol * p, int c, int l, int * slot) { ...@@ -76,7 +86,7 @@ static int get_utf8(const symbol * p, int c, int l, int * slot) {
*slot = (b0 & 0xF) << 12 | b1 << 6 | b2; *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
return 3; return 3;
} }
*slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
return 4; return 4;
} }
...@@ -100,7 +110,7 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { ...@@ -100,7 +110,7 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
*slot = (b & 0xF) << 12 | a; *slot = (b & 0xF) << 12 | a;
return 3; return 3;
} }
*slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a; *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
return 4; return 4;
} }
...@@ -226,7 +236,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { ...@@ -226,7 +236,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size; int j = v_size;
int c = z->c; int l = z->l; int c = z->c; int l = z->l;
symbol * q = z->p + c; const symbol * q = z->p + c;
const struct among * w; const struct among * w;
...@@ -291,7 +301,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { ...@@ -291,7 +301,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size; int j = v_size;
int c = z->c; int lb = z->lb; int c = z->c; int lb = z->lb;
symbol * q = z->p + c - 1; const symbol * q = z->p + c - 1;
const struct among * w; const struct among * w;
......
...@@ -656,6 +656,8 @@ static const struct tsearch_config_match tsearch_config_languages[] = ...@@ -656,6 +656,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{ {
{"arabic", "ar"}, {"arabic", "ar"},
{"arabic", "Arabic"}, {"arabic", "Arabic"},
{"armenian", "hy"},
{"armenian", "Armenian"},
{"basque", "eu"}, {"basque", "eu"},
{"basque", "Basque"}, {"basque", "Basque"},
{"catalan", "ca"}, {"catalan", "ca"},
...@@ -697,6 +699,8 @@ static const struct tsearch_config_match tsearch_config_languages[] = ...@@ -697,6 +699,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"romanian", "ro"}, {"romanian", "ro"},
{"russian", "ru"}, {"russian", "ru"},
{"russian", "Russian"}, {"russian", "Russian"},
{"serbian", "sr"},
{"serbian", "Serbian"},
{"spanish", "es"}, {"spanish", "es"},
{"spanish", "Spanish"}, {"spanish", "Spanish"},
{"swedish", "sv"}, {"swedish", "sv"},
...@@ -705,6 +709,8 @@ static const struct tsearch_config_match tsearch_config_languages[] = ...@@ -705,6 +709,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"tamil", "Tamil"}, {"tamil", "Tamil"},
{"turkish", "tr"}, {"turkish", "tr"},
{"turkish", "Turkish"}, {"turkish", "Turkish"},
{"yiddish", "yi"},
{"yiddish", "Yiddish"},
{NULL, NULL} /* end marker */ {NULL, NULL} /* end marker */
}; };
......
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 202102171 #define CATALOG_VERSION_NO 202102191
#endif #endif
...@@ -16,14 +16,13 @@ struct SN_env { ...@@ -16,14 +16,13 @@ struct SN_env {
int c; int l; int lb; int bra; int ket; int c; int l; int lb; int bra; int ket;
symbol * * S; symbol * * S;
int * I; int * I;
unsigned char * B;
}; };
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); extern struct SN_env * SN_create_env(int S_size, int I_size);
extern void SN_close_env(struct SN_env * z, int S_size); extern void SN_close_env(struct SN_env * z, int S_size);
extern int SN_set_current(struct SN_env * z, int size, const symbol * s); extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
......
...@@ -23,7 +23,9 @@ struct among ...@@ -23,7 +23,9 @@ struct among
extern symbol * create_s(void); extern symbol * create_s(void);
extern void lose_s(symbol * p); extern void lose_s(symbol * p);
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n); extern int skip_utf8(const symbol * p, int c, int limit, int n);
extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ /* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment