Commit 678d0e23 authored by Peter Eisentraut

Update snowball

Update to snowball tag v2.1.0.  Major changes are new stemmers for
Armenian, Serbian, and Yiddish.
parent b071a311
......@@ -3837,6 +3837,7 @@ Parser: "pg_catalog.default"
Schema | Name | Description
------------+-----------------+-----------------------------------------------------------
pg_catalog | arabic_stem | snowball stemmer for arabic language
pg_catalog | armenian_stem | snowball stemmer for armenian language
pg_catalog | basque_stem | snowball stemmer for basque language
pg_catalog | catalan_stem | snowball stemmer for catalan language
pg_catalog | danish_stem | snowball stemmer for danish language
......@@ -3857,11 +3858,13 @@ Parser: "pg_catalog.default"
pg_catalog | portuguese_stem | snowball stemmer for portuguese language
pg_catalog | romanian_stem | snowball stemmer for romanian language
pg_catalog | russian_stem | snowball stemmer for russian language
pg_catalog | serbian_stem | snowball stemmer for serbian language
pg_catalog | simple | simple dictionary: just lower case and check for stopword
pg_catalog | spanish_stem | snowball stemmer for spanish language
pg_catalog | swedish_stem | snowball stemmer for swedish language
pg_catalog | tamil_stem | snowball stemmer for tamil language
pg_catalog | turkish_stem | snowball stemmer for turkish language
pg_catalog | yiddish_stem | snowball stemmer for yiddish language
</screen>
</para>
</listitem>
......
......@@ -43,6 +43,7 @@ OBJS += \
stem_ISO_8859_2_romanian.o \
stem_KOI8_R_russian.o \
stem_UTF_8_arabic.o \
stem_UTF_8_armenian.o \
stem_UTF_8_basque.o \
stem_UTF_8_catalan.o \
stem_UTF_8_danish.o \
......@@ -64,10 +65,12 @@ OBJS += \
stem_UTF_8_portuguese.o \
stem_UTF_8_romanian.o \
stem_UTF_8_russian.o \
stem_UTF_8_serbian.o \
stem_UTF_8_spanish.o \
stem_UTF_8_swedish.o \
stem_UTF_8_tamil.o \
stem_UTF_8_turkish.o
stem_UTF_8_turkish.o \
stem_UTF_8_yiddish.o
# first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words
......@@ -75,6 +78,7 @@ OBJS += \
# must come after creation of that language
LANGUAGES= \
arabic arabic \
armenian armenian \
basque basque \
catalan catalan \
danish danish \
......@@ -95,10 +99,12 @@ LANGUAGES= \
portuguese portuguese \
romanian romanian \
russian english \
serbian serbian \
spanish spanish \
swedish swedish \
tamil tamil \
turkish turkish
turkish turkish \
yiddish yiddish
SQLSCRIPT= snowball_create.sql
......
......@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution
because most installations will not have the Snowball compiler available.
We are currently synced with the Snowball git commit
c70ed64f9d41c1032fba4e962b054f8e9d489a74 (tag v2.0.0)
of 2019-10-02.
4764395431c8f2a0b4fe18b816ab1fc966a45837 (tag v2.1.0)
of 2021-01-21.
To update the PostgreSQL sources from a new Snowball version:
......@@ -59,7 +59,8 @@ do not require any changes.
4. Check whether any stemmer modules have been added or removed. If so, edit
the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the
stemmer_modules[] table in dict_snowball.c. You might also need to change
stemmer_modules[] table in dict_snowball.c, as well as the list in the
documentation in textsearch.sgml. You might also need to change
the LANGUAGES list in Makefile and tsearch_config_languages in initdb.c.
5. The various stopword files in stopwords/ must be downloaded
......
......@@ -46,6 +46,7 @@
#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
#include "snowball/libstemmer/stem_KOI8_R_russian.h"
#include "snowball/libstemmer/stem_UTF_8_arabic.h"
#include "snowball/libstemmer/stem_UTF_8_armenian.h"
#include "snowball/libstemmer/stem_UTF_8_basque.h"
#include "snowball/libstemmer/stem_UTF_8_catalan.h"
#include "snowball/libstemmer/stem_UTF_8_danish.h"
......@@ -67,10 +68,12 @@
#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
#include "snowball/libstemmer/stem_UTF_8_romanian.h"
#include "snowball/libstemmer/stem_UTF_8_russian.h"
#include "snowball/libstemmer/stem_UTF_8_serbian.h"
#include "snowball/libstemmer/stem_UTF_8_spanish.h"
#include "snowball/libstemmer/stem_UTF_8_swedish.h"
#include "snowball/libstemmer/stem_UTF_8_tamil.h"
#include "snowball/libstemmer/stem_UTF_8_turkish.h"
#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
PG_MODULE_MAGIC;
......@@ -117,6 +120,7 @@ static const stemmer_module stemmer_modules[] =
STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
STEMMER_MODULE(basque, PG_UTF8, UTF_8),
STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
STEMMER_MODULE(danish, PG_UTF8, UTF_8),
......@@ -138,10 +142,12 @@ static const stemmer_module stemmer_modules[] =
STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
STEMMER_MODULE(russian, PG_UTF8, UTF_8),
STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
/*
* Stemmer with PG_SQL_ASCII encoding should be valid for any server
......
#include "header.h"
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
extern struct SN_env * SN_create_env(int S_size, int I_size)
{
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
if (z == NULL) return NULL;
......@@ -25,12 +25,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
if (z->I == NULL) goto error;
}
if (B_size)
{
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
if (z->B == NULL) goto error;
}
return z;
error:
SN_close_env(z, S_size);
......@@ -50,7 +44,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
free(z->S);
}
free(z->I);
free(z->B);
if (z->p) lose_s(z->p);
free(z);
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -18,38 +18,48 @@ extern void lose_s(symbol * p) {
}
/*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
position, or -1 on failure.
new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c.
new_p is the new position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
extern int skip_utf8(const symbol * p, int c, int limit, int n) {
int b;
if (n >= 0) {
for (; n > 0; n--) {
if (c >= l) return -1;
b = p[c++];
if (b >= 0xC0) { /* 1100 0000 */
while (c < l) {
b = p[c];
if (b >= 0xC0 || b < 0x80) break;
/* break unless b is 10------ */
c++;
}
if (n < 0) return -1;
for (; n > 0; n--) {
if (c >= limit) return -1;
b = p[c++];
if (b >= 0xC0) { /* 1100 0000 */
while (c < limit) {
b = p[c];
if (b >= 0xC0 || b < 0x80) break;
/* break unless b is 10------ */
c++;
}
}
} else {
for (; n < 0; n++) {
if (c <= lb) return -1;
b = p[--c];
if (b >= 0x80) { /* 1000 0000 */
while (c > lb) {
b = p[c];
if (b >= 0xC0) break; /* 1100 0000 */
c--;
}
}
return c;
}
/*
new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1.
new_p is the new position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
int b;
if (n < 0) return -1;
for (; n > 0; n--) {
if (c <= limit) return -1;
b = p[--c];
if (b >= 0x80) { /* 1000 0000 */
while (c > limit) {
b = p[c];
if (b >= 0xC0) break; /* 1100 0000 */
c--;
}
}
}
......@@ -76,7 +86,7 @@ static int get_utf8(const symbol * p, int c, int l, int * slot) {
*slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
return 3;
}
*slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
*slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
return 4;
}
......@@ -100,7 +110,7 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
*slot = (b & 0xF) << 12 | a;
return 3;
}
*slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a;
*slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
return 4;
}
......@@ -226,7 +236,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int l = z->l;
symbol * q = z->p + c;
const symbol * q = z->p + c;
const struct among * w;
......@@ -291,7 +301,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int lb = z->lb;
symbol * q = z->p + c - 1;
const symbol * q = z->p + c - 1;
const struct among * w;
......
......@@ -656,6 +656,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{
{"arabic", "ar"},
{"arabic", "Arabic"},
{"armenian", "hy"},
{"armenian", "Armenian"},
{"basque", "eu"},
{"basque", "Basque"},
{"catalan", "ca"},
......@@ -697,6 +699,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"romanian", "ro"},
{"russian", "ru"},
{"russian", "Russian"},
{"serbian", "sr"},
{"serbian", "Serbian"},
{"spanish", "es"},
{"spanish", "Spanish"},
{"swedish", "sv"},
......@@ -705,6 +709,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"tamil", "Tamil"},
{"turkish", "tr"},
{"turkish", "Turkish"},
{"yiddish", "yi"},
{"yiddish", "Yiddish"},
{NULL, NULL} /* end marker */
};
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202102171
#define CATALOG_VERSION_NO 202102191
#endif
......@@ -16,14 +16,13 @@ struct SN_env {
int c; int l; int lb; int bra; int ket;
symbol * * S;
int * I;
unsigned char * B;
};
#ifdef __cplusplus
extern "C" {
#endif
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
extern struct SN_env * SN_create_env(int S_size, int I_size);
extern void SN_close_env(struct SN_env * z, int S_size);
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
......
......@@ -23,7 +23,9 @@ struct among
extern symbol * create_s(void);
extern void lose_s(symbol * p);
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
extern int skip_utf8(const symbol * p, int c, int limit, int n);
extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
/* Generated by Snowball 2.0.0 - https://snowballstem.org/ */
/* Generated by Snowball 2.1.0 - https://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment