Commit 7b925e12 authored by Peter Eisentraut

Sync our Snowball stemmer dictionaries with current upstream

The main change is a new stemmer for Greek.  There are minor changes
in the Danish and French stemmers.

Author: Panagiotis Mavrogiorgos <pmav99@gmail.com>
parent dedb6e01
......@@ -3810,6 +3810,7 @@ Parser: "pg_catalog.default"
pg_catalog | finnish_stem | snowball stemmer for finnish language
pg_catalog | french_stem | snowball stemmer for french language
pg_catalog | german_stem | snowball stemmer for german language
pg_catalog | greek_stem | snowball stemmer for greek language
pg_catalog | hungarian_stem | snowball stemmer for hungarian language
pg_catalog | indonesian_stem | snowball stemmer for indonesian language
pg_catalog | irish_stem | snowball stemmer for irish language
......
......@@ -41,6 +41,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_UTF_8_finnish.o \
stem_UTF_8_french.o \
stem_UTF_8_german.o \
stem_UTF_8_greek.o \
stem_UTF_8_hungarian.o \
stem_UTF_8_indonesian.o \
stem_UTF_8_irish.o \
......@@ -69,6 +70,7 @@ LANGUAGES= \
finnish finnish \
french french \
german german \
greek greek \
hungarian hungarian \
indonesian indonesian \
irish irish \
......
......@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution
because most installations will not have the Snowball compiler available.
We are currently synced with the Snowball git commit
1964ce688cbeca505263c8f77e16ed923296ce7a
of 2018-06-29.
4456b82c26c02493e8807a66f30593a98c5d2888
of 2019-06-24.
To update the PostgreSQL sources from a new Snowball version:
......@@ -57,7 +57,7 @@ do not require any changes.
4. Check whether any stemmer modules have been added or removed. If so, edit
the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the
stemmer_modules[] table in dict_snowball.c. You might also need to change
the LANGUAGES list in Makefile.
the LANGUAGES list in Makefile and tsearch_config_languages in initdb.c.
5. The various stopword files in stopwords/ must be downloaded
individually from pages on the snowballstem.org website.
......
......@@ -50,6 +50,7 @@
#include "snowball/libstemmer/stem_UTF_8_finnish.h"
#include "snowball/libstemmer/stem_UTF_8_french.h"
#include "snowball/libstemmer/stem_UTF_8_german.h"
#include "snowball/libstemmer/stem_UTF_8_greek.h"
#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
#include "snowball/libstemmer/stem_UTF_8_irish.h"
......@@ -115,6 +116,7 @@ static const stemmer_module stemmer_modules[] =
STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
STEMMER_MODULE(french, PG_UTF8, UTF_8),
STEMMER_MODULE(german, PG_UTF8, UTF_8),
STEMMER_MODULE(greek, PG_UTF8, UTF_8),
STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
STEMMER_MODULE(irish, PG_UTF8, UTF_8),
......
This diff is collapsed.
......@@ -59,31 +59,49 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
/* Code for character groupings: utf8 cases */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
int b0, b1;
int b0, b1, b2;
if (c >= l) return 0;
b0 = p[c++];
if (b0 < 0xC0 || c == l) { /* 1100 0000 */
* slot = b0; return 1;
*slot = b0;
return 1;
}
b1 = p[c++];
b1 = p[c++] & 0x3F;
if (b0 < 0xE0 || c == l) { /* 1110 0000 */
* slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
*slot = (b0 & 0x1F) << 6 | b1;
return 2;
}
* slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
b2 = p[c++] & 0x3F;
if (b0 < 0xF0 || c == l) { /* 1111 0000 */
*slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
return 3;
}
*slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
return 4;
}
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
int b0, b1;
int a, b;
if (c <= lb) return 0;
b0 = p[--c];
if (b0 < 0x80 || c == lb) { /* 1000 0000 */
* slot = b0; return 1;
b = p[--c];
if (b < 0x80 || c == lb) { /* 1000 0000 */
*slot = b;
return 1;
}
a = b & 0x3F;
b = p[--c];
if (b >= 0xC0 || c == lb) { /* 1100 0000 */
*slot = (b & 0x1F) << 6 | a;
return 2;
}
b1 = p[--c];
if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
* slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
a |= (b & 0x3F) << 6;
b = p[--c];
if (b >= 0xE0 || c == lb) { /* 1110 0000 */
*slot = (b & 0xF) << 12 | a;
return 3;
}
* slot = (p[--c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
*slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a;
return 4;
}
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
......@@ -230,8 +248,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
common++;
}
}
if (diff < 0) { j = k; common_j = common; }
else { i = k; common_i = common; }
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; /* v->s has been inspected */
if (j == i) break; /* only one item in v */
......@@ -360,9 +383,8 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const
z->l += adjustment;
if (z->c >= c_ket)
z->c += adjustment;
else
if (z->c > c_bra)
z->c = c_bra;
else if (z->c > c_bra)
z->c = c_bra;
}
if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (adjptr != NULL)
......
......@@ -716,6 +716,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"french", "French"},
{"german", "de"},
{"german", "German"},
{"greek", "el"},
{"greek", "Greek"},
{"hungarian", "hu"},
{"hungarian", "Hungarian"},
{"indonesian", "id"},
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201906161
#define CATALOG_VERSION_NO 201907041
#endif
......@@ -19,8 +19,15 @@ struct SN_env {
unsigned char * B;
};
#ifdef __cplusplus
extern "C" {
#endif
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
extern void SN_close_env(struct SN_env * z, int S_size);
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
#ifdef __cplusplus
}
#endif
/* This file was generated automatically by the Snowball to ISO C compiler */
/* http://snowballstem.org/ */
#ifdef __cplusplus
extern "C" {
#endif
extern struct SN_env * greek_UTF_8_create_env(void);
extern void greek_UTF_8_close_env(struct SN_env * z);
extern int greek_UTF_8_stem(struct SN_env * z);
#ifdef __cplusplus
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment