Commit fd582317 authored by Tom Lane

Sync our Snowball stemmer dictionaries with current upstream.

We haven't touched these since text search functionality landed in core
in 2007 :-(.  While the upstream project isn't a beehive of activity,
they do make additions and bug fixes from time to time.  Update our
copies of these files.

Also update our documentation about how to keep things in sync, since
they're not making distribution tarballs these days.  Fortunately,
their source code turns out to be a breeze to build.

Notable changes:

* The non-UTF8 version of the hungarian stemmer now works in LATIN2,
not LATIN1.

* New stemmers have appeared for arabic, indonesian, irish, lithuanian,
nepali, and tamil.  These all work in UTF8, and the indonesian and
irish ones also work in LATIN1.

(There are some new stemmers that I did not incorporate, mainly because
their names don't match the underlying languages, suggesting that they're
not to be considered mainstream.)

Worth noting: the upstream Nepali dictionary was contributed by
Arthur Zakirov.

initdb forced because the contents of snowball_create.sql have
changed.

Still TODO: see about updating the stopword lists.

Arthur Zakirov, minor mods and doc work by me

Discussion: https://postgr.es/m/20180626122025.GA12647@zakirov.localdomain
Discussion: https://postgr.es/m/20180219140849.GA9050@zakirov.localdomain
parent b076eb76
......@@ -3795,6 +3795,7 @@ Parser: "pg_catalog.default"
List of text search dictionaries
Schema | Name | Description
------------+-----------------+-----------------------------------------------------------
pg_catalog | arabic_stem | snowball stemmer for arabic language
pg_catalog | danish_stem | snowball stemmer for danish language
pg_catalog | dutch_stem | snowball stemmer for dutch language
pg_catalog | english_stem | snowball stemmer for english language
......@@ -3802,7 +3803,11 @@ Parser: "pg_catalog.default"
pg_catalog | french_stem | snowball stemmer for french language
pg_catalog | german_stem | snowball stemmer for german language
pg_catalog | hungarian_stem | snowball stemmer for hungarian language
pg_catalog | indonesian_stem | snowball stemmer for indonesian language
pg_catalog | irish_stem | snowball stemmer for irish language
pg_catalog | italian_stem | snowball stemmer for italian language
pg_catalog | lithuanian_stem | snowball stemmer for lithuanian language
pg_catalog | nepali_stem | snowball stemmer for nepali language
pg_catalog | norwegian_stem | snowball stemmer for norwegian language
pg_catalog | portuguese_stem | snowball stemmer for portuguese language
pg_catalog | romanian_stem | snowball stemmer for romanian language
......@@ -3810,6 +3815,7 @@ Parser: "pg_catalog.default"
pg_catalog | simple | simple dictionary: just lower case and check for stopword
pg_catalog | spanish_stem | snowball stemmer for spanish language
pg_catalog | swedish_stem | snowball stemmer for swedish language
pg_catalog | tamil_stem | snowball stemmer for tamil language
pg_catalog | turkish_stem | snowball stemmer for turkish language
</screen>
</para>
......
......@@ -23,15 +23,18 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_ISO_8859_1_finnish.o \
stem_ISO_8859_1_french.o \
stem_ISO_8859_1_german.o \
stem_ISO_8859_1_hungarian.o \
stem_ISO_8859_1_indonesian.o \
stem_ISO_8859_1_irish.o \
stem_ISO_8859_1_italian.o \
stem_ISO_8859_1_norwegian.o \
stem_ISO_8859_1_porter.o \
stem_ISO_8859_1_portuguese.o \
stem_ISO_8859_1_spanish.o \
stem_ISO_8859_1_swedish.o \
stem_ISO_8859_2_hungarian.o \
stem_ISO_8859_2_romanian.o \
stem_KOI8_R_russian.o \
stem_UTF_8_arabic.o \
stem_UTF_8_danish.o \
stem_UTF_8_dutch.o \
stem_UTF_8_english.o \
......@@ -39,7 +42,11 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_UTF_8_french.o \
stem_UTF_8_german.o \
stem_UTF_8_hungarian.o \
stem_UTF_8_indonesian.o \
stem_UTF_8_irish.o \
stem_UTF_8_italian.o \
stem_UTF_8_lithuanian.o \
stem_UTF_8_nepali.o \
stem_UTF_8_norwegian.o \
stem_UTF_8_porter.o \
stem_UTF_8_portuguese.o \
......@@ -47,6 +54,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_UTF_8_russian.o \
stem_UTF_8_spanish.o \
stem_UTF_8_swedish.o \
stem_UTF_8_tamil.o \
stem_UTF_8_turkish.o
# first column is language name and also name of dictionary for not-all-ASCII
......@@ -54,6 +62,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language
LANGUAGES= \
arabic arabic \
danish danish \
dutch dutch \
english english \
......@@ -61,13 +70,18 @@ LANGUAGES= \
french french \
german german \
hungarian hungarian \
indonesian indonesian \
irish irish \
italian italian \
lithuanian lithuanian \
nepali nepali \
norwegian norwegian \
portuguese portuguese \
romanian romanian \
russian english \
spanish spanish \
swedish swedish \
tamil tamil \
turkish turkish
......
......@@ -4,46 +4,61 @@ Snowball-Based Stemming
=======================
This module uses the word stemming code developed by the Snowball project,
http://snowball.tartarus.org/
http://snowballstem.org (formerly http://snowball.tartarus.org)
which is released by them under a BSD-style license.
The files under src/backend/snowball/libstemmer/ and
src/include/snowball/libstemmer/ are taken directly from their libstemmer_c
distribution, with only some minor adjustments of file inclusions. Note
The Snowball project is not currently making formal releases; it's best
to pull from their git repository
git clone https://github.com/snowballstem/snowball.git
and then building the derived files is as simple as
cd snowball
make
At least on Linux, no platform-specific adjustment is needed.
Postgres' files under src/backend/snowball/libstemmer/ and
src/include/snowball/libstemmer/ are taken directly from the Snowball
files, with only some minor adjustments of file inclusions. Note
that most of these files are in fact derived files, not master source.
The master sources are in the Snowball language, and are available along
with the Snowball-to-C compiler from the Snowball project. We choose to
include the derived files in the PostgreSQL distribution because most
installations will not have the Snowball compiler available.
The master sources are in the Snowball language, and are built using
the Snowball-to-C compiler that is also part of the Snowball project.
We choose to include the derived files in the PostgreSQL distribution
because most installations will not have the Snowball compiler available.
We are currently synced with the Snowball git commit
1964ce688cbeca505263c8f77e16ed923296ce7a
of 2018-06-29.
To update the PostgreSQL sources from a new Snowball libstemmer_c
distribution:
To update the PostgreSQL sources from a new Snowball version:
1. Copy the *.c files in libstemmer_c/src_c/ to src/backend/snowball/libstemmer
0. If you didn't do it already, "make -C snowball".
1. Copy the *.c files in snowball/src_c/ to src/backend/snowball/libstemmer
with replacement of "../runtime/header.h" by "header.h", for example
for f in libstemmer_c/src_c/*.c
for f in .../snowball/src_c/*.c
do
sed 's|\.\./runtime/header\.h|header.h|' $f >libstemmer/`basename $f`
done
(Alternatively, if you rebuild the stemmer files from the master Snowball
sources, just omit "-r ../runtime" from the Snowball compiler switches.)
2. Copy the *.c files in libstemmer_c/runtime/ to
2. Copy the *.c files in snowball/runtime/ to
src/backend/snowball/libstemmer, and edit them to remove direct inclusions
of system headers such as <stdio.h> --- they should only include "header.h".
(This removal avoids portability problems on some platforms where <stdio.h>
is sensitive to largefile compilation options.)
3. Copy the *.h files in libstemmer_c/src_c/ and libstemmer_c/runtime/
3. Copy the *.h files in snowball/src_c/ and snowball/runtime/
to src/include/snowball/libstemmer. At this writing the header files
do not require any changes.
4. Check whether any stemmer modules have been added or removed. If so, edit
the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the
stemmer_modules[] table in dict_snowball.c.
stemmer_modules[] table in dict_snowball.c. You might also need to change
the LANGUAGES list in Makefile.
5. The various stopword files in stopwords/ must be downloaded
individually from pages on the snowball.tartarus.org website.
individually from pages on the snowballstem.org website.
Be careful: these files must be stored in UTF-8 encoding.
......@@ -32,15 +32,18 @@
#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
#include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
#include "snowball/libstemmer/stem_KOI8_R_russian.h"
#include "snowball/libstemmer/stem_UTF_8_arabic.h"
#include "snowball/libstemmer/stem_UTF_8_danish.h"
#include "snowball/libstemmer/stem_UTF_8_dutch.h"
#include "snowball/libstemmer/stem_UTF_8_english.h"
......@@ -48,7 +51,11 @@
#include "snowball/libstemmer/stem_UTF_8_french.h"
#include "snowball/libstemmer/stem_UTF_8_german.h"
#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
#include "snowball/libstemmer/stem_UTF_8_irish.h"
#include "snowball/libstemmer/stem_UTF_8_italian.h"
#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
#include "snowball/libstemmer/stem_UTF_8_nepali.h"
#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
#include "snowball/libstemmer/stem_UTF_8_porter.h"
#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
......@@ -56,6 +63,7 @@
#include "snowball/libstemmer/stem_UTF_8_russian.h"
#include "snowball/libstemmer/stem_UTF_8_spanish.h"
#include "snowball/libstemmer/stem_UTF_8_swedish.h"
#include "snowball/libstemmer/stem_UTF_8_tamil.h"
#include "snowball/libstemmer/stem_UTF_8_turkish.h"
PG_MODULE_MAGIC;
......@@ -74,48 +82,60 @@ typedef struct stemmer_module
int (*stem) (struct SN_env *);
} stemmer_module;
/* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
#define STEMMER_MODULE(name,enc,senc) \
{#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
static const stemmer_module stemmer_modules[] =
{
/*
* Stemmers list from Snowball distribution
*/
{"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
{"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
{"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
{"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
{"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
{"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
{"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
{"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
{"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
{"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
{"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
{"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
{"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
{"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
{"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
{"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
{"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
{"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
{"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
{"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
{"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
{"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
{"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
{"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
{"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
{"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
{"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
{"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
{"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
{"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
{"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
STEMMER_MODULE(danish, PG_UTF8, UTF_8),
STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
STEMMER_MODULE(english, PG_UTF8, UTF_8),
STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
STEMMER_MODULE(french, PG_UTF8, UTF_8),
STEMMER_MODULE(german, PG_UTF8, UTF_8),
STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
STEMMER_MODULE(irish, PG_UTF8, UTF_8),
STEMMER_MODULE(italian, PG_UTF8, UTF_8),
STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
STEMMER_MODULE(porter, PG_UTF8, UTF_8),
STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
STEMMER_MODULE(russian, PG_UTF8, UTF_8),
STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
/*
* Stemmer with PG_SQL_ASCII encoding should be valid for any server
* encoding
*/
{"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
{NULL, 0, NULL, NULL, NULL} /* list end marker */
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#include "header.h"
#define unless(C) if(!(C))
#define CREATE_SIZE 1
extern symbol * create_s(void) {
......@@ -10,7 +8,7 @@ extern symbol * create_s(void) {
if (mem == NULL) return NULL;
p = (symbol *) (HEAD + (char *) mem);
CAPACITY(p) = CREATE_SIZE;
SET_SIZE(p, CREATE_SIZE);
SET_SIZE(p, 0);
return p;
}
......@@ -22,7 +20,7 @@ extern void lose_s(symbol * p) {
/*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
position, or 0 on failure.
position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
......@@ -85,14 +83,14 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
* slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
}
* slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
* slot = (p[--c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
}
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
unless (w) return -1;
if (!w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c += w;
......@@ -104,7 +102,7 @@ extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min,
do {
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
unless (w) return -1;
if (!w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c -= w;
......@@ -116,8 +114,8 @@ extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, i
do {
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
unless (w) return -1;
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
if (!w) return -1;
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return w;
z->c += w;
} while (repeat);
......@@ -128,8 +126,8 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min,
do {
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
unless (w) return -1;
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
if (!w) return -1;
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return w;
z->c -= w;
} while (repeat);
......@@ -167,7 +165,7 @@ extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int
int ch;
if (z->c >= z->l) return -1;
ch = z->p[z->c];
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return 1;
z->c++;
} while (repeat);
......@@ -179,7 +177,7 @@ extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, i
int ch;
if (z->c <= z->lb) return -1;
ch = z->p[z->c - 1];
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return 1;
z->c--;
} while (repeat);
......@@ -366,7 +364,7 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const
if (z->c > c_bra)
z->c = c_bra;
}
unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (adjptr != NULL)
*adjptr = adjustment;
return 0;
......@@ -412,12 +410,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo
}
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
int adjustment;
if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
return -1;
if (bra <= z->bra) z->bra += adjustment;
if (bra <= z->ket) z->ket += adjustment;
return 0;
return insert_s(z, bra, ket, SIZE(p), p);
}
extern symbol * slice_to(struct SN_env * z, symbol * p) {
......@@ -450,6 +443,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) {
return p;
}
/*
 * len_utf8(p) counts the characters in the UTF-8 byte sequence p.
 *
 * SIZE(p) symbols are examined.  Every symbol that is not a UTF-8
 * continuation byte (i.e. not in the range 0x80..0xBF) is counted, so the
 * result is the number of character-starting bytes (lead bytes plus plain
 * ASCII bytes).  Returns 0 when SIZE(p) is 0.
 */
extern int len_utf8(const symbol * p) {
    int remaining;
    int count = 0;
    for (remaining = SIZE(p); remaining != 0; remaining--) {
        symbol b = *p++;
        /* 10xxxxxx bytes continue a character already counted */
        if (b >= 0x80 && b < 0xC0) continue;
        count++;
    }
    return count;
}
#if 0
extern void debug(struct SN_env * z, int number, int line_count) {
int i;
......
This diff is collapsed.
......@@ -718,6 +718,8 @@ struct tsearch_config_match
static const struct tsearch_config_match tsearch_config_languages[] =
{
{"arabic", "ar"},
{"arabic", "Arabic"},
{"danish", "da"},
{"danish", "Danish"},
{"dutch", "nl"},
......@@ -734,8 +736,16 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"german", "German"},
{"hungarian", "hu"},
{"hungarian", "Hungarian"},
{"indonesian", "id"},
{"indonesian", "Indonesian"},
{"irish", "ga"},
{"irish", "Irish"},
{"italian", "it"},
{"italian", "Italian"},
{"lithuanian", "lt"},
{"lithuanian", "Lithuanian"},
{"nepali", "ne"},
{"nepali", "Nepali"},
{"norwegian", "no"},
{"norwegian", "Norwegian"},
{"portuguese", "pt"},
......@@ -747,6 +757,8 @@ static const struct tsearch_config_match tsearch_config_languages[] =
{"spanish", "Spanish"},
{"swedish", "sv"},
{"swedish", "Swedish"},
{"tamil", "ta"},
{"tamil", "Tamil"},
{"turkish", "tr"},
{"turkish", "Turkish"},
{NULL, NULL} /* end marker */
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment