Commit d4fd7d85 authored by Tom Lane's avatar Tom Lane

Fix text_position to not scan past end of source string in multibyte

case, per report from Korea PostgreSQL Users' Group.  Also do some
cosmetic cleanup in nearby code.
parent a3c969d5
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.109 2003/12/19 04:56:41 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.110 2004/01/31 00:45:21 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -46,7 +46,7 @@ typedef struct varlena unknown; ...@@ -46,7 +46,7 @@ typedef struct varlena unknown;
#define TEXTLEN(textp) \ #define TEXTLEN(textp) \
text_length(PointerGetDatum(textp)) text_length(PointerGetDatum(textp))
#define TEXTPOS(buf_text, from_sub_text) \ #define TEXTPOS(buf_text, from_sub_text) \
text_position(PointerGetDatum(buf_text), PointerGetDatum(from_sub_text), 1) text_position(buf_text, from_sub_text, 1)
#define TEXTDUP(textp) \ #define TEXTDUP(textp) \
DatumGetTextPCopy(PointerGetDatum(textp)) DatumGetTextPCopy(PointerGetDatum(textp))
#define LEFT(buf_text, from_sub_text) \ #define LEFT(buf_text, from_sub_text) \
...@@ -55,12 +55,12 @@ typedef struct varlena unknown; ...@@ -55,12 +55,12 @@ typedef struct varlena unknown;
TEXTPOS(buf_text, from_sub_text) - 1, false) TEXTPOS(buf_text, from_sub_text) - 1, false)
#define RIGHT(buf_text, from_sub_text, from_sub_text_len) \ #define RIGHT(buf_text, from_sub_text, from_sub_text_len) \
text_substring(PointerGetDatum(buf_text), \ text_substring(PointerGetDatum(buf_text), \
TEXTPOS(buf_text, from_sub_text) + from_sub_text_len, \ TEXTPOS(buf_text, from_sub_text) + (from_sub_text_len), \
-1, true) -1, true)
static int text_cmp(text *arg1, text *arg2); static int text_cmp(text *arg1, text *arg2);
static int32 text_length(Datum str); static int32 text_length(Datum str);
static int32 text_position(Datum str, Datum search_str, int matchnum); static int32 text_position(text *t1, text *t2, int matchnum);
static text *text_substring(Datum str, static text *text_substring(Datum str,
int32 start, int32 start,
int32 length, int32 length,
...@@ -403,14 +403,20 @@ unknownsend(PG_FUNCTION_ARGS) ...@@ -403,14 +403,20 @@ unknownsend(PG_FUNCTION_ARGS)
Datum Datum
textlen(PG_FUNCTION_ARGS) textlen(PG_FUNCTION_ARGS)
{ {
PG_RETURN_INT32(text_length(PG_GETARG_DATUM(0))); Datum str = PG_GETARG_DATUM(0);
/* try to avoid decompressing argument */
PG_RETURN_INT32(text_length(str));
} }
/* /*
* text_length - * text_length -
* Does the real work for textlen() * Does the real work for textlen()
*
* This is broken out so it can be called directly by other string processing * This is broken out so it can be called directly by other string processing
* functions. * functions. Note that the argument is passed as a Datum, to indicate that
* it may still be in compressed form. We can avoid decompressing it at all
* in some cases.
*/ */
static int32 static int32
text_length(Datum str) text_length(Datum str)
...@@ -418,20 +424,13 @@ text_length(Datum str) ...@@ -418,20 +424,13 @@ text_length(Datum str)
/* fastpath when max encoding length is one */ /* fastpath when max encoding length is one */
if (pg_database_encoding_max_length() == 1) if (pg_database_encoding_max_length() == 1)
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
else
if (pg_database_encoding_max_length() > 1)
{ {
text *t = DatumGetTextP(str); text *t = DatumGetTextP(str);
PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA(t), PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA(t),
VARSIZE(t) - VARHDRSZ)); VARSIZE(t) - VARHDRSZ));
} }
/* should never get here */
elog(ERROR, "invalid backend encoding: encoding max length < 1");
/* not reached: suppress compiler warning */
return 0;
} }
/* /*
...@@ -442,7 +441,10 @@ text_length(Datum str) ...@@ -442,7 +441,10 @@ text_length(Datum str)
Datum Datum
textoctetlen(PG_FUNCTION_ARGS) textoctetlen(PG_FUNCTION_ARGS)
{ {
PG_RETURN_INT32(toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ); Datum str = PG_GETARG_DATUM(0);
/* We need not detoast the input at all */
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
} }
/* /*
...@@ -504,9 +506,6 @@ textcat(PG_FUNCTION_ARGS) ...@@ -504,9 +506,6 @@ textcat(PG_FUNCTION_ARGS)
* adjusting the length to be consistent with the "negative start" per SQL92. * adjusting the length to be consistent with the "negative start" per SQL92.
* If the length is less than zero, return the remaining string. * If the length is less than zero, return the remaining string.
* *
* Note that the arguments operate on octet length,
* so not aware of multibyte character sets.
*
* Added multibyte support. * Added multibyte support.
* - Tatsuo Ishii 1998-4-21 * - Tatsuo Ishii 1998-4-21
* Changed behavior if starting position is less than one to conform to SQL92 behavior. * Changed behavior if starting position is less than one to conform to SQL92 behavior.
...@@ -545,8 +544,11 @@ text_substr_no_len(PG_FUNCTION_ARGS) ...@@ -545,8 +544,11 @@ text_substr_no_len(PG_FUNCTION_ARGS)
/* /*
* text_substring - * text_substring -
* Does the real work for text_substr() and text_substr_no_len() * Does the real work for text_substr() and text_substr_no_len()
*
* This is broken out so it can be called directly by other string processing * This is broken out so it can be called directly by other string processing
* functions. * functions. Note that the argument is passed as a Datum, to indicate that
* it may still be in compressed/toasted form. We can avoid detoasting all
* of it in some cases.
*/ */
static text * static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified) text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
...@@ -717,7 +719,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) ...@@ -717,7 +719,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
elog(ERROR, "invalid backend encoding: encoding max length < 1"); elog(ERROR, "invalid backend encoding: encoding max length < 1");
/* not reached: suppress compiler warning */ /* not reached: suppress compiler warning */
return PG_STR_GET_TEXT(""); return NULL;
} }
/* /*
...@@ -730,51 +732,61 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) ...@@ -730,51 +732,61 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
Datum Datum
textpos(PG_FUNCTION_ARGS) textpos(PG_FUNCTION_ARGS)
{ {
PG_RETURN_INT32(text_position(PG_GETARG_DATUM(0), PG_GETARG_DATUM(1), 1)); text *str = PG_GETARG_TEXT_P(0);
text *search_str = PG_GETARG_TEXT_P(1);
PG_RETURN_INT32(text_position(str, search_str, 1));
} }
/* /*
* text_position - * text_position -
* Does the real work for textpos() * Does the real work for textpos()
*
* Inputs:
* t1 - string to be searched
* t2 - pattern to match within t1
* matchnum - number of the match to be found (1 is the first match)
* Result:
* Character index of the first matched char, starting from 1,
* or 0 if no match.
*
* This is broken out so it can be called directly by other string processing * This is broken out so it can be called directly by other string processing
* functions. * functions.
*/ */
static int32 static int32
text_position(Datum str, Datum search_str, int matchnum) text_position(text *t1, text *t2, int matchnum)
{ {
int eml = pg_database_encoding_max_length();
text *t1 = DatumGetTextP(str);
text *t2 = DatumGetTextP(search_str);
int match = 0, int match = 0,
pos = 0, pos = 0,
p = 0, p,
px, px,
len1, len1,
len2; len2;
if (matchnum == 0) if (matchnum <= 0)
return 0; /* result for 0th match */ return 0; /* result for 0th match */
if (VARSIZE(t2) <= VARHDRSZ) if (VARSIZE(t2) <= VARHDRSZ)
PG_RETURN_INT32(1); /* result for empty pattern */ return 1; /* result for empty pattern */
len1 = (VARSIZE(t1) - VARHDRSZ); len1 = (VARSIZE(t1) - VARHDRSZ);
len2 = (VARSIZE(t2) - VARHDRSZ); len2 = (VARSIZE(t2) - VARHDRSZ);
/* no use in searching str past point where search_str will fit */ if (pg_database_encoding_max_length() == 1)
px = (len1 - len2);
if (eml == 1) /* simple case - single byte encoding */
{ {
/* simple case - single byte encoding */
char *p1, char *p1,
*p2; *p2;
p1 = VARDATA(t1); p1 = VARDATA(t1);
p2 = VARDATA(t2); p2 = VARDATA(t2);
/* no use in searching str past point where search_str will fit */
px = (len1 - len2);
for (p = 0; p <= px; p++) for (p = 0; p <= px; p++)
{ {
if ((*p2 == *p1) && (strncmp(p1, p2, len2) == 0)) if ((*p1 == *p2) && (strncmp(p1, p2, len2) == 0))
{ {
if (++match == matchnum) if (++match == matchnum)
{ {
...@@ -785,8 +797,9 @@ text_position(Datum str, Datum search_str, int matchnum) ...@@ -785,8 +797,9 @@ text_position(Datum str, Datum search_str, int matchnum)
p1++; p1++;
} }
} }
else if (eml > 1) /* not as simple - multibyte encoding */ else
{ {
/* not as simple - multibyte encoding */
pg_wchar *p1, pg_wchar *p1,
*p2, *p2,
*ps1, *ps1,
...@@ -799,9 +812,12 @@ text_position(Datum str, Datum search_str, int matchnum) ...@@ -799,9 +812,12 @@ text_position(Datum str, Datum search_str, int matchnum)
(void) pg_mb2wchar_with_len((unsigned char *) VARDATA(t2), p2, len2); (void) pg_mb2wchar_with_len((unsigned char *) VARDATA(t2), p2, len2);
len2 = pg_wchar_strlen(p2); len2 = pg_wchar_strlen(p2);
/* no use in searching str past point where search_str will fit */
px = (len1 - len2);
for (p = 0; p <= px; p++) for (p = 0; p <= px; p++)
{ {
if ((*p2 == *p1) && (pg_wchar_strncmp(p1, p2, len2) == 0)) if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, len2) == 0))
{ {
if (++match == matchnum) if (++match == matchnum)
{ {
...@@ -815,10 +831,8 @@ text_position(Datum str, Datum search_str, int matchnum) ...@@ -815,10 +831,8 @@ text_position(Datum str, Datum search_str, int matchnum)
pfree(ps1); pfree(ps1);
pfree(ps2); pfree(ps2);
} }
else
elog(ERROR, "invalid backend encoding: encoding max length < 1");
PG_RETURN_INT32(pos); return pos;
} }
/* varstr_cmp() /* varstr_cmp()
...@@ -1199,7 +1213,10 @@ bttext_pattern_cmp(PG_FUNCTION_ARGS) ...@@ -1199,7 +1213,10 @@ bttext_pattern_cmp(PG_FUNCTION_ARGS)
Datum Datum
byteaoctetlen(PG_FUNCTION_ARGS) byteaoctetlen(PG_FUNCTION_ARGS)
{ {
PG_RETURN_INT32(toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ); Datum str = PG_GETARG_DATUM(0);
/* We need not detoast the input at all */
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
} }
/* /*
...@@ -1925,17 +1942,17 @@ byteacmp(PG_FUNCTION_ARGS) ...@@ -1925,17 +1942,17 @@ byteacmp(PG_FUNCTION_ARGS)
Datum Datum
replace_text(PG_FUNCTION_ARGS) replace_text(PG_FUNCTION_ARGS)
{ {
text *src_text = PG_GETARG_TEXT_P(0);
text *from_sub_text = PG_GETARG_TEXT_P(1);
text *to_sub_text = PG_GETARG_TEXT_P(2);
int src_text_len = TEXTLEN(src_text);
int from_sub_text_len = TEXTLEN(from_sub_text);
char *to_sub_str = PG_TEXT_GET_STR(to_sub_text);
text *left_text; text *left_text;
text *right_text; text *right_text;
text *buf_text; text *buf_text;
text *ret_text; text *ret_text;
int curr_posn; int curr_posn;
text *src_text = PG_GETARG_TEXT_P(0);
int src_text_len = TEXTLEN(src_text);
text *from_sub_text = PG_GETARG_TEXT_P(1);
int from_sub_text_len = TEXTLEN(from_sub_text);
text *to_sub_text = PG_GETARG_TEXT_P(2);
char *to_sub_str = PG_TEXT_GET_STR(to_sub_text);
StringInfo str = makeStringInfo(); StringInfo str = makeStringInfo();
if (src_text_len == 0 || from_sub_text_len == 0) if (src_text_len == 0 || from_sub_text_len == 0)
...@@ -1978,14 +1995,20 @@ Datum ...@@ -1978,14 +1995,20 @@ Datum
split_text(PG_FUNCTION_ARGS) split_text(PG_FUNCTION_ARGS)
{ {
text *inputstring = PG_GETARG_TEXT_P(0); text *inputstring = PG_GETARG_TEXT_P(0);
int inputstring_len = TEXTLEN(inputstring);
text *fldsep = PG_GETARG_TEXT_P(1); text *fldsep = PG_GETARG_TEXT_P(1);
int fldsep_len = TEXTLEN(fldsep);
int fldnum = PG_GETARG_INT32(2); int fldnum = PG_GETARG_INT32(2);
int start_posn = 0; int inputstring_len = TEXTLEN(inputstring);
int end_posn = 0; int fldsep_len = TEXTLEN(fldsep);
int start_posn;
int end_posn;
text *result_text; text *result_text;
/* field number is 1 based */
if (fldnum < 1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("field position must be greater than zero")));
/* return empty string for empty input string */ /* return empty string for empty input string */
if (inputstring_len < 1) if (inputstring_len < 1)
PG_RETURN_TEXT_P(PG_STR_GET_TEXT("")); PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
...@@ -1993,52 +2016,45 @@ split_text(PG_FUNCTION_ARGS) ...@@ -1993,52 +2016,45 @@ split_text(PG_FUNCTION_ARGS)
/* empty field separator */ /* empty field separator */
if (fldsep_len < 1) if (fldsep_len < 1)
{ {
if (fldnum == 1) /* first field - just return the input /* if first field, return input string, else empty string */
* string */ if (fldnum == 1)
PG_RETURN_TEXT_P(inputstring); PG_RETURN_TEXT_P(inputstring);
else else
/* otherwise return an empty string */
PG_RETURN_TEXT_P(PG_STR_GET_TEXT("")); PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
} }
/* field number is 1 based */ start_posn = text_position(inputstring, fldsep, fldnum - 1);
if (fldnum < 1) end_posn = text_position(inputstring, fldsep, fldnum);
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("field position must be greater than zero")));
start_posn = text_position(PointerGetDatum(inputstring),
PointerGetDatum(fldsep),
fldnum - 1);
end_posn = text_position(PointerGetDatum(inputstring),
PointerGetDatum(fldsep),
fldnum);
if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */ if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */
{ {
if (fldnum == 1) /* first field - just return the input /* if first field, return input string, else empty string */
* string */ if (fldnum == 1)
PG_RETURN_TEXT_P(inputstring); PG_RETURN_TEXT_P(inputstring);
else else
/* otherwise return an empty string */
PG_RETURN_TEXT_P(PG_STR_GET_TEXT("")); PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
} }
else if ((start_posn != 0) && (end_posn == 0)) else if (start_posn == 0)
{ {
/* last field requested */ /* first field requested */
result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, -1, true); result_text = LEFT(inputstring, fldsep);
PG_RETURN_TEXT_P(result_text); PG_RETURN_TEXT_P(result_text);
} }
else if ((start_posn == 0) && (end_posn != 0)) else if (end_posn == 0)
{ {
/* first field requested */ /* last field requested */
result_text = LEFT(inputstring, fldsep); result_text = text_substring(PointerGetDatum(inputstring),
start_posn + fldsep_len,
-1, true);
PG_RETURN_TEXT_P(result_text); PG_RETURN_TEXT_P(result_text);
} }
else else
{ {
/* prior to last field requested */ /* interior field requested */
result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, end_posn - start_posn - fldsep_len, false); result_text = text_substring(PointerGetDatum(inputstring),
start_posn + fldsep_len,
end_posn - start_posn - fldsep_len,
false);
PG_RETURN_TEXT_P(result_text); PG_RETURN_TEXT_P(result_text);
} }
} }
...@@ -2053,15 +2069,14 @@ Datum ...@@ -2053,15 +2069,14 @@ Datum
text_to_array(PG_FUNCTION_ARGS) text_to_array(PG_FUNCTION_ARGS)
{ {
text *inputstring = PG_GETARG_TEXT_P(0); text *inputstring = PG_GETARG_TEXT_P(0);
int inputstring_len = TEXTLEN(inputstring);
text *fldsep = PG_GETARG_TEXT_P(1); text *fldsep = PG_GETARG_TEXT_P(1);
int inputstring_len = TEXTLEN(inputstring);
int fldsep_len = TEXTLEN(fldsep); int fldsep_len = TEXTLEN(fldsep);
int fldnum; int fldnum;
int start_posn = 0; int start_posn;
int end_posn = 0; int end_posn;
text *result_text = NULL; text *result_text;
ArrayBuildState *astate = NULL; ArrayBuildState *astate = NULL;
MemoryContext oldcontext = CurrentMemoryContext;
/* return NULL for empty input string */ /* return NULL for empty input string */
if (inputstring_len < 1) if (inputstring_len < 1)
...@@ -2083,9 +2098,7 @@ text_to_array(PG_FUNCTION_ARGS) ...@@ -2083,9 +2098,7 @@ text_to_array(PG_FUNCTION_ARGS)
bool disnull = false; bool disnull = false;
start_posn = end_posn; start_posn = end_posn;
end_posn = text_position(PointerGetDatum(inputstring), end_posn = text_position(inputstring, fldsep, fldnum);
PointerGetDatum(fldsep),
fldnum);
if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */ if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */
{ {
...@@ -2101,30 +2114,36 @@ text_to_array(PG_FUNCTION_ARGS) ...@@ -2101,30 +2114,36 @@ text_to_array(PG_FUNCTION_ARGS)
else else
{ {
/* otherwise create array and exit */ /* otherwise create array and exit */
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, oldcontext)); PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
CurrentMemoryContext));
} }
} }
else if ((start_posn != 0) && (end_posn == 0)) else if (start_posn == 0)
{
/* last field requested */
result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, -1, true);
}
else if ((start_posn == 0) && (end_posn != 0))
{ {
/* first field requested */ /* first field requested */
result_text = LEFT(inputstring, fldsep); result_text = LEFT(inputstring, fldsep);
} }
else if (end_posn == 0)
{
/* last field requested */
result_text = text_substring(PointerGetDatum(inputstring),
start_posn + fldsep_len,
-1, true);
}
else else
{ {
/* prior to last field requested */ /* interior field requested */
result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, end_posn - start_posn - fldsep_len, false); result_text = text_substring(PointerGetDatum(inputstring),
start_posn + fldsep_len,
end_posn - start_posn - fldsep_len,
false);
} }
/* stash away current value */ /* stash away current value */
dvalue = PointerGetDatum(result_text); dvalue = PointerGetDatum(result_text);
astate = accumArrayResult(astate, dvalue, astate = accumArrayResult(astate, dvalue,
disnull, TEXTOID, oldcontext); disnull, TEXTOID,
CurrentMemoryContext);
} }
/* never reached -- keep compiler quiet */ /* never reached -- keep compiler quiet */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment