Commit 2918fced authored by Noah Misch

Ignore XML declaration in xpath_internal(), for UTF8 databases.

When a value contained an XML declaration naming some other encoding,
this function interpreted UTF8 bytes as the named encoding, yielding
mojibake.  xml_parse() already has similar logic.  This would be
necessary but not sufficient for non-UTF8 databases, so preserve
behavior there until the xpath facility can support such databases
comprehensively.  Back-patch to 9.3 (all supported versions).

Pavel Stehule and Noah Misch

Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com
parent 5edc63bd
...@@ -3845,6 +3845,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, ...@@ -3845,6 +3845,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
int32 xpath_len; int32 xpath_len;
xmlChar *string; xmlChar *string;
xmlChar *xpath_expr; xmlChar *xpath_expr;
size_t xmldecl_len = 0;
int i; int i;
int ndim; int ndim;
Datum *ns_names_uris; Datum *ns_names_uris;
...@@ -3900,6 +3901,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, ...@@ -3900,6 +3901,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
string = pg_xmlCharStrndup(datastr, len); string = pg_xmlCharStrndup(datastr, len);
xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len); xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len);
/*
* In a UTF8 database, skip any xml declaration, which might assert
* another encoding. Ignore parse_xml_decl() failure, letting
* xmlCtxtReadMemory() report parse errors. Documentation disclaims
* xpath() support for non-ASCII data in non-UTF8 databases, so leave
* those scenarios bug-compatible with historical behavior.
*/
if (GetDatabaseEncoding() == PG_UTF8)
parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL);
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
PG_TRY(); PG_TRY();
...@@ -3914,7 +3925,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, ...@@ -3914,7 +3925,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
if (ctxt == NULL || xmlerrcxt->err_occurred) if (ctxt == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context"); "could not allocate parser context");
doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
len - xmldecl_len, NULL, NULL, 0);
if (doc == NULL || xmlerrcxt->err_occurred) if (doc == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document"); "could not parse XML document");
......
...@@ -670,6 +670,37 @@ SELECT xpath('/nosuchtag', '<root/>'); ...@@ -670,6 +670,37 @@ SELECT xpath('/nosuchtag', '<root/>');
{} {}
(1 row) (1 row)
-- Round-trip non-ASCII data through xpath().
DO $$
DECLARE
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
degree_symbol text;
res xml[];
BEGIN
-- Per the documentation, xpath() doesn't work on non-ASCII data when
-- the server encoding is not UTF8. The EXCEPTION block below,
-- currently dead code, will be relevant if we remove this limitation.
IF current_setting('server_encoding') <> 'UTF8' THEN
RAISE LOG 'skip: encoding % unsupported for xml',
current_setting('server_encoding');
RETURN;
END IF;
degree_symbol := convert_from('\xc2b0', 'UTF8');
res := xpath('text()', (xml_declaration ||
'<x>' || degree_symbol || '</x>')::xml);
IF degree_symbol <> res[1]::text THEN
RAISE 'expected % (%), got % (%)',
degree_symbol, convert_to(degree_symbol, 'UTF8'),
res[1], convert_to(res[1]::text, 'UTF8');
END IF;
EXCEPTION
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
END
$$;
-- Test xmlexists and xpath_exists -- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
xmlexists xmlexists
......
...@@ -576,6 +576,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>'); ...@@ -576,6 +576,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>');
^ ^
DETAIL: This functionality requires the server to be built with libxml support. DETAIL: This functionality requires the server to be built with libxml support.
HINT: You need to rebuild PostgreSQL using --with-libxml. HINT: You need to rebuild PostgreSQL using --with-libxml.
-- Round-trip non-ASCII data through xpath().
DO $$
DECLARE
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
degree_symbol text;
res xml[];
BEGIN
-- Per the documentation, xpath() doesn't work on non-ASCII data when
-- the server encoding is not UTF8. The EXCEPTION block below,
-- currently dead code, will be relevant if we remove this limitation.
IF current_setting('server_encoding') <> 'UTF8' THEN
RAISE LOG 'skip: encoding % unsupported for xml',
current_setting('server_encoding');
RETURN;
END IF;
degree_symbol := convert_from('\xc2b0', 'UTF8');
res := xpath('text()', (xml_declaration ||
'<x>' || degree_symbol || '</x>')::xml);
IF degree_symbol <> res[1]::text THEN
RAISE 'expected % (%), got % (%)',
degree_symbol, convert_to(degree_symbol, 'UTF8'),
res[1], convert_to(res[1]::text, 'UTF8');
END IF;
EXCEPTION
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
END
$$;
ERROR: unsupported XML feature
DETAIL: This functionality requires the server to be built with libxml support.
HINT: You need to rebuild PostgreSQL using --with-libxml.
CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment
-- Test xmlexists and xpath_exists -- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
ERROR: unsupported XML feature ERROR: unsupported XML feature
......
...@@ -650,6 +650,37 @@ SELECT xpath('/nosuchtag', '<root/>'); ...@@ -650,6 +650,37 @@ SELECT xpath('/nosuchtag', '<root/>');
{} {}
(1 row) (1 row)
-- Round-trip non-ASCII data through xpath().
DO $$
DECLARE
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
degree_symbol text;
res xml[];
BEGIN
-- Per the documentation, xpath() doesn't work on non-ASCII data when
-- the server encoding is not UTF8. The EXCEPTION block below,
-- currently dead code, will be relevant if we remove this limitation.
IF current_setting('server_encoding') <> 'UTF8' THEN
RAISE LOG 'skip: encoding % unsupported for xml',
current_setting('server_encoding');
RETURN;
END IF;
degree_symbol := convert_from('\xc2b0', 'UTF8');
res := xpath('text()', (xml_declaration ||
'<x>' || degree_symbol || '</x>')::xml);
IF degree_symbol <> res[1]::text THEN
RAISE 'expected % (%), got % (%)',
degree_symbol, convert_to(degree_symbol, 'UTF8'),
res[1], convert_to(res[1]::text, 'UTF8');
END IF;
EXCEPTION
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
END
$$;
-- Test xmlexists and xpath_exists -- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
xmlexists xmlexists
......
...@@ -189,6 +189,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>'); ...@@ -189,6 +189,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>');
SELECT xpath('name(/*)', '<root><sub/><sub/></root>'); SELECT xpath('name(/*)', '<root><sub/><sub/></root>');
SELECT xpath('/nosuchtag', '<root/>'); SELECT xpath('/nosuchtag', '<root/>');
-- Round-trip non-ASCII data through xpath().
DO $$
DECLARE
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
degree_symbol text;
res xml[];
BEGIN
-- Per the documentation, xpath() doesn't work on non-ASCII data when
-- the server encoding is not UTF8. The EXCEPTION block below,
-- currently dead code, will be relevant if we remove this limitation.
IF current_setting('server_encoding') <> 'UTF8' THEN
RAISE LOG 'skip: encoding % unsupported for xml',
current_setting('server_encoding');
RETURN;
END IF;
degree_symbol := convert_from('\xc2b0', 'UTF8');
res := xpath('text()', (xml_declaration ||
'<x>' || degree_symbol || '</x>')::xml);
IF degree_symbol <> res[1]::text THEN
RAISE 'expected % (%), got % (%)',
degree_symbol, convert_to(degree_symbol, 'UTF8'),
res[1], convert_to(res[1]::text, 'UTF8');
END IF;
EXCEPTION
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
END
$$;
-- Test xmlexists and xpath_exists -- Test xmlexists and xpath_exists
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment