[xml/sgml-pkgs] Bug#770836: libxml2: please consider adding a patch fixing invalid output

Thorsten Glaser t.glaser at tarent.de
Mon Nov 24 14:22:44 UTC 2014


Source: libxml2
Version: 2.9.2+dfsg1-1
Severity: wishlist
Tags: patch upstream forwarded-upstream
Forwarded: https://bugzilla.gnome.org/show_bug.cgi?id=739574

Hi,

please consider applying the attached patch in subsequent uploads,
at least until upstream has integrated it. It fixes:

• replace several ad-hoc UTF-8 decoders with calls to a single one
  that decodes correctly (it validates the input string length and
  encoding, and rejects non-minimal, i.e. overlong, encodings)

• in several places, check that the decoded values are actually
  permitted in XML documents, which allow only a subset of the
  Unicode codepoints
  ‣ where error handling was already in place, re-use it
  ‣ otherwise silently drop the offending characters, so as not
    to break any existing application

This prevents e.g. a SOAP-WS client written in PHP from sending
invalid XML as SOAP request over the wire for strings containing
e.g. literal backspace characters.

Thanks,
//mirabilos
-- 
tarent solutions GmbH
Rochusstraße 2-4, D-53123 Bonn • http://www.tarent.de/
Tel: +49 228 54881-393 • Fax: +49 228 54881-235
HRB 5168 (AG Bonn) • USt-ID (VAT): DE122264941
Geschäftsführer: Dr. Stefan Barth, Kai Ebenrett, Boris Esser, Alexander Steeg
-------------- next part --------------
diff -Nru libxml2-2.9.2+dfsg1/debian/changelog libxml2-2.9.2+dfsg1/debian/changelog
--- libxml2-2.9.2+dfsg1/debian/changelog	2014-10-26 02:45:27.000000000 +0200
+++ libxml2-2.9.2+dfsg1/debian/changelog	2014-11-24 14:05:49.000000000 +0100
@@ -1,3 +1,10 @@
+libxml2 (2.9.2+dfsg1-1.0tarent1) tarent; urgency=medium
+
+  * Non-maintainer upload.
+  * Add patch fixing XML and UTF-8 character validity of output
+
+ -- Thorsten Glaser <t.glaser at tarent.de>  Mon, 24 Nov 2014 14:05:46 +0100
+
 libxml2 (2.9.2+dfsg1-1) unstable; urgency=low
 
   * New upstream release (Closes: #765722, CVE-2014-3660)
diff -Nru libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch
--- libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch	1970-01-01 01:00:00.000000000 +0100
+++ libxml2-2.9.2+dfsg1/debian/patches/quell-omitting-invalid-XML-chars.patch	2014-11-24 14:05:44.000000000 +0100
@@ -0,0 +1,330 @@
+# DP: Fix emitting invalid XML (things not IS_BYTE_CHAR or not IS_CHAR).
+# DP: Bonus: be correct when decoding UTF-8.
+# DP: Invalid XML or UTF-8 is silently skipped, unless existing code
+# DP: dealt with error conditions already.
+# DP: Bug: https://bugzilla.gnome.org/show_bug.cgi?id=739574
+# DP: Author: mirabilos <t.glaser at tarent.de>
+
+--- a/entities.c
++++ b/entities.c
+@@ -25,6 +25,84 @@
+ #include "save.h"
+ 
+ /*
++ * Strict UTF-8 decoder, for use here and elsewhere. Decodes the single
++ * sequence at the start of buf (reading at most buflen octets) to U+0000..U+FFFD or
++ * U+10000..U+10FFFF, stores it in *wcp if wcp is not NULL, and returns the number of octets used (1 to 4).
++ * On error (empty or truncated input, invalid lead/trail octet, non-minimal or out-of-range value) returns 0 and does not change *wcp.
++ * Use 5 for buflen if buf is guaranteed to be NUL-terminated: a NUL fails the trail-octet test, so no octet after it is read.
++ */
++unsigned int
++xmlInternalUTF8decode(unsigned int *wcp, const void *buf, size_t buflen)
++{
++	unsigned int wc, to, lo;	/* wc: value being assembled; to: payload of the current trail octet; lo: saved lead octet */
++	const unsigned char *src = buf;
++
++	if (buflen < 1)
++		goto xmlInternalUTF8decode_error;
++	wc = *src++;
++	/* ASCII octets (0x00..0x7F, NUL included) decode to themselves with length 1 */
++	if (wc < 0x80)
++		goto xmlInternalUTF8decode_success;
++	/* reject octets 0x80..0xBF (trail octets) and 0xC0/0xC1 (could only yield values below 0x80) as lead, plus anything above 0xF4 */
++	if (wc < 0xC2 || wc > 0xF4)
++		goto xmlInternalUTF8decode_error;
++	/* first trail octet: must exist and lie in 0x80..0xBF; XOR 0x80 maps that range onto its payload 0x00..0x3F */
++	if (buflen < 2)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	to &= 0x3F;	/* has no effect: to is already <= 0x3F after the test above */
++	/* 2-octet sequence (lead 0xC2..0xDF); the lead is kept in lo because wc is overwritten below */
++	if ((lo = wc) < 0xE0) {
++		wc = ((wc & 0x1F) << 6) | to;
++		/* minimal-encoding check; cannot fire here, since leads below 0xC2 were already rejected */
++		if (wc < 0x80)
++			goto xmlInternalUTF8decode_error;
++		goto xmlInternalUTF8decode_success;
++	}
++	/* place lead and first trail payload for a 3-octet (lead < 0xF0) or a 4-octet sequence */
++	if (lo < 0xF0)
++		wc = ((wc & 0x0F) << 12) | (to << 6);
++	else
++		wc = ((wc & 0x07) << 18) | (to << 12);
++
++	/* second trail octet: same existence and 0x80..0xBF test as the first */
++	if (buflen < 3)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	to &= 0x3F;	/* has no effect, as above */
++	/* 3-octet sequence: the second trail octet supplies the low six bits */
++	if (lo < 0xF0) {
++		wc |= to;
++		/* below 0x800 is non-minimal; 0xFFFE/0xFFFF are refused. NOTE(review): U+D800..U+DFFF pass this test - callers in this patch also test IS_CHAR(); confirm that covers surrogates */
++		if (wc < 0x800 || wc > 0xFFFD)
++			goto xmlInternalUTF8decode_error;
++		goto xmlInternalUTF8decode_success;
++	}
++	wc |= (to << 6);
++	/* third trail octet: same existence and 0x80..0xBF test again */
++	if (buflen < 4)
++		goto xmlInternalUTF8decode_error;
++	if ((to = *src++ ^ 0x80) > 0x3F)
++		goto xmlInternalUTF8decode_error;
++	/* 4-octet sequence: the third trail octet supplies the low six bits */
++	wc |= to & 0x3F;
++	/* below 0x10000 is non-minimal; above 0x10FFFF exceeds the upper bound given in the header comment */
++	if (wc < 0x10000 || wc > 0x10FFFF)
++		goto xmlInternalUTF8decode_error;
++
++ xmlInternalUTF8decode_success:
++	if (wcp)
++		*wcp = wc;
++	return ((unsigned int)(src - ((const unsigned char *)buf)));	/* octets consumed so far */
++
++ xmlInternalUTF8decode_error:
++	return (0);
++}
++
++
++/*
+  * The XML predefined entities.
+  */
+ 
+@@ -663,46 +741,35 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc,
+ 		 * We assume we have UTF-8 input.
+ 		 */
+ 		char buf[11], *ptr;
+-		int val = 0, l = 1;
++		unsigned int val, l;
+ 
+-		if (*cur < 0xC0) {
++		if (!(l = xmlInternalUTF8decode(&val, cur, 5))) {
+ 		    xmlEntitiesErr(XML_CHECK_NOT_UTF8,
+ 			    "xmlEncodeEntities: input not UTF-8");
+ 		    if (doc != NULL)
+ 			doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
++		    if (!IS_BYTE_CHAR(*cur)) {
++			/* just skip the offending character */
++			cur++;
++			continue;
++		    }
+ 		    snprintf(buf, sizeof(buf), "&#%d;", *cur);
+ 		    buf[sizeof(buf) - 1] = 0;
+ 		    ptr = buf;
+ 		    while (*ptr != 0) *out++ = *ptr++;
+ 		    cur++;
+ 		    continue;
+-		} else if (*cur < 0xE0) {
+-                    val = (cur[0]) & 0x1F;
+-		    val <<= 6;
+-		    val |= (cur[1]) & 0x3F;
+-		    l = 2;
+-		} else if (*cur < 0xF0) {
+-                    val = (cur[0]) & 0x0F;
+-		    val <<= 6;
+-		    val |= (cur[1]) & 0x3F;
+-		    val <<= 6;
+-		    val |= (cur[2]) & 0x3F;
+-		    l = 3;
+-		} else if (*cur < 0xF8) {
+-                    val = (cur[0]) & 0x07;
+-		    val <<= 6;
+-		    val |= (cur[1]) & 0x3F;
+-		    val <<= 6;
+-		    val |= (cur[2]) & 0x3F;
+-		    val <<= 6;
+-		    val |= (cur[3]) & 0x3F;
+-		    l = 4;
+ 		}
+ 		if ((l == 1) || (!IS_CHAR(val))) {
+ 		    xmlEntitiesErr(XML_ERR_INVALID_CHAR,
+ 			"xmlEncodeEntities: char out of range\n");
+ 		    if (doc != NULL)
+ 			doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
++		    if (!IS_BYTE_CHAR(*cur)) {
++			/* just skip the offending character */
++			cur++;
++			continue;
++		    }
+ 		    snprintf(buf, sizeof(buf), "&#%d;", *cur);
+ 		    buf[sizeof(buf) - 1] = 0;
+ 		    ptr = buf;
+@@ -842,11 +909,17 @@ xmlEncodeSpecialChars(const xmlDoc *doc
+ 	    *out++ = '3';
+ 	    *out++ = ';';
+ 	} else {
+-	    /*
+-	     * Works because on UTF-8, all extended sequences cannot
+-	     * result in bytes in the ASCII range.
+-	     */
+-	    *out++ = *cur;
++	    unsigned int wc, wl;
++
++	    if ((wl = xmlInternalUTF8decode(&wc, cur, 5)) && IS_CHAR(wc)) {
++		/* copy correct UTF-8 sequence */
++		while (wl--)
++			*out++ = *cur++;
++		continue;
++	    }
++	    /* we can still copy it, but only if allowed */
++	    if (IS_BYTE_CHAR(*cur))
++		*out++ = *cur;
+ 	}
+ 	cur++;
+     }
+--- a/include/libxml/parserInternals.h
++++ b/include/libxml/parserInternals.h
+@@ -636,6 +636,9 @@ XMLPUBFUN void XMLCALL
+ XMLPUBFUN void XMLCALL
+ 	xmlErrMemory		(xmlParserCtxtPtr ctxt,
+ 				 const char *extra);
++
++unsigned int xmlInternalUTF8decode(unsigned int *wcp, const void *buf, size_t buflen)
++    __attribute__((__visibility__("hidden")));
+ #endif
+ 
+ #ifdef __cplusplus
+--- a/xmlIO.c
++++ b/xmlIO.c
+@@ -3570,7 +3570,15 @@ xmlEscapeContent(unsigned char* out, int
+ 	    *out++ = '3';
+ 	    *out++ = ';';
+ 	} else {
+-	    *out++ = (unsigned char) *in;
++	    unsigned int wc, wl;
++
++	    if ((wl = xmlInternalUTF8decode(&wc, in, inend - in)) && IS_CHAR(wc)) {
++		if (outend - out < wl) break;
++		/* copy correct UTF-8 sequence */
++		while (wl--)
++			*out++ = *in++;
++		continue;
++	    }
+ 	}
+ 	++in;
+     }
+--- a/xmlsave.c
++++ b/xmlsave.c
+@@ -249,44 +249,19 @@ xmlEscapeEntities(unsigned char* out, in
+ 	    *out++ = *in++;
+ 	    continue;
+ 	} else if (*in >= 0x80) {
++	    unsigned int wc, wl;
++
+ 	    /*
+ 	     * We assume we have UTF-8 input.
+ 	     */
+ 	    if (outend - out < 11) break;
+ 
+-	    if (*in < 0xC0) {
++	    if (!(wl = xmlInternalUTF8decode(&wc, in, inend - in))) {
+ 		xmlSaveErr(XML_SAVE_NOT_UTF8, NULL, NULL);
+ 		in++;
+ 		goto error;
+-	    } else if (*in < 0xE0) {
+-		if (inend - in < 2) break;
+-		val = (in[0]) & 0x1F;
+-		val <<= 6;
+-		val |= (in[1]) & 0x3F;
+-		in += 2;
+-	    } else if (*in < 0xF0) {
+-		if (inend - in < 3) break;
+-		val = (in[0]) & 0x0F;
+-		val <<= 6;
+-		val |= (in[1]) & 0x3F;
+-		val <<= 6;
+-		val |= (in[2]) & 0x3F;
+-		in += 3;
+-	    } else if (*in < 0xF8) {
+-		if (inend - in < 4) break;
+-		val = (in[0]) & 0x07;
+-		val <<= 6;
+-		val |= (in[1]) & 0x3F;
+-		val <<= 6;
+-		val |= (in[2]) & 0x3F;
+-		val <<= 6;
+-		val |= (in[3]) & 0x3F;
+-		in += 4;
+-	    } else {
+-		xmlSaveErr(XML_SAVE_CHAR_INVALID, NULL, NULL);
+-		in++;
+-		goto error;
+ 	    }
++	    val = wc;
+ 	    if (!IS_CHAR(val)) {
+ 		xmlSaveErr(XML_SAVE_CHAR_INVALID, NULL, NULL);
+ 		in++;
+@@ -2103,48 +2078,31 @@ xmlBufAttrSerializeTxtContent(xmlBufPtr
+              * We assume we have UTF-8 content.
+              */
+             unsigned char tmp[12];
+-            int val = 0, l = 1;
++	    unsigned int val, l;
+ 
+             if (base != cur)
+                 xmlBufAdd(buf, base, cur - base);
+-            if (*cur < 0xC0) {
++	    if (!(l = xmlInternalUTF8decode(&val, cur, 5))) {
+                 xmlSaveErr(XML_SAVE_NOT_UTF8, (xmlNodePtr) attr, NULL);
+                 if (doc != NULL)
+                     doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
+-		xmlSerializeHexCharRef(tmp, *cur);
+-                xmlBufAdd(buf, (xmlChar *) tmp, -1);
++		if (IS_BYTE_CHAR(*cur)) {
++			xmlSerializeHexCharRef(tmp, *cur);
++			xmlBufAdd(buf, (xmlChar *) tmp, -1);
++		}
+                 cur++;
+                 base = cur;
+                 continue;
+-            } else if (*cur < 0xE0) {
+-                val = (cur[0]) & 0x1F;
+-                val <<= 6;
+-                val |= (cur[1]) & 0x3F;
+-                l = 2;
+-            } else if (*cur < 0xF0) {
+-                val = (cur[0]) & 0x0F;
+-                val <<= 6;
+-                val |= (cur[1]) & 0x3F;
+-                val <<= 6;
+-                val |= (cur[2]) & 0x3F;
+-                l = 3;
+-            } else if (*cur < 0xF8) {
+-                val = (cur[0]) & 0x07;
+-                val <<= 6;
+-                val |= (cur[1]) & 0x3F;
+-                val <<= 6;
+-                val |= (cur[2]) & 0x3F;
+-                val <<= 6;
+-                val |= (cur[3]) & 0x3F;
+-                l = 4;
+-            }
++	    }
+             if ((l == 1) || (!IS_CHAR(val))) {
+                 xmlSaveErr(XML_SAVE_CHAR_INVALID, (xmlNodePtr) attr, NULL);
+                 if (doc != NULL)
+                     doc->encoding = xmlStrdup(BAD_CAST "ISO-8859-1");
+ 
+-		xmlSerializeHexCharRef(tmp, *cur);
+-                xmlBufAdd(buf, (xmlChar *) tmp, -1);
++		if (IS_BYTE_CHAR(*cur)) {
++			xmlSerializeHexCharRef(tmp, *cur);
++			xmlBufAdd(buf, (xmlChar *) tmp, -1);
++		}
+                 cur++;
+                 base = cur;
+                 continue;
+@@ -2157,6 +2115,11 @@ xmlBufAttrSerializeTxtContent(xmlBufPtr
+             xmlBufAdd(buf, (xmlChar *) tmp, -1);
+             cur += l;
+             base = cur;
++	} else if (!IS_BYTE_CHAR(*cur)) {
++	    if (base != cur)
++		xmlBufAdd(buf, base, cur - base);
++	    cur++;
++	    base = cur;
+         } else {
+             cur++;
+         }
diff -Nru libxml2-2.9.2+dfsg1/debian/patches/series libxml2-2.9.2+dfsg1/debian/patches/series
--- libxml2-2.9.2+dfsg1/debian/patches/series	2014-10-26 01:04:04.000000000 +0200
+++ libxml2-2.9.2+dfsg1/debian/patches/series	2014-11-21 15:36:57.000000000 +0100
@@ -1,2 +1,3 @@
 0001-modify-xml2-config-and-pkgconfig-behaviour.patch
 0002-fix-python-multiarch-includes.patch
+quell-omitting-invalid-XML-chars.patch


More information about the debian-xml-sgml-pkgs mailing list