[SCM] WebKit Debian packaging branch, debian/unstable, updated. debian/1.1.15-1-40151-g37bb677

Sat Sep 26 07:40:57 UTC 2009

The following commit has been merged in the debian/unstable branch:
commit 981d75e5c86565faa844efe4b81c06a1a4713274
Author: darin <darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date:   Thu May 8 14:57:38 2003 +0000

            Reviewed by Ken.
    
    	- fixed 3174769 -- Safari doesn't ignore byte order mark at start of UTF-8 HTML document
    
            * khtml/misc/decoder.cpp: (Decoder::decode): Added handling of the UTF-8 BOM
            to the code that already handled the UTF-16 BOM.
    
    
    git-svn-id: http://svn.webkit.org/repository/webkit/trunk@4306 268f45cc-cd09-0410-ab3c-d52691b4dbfc

diff --git a/WebCore/ChangeLog-2003-10-25 b/WebCore/ChangeLog-2003-10-25
index 26ee8e5..5042daa 100644
--- a/WebCore/ChangeLog-2003-10-25
+++ b/WebCore/ChangeLog-2003-10-25
@@ -1,3 +1,12 @@
+2003-05-08  Darin Adler  <darin at apple.com>
+
+        Reviewed by Ken.
+
+	- fixed 3174769 -- Safari doesn't ignore byte order mark at start of UTF-8 HTML document
+
+        * khtml/misc/decoder.cpp: (Decoder::decode): Added handling of the UTF-8 BOM
+        to the code that already handled the UTF-16 BOM.
+
 2003-05-06  David Hyatt  <hyatt at apple.com>
 
 	The purpose of this patch is to unify XML and HTML documents' root
diff --git a/WebCore/ChangeLog-2005-08-23 b/WebCore/ChangeLog-2005-08-23
index 26ee8e5..5042daa 100644
--- a/WebCore/ChangeLog-2005-08-23
+++ b/WebCore/ChangeLog-2005-08-23
@@ -1,3 +1,12 @@
+2003-05-08  Darin Adler  <darin at apple.com>
+
+        Reviewed by Ken.
+
+	- fixed 3174769 -- Safari doesn't ignore byte order mark at start of UTF-8 HTML document
+
+        * khtml/misc/decoder.cpp: (Decoder::decode): Added handling of the UTF-8 BOM
+        to the code that already handled the UTF-16 BOM.
+
 2003-05-06  David Hyatt  <hyatt at apple.com>
 
 	The purpose of this patch is to unify XML and HTML documents' root
diff --git a/WebCore/khtml/misc/decoder.cpp b/WebCore/khtml/misc/decoder.cpp
index 493f2f8..c588ed5 100644
--- a/WebCore/khtml/misc/decoder.cpp
+++ b/WebCore/khtml/misc/decoder.cpp
@@ -366,22 +366,46 @@ static void skipComment(const char *&ptr, const char *pEnd)
 
 QString Decoder::decode(const char *data, int len)
 {
-    // Check for BOM mark at the beginning, which is a sure sign of some kind of 16-bit Unicode.
-    if (beginning && buffer.length() + len >= 2) {
+    // Check for a UTF-16 or UTF-8 BOM at the beginning, which is a sure sign of a Unicode encoding.
+    int bufferLength = buffer.length();
+    const int UTF8BOMLength = 3;
+    const int maximumBOMLength = 3;
+    if (beginning && bufferLength + len >= maximumBOMLength) {
         if (m_type != UserChosenEncoding) {
+            // Extract the first three bytes.
+            // Handle the case where some of the bytes are already in the buffer.
+            // The last byte is always guaranteed to not be in the buffer.
             const uchar *udata = (const uchar *)data;
-            uchar c1;
-            if (buffer.length() != 0) {
-                assert(buffer.length() == 1);
-                c1 = buffer[0];
+            uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;
+            uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;
+            assert(bufferLength < 3);
+            uchar c3 = *udata;
+
+            // Check for the BOM.
+            const char *autoDetectedEncoding;
+            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
+                autoDetectedEncoding = "ISO-10646-UCS-2";
+
+                // Leave the BOM in place, because the decoder knows how to
+                // discard it, and it uses it to figure out byte ordering.
+            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
+                autoDetectedEncoding = "UTF-8";
+
+                // Consume the three-byte UTF-8 BOM, so that the decoder does not have to.
+                buffer.truncate(0);
+                int bytesInData = UTF8BOMLength - bufferLength;
+                len -= bytesInData;
+                data += bytesInData;
             } else {
-                c1 = *udata++;
+                autoDetectedEncoding = 0;
             }
-            uchar c2 = *udata;
-            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
-                enc = "ISO-10646-UCS-2";
+
+            // If we found a BOM, use the encoding it implies.
+            if (autoDetectedEncoding != 0) {
                 m_type = AutoDetectedEncoding;
-                m_codec = QTextCodec::codecForName(enc);
+                m_codec = QTextCodec::codecForName(autoDetectedEncoding);
+                assert(m_codec);
+                enc = m_codec->name();
                 delete m_decoder;
                 m_decoder = m_codec->makeDecoder();
             }
@@ -533,7 +557,7 @@ QString Decoder::decode(const char *data, int len)
  found:
 #if APPLE_CHANGES
     // Do the auto-detect if our default encoding is one of the Japanese ones.
-    if (m_type != UserChosenEncoding && m_codec && m_codec->isJapanese())
+    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_codec && m_codec->isJapanese())
 #else
     if (m_type == DefaultEncoding && KGlobal::locale()->languageList()[0] == "ja")
 #endif
@@ -560,7 +584,7 @@ QString Decoder::decode(const char *data, int len)
 	kdDebug( 6005 ) << "Decoder: auto detect encoding is "
             << (autoDetectedEncoding ? autoDetectedEncoding : "NULL") << endl;
 #endif
-	if (autoDetectedEncoding != NULL) {
+	if (autoDetectedEncoding != 0) {
 	    setEncoding(autoDetectedEncoding, AutoDetectedEncoding);
 	}
     }

-- 
WebKit Debian packaging