[SCM] WebKit Debian packaging branch, debian/unstable, updated. debian/1.1.15-1-40151-g37bb677

darin darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc
Sat Sep 26 07:52:09 UTC 2009


The following commit has been merged in the debian/unstable branch:
commit a66ca8772f343dbadd8ef6e96852b2fb7ea21408
Author: darin <darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date:   Mon Aug 18 19:23:22 2003 +0000

            Reviewed by Maciej.
    
            - fixed 3381295 -- regular expression matches for text with UTF-16 surrogates will give incorrect results
    
            * kwq/KWQRegExp.mm: (QRegExp::match): Removed local copy of convertCharacterOffsetsToUTF8ByteOffsets
            and its reverse. Instead use convertUTF16OffsetsToUTF8Offsets and its reverse, now available from
            <JavaScriptCore/ustring.h>.
    
            - added test for the URI encoding and decoding functions in JavaScriptCore
    
            * layout-tests/fast/js/global/encode-URI-test-expected.txt: Added.
            * layout-tests/fast/js/global/encode-URI-test.html: Added.
    
    
    git-svn-id: http://svn.webkit.org/repository/webkit/trunk@4838 268f45cc-cd09-0410-ab3c-d52691b4dbfc

diff --git a/LayoutTests/fast/selectors/167-expected.txt b/LayoutTests/fast/js/global/encode-URI-test-expected.txt
similarity index 59%
copy from LayoutTests/fast/selectors/167-expected.txt
copy to LayoutTests/fast/js/global/encode-URI-test-expected.txt
index 5b2d092..2a18c0b 100644
--- a/LayoutTests/fast/selectors/167-expected.txt
+++ b/LayoutTests/fast/js/global/encode-URI-test-expected.txt
@@ -4,5 +4,5 @@ layer at (0,0) size 800x45
   RenderBlock {HTML} at (0,0) size 800x45
     RenderBody {BODY} at (8,14) size 784x17
       RenderBlock {P} at (0,0) size 784x17
-        RenderText {TEXT} at (0,0) size 441x17
-          text run at (0,0) width 441: "The first line of this paragraph should have a green background."
+        RenderText {TEXT} at (0,0) size 206x17
+          text run at (0,0) width 206: "Testing complete. No failures."
diff --git a/LayoutTests/fast/js/global/encode-URI-test.html b/LayoutTests/fast/js/global/encode-URI-test.html
new file mode 100644
index 0000000..317a08e
--- /dev/null
+++ b/LayoutTests/fast/js/global/encode-URI-test.html
@@ -0,0 +1,277 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+
+<head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<title>test of JavaScript URI encoding and decoding methods</title>
+</head>
+
+<body>
+
+<script type="text/javascript">
+
+// --------
+
+// Helper functions.
+
+function hexDigit(number)
+{
+    if (number >= 10)
+        return String.fromCharCode(number + 55);
+    return number;
+}
+
+function printable(s)
+{
+    if (s == "")
+        return "empty string";
+    var p = "";
+    for (var i = 0; i < s.length; i++) {
+        var c = s.charAt(i);
+        var cc = s.charCodeAt(i);
+        if (c == "\\") {
+            p += "\\\\";
+        } else if (c == "\"") {
+            p += "\\\"";
+        } else if (c == "\n") {
+            p += "\\n";
+        } else if (c == "\r") {
+            p += "\\r";
+        } else if (c == "\t") {
+            p += "\\t";
+        } else if (cc >= 20 && cc < 0x7F) {
+            p += c;
+        } else if (cc <= 0xFF) {
+            p += "\\x" + hexDigit(cc >> 4) + hexDigit(cc & 0xF);
+        } else if (cc <= 0xFFFF) {
+            p += "\\u" + hexDigit((cc >> 12) & 0xF) + hexDigit((cc >> 8) & 0xF) + hexDigit((cc >> 4) & 0xF) + hexDigit(cc & 0xF);
+        } else {
+            p += "\\U" + hexDigit((cc >> 28) & 0xF) + hexDigit((cc >> 24) & 0xF) + hexDigit((cc >> 20) & 0xF) + hexDigit((cc >> 16) & 0xF)
+                       + hexDigit((cc >> 12) & 0xF) + hexDigit((cc >> 8) & 0xF) + hexDigit((cc >> 4) & 0xF) + hexDigit(cc & 0xF);
+        }
+    }
+    return "\"" + p + "\"";
+}
+
+function escapedCharacter(c)
+{
+    // UTF-8 is what Gecko does, but not what WinIE 6 does.
+    // It makes much more sense, though, to produce encodings that actually work in URLs.
+    // So for JavaScriptCore, we want to match Gecko on this, WinIE on most other things.
+
+    // Instead of writing a JavaScript implementation of UTF-8 escaping, just do some specific cases here.
+    switch (c) {
+        case    0x80: return "%C2%80";
+        case   0x7FF: return "%DF%BF";
+        case   0x800: return "%E0%A0%80";
+        case  0x2022: return "%E2%80%A2";
+        case  0xD7FF: return "%ED%9F%BF";
+        case  0xD800: return "%ED%A0%80";
+        case  0xE000: return "%EE%80%80";
+        case  0xFFFC: return "%EF%BF%BC";
+        case  0xFFFD: return "%EF%BF%BD";
+        case  0xFFFE: return "%EF%BF%BE";
+        case  0xFFFF: return "%EF%BF%BF";
+        case 0x10000: return "%F0%90%80%80";
+    }
+
+    if (c < 0 || c > 0x7F) {
+        window.alert("escapedCharacter doesn't know how to escape character code " + c);
+        return "?";
+    }
+    
+    return "%" + hexDigit(c >> 4) + hexDigit(c - (c >> 4 << 4));
+}
+
+function forEachCharacterCode(f, s)
+{
+    for (var i = 0; i < s.length; i++) {
+        f(s.charCodeAt(i));
+    }
+}
+
+function call(functionName, parameter)
+{
+    try {
+        result = eval(functionName + "(parameter)");
+    } catch (e) {
+        result = "exception";
+    }
+    return result;
+}
+
+// --------
+
+// Build up tables with expected results.
+
+var expectedResult = new Object;
+
+function addExpectedNonEscaped(f, c)
+{
+    expectedResult[f + "(" + String.fromCharCode(c) + ")"] = String.fromCharCode(c);
+}
+
+function addNoEscape(c)
+{
+    addExpectedNonEscaped("escape", c);
+    addExpectedNonEscaped("encodeURI", c);
+    addExpectedNonEscaped("encodeURIComponent", c);
+}
+
+function addEscapeNoEscape(c)
+{
+    addExpectedNonEscaped("escape", c);
+}
+
+function addURIComponentNoEscape(c)
+{
+    addExpectedNonEscaped("encodeURI", c);
+    addExpectedNonEscaped("encodeURIComponent", c);
+}
+
+function addURINoEscape(c)
+{
+    addExpectedNonEscaped("encodeURI", c);
+    expectedResult["decodeURI(" + escapedCharacter(c) + ")"] = escapedCharacter(c);
+    expectedResult["decodeURI(" + escapedCharacter(c).toLowerCase() + ")"] = escapedCharacter(c).toLowerCase();
+}
+
+forEachCharacterCode(addNoEscape, "*0123456789-.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_");
+forEachCharacterCode(addEscapeNoEscape, "+/");
+forEachCharacterCode(addURINoEscape, "@#$&+,/:;=?");
+forEachCharacterCode(addURIComponentNoEscape, "!'()~");
+
+// WinIE 6's escape function does not escape @, although Gecko's does.
+expectedResult["escape(@)"] = "@";
+
+// --------
+
+// Run tests.
+
+var failureCount = 0;
+
+function test(functionName, parameter, desiredResult)
+{
+    var alternateResult = expectedResult[functionName + "(" + parameter + ")"];
+    if (alternateResult)
+        desiredResult = alternateResult;
+    var result = call(functionName, parameter);
+    if (result != desiredResult) {
+        var s = "called " + functionName + " on " + printable(parameter) + " and got " + printable(result) + " instead of " + printable(desiredResult);
+        document.writeln("<p>called " + functionName + " on " + printable(parameter) + " and got " + printable(result) + " instead of " + printable(desiredResult) + "</p>");
+        failureCount += 1;
+    }
+}
+
+function testEscape(parameter, expected)
+{
+    test("escape", parameter, expected);
+    test("encodeURI", parameter, expected);
+    test("encodeURIComponent", parameter, expected);
+}
+
+function testUnescape(parameter, expected)
+{
+    test("unescape", parameter, expected);
+}
+
+function testDecode(parameter, expected)
+{
+    if (expected == "\uFFFE" || expected == "\uFFFF")
+        expected = "\uFFFD";
+
+    test("decodeURI", parameter, expected);
+    test("decodeURIComponent", parameter, expected);
+}
+
+function testUnescapeAndDecode(parameter, expectedUnescape, expectedDecode)
+{
+    testUnescape(parameter, expectedUnescape);
+    testDecode(parameter, expectedDecode);
+}
+
+function testCharacter(c)
+{
+    var s = String.fromCharCode(c);
+    var escaped = escapedCharacter(c);
+
+    testEscape(s, escaped);
+    testUnescape(escaped, s);
+    testUnescape(escaped.toLowerCase(), s);
+    testDecode(escaped, s);
+    testDecode(escaped.toLowerCase(), s);
+}
+
+for (var c = 0; c <= 128; c++) {
+    testCharacter(c);
+}
+testCharacter(0x7FF);
+testCharacter(0x800);
+testCharacter(0x2022);
+testCharacter(0xD7FF);
+testCharacter(0xE000);
+testCharacter(0xFFFC);
+testCharacter(0xFFFD);
+
+// These tests are currently turned off because it's not yet entirely clear what correct behavior
+// is for these cases. Gecko seems to reject values in the surrogate range entirely, yet turns
+// U+FFFE and U+FFFF into U+FFFD, even though Unicode documentation says to treat both the same.
+// And all the JavaScript engines seem to use UTF-16 in a way that prevents characters greater
+// than U+FFFF (outside the BMP) from working properly.
+
+//testCharacter(0xD800);
+//testCharacter(0xDBFF);
+//testCharacter(0xDC00);
+//testCharacter(0xDFFF);
+//testCharacter(0xFFFE);
+//testCharacter(0xFFFF);
+//testCharacter(0x10000);
+
+testUnescapeAndDecode("%", "%", "exception");
+testUnescapeAndDecode("%0", "%0", "exception");
+testUnescapeAndDecode("%a", "%a", "exception");
+testUnescapeAndDecode("%u", "%u", "exception");
+testUnescapeAndDecode("%xx", "%xx", "exception");
+testUnescapeAndDecode("%u004", "%u004", "exception");
+testUnescapeAndDecode("%u0041", "A", "exception");
+testUnescapeAndDecode("%uxxxx", "%uxxxx", "exception");
+
+testUnescapeAndDecode(String.fromCharCode(0x80), String.fromCharCode(0x80), String.fromCharCode(0x80));
+testUnescapeAndDecode(String.fromCharCode(0xD800), String.fromCharCode(0xD800), String.fromCharCode(0xD800));
+
+testUnescapeAndDecode("%C2%80", String.fromCharCode(0x80), String.fromCharCode(0x80));
+testUnescapeAndDecode("%C2", "%C2", "exception");
+testUnescapeAndDecode("%C2" + String.fromCharCode(0x80), "%C2" + String.fromCharCode(0x80), "exception");
+
+// The characters below have to be literal because String.fromCharCode will make a single character
+// and the \u syntax won't allow us to specify characters with Unicode values higher than U+FFFF.
+// For most JavaScript engines, this will turn into two characters because they use UTF-16
+// instead of Unicode; it's not clear to me at the moment if the standard asks for this UTF-16
+// behavior, forbids it, or doesn't say either way.
+testEscape("𐀀", "%F0%90%80%80");
+testUnescapeAndDecode("%F0%90%80%80", "𐀀", "𐀀");
+testEscape("𦏵", "%F0%A6%8F%B5");
+testUnescapeAndDecode("%F0%A6%8F%B5", "𦏵", "𦏵");
+testEscape("𯿿", "%F0%AF%BF%BF");
+testUnescapeAndDecode("%F0%AF%BF%BF", "𯿿", "𯿿");
+
+// --------
+
+// Summarize.
+
+var failuresMessage;
+if (failureCount) {
+    failuresMessage = failureCount + " tests failed.";
+} else {
+    failuresMessage = "No failures.";
+}
+document.writeln("<p>Testing complete. " + failuresMessage + "</p>");
+
+// --------
+
+</script>
+
+</body>
+
+</html>
diff --git a/WebCore/ChangeLog-2003-10-25 b/WebCore/ChangeLog-2003-10-25
index 7b3b5d2..800cf53 100644
--- a/WebCore/ChangeLog-2003-10-25
+++ b/WebCore/ChangeLog-2003-10-25
@@ -1,3 +1,18 @@
+2003-08-18  Darin Adler  <darin at apple.com>
+
+        Reviewed by Maciej.
+
+        - fixed 3381295 -- regular expression matches for text with UTF-16 surrogates will give incorrect results
+
+        * kwq/KWQRegExp.mm: (QRegExp::match): Removed local copy of convertCharacterOffsetsToUTF8ByteOffsets
+        and its reverse. Instead use convertUTF16OffsetsToUTF8Offsets and its reverse, now available from
+        <JavaScriptCore/ustring.h>.
+
+        - added test for the URI encoding and decoding functions in JavaScriptCore
+
+        * layout-tests/fast/js/global/encode-URI-test-expected.txt: Added.
+        * layout-tests/fast/js/global/encode-URI-test.html: Added.
+
 2003-08-15  Chris Blumenberg  <cblu at apple.com>
 
 	Fixed: <rdar://problem/3380418>: Ignore specified string encoding when constructing file, mailto and help URLs
diff --git a/WebCore/ChangeLog-2005-08-23 b/WebCore/ChangeLog-2005-08-23
index 7b3b5d2..800cf53 100644
--- a/WebCore/ChangeLog-2005-08-23
+++ b/WebCore/ChangeLog-2005-08-23
@@ -1,3 +1,18 @@
+2003-08-18  Darin Adler  <darin at apple.com>
+
+        Reviewed by Maciej.
+
+        - fixed 3381295 -- regular expression matches for text with UTF-16 surrogates will give incorrect results
+
+        * kwq/KWQRegExp.mm: (QRegExp::match): Removed local copy of convertCharacterOffsetsToUTF8ByteOffsets
+        and its reverse. Instead use convertUTF16OffsetsToUTF8Offsets and its reverse, now available from
+        <JavaScriptCore/ustring.h>.
+
+        - added test for the URI encoding and decoding functions in JavaScriptCore
+
+        * layout-tests/fast/js/global/encode-URI-test-expected.txt: Added.
+        * layout-tests/fast/js/global/encode-URI-test.html: Added.
+
 2003-08-15  Chris Blumenberg  <cblu at apple.com>
 
 	Fixed: <rdar://problem/3380418>: Ignore specified string encoding when constructing file, mailto and help URLs
diff --git a/WebCore/kwq/KWQRegExp.mm b/WebCore/kwq/KWQRegExp.mm
index 9532584..26aa384 100644
--- a/WebCore/kwq/KWQRegExp.mm
+++ b/WebCore/kwq/KWQRegExp.mm
@@ -28,106 +28,10 @@
 
 #import <sys/types.h>
 #import <JavaScriptCore/pcre.h>
+#import <JavaScriptCore/ustring.h>
 
-
-// Functions to convert between byte offets and character offsets were
-// lifted from JavaScriptCore/regexp.cpp. It would be nice to share this code.
-struct StringOffset {
-    int offset;
-    int locationInOffsetsArray;
-};
-
-static int compareStringOffsets(const void *a, const void *b)
-{
-    const StringOffset *oa = static_cast<const StringOffset *>(a);
-    const StringOffset *ob = static_cast<const StringOffset *>(b);
-    
-    if (oa->offset < ob->offset) {
-        return -1;
-    }
-    if (oa->offset > ob->offset) {
-        return +1;
-    }
-    return 0;
-}
-
-const int sortedOffsetsFixedBufferSize = 128;
-
-static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
-                                              StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
-{
-    // Allocate the sorted offsets.
-    StringOffset *sortedOffsets;
-    if (numOffsets <= sortedOffsetsFixedBufferSize) {
-        sortedOffsets = sortedOffsetsFixedBuffer;
-    } else {
-        sortedOffsets = new StringOffset [numOffsets];
-    }
-    
-    // Copy offsets.
-    for (int i = 0; i != numOffsets; ++i) {
-        sortedOffsets[i].offset = offsets[i];
-        sortedOffsets[i].locationInOffsetsArray = i;
-    }
-    
-    // Sort them.
-    qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets);
-    
-    return sortedOffsets;
-}
-
-static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
-{
-    // Allocate buffer.
-    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
-    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
-    
-    // Walk through sorted offsets and string, adjusting all the offests.
-    // Offsets that are off the ends of the string map to the edges of the string.
-    int characterOffset = 0;
-    const char *p = s;
-    for (int oi = 0; oi != numOffsets; ++oi) {
-        const int nextOffset = sortedOffsets[oi].offset;
-        while (*p && characterOffset < nextOffset) {
-            // Skip to the next character.
-            ++characterOffset;
-            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
-        }
-        offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
-    }
-    
-    // Free buffer.
-    if (sortedOffsets != fixedBuffer) {
-        delete [] sortedOffsets;
-    }
-}
-
-static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
-{
-    // Allocate buffer.
-    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
-    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
-    
-    // Walk through sorted offsets and string, adjusting all the offests.
-    // Offsets that are off the end of the string map to the edges of the string.
-    int characterOffset = 0;
-    const char *p = s;
-    for (int oi = 0; oi != numOffsets; ++oi) {
-        const int nextOffset = sortedOffsets[oi].offset;
-        while (*p && (p - s) < nextOffset) {
-            // Skip to the next character.
-            ++characterOffset;
-            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
-        }
-        offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset;
-    }
-    
-    // Free buffer.
-    if (sortedOffsets != fixedBuffer) {
-        delete [] sortedOffsets;
-    }
-}
-
+using KJS::convertUTF16OffsetsToUTF8Offsets;
+using KJS::convertUTF8OffsetsToUTF16Offsets;
 
 class QRegExp::KWQRegExpPrivate
 {
@@ -268,7 +172,7 @@ int QRegExp::match(const QString &str, int startFrom, int *matchLength) const
         
     // first 2 offsets are start and end offsets; 3rd entry is used internally by pcre
     int offsets[3];
-    convertCharacterOffsetsToUTF8ByteOffsets(cstring, &startFrom, 1);
+    convertUTF16OffsetsToUTF8Offsets(cstring, &startFrom, 1);
     int result = pcre_exec(d->regex, NULL, cstring, strlen(cstring), startFrom, 
                            startFrom == 0 ? 0 : PCRE_NOTBOL, offsets, 3);
     
@@ -283,7 +187,7 @@ int QRegExp::match(const QString &str, int startFrom, int *matchLength) const
     
     ASSERT(result < 2);
     // 1 means 1 match; 0 means more than one match, first one is recorded in offsets
-    convertUTF8ByteOffsetsToCharacterOffsets(cstring, offsets, 2);
+    convertUTF8OffsetsToUTF16Offsets(cstring, offsets, 2);
     d->lastMatchPos = offsets[0];
     d->lastMatchLength = offsets[1] - offsets[0];
     if (matchLength != NULL) {

-- 
WebKit Debian packaging



More information about the Pkg-webkit-commits mailing list