[SCM] WebKit Debian packaging branch, debian/unstable, updated. debian/1.1.15-1-40151-g37bb677

Sat Sep 26 07:44:07 UTC 2009

The following commit has been merged in the debian/unstable branch:
commit 971484a9053ac330eb6abc4a68552b40c924dcf9
Author: darin <darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date:   Thu Jun 5 00:11:22 2003 +0000

            Reviewed by Dave.
    
    	- fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
    
            Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
    
            * kjs/regexp.h: Redo field names, remove some unused stuff.
            * kjs/regexp.cpp:
            (convertToUTF8): Added.
            (compareStringOffsets): Added.
            (createSortedOffsetsArray): Added.
            (convertCharacterOffsetsToUTF8ByteOffsets): Added.
            (convertUTF8ByteOffsetsToCharacterOffsets): Added.
            (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
            using ascii() on it.
            (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
            (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
            from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
            correct position and matched string.
    
            * JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
    
    
    git-svn-id: http://svn.webkit.org/repository/webkit/trunk@4482 268f45cc-cd09-0410-ab3c-d52691b4dbfc

diff --git a/JavaScriptCore/ChangeLog b/JavaScriptCore/ChangeLog
index 6f01a9a..34b17c6 100644
--- a/JavaScriptCore/ChangeLog
+++ b/JavaScriptCore/ChangeLog
@@ -1,3 +1,27 @@
+2003-06-04  Darin Adler  <darin at apple.com>
+
+        Reviewed by Dave.
+
+	- fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
+
+        Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
+
+        * kjs/regexp.h: Redo field names, remove some unused stuff.
+        * kjs/regexp.cpp:
+        (convertToUTF8): Added.
+        (compareStringOffsets): Added.
+        (createSortedOffsetsArray): Added.
+        (convertCharacterOffsetsToUTF8ByteOffsets): Added.
+        (convertUTF8ByteOffsetsToCharacterOffsets): Added.
+        (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
+        using ascii() on it.
+        (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
+        (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
+        from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
+        correct position and matched string.
+
+        * JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
+
 === Safari-82 ===
 
 === Safari-81 ===
diff --git a/JavaScriptCore/ChangeLog-2003-10-25 b/JavaScriptCore/ChangeLog-2003-10-25
index 6f01a9a..34b17c6 100644
--- a/JavaScriptCore/ChangeLog-2003-10-25
+++ b/JavaScriptCore/ChangeLog-2003-10-25
@@ -1,3 +1,27 @@
+2003-06-04  Darin Adler  <darin at apple.com>
+
+        Reviewed by Dave.
+
+	- fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
+
+        Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
+
+        * kjs/regexp.h: Redo field names, remove some unused stuff.
+        * kjs/regexp.cpp:
+        (convertToUTF8): Added.
+        (compareStringOffsets): Added.
+        (createSortedOffsetsArray): Added.
+        (convertCharacterOffsetsToUTF8ByteOffsets): Added.
+        (convertUTF8ByteOffsetsToCharacterOffsets): Added.
+        (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
+        using ascii() on it.
+        (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
+        (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
+        from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
+        correct position and matched string.
+
+        * JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
+
 === Safari-82 ===
 
 === Safari-81 ===
diff --git a/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj b/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
index a74490f..a7b051e 100644
--- a/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
+++ b/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
@@ -221,6 +221,7 @@
 				651F6415039D5B5F0078395C,
 				65417211039E08B90058BFEB,
 				65417219039E0B390058BFEB,
+				937F4F25048E5B9900CA2AC4,
 			);
 			isa = PBXHeadersBuildPhase;
 			runOnlyForDeploymentPostprocessing = 0;
@@ -487,6 +488,7 @@
 				6541720E039E08B90058BFEB,
 				65417204039E02E70058BFEB,
 				65417205039E02E70058BFEB,
+				937F4F24048E5B9900CA2AC4,
 				65417206039E02E70058BFEB,
 				65417217039E0B280058BFEB,
 				6541720F039E08B90058BFEB,
@@ -713,6 +715,19 @@
 			settings = {
 			};
 		};
+		937F4F24048E5B9900CA2AC4 = {
+			fileEncoding = 30;
+			isa = PBXFileReference;
+			name = internal.h;
+			path = pcre/internal.h;
+			refType = 4;
+		};
+		937F4F25048E5B9900CA2AC4 = {
+			fileRef = 937F4F24048E5B9900CA2AC4;
+			isa = PBXBuildFile;
+			settings = {
+			};
+		};
 		938772E5038BFE19008635CE = {
 			fileEncoding = 4;
 			isa = PBXFileReference;
diff --git a/JavaScriptCore/kjs/regexp.cpp b/JavaScriptCore/kjs/regexp.cpp
index 46572b3..4865c4c 100644
--- a/JavaScriptCore/kjs/regexp.cpp
+++ b/JavaScriptCore/kjs/regexp.cpp
@@ -25,39 +25,177 @@
 #include <stdlib.h>
 #include <string.h>
 
-using namespace KJS;
+using KJS::CString;
+using KJS::RegExp;
+using KJS::UString;
 
-RegExp::RegExp(const UString &p, int f)
-  : pattern(p), flgs(f)
-{
 #ifdef HAVE_PCREPOSIX
-  int pcreflags = 0;
-  const char *perrormsg;
-  int errorOffset;
 
-  if (flgs & IgnoreCase)
-    pcreflags |= PCRE_CASELESS;
+static CString convertToUTF8(const UString &s)
+{
+    // Allocate a buffer big enough to hold all the characters.
+    const int length = s.size();
+    const unsigned bufferSize = length * 3 + 1;
+    char fixedSizeBuffer[1024];
+    char *buffer;
+    if (bufferSize > sizeof(fixedSizeBuffer)) {
+        buffer = new char [bufferSize];
+    } else {
+        buffer = fixedSizeBuffer;
+    }
+
+    // Convert to runs of 8-bit characters.
+    char *p = buffer;
+    for (int i = 0; i != length; ++i) {
+        unsigned short c = s[i].unicode();
+        if (c < 0x80) {
+            *p++ = (char)c;
+        } else if (c < 0x800) {
+            *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+        } else {
+            *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+            *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+            *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+        }
+    }
+    *p = 0;
+
+    // Return the result as a C string.
+    CString result(buffer);
+    if (buffer != fixedSizeBuffer) {
+        delete [] buffer;
+    }
+    return result;
+}
+
+struct StringOffset {
+    int offset;
+    int locationInOffsetsArray;
+};
+
+static int compareStringOffsets(const void *a, const void *b)
+{
+    const StringOffset *oa = static_cast<const StringOffset *>(a);
+    const StringOffset *ob = static_cast<const StringOffset *>(b);
+    
+    if (oa->offset < ob->offset) {
+        return -1;
+    }
+    if (oa->offset > ob->offset) {
+        return +1;
+    }
+    return 0;
+}
+
+const int sortedOffsetsFixedBufferSize = 128;
+
+static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
+    StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
+{
+    // Allocate the sorted offsets.
+    StringOffset *sortedOffsets;
+    if (numOffsets <= sortedOffsetsFixedBufferSize) {
+        sortedOffsets = sortedOffsetsFixedBuffer;
+    } else {
+        sortedOffsets = new StringOffset [numOffsets];
+    }
+
+    // Copy offsets.
+    for (int i = 0; i != numOffsets; ++i) {
+        sortedOffsets[i].offset = offsets[i];
+        sortedOffsets[i].locationInOffsetsArray = i;
+    }
+
+    // Sort them.
+    qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets);
+
+    return sortedOffsets;
+}
+
+static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
+{
+    // Allocate buffer.
+    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+
+    // Walk through sorted offsets and string, adjusting all the offsets.
+    // Offsets that are off the ends of the string map to the edges of the string.
+    int characterOffset = 0;
+    const char *p = s;
+    for (int oi = 0; oi != numOffsets; ++oi) {
+        const int nextOffset = sortedOffsets[oi].offset;
+        while (*p && characterOffset < nextOffset) {
+            // Skip to the next character.
+            ++characterOffset;
+            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
+        }
+        offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s;
+    }
+
+    // Free buffer.
+    if (sortedOffsets != fixedBuffer) {
+        delete [] sortedOffsets;
+    }
+}
+
+static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
+{
+    // Allocate buffer.
+    StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+    StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+
+    // Walk through sorted offsets and string, adjusting all the offsets.
+    // Offsets that are off the end of the string map to the edges of the string.
+    int characterOffset = 0;
+    const char *p = s;
+    for (int oi = 0; oi != numOffsets; ++oi) {
+        const int nextOffset = sortedOffsets[oi].offset;
+        while (*p && (p - s) < nextOffset) {
+            // Skip to the next character.
+            ++characterOffset;
+            do ++p; while ((*p & 0xC0) == 0x80); // if 1 of the 2 high bits is set, it's not the start of a character
+        }
+        offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset;
+    }
+
+    // Free buffer.
+    if (sortedOffsets != fixedBuffer) {
+        delete [] sortedOffsets;
+    }
+}
+
+#endif // HAVE_PCREPOSIX
+
+RegExp::RegExp(const UString &p, int flags)
+  : _flags(flags), _numSubPatterns(0)
+{
+#ifdef HAVE_PCREPOSIX
 
-  if (flgs & Multiline)
-    pcreflags |= PCRE_MULTILINE;
+  int options = PCRE_UTF8;
+  // Note: the Global flag is already handled by RegExpProtoFunc::execute.
+  if (flags & IgnoreCase)
+    options |= PCRE_CASELESS;
+  if (flags & Multiline)
+    options |= PCRE_MULTILINE;
 
-  pcregex = pcre_compile(p.ascii(), pcreflags,
-			 &perrormsg, &errorOffset, NULL);
+  const char *errorMessage;
+  int errorOffset;
+  _regex = pcre_compile(convertToUTF8(p).c_str(), options, &errorMessage, &errorOffset, NULL);
+  if (!_regex) {
 #ifndef NDEBUG
-  if (!pcregex)
-    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
+    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
 #endif
+    return;
+  }
 
 #ifdef PCRE_INFO_CAPTURECOUNT
-  // Get number of subpatterns that will be returned
-  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
-  if (rc != 0)
+  // Get number of subpatterns that will be returned.
+  pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
 #endif
-    nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
 
 #else /* HAVE_PCREPOSIX */
 
-  nrSubPatterns = 0; // determined in match() with POSIX regex.
   int regflags = 0;
 #ifdef REG_EXTENDED
   regflags |= REG_EXTENDED;
@@ -72,20 +210,19 @@ RegExp::RegExp(const UString &p, int f)
   //    ;
   // Note: the Global flag is already handled by RegExpProtoFunc::execute
 
-  regcomp(&preg, p.ascii(), regflags);
+  regcomp(&_regex, p.ascii(), regflags);
   /* TODO check for errors */
-#endif
 
+#endif
 }
 
 RegExp::~RegExp()
 {
 #ifdef HAVE_PCREPOSIX
-  if (pcregex)
-    pcre_free(pcregex);
+  pcre_free(_regex);
 #else
   /* TODO: is this really okay after an error ? */
-  regfree(&preg);
+  regfree(&_regex);
 #endif
 }
 
@@ -93,32 +230,62 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
 {
   if (i < 0)
     i = 0;
-  if (ovector)
-    *ovector = 0L;
   int dummyPos;
   if (!pos)
     pos = &dummyPos;
   *pos = -1;
+  if (ovector)
+    *ovector = 0;
+
   if (i > s.size() || s.isNull())
     return UString::null();
 
 #ifdef HAVE_PCREPOSIX
-  CString buffer(s.cstring());
-  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
-  if (ovector) *ovector = new int[ovecsize];
 
-  if (!pcregex || pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), i,
-		  0, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
+  if (!_regex)
+    return UString::null();
+
+  // Set up the offset vector for the result.
+  // First 2/3 used for result, the last third used by PCRE.
+  int *offsetVector;
+  int offsetVectorSize;
+  int fixedSizeOffsetVector[3];
+  if (!ovector) {
+    offsetVectorSize = 3;
+    offsetVector = fixedSizeOffsetVector;
+  } else {
+    offsetVectorSize = (_numSubPatterns + 1) * 3;
+    offsetVector = new int [offsetVectorSize];
+  }
+
+  const CString buffer(convertToUTF8(s));
+  convertCharacterOffsetsToUTF8ByteOffsets(buffer.c_str(), &i, 1);
+  const int numMatches = pcre_exec(_regex, NULL, buffer.c_str(), buffer.size(), i, 0, offsetVector, offsetVectorSize);
+
+  if (numMatches < 0) {
+#ifndef NDEBUG
+    if (numMatches != PCRE_ERROR_NOMATCH)
+      fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
+#endif
+    if (offsetVector != fixedSizeOffsetVector)
+      delete [] offsetVector;
     return UString::null();
+  }
+
+  convertUTF8ByteOffsetsToCharacterOffsets(buffer.c_str(), offsetVector, (numMatches == 0 ? 1 : numMatches) * 2);
+
+  *pos = offsetVector[0];
+  if (ovector)
+    *ovector = offsetVector;
+  return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
 
-  if (!ovector)
-    return UString::null(); // don't rely on the return value if you pass ovector==0
 #else
+
   const uint maxMatch = 10;
   regmatch_t rmatch[maxMatch];
 
   char *str = strdup(s.ascii()); // TODO: why ???
-  if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
+  if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
     free(str);
     return UString::null();
   }
@@ -130,44 +297,20 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
   }
 
   // map rmatch array to ovector used in PCRE case
-  nrSubPatterns = 0;
+  _numSubPatterns = 0;
   for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
-      nrSubPatterns++;
-  int ovecsize = (nrSubPatterns+1)*3; // see above
+      _numSubPatterns++;
+  int ovecsize = (_numSubPatterns+1)*3; // see above
   *ovector = new int[ovecsize];
-  for (uint j = 0; j < nrSubPatterns + 1; j++) {
+  for (uint j = 0; j < _numSubPatterns + 1; j++) {
     if (j>maxMatch)
       break;
     (*ovector)[2*j] = rmatch[j].rm_so + i;
     (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
   }
-#endif
 
   *pos = (*ovector)[0];
   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
-}
 
-#if 0 // unused
-bool RegExp::test(const UString &s, int)
-{
-#ifdef HAVE_PCREPOSIX
-  int ovector[300];
-  CString buffer(s.cstring());
-
-  if (s.isNull() ||
-      pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
-		0, ovector, 300) == PCRE_ERROR_NOMATCH)
-    return false;
-  else
-    return true;
-
-#else
-
-  char *str = strdup(s.ascii());
-  int r = regexec(&preg, str, 0, 0, 0);
-  free(str);
-
-  return r == 0;
 #endif
 }
-#endif
diff --git a/JavaScriptCore/kjs/regexp.h b/JavaScriptCore/kjs/regexp.h
index 468bdd1..233cd81 100644
--- a/JavaScriptCore/kjs/regexp.h
+++ b/JavaScriptCore/kjs/regexp.h
@@ -41,26 +41,26 @@ namespace KJS {
   class RegExp {
   public:
     enum { None = 0, Global = 1, IgnoreCase = 2, Multiline = 4 };
-    RegExp(const UString &p, int f = None);
+
+    RegExp(const UString &pattern, int flags = None);
     ~RegExp();
-    int flags() const { return flgs; }
-    UString match(const UString &s, int i = -1, int *pos = 0L, int **ovector = 0L);
-    // test is unused. The JS spec says that RegExp.test should use
-    // RegExp.exec, so it has to store $1 etc.
-    // bool test(const UString &s, int i = -1);
-    uint subPatterns() const { return nrSubPatterns; }
-  private:
-    const UString &pattern;
-    int flgs;
 
-#ifndef HAVE_PCREPOSIX
-    regex_t preg;
+    int flags() const { return _flags; }
+
+    UString match(const UString &s, int i, int *pos = 0, int **ovector = 0);
+    uint subPatterns() const { return _numSubPatterns; }
+
+  private:
+#ifdef HAVE_PCREPOSIX
+    pcre *_regex;
 #else
-    pcre *pcregex;
+    regex_t _regex;
 #endif
-    uint nrSubPatterns;
+    int _flags;
+    uint _numSubPatterns;
 
-    RegExp();
+    RegExp(const RegExp &);
+    RegExp &operator=(const RegExp &);
   };
 
 }; // namespace

-- 
WebKit Debian packaging