[SCM] WebKit Debian packaging branch, debian/unstable, updated. debian/1.1.15-1-40151-g37bb677
darin
darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc
Sat Sep 26 07:44:07 UTC 2009
The following commit has been merged in the debian/unstable branch:
commit 971484a9053ac330eb6abc4a68552b40c924dcf9
Author: darin <darin at 268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date: Thu Jun 5 00:11:22 2003 +0000
Reviewed by Dave.
- fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
* kjs/regexp.h: Redo field names, remove some unused stuff.
* kjs/regexp.cpp:
(convertToUTF8): Added.
(compareStringOffsets): Added.
(createSortedOffsetsArray): Added.
(convertCharacterOffsetsToUTF8ByteOffsets): Added.
(convertUTF8ByteOffsetsToCharacterOffsets): Added.
(RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
using ascii() on it.
(RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
(RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
correct position and matched string.
* JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
git-svn-id: http://svn.webkit.org/repository/webkit/trunk@4482 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/JavaScriptCore/ChangeLog b/JavaScriptCore/ChangeLog
index 6f01a9a..34b17c6 100644
--- a/JavaScriptCore/ChangeLog
+++ b/JavaScriptCore/ChangeLog
@@ -1,3 +1,27 @@
+2003-06-04 Darin Adler <darin at apple.com>
+
+ Reviewed by Dave.
+
+ - fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
+
+ Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
+
+ * kjs/regexp.h: Redo field names, remove some unused stuff.
+ * kjs/regexp.cpp:
+ (convertToUTF8): Added.
+ (compareStringOffsets): Added.
+ (createSortedOffsetsArray): Added.
+ (convertCharacterOffsetsToUTF8ByteOffsets): Added.
+ (convertUTF8ByteOffsetsToCharacterOffsets): Added.
+ (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
+ using ascii() on it.
+ (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
+ (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
+ from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
+ correct position and matched string.
+
+ * JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
+
=== Safari-82 ===
=== Safari-81 ===
diff --git a/JavaScriptCore/ChangeLog-2003-10-25 b/JavaScriptCore/ChangeLog-2003-10-25
index 6f01a9a..34b17c6 100644
--- a/JavaScriptCore/ChangeLog-2003-10-25
+++ b/JavaScriptCore/ChangeLog-2003-10-25
@@ -1,3 +1,27 @@
+2003-06-04 Darin Adler <darin at apple.com>
+
+ Reviewed by Dave.
+
+ - fixed 3224031 -- can't search at rakuten.co.jp b/c of extra characters inserted by regexp replace (8-bit char)
+
+ Use PCRE UTF-8 regular expressions instead of just chopping off high bytes.
+
+ * kjs/regexp.h: Redo field names, remove some unused stuff.
+ * kjs/regexp.cpp:
+ (convertToUTF8): Added.
+ (compareStringOffsets): Added.
+ (createSortedOffsetsArray): Added.
+ (convertCharacterOffsetsToUTF8ByteOffsets): Added.
+ (convertUTF8ByteOffsetsToCharacterOffsets): Added.
+ (RegExp::RegExp): Set the PCRE_UTF8 flag, and convert the UString to UTF-8 instead of
+ using ascii() on it.
+ (RegExp::~RegExp): Remove unneeded if statement (pcre_free is 0-tolerant as free is).
+ (RegExp::match): Convert the UString to UTF-8 and convert the character offsets to and
+ from UTF-8 byte offsets. Also do fixes for the "no offset vector" case so we get the
+ correct position and matched string.
+
+ * JavaScriptCore.pbproj/project.pbxproj: Add a PCRE header that was missing before.
+
=== Safari-82 ===
=== Safari-81 ===
diff --git a/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj b/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
index a74490f..a7b051e 100644
--- a/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
+++ b/JavaScriptCore/JavaScriptCore.pbproj/project.pbxproj
@@ -221,6 +221,7 @@
651F6415039D5B5F0078395C,
65417211039E08B90058BFEB,
65417219039E0B390058BFEB,
+ 937F4F25048E5B9900CA2AC4,
);
isa = PBXHeadersBuildPhase;
runOnlyForDeploymentPostprocessing = 0;
@@ -487,6 +488,7 @@
6541720E039E08B90058BFEB,
65417204039E02E70058BFEB,
65417205039E02E70058BFEB,
+ 937F4F24048E5B9900CA2AC4,
65417206039E02E70058BFEB,
65417217039E0B280058BFEB,
6541720F039E08B90058BFEB,
@@ -713,6 +715,19 @@
settings = {
};
};
+ 937F4F24048E5B9900CA2AC4 = {
+ fileEncoding = 30;
+ isa = PBXFileReference;
+ name = internal.h;
+ path = pcre/internal.h;
+ refType = 4;
+ };
+ 937F4F25048E5B9900CA2AC4 = {
+ fileRef = 937F4F24048E5B9900CA2AC4;
+ isa = PBXBuildFile;
+ settings = {
+ };
+ };
938772E5038BFE19008635CE = {
fileEncoding = 4;
isa = PBXFileReference;
diff --git a/JavaScriptCore/kjs/regexp.cpp b/JavaScriptCore/kjs/regexp.cpp
index 46572b3..4865c4c 100644
--- a/JavaScriptCore/kjs/regexp.cpp
+++ b/JavaScriptCore/kjs/regexp.cpp
@@ -25,39 +25,177 @@
#include <stdlib.h>
#include <string.h>
-using namespace KJS;
+using KJS::CString;
+using KJS::RegExp;
+using KJS::UString;
-RegExp::RegExp(const UString &p, int f)
- : pattern(p), flgs(f)
-{
#ifdef HAVE_PCREPOSIX
- int pcreflags = 0;
- const char *perrormsg;
- int errorOffset;
- if (flgs & IgnoreCase)
- pcreflags |= PCRE_CASELESS;
+static CString convertToUTF8(const UString &s)
+{
+ // Encode s as zero-terminated UTF-8. Each value from s[i].unicode() needs at most 3 bytes, plus 1 for the terminator.
+ const int length = s.size();
+ const unsigned bufferSize = length * 3 + 1;
+ char fixedSizeBuffer[1024]; // used when the result fits, so no heap allocation is needed
+ char *buffer;
+ if (bufferSize > sizeof(fixedSizeBuffer)) {
+ buffer = new char [bufferSize];
+ } else {
+ buffer = fixedSizeBuffer;
+ }
+
+ // Emit 1, 2 or 3 bytes per unit, depending on its magnitude.
+ char *p = buffer;
+ for (int i = 0; i != length; ++i) {
+ unsigned short c = s[i].unicode();
+ if (c < 0x80) {
+ *p++ = (char)c; // NOTE(review): a zero unit writes a 0 byte here; CString(buffer) presumably stops at it -- confirm
+ } else if (c < 0x800) {
+ *p++ = (char)((c >> 6) | 0xC0); // lead byte 110xxxxx: C0 flag plus the top 5 bits
+ *p++ = (char)((c | 0x80) & 0xBF); // continuation byte 10xxxxxx: the low 6 bits
+ } else { // NOTE(review): every unit >= 0x800 lands here, including unpaired surrogate values -- confirm PCRE accepts them
+ *p++ = (char)((c >> 12) | 0xE0); // lead byte 1110xxxx: E0 flag plus the top 4 bits
+ *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // continuation byte 10xxxxxx: the middle 6 bits
+ *p++ = (char)((c | 0x80) & 0xBF); // continuation byte 10xxxxxx: the low 6 bits
+ }
+ }
+ *p = 0;
+
+ // Copy into a CString before releasing any heap buffer.
+ CString result(buffer);
+ if (buffer != fixedSizeBuffer) {
+ delete [] buffer;
+ }
+ return result;
+}
+
+struct StringOffset {
+ int offset; // the offset value to be converted
+ int locationInOffsetsArray; // index this value had in the caller's offsets array before sorting
+};
+
+static int compareStringOffsets(const void *a, const void *b)
+{ // Comparison callback passed to qsort: orders StringOffset records by ascending offset.
+ const StringOffset *oa = static_cast<const StringOffset *>(a);
+ const StringOffset *ob = static_cast<const StringOffset *>(b);
+
+ if (oa->offset < ob->offset) {
+ return -1;
+ }
+ if (oa->offset > ob->offset) {
+ return +1;
+ }
+ return 0; // equal offsets; locationInOffsetsArray is not used as a tie-breaker
+}
+
+const int sortedOffsetsFixedBufferSize = 128; // capacity of the fixed buffers the callers declare
+
+static StringOffset *createSortedOffsetsArray(const int offsets[], int numOffsets,
+ StringOffset sortedOffsetsFixedBuffer[sortedOffsetsFixedBufferSize])
+{
+ // Use the caller's fixed buffer when it is big enough; otherwise allocate. A result that differs from the fixed buffer must be delete[]d by the caller.
+ StringOffset *sortedOffsets;
+ if (numOffsets <= sortedOffsetsFixedBufferSize) {
+ sortedOffsets = sortedOffsetsFixedBuffer;
+ } else {
+ sortedOffsets = new StringOffset [numOffsets];
+ }
+
+ // Copy the offsets, remembering the array slot each one came from.
+ for (int i = 0; i != numOffsets; ++i) {
+ sortedOffsets[i].offset = offsets[i];
+ sortedOffsets[i].locationInOffsetsArray = i;
+ }
+
+ // Sort by offset value so the callers can convert them all in one forward pass over the string.
+ qsort(sortedOffsets, numOffsets, sizeof(StringOffset), compareStringOffsets);
+
+ return sortedOffsets;
+}
+
+static void convertCharacterOffsetsToUTF8ByteOffsets(const char *s, int *offsets, int numOffsets)
+{
+ // Rewrites offsets[] in place: each character offset into the UTF-8 string s becomes the matching byte offset.
+ StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+ StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+
+ // Walk through sorted offsets and string, adjusting all the offsets.
+ // Offsets that are off the ends of the string map to the edges of the string.
+ int characterOffset = 0;
+ const char *p = s;
+ for (int oi = 0; oi != numOffsets; ++oi) {
+ const int nextOffset = sortedOffsets[oi].offset;
+ while (*p && characterOffset < nextOffset) {
+ // Skip to the next character.
+ ++characterOffset;
+ do ++p; while ((*p & 0xC0) == 0x80); // bytes of the form 10xxxxxx are continuation bytes, not the start of a character
+ }
+ offsets[sortedOffsets[oi].locationInOffsetsArray] = p - s; // write back to the slot this offset originally came from
+ }
+
+ // Free the sorted array only if it was heap-allocated rather than the fixed buffer.
+ if (sortedOffsets != fixedBuffer) {
+ delete [] sortedOffsets;
+ }
+}
+
+static void convertUTF8ByteOffsetsToCharacterOffsets(const char *s, int *offsets, int numOffsets)
+{
+ // Rewrites offsets[] in place: each byte offset into the UTF-8 string s becomes the matching character offset.
+ StringOffset fixedBuffer[sortedOffsetsFixedBufferSize];
+ StringOffset *sortedOffsets = createSortedOffsetsArray(offsets, numOffsets, fixedBuffer);
+
+ // Walk through sorted offsets and string, adjusting all the offsets.
+ // Offsets that are off the end of the string map to the edges of the string (so a negative offset becomes 0).
+ int characterOffset = 0;
+ const char *p = s;
+ for (int oi = 0; oi != numOffsets; ++oi) {
+ const int nextOffset = sortedOffsets[oi].offset;
+ while (*p && (p - s) < nextOffset) {
+ // Skip to the next character.
+ ++characterOffset;
+ do ++p; while ((*p & 0xC0) == 0x80); // bytes of the form 10xxxxxx are continuation bytes, not the start of a character
+ }
+ offsets[sortedOffsets[oi].locationInOffsetsArray] = characterOffset; // NOTE(review): if PCRE reports unset subpatterns as -1, they come out as 0 here -- confirm callers cope
+ }
+
+ // Free the sorted array only if it was heap-allocated rather than the fixed buffer.
+ if (sortedOffsets != fixedBuffer) {
+ delete [] sortedOffsets;
+ }
+}
+
+#endif // HAVE_PCREPOSIX
+
+RegExp::RegExp(const UString &p, int flags)
+ : _flags(flags), _numSubPatterns(0)
+{
+#ifdef HAVE_PCREPOSIX
- if (flgs & Multiline)
- pcreflags |= PCRE_MULTILINE;
+ int options = PCRE_UTF8;
+ // Note: the Global flag is already handled by RegExpProtoFunc::execute.
+ if (flags & IgnoreCase)
+ options |= PCRE_CASELESS;
+ if (flags & Multiline)
+ options |= PCRE_MULTILINE;
- pcregex = pcre_compile(p.ascii(), pcreflags,
- &perrormsg, &errorOffset, NULL);
+ const char *errorMessage;
+ int errorOffset;
+ _regex = pcre_compile(convertToUTF8(p).c_str(), options, &errorMessage, &errorOffset, NULL);
+ if (!_regex) {
#ifndef NDEBUG
- if (!pcregex)
- fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
+ fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
#endif
+ return;
+ }
#ifdef PCRE_INFO_CAPTURECOUNT
- // Get number of subpatterns that will be returned
- int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
- if (rc != 0)
+ // Get number of subpatterns that will be returned.
+ pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
#endif
- nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
#else /* HAVE_PCREPOSIX */
- nrSubPatterns = 0; // determined in match() with POSIX regex.
int regflags = 0;
#ifdef REG_EXTENDED
regflags |= REG_EXTENDED;
@@ -72,20 +210,19 @@ RegExp::RegExp(const UString &p, int f)
// ;
// Note: the Global flag is already handled by RegExpProtoFunc::execute
- regcomp(&preg, p.ascii(), regflags);
+ regcomp(&_regex, p.ascii(), regflags);
/* TODO check for errors */
-#endif
+#endif
}
RegExp::~RegExp()
{
#ifdef HAVE_PCREPOSIX
- if (pcregex)
- pcre_free(pcregex);
+ pcre_free(_regex);
#else
/* TODO: is this really okay after an error ? */
- regfree(&preg);
+ regfree(&_regex);
#endif
}
@@ -93,32 +230,62 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
{
if (i < 0)
i = 0;
- if (ovector)
- *ovector = 0L;
int dummyPos;
if (!pos)
pos = &dummyPos;
*pos = -1;
+ if (ovector)
+ *ovector = 0;
+
if (i > s.size() || s.isNull())
return UString::null();
#ifdef HAVE_PCREPOSIX
- CString buffer(s.cstring());
- int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
- if (ovector) *ovector = new int[ovecsize];
- if (!pcregex || pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), i,
- 0, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
+ if (!_regex)
+ return UString::null();
+
+ // Set up the offset vector for the result.
+ // First 2/3 used for result, the last third used by PCRE.
+ int *offsetVector;
+ int offsetVectorSize;
+ int fixedSizeOffsetVector[3];
+ if (!ovector) {
+ offsetVectorSize = 3;
+ offsetVector = fixedSizeOffsetVector;
+ } else {
+ offsetVectorSize = (_numSubPatterns + 1) * 3;
+ offsetVector = new int [offsetVectorSize];
+ }
+
+ const CString buffer(convertToUTF8(s));
+ convertCharacterOffsetsToUTF8ByteOffsets(buffer.c_str(), &i, 1);
+ const int numMatches = pcre_exec(_regex, NULL, buffer.c_str(), buffer.size(), i, 0, offsetVector, offsetVectorSize);
+
+ if (numMatches < 0) {
+#ifndef NDEBUG
+ if (numMatches != PCRE_ERROR_NOMATCH)
+ fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
+#endif
+ if (offsetVector != fixedSizeOffsetVector)
+ delete [] offsetVector;
return UString::null();
+ }
+
+ convertUTF8ByteOffsetsToCharacterOffsets(buffer.c_str(), offsetVector, (numMatches == 0 ? 1 : numMatches) * 2);
+
+ *pos = offsetVector[0];
+ if (ovector)
+ *ovector = offsetVector;
+ return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
- if (!ovector)
- return UString::null(); // don't rely on the return value if you pass ovector==0
#else
+
const uint maxMatch = 10;
regmatch_t rmatch[maxMatch];
char *str = strdup(s.ascii()); // TODO: why ???
- if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
+ if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
free(str);
return UString::null();
}
@@ -130,44 +297,20 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
}
// map rmatch array to ovector used in PCRE case
- nrSubPatterns = 0;
+ _numSubPatterns = 0;
for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
- nrSubPatterns++;
- int ovecsize = (nrSubPatterns+1)*3; // see above
+ _numSubPatterns++;
+ int ovecsize = (_numSubPatterns+1)*3; // see above
*ovector = new int[ovecsize];
- for (uint j = 0; j < nrSubPatterns + 1; j++) {
+ for (uint j = 0; j < _numSubPatterns + 1; j++) {
if (j>maxMatch)
break;
(*ovector)[2*j] = rmatch[j].rm_so + i;
(*ovector)[2*j+1] = rmatch[j].rm_eo + i;
}
-#endif
*pos = (*ovector)[0];
return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
-}
-#if 0 // unused
-bool RegExp::test(const UString &s, int)
-{
-#ifdef HAVE_PCREPOSIX
- int ovector[300];
- CString buffer(s.cstring());
-
- if (s.isNull() ||
- pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
- 0, ovector, 300) == PCRE_ERROR_NOMATCH)
- return false;
- else
- return true;
-
-#else
-
- char *str = strdup(s.ascii());
- int r = regexec(&preg, str, 0, 0, 0);
- free(str);
-
- return r == 0;
#endif
}
-#endif
diff --git a/JavaScriptCore/kjs/regexp.h b/JavaScriptCore/kjs/regexp.h
index 468bdd1..233cd81 100644
--- a/JavaScriptCore/kjs/regexp.h
+++ b/JavaScriptCore/kjs/regexp.h
@@ -41,26 +41,26 @@ namespace KJS {
class RegExp {
public:
enum { None = 0, Global = 1, IgnoreCase = 2, Multiline = 4 };
- RegExp(const UString &p, int f = None);
+
+ RegExp(const UString &pattern, int flags = None);
~RegExp();
- int flags() const { return flgs; }
- UString match(const UString &s, int i = -1, int *pos = 0L, int **ovector = 0L);
- // test is unused. The JS spec says that RegExp.test should use
- // RegExp.exec, so it has to store $1 etc.
- // bool test(const UString &s, int i = -1);
- uint subPatterns() const { return nrSubPatterns; }
- private:
- const UString &pattern;
- int flgs;
-#ifndef HAVE_PCREPOSIX
- regex_t preg;
+ int flags() const { return _flags; }
+
+ UString match(const UString &s, int i, int *pos = 0, int **ovector = 0);
+ uint subPatterns() const { return _numSubPatterns; }
+
+ private:
+#ifdef HAVE_PCREPOSIX
+ pcre *_regex;
#else
- pcre *pcregex;
+ regex_t _regex;
#endif
- uint nrSubPatterns;
+ int _flags;
+ uint _numSubPatterns;
- RegExp();
+ RegExp(const RegExp &);
+ RegExp &operator=(const RegExp &);
};
}; // namespace
--
WebKit Debian packaging
More information about the Pkg-webkit-commits
mailing list