[Forensics-changes] [yara] 106/415: Improved regular expression hashing (PCRE-only)
Hilko Bengen
bengen at moszumanska.debian.org
Thu Apr 3 05:42:51 UTC 2014
This is an automated email from the git hooks/post-receive script.
bengen pushed a commit to branch debian
in repository yara.
commit 181916ad86ef70cb3bff19c82b5de6f0fc67b307
Author: Victor M. Alvarez <plusvic at gmail.com>
Date: Tue Aug 2 16:11:20 2011 +0000
Improved regular expression hashing (PCRE-only)
---
libyara/ast.c | 1 -
libyara/config.h | 2 +-
libyara/eval.c | 1 +
libyara/grammar.c | 3 +-
libyara/grammar.h | 2 +-
libyara/grammar.y | 1 -
libyara/regex.h | 24 ++--
libyara/regex/regex-pcre.c | 54 +++++++--
libyara/regex/regex-re2.cc | 16 +--
libyara/scan.c | 289 +++++++++++++++++++--------------------------
libyara/yara.h | 1 -
11 files changed, 195 insertions(+), 199 deletions(-)
diff --git a/libyara/ast.c b/libyara/ast.c
index 6d7f283..58c6b2a 100644
--- a/libyara/ast.c
+++ b/libyara/ast.c
@@ -458,7 +458,6 @@ int new_text_string( YARA_CONTEXT* context,
{
if (regex_compile(re, // REGEXP *
charstr->c_string, // Regex pattern
- TRUE, // Anchor the pattern to the first character when evaluating
flags & STRING_FLAGS_NO_CASE, // If TRUE then case insensitive search
context->last_error_extra_info, // Error message
sizeof(context->last_error_extra_info), // Size of error buffer
diff --git a/libyara/config.h b/libyara/config.h
index d27a9a0..3800d0a 100644
--- a/libyara/config.h
+++ b/libyara/config.h
@@ -67,4 +67,4 @@
/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
`char[]'. */
-/* #undef YYTEXT_POINTER */
+#define YYTEXT_POINTER 1
diff --git a/libyara/eval.c b/libyara/eval.c
index cb6ea08..988c8a8 100644
--- a/libyara/eval.c
+++ b/libyara/eval.c
@@ -432,6 +432,7 @@ long long evaluate(TERM* term, EVALUATION_CONTEXT* context)
case TERM_TYPE_STRING_MATCH:
rc = regex_exec(&(term_string_operation->re),
+ FALSE,
term_string_operation->variable->string,
strlen(term_string_operation->variable->string));
return (rc >= 0);
diff --git a/libyara/grammar.c b/libyara/grammar.c
index e2dbdd6..f7f19c5 100644
--- a/libyara/grammar.c
+++ b/libyara/grammar.c
@@ -246,7 +246,7 @@ typedef union YYSTYPE
void* meta;
}
-/* Line 187 of yacc.c. */
+/* Line 193 of yacc.c. */
#line 251 "grammar.c"
YYSTYPE;
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -3210,7 +3210,6 @@ TERM* reduce_string_operation( yyscan_t yyscanner,
if (regex_compile(&(term->re),
string->c_string,
FALSE,
- FALSE,
context->last_error_extra_info,
sizeof(context->last_error_extra_info),
&erroffset) <= 0)
diff --git a/libyara/grammar.h b/libyara/grammar.h
index 0b15486..d26375b 100644
--- a/libyara/grammar.h
+++ b/libyara/grammar.h
@@ -182,7 +182,7 @@ typedef union YYSTYPE
void* meta;
}
-/* Line 1489 of yacc.c. */
+/* Line 1529 of yacc.c. */
#line 187 "grammar.h"
YYSTYPE;
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
diff --git a/libyara/grammar.y b/libyara/grammar.y
index be64201..a1b726e 100644
--- a/libyara/grammar.y
+++ b/libyara/grammar.y
@@ -1167,7 +1167,6 @@ TERM* reduce_string_operation( yyscan_t yyscanner,
if (regex_compile(&(term->re),
string->c_string,
FALSE,
- FALSE,
context->last_error_extra_info,
sizeof(context->last_error_extra_info),
&erroffset) <= 0)
diff --git a/libyara/regex.h b/libyara/regex.h
index a02c3dd..4610f9c 100644
--- a/libyara/regex.h
+++ b/libyara/regex.h
@@ -21,15 +21,23 @@ GNU General Public License for more details.
extern "C" {
#endif
+
void regex_free(REGEXP* regex);
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size);
-int regex_compile(REGEXP* output,
- const char* pattern,
- int anchored,
- int case_insensitive,
- char* error_message,
- size_t error_message_size,
- int* error_offset);
+
+int regex_exec( REGEXP* regex,
+ int anchored,
+ const char *buffer,
+ size_t buffer_size);
+
+int regex_compile( REGEXP* output,
+ const char* pattern,
+ int case_insensitive,
+ char* error_message,
+ size_t error_message_size,
+ int* error_offset);
+
+int regex_get_first_bytes( REGEXP* regex,
+ unsigned char* table);
#ifdef __cplusplus
}
diff --git a/libyara/regex/regex-pcre.c b/libyara/regex/regex-pcre.c
index 9cdbbde..56c362f 100644
--- a/libyara/regex/regex-pcre.c
+++ b/libyara/regex/regex-pcre.c
@@ -20,21 +20,25 @@ GNU General Public License for more details.
#include "../yara.h"
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size)
+int regex_exec(REGEXP* regex, int anchored, const char *buffer, size_t buffer_size)
{
int ovector[3];
int result = -1;
+ int options = 0;
char *s;
if (!regex || buffer_size == 0)
return 0;
+
+ if (anchored)
+ options = PCRE_ANCHORED;
result = pcre_exec( (pcre*)regex->regexp, /* the compiled pattern */
(pcre_extra*)regex->extra, /* extra data */
(char*) buffer, /* the subject string */
buffer_size, /* the length of the subject */
0, /* start at offset 0 in the subject */
- 0, /* default options */
+ options, /* options */
ovector, /* output vector for substring information */
sizeof(ovector)/sizeof(int)); /* number of elements in the output vector */
@@ -74,7 +78,6 @@ void regex_free(REGEXP* regex)
int regex_compile(REGEXP* output,
const char* pattern,
- int anchored,
int case_insensitive,
char* error_message,
size_t error_message_size,
@@ -83,15 +86,12 @@ int regex_compile(REGEXP* output,
int pcre_options = 0;
char *pcre_error = NULL;
-
+
if (!output || !pattern)
return 0;
memset(output, '\0', sizeof(REGEXP));
- if (anchored)
- pcre_options |= PCRE_ANCHORED;
-
if (case_insensitive)
pcre_options |= PCRE_CASELESS;
@@ -99,8 +99,7 @@ int regex_compile(REGEXP* output,
if (output->regexp != NULL)
{
- output->extra = (pcre_extra *)pcre_study(
- output->regexp, 0, (const char **)error_message);
+ output->extra = (pcre_extra *) pcre_study(output->regexp, 0, (const char **)error_message);
}
else
{
@@ -116,3 +115,40 @@ int regex_compile(REGEXP* output,
return 1;
}
+
+
+
+int regex_get_first_bytes( REGEXP* regex,
+ unsigned char* table)
+{
+ unsigned char* t;
+
+ int i;
+ int b;
+ int result;
+ int count = 0;
+
+ result = pcre_fullinfo(regex->regexp, regex->extra, PCRE_INFO_FIRSTTABLE, &t);
+
+ if (result == 0 && t != NULL)
+ {
+ for (i = 0; i < 256; i++)
+ {
+ if (t[i / 8] & (1 << i % 8))
+ {
+ table[count] = i;
+ count++;
+ }
+ }
+ }
+
+ result = pcre_fullinfo(regex->regexp, regex->extra, PCRE_INFO_FIRSTBYTE, &b);
+
+ if (result == 0 && b > 0)
+ {
+ table[count] = b;
+ count++;
+ }
+
+ return count;
+}
diff --git a/libyara/regex/regex-re2.cc b/libyara/regex/regex-re2.cc
index ffe1db6..d349f22 100644
--- a/libyara/regex/regex-re2.cc
+++ b/libyara/regex/regex-re2.cc
@@ -20,7 +20,7 @@ GNU General Public License for more details.
#include <re2/stringpiece.h>
#include "yara.h"
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size)
+int regex_exec(REGEXP* regex, int anchored, const char *buffer, size_t buffer_size)
{
if (!regex || buffer_size == 0)
return 0;
@@ -28,8 +28,8 @@ int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size)
re2::StringPiece data(buffer, buffer_size);
re2::StringPiece substring;
re2::RE2::Anchor anchor = re2::RE2::UNANCHORED;
-
- if (regex->re2_anchored)
+
+ if (anchored)
anchor = re2::RE2::ANCHOR_START;
re2::RE2* re_ptr = (re2::RE2*) regex->regexp;
@@ -58,7 +58,6 @@ void regex_free(REGEXP* regex)
int regex_compile(REGEXP* output,
const char* pattern,
- int anchored,
int case_insensitive,
char* error_message,
size_t error_message_size,
@@ -75,9 +74,6 @@ int regex_compile(REGEXP* output,
if (case_insensitive)
options.set_case_sensitive(false);
-
- if (anchored)
- output->re2_anchored = anchored;
re2::StringPiece string_piece_pattern(pattern);
output->regexp = (void *)new RE2(string_piece_pattern, options);
@@ -106,3 +102,9 @@ int regex_compile(REGEXP* output,
return 1;
}
+
+int regex_get_first_bytes( REGEXP* regex,
+ unsigned char* table)
+{
+ return 0;
+}
diff --git a/libyara/scan.c b/libyara/scan.c
index c68306c..c0ccba3 100644
--- a/libyara/scan.c
+++ b/libyara/scan.c
@@ -40,6 +40,7 @@ GNU General Public License for more details.
static char lowercase[256];
+static char altercase[256];
static char isalphanum[256];
/* Function implementations */
@@ -324,7 +325,7 @@ int regexp_match(unsigned char* buffer, size_t buffer_size, unsigned char* patte
return 0;
}
- result = regex_exec(&re, (char *)buffer, buffer_size);
+ result = regex_exec(&re, TRUE, (char *)buffer, buffer_size);
if (result >= 0)
return result;
@@ -337,15 +338,27 @@ int populate_hash_table(HASH_TABLE* hash_table, RULE_LIST* rule_list)
RULE* rule;
STRING* string;
STRING_LIST_ENTRY* entry;
- unsigned char x,y;
- char hashable_2b;
- char hashable_1b;
- int i, next;
+
+ unsigned char first[256];
+ unsigned char second[2];
+
+ unsigned char f;
+ unsigned char s;
+
+ int fcount;
+ int scount;
+
+ int i, j;
for (i = 0; i < 256; i++)
{
lowercase[i] = tolower(i);
isalphanum[i] = isalnum(i);
+
+ if (lowercase[i] == i)
+ altercase[i] = toupper(i);
+ else
+ altercase[i] = lowercase[i];
}
rule = rule_list->head;
@@ -355,173 +368,113 @@ int populate_hash_table(HASH_TABLE* hash_table, RULE_LIST* rule_list)
string = rule->string_list_head;
while (string != NULL)
- {
- if (string->flags & STRING_FLAGS_REGEXP)
- {
- /* take into account anchors (^) at beginning of regular expressions */
-
- if (string->string[0] == '^')
- {
- if (string->length > 2)
- {
- x = string->string[1];
- y = string->string[2];
- }
- else
- {
- x = 0;
- y = 0;
- }
- }
- else
- {
- x = string->string[0];
- y = string->string[1];
- }
-
- hashable_2b = isalphanum[x] && isalphanum[y];
- hashable_1b = isalphanum[x];
- }
- else
- {
- x = string->string[0];
- y = string->string[1];
-
- hashable_2b = TRUE;
- hashable_1b = TRUE;
-
- } /* if (string->flags & STRING_FLAGS_REGEXP) */
-
- if (string->flags & STRING_FLAGS_HEXADECIMAL)
- {
- hashable_2b = (string->mask[0] == 0xFF) && (string->mask[1] == 0xFF);
- hashable_1b = (string->mask[0] == 0xFF);
- }
-
- if (hashable_1b && string->flags & STRING_FLAGS_NO_CASE)
- {
- if (hashable_2b)
- {
- /*
- if string is case-insensitive add an entry in the hash table
- for each posible combination
- */
-
- x = lowercase[x];
- y = lowercase[y];
-
- /* both lowercases */
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_2b[x][y];
- entry->string = string;
- hash_table->hashed_strings_2b[x][y] = entry;
-
- /* X uppercase Y lowercase */
-
- x = toupper(x);
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_2b[x][y];
- entry->string = string;
- hash_table->hashed_strings_2b[x][y] = entry;
-
- /* both uppercases */
-
- y = toupper(y);
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
+ {
+ fcount = 0;
+ scount = 0;
+
+ if (string->flags & STRING_FLAGS_REGEXP)
+ {
+ if (string->string[0] == '^')
+ {
+ if (string->length > 2)
+ {
+ f = string->string[1];
+ s = string->string[2];
+ }
+ else
+ {
+ f = string->string[1];
+ s = 0;
+ }
+ }
+ else
+ {
+ f = string->string[0];
+ s = string->string[1];
+ }
+
+ if (isalphanum[f])
+ {
+ first[fcount++] = f;
+
+ if (string->flags & STRING_FLAGS_NO_CASE)
+ first[fcount++] = altercase[f];
+
+ if (isalphanum[s])
+ {
+ second[scount++] = s;
+
+ if (string->flags & STRING_FLAGS_NO_CASE)
+ second[scount++] = altercase[s];
+ }
+ }
+
+ if (fcount == 0)
+ {
+ fcount += regex_get_first_bytes(&(string->re), first);
+ }
+
+ }
+ else if (string->flags & STRING_FLAGS_HEXADECIMAL)
+ {
+ if (string->mask[0] == 0xFF)
+ first[fcount++] = string->string[0];
+
+ if (string->mask[1] == 0xFF);
+ second[scount++] = string->string[1];
+ }
+ else
+ {
+ first[fcount++] = string->string[0];
+ second[scount++] = string->string[1];
+
+ if (string->flags & STRING_FLAGS_NO_CASE)
+ {
+ first[fcount++] = altercase[string->string[0]];
+ second[scount++] = altercase[string->string[1]];
+ }
+ }
+
+ for (i = 0; i < fcount; i++)
+ {
+ for (j = 0; j < scount; j++)
+ {
+ entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+ if (entry == NULL)
return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_2b[x][y];
+
+ entry->next = hash_table->hashed_strings_2b[first[i]][second[j]];
+ entry->string = string;
+ hash_table->hashed_strings_2b[first[i]][second[j]] = entry;
+
+ }
+
+ if (scount == 0)
+ {
+ entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+ if (entry == NULL)
+ return ERROR_INSUFICIENT_MEMORY;
+
+ entry->next = hash_table->hashed_strings_1b[first[i]];
entry->string = string;
- hash_table->hashed_strings_2b[x][y] = entry;
-
- /* X lowercase Y uppercase */
-
- x = lowercase[x];
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_2b[x][y];
- entry->string = string;
- hash_table->hashed_strings_2b[x][y] = entry;
- }
- else
- {
- /* lowercase */
-
- x = lowercase[x];
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_1b[x];
- entry->string = string;
- hash_table->hashed_strings_1b[x] = entry;
-
- /* uppercase */
-
- x = toupper(x);
-
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->hashed_strings_1b[x];
- entry->string = string;
- hash_table->hashed_strings_1b[x] = entry;
- }
-
- }
- else if (hashable_1b)
- {
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
+ hash_table->hashed_strings_1b[first[i]] = entry;
+ }
+ }
+
+ if (fcount == 0)
+ {
+ entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+ if (entry == NULL)
return ERROR_INSUFICIENT_MEMORY;
-
+
+ entry->next = hash_table->non_hashed_strings;
entry->string = string;
-
- if (hashable_2b)
- {
- entry->next = hash_table->hashed_strings_2b[x][y];
- hash_table->hashed_strings_2b[x][y] = entry;
- }
- else
- {
- entry->next = hash_table->hashed_strings_1b[x];
- hash_table->hashed_strings_1b[x] = entry;
- }
- }
- else /* non hashable */
- {
- entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-
- if (entry == NULL)
- return ERROR_INSUFICIENT_MEMORY;
-
- entry->next = hash_table->non_hashed_strings;
- entry->string = string;
hash_table->non_hashed_strings = entry;
- }
-
+ }
+
string = string->next;
}
@@ -678,7 +631,7 @@ inline int string_match(unsigned char* buffer, size_t buffer_size, STRING* strin
}
else
{
- return regexp_match(buffer, buffer_size, string->string, string->length, string->re, negative_size);
+ return regexp_match(buffer, buffer_size, string->string, string->length, string->re, negative_size);
}
}
diff --git a/libyara/yara.h b/libyara/yara.h
index cb9bb37..35de6e6 100644
--- a/libyara/yara.h
+++ b/libyara/yara.h
@@ -127,7 +127,6 @@ typedef struct _REGEXP
{
void *regexp;
void *extra;
- int re2_anchored;
} REGEXP;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git
More information about the forensics-changes
mailing list