[Forensics-changes] [yara] 106/415: Improved regular expression hashing (PCRE-only)

Thu Apr 3 05:42:51 UTC 2014

This is an automated email from the git hooks/post-receive script.

bengen pushed a commit to branch debian
in repository yara.

commit 181916ad86ef70cb3bff19c82b5de6f0fc67b307
Author: Victor M. Alvarez <plusvic at gmail.com>
Date:   Tue Aug 2 16:11:20 2011 +0000

    Improved regular expression hashing (PCRE-only)
---
 libyara/ast.c              |   1 -
 libyara/config.h           |   2 +-
 libyara/eval.c             |   1 +
 libyara/grammar.c          |   3 +-
 libyara/grammar.h          |   2 +-
 libyara/grammar.y          |   1 -
 libyara/regex.h            |  24 ++--
 libyara/regex/regex-pcre.c |  54 +++++++--
 libyara/regex/regex-re2.cc |  16 +--
 libyara/scan.c             | 289 +++++++++++++++++++--------------------------
 libyara/yara.h             |   1 -
 11 files changed, 195 insertions(+), 199 deletions(-)

diff --git a/libyara/ast.c b/libyara/ast.c
index 6d7f283..58c6b2a 100644
--- a/libyara/ast.c
+++ b/libyara/ast.c
@@ -458,7 +458,6 @@ int new_text_string(    YARA_CONTEXT* context,
     {
         if (regex_compile(re,  // REGEXP *
                           charstr->c_string,  // Regex pattern
-                          TRUE,  // Anchor the pattern to the first character when evaluating
                           flags & STRING_FLAGS_NO_CASE,  // If TRUE then case insensitive search
                           context->last_error_extra_info,  // Error message
                           sizeof(context->last_error_extra_info), // Size of error buffer
diff --git a/libyara/config.h b/libyara/config.h
index d27a9a0..3800d0a 100644
--- a/libyara/config.h
+++ b/libyara/config.h
@@ -67,4 +67,4 @@
 
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
-/* #undef YYTEXT_POINTER */
+#define YYTEXT_POINTER 1
diff --git a/libyara/eval.c b/libyara/eval.c
index cb6ea08..988c8a8 100644
--- a/libyara/eval.c
+++ b/libyara/eval.c
@@ -432,6 +432,7 @@ long long evaluate(TERM* term, EVALUATION_CONTEXT* context)
         
     case TERM_TYPE_STRING_MATCH:
         rc = regex_exec(&(term_string_operation->re),
+                        FALSE,
                         term_string_operation->variable->string,
                         strlen(term_string_operation->variable->string));
         return (rc >= 0);
diff --git a/libyara/grammar.c b/libyara/grammar.c
index e2dbdd6..f7f19c5 100644
--- a/libyara/grammar.c
+++ b/libyara/grammar.c
@@ -246,7 +246,7 @@ typedef union YYSTYPE
     void*           meta;
 
 }
-/* Line 187 of yacc.c.  */
+/* Line 193 of yacc.c.  */
 #line 251 "grammar.c"
 	YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -3210,7 +3210,6 @@ TERM* reduce_string_operation( yyscan_t yyscanner,
                     if (regex_compile(&(term->re),
                                       string->c_string,
                                       FALSE,
-                                      FALSE,
                                       context->last_error_extra_info,
                                       sizeof(context->last_error_extra_info),
                                       &erroffset) <= 0)
diff --git a/libyara/grammar.h b/libyara/grammar.h
index 0b15486..d26375b 100644
--- a/libyara/grammar.h
+++ b/libyara/grammar.h
@@ -182,7 +182,7 @@ typedef union YYSTYPE
     void*           meta;
 
 }
-/* Line 1489 of yacc.c.  */
+/* Line 1529 of yacc.c.  */
 #line 187 "grammar.h"
 	YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
diff --git a/libyara/grammar.y b/libyara/grammar.y
index be64201..a1b726e 100644
--- a/libyara/grammar.y
+++ b/libyara/grammar.y
@@ -1167,7 +1167,6 @@ TERM* reduce_string_operation( yyscan_t yyscanner,
                     if (regex_compile(&(term->re),
                                       string->c_string,
                                       FALSE,
-                                      FALSE,
                                       context->last_error_extra_info,
                                       sizeof(context->last_error_extra_info),
                                       &erroffset) <= 0)
diff --git a/libyara/regex.h b/libyara/regex.h
index a02c3dd..4610f9c 100644
--- a/libyara/regex.h
+++ b/libyara/regex.h
@@ -21,15 +21,23 @@ GNU General Public License for more details.
 extern "C" {
 #endif
 
+
 void regex_free(REGEXP* regex);
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size);
-int regex_compile(REGEXP* output,
-                  const char* pattern,
-                  int anchored,
-                  int case_insensitive,
-                  char* error_message,
-                  size_t error_message_size,
-                  int* error_offset);
+
+int regex_exec( REGEXP* regex, 
+                int anchored, 
+                const char *buffer, 
+                size_t buffer_size);
+
+int regex_compile(  REGEXP* output, 
+                    const char* pattern,
+                    int case_insensitive,
+                    char* error_message,
+                    size_t error_message_size,
+                    int* error_offset);
+                    
+int regex_get_first_bytes(  REGEXP* regex, 
+                            unsigned char* table);
 
 #ifdef __cplusplus
 }
diff --git a/libyara/regex/regex-pcre.c b/libyara/regex/regex-pcre.c
index 9cdbbde..56c362f 100644
--- a/libyara/regex/regex-pcre.c
+++ b/libyara/regex/regex-pcre.c
@@ -20,21 +20,25 @@ GNU General Public License for more details.
 #include "../yara.h"
 
 
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size) 
+int regex_exec(REGEXP* regex, int anchored, const char *buffer, size_t buffer_size) 
 {    
     int ovector[3];
     int result = -1;
+    int options = 0;
     char *s;
     
     if (!regex || buffer_size == 0)
         return 0;
+        
+    if (anchored)
+        options = PCRE_ANCHORED;
 
     result = pcre_exec( (pcre*)regex->regexp,           /* the compiled pattern */
                         (pcre_extra*)regex->extra,      /* extra data */
                         (char*) buffer,                 /* the subject string */
                         buffer_size,                    /* the length of the subject */
                         0,                              /* start at offset 0 in the subject */
-                        0,                              /* default options */
+                        options,                        /* options */
                         ovector,                        /* output vector for substring information */
                         sizeof(ovector)/sizeof(int));   /* number of elements in the output vector */
     
@@ -74,7 +78,6 @@ void regex_free(REGEXP* regex)
 
 int regex_compile(REGEXP* output,
                   const char* pattern,
-                  int anchored,
                   int case_insensitive,
                   char* error_message,
                   size_t error_message_size,
@@ -83,15 +86,12 @@ int regex_compile(REGEXP* output,
   
     int pcre_options = 0;
     char *pcre_error = NULL;
-                      
+                          
     if (!output || !pattern)
         return 0;
 
     memset(output, '\0', sizeof(REGEXP));
 
-    if (anchored)
-        pcre_options |= PCRE_ANCHORED;
-        
     if (case_insensitive)
         pcre_options |= PCRE_CASELESS;
 
@@ -99,8 +99,7 @@ int regex_compile(REGEXP* output,
   
     if (output->regexp != NULL) 
     {
-        output->extra = (pcre_extra *)pcre_study(
-        output->regexp, 0, (const char **)error_message);
+        output->extra = (pcre_extra *) pcre_study(output->regexp, 0, (const char **)error_message);
     } 
     else 
     {
@@ -116,3 +115,40 @@ int regex_compile(REGEXP* output,
 
     return 1;
 }
+
+
+
+int regex_get_first_bytes(  REGEXP* regex, 
+                            unsigned char* table)
+{
+    unsigned char* t;
+    
+    int i;
+    int b;
+    int result;
+    int count = 0;
+    
+    result = pcre_fullinfo(regex->regexp, regex->extra, PCRE_INFO_FIRSTTABLE, &t);
+    
+    if (result == 0 && t != NULL)
+    {        
+        for (i = 0; i < 256; i++)
+        {
+            if (t[i / 8] & (1 << i % 8))
+            {
+                table[count] = i; 
+                count++;
+            }
+        }
+    }
+    
+    result = pcre_fullinfo(regex->regexp, regex->extra, PCRE_INFO_FIRSTBYTE, &b);
+    
+    if (result == 0 && b > 0)
+    {   
+        table[count] = b;
+        count++;
+    }
+    
+    return count;
+}
diff --git a/libyara/regex/regex-re2.cc b/libyara/regex/regex-re2.cc
index ffe1db6..d349f22 100644
--- a/libyara/regex/regex-re2.cc
+++ b/libyara/regex/regex-re2.cc
@@ -20,7 +20,7 @@ GNU General Public License for more details.
 #include <re2/stringpiece.h>
 #include "yara.h"
 
-int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size) 
+int regex_exec(REGEXP* regex, int anchored, const char *buffer, size_t buffer_size) 
 {
     if (!regex || buffer_size == 0)
         return 0;
@@ -28,8 +28,8 @@ int regex_exec(REGEXP* regex, const char *buffer, size_t buffer_size)
     re2::StringPiece data(buffer, buffer_size);
     re2::StringPiece substring;
     re2::RE2::Anchor anchor = re2::RE2::UNANCHORED;
-
-    if (regex->re2_anchored)
+    
+    if (anchored)
         anchor = re2::RE2::ANCHOR_START;
 
     re2::RE2* re_ptr = (re2::RE2*) regex->regexp;
@@ -58,7 +58,6 @@ void regex_free(REGEXP* regex)
 
 int regex_compile(REGEXP* output,
                   const char* pattern,
-                  int anchored,
                   int case_insensitive,
                   char* error_message,
                   size_t error_message_size,
@@ -75,9 +74,6 @@ int regex_compile(REGEXP* output,
 
     if (case_insensitive)
         options.set_case_sensitive(false);
-    
-    if (anchored)
-        output->re2_anchored = anchored;
 
     re2::StringPiece string_piece_pattern(pattern);
     output->regexp = (void *)new RE2(string_piece_pattern, options);
@@ -106,3 +102,9 @@ int regex_compile(REGEXP* output,
     
     return 1;
 }
+
+int regex_get_first_bytes(  REGEXP* regex, 
+                            unsigned char* table)
+{
+    return 0;
+}
diff --git a/libyara/scan.c b/libyara/scan.c
index c68306c..c0ccba3 100644
--- a/libyara/scan.c
+++ b/libyara/scan.c
@@ -40,6 +40,7 @@ GNU General Public License for more details.
 
 
 static char lowercase[256];
+static char altercase[256];
 static char isalphanum[256];
 
 /* Function implementations */
@@ -324,7 +325,7 @@ int regexp_match(unsigned char* buffer, size_t buffer_size, unsigned char* patte
 		return 0;
 	}
 
-    result = regex_exec(&re, (char *)buffer, buffer_size);
+    result = regex_exec(&re, TRUE, (char *)buffer, buffer_size);
 
     if (result >= 0)
         return result;
@@ -337,15 +338,27 @@ int populate_hash_table(HASH_TABLE* hash_table, RULE_LIST* rule_list)
 	RULE* rule;
 	STRING* string;
 	STRING_LIST_ENTRY* entry;
-	unsigned char x,y;
-    char hashable_2b;
-    char hashable_1b;
-    int i, next;
+	
+    unsigned char first[256];
+    unsigned char second[2];
+    
+    unsigned char f;
+    unsigned char s;
+    
+    int fcount;
+	int scount;
+    
+    int i, j;
     
     for (i = 0; i < 256; i++)
     {
         lowercase[i] = tolower(i);
         isalphanum[i] = isalnum(i);
+        
+        if (lowercase[i] == i)
+            altercase[i] = toupper(i);
+        else
+            altercase[i] = lowercase[i];
     }
 		
 	rule = rule_list->head;
@@ -355,173 +368,113 @@ int populate_hash_table(HASH_TABLE* hash_table, RULE_LIST* rule_list)
 		string = rule->string_list_head;
 
 		while (string != NULL)
-		{	        
-			if (string->flags & STRING_FLAGS_REGEXP)
-			{	
-				/* take into account anchors (^) at beginning of regular expressions */
-							
-				if (string->string[0] == '^')
-				{
-				    if (string->length > 2)
-				    {
-					    x = string->string[1];
-					    y = string->string[2];
-					}
-					else
-					{
-                        x = 0;
-                        y = 0; 
-					}
-				}
-				else
-				{
-					x = string->string[0];
-					y = string->string[1];
-				}
-			
-                hashable_2b = isalphanum[x] && isalphanum[y];
-                hashable_1b = isalphanum[x];
-			}
-			else
-			{
-			    x = string->string[0];
-				y = string->string[1];
-				
-				hashable_2b = TRUE;
-				hashable_1b = TRUE;
-				
-			} /* if (string->flags & STRING_FLAGS_REGEXP) */
-			
-			if (string->flags & STRING_FLAGS_HEXADECIMAL)
-			{
-			    hashable_2b = (string->mask[0] == 0xFF) && (string->mask[1] == 0xFF);
-			    hashable_1b = (string->mask[0] == 0xFF);
-			}
-			
-			if (hashable_1b && string->flags & STRING_FLAGS_NO_CASE)
-			{	
-			    if (hashable_2b)
-			    {	    
-    			    /* 
-    			       if string is case-insensitive add an entry in the hash table
-    			       for each posible combination 
-    			    */
-			    
-    				x = lowercase[x];
-    				y = lowercase[y];
-				
-    				/* both lowercases */
-				
-    				entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
-        			    return ERROR_INSUFICIENT_MEMORY;
-    			    
-        			entry->next = hash_table->hashed_strings_2b[x][y];
-        			entry->string = string;
-        			hash_table->hashed_strings_2b[x][y] = entry;
-    			
-        			/* X uppercase Y lowercase */
-    			
-                    x = toupper(x);
-				
-    				entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
-                        return ERROR_INSUFICIENT_MEMORY;
-    			    
-            		entry->next = hash_table->hashed_strings_2b[x][y];  
-            		entry->string = string;
-            		hash_table->hashed_strings_2b[x][y] = entry; 
-        		
-            		/* both uppercases */			    
-    			
-        			y = toupper(y);  
-    			    
-        			entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
+		{	      
+            fcount = 0;
+            scount = 0;
+		    
+		    if (string->flags & STRING_FLAGS_REGEXP)
+            {				    			    
+            	if (string->string[0] == '^')
+            	{
+            	    if (string->length > 2)
+            	    {
+            		    f = string->string[1];
+            		    s = string->string[2];
+            		}
+            		else
+            		{
+                        f = string->string[1];
+                        s = 0; 
+            		}
+            	}
+            	else
+            	{
+            		f = string->string[0];
+            		s = string->string[1];
+            	}
+            	
+            	if (isalphanum[f])
+            	{
+            	    first[fcount++] = f;
+            	    
+        	        if (string->flags & STRING_FLAGS_NO_CASE)
+                        first[fcount++] = altercase[f];
+        	                	
+                	if (isalphanum[s])
+                	{
+                	    second[scount++] = s;    
+        	    
+            	        if (string->flags & STRING_FLAGS_NO_CASE)
+                            second[scount++] = altercase[s];
+            	    }
+        	    }
+        	    
+        	    if (fcount == 0)
+        	    {
+                    fcount += regex_get_first_bytes(&(string->re), first);
+        	    }
+            	
+            }
+            else if (string->flags & STRING_FLAGS_HEXADECIMAL)
+            {
+                if (string->mask[0] == 0xFF) 
+                    first[fcount++] = string->string[0];
+                
+                if (string->mask[1] == 0xFF);
+                    second[scount++] = string->string[1];
+            }
+            else 
+            {
+                first[fcount++] = string->string[0];
+                second[scount++] = string->string[1];
+                
+                if (string->flags & STRING_FLAGS_NO_CASE)
+                {
+                    first[fcount++] = altercase[string->string[0]];
+                    second[scount++] = altercase[string->string[1]];
+                }
+            }
+            
+            for (i = 0; i < fcount; i++)
+            {
+                for (j = 0; j < scount; j++)
+                {
+                    entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+            		if (entry == NULL)
                         return ERROR_INSUFICIENT_MEMORY;
-    			    
-            		entry->next = hash_table->hashed_strings_2b[x][y];
+                        
+                    entry->next = hash_table->hashed_strings_2b[first[i]][second[j]];
+                	entry->string = string;
+                	hash_table->hashed_strings_2b[first[i]][second[j]] = entry;
+                
+                }
+                
+                if (scount == 0)
+                {
+                    entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+            		if (entry == NULL)
+            		    return ERROR_INSUFICIENT_MEMORY;
+
+            		entry->next = hash_table->hashed_strings_1b[first[i]];
             		entry->string = string;
-            		hash_table->hashed_strings_2b[x][y] = entry;
-        		
-            		/* X lowercase Y uppercase */
-    			    
-                    x = lowercase[x];
- 
-        			entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
-                        return ERROR_INSUFICIENT_MEMORY;
-    			    
-            		entry->next = hash_table->hashed_strings_2b[x][y]; 
-            		entry->string = string; 
-            		hash_table->hashed_strings_2b[x][y] = entry;
-        		}
-        		else
-        		{
-        		    /* lowercase */
-    
-        		    x = lowercase[x];
-		
-    				entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
-        			    return ERROR_INSUFICIENT_MEMORY;
-    			    
-        			entry->next = hash_table->hashed_strings_1b[x];
-        			entry->string = string;
-        			hash_table->hashed_strings_1b[x] = entry;
-        			
-        			/* uppercase */
-    
-        		    x = toupper(x);
-		
-    				entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-    				if (entry == NULL)
-        			    return ERROR_INSUFICIENT_MEMORY;
-    			    
-        			entry->next = hash_table->hashed_strings_1b[x];
-        			entry->string = string;
-        			hash_table->hashed_strings_1b[x] = entry;
-        		}               
-    							
-			}
-			else if (hashable_1b)
-			{
-			    entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-			
-				if (entry == NULL)
+            		hash_table->hashed_strings_1b[first[i]] = entry;
+                }
+            }
+            
+            if (fcount == 0)
+            {
+                entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
+
+            	if (entry == NULL)
                     return ERROR_INSUFICIENT_MEMORY;
-                
+
+                entry->next = hash_table->non_hashed_strings;
                 entry->string = string; 
-			    
-			    if (hashable_2b)
-			    {   
-            		entry->next = hash_table->hashed_strings_2b[x][y]; 		
-            		hash_table->hashed_strings_2b[x][y] = entry;
-        		}
-        		else
-        		{    			    
-        			entry->next = hash_table->hashed_strings_1b[x];
-        			hash_table->hashed_strings_1b[x] = entry;
-        		}  
-			}
-			else /* non hashable */
-			{
-			    entry = (STRING_LIST_ENTRY*) yr_malloc(sizeof(STRING_LIST_ENTRY));
-				
-				if (entry == NULL)
-                    return ERROR_INSUFICIENT_MEMORY;
-			    
-			    entry->next = hash_table->non_hashed_strings;
-			    entry->string = string; 
                 hash_table->non_hashed_strings = entry;
-			}
-		
+            }
+            
 			string = string->next;
 		}
 		
@@ -678,7 +631,7 @@ inline int string_match(unsigned char* buffer, size_t buffer_size, STRING* strin
 		}
 		else
 		{
-			return regexp_match(buffer, buffer_size, string->string, string->length, string->re, negative_size);
+			return regexp_match(buffer, buffer_size, string->string, string->length, string->re, negative_size);   
 		}
 	}
 	
diff --git a/libyara/yara.h b/libyara/yara.h
index cb9bb37..35de6e6 100644
--- a/libyara/yara.h
+++ b/libyara/yara.h
@@ -127,7 +127,6 @@ typedef struct _REGEXP
 {
     void    *regexp;
     void    *extra;
-    int     re2_anchored;
     
 } REGEXP;
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/yara.git