[med-svn] [r-cran-fastmatch] 01/02: Imported Upstream version 1.0-4

Thu May 14 10:56:00 UTC 2015

This is an automated email from the git hooks/post-receive script.

albac-guest pushed a commit to branch master
in repository r-cran-fastmatch.

commit 4d6bc6c3e4955932663bc404ffaf647f9d666b2b
Author: Alba Crespi <crespialba+debian at gmail.com>
Date:   Thu May 14 01:00:00 2015 +0100

    Imported Upstream version 1.0-4
---
 DESCRIPTION     |  15 +++
 MD5             |   7 ++
 NAMESPACE       |   3 +
 NEWS            |  32 +++++
 R/fastmatch.R   |   2 +
 R/match.hash.R  |   7 ++
 man/fmatch.Rd   | 120 +++++++++++++++++++
 src/fastmatch.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 541 insertions(+)

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..a9e672e
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: fastmatch
+Version: 1.0-4
+Title: Fast match() function
+Author: Simon Urbanek <simon.urbanek at r-project.org>
+Maintainer: Simon Urbanek <simon.urbanek at r-project.org>
+Description: Package providing a fast match() replacement for cases
+        that require repeated look-ups. It is slightly faster that R's
+        built-in match() function on first match against a table, but
+        extremely fast on any subsequent lookup as it keeps the hash
+        table in memory.
+License: GPL-2
+URL: http://www.rforge.net/fastmatch
+Packaged: 2012-01-21 10:09:18 UTC; svnuser
+Repository: CRAN
+Date/Publication: 2012-01-21 10:22:24
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..28aa699
--- /dev/null
+++ b/MD5
@@ -0,0 +1,7 @@
+89f00fff119030016fece98c08b5040b *DESCRIPTION
+7dd3c164abc64183f0681eaf7b85d73e *NAMESPACE
+27e152f5450341fbb88d31cfbff45520 *NEWS
+770a7b76ccff6f95d86152999543269b *R/fastmatch.R
+ddc4a8e8795d9bc6be2c7d507b7e160b *R/match.hash.R
+1cf3221f784b90ed613d2454cc00a727 *man/fmatch.Rd
+632693d50dad9116f97f57578ee10502 *src/fastmatch.c
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..eceafca
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,3 @@
+useDynLib(fastmatch)
+export(fmatch)
+S3method(print, match.hash)
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..13dcc71
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,32 @@
+ NEWS for fastmatch
+--------------------
+
+0.1-4	2012-01-12
+    o	some R functions (such as subset assignment like x[1] <- 2)
+	can create a new object (with possibly modified content) and
+	copy all attributes including the hash cache. If the original
+	object was used as a table in fmatch(), the hash cache will be
+	copied into the modified object and thus its cache will be
+	possibly out of	sync with the object. fmatch() will now
+	identify such cases and discard the hash to prevent errorneous
+	results.
+
+0.1-3	2011-12-21
+    o	match() coerces POSIXlt objects into characters, but so far 
+	fmatch() performed the match on the actual objects.
+	Now fmatch() coerces POSIXlt object into characters just like
+	match(), but note that you will lose the ability to perform
+	fast lookups if the table is a POSIXlt object -- please use
+	POSIXct objects (much more efficient) or use as.character() on
+	the POSIXlt object to create a table that you want to re-use.
+
+0.1-2	2011-09-14
+    o	bugfix: nomatch was ignored in the fastmatch implementation
+	(thanks to Enrico Schumann for reporting)
+
+0.1-1	2010-12-23
+    o	minor cleanups
+
+0.1-0	2010-12-23
+    o	initial release
+
diff --git a/R/fastmatch.R b/R/fastmatch.R
new file mode 100644
index 0000000..fe32610
--- /dev/null
+++ b/R/fastmatch.R
@@ -0,0 +1,2 @@
+fmatch <- function(x, table, nomatch = NA_integer_, incomparables = NULL)
+  .Call("fmatch", x, table, nomatch, incomparables, PACKAGE = "fastmatch")
diff --git a/R/match.hash.R b/R/match.hash.R
new file mode 100644
index 0000000..be1c08a
--- /dev/null
+++ b/R/match.hash.R
@@ -0,0 +1,7 @@
+# match.hash is an infomal (S3) class representing the
+# chain of hash tables stored in the .match.hash attribute
+# of tables that have been hashed
+
+# we provide a (sort of dummy) print method so
+# the output is not as ugly
+print.match.hash <- function(x, ...) { cat("<hash table>\n"); x }
diff --git a/man/fmatch.Rd b/man/fmatch.Rd
new file mode 100644
index 0000000..03eef29
--- /dev/null
+++ b/man/fmatch.Rd
@@ -0,0 +1,120 @@
+\name{fmatch}
+\alias{fmatch}
+\alias{fastmatch}
+\title{
+Fast match() replacement
+}
+\description{
+\code{fmatch} is a faster version of the built-in \code{\link{match}}()
+function. It is slightly faster than the built-in version because it
+uses more specialized code, but in addition it retains the hash table
+within the table object such that it can be re-used, dramatically reducing
+the look-up time especially for large tables.
+
+Although \code{fmatch} can be used separately, in general it is also
+safe to use: \code{match <- fmatch} since it is a drop-in
+replacement. Any cases not directly handled by \code{fmatch} are passed
+to \code{match} with a warning.
+}
+\usage{
+fmatch(x, table, nomatch = NA_integer_, incomparables = NULL)
+}
+\arguments{
+  \item{x}{values to be matched}
+  \item{table}{values to be matched against}
+  \item{nomatch}{the value to be returned in the case when no match is
+    found. It is coerced to \code{integer}.}
+  \item{incomparables}{a vector of values that cannot be matched. Any
+    value other than \code{NULL} will result in a fall-back to
+    \code{match} without any speed gains.}
+}
+\details{
+  See \code{\link{match}} for the purpose and details of the
+  \code{match} function. \code{fmatch} is a drop-in replacement for
+  the \code{match} function with the focus on
+  performance. \code{incomparables} are not supported by \code{fmatch}
+  and will be passed down to \code{match}.
+
+  The first match against a table results in a hash table to be computed
+  from the table. This table is then attached as the `.match.hash`
+  attribute of the table so that it can be re-used on subsequent calls
+  to \code{fmatch} with the same table.
+
+  The hashing algorithm used is the same as the \code{match} function in
+  R, but it is re-implemented in a slight different way to improve its
+  performance at the cost of supporting only a subset of types (integer,
+  real and character). For any other types \code{fmatch} falls back to
+  \code{match} (with a warning).
+}
+\value{
+  A vector of the same length as \code{x} - see \code{\link{match}} for
+  details.
+}
+%\references{
+%}
+%\author{
+%%  ~~who you are~~
+%}
+\note{
+\code{fmatch} modifies the \code{table} by attaching an attribute to
+  it. It is expected that the values will not change unless that
+  attribute is dropped. Under normal circumstances this should not have
+  any effect from user's point of view, but there is a theoretical
+  chance of the cache being out of sync with the table in case the table
+  is modified directly (e.g. by some C code) without removing
+  attributes.
+
+  Also \code{fmatch} does not convert to a common encoding so strings
+  with different representation in two encodings don't match.
+}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+\seealso{
+\code{\link{match}}
+}
+\examples{
+# some random speed comparison examples:
+# first use integer matching
+x = as.integer(rnorm(1e6) * 1000000)
+s = 1:100
+# the first call to fmatch is comparable to match
+system.time(fmatch(s,x))
+# but the subsequent calls take no time!
+system.time(fmatch(s,x))
+system.time(fmatch(-50:50,x))
+system.time(fmatch(-5000:5000,x))
+# here is the speed of match for comparison
+system.time(base::match(s, x))
+# the results should be identical
+identical(base::match(s, x), fmatch(s, x))
+
+# next, match a factor against the table
+# this will require both x and the factor
+# to be cast to strings
+s=factor(c("1","1","2","foo","3",NA))
+# because the casting will have to allocate a string
+# cache in R, we run a dummy conversion to take
+# that out of the equation
+dummy = as.character(x)
+# now we can run the speed tests
+system.time(fmatch(s, x))
+system.time(fmatch(s, x))
+# the cache is still valid for string matches as well
+system.time(fmatch(c("foo","bar","1","2"),x))
+# now back to match
+system.time(base::match(s, x))
+identical(base::match(s, x), fmatch(s, x))
+
+# finally, some reals to match
+y = rnorm(1e6)
+s = c(y[sample(length(y), 100)], 123.567, NA, NaN)
+system.time(fmatch(s, y))
+system.time(fmatch(s, y))
+system.time(fmatch(s, y))
+system.time(base::match(s, y))
+identical(base::match(s, y), fmatch(s, y))
+
+# this used to fail before 0.1-2 since nomatch was ignored
+identical(base::match(4L, 1:3, nomatch=0), fmatch(4L, 1:3, nomatch=0))
+}
+\keyword{manip}
diff --git a/src/fastmatch.c b/src/fastmatch.c
new file mode 100644
index 0000000..f92b3e3
--- /dev/null
+++ b/src/fastmatch.c
@@ -0,0 +1,355 @@
+/*
+ *  fastmatch: fast implementation of match() in R using semi-permanent hash tables
+ *
+ *  Copyright (C) 2010, 2011  Simon Urbanek
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ */
+
+/* for speed (should not really matter in this case as most time is spent in the hashing) */
+#define USE_RINTERNALS 1
+#include <Rinternals.h>
+
+/* for malloc/free since we handle our hash table memory separately from R */
+#include <stdlib.h>
+/* for hashing for pointers we need intptr_t */
+#include <stdint.h>
+
+/* match5 to fall-back to R's internal match for types we don't support */
+SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env);
+
+/* ".match.hash" symbol - cached on first use */
+SEXP hs;
+
+typedef int hash_index_t;
+
+typedef struct hash {
+  int m, k, els, type;
+  void *src;
+  SEXP prot, parent;
+  struct hash *next;
+  hash_index_t ix[1];
+} hash_t;
+
+/* create a new hash table with the given source and length.
+   we store only the index - values are picked from the source 
+   so you must make sure the source is still alive when used */
+static hash_t *new_hash(void *src, hash_index_t len) {
+  hash_t *h;
+  hash_index_t m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */
+  while (m < desired) { m *= 2; k++; }
+  h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m));
+  if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0));
+  h->m = m;
+  h->k = k;
+  h->src = src;
+  return h;
+}
+
+/* free the hash table (and all chained hash tables as well) */
+static void free_hash(hash_t *h) {
+  if (h->next) free_hash(h->next);
+  if (h->prot) R_ReleaseObject(h->prot);
+  free(h);
+}
+
+/* R finalized for the hash table object */
+static void hash_fin(SEXP ho) {
+  hash_t *h = (hash_t*) EXTPTR_PTR(ho);
+  if (h) free_hash(h);
+}
+
+/* pi-hash fn */
+#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
+
+/* add the integer value at index i (0-based!) to the hash */
+static void add_hash_int(hash_t *h, hash_index_t i) {
+  int *src = (int*) h->src;
+  int val = src[i++], addr;
+  addr = HASH(val);
+#ifdef PROFILE_HASH
+  int oa = addr;
+#endif
+  while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+#ifdef PROFILE_HASH
+  if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+#endif
+  if (!h->ix[addr])
+    h->ix[addr] = i;
+}
+
+/* to avoid aliasing rules issues use a union */
+union dint_u {
+  double d;
+  unsigned int u[2];
+};
+
+/* add the double value at index i (0-based!) to the hash */
+static void add_hash_real(hash_t *h, hash_index_t i) {
+  double *src = (double*) h->src;
+  union dint_u val;
+  int addr;
+  /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
+  val.d = (src[i] == 0.0) ? 0.0 : src[i];
+  if (R_IsNA(val.d)) val.d = NA_REAL;
+  else if (R_IsNaN(val.d)) val.d = R_NaN;
+  addr = HASH(val.u[0]+ val.u[1]);
+#ifdef PROFILE_HASH
+  int oa = addr;
+#endif
+  while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+#ifdef PROFILE_HASH
+  if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa);
+#endif
+  if (!h->ix[addr])
+    h->ix[addr] = i + 1;
+}
+
+/* add the pointer value at index i (0-based!) to the hash */
+static void add_hash_ptr(hash_t *h, hash_index_t i) {
+  int addr;
+  void **src = (void**) h->src;
+  intptr_t val = (intptr_t) src[i++];
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+  addr = HASH(val);
+#endif
+#ifdef PROFILE_HASH
+  int oa = addr;
+#endif
+  while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+#ifdef PROFILE_HASH
+  if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+#endif
+  if (!h->ix[addr])
+    h->ix[addr] = i;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static int get_hash_int(hash_t *h, int val, int nmv) {
+  int *src = (int*) h->src;
+  int addr;
+  addr = HASH(val);
+  while (h->ix[addr]) {
+    if (src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr ++;
+    if (addr == h->m) addr = 0;
+  }
+  return nmv;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static int get_hash_real(hash_t *h, double val, int nmv) {
+  double *src = (double*) h->src;
+  int addr;
+  union dint_u val_u;
+  /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
+  if (val == 0.0) val = 0.0;
+  if (R_IsNA(val)) val = NA_REAL;
+  else if (R_IsNaN(val)) val = R_NaN;
+  val_u.d = val;
+  addr = HASH(val_u.u[0] + val_u.u[1]);
+  while (h->ix[addr]) {
+    if (src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+  return nmv;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static int get_hash_ptr(hash_t *h, void *val_ptr, int nmv) {
+  void **src = (void **) h->src;
+  intptr_t val = (intptr_t) val_ptr;
+  int addr;
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+  addr = HASH(val);
+#endif
+  while (h->ix[addr]) {
+    if ((intptr_t) src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr ++;
+    if (addr == h->m) addr = 0;
+  }
+  return nmv;
+}
+
+static SEXP asCharacter(SEXP s, SEXP env)
+{
+  SEXP call, r;
+  PROTECT(call = lang2(install("as.character"), s));
+  PROTECT(r = eval(call, env));
+  UNPROTECT(2);
+  return r;
+}
+
+
+/* the only externally visible function to be called from R */
+SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp) {
+  SEXP a;
+  SEXPTYPE type;
+  hash_t *h = 0;
+  int nmv = asInteger(nonmatch), n = LENGTH(x), np = 0, y_to_char = 0, y_factor = 0;
+
+  /* edge-cases of 0 length */
+  if (n == 0) return allocVector(INTSXP, 0);
+  if (LENGTH(y) == 0) { /* empty table -> vector full of nmv */
+    int *ai;
+    a = allocVector(INTSXP, n);
+    ai = INTEGER(a);
+    for (np = 0; np < n; np++) ai[np] = nmv;
+    return a;
+  }
+
+  /* if incomparables are used we fall back straight to match() */
+  if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) {
+    Rf_warning("incomparables used in fmatch(), falling back to match()");
+    return match5(y, x, nmv, incomp, R_BaseEnv);
+  }
+
+  /* implicitly convert factors/POSIXlt to character */
+  if (OBJECT(x)) {
+    if (inherits(x, "factor")) {
+      x = PROTECT(asCharacterFactor(x));
+      np++;
+    } else if (inherits(x, "POSIXlt")) {
+      x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
+      np++;
+    }
+  }
+
+  /* for y we may need to do that later */
+  y_factor = OBJECT(y) && inherits(y, "factor");
+  y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt"));
+
+  /* coerce to common type - in the order of SEXP types */
+  if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP)
+    type = STRSXP;
+  else
+    type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x);
+
+  /* we only support INT/REAL/STR */
+  if (type != INTSXP && type != REALSXP && type != STRSXP) {
+    Rf_warning("incompatible type, fastmatch() is falling back to match()");
+    return match5(y, x, nmv, NULL, R_BaseEnv);
+  }
+
+  if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */
+    type = STRSXP;
+
+  /* coerce x - not y yet because we may get away with the existing cache */
+  if (TYPEOF(x) != type) {
+    x = PROTECT(coerceVector(x, type));
+    np++;
+  }
+
+  /* find existing cache(s) */
+  if (!hs) hs = Rf_install(".match.hash");
+  a = Rf_getAttrib(y, hs);
+  if (a != R_NilValue) { /* if there is a cache, try to find the matching type */
+    h = (hash_t*) EXTPTR_PTR(a);
+    /* could the object be out of sync ? If so, better remove the hash and ignore it */
+    if (h->parent != y) {
+#if HASH_VERBOSE
+      Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n");
+#endif
+      h = 0;
+      Rf_setAttrib(y, hs, R_NilValue);
+    }
+    while (h && h->type != type) h = h->next;
+  }
+  /* if there is no cache or not of the needed coerced type, create one */
+  if (a == R_NilValue || !h) {
+    h = new_hash(DATAPTR(y), LENGTH(y));
+    h->type = type;
+    h->parent = y;
+#if HASH_VERBOSE
+    Rprintf(" - creating new hash for type %d\n", type);
+#endif
+    if (a == R_NilValue) { /* if there is no cache attribute, create one */
+      a = R_MakeExternalPtr(h, R_NilValue, R_NilValue);
+      Rf_setAttrib(y, hs, a);
+      Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash"));
+      R_RegisterCFinalizer(a, hash_fin);
+    } else { /* otherwise append the new cache */
+      hash_t *lh = (hash_t*) EXTPTR_PTR(a);
+      while (lh->next) lh = lh->next;
+      lh->next = h;
+#if HASH_VERBOSE
+      Rprintf("   (appended to the cache list)\n");
+#endif
+    }
+
+    if (TYPEOF(y) != type) {
+#if HASH_VERBOSE
+      if (y_to_char)
+	Rprintf("   (need to convert table factor/POSIXlt to strings\n");
+      else
+	Rprintf("   (need to coerce table to %d)\n", type);
+#endif
+      y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type);
+      h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */
+      h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */
+      R_PreserveObject(y);
+    }
+    /* make sure y doesn't go away while we create the hash */
+    /* R_PreserveObject(y);     */
+    /* spawn a thread to create the hash */
+    /* nope - so far we do it serially */
+
+    { /* create the hash table */
+      int i, n = LENGTH(y);
+      if (type == INTSXP)
+	for(i = 0; i < n; i++)
+	  add_hash_int(h, i);
+      else if (type == REALSXP)
+	for(i = 0; i < n; i++)
+	  add_hash_real(h, i);
+      else
+	for(i = 0; i < n; i++)
+	  add_hash_ptr(h, i);
+    }
+  }
+
+  { /* query the hash table */
+    int i, n = LENGTH(x);
+    SEXP r = allocVector(INTSXP, n);
+    int *v = INTEGER(r);
+    if (type == INTSXP) {
+      int *k = INTEGER(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_int(h, k[i], nmv);
+    } else if (type == REALSXP) {
+      double *k = REAL(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_real(h, k[i], nmv);
+    } else {
+      SEXP *k = (SEXP*) DATAPTR(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_ptr(h, k[i], nmv);
+    }
+    if (np) UNPROTECT(np);
+    return r;
+  }
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-fastmatch.git