[med-svn] [r-cran-fastmatch] 02/11: New upstream version 1.1-0

Mon Oct 23 17:38:09 UTC 2017

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository r-cran-fastmatch.

commit 268d61bff196f8fc38f8856b23992f09f8c1d901
Author: Andreas Tille <tille at debian.org>
Date:   Mon Oct 23 19:27:33 2017 +0200

    New upstream version 1.1-0
---
 DESCRIPTION     |  15 +-
 MD5             |  20 +-
 NAMESPACE       |   4 +-
 NEWS            |  34 +++-
 R/coalesce.R    |   1 +
 R/ctapply.R     |   1 +
 R/fastmatch.R   |   8 +-
 R/hash.R        |   7 +
 man/coalesce.Rd |  67 +++++++
 man/ctapply.Rd  |  57 ++++++
 man/fmatch.Rd   |  42 +++-
 src/common.h    |  29 +++
 src/ctapply.c   |  93 +++++++++
 src/fasthash.c  | 481 +++++++++++++++++++++++++++++++++++++++++++++
 src/fastmatch.c | 594 ++++++++++++++++++++++++++++++++++----------------------
 15 files changed, 1192 insertions(+), 261 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a9e672e..67a0f50 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,15 +1,16 @@
 Package: fastmatch
-Version: 1.0-4
+Version: 1.1-0
 Title: Fast match() function
 Author: Simon Urbanek <simon.urbanek at r-project.org>
 Maintainer: Simon Urbanek <simon.urbanek at r-project.org>
 Description: Package providing a fast match() replacement for cases
-        that require repeated look-ups. It is slightly faster that R's
-        built-in match() function on first match against a table, but
-        extremely fast on any subsequent lookup as it keeps the hash
-        table in memory.
+	that require repeated look-ups. It is slightly faster that R's
+	built-in match() function on first match against a table, but
+	extremely fast on any subsequent lookup as it keeps the hash
+	table in memory.
 License: GPL-2
 URL: http://www.rforge.net/fastmatch
-Packaged: 2012-01-21 10:09:18 UTC; svnuser
+NeedsCompilation: yes
+Packaged: 2017-01-28 14:18:51 UTC; svnuser
 Repository: CRAN
-Date/Publication: 2012-01-21 10:22:24
+Date/Publication: 2017-01-28 17:37:09
diff --git a/MD5 b/MD5
index 28aa699..1a6a557 100644
--- a/MD5
+++ b/MD5
@@ -1,7 +1,15 @@
-89f00fff119030016fece98c08b5040b *DESCRIPTION
-7dd3c164abc64183f0681eaf7b85d73e *NAMESPACE
-27e152f5450341fbb88d31cfbff45520 *NEWS
-770a7b76ccff6f95d86152999543269b *R/fastmatch.R
+9599fb099644bde527af6f7e9df71105 *DESCRIPTION
+7e7ec63e6925cc4435d98adf39c5e26d *NAMESPACE
+51291a35ca1fc791fa0ab55e3e9ea21f *NEWS
+f89bb99f16073fd87eb20e3a366e0d2e *R/coalesce.R
+c55a081862af768fb0493109ec5b898d *R/ctapply.R
+aa671c24c5486532d61366bd014ecb26 *R/fastmatch.R
+c4f0cdd605049cd165c501d88bd4f51f *R/hash.R
 ddc4a8e8795d9bc6be2c7d507b7e160b *R/match.hash.R
-1cf3221f784b90ed613d2454cc00a727 *man/fmatch.Rd
-632693d50dad9116f97f57578ee10502 *src/fastmatch.c
+f61d17ec420b9ada0a40d24277999f3b *man/coalesce.Rd
+b8d381ce543a5aa2a7f4421bf6c1cbdf *man/ctapply.Rd
+0837fb690176702f0fcd16d2abdc0668 *man/fmatch.Rd
+18e74f8e423543a4004ffbda4eeae2a1 *src/common.h
+e16cc450c8e002f1b819261653706f88 *src/ctapply.c
+461b9689226fc1550c15289a52c5896a *src/fasthash.c
+1f61304c0b28d2b837dbd8c446aac94e *src/fastmatch.c
diff --git a/NAMESPACE b/NAMESPACE
index eceafca..2b5aa55 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,3 +1,3 @@
-useDynLib(fastmatch)
-export(fmatch)
+useDynLib(fastmatch, C_fmatch = fmatch, C_ctapply = ctapply, C_coalesce = coalesce, C_append = append, mk_hash, get_table, get_values)
+export(fmatch, fmatch.hash, ctapply, coalesce, "%fin%")
 S3method(print, match.hash)
diff --git a/NEWS b/NEWS
index 13dcc71..c211aaa 100644
--- a/NEWS
+++ b/NEWS
@@ -1,7 +1,31 @@
  NEWS for fastmatch
 --------------------
 
-0.1-4	2012-01-12
+1.1-0	(under development)
+    o	add fmatch.hash() which will create a hash table that can
+	be used later with fmatch(). This can be used in cases where
+	attaching the hash to the table implicitly is not reliable.
+
+    o	added ctapply() - a fast version of tapply()
+
+    o	added coalesce() - fast way of grouping unique values into
+	contiguous groups (in linear time).
+
+    o	added %fin% - a fast version of %in%
+
+    o	fastmatch now supports long vectors. Note that the hash
+	function is the same as in R and thus it uses at most 32-bits,
+	hence long vectors can be used, but they must have less than
+	2^32 (~4e9) unique values.
+
+    o	bugfix: matching reals against a table that contains NA or NaNs
+	would not match the position of those but return NA instead.
+
+    o	bugfix: fix crash when a newly unserialized hash table is
+	used (since the table hash is not stored during serialization).
+
+
+1.0-4	2012-01-12
     o	some R functions (such as subset assignment like x[1] <- 2)
 	can create a new object (with possibly modified content) and
 	copy all attributes including the hash cache. If the original
@@ -11,7 +35,7 @@
 	identify such cases and discard the hash to prevent errorneous
 	results.
 
-0.1-3	2011-12-21
+1.0-3	2011-12-21
     o	match() coerces POSIXlt objects into characters, but so far 
 	fmatch() performed the match on the actual objects.
 	Now fmatch() coerces POSIXlt object into characters just like
@@ -20,13 +44,13 @@
 	POSIXct objects (much more efficient) or use as.character() on
 	the POSIXlt object to create a table that you want to re-use.
 
-0.1-2	2011-09-14
+1.0-2	2011-09-14
     o	bugfix: nomatch was ignored in the fastmatch implementation
 	(thanks to Enrico Schumann for reporting)
 
-0.1-1	2010-12-23
+1.0-1	2010-12-23
     o	minor cleanups
 
-0.1-0	2010-12-23
+1.0-0	2010-12-23
     o	initial release
 
diff --git a/R/coalesce.R b/R/coalesce.R
new file mode 100644
index 0000000..680772e
--- /dev/null
+++ b/R/coalesce.R
@@ -0,0 +1 @@
+coalesce <- function(x) .Call(C_coalesce, x)
diff --git a/R/ctapply.R b/R/ctapply.R
new file mode 100644
index 0000000..6b0e171
--- /dev/null
+++ b/R/ctapply.R
@@ -0,0 +1 @@
+ctapply <- function(X, INDEX, FUN, ..., MERGE=c) .External(C_ctapply, parent.frame(), X, INDEX, FUN, MERGE, ...)
diff --git a/R/fastmatch.R b/R/fastmatch.R
index fe32610..1458da0 100644
--- a/R/fastmatch.R
+++ b/R/fastmatch.R
@@ -1,2 +1,8 @@
 fmatch <- function(x, table, nomatch = NA_integer_, incomparables = NULL)
-  .Call("fmatch", x, table, nomatch, incomparables, PACKAGE = "fastmatch")
+  .Call(C_fmatch, x, table, nomatch, incomparables, FALSE)
+
+fmatch.hash <- function(x, table, nomatch = NA_integer_, incomparables = NULL)
+  .Call(C_fmatch, x, table, nomatch, incomparables, TRUE)
+
+`%fin%` <- function (x, table)
+  .Call(C_fmatch, x, table, 0L, NULL, FALSE) > 0L
diff --git a/R/hash.R b/R/hash.R
new file mode 100644
index 0000000..7687792
--- /dev/null
+++ b/R/hash.R
@@ -0,0 +1,7 @@
+mk.hash <- function(x, size=256L, index=FALSE, values=NULL) .Call(mk_hash, x, index, size, values)
+
+levels.fasthash <- function(x) .Call(get_table, x)
+
+map.values <- function(hash, keys) .Call(get_values, hash, keys)
+
+append.hash <- function(hash, x, index=TRUE, values=NULL) .Call(C_append, hash, x, index, values)
diff --git a/man/coalesce.Rd b/man/coalesce.Rd
new file mode 100644
index 0000000..3a106aa
--- /dev/null
+++ b/man/coalesce.Rd
@@ -0,0 +1,67 @@
+\name{coalesce}
+\alias{coalesce}
+\title{
+  Create an index that groups unique values together
+}
+\description{
+  \code{coalesce} makes sure that a given index vector is coalesced,
+  i.e., identical values are grouped into contiguous blocks. This can be
+  used as a much faster alternative to \code{\link{sort.list}} where the
+  goal is to group identical values, but not necessarily in a
+  pre-defined order. The algorithm is linear in the length of the vector.
+}
+\usage{
+  coalesce(x)
+}
+\arguments{
+  \item{x}{character, integer or real vector to coalesce}
+}
+\details{
+  The current implementation takes two passes through the vector. In the
+  first pass it creates a hash table for the values of \code{x} counting
+  the occurrences in the process. In the second pass it assigns indices
+  for every element based on the index stored in the hash table.
+
+  The order of the groups of unique values is defined by the first
+  occurence of each unique value, hence it is identical to the order of
+  \code{\link{unique}}.
+
+  One common use of \code{coalesce} is to allow the use of arbitrary
+  vectors in \code{\link{ctapply}} via
+  \code{ctapply(x[coalesce(x)], ...)}.
+}
+\value{
+  Integer vector with the resulting permutation. \code{x[coalesce(x)]}
+  gives \code{x} with contiguous unique values.
+}
+%\references{
+%}
+\author{
+Simon Urbanek
+}
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+\seealso{
+\code{\link{unique}}, \code{\link{sort.list}}, \code{\link{ctapply}}
+}
+\examples{
+i = rnorm(2e6)
+names(i) = as.integer(rnorm(2e6))
+## compare sorting and coalesce
+system.time(o <- i[order(names(i))])
+system.time(o <- i[coalesce(names(i))])
+
+## more fair comparison taking the coalesce time (and copy) into account
+system.time(tapply(i, names(i), sum))
+system.time({ o <- i[coalesce(names(i))]; ctapply(o, names(o), sum) })
+
+## in fact, using ctapply() on a dummy vector is faster than table() ...
+## believe it or not ... (that that is actually wasteful, since coalesce
+## already computed the table internally anyway ...)
+ftable <- function(x) {
+   t <- ctapply(rep(0L, length(x)), x[coalesce(x)], length)
+   t[sort.list(names(t))]
+}
+system.time(table(names(i)))
+system.time(ftable(names(i)))
+}
+\keyword{manip}
diff --git a/man/ctapply.Rd b/man/ctapply.Rd
new file mode 100644
index 0000000..bbad4a5
--- /dev/null
+++ b/man/ctapply.Rd
@@ -0,0 +1,57 @@
+\name{ctapply}
+\alias{ctapply}
+\title{
+Fast tapply() replacement functions
+}
+\description{
+  \code{ctapply} is a fast replacement of \code{tapply} that assumes
+  contiguous input, i.e. unique values in the index are never speparated
+  by any other values. This avoids an expensive \code{split} step since
+  both value and the index chungs can be created on the fly. It also
+  cuts a few corners to allow very efficient copying of values. This
+  makes it many orders of magnitude faster than the classical
+  \code{lapply(split(), ...)} implementation.
+}
+\usage{
+ctapply(X, INDEX, FUN, ..., MERGE=c)
+}
+\arguments{
+  \item{X}{an atomic object, typically a vector}
+  \item{INDEX}{numeric or character vector of the same length as \code{X}}
+  \item{FUN}{the function to be applied}
+  \item{...}{additional arguments to \code{FUN}. They are passed as-is,
+    i.e., without replication or recycling}
+  \item{MERGE}{function to merge the resulting vector or \code{NULL} if
+    the arguments to such a functiona re to be returned instead}
+}
+\details{
+  Note that \code{ctapply} supports either integer, real or character
+  vectors as indices (note that factors are integer vectors and thus
+  supported, but you do not need to convert character vectors). Unlike
+  \code{tapply} it does not take a list of factors - if you want to use
+  a cross-product of factors, create the product first, e.g. using
+  \code{paste(i1, i2, i3, sep='\01')} or multiplication - whetever
+  method is convenient for the input types.
+
+  \code{ctapply} requires the \code{INDEX} to contiguous. One (slow) way
+  to achieve that is to use \code{\link{sort}} or \code{\link{order}}.
+}
+%\value{
+%}
+%\references{
+%}
+\author{
+Simon Urbanek
+}
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+\seealso{
+\code{\link{tapply}}
+}
+\examples{
+i = rnorm(4e6)
+names(i) = as.integer(rnorm(1e6))
+i = i[order(names(i))]
+system.time(tapply(i, names(i), sum))
+system.time(ctapply(i, names(i), sum))
+}
+\keyword{manip}
diff --git a/man/fmatch.Rd b/man/fmatch.Rd
index 03eef29..b22172d 100644
--- a/man/fmatch.Rd
+++ b/man/fmatch.Rd
@@ -1,5 +1,7 @@
 \name{fmatch}
 \alias{fmatch}
+\alias{\%fin\%}
+\alias{fmatch.hash}
 \alias{fastmatch}
 \title{
 Fast match() replacement
@@ -15,9 +17,19 @@ Although \code{fmatch} can be used separately, in general it is also
 safe to use: \code{match <- fmatch} since it is a drop-in
 replacement. Any cases not directly handled by \code{fmatch} are passed
 to \code{match} with a warning.
+
+\code{fmatch.hash} is identical to \code{fmatch} but it returns the table
+object with the hash table attached instead of the result, so it can be
+used to create a table object in cases where direct modification is
+not possible.
+
+\code{\%fin\%} is a version of the built-in \code{\link{\%in\%}} function
+that uses \code{fmatch} instead of \code{\link{match}}().
 }
 \usage{
 fmatch(x, table, nomatch = NA_integer_, incomparables = NULL)
+fmatch.hash(x, table, nomatch = NA_integer_, incomparables = NULL)
+x \%fin\% table
 }
 \arguments{
   \item{x}{values to be matched}
@@ -36,25 +48,32 @@ fmatch(x, table, nomatch = NA_integer_, incomparables = NULL)
   and will be passed down to \code{match}.
 
   The first match against a table results in a hash table to be computed
-  from the table. This table is then attached as the `.match.hash`
+  from the table. This table is then attached as the \code{".match.hash"}
   attribute of the table so that it can be re-used on subsequent calls
   to \code{fmatch} with the same table.
 
   The hashing algorithm used is the same as the \code{match} function in
-  R, but it is re-implemented in a slight different way to improve its
+  R, but it is re-implemented in a slightly different way to improve its
   performance at the cost of supporting only a subset of types (integer,
   real and character). For any other types \code{fmatch} falls back to
   \code{match} (with a warning).
 }
 \value{
-  A vector of the same length as \code{x} - see \code{\link{match}} for
-  details.
+  \code{fmatch}: A vector of the same length as \code{x} - see
+  \code{\link{match}} for details.
+
+  \code{fmatch.hash}: \code{table}, possibly coerced to match the type
+  of \code{x}, with the hash table attached.
+
+  \code{\%fin\%}: A logical vector the same length as \code{x} - see
+  \code{\link{\%in\%}} for details.
+
 }
 %\references{
 %}
-%\author{
-%%  ~~who you are~~
-%}
+\author{
+Simon Urbanek
+}
 \note{
 \code{fmatch} modifies the \code{table} by attaching an attribute to
   it. It is expected that the values will not change unless that
@@ -64,6 +83,12 @@ fmatch(x, table, nomatch = NA_integer_, incomparables = NULL)
   is modified directly (e.g. by some C code) without removing
   attributes.
 
+  In cases where the \code{table} object cannot be modified (or such
+  modification would not survive) \code{fmatch.hash} can be used to build
+  the hash table and return \code{table} object including the hash
+  table. In that case no lookup is done and \code{x} is only used to
+  determine the type into which \code{table} needs to be coerced.
+  
   Also \code{fmatch} does not convert to a common encoding so strings
   with different representation in two encodings don't match.
 }
@@ -91,7 +116,7 @@ identical(base::match(s, x), fmatch(s, x))
 # next, match a factor against the table
 # this will require both x and the factor
 # to be cast to strings
-s=factor(c("1","1","2","foo","3",NA))
+s = factor(c("1","1","2","foo","3",NA))
 # because the casting will have to allocate a string
 # cache in R, we run a dummy conversion to take
 # that out of the equation
@@ -118,3 +143,4 @@ identical(base::match(s, y), fmatch(s, y))
 identical(base::match(4L, 1:3, nomatch=0), fmatch(4L, 1:3, nomatch=0))
 }
 \keyword{manip}
+\keyword{logic}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..50f4e9d
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,29 @@
+/* fastmatch - common types */
+
+#ifndef FM_COMMON_H__
+#define FM_COMMON_H__
+
+/* for speed (should not really matter in this case as most time is spent in the hashing) */
+#define USE_RINTERNALS 1
+#include <Rinternals.h>
+
+#ifndef XLENGTH /* for compatibility with old R */
+#define XLENGTH(X) LENGTH(X)
+#define IS_LONG_VEC(X) 0
+typedef R_len_t R_xlen_t;
+#endif
+
+/* hash_index_t is big enough to cover long vectors */
+#ifdef LONG_VECTOR_SUPPORT
+typedef R_xlen_t hash_index_t;
+#else
+typedef int hash_index_t;
+#endif
+
+/* hashes are always 32-bit -- this is for compatibility with
+   the hash function used in R.
+   This means that long vectors are fine, but they may not have
+   more than 2^32 - 1 unique values */
+typedef unsigned int hash_value_t;
+
+#endif
diff --git a/src/ctapply.c b/src/ctapply.c
new file mode 100644
index 0000000..3b7c8b3
--- /dev/null
+++ b/src/ctapply.c
@@ -0,0 +1,93 @@
+#include <stdlib.h>
+#include <string.h>
+
+#define USE_RINTERNALS 1
+#include <Rinternals.h>
+
+#define MIN_CACHE 128
+
+SEXP ctapply_(SEXP args) {
+    SEXP rho, vec, by, fun, mfun, cdi = 0, cdv = 0, tmp, acc, tail;
+    int i = 0, n, cdlen;
+    
+    args = CDR(args);
+    rho = CAR(args); args = CDR(args);    
+    vec = CAR(args); args = CDR(args);
+    by  = CAR(args); args = CDR(args);
+    fun = CAR(args); args = CDR(args);
+    mfun= CAR(args); args = CDR(args);
+    tmp = PROTECT(allocVector(VECSXP, 3));
+    acc = 0;
+    if (TYPEOF(by) != INTSXP && TYPEOF(by) != REALSXP && TYPEOF(by) != STRSXP)
+	Rf_error("INDEX must be either integer, real or character vector");
+    if (TYPEOF(vec) != INTSXP && TYPEOF(vec) != REALSXP && TYPEOF(vec) != STRSXP && TYPEOF(vec) != VECSXP)
+	Rf_error("X must be either integer, real, character or generic vector (list)");
+    
+    if ((n = LENGTH(vec)) != LENGTH(by)) Rf_error("X and INDEX must have the same length");
+    while (i < n) {
+	int i0 = i, N;
+	SEXP eres;
+	/* find the contiguous stretch */
+	while (++i < n) {
+	    if ((TYPEOF(by) == INTSXP && INTEGER(by)[i] != INTEGER(by)[i - 1]) ||
+		(TYPEOF(by) == STRSXP && STRING_ELT(by, i) != STRING_ELT(by, i - 1)) ||
+		(TYPEOF(by) == REALSXP && REAL(by)[i] != REAL(by)[i - 1]))
+		break;
+	}
+	/* [i0, i - 1] is the interval to run on */
+	N = i - i0;
+	/* allocate cache for both the vector and index */
+	if (!cdi) {
+	    /* we have to guarantee named > 0 since we'll be modifying it in-place */
+	    SET_NAMED(cdi = SET_VECTOR_ELT(tmp, 0, allocVector(TYPEOF(by), (cdlen = ((N < MIN_CACHE) ? MIN_CACHE : N)))), 1);
+	    SET_NAMED(cdv = SET_VECTOR_ELT(tmp, 1, allocVector(TYPEOF(vec), cdlen)), 1);
+	} else if (cdlen < N) {
+	    SET_NAMED(cdi = SET_VECTOR_ELT(tmp, 0, allocVector(TYPEOF(by), (cdlen = N))), 1);
+	    SET_NAMED(cdv = SET_VECTOR_ELT(tmp, 1, allocVector(TYPEOF(vec), cdlen)), 1);
+	}
+	SETLENGTH(cdi, N);
+	SETLENGTH(cdv, N);
+	/* copy the index slice */
+	if (TYPEOF(by) == INTSXP) memcpy(INTEGER(cdi), INTEGER(by) + i0, sizeof(int) * N);
+	else if (TYPEOF(by) == REALSXP) memcpy(REAL(cdi), REAL(by) + i0, sizeof(double) * N);
+	else if (TYPEOF(by) == STRSXP) memcpy(STRING_PTR(cdi), STRING_PTR(by) + i0, sizeof(SEXP) * N);
+	/* copy the vector slice */
+	if (TYPEOF(vec) == INTSXP) memcpy(INTEGER(cdv), INTEGER(vec) + i0, sizeof(int) * N);
+	else if (TYPEOF(vec) == REALSXP) memcpy(REAL(cdv), REAL(vec) + i0, sizeof(double) * N);
+	else if (TYPEOF(vec) == STRSXP) memcpy(STRING_PTR(cdv), STRING_PTR(vec) + i0, sizeof(SEXP) * N);
+	else if (TYPEOF(vec) == VECSXP) memcpy(VECTOR_PTR(cdv), VECTOR_PTR(vec) + i0, sizeof(SEXP) * N);
+	eres = eval(PROTECT(LCONS(fun, CONS(cdv, args))), rho);
+	UNPROTECT(1); /* eval arg */
+	/* if the result has NAMED > 1 then we have to duplicate it
+	   see ctapply(x, y, identity). It should be uncommon, though
+	   since most functions will return newly allocated objects
+
+	   FIXME: check NAMED == 1 -- may also be bad if the reference is outside,
+	   but then NAMED1 should be duplicated before modification so I think we're safe
+	*/
+	/* Rprintf("NAMED(eres)=%d\n", NAMED(eres)); */
+	if (NAMED(eres) > 1) eres = duplicate(eres);
+	PROTECT(eres);
+	if (!acc) tail = acc = SET_VECTOR_ELT(tmp, 2, list1(eres));
+	else tail = SETCDR(tail, list1(eres));
+	{
+	    char cbuf[64];
+	    const char *name = "";
+	    if (TYPEOF(by) == STRSXP) name = CHAR(STRING_ELT(by, i0));
+	    else if (TYPEOF(by) == INTSXP) {
+		snprintf(cbuf, sizeof(cbuf), "%d", INTEGER(by)[i0]);
+		name = cbuf;
+	    } else { /* FIXME: this one is not consistent with R ... */
+		snprintf(cbuf, sizeof(cbuf), "%g", REAL(by)[i0]);
+		name = cbuf;
+	    }
+	    SET_TAG(tail, install(name));
+	}
+	UNPROTECT(1); /* eres */
+    }
+    UNPROTECT(1); /* tmp */
+    if (!acc) return R_NilValue;
+    acc = eval(PROTECT(LCONS(mfun, acc)), rho);
+    UNPROTECT(1);
+    return acc;
+}
diff --git a/src/fasthash.c b/src/fasthash.c
new file mode 100644
index 0000000..d9f55ed
--- /dev/null
+++ b/src/fasthash.c
@@ -0,0 +1,481 @@
+/*
+ *  fasthash: hash table
+ *  This is very similar to fastmatch except that the payload
+ *  is stored in the hash table as well and thus can be used to
+ *  append values
+ *
+ *  Copyright (C) 2013  Simon Urbanek
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ */
+
+#include "common.h"
+
+/* for malloc/free since we handle our hash table memory separately from R */
+#include <stdlib.h>
+/* for hashing for pointers we need intptr_t */
+#include <stdint.h>
+/* for memcpy */
+#include <string.h>
+
+typedef struct hash {
+    hash_index_t m, els;   /* hash size, added elements */
+    hash_index_t max_load; /* max. load - resize when reached */
+    int k, type;           /* used bits, payload type */
+    void *src;             /* the data array of the hashed object */
+    SEXP prot;             /* object to protect along whith this hash */
+    SEXP parent;           /* hashed object */
+    SEXP vals;             /* values vector if used as key/value storage */
+    struct hash *next;
+    hash_index_t ix[1];
+} hash_t;
+
+#define MAX_LOAD 0.85
+
+/* create a new hash table with the given type and length.
+   Implicitly calls allocVector(type, len) to create the storage
+   of the newly added hash values
+   NOTE: len is the *hash* size, so it should be 2 * length(unique(x))
+   It will be rounded up to the next power of 2 */
+static hash_t *new_hash(SEXPTYPE type, hash_index_t len) {
+    hash_t *h;
+    int k = 8; /* force a minimal size of 256 */
+    hash_index_t m = 1 << k;    
+    hash_index_t max_load;
+    SEXP keys;
+    while (m < len) { m *= 2; k++; }
+    max_load = (hash_index_t) (((double) m) * MAX_LOAD);
+    keys = allocVector(type, max_load);
+    h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m));
+    if (!h)
+	Rf_error("unable to allocate %.2fMb for a hash table",
+		 (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0));
+    h->parent = keys;
+    h->max_load = max_load;
+    R_PreserveObject(h->parent);
+    h->m = m;
+    h->k = k;
+    h->src = DATAPTR(h->parent);
+    h->type = type;
+    return h;
+}
+
+/* free the hash table (and all chained hash tables as well) */
+static void free_hash(hash_t *h) {
+    if (h->next) free_hash(h->next);
+    if (h->prot) R_ReleaseObject(h->prot);
+    R_ReleaseObject(h->parent);
+    free(h);
+}
+
+/* R finalized for the hash table object */
+static void hash_fin(SEXP ho) {
+    hash_t *h = (hash_t*) EXTPTR_PTR(ho);
+    if (h) free_hash(h);
+}
+
+/* pi-hash fn */
+#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
+
+static int INCEL(hash_t *h) {
+    if (h->els == h->max_load)
+	Rf_error("Maximal hash load reached, resizing is currently unimplemented");
+    return h->els++;
+}
+
+/* add an integer value to the hash */
+static int add_hash_int(hash_t *h, int val) {
+    int *src = (int*) h->src;
+    hash_index_t addr = HASH(val);
+#ifdef PROFILE_HASH
+    int oa = addr;
+#endif
+    while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+#ifdef PROFILE_HASH
+    if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+#endif
+    if (!h->ix[addr]) {
+	src[INCEL(h)] = val;
+	h->ix[addr] = h->els;
+    }
+    return addr;
+}
+
+/* to avoid aliasing rules issues use a union */
+union dint_u {
+    double d;
+    unsigned int u[2];
+};
+
+/* add the double value at index i (0-based!) to the hash */
+static int add_hash_real(hash_t *h, double val_) {
+    double *src = (double*) h->src;
+    union dint_u val;
+    int addr;
+    /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
+    val.d = (val_ == 0.0) ? 0.0 : val_;
+    if (R_IsNA(val.d)) val.d = NA_REAL;
+    else if (R_IsNaN(val.d)) val.d = R_NaN;
+    addr = HASH(val.u[0] + val.u[1]);
+#ifdef PROFILE_HASH
+    int oa = addr;
+#endif
+    while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+#ifdef PROFILE_HASH
+    if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa);
+#endif
+    if (!h->ix[addr]) {
+	src[INCEL(h)] = val.d;
+	h->ix[addr] = h->els;
+    }
+    return addr;
+}
+
+/* add a R object to the hash */
+static int add_hash_obj(hash_t *h, SEXP val) {
+    int addr;
+    SEXP *src = (SEXP*) h->src;
+    intptr_t val_i = (intptr_t) val;
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+    addr = HASH((val_i & 0xffffffff) ^ (val_i >> 32));
+#else
+    addr = HASH(val_i);
+#endif
+#ifdef PROFILE_HASH
+    int oa = addr;
+#endif
+    while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+#ifdef PROFILE_HASH
+    if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+#endif
+    if (!h->ix[addr]) {
+	src[INCEL(h)] = val;
+	h->ix[addr] = h->els;
+    }
+    return addr;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static hash_index_t get_hash_int(hash_t *h, int val) {
+    int *src = (int*) h->src;
+    hash_index_t addr;
+    addr = HASH(val);
+    while (h->ix[addr]) {
+	if (src[h->ix[addr] - 1] == val)
+	    return h->ix[addr];
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+    return 0;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static hash_index_t get_hash_real(hash_t *h, double val) {
+    double *src = (double*) h->src;
+    hash_index_t addr;
+    union dint_u val_u;
+    /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
+    if (val == 0.0) val = 0.0;
+    if (R_IsNA(val)) val = NA_REAL;
+    else if (R_IsNaN(val)) val = R_NaN;
+    val_u.d = val;
+    addr = HASH(val_u.u[0] + val_u.u[1]);
+    while (h->ix[addr]) {
+	if (src[h->ix[addr] - 1] == val)
+	    return h->ix[addr];
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+    return 0;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+static int get_hash_obj(hash_t *h, SEXP val_ptr) {
+    SEXP *src = (SEXP *) h->src;
+    intptr_t val = (intptr_t) val_ptr;
+    hash_index_t addr;
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+    addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+    addr = HASH(val);
+#endif
+    while (h->ix[addr]) {
+	if ((intptr_t) src[h->ix[addr] - 1] == val)
+	    return h->ix[addr];
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+    return 0;
+}
+
+static SEXP asCharacter(SEXP s, SEXP env)
+{
+  SEXP call, r;
+  PROTECT(call = lang2(install("as.character"), s));
+  r = eval(call, env);
+  UNPROTECT(1);
+  return r;
+}
+
+/* there are really three modes:
+   1) if vals in non-NULL then h->vals are populated with the
+      values from vals corresponding to x as the keys
+   2) if ix is non-NULL then ix is is populated with the
+      indices into the hash table (1-based)
+   3) if both are NULL then only the hash table is built */
+static void append_hash(hash_t *h, SEXP x, int *ix, SEXP vals) {
+    SEXPTYPE type = TYPEOF(x);
+    R_xlen_t i, n = XLENGTH(x);
+    if (type == INTSXP) {
+	int *iv = INTEGER(x);
+	if (vals)
+	    for(i = 0; i < n; i++)
+		SET_VECTOR_ELT(h->vals, h->ix[add_hash_int(h, iv[i])] - 1, VECTOR_ELT(vals, i));
+	else if (ix)
+	    for(i = 0; i < n; i++)
+		ix[i] = h->ix[add_hash_int(h, iv[i])];
+	else
+	    for(i = 0; i < n; i++)
+		add_hash_int(h, iv[i]);
+    } else if (type == REALSXP) {
+	double *dv = REAL(x);
+	if (vals)
+	    for(i = 0; i < n; i++)
+		SET_VECTOR_ELT(h->vals, h->ix[add_hash_real(h, dv[i])] - 1, VECTOR_ELT(vals, i));
+	else if (ix)
+	    for(i = 0; i < n; i++)
+		ix[i] = h->ix[add_hash_real(h, dv[i])];
+	else
+	    for(i = 0; i < n; i++)
+		add_hash_real(h, dv[i]);
+    } else {
+	SEXP *sv = (SEXP*) DATAPTR(x);
+	if (vals)
+	    for(i = 0; i < n; i++)
+		SET_VECTOR_ELT(h->vals, h->ix[add_hash_obj(h, sv[i])] - 1, VECTOR_ELT(vals, i));
+	else if (ix)
+	    for(i = 0; i < n; i++)
+		ix[i] = h->ix[add_hash_obj(h, sv[i])];
+	else
+	    for(i = 0; i < n; i++)
+		add_hash_obj(h, sv[i]);
+    }
+}
+
+static hash_t *unwrap(SEXP ht) {
+    hash_t *h;
+    if (!inherits(ht, "fasthash"))
+	Rf_error("Invalid hash object");
+    h = (hash_t*) EXTPTR_PTR(ht);
+    if (!h) /* FIXME: we should just rebuild the hash ... */
+	Rf_error("Hash object is NULL - probably unserialized?");
+    return h;
+}
+
+static SEXP chk_vals(SEXP vals, SEXP keys) {
+    /* FIXME: requiring vals to be a list is not very flexible, but the
+              easiest to implement. Anything else complicates the
+	      append_hash() function enormously and would require
+	      a separate solution for each combination of key and value types
+    */
+    if (vals == R_NilValue)
+	vals = 0;
+    else {
+	if (TYPEOF(vals) != VECSXP)
+	    Rf_error("`values' must be a list");
+	if (XLENGTH(vals) != XLENGTH(keys))
+	    Rf_error("keys and values vectors must have the same length");
+    }
+    return vals;
+}
+
+static void setval(SEXP res, R_xlen_t i, hash_index_t ix, SEXP vals)
+{
+    SET_VECTOR_ELT(res, i, (ix == 0) ? R_NilValue : VECTOR_ELT(vals, ix - 1));
+}
+
+/*---- API visible form R ----*/
+
+SEXP mk_hash(SEXP x, SEXP sGetIndex, SEXP sValueEst, SEXP vals) {
+    SEXP a, six;
+    SEXPTYPE type;
+    hash_t *h = 0;
+    int np = 0, get_index = asInteger(sGetIndex) == 1;
+    int *ix = 0;
+    hash_index_t val_est = 0;
+
+    if (TYPEOF(sValueEst) == REALSXP) {
+	double ve = REAL(sValueEst)[0];
+	if (ve < 0 || R_IsNaN(ve))
+	    Rf_error("Invalid value count estimate, must be positive or NA");
+	if (R_IsNA(ve)) ve = 0.0;
+	val_est = ve;
+    } else {
+	int ve = asInteger(sValueEst);
+	if (ve == NA_INTEGER) ve = 0;
+	if (ve < 0)
+	    Rf_error("Invalid value count estimate, must be positive or NA");
+	val_est = ve;
+    }
+
+    vals = chk_vals(vals, x);
+
+    /* implicitly convert factors/POSIXlt to character */
+    if (OBJECT(x)) {
+	if (inherits(x, "factor")) {
+	    x = PROTECT(asCharacterFactor(x));
+	    np++;
+	} else if (inherits(x, "POSIXlt")) {
+	    x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
+	    np++;
+	}
+    }
+    type = TYPEOF(x);
+
+    /* we only support INT/REAL/STR */
+    if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP)
+	Rf_error("Currently supported types are integer, real, chracter vectors and lists");
+
+    if (get_index) {
+	ix = INTEGER(six = PROTECT(allocVector(INTSXP, XLENGTH(x))));
+	np++;
+    }
+
+    /* FIXME: determine the proper hash size */
+    if (!val_est) val_est = XLENGTH(x);
+    /* check for overflow */
+    if (val_est * 2 > val_est) val_est *= 2; 
+    
+    h = new_hash(TYPEOF(x), val_est);
+    a = PROTECT(R_MakeExternalPtr(h, R_NilValue, R_NilValue));
+    Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("fasthash"));
+    if (ix)
+	Rf_setAttrib(a, install("index"), six);
+    R_RegisterCFinalizer(a, hash_fin);
+    np++;
+    
+#if HASH_VERBOSE
+    Rprintf(" - creating new hash for type %d\n", type);
+#endif
+    append_hash(h, x, ix, vals);
+    UNPROTECT(np);
+    return a;
+}
+
+SEXP append(SEXP ht, SEXP x, SEXP sGetIndex, SEXP vals) {
+    SEXP six;
+    SEXPTYPE type;
+    hash_t *h = 0;
+    int np = 0;
+    int *ix = 0;
+    int get_index = (asInteger(sGetIndex) == 1);
+
+    h = unwrap(ht);
+
+    vals = chk_vals(vals, x);
+
+    /* implicitly convert factors/POSIXlt to character */
+    if (OBJECT(x)) {
+	if (inherits(x, "factor")) {
+	    x = PROTECT(asCharacterFactor(x));
+	    np++;
+	} else if (inherits(x, "POSIXlt")) {
+	    x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
+	    np++;
+	}
+    }
+    type = TYPEOF(x);
+
+    /* we only support INT/REAL/STR */
+    if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP)
+	Rf_error("Currently supported types are integer, real, chracter vectors and lists");
+
+    if (get_index) { /* FIXME: long vec support? */
+	ix = INTEGER(six = PROTECT(allocVector(INTSXP, LENGTH(x))));
+	np++;
+    }
+
+    append_hash(h, x, ix, vals);
+    if (np) UNPROTECT(np);
+    return ix ? six : ht;
+}
+
+SEXP get_table(SEXP ht) {
+    R_len_t n;
+    R_xlen_t sz = sizeof(int);
+    SEXP res;
+    hash_t *h = unwrap(ht);
+    n = h->els;
+    res = allocVector(h->type, n);
+    if (h->type == REALSXP) sz = sizeof(double);
+    else if (h->type != INTSXP) sz = sizeof(SEXP);
+    sz *= n;
+    memcpy(DATAPTR(res), DATAPTR(h->parent), sz);
+    return res;
+}
+
+SEXP get_values(SEXP ht, SEXP x) {
+    SEXP res;
+    SEXPTYPE type;
+    hash_t *h = 0;
+    int np = 0;
+
+    h = unwrap(ht);
+
+    if (!h->vals)
+	Rf_error("This is not a key/value hash table");
+    
+    /* implicitly convert factors/POSIXlt to character */
+    if (OBJECT(x)) {
+	if (inherits(x, "factor")) {
+	    x = PROTECT(asCharacterFactor(x));
+	    np++;
+	} else if (inherits(x, "POSIXlt")) {
+	    x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
+	    np++;
+	}
+    }
+    type = TYPEOF(x);
+
+    /* we only support INT/REAL/STR */
+    if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP)
+	Rf_error("Currently supported types are integer, real, chracter vectors and lists");
+    
+    {
+	R_xlen_t i, n = XLENGTH(x);
+	res = PROTECT(allocVector(VECSXP, n));
+	np++;
+	
+	if (type == INTSXP) {
+	    int *iv = INTEGER(x);
+	    for (i = 0; i < n; i++)
+		setval(res, i, get_hash_int(h, iv[i]), h->vals);
+	} else if (type == REALSXP) {
+	    double *rv = REAL(x);
+	    for (i = 0; i < n; i++)
+		setval(res, i, get_hash_real(h, rv[i]), h->vals);
+	} else {
+	    SEXP *rv = (SEXP*) DATAPTR(x);
+	    for (i = 0; i < n; i++)
+		setval(res, i, get_hash_obj(h, rv[i]), h->vals);
+	}
+    }
+    UNPROTECT(np);
+    return res;
+}
diff --git a/src/fastmatch.c b/src/fastmatch.c
index f92b3e3..ead735f 100644
--- a/src/fastmatch.c
+++ b/src/fastmatch.c
@@ -13,12 +13,11 @@
  *  GNU General Public License for more details.
  */
 
-/* for speed (should not really matter in this case as most time is spent in the hashing) */
-#define USE_RINTERNALS 1
-#include <Rinternals.h>
+#include "common.h"
 
 /* for malloc/free since we handle our hash table memory separately from R */
 #include <stdlib.h>
+#include <string.h>
 /* for hashing for pointers we need intptr_t */
 #include <stdint.h>
 
@@ -28,14 +27,15 @@ SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env);
 /* ".match.hash" symbol - cached on first use */
 SEXP hs;
 
-typedef int hash_index_t;
-
 typedef struct hash {
-  int m, k, els, type;
-  void *src;
-  SEXP prot, parent;
-  struct hash *next;
-  hash_index_t ix[1];
+    hash_index_t m, els; /* hash size, added elements (unused!) */
+    int k;               /* used bits */
+    SEXPTYPE type;       /* payload type */
+    void *src;           /* the data array of the hashed object */
+    SEXP prot;           /* object to protect along whith this hash */
+    SEXP parent;         /* hashed object */
+    struct hash *next;   /* next hash table - typically for another type */
+    hash_index_t ix[1];  /* actual table of indices */
 } hash_t;
 
 /* create a new hash table with the given source and length.
@@ -43,10 +43,11 @@ typedef struct hash {
    so you must make sure the source is still alive when used */
 static hash_t *new_hash(void *src, hash_index_t len) {
   hash_t *h;
-  hash_index_t m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */
+  int k = 1;
+  hash_index_t m = 2, desired = len * 2; /* we want a maximal load of 50% */
   while (m < desired) { m *= 2; k++; }
   h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m));
-  if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0));
+  if (!h) Rf_error("unable to allocate %.2fMb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0));
   h->m = m;
   h->k = k;
   h->src = src;
@@ -70,286 +71,415 @@ static void hash_fin(SEXP ho) {
 #define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
 
 /* add the integer value at index i (0-based!) to the hash */
-static void add_hash_int(hash_t *h, hash_index_t i) {
-  int *src = (int*) h->src;
-  int val = src[i++], addr;
-  addr = HASH(val);
+static hash_value_t add_hash_int(hash_t *h, hash_index_t i) {
+    int *src = (int*) h->src;
+    int val = src[i++];
+    hash_value_t addr = HASH(val);
 #ifdef PROFILE_HASH
-  int oa = addr;
+    hash_value_t oa = addr;
 #endif
-  while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
+    while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
 #ifdef PROFILE_HASH
-  if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+    if (addr != oa) Rprintf("%d: dist=%d (addr=%d, oa=%d)\n", val,
+			   (int) (addr - oa), (int) addr, (int) oa);
 #endif
-  if (!h->ix[addr])
-    h->ix[addr] = i;
+    if (!h->ix[addr])
+	h->ix[addr] = i;
+    return addr;
 }
 
 /* to avoid aliasing rules issues use a union */
 union dint_u {
-  double d;
-  unsigned int u[2];
+    double d;
+    unsigned int u[2];
 };
 
 /* add the double value at index i (0-based!) to the hash */
-static void add_hash_real(hash_t *h, hash_index_t i) {
-  double *src = (double*) h->src;
-  union dint_u val;
-  int addr;
-  /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
-  val.d = (src[i] == 0.0) ? 0.0 : src[i];
-  if (R_IsNA(val.d)) val.d = NA_REAL;
-  else if (R_IsNaN(val.d)) val.d = R_NaN;
-  addr = HASH(val.u[0]+ val.u[1]);
+static hash_value_t add_hash_real(hash_t *h, hash_index_t i) {
+    double *src = (double*) h->src;
+    union dint_u val;
+    hash_value_t addr;
+    /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
+    val.d = (src[i] == 0.0) ? 0.0 : src[i];
+    if (R_IsNA(val.d)) val.d = NA_REAL;
+    else if (R_IsNaN(val.d)) val.d = R_NaN;
+    addr = HASH(val.u[0]+ val.u[1]);
 #ifdef PROFILE_HASH
-  int oa = addr;
+    hash_value_t oa = addr;
 #endif
-  while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
+    while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
 #ifdef PROFILE_HASH
-  if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa);
+    if (addr != oa)
+	Rprintf("%g: dist=%d (addr=%d, oa=%d)\n", val.d,
+	       (int) (addr - oa), (int)addr, (int)oa);
 #endif
-  if (!h->ix[addr])
-    h->ix[addr] = i + 1;
+    if (!h->ix[addr])
+	h->ix[addr] = i + 1;
+    return addr;
 }
 
 /* add the pointer value at index i (0-based!) to the hash */
-static void add_hash_ptr(hash_t *h, hash_index_t i) {
-  int addr;
-  void **src = (void**) h->src;
-  intptr_t val = (intptr_t) src[i++];
+static int add_hash_ptr(hash_t *h, hash_index_t i) {
+    hash_value_t addr;
+    void **src = (void**) h->src;
+    intptr_t val = (intptr_t) src[i++];
 #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
-  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+    addr = HASH((val & 0xffffffff) ^ (val >> 32));
 #else
-  addr = HASH(val);
+    addr = HASH(val);
 #endif
 #ifdef PROFILE_HASH
-  int oa = addr;
+    hash_value_t oa = addr;
 #endif
-  while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
+    while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
 #ifdef PROFILE_HASH
-  if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa);
+    if (addr != oa)
+	Rprintf("%p: dist=%d (addr=%d, oa=%d)\n", val,
+		(int)(addr - oa), (int)addr, (int)oa);
 #endif
-  if (!h->ix[addr])
-    h->ix[addr] = i;
+    if (!h->ix[addr])
+	h->ix[addr] = i;
+    return addr;
 }
 
 /* NOTE: we are returning a 1-based index ! */
-static int get_hash_int(hash_t *h, int val, int nmv) {
-  int *src = (int*) h->src;
-  int addr;
-  addr = HASH(val);
-  while (h->ix[addr]) {
-    if (src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr ++;
-    if (addr == h->m) addr = 0;
-  }
-  return nmv;
+static hash_index_t get_hash_int(hash_t *h, int val, int nmv) {
+    int *src = (int*) h->src;
+    hash_value_t addr = HASH(val);
+    while (h->ix[addr]) {
+	if (src[h->ix[addr] - 1] == val)
+	    return h->ix[addr];
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+    return nmv;
 }
 
 /* NOTE: we are returning a 1-based index ! */
-static int get_hash_real(hash_t *h, double val, int nmv) {
-  double *src = (double*) h->src;
-  int addr;
-  union dint_u val_u;
-  /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
-  if (val == 0.0) val = 0.0;
-  if (R_IsNA(val)) val = NA_REAL;
-  else if (R_IsNaN(val)) val = R_NaN;
-  val_u.d = val;
-  addr = HASH(val_u.u[0] + val_u.u[1]);
-  while (h->ix[addr]) {
-    if (src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
-  return nmv;
+static hash_index_t get_hash_real(hash_t *h, double val, int nmv) {
+    double *src = (double*) h->src;
+    hash_value_t addr;
+    union dint_u val_u;
+    /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
+    if (val == 0.0) val = 0.0;
+    if (R_IsNA(val)) val = NA_REAL;
+    else if (R_IsNaN(val)) val = R_NaN;
+    val_u.d = val;
+    addr = HASH(val_u.u[0] + val_u.u[1]);
+    while (h->ix[addr]) {
+	if (!memcmp(&src[h->ix[addr] - 1], &val, sizeof(val)))
+	    return h->ix[addr];
+	addr++;
+	if (addr == h->m) addr = 0;
+    }
+    return nmv;
 }
 
 /* NOTE: we are returning a 1-based index ! */
-static int get_hash_ptr(hash_t *h, void *val_ptr, int nmv) {
-  void **src = (void **) h->src;
-  intptr_t val = (intptr_t) val_ptr;
-  int addr;
+static hash_index_t get_hash_ptr(hash_t *h, void *val_ptr, int nmv) {
+    void **src = (void **) h->src;
+    intptr_t val = (intptr_t) val_ptr;
+    hash_value_t addr;
 #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
-  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+    addr = HASH((val & 0xffffffff) ^ (val >> 32));
 #else
-  addr = HASH(val);
+    addr = HASH(val);
 #endif
-  while (h->ix[addr]) {
-    if ((intptr_t) src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr ++;
-    if (addr == h->m) addr = 0;
-  }
-  return nmv;
+    while (h->ix[addr]) {
+	if ((intptr_t) src[h->ix[addr] - 1] == val)
+	    return h->ix[addr];
+	addr ++;
+	if (addr == h->m) addr = 0;
+    }
+    return nmv;
 }
 
 static SEXP asCharacter(SEXP s, SEXP env)
 {
-  SEXP call, r;
-  PROTECT(call = lang2(install("as.character"), s));
-  PROTECT(r = eval(call, env));
-  UNPROTECT(2);
-  return r;
+    SEXP call, r;
+    PROTECT(call = lang2(install("as.character"), s));
+    r = eval(call, env);
+    UNPROTECT(1);
+    return r;
 }
 
+static double NA_int2real(hash_index_t res) {
+    return (res == NA_INTEGER) ? R_NaReal : ((double)  res);
+}
 
 /* the only externally visible function to be called from R */
-SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp) {
-  SEXP a;
-  SEXPTYPE type;
-  hash_t *h = 0;
-  int nmv = asInteger(nonmatch), n = LENGTH(x), np = 0, y_to_char = 0, y_factor = 0;
-
-  /* edge-cases of 0 length */
-  if (n == 0) return allocVector(INTSXP, 0);
-  if (LENGTH(y) == 0) { /* empty table -> vector full of nmv */
-    int *ai;
-    a = allocVector(INTSXP, n);
-    ai = INTEGER(a);
-    for (np = 0; np < n; np++) ai[np] = nmv;
-    return a;
-  }
-
-  /* if incomparables are used we fall back straight to match() */
-  if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) {
-    Rf_warning("incomparables used in fmatch(), falling back to match()");
-    return match5(y, x, nmv, incomp, R_BaseEnv);
-  }
+SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp, SEXP hashOnly) {
+    SEXP a;
+    SEXPTYPE type;
+    hash_t *h = 0;
+    int nmv = asInteger(nonmatch), np = 0, y_to_char = 0, y_factor = 0, hash_only = asInteger(hashOnly);
+    hash_index_t n = XLENGTH(x);
+
+    /* edge-cases of 0 length */
+    if (n == 0) return allocVector(INTSXP, 0);
+    if (XLENGTH(y) == 0) { /* empty table -> vector full of nmv */
+	int *ai;
+	hash_index_t ii;
+	a = allocVector(INTSXP, n);
+	ai = INTEGER(a);
+	for (ii = 0; ii < n; ii++) ai[ii] = nmv;
+	return a;
+    }
+
+    /* if incomparables are used we fall back straight to match() */
+    if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) {
+	Rf_warning("incomparables used in fmatch(), falling back to match()");
+	return match5(y, x, nmv, incomp, R_BaseEnv);
+    }
 
   /* implicitly convert factors/POSIXlt to character */
-  if (OBJECT(x)) {
-    if (inherits(x, "factor")) {
-      x = PROTECT(asCharacterFactor(x));
-      np++;
-    } else if (inherits(x, "POSIXlt")) {
-      x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
-      np++;
+    if (OBJECT(x)) {
+	if (inherits(x, "factor")) {
+	    x = PROTECT(asCharacterFactor(x));
+	    np++;
+	} else if (inherits(x, "POSIXlt")) {
+	    x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */
+	    np++;
+	}
+    }
+
+    /* for y we may need to do that later */
+    y_factor = OBJECT(y) && inherits(y, "factor");
+    y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt"));
+    
+    /* coerce to common type - in the order of SEXP types */
+    if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP)
+	type = STRSXP;
+    else
+	type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x);
+    
+    /* we only support INT/REAL/STR */
+    if (type != INTSXP && type != REALSXP && type != STRSXP) {
+	Rf_warning("incompatible type, fastmatch() is falling back to match()");
+	return match5(y, x, nmv, NULL, R_BaseEnv);
     }
-  }
-
-  /* for y we may need to do that later */
-  y_factor = OBJECT(y) && inherits(y, "factor");
-  y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt"));
-
-  /* coerce to common type - in the order of SEXP types */
-  if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP)
-    type = STRSXP;
-  else
-    type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x);
-
-  /* we only support INT/REAL/STR */
-  if (type != INTSXP && type != REALSXP && type != STRSXP) {
-    Rf_warning("incompatible type, fastmatch() is falling back to match()");
-    return match5(y, x, nmv, NULL, R_BaseEnv);
-  }
-
-  if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */
-    type = STRSXP;
-
-  /* coerce x - not y yet because we may get away with the existing cache */
-  if (TYPEOF(x) != type) {
-    x = PROTECT(coerceVector(x, type));
-    np++;
-  }
-
-  /* find existing cache(s) */
-  if (!hs) hs = Rf_install(".match.hash");
-  a = Rf_getAttrib(y, hs);
-  if (a != R_NilValue) { /* if there is a cache, try to find the matching type */
-    h = (hash_t*) EXTPTR_PTR(a);
-    /* could the object be out of sync ? If so, better remove the hash and ignore it */
-    if (h->parent != y) {
+
+    if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */
+	type = STRSXP;
+
+    /* coerce x - not y yet because we may get away with the existing cache */
+    if (TYPEOF(x) != type) {
+	x = PROTECT(coerceVector(x, type));
+	np++;
+    }
+
+    /* find existing cache(s) */
+    if (!hs) hs = Rf_install(".match.hash");
+    a = Rf_getAttrib(y, hs);
+    if (a != R_NilValue) { /* if there is a cache, try to find the matching type */
+	h = (hash_t*) EXTPTR_PTR(a);
+	/* could the object be out of sync ? If so, better remove the hash and ignore it */
+	if (!h || h->parent != y) {
 #if HASH_VERBOSE
-      Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n");
+	    Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n");
 #endif
-      h = 0;
-      Rf_setAttrib(y, hs, R_NilValue);
+	    h = 0;
+	    Rf_setAttrib(y, hs, R_NilValue);
+	}
+	while (h && h->type != type) h = h->next;
     }
-    while (h && h->type != type) h = h->next;
-  }
-  /* if there is no cache or not of the needed coerced type, create one */
-  if (a == R_NilValue || !h) {
-    h = new_hash(DATAPTR(y), LENGTH(y));
-    h->type = type;
-    h->parent = y;
+    /* if there is no cache or not of the needed coerced type, create one */
+    if (a == R_NilValue || !h) {
+	h = new_hash(DATAPTR(y), XLENGTH(y));
+	h->type = type;
+	h->parent = y;
 #if HASH_VERBOSE
-    Rprintf(" - creating new hash for type %d\n", type);
+	Rprintf(" - creating new hash for type %d\n", type);
 #endif
-    if (a == R_NilValue) { /* if there is no cache attribute, create one */
-      a = R_MakeExternalPtr(h, R_NilValue, R_NilValue);
-      Rf_setAttrib(y, hs, a);
-      Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash"));
-      R_RegisterCFinalizer(a, hash_fin);
-    } else { /* otherwise append the new cache */
-      hash_t *lh = (hash_t*) EXTPTR_PTR(a);
-      while (lh->next) lh = lh->next;
-      lh->next = h;
+	if (a == R_NilValue || !EXTPTR_PTR(a)) { /* if there is no cache attribute, create one */
+	    a = R_MakeExternalPtr(h, R_NilValue, R_NilValue);
+	    Rf_setAttrib(y, hs, a);
+	    Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash"));
+	    R_RegisterCFinalizer(a, hash_fin);
+	} else { /* otherwise append the new cache */
+	    hash_t *lh = (hash_t*) EXTPTR_PTR(a);
+	    while (lh->next) lh = lh->next;
+	    lh->next = h;
 #if HASH_VERBOSE
-      Rprintf("   (appended to the cache list)\n");
+	    Rprintf("   (appended to the cache list)\n");
 #endif
-    }
+	}
 
-    if (TYPEOF(y) != type) {
+	if (TYPEOF(y) != type) {
 #if HASH_VERBOSE
-      if (y_to_char)
-	Rprintf("   (need to convert table factor/POSIXlt to strings\n");
-      else
-	Rprintf("   (need to coerce table to %d)\n", type);
+	    if (y_to_char)
+		Rprintf("   (need to convert table factor/POSIXlt to strings\n");
+	    else
+		Rprintf("   (need to coerce table to %d)\n", type);
 #endif
-      y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type);
-      h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */
-      h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */
-      R_PreserveObject(y);
+	    y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type);
+	    h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */
+	    h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */
+	    R_PreserveObject(y);
+	}
+	/* make sure y doesn't go away while we create the hash */
+	/* R_PreserveObject(y);     */
+	/* spawn a thread to create the hash */
+	/* nope - so far we do it serially */
+	
+	{ /* create the hash table */
+	    hash_index_t i, n = XLENGTH(y);
+	    if (type == INTSXP)
+		for(i = 0; i < n; i++)
+		    add_hash_int(h, i);
+	    else if (type == REALSXP)
+		for(i = 0; i < n; i++)
+		    add_hash_real(h, i);
+	    else
+		for(i = 0; i < n; i++)
+		    add_hash_ptr(h, i);
+	}
     }
-    /* make sure y doesn't go away while we create the hash */
-    /* R_PreserveObject(y);     */
-    /* spawn a thread to create the hash */
-    /* nope - so far we do it serially */
-
-    { /* create the hash table */
-      int i, n = LENGTH(y);
-      if (type == INTSXP)
+
+    if (hash_only) {
+	if (np) UNPROTECT(np);
+	return y;
+    }
+
+    { /* query the hash table */
+	SEXP r;
+#ifdef LONG_VECTOR_SUPPORT
+	if (IS_LONG_VEC(x)) {
+	    hash_index_t i, n = XLENGTH(x);
+	    double *v = REAL(r = allocVector(REALSXP, n));
+	    if (nmv == NA_INTEGER) {
+		/* we have to treat nmv = NA differently,
+		   because is has to be transformed into
+		   NA_REAL in the result. To avoid checking
+		   when nmv is different, we have two paths */
+		if (type == INTSXP) {
+		    int *k = INTEGER(x);
+		    for (i = 0; i < n; i++)
+			v[i] = NA_int2real(get_hash_int(h, k[i], NA_INTEGER));
+		} else if (type == REALSXP) {
+		    double *k = REAL(x);
+		    for (i = 0; i < n; i++)
+			v[i] = NA_int2real(get_hash_real(h, k[i], NA_INTEGER));
+		} else {
+		    SEXP *k = (SEXP*) DATAPTR(x);
+		    for (i = 0; i < n; i++)
+			v[i] = NA_int2real(get_hash_ptr(h, k[i], NA_INTEGER));
+		}
+	    } else { /* no need to transcode nmv */
+		if (type == INTSXP) {
+		    int *k = INTEGER(x);
+		    for (i = 0; i < n; i++)
+			v[i] = (double) get_hash_int(h, k[i], nmv);
+		} else if (type == REALSXP) {
+		    double *k = REAL(x);
+		    for (i = 0; i < n; i++)
+			v[i] = (double) get_hash_real(h, k[i], nmv);
+		} else {
+		    SEXP *k = (SEXP*) DATAPTR(x);
+		    for (i = 0; i < n; i++)
+			v[i] = (double) get_hash_ptr(h, k[i], nmv);
+		}
+	    }
+	} else
+#endif
+	{
+	    /* short vector - everything is int */
+	    int i, n = LENGTH(x);
+	    int *v = INTEGER(r = allocVector(INTSXP, n));
+	    if (type == INTSXP) {
+		int *k = INTEGER(x);
+		for (i = 0; i < n; i++)
+		    v[i] = get_hash_int(h, k[i], nmv);
+	    } else if (type == REALSXP) {
+		double *k = REAL(x);
+		for (i = 0; i < n; i++)
+		    v[i] = get_hash_real(h, k[i], nmv);
+	    } else {
+		SEXP *k = (SEXP*) DATAPTR(x);
+		for (i = 0; i < n; i++)
+		    v[i] = get_hash_ptr(h, k[i], nmv);
+	    }
+	}
+	if (np) UNPROTECT(np);
+	return r;
+    }
+}
+
+/* FIXME: should we also attach the hash? */
+SEXP coalesce(SEXP x) {
+    SEXPTYPE type = TYPEOF(x);
+    SEXP res;
+    hash_index_t i, n = XLENGTH(x), dst = 0;
+    hash_t *h;
+    hash_index_t *count;
+
+    res = PROTECT(allocVector(INTSXP, XLENGTH(x)));
+
+    h = new_hash(DATAPTR(x), XLENGTH(x));
+    h->type = type;
+    h->parent = x;
+ 
+    if (!(count = calloc(h->m, sizeof(*count)))) {
+	free_hash(h);
+	Rf_error("Unable to allocate memory for counts");
+    }
+
+    /* count the size of each category - we're using negative numbers
+       since we will re-purpose the array later to hold the pointer to the
+       index of the next entry to stroe which will be positive */
+    if (type == INTSXP)
 	for(i = 0; i < n; i++)
-	  add_hash_int(h, i);
-      else if (type == REALSXP)
+	    count[add_hash_int(h, i)]--;
+    else if (type == REALSXP)
 	for(i = 0; i < n; i++)
-	  add_hash_real(h, i);
-      else
+	    count[add_hash_real(h, i)]--;
+    else
 	for(i = 0; i < n; i++)
-	  add_hash_ptr(h, i);
-    }
-  }
-
-  { /* query the hash table */
-    int i, n = LENGTH(x);
-    SEXP r = allocVector(INTSXP, n);
-    int *v = INTEGER(r);
-    if (type == INTSXP) {
-      int *k = INTEGER(x);
-      for (i = 0; i < n; i++)
-	  v[i] = get_hash_int(h, k[i], nmv);
-    } else if (type == REALSXP) {
-      double *k = REAL(x);
-      for (i = 0; i < n; i++)
-	  v[i] = get_hash_real(h, k[i], nmv);
-    } else {
-      SEXP *k = (SEXP*) DATAPTR(x);
-      for (i = 0; i < n; i++)
-	  v[i] = get_hash_ptr(h, k[i], nmv);
-    }
-    if (np) UNPROTECT(np);
-    return r;
-  }
+	    count[add_hash_ptr(h, i)]--;
+
+    if (type == INTSXP)
+	for(i = 0; i < n; i++) {
+	    hash_value_t addr = add_hash_int(h, i);
+	    if (count[addr] < 0) { /* this cat has not been used yet, reserve the index space for it*/
+		hash_index_t ni = -count[addr];
+		count[addr] = dst;
+		dst += ni;
+	    }
+	    INTEGER(res)[count[addr]++] = i + 1;
+	}
+    else if (type == REALSXP)
+	for(i = 0; i < n; i++) {
+	    hash_value_t addr = add_hash_real(h, i);
+	    if (count[addr] < 0) {
+		hash_index_t ni = -count[addr];
+		count[addr] = dst;
+		dst += ni;
+	    }
+	    INTEGER(res)[count[addr]++] = i + 1;
+	}
+    else
+	for(i = 0; i < n; i++) {
+	    hash_value_t addr = add_hash_ptr(h, i);
+	    if (count[addr] < 0) {
+		hash_index_t ni = -count[addr];
+		count[addr] = dst;
+		dst += ni;
+	    }
+	    INTEGER(res)[count[addr]++] = i + 1;
+	}
+    
+    free(count);
+    free_hash(h);
+
+    UNPROTECT(1);
+    return res;
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-fastmatch.git