[med-svn] [rapmap] 01/04: New upstream version 0.4.0+dfsg

Andreas Tille tille at debian.org
Sat Oct 15 13:51:14 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository rapmap.

commit 5fb69ead0e9a7f2d223175d3e3974d65866534d8
Author: Andreas Tille <tille at debian.org>
Date:   Sat Oct 15 15:33:41 2016 +0200

    New upstream version 0.4.0+dfsg
---
 CMakeLists.txt                       |   58 +-
 License.md                           |  676 ++++
 README.md                            |   60 +
 include/BooMap.hpp                   |   60 +-
 include/BooPHF.hpp                   |   26 +-
 include/FastxParser.hpp              |  150 +
 include/FrugalBooMap.hpp             |  323 ++
 include/HitManager.hpp               |   22 +
 include/IndexHeader.hpp              |   21 +
 include/JFRaw.hpp                    |   21 +
 include/PairAlignmentFormatter.hpp   |   21 +
 include/RapMapConfig.hpp             |   27 +-
 include/RapMapFileSystem.hpp         |   21 +
 include/RapMapIndex.hpp              |   21 +
 include/RapMapSAIndex.hpp            |   25 +-
 include/RapMapUtils.hpp              |  144 +-
 include/SACollector.hpp              | 1283 ++++----
 include/SASearcher.hpp               |   21 +
 include/ScopedTimer.hpp              |   30 +-
 include/SingleAlignmentFormatter.hpp |   21 +
 include/SparseHashSerializer.hpp     |   51 +
 include/SpinLock.hpp                 |   21 +
 include/concurrentqueue.h            | 3621 ++++++++++++++++++++++
 include/sparsepp.h                   | 5622 ++++++++++++++++++++++++++++++++++
 scripts/RunRapMap.sh                 |   29 +
 scripts/add-header.sh                |   33 +
 scripts/compile.sh                   |    2 +-
 src/CMakeLists.txt                   |   50 +-
 src/FastxParser.cpp                  |  306 ++
 src/HitManager.cpp                   |   55 +-
 src/RapMap.cpp                       |   21 +
 src/RapMapFileSystem.cpp             |   21 +
 src/RapMapIndex.cpp                  |   21 +
 src/RapMapIndexer.cpp                |   58 +-
 src/RapMapMapper.cpp                 |  105 +-
 src/RapMapSAIndex.cpp                |  112 +-
 src/RapMapSAIndexer.cpp              |  177 +-
 src/RapMapSAMapper.cpp               |  371 ++-
 src/RapMapUtils.cpp                  |  176 +-
 39 files changed, 12822 insertions(+), 1061 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17487b8..266eae8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,14 +4,15 @@ enable_testing()
 
 project (RapMap)
 
-set(CPACK_PACKAGE_VERSION "0.3.0")
+set(CPACK_PACKAGE_VERSION "0.4.0")
 SET(CPACK_PACKAGE_VERSION_MAJOR "0")
-set(CPACK_PACKAGE_VERSION_MINOR "3")
+set(CPACK_PACKAGE_VERSION_MINOR "4")
 set(CPACK_PACKAGE_VERSION_PATCH "0")
+set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
 set(CPACK_PACKAGE_VENDOR "Stony Brook University")
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast quasi/pseudo/lightweight alignment")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast quasi-mapping")
 set(CPACK_PACKAGE_NAME
   "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
 set(CPACK_SOURCE_PACKAGE_FILE_NAME
@@ -25,8 +26,12 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 #    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -DEMPHF_USE_POPCOUNT")
 #endif(SSE4_2_FOUND)
 
-set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register -Wno-c++11-narrowing -Wno-unknown-pragmas")
-set (BOOST_CXX_FLAGS "-Wno-deprecated-register -std=c++11")
+if (APPLE)
+set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register -Wno-unknown-pragmas -Wreturn-type -Werror=return-type")
+else()
+set (WARNING_IGNORE_FLAGS "-Wno-unknown-pragmas -Wreturn-type -Werror=return-type")
+endif()
+
 ## Prefer static to dynamic libraries
 SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
 
@@ -37,7 +42,14 @@ else()
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
 endif()
 
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM -Wall -std=c++11 -Wreturn-type -Werror=return-type")
+
+if (QUIET_BUILD)
+  set(WALL "")
+else()
+  set(WALL "-Wall")
+endif()
+
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM ${WALL} -std=c++11")
 
 ##
 # OSX is strange (some might say, stupid in this regard).  Deal with its quirkiness here.
@@ -69,6 +81,7 @@ set (PTHREAD_LIB)
 ##
 # First take care of what to do if we have gcc
 ##
+set (JELLYFISH_CXX_FLAGS "-fPIC") 
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
     execute_process(
         COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
@@ -97,10 +110,6 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
     endif()
 
     set (WARNING_IGNORE_FLAGS "${WARNING_IGNORE_FLAGS} -Wno-unused-local-typedefs")
-    set (BOOST_TOOLSET "gcc")
-    set (BOOST_CONFIGURE_TOOLSET "--with-toolset=gcc")
-	set (BCXX_FLAGS "-std=c++11")
-    set (BOOST_EXTRA_FLAGS toolset=gcc cxxflags=${BCXX_FLAGS})
 # Tentatively, we support clang now
 elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     set(CLANG TRUE)
@@ -110,11 +119,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     if (HAVE_LIBCPP)
         message ("It appears that you're compiling with clang and that libc++ is available, so I'll use that")
         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
-	    set (BOOST_TOOLSET "clang")
-        set (BOOST_CONFIGURE_TOOLSET "--with-toolset=clang")
-	    set (BCXX_FLAGS "-stdlib=libc++ -DBOOST_HAS_INT128")
-	    set (BOOST_EXTRA_FLAGS toolset=clang cxxflags=${BCXX_FLAGS} linkflags="-stdlib=libc++")
-        set (JELLYFISH_CXX_FLAGS "-stdlib=libc++")
+	set (JELLYFISH_CXX_FLAGS "${JELLYFISH_CXX_FLAGS} -stdlib=libc++")
     # Otherwise, use libstdc++ (and make it static)
     else()
         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
@@ -174,20 +179,6 @@ ExternalProject_Add(libdivsufsort
 )
 set(SUFFARRAY_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
 
-message("Build system will fetch and build SparseHash")
-message("==================================================================")
-ExternalProject_Add(libsparsehash
-    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/sparsehash/archive/sparsehash-2.0.2.tar.gz -o sparsehash-2.0.2.tar.gz &&
-        tar -xzf sparsehash-2.0.2.tar.gz
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/sparsehash-sparsehash-2.0.2
-    BUILD_IN_SOURCE TRUE
-    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-    CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=<INSTALL_DIR>"
-    INSTALL_COMMAND make install
-)
-
-
 if (NOT CEREAL_ROOT)
 	set(CEREAL_ROOT ${GAT_SOURCE_DIR}/external/install)
 endif()
@@ -232,7 +223,7 @@ ExternalProject_Add(libjellyfish
      	tar -xzvf jellyfish-2.2.5.tgz
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.5
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-    CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.5/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
+    CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.5/configure --enable-shared=no --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
     BUILD_COMMAND ${MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
     BUILD_IN_SOURCE 1
     INSTALL_COMMAND make install
@@ -295,3 +286,10 @@ add_subdirectory ( src )
 
 # build a CPack driven installer package
 include (CPack)
+
+set(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION})
+add_custom_target(dist
+        COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
+        | gzip > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.gz
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
+
diff --git a/License.md b/License.md
new file mode 100644
index 0000000..5fdf13c
--- /dev/null
+++ b/License.md
@@ -0,0 +1,676 @@
+GNU GENERAL PUBLIC LICENSE
+==========================
+Version 3, 29 June 2007
+==========================
+
+> Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>  
+  Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+
+# Preamble
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+# TERMS AND CONDITIONS
+
+## 0. Definitions.
+
+  _"This License"_ refers to version 3 of the GNU General Public License.
+
+  _"Copyright"_ also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  _"The Program"_ refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as _"you"_.  _"Licensees"_ and
+"recipients" may be individuals or organizations.
+
+  To _"modify"_ a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a _"modified version"_ of the
+earlier work or a work _"based on"_ the earlier work.
+
+  A _"covered work"_ means either the unmodified Program or a work based
+on the Program.
+
+  To _"propagate"_ a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To _"convey"_ a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+## 1. Source Code.
+
+  The _"source code"_ for a work means the preferred form of the work
+for making modifications to it. _"Object code"_ means any non-source
+form of a work.
+
+  A _"Standard Interface"_ means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The _"System Libraries"_ of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The _"Corresponding Source"_ for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+## 2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+## 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+## 4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+## 5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+## 6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A _"User Product"_ is either (1) a _"consumer product"_, which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  _"Installation Information"_ for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+## 7. Additional Terms.
+
+  _"Additional permissions"_ are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+## 8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+## 9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+## 10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An _"entity transaction"_ is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+## 11. Patents.
+
+  A _"contributor"_ is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's _"essential patent claims"_ are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+## 12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+## 13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+## 14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+## 15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+## 16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+## 17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+# END OF TERMS AND CONDITIONS
+--------------------------------------------------------------------------
+
+
+# How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type 'show c' for details.
+
+  The hypothetical commands _'show w'_ and _'show c'_ should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/README.md b/README.md
index 473d5f8..334bab4 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Join the chat at https://gitter.im/COMBINE-lab/RapMap](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/COMBINE-lab/RapMap?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
 # What is RapMap?
 
 RapMap is a testing ground for ideas in quasi-mapping / (lightweight / pseudo) transcriptome alignment.  That means that, at this point, it is somewhat experimental.  The `develop` branch will have the latest improvements and additions, but is not guaranteed to be stable between commits.  Breaking changes to the master branch will be accompanied by a tag to the version before the breaking change.  Currently, RapMap is a stand-alone quasi-mapper that can be used with other tools.  It is a [...]
@@ -23,6 +25,42 @@ To build RapMap, you need a C++11 compliant compiler (g++ >= 4.7 and clang >= 3.
 ```
 This should output the standard help message for rapmap.
 
+# Using RapMap
+
+To use RapMap to map reads, you first have to index your reference transcriptome.  Once the index is created, it can be used to map many different sets of reads.  Assuming that your reference transcriptome is in the file `ref.fa`, you can produce the index as follows:
+
+```
+> rapmap quasiindex -t ref.fa -i ref_index
+```
+
+If you want to make use of a minimum perfect hash when indexing (which will lower the memory requirement during mapping), you can instead use the following command:
+
+```
+> rapmap quasiindex -t ref.fa -i ref_index -p -x 4
+```
+
+The `-p` option enables the minimum perfect hash and `-x 4` tells RapMap to use up to 4 threads when building the perfect hash (you can specify as many or as few threads as you wish).
+
+The index itself will record whether it was built with the aid of minimum perfect hashing or not, so no extra information concerning this need be provided when mapping.  For the purposes of this example, we'll assume that we wish to map paired-end reads with the first mates in the file `r1.fq.gz` and the second mates in the file `r2.fq.gz`.  We can perform the mapping like so:
+
+```
+> rapmap quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 -o mapped_reads.sam
+```
+
+This will tell RapMap to map the paired-end reads using 8 threads, and to write the resulting `SAM` records to the file `mapped_reads.sam`.  The SAM format is rather verbose, and so such output files can be rather large (and slow to write) if you're mapping many reads.  For that reason, we recommend that you use [samtools](http://www.htslib.org/) to convert the `SAM` file to a `BAM` file on-the-fly.  Assuming `samtools` is installed and in your path, that can be accomplished with the foll [...]
+
+```
+> rapmap quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 -o | samtools view -Sb -@ 4 - > mapped_reads.bam
+```
+
+This will stream the output from RapMap to standard out, and then convert it into a `BAM` file (using up to an additional 4 threads for `BAM` compression) and write the resulting output to the file `mapped_reads.bam`.  To reduce the amount that needs to be typed in the common case, and to prevent the user from having to remember invocations like the above, we include a simple wrapper script that simplifies this process.  After installing RapMap, there should be a script called `RunRapMap. [...]
+
+```
+> RunRapMap.sh quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 --bamOut mapped_reads.sam --bamThreads 4
+```
+
+This will run RapMap with a command equivalent to the one mentioned above.  If you leave out the `--bamThreads` argument, then a single thread will be used for compression.  The `RunRapMap.sh` script can be used even if you don't wish to write the output to `BAM` format; in that case it is simply equivalent to running whichever command you pass with the `rapmap` executable itself.
+
 # Can I use RapMap for genomic alignment?
 
 No, at least not right now.  The index and mapping strategy employed by RapMap are highly geared toward mapping to transcriptomes.  It may be the case that some of these ideas can be successfully applied to genomic alignment, but 
@@ -51,3 +89,25 @@ RapMap is experimental, and the code, at this point, is subject to me testing ou
 # License 
 
 Since RapMap uses Jellyfish, it must be released under the GPL.  However, this is currently the only GPL dependency.  If it can be replaced, I'd like to re-license RapMap under the BSD license.  I'd be happy to accept pull-requests that replace the Jellyfish components with a library released under a more liberal license (BSD-compatible), but note that I will *not* accept such pull requests if they reduce the speed or increase the memory consumption over the Jellyfish-based version.
+
+# Citation
+
+If you use RapMap, or wish to cite the quasi-mapping concept or our algorithm for computing quasi-mappings, please 
+use this bibtex entry. 
+
+```
+@article{Srivastava15062016,
+author = {Srivastava, Avi and Sarkar, Hirak and Gupta, Nitish and Patro, Rob}, 
+title = {RapMap: a rapid, sensitive and accurate tool for mapping RNA-seq reads to transcriptomes},
+volume = {32}, 
+number = {12}, 
+pages = {i192-i200}, 
+year = {2016}, 
+doi = {10.1093/bioinformatics/btw277},
+URL = {http://bioinformatics.oxfordjournals.org/content/32/12/i192.abstract}, 
+eprint = {http://bioinformatics.oxfordjournals.org/content/32/12/i192.full.pdf+html}, 
+journal = {Bioinformatics} 
+}
+```
+
+Other citation formats for the RapMap paper are available [here](http://bioinformatics.oxfordjournals.org/citmgr?gca=bioinfo%3B32%2F12%2Fi192).
diff --git a/include/BooMap.hpp b/include/BooMap.hpp
index e2056c6..8a951d9 100644
--- a/include/BooMap.hpp
+++ b/include/BooMap.hpp
@@ -52,6 +52,15 @@ public:
         data_.emplace_back(k, v);
     }
 
+    bool validate_hash(){
+        for( auto& e : data_ ) {
+            if (e.first != data_[boophf_->lookup(e.first)].first) {
+                std::cerr << "lookup of " << e.first << " failed!\n";
+            }
+        }
+        return true;
+    }
+
     bool build(int nthreads=1) {
         size_t numElem = data_.size();
         KeyIterator<decltype(data_.begin())> kb(data_.begin());
@@ -60,11 +69,8 @@ public:
         BooPHFT* ph = new BooPHFT(numElem, keyIt, nthreads);
         boophf_.reset(ph);
         std::cerr << "reordering keys and values to coincide with phf ... ";
-        std::vector<size_t> inds; inds.reserve(data_.size());
-        for (size_t i = 0; i < data_.size(); ++i) {
-            inds.push_back(ph->lookup(data_[i].first));
-        }
-        reorder_destructive_(inds.begin(), inds.end(), data_.begin());
+        reorder_fn_();
+        //validate_hash();
         std::cerr << "done\n";
         built_ = true;
         return built_;
@@ -164,6 +170,50 @@ private:
         return true;
     }
 
+  
+    void reorder_fn_()  {
+	/* Adapted from code at: http://blog.merovius.de/2014/08/12/applying-permutation-in-constant.html */
+        // Note, we can actually do this without the bitvector by using the high-order bit
+        // of the start of the suffix array intervals (since they are signed integers and negative
+        // positions are forbidden). 
+      std::vector<bool> bits(data_.size(), false);
+	for ( size_t i = 0; i < data_.size(); ++i ) {
+	  if (!bits[i]) {
+	    decltype(data_.front()) v = data_[i];
+	    auto j = boophf_->lookup(data_[i].first);
+	    while (i != j) {
+            auto pj = boophf_->lookup(data_[j].first);
+	      std::swap(data_[j], v);
+	      bits[j] = 1;
+	      j = pj; 
+	    }
+	    data_[i] = v;
+	  }
+	}
+
+	/* http://blog.merovius.de/2014/08/12/applying-permutation-in-constant.html
+	    for i := 0; i < len(vals); i++ {
+        if perm[i] < 0 {
+            // already correct - unmark and go on
+            // (note that ^a is the bitwise negation
+            perm[i] = ^perm[i]
+            continue
+        }
+
+        v, j := vals[i], perm[i]
+        for j != i {
+            vals[j], v = v, vals[j]
+            // When we find this element in the future, we must not swap it any
+            // further, so we mark it here
+            perm[j], j = ^perm[j], perm[j]
+        }
+        vals[i] = v
+    }
+}
+	*/
+    }
+
+
     // From : http://stackoverflow.com/questions/838384/reorder-vector-using-a-vector-of-indices
     template< typename order_iterator, typename value_iterator >
     void reorder_destructive_( order_iterator order_begin, order_iterator order_end, value_iterator v )  {
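
The `reorder_fn_` added above rearranges the key/value pairs in place so that each entry lands at the slot returned by the BooPHF lookup, using the cycle-walking scheme from the cited blog post. As a minimal, standalone sketch of that same technique, here is a version that assumes the permutation is supplied as an explicit destination-index vector instead of being computed through the MPHF; the names `apply_permutation_in_place` and `dest` are illustrative and not part of RapMap:

```
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Move the element currently at position i to position dest[i], for every i,
// keeping only one bit of scratch state per element (the role the "bits"
// vector plays in reorder_fn_, where the BooPHF lookup supplies dest).
template <typename T>
void apply_permutation_in_place(std::vector<T>& vals,
                                const std::vector<std::size_t>& dest) {
  std::vector<bool> placed(vals.size(), false);
  for (std::size_t i = 0; i < vals.size(); ++i) {
    if (placed[i]) { continue; }  // already filled by an earlier cycle
    T v = vals[i];                // the element we still have to place
    std::size_t j = dest[i];      // where it belongs
    while (j != i) {              // walk the cycle that starts at i
      std::swap(vals[j], v);      // drop v at j, pick up the element that was there
      placed[j] = true;           // position j now holds its final element
      j = dest[j];                // destination of the element just picked up
    }
    vals[i] = v;                  // close the cycle
  }
}

int main() {
  std::vector<char> vals{'a', 'b', 'c'};
  std::vector<std::size_t> dest{2, 0, 1};  // 'a' -> slot 2, 'b' -> slot 0, 'c' -> slot 1
  apply_permutation_in_place(vals, dest);
  for (char c : vals) { std::cout << c; }  // prints "bca"
  std::cout << '\n';
  return 0;
}
```

Each element is written to its final slot exactly once, so the reordering takes linear time with only n bits of extra space, which is what makes it attractive right after building the perfect hash over a large key set.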
diff --git a/include/BooPHF.hpp b/include/BooPHF.hpp
index 64b11c7..098adcf 100644
--- a/include/BooPHF.hpp
+++ b/include/BooPHF.hpp
@@ -533,11 +533,11 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 		//for debug purposes
 		void print() const
 		{
-			printf("bit array of size %llu: \n", _size);
+			printf("bit array of size %lu: \n",_size);
 			for(uint64_t ii = 0; ii< _size; ii++)
 			{
 				if(ii%10==0)
-					printf(" (%llu) ",ii);
+					printf(" (%lu) ",ii);
 				int val = (_bitArray[ii >> 6] >> (ii & 63 ) ) & 1;
 				printf("%i",val);
 			}
@@ -546,7 +546,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 			printf("rank array : size %lu \n",_ranks.size());
 			for (uint64_t ii = 0; ii< _ranks.size(); ii++)
 			{
-				printf("%llu:  %llu,  ",ii,_ranks[ii]);
+				printf("%lu :  %lu,  ",ii,_ranks[ii]);
 			}
 			printf("\n");
 		}
@@ -736,6 +736,8 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 		{
 			if(n ==0) return;
 			
+			_fastmode = false;
+			
 			if(_percent_elem_loaded_for_fastMode > 0.0 )
 				_fastmode =true;
 
@@ -748,7 +750,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 			if(_fastmode)
 				_progressBar.init( _nelem * (_fastModeLevel+1) +  ( _nelem * pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1))    ,"Building BooPHF",num_thread);
 			else
-				_progressBar.init( _nelem * _nb_levels ,"Building BooPHF");
+				_progressBar.init( _nelem * _nb_levels ,"Building BooPHF",num_thread);
 			}
 
 			uint64_t offset = 0;
@@ -851,7 +853,6 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 			auto until = *until_p;
 			uint64_t inbuff =0;
 
-
 			
 			for (bool isRunning=true;  isRunning ; )
 			{
@@ -879,8 +880,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 					{
 							__sync_fetch_and_add(& _cptLevel,1);
 
-						if(i == _fastModeLevel && _fastmode)
+						if(_fastmode && i == _fastModeLevel)
 						{
+
 							uint64_t idxl2 = __sync_fetch_and_add(& _idxLevelsetLevelFastmode,1);
 							//if the expected size of setLevelFastmode is exceeded, fall back to slow mode; this should not happen if the hash is OK and probability is on our side
 							if(idxl2>= setLevelFastmode.size())
@@ -892,6 +894,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 						//insert to level i+1 : either next level of the cascade or final hash if last level reached
 						if(i == _nb_levels-1) //stop cascade here, insert into exact hash
 						{
+
 							uint64_t hashidx =  __sync_fetch_and_add (& _hashidx, 1);
 
 							pthread_mutex_lock(&_mutex); //see later if possible to avoid this, but not many items end up here
@@ -901,6 +904,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 						}
 						else
 						{
+
 							//computes next hash
 
 							if ( level == 0)
@@ -1047,12 +1051,10 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 				 if(pow(_proba_collision,ii) < _percent_elem_loaded_for_fastMode)
 				 {
 				 	_fastModeLevel = ii;
-				 	// printf("fast mode level :  %i \n",ii);
+				 	 //printf("fast mode level :  %i \n",ii);
 				 	break;
 				 }
 			}
-
-
 		}
 
 
@@ -1123,7 +1125,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 			t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.end()));
 
 			t_arg.level = i;
-			if(i >= (_fastModeLevel+1) && _fastmode)
+			if(_fastmode && i >= (_fastModeLevel+1))
 			{
 				auto data_iterator = boomphf::range(static_cast<const elem_t*>( &setLevelFastmode[0]), static_cast<const elem_t*>( (&setLevelFastmode[0]) +setLevelFastmode.size()));
                 typedef decltype(data_iterator.begin()) fastmode_it_type;
@@ -1146,9 +1148,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
 			{
 				pthread_join(tab_threads[ii], NULL);
 			}
-		//	printf("\ngoing to level %i  : %llu elems  %.2f %%  expected : %.2f %% \n",i,_cptLevel,100.0* _cptLevel/(float)_nelem,100.0* pow(_proba_collision,i) );
+			//printf("\ngoing to level %i  : %llu elems  %.2f %%  expected : %.2f %% \n",i,_cptLevel,100.0* _cptLevel/(float)_nelem,100.0* pow(_proba_collision,i) );
 
-			if(i == _fastModeLevel) //shrink to actual number of elements in set
+			if(_fastmode && i == _fastModeLevel) //shrink to actual number of elements in set
 			{
 				//printf("resize setLevelFastmode to %lli \n",_idxLevelsetLevelFastmode);
 				setLevelFastmode.resize(_idxLevelsetLevelFastmode);
diff --git a/include/FastxParser.hpp b/include/FastxParser.hpp
new file mode 100644
index 0000000..6dfa3ca
--- /dev/null
+++ b/include/FastxParser.hpp
@@ -0,0 +1,150 @@
+#ifndef __FASTX_PARSER__
+#define __FASTX_PARSER__
+
+#include "fcntl.h"
+#include "unistd.h"
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+extern "C" {
+#include "kseq.h"
+}
+
+#include "concurrentqueue.h"
+
+#ifndef __FASTX_PARSER_PRECXX14_MAKE_UNIQUE__
+#define __FASTX_PARSER_PRECXX14_MAKE_UNIQUE__
+
+#if __cplusplus >= 201402L
+#include <memory>
+using std::make_unique;
+#else
+
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+template <class T> struct _Unique_if {
+  using _Single_object = std::unique_ptr<T>;
+};
+
+template <class T> struct _Unique_if<T[]> {
+  using _Unknown_bound = std::unique_ptr<T[]>;
+};
+
+template <class T, size_t N> struct _Unique_if<T[N]> {
+  using _Known_bound = void;
+};
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <class T>
+typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
+  using U = typename std::remove_extent<T>::type;
+  return std::unique_ptr<T>(new U[n]());
+}
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound make_unique(Args&&...) = delete;
+
+#endif // C++11
+#endif //__FASTX_PARSER_PRECXX14_MAKE_UNIQUE__
+
+namespace fastx_parser {
+struct ReadSeq {
+    std::string seq;
+    std::string name;
+    ~ReadSeq() {}
+};
+
+struct ReadPair {
+  ReadSeq first;
+  ReadSeq second;
+};
+
+template <typename T> class ReadChunk {
+public:
+  ReadChunk(size_t want) : group_(want), want_(want), have_(want) {}
+  inline void have(size_t num) { have_ = num; }
+  inline size_t size() { return have_; }
+  inline size_t want() const { return want_; }
+  T& operator[](size_t i) { return group_[i]; }
+  typename std::vector<T>::iterator begin() { return group_.begin(); }
+  typename std::vector<T>::iterator end() { return group_.begin() + have_; }
+
+private:
+  std::vector<T> group_;
+  size_t want_;
+  size_t have_;
+};
+
+template <typename T> class ReadGroup {
+public:
+  ReadGroup(moodycamel::ProducerToken&& pt, moodycamel::ConsumerToken&& ct)
+      : pt_(std::move(pt)), ct_(std::move(ct)) {}
+  moodycamel::ConsumerToken& consumerToken() { return ct_; }
+  moodycamel::ProducerToken& producerToken() { return pt_; }
+  // get a reference to the chunk this ReadGroup owns
+  std::unique_ptr<ReadChunk<T>>& chunkPtr() { return chunk_; }
+  // get a *moveable* reference to the chunk this ReadGroup owns
+  std::unique_ptr<ReadChunk<T>>&& takeChunkPtr() { return std::move(chunk_); }
+  inline void have(size_t num) { chunk_->have(num); }
+  inline size_t size() { return chunk_->size(); }
+  inline size_t want() const { return chunk_->want(); }
+  T& operator[](size_t i) { return (*chunk_)[i]; }
+  typename std::vector<T>::iterator begin() { return chunk_->begin(); }
+  typename std::vector<T>::iterator end() {
+    return chunk_->begin() + chunk_->size();
+  }
+  void setChunkEmpty() { chunk_.release(); }
+  bool empty() const { return chunk_.get() == nullptr; }
+
+private:
+  std::unique_ptr<ReadChunk<T>> chunk_{nullptr};
+  moodycamel::ProducerToken pt_;
+  moodycamel::ConsumerToken ct_;
+};
+
+template <typename T> class FastxParser {
+public:
+  FastxParser(std::vector<std::string> files, uint32_t numConsumers,
+              uint32_t numParsers = 1, uint32_t chunkSize = 1000);
+
+  FastxParser(std::vector<std::string> files, std::vector<std::string> files2,
+              uint32_t numConsumers, uint32_t numParsers = 1,
+              uint32_t chunkSize = 1000);
+  ~FastxParser();
+  bool start();
+  ReadGroup<T> getReadGroup();
+  bool refill(ReadGroup<T>& rg);
+  void finishedWithGroup(ReadGroup<T>& s);
+
+private:
+  moodycamel::ProducerToken getProducerToken_();
+  moodycamel::ConsumerToken getConsumerToken_();
+
+  std::vector<std::string> inputStreams_;
+  std::vector<std::string> inputStreams2_;
+  uint32_t numParsers_;
+  std::atomic<uint32_t> numParsing_;
+  std::vector<std::unique_ptr<std::thread>> parsingThreads_;
+  size_t blockSize_;
+  moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>> readQueue_,
+      seqContainerQueue_;
+
+  // holds the indices of files (file-pairs) to be processed
+  moodycamel::ConcurrentQueue<uint32_t> workQueue_;
+
+  std::vector<std::unique_ptr<moodycamel::ProducerToken>> produceReads_;
+  std::vector<std::unique_ptr<moodycamel::ConsumerToken>> consumeContainers_;
+};
+}
+#endif // __FASTX_PARSER__
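The FastxParser header above only declares the consumer-facing interface (start(), getReadGroup(), refill(), finishedWithGroup()). The following minimal sketch shows how a single consumer might drive it; the input file name and thread counts are placeholders, and the assumption that refill() returns false once parsing is finished is inferred from the declarations rather than stated anywhere in this patch. The header also exposes finishedWithGroup() for handing a group back to the parser; when exactly it must be called is not specified here, so the sketch omits it.

    #include "FastxParser.hpp"

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> readFiles{"reads_1.fa"}; // placeholder input file
      fastx_parser::FastxParser<fastx_parser::ReadSeq> parser(
          readFiles, /*numConsumers=*/1, /*numParsers=*/1, /*chunkSize=*/1000);
      parser.start();

      // Each consumer owns a ReadGroup and repeatedly asks the parser to refill it.
      auto rg = parser.getReadGroup();
      std::size_t totalBases{0};
      while (parser.refill(rg)) {   // assumed to return false at end of input
        for (auto& rec : rg) {      // iterate the parsed records in the current chunk
          totalBases += rec.seq.size();
        }
      }
      std::cerr << "parsed " << totalBases << " bases\n";
      return 0;
    }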
diff --git a/include/FrugalBooMap.hpp b/include/FrugalBooMap.hpp
new file mode 100644
index 0000000..f2209dc
--- /dev/null
+++ b/include/FrugalBooMap.hpp
@@ -0,0 +1,323 @@
+#ifndef __BOO_MAP_FRUGAL__
+#define __BOO_MAP_FRUGAL__
+
+#include "BooPHF.hpp"
+#include "RapMapUtils.hpp"
+
+#include "cereal/types/vector.hpp"
+#include "cereal/types/utility.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include <fstream>
+#include <vector>
+#include <iterator>
+#include <type_traits>
+
+#include <sys/stat.h>
+
+
+// adapted from :
+// http://stackoverflow.com/questions/34875315/implementation-my-own-list-and-iterator-stl-c
+template <typename Iter, typename IndexT, typename HashMapT>
+class KeyProxyIterator {
+public:
+    typedef KeyProxyIterator<Iter, IndexT, HashMapT> self_type;
+    typedef uint64_t value_type;//std::iterator_traits<Iter>::value_type::first_type value_type;
+    typedef value_type& reference;
+    typedef value_type* pointer;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef int64_t difference_type;
+
+    KeyProxyIterator(Iter first, HashMapT* hm) : 
+        curr_(first), hm_(hm) {}
+    KeyProxyIterator operator++() { ++curr_; return *this; }                              // pre-increment
+    KeyProxyIterator operator++(int) { KeyProxyIterator i = *this; ++curr_; return i; }   // post-increment
+    reference operator*() { 
+        intRep_ = hm_->getKmerFromPos_(*curr_, mer_);
+        return intRep_; 
+    }
+    /*
+    pointer operator->() { 
+        return &(curr_->first); 
+    }
+    */
+    bool operator==(const self_type& rhs) { return curr_ == rhs.curr_; }
+    bool operator!=(const self_type& rhs) { return curr_ != rhs.curr_; }
+    bool operator<(const self_type& rhs) { return curr_ < rhs.curr_; }
+    bool operator<=(const self_type& rhs) { return curr_ <= rhs.curr_; }
+    
+private:
+    rapmap::utils::my_mer mer_;
+    uint64_t intRep_;
+    Iter curr_;
+    HashMapT* hm_{nullptr}; 
+};
+
+
+template <typename IterT, typename ProxyValueT>
+class KVProxy {
+public:
+    typedef KVProxy<IterT, ProxyValueT> self_type;
+    typedef typename std::iterator_traits<IterT>::value_type ValueT;
+    typedef std::pair<uint64_t, ValueT> value_type;
+    typedef std::pair<uint64_t, ProxyValueT>& reference;
+    typedef std::pair<uint64_t, ProxyValueT>* pointer;
+
+    KVProxy(uint64_t mer, IterT it, ValueT len, bool isEnd = false) : curr_(it) {
+        if(!isEnd) {ProxyValueT x{*it, len}; pair_ = std::make_pair(mer, x);}
+    }
+    reference operator*() { return pair_; }
+    pointer operator->() { return &pair_; }
+    bool operator==(const self_type& rhs) { return curr_ == rhs.curr_; }
+    bool operator!=(const self_type& rhs) { return curr_ != rhs.curr_; }
+    bool operator<(const self_type& rhs) { return curr_ < rhs.curr_; }
+    bool operator<=(const self_type& rhs) { return curr_ <= rhs.curr_; }
+private:
+    IterT curr_;
+    std::pair<uint64_t, ProxyValueT> pair_;
+};
+
+
+// Unlike the standard "generic" BooMap, the frugal variant
+// *does not* store the key.  Rather, it assumes we have a
+// pointer to the suffix array, and it "spot checks" the index 
+// returned by the perfect hash by ensuring that the suffix at
+// the corresponding offset starts with the query k-mer.
+template <typename KeyT, typename ValueT>
+class FrugalBooMap {
+public:
+    using self_type = FrugalBooMap<KeyT, ValueT>;
+    using HasherT = boomphf::SingleHashFunctor<KeyT>;
+    using BooPHFT = boomphf::mphf<KeyT, HasherT>;
+    typedef typename ValueT::index_type IndexT;
+    using IteratorT = KVProxy<typename std::vector<IndexT>::iterator, ValueT>;
+
+    //using IteratorT = typename std::vector<std::pair<KeyT, ValueT>>::iterator;
+
+    FrugalBooMap() : built_(false) {}
+    void setSAPtr(std::vector<IndexT>* saPtr) { saPtr_ = saPtr; }
+    void setTextPtr(const char* txtPtr, size_t textLen) { txtPtr_ = txtPtr; textLen_ = textLen; }
+
+    void add(KeyT&& k, ValueT&& v) {
+        // In the frugal map, we don't even keep the key!
+        data_.emplace_back(v.begin());
+        IndexT l = v.end() - v.begin();
+        if (l >= std::numeric_limits<uint8_t>::max()) {
+            overflow_[v.begin()] = l;
+            lens_.emplace_back(std::numeric_limits<uint8_t>::max());
+        } else {
+            lens_.emplace_back(static_cast<uint8_t>(l));
+        }
+    }
+
+
+    bool validate_hash(){
+        for( auto& e : data_ ) {
+            rapmap::utils::my_mer kmer(txtPtr_ + (*saPtr_)[e]);
+            auto ind = boophf_->lookup(kmer.word(0));
+            if (ind >= data_.size()) { 
+                rapmap::utils::my_mer km(txtPtr_ + (*saPtr_)[e]);
+                std::cerr << "index for " << km << " was " << ind << ", outside bounds of data_ (" << data_.size() << ")\n";
+                return false;
+            }
+            auto mer = getKmerFromInterval_(e);
+            if (mer != getKmerFromInterval_(data_[ind]) ) {
+                std::cerr << "lookup of " << mer << " failed!\n";
+            }
+        }
+        return true;
+    }
+
+
+    bool build(int nthreads=1) {
+        size_t numElem = data_.size();
+        KeyProxyIterator<decltype(data_.begin()), IndexT, self_type> kb(data_.begin(), this);
+        KeyProxyIterator<decltype(data_.begin()), IndexT, self_type> ke(data_.end(), this);
+        auto keyIt = boomphf::range(kb, ke);
+        BooPHFT* ph = new BooPHFT(numElem, keyIt, nthreads);
+        boophf_.reset(ph);
+        std::cerr << "reordering keys and values to coincide with phf ... ";
+        reorder_fn_();
+        //std::cerr << "validating hash\n";
+        //validate_hash();
+        std::cerr << "done\n";
+        std::cerr << "size of overflow table is " << overflow_.size() << '\n';
+        built_ = true;
+        return built_;
+    }
+
+    inline IteratorT find(const KeyT& k) {
+        auto intervalIndex = boophf_->lookup(k);
+        if (intervalIndex >= data_.size()) return end();
+        auto ind = data_[intervalIndex];
+        auto textInd = (*saPtr_)[ind];
+        rapmap::utils::my_mer m(txtPtr_ + textInd);
+
+        // If what we find matches the key, return the iterator
+        // otherwise we don't have the key (it must have been here if it
+        // existed).
+        if (m.word(0) == k) {
+            IndexT l = *(lens_.begin() + intervalIndex);
+            if (l == std::numeric_limits<uint8_t>::max()) {
+                l = overflow_[ind];
+            }
+            return IteratorT(m.word(0), data_.begin() + intervalIndex, ind + l);
+        }
+        return end();
+    }
+    
+    /**
+     * NOTE: This function *assumes* that the key is in the hash.
+     * If it isn't, you'll get back a random element!
+     */
+    /*
+    inline ValueT& operator[](const KeyT& k) {
+        auto ind = boophf_->lookup(k);
+        return (ind < data_.size() ? data_[ind].second : data_[0].second);
+    }
+    */
+    
+    inline IteratorT begin() { return IteratorT(0, data_.begin(), lens_.front()); }
+    inline IteratorT end() { return IteratorT(0, data_.end(), 0, true); }
+    inline IteratorT cend() const { return IteratorT(0, data_.cend(), 0, true); }
+    inline IteratorT cbegin() const { return IteratorT(0, data_.cbegin(), lens_.front()); }
+    
+    void save(const std::string& ofileBase) {
+        if (built_) {
+            std::string hashFN = ofileBase + ".bph";
+            // save the perfect hash function
+            {
+                std::ofstream os(hashFN, std::ios::binary);
+                if (!os.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << hashFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                boophf_->save(os);
+                os.close();
+            }
+            // and the values
+            std::string dataFN = ofileBase + ".val";
+            {
+                std::ofstream valStream(dataFN, std::ios::binary);
+                if (!valStream.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << dataFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                {
+                    cereal::BinaryOutputArchive outArchive(valStream);
+                    outArchive(data_);
+                    outArchive(lens_);
+                    overflow_.serialize(typename spp_utils::pod_hash_serializer<IndexT, IndexT>(), &valStream);
+                }
+                valStream.close();
+            }
+        }
+    }
+    
+    void load(const std::string& ofileBase) {
+        std::string hashFN = ofileBase + ".bph";
+        std::string dataFN = ofileBase + ".val";
+
+        if ( !FileExists_(hashFN.c_str()) ) {
+            std::cerr << "BooM: Looking for perfect hash function file [" << hashFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+        if ( !FileExists_(dataFN.c_str()) ) {
+            std::cerr << "BooM: Looking for key-value file [" << dataFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+
+        // load the perfect hash function
+        {
+            boophf_.reset(new BooPHFT);
+            std::ifstream is(hashFN, std::ios::binary);
+            boophf_->load(is);
+            is.close();
+        }
+        // and the values
+        {
+            std::ifstream dataStream(dataFN, std::ios::binary);
+            {
+                cereal::BinaryInputArchive inArchive(dataStream);
+                inArchive(data_);
+                inArchive(lens_);
+                overflow_.unserialize(typename spp_utils::pod_hash_serializer<IndexT, IndexT>(), &dataStream);
+            }
+            dataStream.close();
+        }
+
+        built_ = true;
+    }
+
+    inline KeyT getKmerFromInterval_(ValueT& ival) {
+        rapmap::utils::my_mer m; // local k-mer object (the k-mer length is set globally)
+        m.from_chars(txtPtr_ + (*saPtr_)[ival.begin()]);
+        return m.word(0);
+    }
+
+    // variant where we provide an existing mer object
+    inline KeyT getKmerFromInterval_(ValueT& ival, rapmap::utils::my_mer& m) {
+        m.from_chars(txtPtr_ + (*saPtr_)[ival.begin()]);
+        return m.word(0);
+    }
+
+    // variant that takes a suffix-array position and an existing mer object
+    inline KeyT getKmerFromPos_(IndexT pos, rapmap::utils::my_mer& m) {
+        m.from_chars(txtPtr_ + (*saPtr_)[pos]);
+        return m.word(0);
+    }
+
+private:
+    // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+    bool FileExists_(const char *path) {
+        struct stat fileStat;
+        if ( stat(path, &fileStat) ) {
+            return false;
+        }
+        if ( !S_ISREG(fileStat.st_mode) ) {
+            return false;
+        }
+        return true;
+    }
+
+    void reorder_fn_()  {
+        /* Adapted from code at: http://blog.merovius.de/2014/08/12/applying-permutation-in-constant.html */
+        // Note: we can actually do this without the bitvector by using the high-order bit
+        // of the start of the suffix array intervals (since they are signed integers and negative
+        // positions are forbidden). 
+        rapmap::utils::my_mer mer;
+        std::vector<bool> bits(data_.size(), false);
+        for ( size_t i = 0; i < data_.size(); ++i ) {
+            if (!bits[i]) {
+                decltype(data_.front()) v = data_[i];
+                decltype(lens_.front()) v2 = lens_[i];
+                auto j = boophf_->lookup(getKmerFromPos_(data_[i], mer));
+                while (i != j) {
+                    auto pj = boophf_->lookup(getKmerFromPos_(data_[j], mer));
+                    std::swap(data_[j], v);
+                    std::swap(lens_[j], v2);
+                    bits[j] = 1;
+                    j = pj; 
+                }
+                data_[i] = v;
+                lens_[i] = v2;
+            }
+        }
+    }
+
+    std::vector<IndexT>* saPtr_;
+    const char* txtPtr_; 
+    size_t textLen_;
+    rapmap::utils::my_mer mer_;
+    bool built_;
+    // Starting offset in the suffix array
+    std::vector<IndexT> data_;
+    // Length of the interval
+    std::vector<uint8_t> lens_;
+    // Overflow table if interval is >= std::numeric_limits<uint8_t>::max()
+    spp::sparse_hash_map<IndexT, IndexT> overflow_;
+    std::unique_ptr<BooPHFT> boophf_{nullptr};
+};
+
+
+#endif // __BOO_MAP_FRUGAL__
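The header comment near the top of FrugalBooMap.hpp is the key design point: a minimal perfect hash maps any key, including k-mers that were never inserted, to some slot, so find() only trusts a lookup after confirming that the suffix-array position stored in that slot really begins with the query k-mer. The sketch below isolates that spot check; decodeKmer, saOffsets and the 2-bit packing are illustrative stand-ins for my_mer and the index structures, not the actual RapMap implementation.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Decode the first k bases starting at text[pos] into a 2-bit packed word
    // (A=0, C=1, G=2, anything else treated as T=3) -- illustration only.
    static uint64_t decodeKmer(const std::string& text, size_t pos, uint32_t k) {
      uint64_t w = 0;
      for (uint32_t i = 0; i < k; ++i) {
        uint64_t c;
        switch (text[pos + i]) {
          case 'A': c = 0; break;
          case 'C': c = 1; break;
          case 'G': c = 2; break;
          default:  c = 3; break;
        }
        w = (w << 2) | c;
      }
      return w;
    }

    // Return the slot index for queryKmer, or -1 if the perfect-hash slot does
    // not actually correspond to that k-mer (the "spot check").
    static int64_t spotCheckedLookup(uint64_t queryKmer, uint64_t phfSlot,
                                     const std::vector<int32_t>& saOffsets,
                                     const std::vector<int32_t>& SA,
                                     const std::string& text, uint32_t k) {
      if (phfSlot >= saOffsets.size()) { return -1; }  // foreign key hashed out of range
      auto textPos = SA[saOffsets[phfSlot]];           // start of the slot's suffix
      return (decodeKmer(text, textPos, k) == queryKmer)
                 ? static_cast<int64_t>(phfSlot) : -1; // only trust a verified match
    }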
diff --git a/include/HitManager.hpp b/include/HitManager.hpp
index 24a288e..b318b0e 100644
--- a/include/HitManager.hpp
+++ b/include/HitManager.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __HIT_MANAGER_HPP__
 #define __HIT_MANAGER_HPP__
 
@@ -96,6 +117,7 @@ namespace rapmap {
         SAHitMap intersectSAHits(
                                  std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
                                  RapMapIndexT& rmi, 
+                                 size_t readLen,
                                  bool strictFilter=false);
 
         template <typename RapMapIndexT>
diff --git a/include/IndexHeader.hpp b/include/IndexHeader.hpp
index 87eba2d..01ab45f 100644
--- a/include/IndexHeader.hpp
+++ b/include/IndexHeader.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __INDEX_HEADER_HPP__
 #define __INDEX_HEADER_HPP__
 
diff --git a/include/JFRaw.hpp b/include/JFRaw.hpp
index 4efa052..6de360c 100644
--- a/include/JFRaw.hpp
+++ b/include/JFRaw.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __JF_RAW_H__
 #define __JF_RAW_H__
 
diff --git a/include/PairAlignmentFormatter.hpp b/include/PairAlignmentFormatter.hpp
index a3e90fe..b7fbca7 100644
--- a/include/PairAlignmentFormatter.hpp
+++ b/include/PairAlignmentFormatter.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __PAIR_ALIGNMENT_FORMATTER_HPP__
 #define __PAIR_ALIGNMENT_FORMATTER_HPP__
 
diff --git a/include/RapMapConfig.hpp b/include/RapMapConfig.hpp
index df7d935..af5de9d 100644
--- a/include/RapMapConfig.hpp
+++ b/include/RapMapConfig.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __RAPMAP_CONFIG_HPP__
 #define __RAPMAP_CONFIG_HPP__
 
@@ -5,10 +26,10 @@
 
 namespace rapmap {
     constexpr char majorVersion[] = "0";
-    constexpr char minorVersion[] = "3";
+    constexpr char minorVersion[] = "4";
     constexpr char patchVersion[] = "0";
-    constexpr char version [] = "0.3.0";
-    constexpr uint32_t indexVersion = 2;
+    constexpr char version [] = "0.4.0";
+    constexpr uint32_t indexVersion = 3;
 }
 
 #endif //__RAPMAP_CONFIG_HPP__
diff --git a/include/RapMapFileSystem.hpp b/include/RapMapFileSystem.hpp
index 0292128..dcc9c6d 100644
--- a/include/RapMapFileSystem.hpp
+++ b/include/RapMapFileSystem.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __RAPMAP_FILESYSTEM_HPP__
 #define __RAPMAP_FILESYSTEM_HPP__
 
diff --git a/include/RapMapIndex.hpp b/include/RapMapIndex.hpp
index 3994d8d..3152ce5 100644
--- a/include/RapMapIndex.hpp
+++ b/include/RapMapIndex.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __RAP_MAP_INDEX_HPP__
 #define __RAP_MAP_INDEX_HPP__
 
diff --git a/include/RapMapSAIndex.hpp b/include/RapMapSAIndex.hpp
index 4bb5f83..4eb11bb 100644
--- a/include/RapMapSAIndex.hpp
+++ b/include/RapMapSAIndex.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __RAPMAP_SA_INDEX_HPP__
 #define __RAPMAP_SA_INDEX_HPP__
 
@@ -7,9 +28,9 @@
 #include <cereal/archives/binary.hpp>
 
 #include "spdlog/spdlog.h"
-#include "spdlog/details/format.h"
+#include "spdlog/fmt/ostr.h"
+#include "spdlog/fmt/fmt.h"
 
-#include "google/dense_hash_map"
 #include "bit_array.h"
 //#include "bitmap.h"
 //#include "shared.h"
diff --git a/include/RapMapUtils.hpp b/include/RapMapUtils.hpp
index 6239b4c..f83d47d 100644
--- a/include/RapMapUtils.hpp
+++ b/include/RapMapUtils.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __RAP_MAP_UTILS_HPP__
 #define __RAP_MAP_UTILS_HPP__
 
@@ -5,10 +26,13 @@
 #include <cmath>
 #include <memory>
 #include "xxhash.h"
+#include "sparsepp.h"
+#include "SparseHashSerializer.hpp"
 #include <cereal/archives/binary.hpp>
 #include "jellyfish/mer_dna.hpp"
 #include "spdlog/spdlog.h"
-#include "spdlog/details/format.h"
+#include "spdlog/fmt/ostr.h"
+#include "spdlog/fmt/fmt.h"
 #include "PairSequenceParser.hpp"
 
 #ifdef RAPMAP_SALMON_SUPPORT
@@ -32,6 +56,16 @@ class SingleAlignmentFormatter;
 // Forward-declare because the C++ compiler is dumb
 class RapMapIndex;
 
+template<typename KeyT, typename ValT, typename HasherT>
+//using RegHashT = google::dense_hash_map<KeyT, ValT, HasherT>;
+using RegHashT = spp::sparse_hash_map<KeyT, ValT, HasherT>;
+
+template<typename KeyT, typename ValT>
+class FrugalBooMap;
+
+template<typename KeyT, typename ValT>
+using PerfectHashT = FrugalBooMap<KeyT, ValT>;
+
 namespace rapmap {
     namespace utils {
 
@@ -49,7 +83,7 @@ namespace rapmap {
     template <typename IndexT>
         void writeSAMHeader(IndexT& rmi, std::shared_ptr<spdlog::logger> out) {
             fmt::MemoryWriter hd;
-            hd.write("@HD\tVN:0.1\tSO:unknown\n");
+	    hd.write("@HD\tVN:1.0\tSO:unknown\n");
 
             auto& txpNames = rmi.txpNames;
             auto& txpLens = rmi.txpLens;
@@ -58,18 +92,18 @@ namespace rapmap {
             for (size_t i = 0; i < numRef; ++i) {
                 hd.write("@SQ\tSN:{}\tLN:{:d}\n", txpNames[i], txpLens[i]);
             }
-            // Eventuall output a @PG line
-            //hd.format("@PG\t");
+            // Eventually output a @PG line
+            hd.write("@PG\tID:rapmap\tPN:rapmap\tVN:0.3.1\n");
             std::string headerStr(hd.str());
             // Don't include the last '\n', since the logger will do it for us.
             headerStr.pop_back();
-            out->info() << headerStr;
+            out->info(headerStr);
         }
 
     template <typename IndexT>
         void writeSAMHeader(IndexT& rmi, std::ostream& outStream) {
             fmt::MemoryWriter hd;
-            hd.write("@HD\tVN:0.1\tSO:unknown\n");
+	    hd.write("@HD\tVN:1.0\tSO:unknown\n");
 
             auto& txpNames = rmi.txpNames;
             auto& txpLens = rmi.txpLens;
@@ -78,8 +112,8 @@ namespace rapmap {
             for (size_t i = 0; i < numRef; ++i) {
                 hd.write("@SQ\tSN:{}\tLN:{:d}\n", txpNames[i], txpLens[i]);
             }
-            // Eventuall output a @PG line
-            //hd.format("@PG\t");
+            // Eventually output a @PG line
+            hd.write("@PG\tID:rapmap\tPN:rapmap\tVN:0.3.1\n");
             outStream << hd.str();
         }
 
@@ -116,13 +150,17 @@ namespace rapmap {
     struct SAIntervalWithKey {
         uint64_t kmer;
       //  SAInterval<IndexT> second;
-        IndexT begin;
-        IndexT end;
+        IndexT begin_;
+        IndexT end_;
+
+        inline IndexT begin() { return begin_; }
+        inline IndexT end() { return end_; }
+
         template <typename Archive>
-            void load(Archive& ar) { ar(kmer, begin, end); }
+            void load(Archive& ar) { ar(kmer, begin_, end_); }
 
         template <typename Archive>
-            void save(Archive& ar) const { ar(kmer, begin, end); }
+            void save(Archive& ar) const { ar(kmer, begin_, end_); }
     };
 
     template <typename IndexT>
@@ -136,14 +174,20 @@ namespace rapmap {
 	  end = *(il.begin());
 	}
 	*/
+        using index_type = IndexT;
+        IndexT begin_;
+        IndexT end_;
+        
+        inline IndexT begin() { return begin_; }
+        inline IndexT end() { return end_; }
 
-        IndexT begin;
-        IndexT end;
         template <typename Archive>
-            void load(Archive& ar) { ar(begin, end); }
+        void load(Archive& ar) { ar(begin_, end_); }
+        //void load(Archive& ar) { ar(begin_, len_); }
 
         template <typename Archive>
-            void save(Archive& ar) const { ar(begin, end); }
+        void save(Archive& ar) const { ar(begin_, end_); }
+        //void save(Archive& ar) const { ar(begin_, len_); }
     };
 
 
@@ -167,12 +211,17 @@ namespace rapmap {
     };
 
     class KmerKeyHasher {
+        //spp::spp_hash<uint64_t> hasher;
         public:
-            size_t operator()(const uint64_t& m) const {
+        //inline size_t operator()(const uint64_t& m) const { //{ return hasher(m); }
+        inline size_t operator()(const rapmap::utils::my_mer& m) const { //{ return hasher(m); }
                 //auto k = rapmap::utils::my_mer::k();
                 //auto v = m.get_bits(0, 2*k);
-                auto v = m;
-                return XXH64(static_cast<void*>(&v), 8, 0);
+                //auto v = m;
+            return XXH64(static_cast<void*>(const_cast<rapmap::utils::my_mer::base_type*>(m.data())), sizeof(m.word(0)) * m.nb_words(), 0);
+            }
+        inline size_t operator()(const uint64_t& m) const { //{ return hasher(m); }
+            return XXH64(static_cast<void*>(const_cast<uint64_t*>(&m)), sizeof(m), 0);
             }
     };
 
@@ -299,7 +348,23 @@ namespace rapmap {
 #ifdef RAPMAP_SALMON_SUPPORT
         inline uint32_t transcriptID() const { return tid; }
         inline double score() { return 1.0; }
-        inline uint32_t fragLength() { return fragLen; }
+        inline uint32_t fragLength() const { return fragLen; }
+
+        inline uint32_t fragLengthPedantic(uint32_t txpLen) const {
+            if (mateStatus != rapmap::utils::MateStatus::PAIRED_END_PAIRED
+                or fwd == mateIsFwd) {
+                return 0;
+            }
+            int32_t p1 = fwd ? pos : matePos;
+            p1 = (p1 < 0) ? 0 : p1;
+            p1 = (p1 > txpLen) ? txpLen : p1;
+            int32_t p2 = fwd ? matePos + mateLen : pos + readLen;
+            p2 = (p2 < 0) ? 0 : p2;
+            p2 = (p2 > txpLen) ? txpLen : p2;
+
+            return (p1 > p2) ? p1 - p2 : p2 - p1;
+        }
+
         inline int32_t hitPos() { return std::min(pos, matePos); }
         double logProb{HUGE_VAL};
         double logBias{HUGE_VAL};
@@ -373,13 +438,13 @@ namespace rapmap {
          * This enforces a more stringent consistency check on
          * the hits for this transcript.  The hits must be co-linear
          * with respect to the query and target.
-         * 
+         *
          * input: numToCheck --- the number of hits to check in sorted order
          *                       hits after the last of these need not be consistent.
-         * return: numToCheck if the first numToCheck hits are consistent; 
+         * return: numToCheck if the first numToCheck hits are consistent;
          *         -1 otherwise
          **/
-        int32_t checkConsistent(int32_t numToCheck) {
+        int32_t checkConsistent(size_t readLen, int32_t numToCheck) {
             auto numHits = tqvec.size();
 
             // special case for only 1 or two hits (common)
@@ -387,20 +452,37 @@ namespace rapmap {
                 return numToCheck;
             } else if (numHits == 2) {
                 auto& h1 = (tqvec[0].queryPos < tqvec[1].queryPos) ? tqvec[0] : tqvec[1];
-                auto& h2 = (tqvec[0].queryPos < tqvec[1].queryPos) ? tqvec[1] : tqvec[2];
-                return (h2.pos > h1.pos) ? (numToCheck) : -1;
+                auto& h2 = (tqvec[0].queryPos < tqvec[1].queryPos) ? tqvec[1] : tqvec[0];
+                if (h2.pos > h1.pos) {
+                    int32_t distortion = (h2.pos - h1.pos) - (h2.queryPos - h1.queryPos);
+                    return (distortion > -10 and distortion < 10) ? numToCheck : -1;
+                } else {
+                    return -1;
+                }
+                //return (h2.pos > h1.pos) ? (numToCheck) : -1;
             } else {
                 // first, sort by query position
-                std::sort(tqvec.begin(), tqvec.end(), 
+                std::sort(tqvec.begin(), tqvec.end(),
                           [](const SATxpQueryPos& q1, const SATxpQueryPos& q2) -> bool {
                               return q1.queryPos < q2.queryPos;
                           });
 
                 int32_t lastRefPos{std::numeric_limits<int32_t>::min()};
+                int32_t lastQueryPos{std::numeric_limits<int32_t>::min()};
+                bool firstHit{true};
+                //int32_t maxDistortion{0};
                 for (size_t i = 0; i < numToCheck; ++i) {
                     int32_t refPos = static_cast<int32_t>(tqvec[i].pos);
+                    int32_t queryPos = static_cast<int32_t>(tqvec[i].queryPos);
                     if (refPos > lastRefPos) {
+                        int32_t distortion = 
+                            firstHit ? 0 : ((refPos - lastRefPos) - (queryPos - lastQueryPos));
+                        firstHit = false;
+                        if (distortion < -10 or distortion > 10) {
+                            return i;
+                        }
                         lastRefPos = refPos;
+                        lastQueryPos = queryPos;
                     } else {
                         return i;
                     }
@@ -617,6 +699,12 @@ namespace rapmap {
                 std::string& readWork,
                 std::string& qualWork);
 
+        void reverseRead(std::string& seq,
+                         std::string& readWork);
+
+
+        std::string reverseComplement(std::string& seq);
+
         template <typename ReadPairT, typename IndexT>
         uint32_t writeAlignmentsToStream(
                 ReadPairT& r,
@@ -685,7 +773,7 @@ namespace rapmap {
                                 int32_t startRead2 = std::max(rightIt->pos, signedZero);
                                 bool read1First{(startRead1 < startRead2)};
                                 int32_t fragStartPos = read1First ? startRead1 : startRead2;
-                                int32_t fragEndPos = read1First ? 
+                                int32_t fragEndPos = read1First ?
                                     (startRead2 + rightIt->readLen) : (startRead1 + leftIt->readLen);
                                 uint32_t fragLen = fragEndPos - fragStartPos;
                                 jointHits.emplace_back(leftTxp,
@@ -749,7 +837,7 @@ namespace rapmap {
                                 int32_t startRead2 = std::max(rightIt->pos, signedZero);
                                 bool read1First{(startRead1 < startRead2)};
                                 int32_t fragStartPos = read1First ? startRead1 : startRead2;
-                                int32_t fragEndPos = read1First ? 
+                                int32_t fragEndPos = read1First ?
                                     (startRead2 + rightIt->readLen) : (startRead1 + leftIt->readLen);
                                 uint32_t fragLen = fragEndPos - fragStartPos;
                                 jointHits.emplace_back(leftTxp,
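The new checkConsistent() logic in the RapMapUtils.hpp changes above replaces the old "reference positions must increase" rule with a colinearity test: for hits sorted by query position, the distortion of a step is the gap on the transcript minus the gap on the read, and the chain is cut once that distortion leaves roughly the (-10, 10) band. A tiny worked example (the positions are made up, not taken from the code):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Hit 1: query position 5,  transcript position 105
      // Hit 2: query position 40, transcript position 141
      int32_t q1 = 5,  p1 = 105;
      int32_t q2 = 40, p2 = 141;

      int32_t refGap     = p2 - p1;            // 36
      int32_t queryGap   = q2 - q1;            // 35
      int32_t distortion = refGap - queryGap;  // 1  -> within (-10, 10), so consistent

      std::cout << "distortion = " << distortion << " -> "
                << ((distortion > -10 && distortion < 10) ? "consistent" : "inconsistent")
                << '\n';
      // A pair with refGap = 36 but queryGap = 20 would give distortion 16,
      // so the hit chain would be cut at that point.
      return 0;
    }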
diff --git a/include/SACollector.hpp b/include/SACollector.hpp
index 261b2ce..20dcad4 100644
--- a/include/SACollector.hpp
+++ b/include/SACollector.hpp
@@ -1,580 +1,765 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef SA_COLLECTOR_HPP
 #define SA_COLLECTOR_HPP
 
-#include "RapMapUtils.hpp"
 #include "RapMapSAIndex.hpp"
+#include "RapMapUtils.hpp"
 #include "SASearcher.hpp"
 
-#include <iostream>
 #include <algorithm>
+#include <iostream>
 #include <iterator>
 
-template <typename RapMapIndexT>
-class SACollector {
-    public:
-    using OffsetT = typename RapMapIndexT::IndexType;
-
-    SACollector(RapMapIndexT* rmi) : rmi_(rmi) {}
-    bool operator()(std::string& read,
-                    std::vector<rapmap::utils::QuasiAlignment>& hits,
-                    SASearcher<RapMapIndexT>& saSearcher,
-                    rapmap::utils::MateStatus mateStatus,
-                    bool strictCheck=false,
-                    bool consistentHits=false) {
-
-        using QuasiAlignment = rapmap::utils::QuasiAlignment;
-        using MateStatus = rapmap::utils::MateStatus;
-
-        //auto& posIDs = rmi_->positionIDs;
-        auto& rankDict = rmi_->rankDict;
-        auto& txpStarts = rmi_->txpOffsets;
-        auto& SA = rmi_->SA;
-        auto& khash = rmi_->khash;
-        auto& text = rmi_->seq;
-        uint32_t sampFactor{1};
-        auto salen = SA.size();
-
-        auto readLen = read.length();
-        auto maxDist = 1.5 * readLen;
-        auto k = rapmap::utils::my_mer::k();
-        auto readStartIt = read.begin();
-        auto readEndIt = read.end();
-
-        auto readRevStartIt = read.rbegin();
-        auto readRevEndIt = read.rend();
-
-        auto rb = read.begin();
-        auto re = rb + k;
-        OffsetT lbLeftFwd = 0, ubLeftFwd = 0;
-        OffsetT lbLeftRC = 0, ubLeftRC = 0;
-        OffsetT lbRightFwd = 0, ubRightFwd = 0;
-        OffsetT lbRightRC = 0, ubRightRC = 0;
-        OffsetT matchedLen;
-
-        uint32_t fwdHit{0};
-        uint32_t rcHit{0};
-
-        bool foundHit = false;
-        bool isRev = false;
-        rapmap::utils::my_mer mer;
-        rapmap::utils::my_mer rcMer;
-
-        enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
-        // Record if k-mers are hits in the
-        // fwd direction, rc direction or both
-        struct KmerDirScore {
-	  KmerDirScore(rapmap::utils::my_mer kmerIn, int32_t kposIn, HitStatus fwdScoreIn, HitStatus rcScoreIn) :
-	    kmer(kmerIn), kpos(kposIn), fwdScore(fwdScoreIn), rcScore(rcScoreIn) {}
-	  KmerDirScore() : kpos(0), fwdScore(UNTESTED), rcScore(UNTESTED) {}
-	  bool operator==(const KmerDirScore& other) const { return kpos == other.kpos; }
-	  bool operator<(const KmerDirScore& other) const { return kpos < other.kpos; }
-          void print() { 
-	    std::cerr << "{ " << kmer.to_str() << ", " <<  kpos << ", " << ((fwdScore) ? "PRESENT" : "ABSENT") << ", " << ((rcScore) ? "PRESENT" : "ABSENT") << "}\t";
-	  }
-            rapmap::utils::my_mer kmer;
-	    int32_t kpos;
-            HitStatus fwdScore;
-            HitStatus rcScore;
-        };
-
-        // This allows implementing our heurisic for comparing
-        // forward and reverse-complement strand matches
-        std::vector<KmerDirScore> kmerScores;
-
-        using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
-
-        std::vector<SAIntervalHit> fwdSAInts;
-        std::vector<SAIntervalHit> rcSAInts;
-
-        std::vector<uint32_t> leftTxps, leftTxpsRC;
-        std::vector<uint32_t> rightTxps, rightTxpsRC;
-        OffsetT maxInterval{1000};
-
-        // The number of bases that a new query position (to which
-        // we skipped) should overlap the previous extension. A
-        // value of 0 means no overlap (the new search begins at the next
-        // base) while a value of (k - 1) means that k-1 bases (one less than
-        // the k-mer size) must overlap.
-        OffsetT skipOverlap = k-1;
-        // Number of nucleotides to skip when encountering a homopolymer k-mer.
-        OffsetT homoPolymerSkip = k/2;
-
-        // Find a hit within the read
-        // While we haven't fallen off the end
-        while (re < read.end()) {
-
-            // Get the k-mer at the current start position.
-            // And make sure that it's valid (contains no Ns).
-            auto pos = std::distance(readStartIt, rb);
-            auto invalidPos = read.find_first_of("nN", pos);
-            if (invalidPos <= pos + k) {
-                rb = read.begin() + invalidPos + 1;
-                re = rb + k;
-                continue;
-            }
+template <typename RapMapIndexT> class SACollector {
+public:
+  using OffsetT = typename RapMapIndexT::IndexType;
 
-            // If the next k-bases are valid, get the k-mer and
-            // reverse complement k-mer
-            mer = rapmap::utils::my_mer(read.c_str() + pos);
-            if (mer.is_homopolymer()) { rb += homoPolymerSkip; re += homoPolymerSkip; continue; }
-            rcMer = mer.get_reverse_complement();
-
-            // See if we can find this k-mer in the hash
-            auto merIt = khash.find(mer.get_bits(0, 2*k));
-            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
-
-            // If we can find the k-mer in the hash, get its SA interval
-            if (merIt != khash.end()) {
-                OffsetT lb = merIt->second.begin;
-                OffsetT ub = merIt->second.end;
-
-                // lb must be 1 *less* then the current lb
-                auto lbRestart = std::max(static_cast<OffsetT>(0), lb-1);
-                // Extend the SA interval using the read sequence as far as
-                // possible
-                std::tie(lbLeftFwd, ubLeftFwd, matchedLen) =
-                    saSearcher.extendSearchNaive(lbRestart, ub, k, rb, readEndIt);
-
-                // If the SA interval is valid, and not too wide, then record
-                // the hit.
-                OffsetT diff = ubLeftFwd - lbLeftFwd;
-                if (ubLeftFwd > lbLeftFwd and diff < maxInterval) {
-                    auto queryStart = std::distance(read.begin(), rb);
-                    fwdSAInts.emplace_back(lbLeftFwd, ubLeftFwd, matchedLen, queryStart, false);
-                    if (strictCheck) {
-                        ++fwdHit;
-                        // If we also match this k-mer in the rc direction
-			if (rcMerIt != khash.end()) {
-			  ++rcHit;
-			  kmerScores.emplace_back(mer, pos, PRESENT, PRESENT);
-			} else { // Otherwise it doesn't match in the rc direction
-			  kmerScores.emplace_back(mer, pos, PRESENT, ABSENT);
-			}
-
-			// If we didn't end the match b/c we exhausted the query
-                        // test the mismatching k-mer in the other strand
-                        // TODO: check for 'N'?
-                        if (rb + matchedLen < readEndIt){
-                            auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
-                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
-                            kmerScores.emplace_back(mer, kmerPos, ABSENT, UNTESTED);
-                        }
-                    } else { // no strict check
-                        ++fwdHit;
-                        if (rcMerIt != khash.end()) { ++rcHit; }
-                    }
-                }
-            }
+  /** Disable NIP skipping **/
+  void disableNIP() { disableNIP_ = true; }
 
-            // See if the reverse complement k-mer is in the hash
-            if (rcMerIt != khash.end()) {
-                lbLeftRC = rcMerIt->second.begin;
-                ubLeftRC = rcMerIt->second.end;
-                OffsetT diff = ubLeftRC - lbLeftRC;
-                if (ubLeftRC > lbLeftRC) {
-                    // The original k-mer didn't match in the foward direction
-                    if (!fwdHit) {
-                        ++rcHit;
-                        if (strictCheck) {
-			  kmerScores.emplace_back(mer, pos, ABSENT, PRESENT);
-                        }
-                    }
-                }
-            }
+  /** Enable NIP skipping --- default state **/
+  void enableNIP() { disableNIP_ = false; }
 
-            // If we had a hit with either k-mer then we can
-            // break out of this loop to look for the next informative position
-            if (fwdHit + rcHit > 0) {
-                foundHit = true;
-                break;
-            }
-            ++rb; ++re;
-        }
+  /** Require a coverage fraction of at least req for all reported mappings **/
+  void setCoverageRequirement(double req) { covReq_ = req; }
+
+  /** Get the current coverage requirement for mappings (0 means no
+   *  requirement). **/
+  double getCoverageRequirement() const { return covReq_; }
+
+  /** If any hit has a suffix array interval of this length or larger, just skip
+   * it **/
+  void setMaxInterval(OffsetT maxInterval) { maxInterval_ = maxInterval; }
+
+  /** Get the maximum allowable suffix array interval **/
+  OffsetT getMaxInterval() const { return maxInterval_; }
 
-        // If we went the entire length of the read without finding a hit
-        // then we can bail.
-        if (!foundHit) { return false; }
-
-        bool lastSearch{false};
-        // If we had a hit on the forward strand
-        if (fwdHit) {
-
-            // The length of this match
-            auto matchLen = fwdSAInts.front().len;
-            // The iterator to where this match began
-            rb = read.begin() + fwdSAInts.front().queryPos;
-
-            // [lb, ub) is the suffix array interval for the MMP (maximum mappable prefix)
-            // of the k-mer we found.  The NIP (next informative position) in the sequence
-            // is the position after the LCE (longest common extension) of
-            // T[SA[lb]:] and T[SA[ub-1]:]
-            auto remainingLength = std::distance(rb + matchLen, readEndIt);
-            auto lce = saSearcher.lce(lbLeftFwd, ubLeftFwd-1, matchLen, remainingLength);
-            auto fwdSkip = std::max(static_cast<OffsetT>(matchLen) - skipOverlap,
-                                    static_cast<OffsetT>(lce) - skipOverlap);
-
-            size_t nextInformativePosition = std::min(
-                    std::max(static_cast<OffsetT>(0),
-                    static_cast<OffsetT>(readLen)- static_cast<OffsetT>(k)),
-                    static_cast<OffsetT>(std::distance(readStartIt, rb) + fwdSkip)
-                    );
-
-            rb = read.begin() + nextInformativePosition;
+  /** Get/Set usage of strict-checking **/
+  bool getStrictCheck() const { return strictCheck_; }
+  void setStrictCheck(bool sc) { strictCheck_ = sc; }
+
+  /** Construct an SACollector given an index **/
+  SACollector(RapMapIndexT* rmi)
+      : rmi_(rmi), hashEnd_(rmi->khash.end()), disableNIP_(false), 
+        covReq_(0.0), maxInterval_(1000),
+        strictCheck_(false) {}
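  // Usage sketch (illustrative only, not part of the patch): a caller that owns an
  // index and an SASearcher would configure the collector once and then invoke
  // operator() per read.  The option values below are placeholders, and
  // "SomeIndexT", "index" and "saSearcher" are hypothetical names.
  //
  //   SACollector<SomeIndexT> collector(&index);
  //   collector.setStrictCheck(true);        // enable the fwd/rc k-mer voting heuristic
  //   collector.setCoverageRequirement(0.5); // require roughly half the read to be covered
  //   collector.setMaxInterval(1000);        // skip overly repetitive SA intervals
  //
  //   std::vector<rapmap::utils::QuasiAlignment> hits;
  //   rapmap::utils::MateStatus ms = /* e.g. single-end */;
  //   bool mapped = collector(read, hits, saSearcher, ms, /*consistentHits=*/false);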
+
+  enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
+  // Record if k-mers are hits in the
+  // fwd direction, rc direction or both
+  struct KmerDirScore {
+    KmerDirScore(rapmap::utils::my_mer kmerIn, int32_t kposIn,
+                 HitStatus fwdScoreIn, HitStatus rcScoreIn)
+        : kmer(kmerIn), kpos(kposIn), fwdScore(fwdScoreIn), rcScore(rcScoreIn) {
+    }
+    KmerDirScore() : kpos(0), fwdScore(UNTESTED), rcScore(UNTESTED) {}
+    bool operator==(const KmerDirScore& other) const {
+      return kpos == other.kpos;
+    }
+    bool operator<(const KmerDirScore& other) const {
+      return kpos < other.kpos;
+    }
+    void print() {
+      std::cerr << "{ " << kmer.to_str() << ", " << kpos << ", "
+                << ((fwdScore) ? "PRESENT" : "ABSENT") << ", "
+                << ((rcScore) ? "PRESENT" : "ABSENT") << "}\t";
+    }
+    rapmap::utils::my_mer kmer;
+    int32_t kpos;
+    HitStatus fwdScore;
+    HitStatus rcScore;
+  };
+
+  bool operator()(std::string& read,
+                  std::vector<rapmap::utils::QuasiAlignment>& hits,
+                  SASearcher<RapMapIndexT>& saSearcher,
+                  rapmap::utils::MateStatus mateStatus,
+                  bool consistentHits = false) {
+
+    using QuasiAlignment = rapmap::utils::QuasiAlignment;
+    using MateStatus = rapmap::utils::MateStatus;
+    using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
+
+    auto& rankDict = rmi_->rankDict;
+    auto& txpStarts = rmi_->txpOffsets;
+    auto& SA = rmi_->SA;
+    auto& khash = rmi_->khash;
+    auto& text = rmi_->seq;
+    auto salen = SA.size();
+    //auto hashEnd_ = khash.end();
+    auto readLen = read.length();
+    auto maxDist = 1.5 * readLen;
+
+    auto k = rapmap::utils::my_mer::k();
+    auto readStartIt = read.begin();
+    auto readEndIt = read.end();
+
+    auto rb = read.begin();
+    auto re = rb + k;
+
+    uint32_t fwdHit{0};
+    uint32_t rcHit{0};
+
+    size_t fwdCov{0};
+    size_t rcCov{0};
+
+    bool foundHit = false;
+    bool isRev = false;
+
+    rapmap::utils::my_mer mer;
+    rapmap::utils::my_mer rcMer;
+
+    bool useCoverageCheck{disableNIP_ and strictCheck_};
+
+    // This allows implementing our heuristic for comparing
+    // forward and reverse-complement strand matches
+    std::vector<KmerDirScore> kmerScores;
+
+    // Where we store the SA intervals for forward and rc hits
+    std::vector<SAIntervalHit> fwdSAInts;
+    std::vector<SAIntervalHit> rcSAInts;
+
+    // Number of nucleotides to skip when encountering a homopolymer k-mer.
+    OffsetT homoPolymerSkip = 1; // k / 2;
+
+    // Iterator for k-mer and rc k-mer lookups
+    auto merIt = hashEnd_;
+    auto rcMerIt = hashEnd_;
+
+    // The position of the k-mer in the read
+    size_t pos{0};
+    // The position of the next 'N' in the read
+    size_t invalidPos{0};
+
+    // Find a hit within the read
+    // While we haven't fallen off the end
+    while (re <= readEndIt) {
+
+      // Get the k-mer at the current start position.
+      // And make sure that it's valid (contains no Ns).
+      pos = std::distance(readStartIt, rb);
+
+      // See if this k-mer would contain an N.
+      // Only check if we don't yet know that there are no remaining Ns.
+      if (invalidPos != std::string::npos) {
+        invalidPos = read.find_first_of("nN", pos);
+        if (invalidPos <= pos + k) {
+          rb = read.begin() + invalidPos + 1;
+          re = rb + k;
+          continue;
+        }
+      }
+
+      // If the next k-bases are valid, get the k-mer and
+      // reverse complement k-mer
+      mer = rapmap::utils::my_mer(read.c_str() + pos);
+      if (mer.is_homopolymer()) {
+        rb += homoPolymerSkip;
+        re += homoPolymerSkip;
+        /* Walk base-by-base rather than skipping
+        // If the first N is within k bases, then this k-mer is invalid
+        if (invalidPos < pos + k) {
+            // Skip to the k-mer starting at the next position
+            // (i.e. right past the N)
+            rb = read.begin() + invalidPos + 1;
             re = rb + k;
+            // Go to the next iteration of the while loop
+            continue;
+        }
+        */
+        continue;
+      }
+      rcMer = mer.get_reverse_complement();
+
+      // See if we can find this k-mer in the hash
+      merIt = khash.find(mer.word(0));//get_bits(0, 2 * k));
+      rcMerIt = khash.find(rcMer.word(0));//rcMer.get_bits(0, 2 * k));
+
+      // If we can find the k-mer in the hash
+      if (merIt != hashEnd_) {
+        if (strictCheck_) {
+          ++fwdHit;
+          // If we also match this k-mer in the rc direction
+          if (rcMerIt != hashEnd_) {
+            ++rcHit;
+            kmerScores.emplace_back(mer, pos, PRESENT, PRESENT);
+          } else { // Otherwise it doesn't match in the rc direction
+            kmerScores.emplace_back(mer, pos, PRESENT, ABSENT);
+          }
+        } else { // no strict check
+          ++fwdHit;
+          if (rcMerIt != hashEnd_) {
+            ++rcHit;
+          }
+        }
+      }
+
+      // See if the reverse complement k-mer is in the hash
+      if (rcMerIt != hashEnd_) {
+        // The original k-mer didn't match in the forward direction
+        if (!fwdHit) {
+          ++rcHit;
+          if (strictCheck_) {
+            kmerScores.emplace_back(mer, pos, ABSENT, PRESENT);
+          }
+        }
+      }
+
+      // If we had a hit with either k-mer then we can
+      // break out of this loop to look for the next informative position
+      if (fwdHit + rcHit > 0) {
+        foundHit = true;
+        break;
+      }
+      ++rb;
+      ++re;
+    }
+
+    // If we went the entire length of the read without finding a hit
+    // then we can bail.
+    if (!foundHit) {
+      return false;
+    }
+
+    bool didCheckFwd{false};
+    // If we had a hit on the forward strand
+    if (fwdHit) {
+      didCheckFwd = true;
+      getSAHits_(saSearcher,
+                 read,             // the read
+                 rb,               // where to start the search
+                 &(merIt->second), // pointer to the search interval
+                 fwdCov, fwdHit, rcHit, fwdSAInts, kmerScores, false);
+    }
+
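+    // Decide whether to search the reverse-complement strand: when the
+    // coverage check is in play, any rc k-mer hit is enough; otherwise we
+    // require at least as many rc hits as forward hits.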
+    bool checkRC = useCoverageCheck ? (rcHit > 0) : (rcHit >= fwdHit);
+    // If we had a hit on the reverse complement strand
+    if (checkRC) {
+      rapmap::utils::reverseRead(read, rcBuffer_);
+      getSAHits_(saSearcher,
+                 rcBuffer_,         // the read
+                 rcBuffer_.begin(), // where to start the search
+                 nullptr,           // pointer to the search interval
+                 rcCov, rcHit, fwdHit, rcSAInts, kmerScores, true);
+    }
+
+    // Now, if we *didn't* check the forward strand at first, but we
+    // encountered fwd hits while looking at the RC strand, then check
+    // the fwd strand now.
+    bool checkFwd = useCoverageCheck ? (fwdHit > 0) : (fwdHit >= rcHit);
+    if (!didCheckFwd and checkFwd) {
+      didCheckFwd = true;
+      getSAHits_(saSearcher,
+                 read,         // the read
+                 read.begin(), // where to start the search
+                 nullptr,      // pointer to the search interval
+                 fwdCov, fwdHit, rcHit, fwdSAInts, kmerScores, false);
+    }
 
-            size_t invalidPos{0};
-            while (re <= readEndIt) {
-                // The offset into the string
-                auto pos = std::distance(readStartIt, rb);
-
-                // The position of the first N in the k-mer (if there is one)
-                // If we have already verified there are no Ns in the remainder
-                // of the string (invalidPos is std::string::npos) then we can
-                // skip this test.
-                if (invalidPos != std::string::npos) {
-                    invalidPos = read.find_first_of("nN", pos);
-                }
-
-                // If the first N is within k bases, then this k-mer is invalid
-                if (invalidPos < pos + k) {
-                    // A valid k-mer can't start until after the 'N'
-                    nextInformativePosition = invalidPos + 1;
-                    rb = read.begin() + nextInformativePosition;
-                    re = rb + k;
-                    // Go to the next iteration of the while loop
-                    continue;
-                }
-
-                // If the current end position is valid
-                if (re <= readEndIt) {
-
-                    mer = rapmap::utils::my_mer(read.c_str() + pos);
-                    if (mer.is_homopolymer()) { rb += homoPolymerSkip; re = rb + k; continue; }
-                    auto merIt = khash.find(mer.get_bits(0, 2*k));
-
-                    if (merIt != khash.end()) {
-                        if (strictCheck) {
-                            ++fwdHit;
-                            kmerScores.emplace_back(mer, pos, PRESENT, UNTESTED);
-                            auto rcMer = mer.get_reverse_complement();
-                            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
-                            if (rcMerIt != khash.end()) {
-                                ++rcHit;
-                                kmerScores.back().rcScore = PRESENT;
-                            }
-                        }
-
-                        lbRightFwd = merIt->second.begin;
-                        ubRightFwd = merIt->second.end;
-
-                        // lb must be 1 *less* then the current lb
-                        lbRightFwd = std::max(static_cast<OffsetT>(0), lbRightFwd - 1);
-                        std::tie(lbRightFwd, ubRightFwd, matchedLen) =
-                            saSearcher.extendSearchNaive(lbRightFwd, ubRightFwd,
-                                    k, rb, readEndIt);
-
-                        OffsetT diff = ubRightFwd - lbRightFwd;
-                        if (ubRightFwd > lbRightFwd and diff < maxInterval) {
-                            auto queryStart = std::distance(read.begin(), rb);
-                            fwdSAInts.emplace_back(lbRightFwd, ubRightFwd, matchedLen, queryStart, false);
-                            // If we didn't end the match b/c we exhausted the query
-                            // test the mismatching k-mer in the other strand
-                            // TODO: check for 'N'?
-                            if (strictCheck and rb + matchedLen < readEndIt){
-                                auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
-                                mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
-				// TODO: 04/11/16
-                                kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
-                            }
-
-                        }
-
-                        if (lastSearch) { break; }
-                        auto mismatchIt = rb + matchedLen;
-                        if (mismatchIt < readEndIt) {
-                            auto remainingDistance = std::distance(mismatchIt, readEndIt);
-                            auto lce = saSearcher.lce(lbRightFwd, ubRightFwd-1, matchedLen, remainingDistance);
-
-                            // Where we would jump if we just used the MMP
-                            auto skipMatch = mismatchIt - skipOverlap;
-                            // Where we would jump if we used the LCE
-                            auto skipLCE = rb + lce - skipOverlap;
-                            // Pick the larger of the two
-                            rb = std::max(skipLCE, skipMatch);
-                            if (rb > (readEndIt - k)) {
-                                rb = readEndIt - k;
-                                lastSearch = true;
-                            }
-                            re = rb + k;
-                        } else {
-                            lastSearch = true;
-                            rb = readEndIt - k;
-                            re = rb + k;
-                        }
-
-                    } else {
-                        rb += sampFactor;
-                        re = rb + k;
-                    }
-                }
+    if (strictCheck_) {
+      // If we're computing coverage, then we can make use of that info here
+      //useCoverageCheck = false;
+      if (useCoverageCheck) {
+        if (fwdCov > rcCov) {
+          rcSAInts.clear();
+        } else if (rcCov > fwdCov) {
+          fwdSAInts.clear();
+        }
+      } else { // use the k-mer "spot check"
+        // The first two conditions shouldn't happen
+        // but I'm just being paranoid here
+        if (fwdHit > 0 and rcHit == 0) {
+          rcSAInts.clear();
+        } else if (rcHit > 0 and fwdHit == 0) {
+          fwdSAInts.clear();
+        } else {
+          std::sort(kmerScores.begin(), kmerScores.end());
+          auto e = std::unique(kmerScores.begin(), kmerScores.end());
+          // Compute the score for the k-mers we need to
+          // test in both the forward and rc directions.
+          int32_t fwdScore{0};
+          int32_t rcScore{0};
+          // For every kmer score structure
+          // std::cerr << "[\n";
+          for (auto kmsIt = kmerScores.begin(); kmsIt != e;
+               ++kmsIt) { //: kmerScores) {
+            auto& kms = *kmsIt;
+            // If the forward k-mer is untested, then test it
+            if (kms.fwdScore == UNTESTED) {
+              auto merIt = khash.find(kms.kmer.word(0));//get_bits(0, 2 * k));
+              kms.fwdScore = (merIt != hashEnd_) ? PRESENT : ABSENT;
             }
+            // accumulate the score
+            fwdScore += kms.fwdScore;
+
+            // If the rc k-mer is untested, then test it
+            if (kms.rcScore == UNTESTED) {
+              rcMer = kms.kmer.get_reverse_complement();
+              auto rcMerIt = khash.find(rcMer.word(0));//get_bits(0, 2 * k));
+              kms.rcScore = (rcMerIt != hashEnd_) ? PRESENT : ABSENT;
+            }
+            // accumulate the score
+            rcScore += kms.rcScore;
+            // kms.print();
+            // std::cerr << "\n";
+          }
+          // std::cerr << "]\n";
+          // If the forward score is strictly greater
+          // then get rid of the rc hits.
+          if (fwdScore > rcScore) {
+            rcSAInts.clear();
+          } else if (rcScore > fwdScore) {
+            // If the rc score is strictly greater
+            // get rid of the forward hits
+            fwdSAInts.clear();
+          }
         }
+      }
+    }
 
-        lastSearch = false;
-        if (rcHit >= fwdHit) {
-            size_t pos{read.length() - k};
-
-            auto revReadEndIt = read.rend();
-
-            auto revRB = read.rbegin();
-            auto revRE = revRB + k;
-
-            auto invalidPosIt = revRB;
-            while (revRE <= revReadEndIt){
-
-                revRE = revRB + k;
-                if (revRE > revReadEndIt) { break; }
-
-                // See if this k-mer would contain an N
-                // only check if we don't yet know that there are no remaining
-                // Ns
-                if (invalidPosIt != revReadEndIt) {
-                    invalidPosIt = std::find_if(revRB, revRE,
-                                                 [](const char c) -> bool {
-                                                     return c == 'n' or c == 'N';
-                                                 });
-                }
-
-                // If we found an N before the end of the k-mer
-                if (invalidPosIt < revRE) {
-                    // Skip to the k-mer starting at the next position
-                    // (i.e. right past the N)
-                    revRB = invalidPosIt + 1;
-                    continue;
-                }
-
-                // The distance from the beginning of the read to the
-                // start of the k-mer
-                pos = std::distance(revRE, revReadEndIt);
-
-                // Get the k-mer and query it in the hash
-                mer = rapmap::utils::my_mer(read.c_str() + pos);
-                if (mer.is_homopolymer()) { revRB += homoPolymerSkip; revRE += homoPolymerSkip; continue; }
-                rcMer = mer.get_reverse_complement();
-                auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
-
-                // If we found the k-mer
-                if (rcMerIt != khash.end()) {
-                    if (strictCheck) {
-                        ++rcHit;
-                        kmerScores.emplace_back(mer, pos, UNTESTED, PRESENT);
-                        auto merIt = khash.find(mer.get_bits(0, 2*k));
-                        if (merIt != khash.end()) {
-                            ++fwdHit;
-                            kmerScores.back().fwdScore = PRESENT;
-                        }
-                    }
-
-
-                    lbRightRC = rcMerIt->second.begin;
-                    ubRightRC = rcMerIt->second.end;
-
-                    // lb must be 1 *less* then the current lb
-                    // We can't move any further in the reverse complement direction
-                    lbRightRC = std::max(static_cast<OffsetT>(0), lbRightRC - 1);
-                    std::tie(lbRightRC, ubRightRC, matchedLen) =
-                        saSearcher.extendSearchNaive(lbRightRC, ubRightRC, k,
-                                revRB, revReadEndIt, true);
-
-                    OffsetT diff = ubRightRC - lbRightRC;
-                    if (ubRightRC > lbRightRC and diff < maxInterval) {
-                        auto queryStart = std::distance(read.rbegin(), revRB);
-                        rcSAInts.emplace_back(lbRightRC, ubRightRC, matchedLen, queryStart, true);
-                        // If we didn't end the match b/c we exhausted the query
-                        // test the mismatching k-mer in the other strand
-                        // TODO: check for 'N'?
-                        if (strictCheck and revRB + matchedLen < revReadEndIt){
-                            auto kmerPos = std::distance(revRB + matchedLen, revReadEndIt);
-                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
-                            // TODO: 04/11/16
-                            kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
-                        }
-                    }
-
-                    if (lastSearch) { break; }
-                    auto mismatchIt = revRB + matchedLen;
-                    if (mismatchIt < revReadEndIt) {
-                        auto remainingDistance = std::distance(mismatchIt, revReadEndIt);
-                        auto lce = saSearcher.lce(lbRightRC, ubRightRC-1, matchedLen, remainingDistance);
-
-                        // Where we would jump if we just used the MMP
-                        auto skipMatch = mismatchIt - skipOverlap;
-                        // Where we would jump if we used the lce
-                        auto skipLCE = revRB + lce - skipOverlap;
-                        // Choose the larger of the two
-                        revRB = std::max(skipLCE, skipMatch);
-                        if (revRB > (revReadEndIt - k)) {
-                            revRB = revReadEndIt - k;
-                            lastSearch = true;
-                        }
-                        revRE = revRB + k;
-                    } else {
-                        lastSearch = true;
-                        revRB = revReadEndIt - k;
-                        revRE = revRB + k;
-                    }
-
-                } else {
-                    revRB += sampFactor;
-                    revRE = revRB + k;
-                }
-            }
+    // Coverage requirements only make sense if
+    // we have disabled NIP skipping.
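+    // e.g. (illustrative numbers): with covReq_ = 0.8 and a 100 bp read,
+    // forward hits are kept only if fwdCov >= 80 matched bases, and
+    // likewise for the reverse-complement hits.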
+    if (covReq_ > 0.0 and disableNIP_) {
+      double fwdFrac{0.0};
+      double rcFrac{0.0};
+      if (fwdSAInts.size() > 0) {
+        fwdFrac = fwdCov / static_cast<double>(readLen);
+        if (fwdFrac < covReq_) {
+          fwdSAInts.clear();
         }
+      }
+      if (rcSAInts.size() > 0) {
+        rcFrac = rcCov / static_cast<double>(readLen);
+        if (rcFrac < covReq_) {
+          rcSAInts.clear();
+        }
+      }
+    }
+
+    auto fwdHitsStart = hits.size();
+    // If we had > 1 forward hit
+    if (fwdSAInts.size() > 1) {
+      auto processedHits = rapmap::hit_manager::intersectSAHits(
+          fwdSAInts, *rmi_, readLen, consistentHits);
+      rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist,
+                                               hits, mateStatus);
+    } else if (fwdSAInts.size() == 1) { // only 1 hit!
+      auto& saIntervalHit = fwdSAInts.front();
+      auto initialSize = hits.size();
+      for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+        auto globalPos = SA[i];
+        auto txpID = rmi_->transcriptAtPosition(globalPos);
+        // the offset into this transcript
+        auto pos = globalPos - txpStarts[txpID];
+        int32_t hitPos = pos - saIntervalHit.queryPos;
+        hits.emplace_back(txpID, hitPos, true, readLen);
+        hits.back().mateStatus = mateStatus;
+      }
+      // Now sort by transcript ID (then position) and eliminate
+      // duplicates
+      auto sortStartIt = hits.begin() + initialSize;
+      auto sortEndIt = hits.end();
+      std::sort(sortStartIt, sortEndIt,
+                [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                  if (a.tid == b.tid) {
+                    return a.pos < b.pos;
+                  } else {
+                    return a.tid < b.tid;
+                  }
+                });
+      auto newEnd = std::unique(
+          hits.begin() + initialSize, hits.end(),
+          [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+            return a.tid == b.tid;
+          });
+      hits.resize(std::distance(hits.begin(), newEnd));
+    }
+    auto fwdHitsEnd = hits.size();
+
+    auto rcHitsStart = fwdHitsEnd;
+    // If we had > 1 rc hit
+    if (rcSAInts.size() > 1) {
+      auto processedHits = rapmap::hit_manager::intersectSAHits(
+          rcSAInts, *rmi_, readLen, consistentHits);
+      rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist,
+                                               hits, mateStatus);
+    } else if (rcSAInts.size() == 1) { // only 1 hit!
+      auto& saIntervalHit = rcSAInts.front();
+      auto initialSize = hits.size();
+      for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+        auto globalPos = SA[i];
+        auto txpID = rmi_->transcriptAtPosition(globalPos);
+        // the offset into this transcript
+        auto pos = globalPos - txpStarts[txpID];
+        int32_t hitPos = pos - saIntervalHit.queryPos;
+        hits.emplace_back(txpID, hitPos, false, readLen);
+        hits.back().mateStatus = mateStatus;
+      }
+      // Now sort by transcript ID (then position) and eliminate
+      // duplicates
+      auto sortStartIt = hits.begin() + rcHitsStart;
+      auto sortEndIt = hits.end();
+      std::sort(sortStartIt, sortEndIt,
+                [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                  if (a.tid == b.tid) {
+                    return a.pos < b.pos;
+                  } else {
+                    return a.tid < b.tid;
+                  }
+                });
+      auto newEnd = std::unique(
+          sortStartIt, sortEndIt,
+          [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+            return a.tid == b.tid;
+          });
+      hits.resize(std::distance(hits.begin(), newEnd));
+    }
+    auto rcHitsEnd = hits.size();
+
+    // If we had both forward and RC hits, then merge them
+    if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
+      // Merge the forward and reverse hits
+      std::inplace_merge(
+          hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd,
+          hits.begin() + rcHitsEnd,
+          [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+            return a.tid < b.tid;
+          });
+      // And get rid of duplicate transcript IDs
+      auto newEnd = std::unique(
+          hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
+          [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+            return a.tid == b.tid;
+          });
+      hits.resize(std::distance(hits.begin(), newEnd));
+    }
+    // Return true if we had any valid hits and false otherwise.
+    return foundHit;
+  }
+
+private:
+  // spot-check k-mers to see if there are forward or rc hits
+  template <typename IteratorT>
+  inline void
+  spotCheck_(rapmap::utils::my_mer mer,
+             size_t pos, // the position of the k-mer on the read
+             size_t readLen,
+             IteratorT* merItPtr,           // nullptr if we haven't checked yet
+             IteratorT* complementMerItPtr, // nullptr if we haven't checked yet
+             bool isRC, // is this being called from the RC of the read
+             uint32_t& strandHits, uint32_t& otherStrandHits,
+             std::vector<KmerDirScore>& kmerScores
+             ) {
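+    // merItPtr / complementMerItPtr follow a small convention: a null pointer
+    // means "not looked up yet -- query the hash here", while a non-null
+    // pointer supplies an iterator we already have (which may be hashEnd_,
+    // recording an ABSENT result without a second lookup).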
+    IteratorT merIt = hashEnd_;
+    IteratorT complementMerIt = hashEnd_;
+    auto& khash = rmi_->khash;
+    //auto hashEnd_ = khash.end();
+    auto k = rapmap::utils::my_mer::k();
+
+    auto complementMer = mer.get_reverse_complement();
+
+    if (merItPtr == nullptr) {
+      // We haven't tested this, so do that here
+      merIt = khash.find(mer.word(0));//get_bits(0, 2 * k));
+    } else {
+      // we already have this
+      merIt = *merItPtr;
+    }
+
+    if (complementMerItPtr == nullptr) {
+      // We haven't tested this, so do that here
+      complementMerIt = khash.find(complementMer.word(0));//get_bits(0, 2 * k));
+    } else {
+      // we already have this
+      complementMerIt = *complementMerItPtr;
+    }
+
+    HitStatus status{UNTESTED};
+    HitStatus complementStatus{UNTESTED};
 
-        if (strictCheck) {
-            // The first two conditions shouldn't happen
-            // but I'm just being paranoid here
-            if (fwdHit > 0 and rcHit == 0) {
-                rcSAInts.clear();
-            } else if (rcHit > 0 and fwdHit == 0) {
-                fwdSAInts.clear();
-            } else {
-	      std::sort( kmerScores.begin(), kmerScores.end() );
-	      auto e = std::unique(kmerScores.begin(), kmerScores.end());
-                // Compute the score for the k-mers we need to
-                // test in both the forward and rc directions.
-                int32_t fwdScore{0};
-                int32_t rcScore{0};
-                // For every kmer score structure
-		//std::cerr << "[\n";
-                for (auto kmsIt = kmerScores.begin(); kmsIt != e; ++kmsIt) {//: kmerScores) {
-   		    auto& kms = *kmsIt;
-                    // If the forward k-mer is untested, then test it
-                    if (kms.fwdScore == UNTESTED) {
-                        auto merIt = khash.find(kms.kmer.get_bits(0, 2*k));
-                        kms.fwdScore = (merIt != khash.end()) ? PRESENT : ABSENT;
-                    }
-                    // accumulate the score
-                    fwdScore += kms.fwdScore;
-
-                    // If the rc k-mer is untested, then test it
-                    if (kms.rcScore == UNTESTED) {
-                        rcMer = kms.kmer.get_reverse_complement();
-                        auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
-                        kms.rcScore = (rcMerIt != khash.end()) ? PRESENT : ABSENT;
-                    }
-                    // accumulate the score
-                    rcScore += kms.rcScore;
-		    //kms.print();
-		    //std::cerr << "\n";
-                }
-		//std::cerr << "]\n";
-                // If the forward score is strictly greater
-                // then get rid of the rc hits.
-                if (fwdScore > rcScore) {
-                    rcSAInts.clear();
-                } else if (rcScore > fwdScore) {
-                    // If the rc score is strictly greater
-                    // get rid of the forward hits
-                    fwdSAInts.clear();
-                }
+    if (merIt != hashEnd_) {
+      ++strandHits;
+      status = PRESENT;
+    } else {
+      status = ABSENT;
+    }
+    if (complementMerIt != hashEnd_) {
+      ++otherStrandHits;
+      complementStatus = PRESENT;
+    } else {
+      complementStatus = ABSENT;
+    }
+
+    HitStatus fwdStatus = isRC ? complementStatus : status;
+    HitStatus rcStatus = isRC ? status : complementStatus;
+
+    if (strictCheck_) {
+      // If we're on the reverse complement strand, then we have to
+      // adjust the k-mer position (pos) so that it is given with
+      // respect to the forward strand.
+      if (isRC) {
+        auto kp = pos;
+        pos = readLen - kp - k;
+        mer = complementMer;
+      }
+      kmerScores.emplace_back(mer, pos, fwdStatus, rcStatus);
+    }
+  }
+  /* 
+  // Attempts to find the next valid k-mer (a k-mer that doesn't contain an 'N' and is 
+  // not a homopolymer).  If no such k-mer exists within the read, then it returns false. 
+  inline bool getNextValidKmer_(const char* read, size_t& pos, rapmap::utils::my_mer& mer) {
+      bool validMer = mer.from_chars(read + pos);
+      // if this kmer contains an 'N' then validMer is false, else true
+  }
+  */
+
+  inline void getSAHits_(
+      SASearcher<RapMapIndexT>& saSearcher, std::string& read,
+      std::string::iterator startIt,
+      rapmap::utils::SAInterval<OffsetT>* startInterval, size_t& cov,
+      uint32_t& strandHits, uint32_t& otherStrandHits,
+      std::vector<rapmap::utils::SAIntervalHit<OffsetT>>& saInts,
+      std::vector<KmerDirScore>& kmerScores,
+      bool isRC // true if read is the reverse complement, false otherwise
+      ) {
+    using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
+    auto& khash = rmi_->khash;
+
+    //auto hashEnd_ = khash.end();
+    decltype(hashEnd_)* nullItPtr = nullptr;
+
+    auto readLen = read.length();
+    auto readStartIt = read.begin();
+    auto readEndIt = read.end();
+    OffsetT matchedLen{0};
+
+    auto k = rapmap::utils::my_mer::k();
+    auto skipOverlapMMP = k - 1;
+    auto skipOverlapNIP = k - 1;
+    OffsetT homoPolymerSkip = 1;//k / 2;
+
+    auto rb = readStartIt;
+    auto re = rb + k;
+    OffsetT lb, ub;
+    size_t invalidPos{0};
+
+    rapmap::utils::my_mer mer, complementMer;
+    auto merIt = hashEnd_;
+    auto complementMerIt = hashEnd_;
+    size_t pos{0};
+    size_t sampFactor{1};
+    bool lastSearch{false};
+    size_t prevMMPEnd{0};
+    bool validMer{true};
+
+    // If we have some place to start that we have already computed
+    // then use it.
+    bool canSkipSetup{startInterval != nullptr};
+
+    if (canSkipSetup) {
+      rb = startIt;
+      re = rb + k;
+      pos = std::distance(readStartIt, rb);
+      invalidPos = pos;
+      lb = startInterval->begin();
+      ub = startInterval->end();
+      goto skipSetup;
+    }
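+    // When canSkipSetup is true, the caller has already located an SA interval
+    // for the k-mer at startIt, so we jump straight to the extension step
+    // (the skipSetup label inside the loop) instead of re-querying the hash.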
+
+    while (re <= readEndIt) {
+      // The distance from the beginning of the read to the
+      // start of the k-mer
+      pos = std::distance(readStartIt, rb);
+      validMer = mer.from_chars(read.c_str() + pos);
+      // Get the next valid k-mer at some position >= pos
+      //validMer = getNextValidKmer_(read, pos, mer);
+      //if (!validMer) { return; }
+
+      // If this k-mer contains an 'N', then find the position
+      // of this character and skip one past it.
+      if (!validMer) {
+        invalidPos = read.find_first_of("nN", pos);
+        // If the first N is within k bases, then this k-mer is invalid
+        if (invalidPos < pos + k) {
+          // Skip to the k-mer starting at the next position
+          // (i.e. right past the N)
+          rb = read.begin() + invalidPos + 1;
+          re = rb + k;
+          // Go to the next iteration of the while loop
+          continue;
+        }
+      }
+      // If we got here, we have a k-mer without an 'N'
+
+      // If this is a homopolymer, then skip it
+      if (mer.is_homopolymer()) {
+        rb += homoPolymerSkip; 
+        re += homoPolymerSkip;
+        /*
+        rb += homoPolymerSkip;
+        re += homoPolymerSkip;
+        // If the default skip jumps us off the end of the read
+        // then try to check the last k-mer
+        if (re >= readEndIt and !lastSearch) {
+          rb = readEndIt - k;
+          re = rb + k;
+          // but give up if that's still a homopolymer
+          lastSearch = true;
+        }
+        */
+        continue;
+      }
+      
+      // If it's not a homopolymer, then get the complement
+      // k-mer and query both in the hash.
+      complementMer = mer.get_reverse_complement();
+      merIt = khash.find(mer.word(0));//get_bits(0, 2 * k));
+
+      // If we found the k-mer
+      if (merIt != hashEnd_) {
+        spotCheck_(mer, pos, readLen, &merIt, nullItPtr, isRC, strandHits,
+                   otherStrandHits, kmerScores);
+
+        lb = merIt->second.begin();
+        ub = merIt->second.end();
+      skipSetup:
+        // lb must be 1 *less* than the current lb
+        // We can't move any further in the reverse complement direction
+        lb = std::max(static_cast<OffsetT>(0), lb - 1);
+        std::tie(lb, ub, matchedLen) =
+            saSearcher.extendSearchNaive(lb, ub, k, rb, readEndIt);
+
+        OffsetT diff = ub - lb;
+        if (ub > lb and diff < maxInterval_) {
+          uint32_t queryStart =
+              static_cast<uint32_t>(std::distance(readStartIt, rb));
+          saInts.emplace_back(lb, ub, matchedLen, queryStart, isRC);
+
+          size_t matchOffset = std::distance(readStartIt, rb);
+          size_t correction = 0;
+
+          // NOTE: prevMMPEnd points 1 position past the last *match* of the
+          // previous MMP (i.e. it points to the *first mismatch*).  This is
+          // why we ignore the case where prevMMPEnd == matchOffset, and why
+          // we don't have to add 1 to correction.
+          if (prevMMPEnd > matchOffset) {
+            correction = prevMMPEnd - matchOffset;
+          }
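+          // e.g. (illustrative numbers): if the previous MMP covered read
+          // positions [10, 40) and this MMP starts at 35 with matchedLen 20,
+          // correction = 40 - 35 = 5, so cov grows by 15 -- only the newly
+          // covered bases.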
+          // Update the coverage and position of the last MMP match
+          cov += (matchedLen - correction);
+          prevMMPEnd = matchOffset + matchedLen;
+
+          // If we didn't end the match b/c we exhausted the query
+          // test the mismatching k-mer in the other strand
+          if (rb + matchedLen < readEndIt) {
+            uint32_t kmerPos = static_cast<uint32_t>(
+                std::distance(readStartIt, rb + matchedLen - skipOverlapMMP));
+            bool validNucs = mer.from_chars(read.c_str() + kmerPos);
+            if (validNucs) {
+              /*
+              // since the MMP *ended* before the end of the read, we assume
+              // that the k-mer one past the MMP is a mismatch (i.e is ABSENT)
+              // we avoid looking it up in spotCheck_ by simply passing a pointer
+              // to the end of the k-mer hash, which will treat this mer as ABSENT.
+              auto endItPtr = &hashEnd_;
+              */
+              // Even though the MMP *ended* before the end of the read, we're still
+              // going to check the mismatching k-mer in both directions to ensure that
+              // it doesn't appear somewhere else in the forward direction
+              spotCheck_(mer, kmerPos, readLen, nullItPtr, nullItPtr, isRC,
+                         strandHits, otherStrandHits, kmerScores);
             }
+          } // we didn't end the search by falling off the end
+        }   // This hit was worth recording --- occurred fewer than maxInterval_
+            // times
+
+        // If we've previously declared that the search that just occurred was
+        // our last, then we're done!
+        if (lastSearch) {
+          return;
         }
 
-        auto fwdHitsStart = hits.size();
-        // If we had > 1 forward hit
-        if (fwdSAInts.size() > 1) {
-            auto processedHits = rapmap::hit_manager::intersectSAHits(fwdSAInts, *rmi_, consistentHits);
-            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
-        } else if (fwdSAInts.size() == 1) { // only 1 hit!
-            auto& saIntervalHit = fwdSAInts.front();
-                auto initialSize = hits.size();
-                for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
-                        auto globalPos = SA[i];
-		            	auto txpID = rmi_->transcriptAtPosition(globalPos);
-                        // the offset into this transcript
-                        auto pos = globalPos - txpStarts[txpID];
-                        int32_t hitPos = pos - saIntervalHit.queryPos;
-                        hits.emplace_back(txpID, hitPos, true, readLen);
-                        hits.back().mateStatus = mateStatus;
-                }
-                // Now sort by transcript ID (then position) and eliminate
-                // duplicates
-                auto sortStartIt = hits.begin() + initialSize;
-                auto sortEndIt = hits.end();
-                std::sort(sortStartIt, sortEndIt,
-                                [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                                if (a.tid == b.tid) {
-                                return a.pos < b.pos;
-                                } else {
-                                return a.tid < b.tid;
-                                }
-                                });
-                auto newEnd = std::unique(hits.begin() + initialSize, hits.end(),
-                                [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                                return a.tid == b.tid;
-                                });
-                hits.resize(std::distance(hits.begin(), newEnd));
+        // Otherwise, figure out how we should continue the search.
+        auto mismatchIt = rb + matchedLen;
+        // If we reached the end of the read, then we're done.
+        if (mismatchIt >= readEndIt) {
+          return;
         }
-        auto fwdHitsEnd = hits.size();
-
-        auto rcHitsStart = fwdHitsEnd;
-        // If we had > 1 rc hit
-        if (rcSAInts.size() > 1) {
-            auto processedHits = rapmap::hit_manager::intersectSAHits(rcSAInts, *rmi_, consistentHits);
-            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
-        } else if (rcSAInts.size() == 1) { // only 1 hit!
-            auto& saIntervalHit = rcSAInts.front();
-            auto initialSize = hits.size();
-            for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
-                auto globalPos = SA[i];
-		        auto txpID = rmi_->transcriptAtPosition(globalPos);
-                // the offset into this transcript
-                auto pos = globalPos - txpStarts[txpID];
-                int32_t hitPos = pos - saIntervalHit.queryPos;
-                hits.emplace_back(txpID, hitPos, false, readLen);
-                hits.back().mateStatus = mateStatus;
-            }
-            // Now sort by transcript ID (then position) and eliminate
-            // duplicates
-            auto sortStartIt = hits.begin() + rcHitsStart;
-            auto sortEndIt = hits.end();
-            std::sort(sortStartIt, sortEndIt,
-                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                    if (a.tid == b.tid) {
-                    return a.pos < b.pos;
-                    } else {
-                    return a.tid < b.tid;
-                    }
-                    });
-            auto newEnd = std::unique(sortStartIt, sortEndIt,
-                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                    return a.tid == b.tid;
-                    });
-            hits.resize(std::distance(hits.begin(), newEnd));
+
+        auto remainingDistance = std::distance(mismatchIt, readEndIt);
+        auto lce = disableNIP_ ? matchedLen
+                               : saSearcher.lce(lb, ub - 1, matchedLen,
+                                                remainingDistance);
+
+        // Where we would jump if we just used the MMP
+        auto skipMatch = mismatchIt - skipOverlapMMP;
+        // if (skipMatch + k )
+        // Where we would jump if we used the LCE
+        auto skipLCE = rb + lce - skipOverlapNIP;
+        // Pick the maximum of the two
+        auto maxSkip = std::max(skipMatch, skipLCE);
+        // And that's where our new search will start
+        rb = maxSkip;
+
+        // If NIP skipping is *enabled*, and we got to the current position
+        // by doing an LCE query, then we allow ourselves to *double check*
+        // by querying the last k-mer in the read.
+        // Otherwise, we just take the skip we're given.
+        if (!disableNIP_ and (lce > matchedLen)) {
+          if (readLen > k) {
+            rb = std::min(readEndIt - k, rb);
+          }
         }
-        auto rcHitsEnd = hits.size();
 
-        // If we had both forward and RC hits, then merge them
-        if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
-            // Merge the forward and reverse hits
-            std::inplace_merge(hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd, hits.begin() + rcHitsEnd,
-                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                    return a.tid < b.tid;
-                    });
-            // And get rid of duplicate transcript IDs
-            auto newEnd = std::unique(hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
-                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
-                    return a.tid == b.tid;
-                    });
-            hits.resize(std::distance(hits.begin(), newEnd));
+        re = rb + k;
+
+        // If the search ends at the end of the read, then
+        // set the flag that we need not try after this.
+        if (re == readEndIt) {
+          lastSearch = true;
         }
-        // Return true if we had any valid hits and false otherwise.
-        return foundHit;
-    }
 
-    private:
-        RapMapIndexT* rmi_;
+      } else { // If we couldn't match this k-mer, move on to the next.
+          
+        // merIt equals hashEnd_ here (the forward k-mer was not found), so
+        // passing &merIt records it as ABSENT without a second lookup, while
+        // the null complementMerItPtr makes spotCheck_ query the complement k-mer.
+        spotCheck_(mer, pos, readLen, &merIt, nullItPtr, isRC, strandHits,
+                   otherStrandHits, kmerScores);
+        rb += sampFactor;
+        re = rb + k;
+      }
+    }
+  }
+
+  RapMapIndexT* rmi_;
+  decltype(rmi_->khash.end()) hashEnd_;
+  bool disableNIP_;
+  double covReq_;
+  OffsetT maxInterval_;
+  bool strictCheck_;
+  std::string rcBuffer_;
 };
 
 #endif // SA_COLLECTOR_HPP
diff --git a/include/SASearcher.hpp b/include/SASearcher.hpp
index b36e476..3b69c4a 100644
--- a/include/SASearcher.hpp
+++ b/include/SASearcher.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef SA_SEARCHER_HPP
 #define SA_SEARCHER_HPP
 
diff --git a/include/ScopedTimer.hpp b/include/ScopedTimer.hpp
index de1121c..25f7397 100644
--- a/include/ScopedTimer.hpp
+++ b/include/ScopedTimer.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __SCOPED_TIMER_HPP__
 #define __SCOPED_TIMER_HPP__
 // from https://gist.github.com/justgord/4482447
@@ -8,15 +29,18 @@ struct ScopedTimer
 {
     std::chrono::high_resolution_clock::time_point t0;
 
-    ScopedTimer()
-        : t0(std::chrono::high_resolution_clock::now())
+    ScopedTimer(bool print=true)
+        : t0(std::chrono::high_resolution_clock::now()), print_(print)
     { }
     ~ScopedTimer(void)
     {
         auto  t1 = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double> elapsedSec =  t1 - t0;
-        std::cerr << "Elapsed time: " << elapsedSec.count() << "s\n";
+        if (print_) { std::cerr << "Elapsed time: " << elapsedSec.count() << "s\n"; }
     }
+
+private:
+    bool print_;
 };
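+// Usage sketch: `{ ScopedTimer t; /* timed work */ }` prints the elapsed
+// seconds when the scope exits; pass ScopedTimer(false) to suppress the
+// message.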
 
 #endif //__SCOPED_TIMER_HPP__
diff --git a/include/SingleAlignmentFormatter.hpp b/include/SingleAlignmentFormatter.hpp
index 2510082..cd559bb 100644
--- a/include/SingleAlignmentFormatter.hpp
+++ b/include/SingleAlignmentFormatter.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __SINGLE_ALIGNMENT_FORMATTER_HPP__
 #define __SINGLE_ALIGNMENT_FORMATTER_HPP__
 
diff --git a/include/SparseHashSerializer.hpp b/include/SparseHashSerializer.hpp
new file mode 100644
index 0000000..f075620
--- /dev/null
+++ b/include/SparseHashSerializer.hpp
@@ -0,0 +1,51 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef SPARSEPP_HASH_SERIALIZER_HPP
+#define SPARSEPP_HASH_SERIALIZER_HPP
+
+#include "sparsepp.h"
+
+namespace spp_utils {
+
+// Cannot be used with data types containing internal pointers;
+// intended for POD key/value types only.
+template <typename key_type, typename value_type> struct pod_hash_serializer {
+  using KeySerializer = spp::sparsehash_internal::pod_serializer<key_type>;
+  using ValueSerializer = spp::sparsehash_internal::pod_serializer<value_type>;
+
+  KeySerializer ks_;
+  ValueSerializer vs_;
+
+  template <typename OUTPUT>
+  bool operator()(OUTPUT* fp, const std::pair<const key_type, value_type>& value) const {
+    return ks_(fp, value.first) && vs_(fp, value.second);
+  }
+
+  template <typename INPUT>
+  bool operator()(INPUT* fp, std::pair<const key_type, value_type>* value) const {
+    return ks_(fp, (key_type*)&value->first) && vs_(fp, (value_type*)&value->second);
+  }
+};
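+
+// Illustrative sketch of the intended use (assuming sparsepp's
+// sparsehash-style serialize()/unserialize() members, which take a
+// serializer functor and a FILE*):
+//
+//   spp::sparse_hash_map<uint64_t, uint32_t> m;
+//   FILE* out = fopen("hash.bin", "wb");
+//   m.serialize(spp_utils::pod_hash_serializer<uint64_t, uint32_t>(), out);
+//   fclose(out);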
+
+}
+
+#endif // SPARSEPP_HASH_SERIALIZER_HPP
diff --git a/include/SpinLock.hpp b/include/SpinLock.hpp
index 56647fa..5bd7aee 100644
--- a/include/SpinLock.hpp
+++ b/include/SpinLock.hpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #ifndef __SPIN_LOCK_HPP__
 #define __SPIN_LOCK_HPP__
 
diff --git a/include/concurrentqueue.h b/include/concurrentqueue.h
new file mode 100644
index 0000000..f826fee
--- /dev/null
+++ b/include/concurrentqueue.h
@@ -0,0 +1,3621 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2016, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#pragma once
+
+#if defined(__GNUC__)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+#ifdef MCDBGQ_USE_RELACY
+#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
+#endif
+#endif
+
+#if defined(__APPLE__)
+#include "TargetConditionals.h"
+#endif
+
+#ifdef MCDBGQ_USE_RELACY
+#include "relacy/relacy_std.hpp"
+#include "relacy_shims.h"
+// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
+// We'll override the default trait malloc ourselves without a macro.
+#undef new
+#undef delete
+#undef malloc
+#undef free
+#else
+#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <cassert>
+#endif
+#include <cstddef>              // for max_align_t
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>		// for CHAR_BIT
+#include <array>
+#include <thread>		// partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+namespace moodycamel { namespace details {
+	template<typename thread_id_t> struct thread_id_converter {
+		typedef thread_id_t thread_id_numeric_size_t;
+		typedef thread_id_t thread_id_hash_t;
+		static thread_id_hash_t prehash(thread_id_t const& x) { return x; }
+	};
+} }
+#if defined(MCDBGQ_USE_RELACY)
+namespace moodycamel { namespace details {
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
+	static inline thread_id_t thread_id() { return rl::thread_index(); }
+} }
+#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel { namespace details {
+	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }
+} }
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE)
+namespace moodycamel { namespace details {
+	static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
+	
+	typedef std::thread::id thread_id_t;
+	static const thread_id_t invalid_thread_id;         // Default ctor creates invalid ID
+
+	// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's
+	// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
+	// be.
+	static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
+
+	template<std::size_t> struct thread_id_size { };
+	template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; };
+	template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; };
+
+	template<> struct thread_id_converter<thread_id_t> {
+		typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+#ifndef __APPLE__
+		typedef std::size_t thread_id_hash_t;
+#else
+		typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+		static thread_id_hash_t prehash(thread_id_t const& x)
+		{
+#ifndef __APPLE__
+			return std::hash<std::thread::id>()(x);
+#else
+			return *reinterpret_cast<thread_id_hash_t const*>(&x);
+#endif
+		}
+	};
+} }
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel { namespace details {
+	typedef std::uintptr_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
+	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+	static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
+} }
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw (expr)
+#else
+#define MOODYCAMEL_TRY if (true)
+#define MOODYCAMEL_CATCH(...) else if (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
+//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // always disabled for now since several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions. 
+// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel { namespace details {
+#if defined(__GNUC__)
+	inline bool likely(bool x) { return __builtin_expect((x), true); }
+	inline bool unlikely(bool x) { return __builtin_expect((x), false); }
+#else
+	inline bool likely(bool x) { return x; }
+	inline bool unlikely(bool x) { return x; }
+#endif
+} }
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel {
+namespace details {
+	template<typename T>
+	struct const_numeric_max {
+		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+		static const T value = std::numeric_limits<T>::is_signed
+			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
+			: static_cast<T>(-1);
+	};
+
+#if defined(__GNUC__) && !defined( __clang__ )
+	typedef ::max_align_t max_align_t;      // GCC forgot to add it to std:: for a while
+#else
+	typedef std::max_align_t max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
+#endif
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+	// General-purpose size type. std::size_t is strongly recommended.
+	typedef std::size_t size_t;
+	
+	// The type used for the enqueue and dequeue indices. Must be at least as
+	// large as size_t. Should be significantly larger than the number of elements
+	// you expect to hold at once, especially if you have a high turnover rate;
+	// for example, on 32-bit x86, if you expect to have over a hundred million
+	// elements or pump several million elements through your queue in a very
+	// short space of time, using a 32-bit type *may* trigger a race condition.
+	// A 64-bit int type is recommended in that case, and in practice will
+	// prevent a race condition no matter the usage of the queue. Note that
+	// whether the queue is lock-free with a 64-bit int type depends on whether
+	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+	typedef std::size_t index_t;
+	
+	// Internally, all elements are enqueued and dequeued from multi-element
+	// blocks; this is the smallest controllable unit. If you expect few elements
+	// but many producers, a smaller block size should be favoured. For few producers
+	// and/or many elements, a larger block size is preferred. A sane default
+	// is provided. Must be a power of 2.
+	static const size_t BLOCK_SIZE = 32;
+	
+	// For explicit producers (i.e. when using a producer token), the block is
+	// checked for being empty by iterating through a list of flags, one per element.
+	// For large block sizes, this is too inefficient, and switching to an atomic
+	// counter-based approach is faster. The switch is made for block sizes strictly
+	// larger than this threshold.
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+	
+	// How many full blocks can be expected for a single explicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// How many full blocks can be expected for a single implicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// The initial size of the hash table mapping thread IDs to implicit producers.
+	// Note that the hash is resized every time it becomes half full.
+	// Must be a power of two, and either 0 or at least 1. If 0, implicit production
+	// (using the enqueue methods without an explicit producer token) is disabled.
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+	
+	// Controls the number of items that an explicit consumer (i.e. one with a token)
+	// must consume before it causes all consumers to rotate and move on to the next
+	// internal queue.
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+	
+	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+	// Enqueue operations that would cause this limit to be surpassed will fail. Note
+	// that this limit is enforced at the block level (for performance reasons), i.e.
+	// it's rounded up to the nearest block size.
+	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
+	
+	
+#ifndef MCDBGQ_USE_RELACY
+	// Memory allocation can be customized if needed.
+	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+#if defined(malloc) || defined(free)
+	// Gah, this is 2015, stop defining macros that break standard code already!
+	// Work around malloc/free being special macros:
+	static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+	static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+	static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
+	static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+	static inline void* malloc(size_t size) { return std::malloc(size); }
+	static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+	// Debug versions when running under the Relacy race detector (ignore
+	// these in user code)
+	static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+	static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
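+
+// A minimal sketch of the customization pattern described above: inherit from
+// ConcurrentQueueDefaultTraits and shadow only the declarations you want to change.
+// The struct name and the particular values below are illustrative user code, not
+// part of the library, and are guarded by #if 0 so they do not affect compilation.
+#if 0
+struct MyQueueTraits : public ConcurrentQueueDefaultTraits
+{
+	// Use 64-bit indices to sidestep the 32-bit wrap-around concern noted for index_t above.
+	typedef std::uint64_t index_t;
+	// Larger blocks suit a few producers moving many elements (must stay a power of 2).
+	static const size_t BLOCK_SIZE = 256;
+};
+// Used as: ConcurrentQueue<int, MyQueueTraits> q;
+#endif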
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+template<typename T, typename Traits> class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+
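+// A minimal, non-compiled usage sketch following the guidance above: at most one
+// producer token and one consumer token per thread, preferring the bulk methods.
+// The element type `int`, the buffer sizes and the function name are illustrative
+// only; the block is kept under #if 0 so the header itself is unchanged.
+#if 0
+void producer_consumer_example()
+{
+	ConcurrentQueue<int> q;
+
+	// Producer side: bulk enqueue with an explicit token where possible.
+	ProducerToken ptok(q);
+	int in[64];
+	for (int i = 0; i != 64; ++i) in[i] = i;
+	q.enqueue_bulk(ptok, in, 64);   // bulk + token: the fastest path
+	q.enqueue(ptok, 64);            // single item + token
+	q.enqueue(65);                  // single item, implicit producer
+
+	// Consumer side: bulk dequeue with a consumer token, then drain singles.
+	ConsumerToken ctok(q);
+	int out[64];
+	std::size_t got = q.try_dequeue_bulk(ctok, out, 64);
+	int item;
+	while (q.try_dequeue(ctok, item)) {
+		++got;
+	}
+	(void)got;
+}
+#endif
+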
+namespace details
+{
+	struct ConcurrentQueueProducerTypelessBase
+	{
+		ConcurrentQueueProducerTypelessBase* next;
+		std::atomic<bool> inactive;
+		ProducerToken* token;
+		
+		ConcurrentQueueProducerTypelessBase()
+			: next(nullptr), inactive(false), token(nullptr)
+		{
+		}
+	};
+	
+	template<bool use32> struct _hash_32_or_64 {
+		static inline std::uint32_t hash(std::uint32_t h)
+		{
+			// MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+			// Since the thread ID is already unique, all we really want to do is propagate that
+			// uniqueness evenly across all the bits, so that we can use a subset of the bits while
+			// reducing collisions significantly
+			h ^= h >> 16;
+			h *= 0x85ebca6b;
+			h ^= h >> 13;
+			h *= 0xc2b2ae35;
+			return h ^ (h >> 16);
+		}
+	};
+	template<> struct _hash_32_or_64<1> {
+		static inline std::uint64_t hash(std::uint64_t h)
+		{
+			h ^= h >> 33;
+			h *= 0xff51afd7ed558ccd;
+			h ^= h >> 33;
+			h *= 0xc4ceb9fe1a85ec53;
+			return h ^ (h >> 33);
+		}
+	};
+	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
+	
+	static inline size_t hash_thread_id(thread_id_t id)
+	{
+		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+		return static_cast<size_t>(hash_32_or_64<sizeof(thread_id_converter<thread_id_t>::thread_id_hash_t)>::hash(
+			thread_id_converter<thread_id_t>::prehash(id)));
+	}
+	
+	template<typename T>
+	static inline bool circular_less_than(T a, T b)
+	{
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4554)
+#endif
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
+		return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << static_cast<T>(sizeof(T) * CHAR_BIT - 1));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+	}
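+	// For illustration: with T = std::uint8_t, circular_less_than(250, 5) is true
+	// (5 is only a short distance "ahead" of 250 once the index wraps around),
+	// while circular_less_than(5, 250) is false.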
+	
+	template<typename U>
+	static inline char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+
+	template<typename T>
+	static inline T ceil_to_pow_2(T x)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
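+	// For illustration: ceil_to_pow_2<std::uint32_t>(33) == 64, values that are
+	// already powers of two are returned unchanged, and 1 maps to 1 (the initial
+	// --x takes care of the exact-power case).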
+	
+	template<typename T>
+	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+	{
+		T temp = std::move(left.load(std::memory_order_relaxed));
+		left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed);
+		right.store(std::move(temp), std::memory_order_relaxed);
+	}
+	
+	template<typename T>
+	static inline T const& nomove(T const& x)
+	{
+		return x;
+	}
+	
+	template<bool Enable>
+	struct nomove_if
+	{
+		template<typename T>
+		static inline T const& eval(T const& x)
+		{
+			return x;
+		}
+	};
+	
+	template<>
+	struct nomove_if<false>
+	{
+		template<typename U>
+		static inline auto eval(U&& x)
+			-> decltype(std::forward<U>(x))
+		{
+			return std::forward<U>(x);
+		}
+	};
+	
+	template<typename It>
+	static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
+	{
+		return *it;
+	}
+	
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };
+#else
+	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };
+#endif
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+	typedef RelacyThreadExitListener ThreadExitListener;
+	typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+	struct ThreadExitListener
+	{
+		typedef void (*callback_t)(void*);
+		callback_t callback;
+		void* userData;
+		
+		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
+	};
+	
+	
+	class ThreadExitNotifier
+	{
+	public:
+		static void subscribe(ThreadExitListener* listener)
+		{
+			auto& tlsInst = instance();
+			listener->next = tlsInst.tail;
+			tlsInst.tail = listener;
+		}
+		
+		static void unsubscribe(ThreadExitListener* listener)
+		{
+			auto& tlsInst = instance();
+			ThreadExitListener** prev = &tlsInst.tail;
+			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+				if (ptr == listener) {
+					*prev = ptr->next;
+					break;
+				}
+				prev = &ptr->next;
+			}
+		}
+		
+	private:
+		ThreadExitNotifier() : tail(nullptr) { }
+		ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		~ThreadExitNotifier()
+		{
+			// This thread is about to exit, let everyone know!
+			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+				ptr->callback(ptr->userData);
+			}
+		}
+		
+		// Thread-local
+		static inline ThreadExitNotifier& instance()
+		{
+			static thread_local ThreadExitNotifier notifier;
+			return notifier;
+		}
+		
+	private:
+		ThreadExitListener* tail;
+	};
+#endif
+#endif
+	
+	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };
+	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
+	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };
+	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
+	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };
+}
+
+
+struct ProducerToken
+{
+	template<typename T, typename Traits>
+	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+	
+	template<typename T, typename Traits>
+	explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+	
+	ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+		: producer(other.producer)
+	{
+		other.producer = nullptr;
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+	}
+	
+	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+	
+	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(producer, other.producer);
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+		if (other.producer != nullptr) {
+			other.producer->token = &other;
+		}
+	}
+	
+	// A token is always valid unless:
+	//     1) Memory allocation failed during construction
+	//     2) It was moved via the move constructor
+	//        (Note: assignment does a swap, leaving both potentially valid)
+	//     3) The associated queue was destroyed
+	// Note that if valid() returns true, that only indicates
+	// that the token is valid for use with a specific queue,
+	// but not which one; that's up to the user to track.
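+	// (For instance, a freshly constructed token from a live queue will normally
+	// report valid() == true, while a token that has been moved-from reports false
+	// until it is assigned to again.)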
+	inline bool valid() const { return producer != nullptr; }
+	
+	~ProducerToken()
+	{
+		if (producer != nullptr) {
+			producer->token = nullptr;
+			producer->inactive.store(true, std::memory_order_release);
+		}
+	}
+	
+	// Disable copying and assignment
+	ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+protected:
+	details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+
+struct ConsumerToken
+{
+	template<typename T, typename Traits>
+	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+	
+	template<typename T, typename Traits>
+	explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+	
+	ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+		: initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer)
+	{
+	}
+	
+	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);
+		return *this;
+	}
+	
+	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(initialOffset, other.initialOffset);
+		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+		std::swap(currentProducer, other.currentProducer);
+		std::swap(desiredProducer, other.desiredProducer);
+	}
+	
+	// Disable copying and assignment
+	ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+private: // but shared with ConcurrentQueue
+	std::uint32_t initialOffset;
+	std::uint32_t lastKnownGlobalOffset;
+	std::uint32_t itemsConsumedFromCurrent;
+	details::ConcurrentQueueProducerTypelessBase* currentProducer;
+	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+	typedef ::moodycamel::ProducerToken producer_token_t;
+	typedef ::moodycamel::ConsumerToken consumer_token_t;
+	
+	typedef typename Traits::index_t index_t;
+	typedef typename Traits::size_t size_t;
+	
+	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#endif
+	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
+	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
+	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));
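+		// (For example, the default capacity of 6 * BLOCK_SIZE pre-allocates exactly
+		// six blocks; any remainder in the division rounds the block count up by one.)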
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		// Track all the producers using a fully-resolved typed list for
+		// each kind; this makes it possible to debug them starting from
+		// the root queue object (otherwise wacky casts are needed that
+		// don't compile in the debugger's expression evaluator).
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Computes the correct number of pre-allocated blocks for you based
+	// on the minimum number of elements you want available at any given
+	// time, and the maximum concurrent number of each type of producer.
+	ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		size_t blocks = ((((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers)) * BLOCK_SIZE;
+		populate_initial_block_list(blocks);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	// This method is not thread safe.
+	~ConcurrentQueue()
+	{
+		// Destroy producers
+		auto ptr = producerListTail.load(std::memory_order_relaxed);
+		while (ptr != nullptr) {
+			auto next = ptr->next_prod();
+			if (ptr->token != nullptr) {
+				ptr->token->producer = nullptr;
+			}
+			destroy(ptr);
+			ptr = next;
+		}
+		
+		// Destroy implicit producer hash tables
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+			while (hash != nullptr) {
+				auto prev = hash->prev;
+				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
+					for (size_t i = 0; i != hash->capacity; ++i) {
+						hash->entries[i].~ImplicitProducerKVP();
+					}
+					hash->~ImplicitProducerHash();
+					(Traits::free)(hash);
+				}
+				hash = prev;
+			}
+		}
+		
+		// Destroy global free list
+		auto block = freeList.head_unsafe();
+		while (block != nullptr) {
+			auto next = block->freeListNext.load(std::memory_order_relaxed);
+			if (block->dynamicallyAllocated) {
+				destroy(block);
+			}
+			block = next;
+		}
+		
+		// Destroy initial free list
+		destroy_array(initialBlockPool, initialBlockPoolSize);
+	}
+
+	// Disable copying and copy assignment
+	ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+		producerCount(other.producerCount.load(std::memory_order_relaxed)),
+		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+		initialBlockPool(other.initialBlockPool),
+		initialBlockPoolSize(other.initialBlockPoolSize),
+		freeList(std::move(other.freeList)),
+		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+	{
+		// Move the other one into this, and leave the other one as an empty queue
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		swap_implicit_producer_hashes(other);
+		
+		other.producerListTail.store(nullptr, std::memory_order_relaxed);
+		other.producerCount.store(0, std::memory_order_relaxed);
+		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+		
+		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+		other.initialBlockPoolSize = 0;
+		other.initialBlockPool = nullptr;
+		
+		reown_producers();
+	}
+	
+	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		return swap_internal(other);
+	}
+	
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other);
+	}
+	
+private:
+	ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+	{
+		if (this == &other) {
+			return *this;
+		}
+		
+		details::swap_relaxed(producerListTail, other.producerListTail);
+		details::swap_relaxed(producerCount, other.producerCount);
+		details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+		std::swap(initialBlockPool, other.initialBlockPool);
+		std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+		freeList.swap(other.freeList);
+		details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+		details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+		
+		swap_implicit_producer_hashes(other);
+		
+		reown_producers();
+		other.reown_producers();
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		details::swap_relaxed(explicitProducers, other.explicitProducers);
+		details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+		
+		return *this;
+	}
+	
+public:
+	// Enqueues a single item (by copying it).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CanAlloc>(item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CanAlloc>(std::move(item));
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CanAlloc>(token, item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CanAlloc>(token, std::move(item));
+	}
+	
+	// Enqueues several items.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
+	}
+	
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory. Fails if not enough room to enqueue (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CannotAlloc>(item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue<CannotAlloc>(std::move(item));
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, std::move(item));
+	}
+	
+	// Enqueues several items.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;
+		return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
+	}
+	
+	
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		// Instead of simply trying each producer in turn (which could cause needless contention on the first
+		// producer), we score them heuristically.
+		size_t nonEmptyCount = 0;
+		ProducerBase* best = nullptr;
+		size_t bestSize = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
+			auto size = ptr->size_approx();
+			if (size > 0) {
+				if (size > bestSize) {
+					bestSize = size;
+					best = ptr;
+				}
+				++nonEmptyCount;
+			}
+		}
+		
+		// If there was at least one non-empty queue but it appears empty at the time
+		// we try to dequeue from it, we need to make sure every queue's been tried
+		if (nonEmptyCount > 0) {
+			if (details::likely(best->dequeue(item))) {
+				return true;
+			}
+			for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+				if (ptr != best && ptr->dequeue(item)) {
+					return true;
+				}
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// This differs from the try_dequeue(item) method in that this one does
+	// not attempt to reduce contention by interleaving the order that producer
+	// streams are dequeued from. So, using this method can reduce overall throughput
+	// under contention, but will give more predictable results in single-threaded
+	// consumer scenarios. This is mostly only useful for internal unit tests.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue_non_interleaved(U& item)
+	{
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			if (ptr->dequeue(item)) {
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		// The idea is roughly as follows:
+		// Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+		// If there's no items where you're supposed to be, keep moving until you find a producer with some items
+		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
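+		// (Illustrative walk-through: with four producers and the default quota of 256,
+		// a token that has pulled 256 items from its current producer bumps the global
+		// offset; each token then shifts to the next producer in the internal list the
+		// next time it dequeues and notices the offset change.)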
+		
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return false;
+			}
+		}
+		
+		// If there was at least one non-empty queue but it appears empty at the time
+		// we try to dequeue from it, we need to make sure every queue's been tried
+		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return true;
+		}
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			if (ptr->dequeue(item)) {
+				token.currentProducer = ptr;
+				token.itemsConsumedFromCurrent = 1;
+				return true;
+			}
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			count += ptr->dequeue_bulk(itemFirst, max - count);
+			if (count == max) {
+				break;
+			}
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return 0;
+			}
+		}
+		
+		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
+		if (count == max) {
+			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return max;
+		}
+		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+		max -= count;
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+			count += dequeued;
+			if (dequeued != 0) {
+				token.currentProducer = ptr;
+				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+			}
+			if (dequeued == max) {
+				break;
+			}
+			max -= dequeued;
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return count;
+	}
+	
+	
+	
+	// Attempts to dequeue from a specific producer's inner queue.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns false if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
+	}
+	
+	// Attempts to dequeue several elements from a specific producer's inner queue.
+	// Returns the number of items actually dequeued.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns 0 if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);
+	}
+	
+	
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	size_t size_approx() const
+	{
+		size_t size = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			size += ptr->size_approx();
+		}
+		return size;
+	}
+	
+	
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe.
+	static bool is_lock_free()
+	{
+		return
+			details::static_is_lock_free<bool>::value == 2 &&
+			details::static_is_lock_free<size_t>::value == 2 &&
+			details::static_is_lock_free<std::uint32_t>::value == 2 &&
+			details::static_is_lock_free<index_t>::value == 2 &&
+			details::static_is_lock_free<void*>::value == 2 &&
+			details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+	}
+
+
+private:
+	friend struct ProducerToken;
+	friend struct ConsumerToken;
+	friend struct ExplicitProducer;
+	friend class ConcurrentQueueTests;
+		
+	enum AllocationMode { CanAlloc, CannotAlloc };
+	
+	
+	///////////////////////////////
+	// Queue methods
+	///////////////////////////////
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(producer_token_t const& token, U&& element)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(U&& element)
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+	}
+	
+	inline bool update_current_producer_after_rotation(consumer_token_t& token)
+	{
+		// Ah, there's been a rotation, figure out where we should be!
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		if (token.desiredProducer == nullptr && tail == nullptr) {
+			return false;
+		}
+		auto prodCount = producerCount.load(std::memory_order_relaxed);
+		auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+		if (details::unlikely(token.desiredProducer == nullptr)) {
+			// Aha, first time we're dequeueing anything.
+			// Figure out our local position
+			// Note: offset is from start, not end, but we're traversing from end -- subtract from count first
+			std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+			token.desiredProducer = tail;
+			for (std::uint32_t i = 0; i != offset; ++i) {
+				token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+				if (token.desiredProducer == nullptr) {
+					token.desiredProducer = tail;
+				}
+			}
+		}
+		
+		std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
+		if (delta >= prodCount) {
+			delta = delta % prodCount;
+		}
+		for (std::uint32_t i = 0; i != delta; ++i) {
+			token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+			if (token.desiredProducer == nullptr) {
+				token.desiredProducer = tail;
+			}
+		}
+		
+		token.lastKnownGlobalOffset = globalOffset;
+		token.currentProducer = token.desiredProducer;
+		token.itemsConsumedFromCurrent = 0;
+		return true;
+	}
+	
+	
+	///////////////////////////
+	// Free list
+	///////////////////////////
+	
+	template <typename N>
+	struct FreeListNode
+	{
+		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
+		
+		std::atomic<std::uint32_t> freeListRefs;
+		std::atomic<N*> freeListNext;
+	};
+	
+	// A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but
+	// simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly
+	// speedy under low contention.
+	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
+	struct FreeList
+	{
+		FreeList() : freeListHead(nullptr) { }
+		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }
+		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+		
+		FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		inline void add(N* node)
+		{
+#if MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+			// set it using a fetch_add
+			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+				// Oh look! We were the last ones referencing this node, and we know
+				// we want to add it to the free list, so let's do it!
+		 		add_knowing_refcount_is_zero(node);
+			}
+		}
+		
+		inline N* try_get()
+		{
+#if MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			auto head = freeListHead.load(std::memory_order_acquire);
+			while (head != nullptr) {
+				auto prevHead = head;
+				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+					head = freeListHead.load(std::memory_order_acquire);
+					continue;
+				}
+				
+				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
+				// next and not worry about it changing between now and the time we do the CAS
+				auto next = head->freeListNext.load(std::memory_order_relaxed);
+				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+					
+					// Decrease refcount twice, once for our ref, and once for the list's ref
+					head->freeListRefs.fetch_add(-2, std::memory_order_release);
+					return head;
+				}
+				
+				// OK, the head must have changed on us, but we still need to decrease the refcount we increased.
+				// Note that we don't need to release any memory effects, but we do need to ensure that the reference
+				// count decrement happens-after the CAS on the head.
+				refs = prevHead->freeListRefs.fetch_add(-1, std::memory_order_acq_rel);
+				if (refs == SHOULD_BE_ON_FREELIST + 1) {
+					add_knowing_refcount_is_zero(prevHead);
+				}
+			}
+			
+			return nullptr;
+		}
+		
+		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+		
+	private:
+		inline void add_knowing_refcount_is_zero(N* node)
+		{
+			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+			// only one copy of this method per node at a time, i.e. the single thread case), then we know
+			// we can safely change the next pointer of the node; however, once the refcount is back above
+			// zero, then other threads could increase it (happens under heavy contention, when the refcount
+			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
+			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
+			auto head = freeListHead.load(std::memory_order_relaxed);
+			while (true) {
+				node->freeListNext.store(head, std::memory_order_relaxed);
+				node->freeListRefs.store(1, std::memory_order_release);
+				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
+					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) {
+						continue;
+					}
+				}
+				return;
+			}
+		}
+		
+	private:
+		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+		std::atomic<N*> freeListHead;
+	
+		static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
+		static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
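+		// (Layout note: the low 31 bits of freeListRefs hold the reference count, while
+		// the top bit records that the node should be re-added to the free list once
+		// that count drops back to zero.)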
+		
+#if MCDBGQ_NOLOCKFREE_FREELIST
+		debug::DebugMutex mutex;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Block
+	///////////////////////////
+	
+	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };
+	
+	struct Block
+	{
+		Block()
+			: next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true)
+		{
+#if MCDBGQ_TRACKMEM
+			owner = nullptr;
+#endif
+		}
+		
+		template<InnerQueueContext context>
+		inline bool is_empty() const
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Check flags
+				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+						return false;
+					}
+				}
+				
+				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+				std::atomic_thread_fence(std::memory_order_acquire);
+				return true;
+			}
+			else {
+				// Check counter
+				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+					std::atomic_thread_fence(std::memory_order_acquire);
+					return true;
+				}
+				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+				return false;
+			}
+		}
+		
+		// Returns true if the block is now empty (does not apply in explicit context)
+		template<InnerQueueContext context>
+		inline bool set_empty(index_t i)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flag
+				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
+				assert(prevVal < BLOCK_SIZE);
+				return prevVal == BLOCK_SIZE - 1;
+			}
+		}
+		
+		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+		// Returns true if the block is now empty (does not apply in explicit context).
+		template<InnerQueueContext context>
+		inline bool set_many_empty(index_t i, size_t count)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flags
+				std::atomic_thread_fence(std::memory_order_release);
+				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;
+				for (size_t j = 0; j != count; ++j) {
+					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+					emptyFlags[i + j].store(true, std::memory_order_relaxed);
+				}
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
+				assert(prevVal + count <= BLOCK_SIZE);
+				return prevVal + count == BLOCK_SIZE;
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void set_all_empty()
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set all flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(true, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void reset_empty()
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Reset flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(false, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+			}
+		}
+		
+		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		
+	private:
+		// IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of
+		// addresses returned by malloc, that alignment will be preserved. Apparently clang actually
+		// generates code that uses this assumption for AVX instructions in some cases. Ideally, we
+		// should also align Block to the alignment of T in case it's higher than malloc's 16-byte
+		// alignment, but this is hard to do in a cross-platform way. Assert for this case:
+		static_assert(std::alignment_of<T>::value <= std::alignment_of<details::max_align_t>::value, "The queue does not support super-aligned types at this time");
+		// Additionally, we need the alignment of Block itself to be a multiple of max_align_t since
+		// otherwise the appropriate padding will not be added at the end of Block in order to make
+		// arrays of Blocks all be properly aligned (not just the first one). We use a union to force
+		// this.
+		union {
+			char elements[sizeof(T) * BLOCK_SIZE];
+			details::max_align_t dummy;
+		};
+	public:
+		Block* next;
+		std::atomic<size_t> elementsCompletelyDequeued;
+		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];
+	public:
+		std::atomic<std::uint32_t> freeListRefs;
+		std::atomic<Block*> freeListNext;
+		std::atomic<bool> shouldBeOnFreeList;
+		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+		
+#if MCDBGQ_TRACKMEM
+		void* owner;
+#endif
+	};
+	static_assert(std::alignment_of<Block>::value >= std::alignment_of<details::max_align_t>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+
+
+#if MCDBGQ_TRACKMEM
+public:
+	struct MemStats;
+private:
+#endif
+	
+	///////////////////////////
+	// Producer base
+	///////////////////////////
+	
+	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
+	{
+		ProducerBase(ConcurrentQueue* parent, bool isExplicit) :
+			tailIndex(0),
+			headIndex(0),
+			dequeueOptimisticCount(0),
+			dequeueOvercommit(0),
+			tailBlock(nullptr),
+			isExplicit(isExplicit),
+			parent(parent)
+		{
+		}
+		
+		virtual ~ProducerBase() { }
+		
+		template<typename U>
+		inline bool dequeue(U& element)
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue(element);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue(element);
+			}
+		}
+		
+		template<typename It>
+		inline size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+		}
+		
+		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+		
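+		// Approximate: head and tail are loaded independently with relaxed ordering, and a
+		// negative difference (head observed past tail) is reported as zero.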
+		inline size_t size_approx() const
+		{
+			auto tail = tailIndex.load(std::memory_order_relaxed);
+			auto head = headIndex.load(std::memory_order_relaxed);
+			return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0;
+		}
+		
+		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+	protected:
+		std::atomic<index_t> tailIndex;		// Where to enqueue to next
+		std::atomic<index_t> headIndex;		// Where to dequeue from next
+		
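+		// dequeueOptimisticCount counts dequeue attempts claimed optimistically; dequeueOvercommit
+		// counts the claims that turned out to have no element, so the effective number of
+		// dequeues is (dequeueOptimisticCount - dequeueOvercommit).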
+		std::atomic<index_t> dequeueOptimisticCount;
+		std::atomic<index_t> dequeueOvercommit;
+		
+		Block* tailBlock;
+		
+	public:
+		bool isExplicit;
+		ConcurrentQueue* parent;
+		
+	protected:
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Explicit queue
+	///////////////////////////
+		
+	struct ExplicitProducer : public ProducerBase
+	{
+		explicit ExplicitProducer(ConcurrentQueue* parent) :
+			ProducerBase(parent, true),
+			blockIndex(nullptr),
+			pr_blockIndexSlotsUsed(0),
+			pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
+			pr_blockIndexFront(0),
+			pr_blockIndexEntries(nullptr),
+			pr_blockIndexRaw(nullptr)
+		{
+			size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1;
+			if (poolBasedIndexSize > pr_blockIndexSize) {
+				pr_blockIndexSize = poolBasedIndexSize;
+			}
+			
+			new_block_index(0);		// This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+		}
+		
+		~ExplicitProducer()
+		{
+			// Destruct any elements not yet dequeued.
+			// Since we're in the destructor, we can assume all elements
+			// are either completely dequeued or completely not (no halfways).
+			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
+				// First find the block that's partially dequeued, if any
+				Block* halfDequeuedBlock = nullptr;
+				if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
+					// (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
+					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);
+					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
+						i = (i + 1) & (pr_blockIndexSize - 1);
+					}
+					assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed)));
+					halfDequeuedBlock = pr_blockIndexEntries[i].block;
+				}
+				
+				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+				auto block = this->tailBlock;
+				do {
+					block = block->next;
+					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+						continue;
+					}
+					
+					size_t i = 0;	// Offset into block
+					if (block == halfDequeuedBlock) {
+						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					}
+					
+					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+						(*block)[i++]->~T();
+					}
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy all blocks that we own
+			if (this->tailBlock != nullptr) {
+				auto block = this->tailBlock;
+				do {
+					auto nextBlock = block->next;
+					if (block->dynamicallyAllocated) {
+						destroy(block);
+					}
+					else {
+						this->parent->add_block_to_free_list(block);
+					}
+					block = nextBlock;
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy the block indices
+			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+			while (header != nullptr) {
+				auto prev = static_cast<BlockIndexHeader*>(header->prev);
+				header->~BlockIndexHeader();
+				(Traits::free)(header);
+				header = prev;
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto startBlock = this->tailBlock;
+				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					// We can re-use the block ahead of us; it's empty!
+					this->tailBlock = this->tailBlock->next;
+					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					
+					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+					// last block from it first -- except instead of removing then adding, we can just overwrite).
+					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
+					// it would have been re-attempted when adding the first block to the queue; since there is such
+					// a block, a block index must have been successfully allocated.
+				}
+				else {
+					// Whatever head value we see here is >= the last value we saw here (relatively),
+					// and <= its current value. Since we have the most recent tail, the head must be
+					// <= to it.
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+						// We can't enqueue in another block because there's not enough leeway -- the
+						// tail could surpass the head by the time the block fills up! (Or we'll exceed
+						// the size limit, if the second part of the condition was true.)
+						return false;
+					}
+					// We're going to need a new block; check that the block index has room
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+						// Hmm, the circular block index is already full -- we'll need
+						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+						// the initial allocation failed in the constructor.
+						
+						if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) {
+							return false;
+						}
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						return false;
+					}
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					++pr_blockIndexSlotsUsed;
+				}
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					// The constructor may throw. We want the element not to appear in the queue in
+					// that case (without corrupting the queue):
+					MOODYCAMEL_TRY {
+						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Revert change to the current block, but leave the new block available
+						// for next time
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				else {
+					(void)startBlock;
+					(void)originalBlockIndexSlotsUsed;
+				}
+				
+				// Add block to block index
+				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+				entry.base = currentTailIndex;
+				entry.block = this->tailBlock;
+				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				// Might be something to dequeue, let's give it a try
+				
+				// Note that this if is purely for performance purposes in the common case when the queue is
+				// empty and the values are eventually consistent -- we may enter here spuriously.
+				
+				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+				// change them) and must be the same value at this point (inside the if) as when the if condition was
+				// evaluated.
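+				// (Worked example: with tail == 10, dequeueOptimisticCount == 7 and dequeueOvercommit == 2,
+				// the net number of dequeues claimed so far is 5, so up to 5 of the 10 enqueued slots may
+				// still hold elements and it's worth attempting a dequeue.)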
+
+				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+				// This ensures that, whatever value we loaded into overcommit, the load of dequeueOptimisticCount in
+				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+				// unfortunately that can't be shown to be correct using only the C++11 standard.
+				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				// Increment optimistic counter, then check if it went over the boundary
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				
+				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+				assert(overcommit <= myDequeueCount);
+				
+				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if (details::likely(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					// Guaranteed to be at least one element to dequeue!
+					
+					// Get the index. Note that since there's guaranteed to be at least one element, this
+					// will never exceed tail. We need to do an acquire-release fence here since it's possible
+					// that whatever condition got us to this point was for an earlier enqueued element (that
+					// we already see the memory effects for), but that by the time we increment somebody else
+					// has incremented it, and we need to see the memory effects for *that* element, which,
+					// in such a case, is necessarily visible on the thread that incremented it in the first
+					// place with the more current condition (they must have acquired a tail that is at least
+					// as recent).
+					auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					
+					// Determine which block the element is in
+					
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					// We need to be careful here about subtracting and dividing because of index wrap-around.
+					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
+					// block size (in order to get a correct signed block count offset in all cases):
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / BLOCK_SIZE);
+					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+					
+					// Dequeue
+					auto& el = *((*block)[index]);
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+						// Make sure the element is still fully dequeued and destroyed even if the assignment
+						// throws
+						struct Guard {
+							Block* block;
+							index_t index;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+							}
+						} guard = { block, index };
+						
+						element = std::move(el);
+					}
+					else {
+						element = std::move(el);
+						el.~T();
+						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+					}
+					
+					return true;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			auto originalBlockIndexFront = pr_blockIndexFront;
+			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+			
+			Block* firstAllocatedBlock = nullptr;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+				// Allocate as many blocks as possible from ahead
+				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					this->tailBlock = this->tailBlock->next;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Now allocate as many blocks as necessary from the block pool
+				while (blockBaseDiff > 0) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+						if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						
+						// pr_blockIndexFront is updated inside new_block_index, so we need to
+						// update our fallback value too (since we keep the new index even if we
+						// later fail)
+						originalBlockIndexFront = originalBlockIndexSlotsUsed;
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						return false;
+					}
+					
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					++pr_blockIndexSlotsUsed;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+				// publish the new block index front
+				auto block = firstAllocatedBlock;
+				while (true) {
+					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (block == this->tailBlock) {
+						break;
+					}
+					block = block->next;
+				}
+				
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+				}
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			auto endBlock = this->tailBlock;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							// Must use copy constructor even if move constructor is available
+							// because we may have to revert if there's an exception.
+							// Sorry about the horrible templated next line, but it was the only way
+							// to disable moving *at compile time*, which is important because a type
+							// may only define a (noexcept) move constructor, and so calls to the
+							// cctor will not compile, even if they are in an if branch that will never
+							// be executed
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Oh dear, an exception's been thrown -- destroy the elements that
+						// were enqueued so far and revert the entire bulk operation (we'll keep
+						// any allocated blocks in our linked list for later, though).
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			
+			if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) {
+				blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+			}
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Determine which block the first element is in
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE);
+					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					do {
+						auto firstIndexInBlock = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						auto block = localBlockIndex->entries[indexIndex].block;
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								// It's too late to revert the dequeue, but we can make sure that all
+								// the dequeued objects are properly destroyed and the block index
+								// (and empty count) are properly updated before we propagate the exception
+								do {
+									block = localBlockIndex->entries[indexIndex].block;
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+									
+									firstIndexInBlock = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		struct BlockIndexEntry
+		{
+			index_t base;
+			Block* block;
+		};
+		
+		struct BlockIndexHeader
+		{
+			size_t size;
+			std::atomic<size_t> front;		// Current slot (unlike pr_blockIndexFront, which is the next slot)
+			BlockIndexEntry* entries;
+			void* prev;
+		};
+		
+		
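+		// Doubles the capacity of the circular block index and copies the live entries across.
+		// The old index is not freed here; it stays linked via header->prev and is only torn
+		// down in the destructor, so a consumer still holding the old pointer reads valid memory.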
+		bool new_block_index(size_t numberOfFilledSlotsToExpose)
+		{
+			auto prevBlockSizeMask = pr_blockIndexSize - 1;
+			
+			// Create the new block
+			pr_blockIndexSize <<= 1;
+			auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
+			if (newRawPtr == nullptr) {
+				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
+				return false;
+			}
+			
+			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
+			
+			// Copy in all the old indices, if any
+			size_t j = 0;
+			if (pr_blockIndexSlotsUsed != 0) {
+				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
+				do {
+					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+					i = (i + 1) & prevBlockSizeMask;
+				} while (i != pr_blockIndexFront);
+			}
+			
+			// Update everything
+			auto header = new (newRawPtr) BlockIndexHeader;
+			header->size = pr_blockIndexSize;
+			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);
+			header->entries = newBlockIndexEntries;
+			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
+			
+			pr_blockIndexFront = j;
+			pr_blockIndexEntries = newBlockIndexEntries;
+			pr_blockIndexRaw = newRawPtr;
+			blockIndex.store(header, std::memory_order_release);
+			
+			return true;
+		}
+		
+	private:
+		std::atomic<BlockIndexHeader*> blockIndex;
+		
+		// To be used by the producer only -- consumers must use the ones referenced by blockIndex
+		size_t pr_blockIndexSlotsUsed;
+		size_t pr_blockIndexSize;
+		size_t pr_blockIndexFront;		// Next slot (not current)
+		BlockIndexEntry* pr_blockIndexEntries;
+		void* pr_blockIndexRaw;
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ExplicitProducer* nextExplicitProducer;
+	private:
+#endif
+		
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Implicit queue
+	//////////////////////////////////
+	
+	struct ImplicitProducer : public ProducerBase
+	{			
+		ImplicitProducer(ConcurrentQueue* parent) :
+			ProducerBase(parent, false),
+			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+			blockIndex(nullptr)
+		{
+			new_block_index();
+		}
+		
+		~ImplicitProducer()
+		{
+			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+			// completed already; this means that all undequeued elements are placed contiguously across
+			// contiguous blocks, and that only the first and last remaining blocks can be partially
+			// empty (all other remaining blocks must be completely full).
+			
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+			// Unregister ourselves for thread termination notification
+			if (!this->inactive.load(std::memory_order_relaxed)) {
+				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+			}
+#endif
+			
+			// Destroy all remaining elements!
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto index = this->headIndex.load(std::memory_order_relaxed);
+			Block* block = nullptr;
+			assert(index == tail || details::circular_less_than(index, tail));
+			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
+			while (index != tail) {
+				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
+					if (block != nullptr) {
+						// Free the old block
+						this->parent->add_block_to_free_list(block);
+					}
+					
+					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
+				}
+				
+				((*block)[index])->~T();
+				++index;
+			}
+			// Even if the queue is empty, there's still one block that's not on the free list
+			// (unless the head index reached the end of it, in which case the tail will be poised
+			// to create a new block).
+			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+				this->parent->add_block_to_free_list(this->tailBlock);
+			}
+			
+			// Destroy block index
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			if (localBlockIndex != nullptr) {
+				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
+					localBlockIndex->index[i]->~BlockIndexEntry();
+				}
+				do {
+					auto prev = localBlockIndex->prev;
+					localBlockIndex->~BlockIndexHeader();
+					(Traits::free)(localBlockIndex);
+					localBlockIndex = prev;
+				} while (localBlockIndex != nullptr);
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto head = this->headIndex.load(std::memory_order_relaxed);
+				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+					return false;
+				}
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				// Find out where we'll be inserting this block in the block index
+				BlockIndexEntry* idxEntry;
+				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+					return false;
+				}
+				
+				// Get ahold of a new block
+				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+				if (newBlock == nullptr) {
+					rewind_block_index_tail();
+					idxEntry->value.store(nullptr, std::memory_order_relaxed);
+					return false;
+				}
+#if MCDBGQ_TRACKMEM
+				newBlock->owner = this;
+#endif
+				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					// May throw, try to insert now before we publish the fact that we have this new block
+					MOODYCAMEL_TRY {
+						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						rewind_block_index_tail();
+						idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						this->parent->add_block_to_free_list(newBlock);
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				// Insert the new block into the index
+				idxEntry->value.store(newBlock, std::memory_order_relaxed);
+				
+				this->tailBlock = newBlock;
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (nullptr) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)
+		{
+			// See ExplicitProducer::dequeue for rationale and explanation
+			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if (details::likely(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					// Determine which block the element is in
+					auto entry = get_block_index_entry_for_index(index);
+					
+					// Dequeue
+					auto block = entry->value.load(std::memory_order_relaxed);
+					auto& el = *((*block)[index]);
+					
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+						// Note: Acquiring the mutex with every dequeue instead of only when a block
+						// is released is very sub-optimal, but it is, after all, purely debug code.
+						debug::DebugLock lock(producer->mutex);
+#endif
+						struct Guard {
+							Block* block;
+							index_t index;
+							BlockIndexEntry* entry;
+							ConcurrentQueue* parent;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+									entry->value.store(nullptr, std::memory_order_relaxed);
+									parent->add_block_to_free_list(block);
+								}
+							}
+						} guard = { block, index, entry, this->parent };
+						
+						element = std::move(el);
+					}
+					else {
+						element = std::move(el);
+						el.~T();
+					
+						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+							{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Add the block back into the global free pool (and remove from block index)
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+					}
+					
+					return true;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			
+			// Note that the tailBlock we start off with may not be owned by us any more;
+			// this happens if it was filled up exactly to the top (setting tailIndex to
+			// the first index of the next block which is not yet allocated), then dequeued
+			// completely (putting it on the free list) before we enqueue again.
+			
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			Block* firstAllocatedBlock = nullptr;
+			auto endBlock = this->tailBlock;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				do {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					// Find out where we'll be inserting this block in the block index
+					BlockIndexEntry* idxEntry;
+					Block* newBlock;
+					bool indexInserted = false;
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+						// Index allocation or block allocation failed; revert any other allocations
+						// and index insertions done so far for this operation
+						if (indexInserted) {
+							rewind_block_index_tail();
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						}
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						
+						return false;
+					}
+					
+#if MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+					newBlock->next = nullptr;
+					
+					// Insert the new block into the index
+					idxEntry->value.store(newBlock, std::memory_order_relaxed);
+					
+					// Store the chain of blocks so that we can undo if later allocations fail,
+					// and so that we can find the blocks when we do the actual enqueueing
+					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+						assert(this->tailBlock != nullptr);
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					endBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+				} while (blockBaseDiff > 0);
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				assert(overcommit <= myDequeueCount);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					BlockIndexHeader* localBlockIndex;
+					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);
+					do {
+						auto blockStartIndex = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						
+						auto entry = localBlockIndex->index[indexIndex];
+						auto block = entry->value.load(std::memory_order_relaxed);
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								do {
+									entry = localBlockIndex->index[indexIndex];
+									block = entry->value.load(std::memory_order_relaxed);
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									
+									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+										debug::DebugLock lock(mutex);
+#endif
+										entry->value.store(nullptr, std::memory_order_relaxed);
+										this->parent->add_block_to_free_list(block);
+									}
+									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+									
+									blockStartIndex = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+							{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		// The block size must be > 1, so any number with the low bit set is an invalid block base index
+		static const index_t INVALID_BLOCK_BASE = 1;
+		
+		struct BlockIndexEntry
+		{
+			std::atomic<index_t> key;
+			std::atomic<Block*> value;
+		};
+		
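+		// The implicit producer's block index is a circular array of entry pointers ordered by
+		// block base index; 'tail' names the most recently inserted entry, and lookups walk from
+		// it by a signed offset (see get_block_index_index_for_index below).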
+		struct BlockIndexHeader
+		{
+			size_t capacity;
+			std::atomic<size_t> tail;
+			BlockIndexEntry* entries;
+			BlockIndexEntry** index;
+			BlockIndexHeader* prev;
+		};
+		
+		template<AllocationMode allocMode>
+		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// We're the only writer thread, relaxed is OK
+			auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+			idxEntry = localBlockIndex->index[newTail];
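+			// A slot is reusable if it was never filled (key == INVALID_BLOCK_BASE) or if its block
+			// has since been fully dequeued and handed back to the free list (value == nullptr).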
+			if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+				idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+				
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+			
+			// No room in the old block index, try to allocate another one!
+			if (allocMode == CannotAlloc || !new_block_index()) {
+				return false;
+			}
+			localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+			idxEntry = localBlockIndex->index[newTail];
+			assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+			idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+			localBlockIndex->tail.store(newTail, std::memory_order_release);
+			return true;
+		}
+		
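+		// Backs out the slot claimed by the most recent insert_block_index_entry call; used when
+		// block allocation fails (or a bulk enqueue is rolled back) after the entry was inserted.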
+		inline void rewind_block_index_tail()
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed);
+		}
+		
+		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
+		{
+			BlockIndexHeader* localBlockIndex;
+			auto idx = get_block_index_index_for_index(index, localBlockIndex);
+			return localBlockIndex->index[idx];
+		}
+		
+		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
+		{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+			debug::DebugLock lock(mutex);
+#endif
+			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);
+			localBlockIndex = blockIndex.load(std::memory_order_acquire);
+			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+			assert(tailBase != INVALID_BLOCK_BASE);
+			// Note: Must use division instead of shift because the index may wrap around, causing a negative
+			// offset, whose negativity we want to preserve
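+			// e.g. with BLOCK_SIZE == 32, tailBase == 96 and index == 32, the signed offset is -2,
+			// selecting the entry two slots behind the tail (modulo the index capacity).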
+			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / BLOCK_SIZE);
+			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
+			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+			return idx;
+		}
+		
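+		// Allocates a block index of twice the previous capacity. Old entries are not copied;
+		// only their pointers are carried over into the new 'index' array, and the old header
+		// stays reachable through 'prev' so it can be freed in the destructor.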
+		bool new_block_index()
+		{
+			auto prev = blockIndex.load(std::memory_order_relaxed);
+			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
+			auto raw = static_cast<char*>((Traits::malloc)(
+				sizeof(BlockIndexHeader) +
+				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+			if (raw == nullptr) {
+				return false;
+			}
+			
+			auto header = new (raw) BlockIndexHeader;
+			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+			if (prev != nullptr) {
+				auto prevTail = prev->tail.load(std::memory_order_relaxed);
+				auto prevPos = prevTail;
+				size_t i = 0;
+				do {
+					prevPos = (prevPos + 1) & (prev->capacity - 1);
+					index[i++] = prev->index[prevPos];
+				} while (prevPos != prevTail);
+				assert(i == prevCapacity);
+			}
+			for (size_t i = 0; i != entryCount; ++i) {
+				new (entries + i) BlockIndexEntry;
+				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+				index[prevCapacity + i] = entries + i;
+			}
+			header->prev = prev;
+			header->entries = entries;
+			header->index = index;
+			header->capacity = nextBlockIndexCapacity;
+			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+			
+			blockIndex.store(header, std::memory_order_release);
+			
+			nextBlockIndexCapacity <<= 1;
+			
+			return true;
+		}
+		
+	private:
+		size_t nextBlockIndexCapacity;
+		std::atomic<BlockIndexHeader*> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	public:
+		details::ThreadExitListener threadExitListener;
+	private:
+#endif
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ImplicitProducer* nextImplicitProducer;
+	private:
+#endif
+
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+		mutable debug::DebugMutex mutex;
+#endif
+#if MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Block pool manipulation
+	//////////////////////////////////
+	
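+	// Pre-allocates the initial block pool. These blocks are flagged as not dynamically allocated,
+	// so when a producer retires them it recycles them via the free list instead of destroying them.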
+	void populate_initial_block_list(size_t blockCount)
+	{
+		initialBlockPoolSize = blockCount;
+		if (initialBlockPoolSize == 0) {
+			initialBlockPool = nullptr;
+			return;
+		}
+		
+		initialBlockPool = create_array<Block>(blockCount);
+		if (initialBlockPool == nullptr) {
+			initialBlockPoolSize = 0;
+		}
+		for (size_t i = 0; i < initialBlockPoolSize; ++i) {
+			initialBlockPool[i].dynamicallyAllocated = false;
+		}
+	}
+	
+	inline Block* try_get_block_from_initial_pool()
+	{
+		if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
+			return nullptr;
+		}
+		
+		auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+		
+		return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;
+	}
+	
+	inline void add_block_to_free_list(Block* block)
+	{
+#if MCDBGQ_TRACKMEM
+		block->owner = nullptr;
+#endif
+		freeList.add(block);
+	}
+	
+	inline void add_blocks_to_free_list(Block* block)
+	{
+		while (block != nullptr) {
+			auto next = block->next;
+			add_block_to_free_list(block);
+			block = next;
+		}
+	}
+	
+	inline Block* try_get_block_from_free_list()
+	{
+		return freeList.try_get();
+	}
+	
+	// Gets a free block from one of the memory pools, or allocates a new one (if applicable)
+	template<AllocationMode canAlloc>
+	Block* requisition_block()
+	{
+		auto block = try_get_block_from_initial_pool();
+		if (block != nullptr) {
+			return block;
+		}
+		
+		block = try_get_block_from_free_list();
+		if (block != nullptr) {
+			return block;
+		}
+		
+		if (canAlloc == CanAlloc) {
+			return create<Block>();
+		}
+		
+		return nullptr;
+	}
+	
+
+#if MCDBGQ_TRACKMEM
+	public:
+		struct MemStats {
+			size_t allocatedBlocks;
+			size_t usedBlocks;
+			size_t freeBlocks;
+			size_t ownedBlocksExplicit;
+			size_t ownedBlocksImplicit;
+			size_t implicitProducers;
+			size_t explicitProducers;
+			size_t elementsEnqueued;
+			size_t blockClassBytes;
+			size_t queueClassBytes;
+			size_t implicitBlockIndexBytes;
+			size_t explicitBlockIndexBytes;
+			
+			friend class ConcurrentQueue;
+			
+		private:
+			static MemStats getFor(ConcurrentQueue* q)
+			{
+				MemStats stats = { 0 };
+				
+				stats.elementsEnqueued = q->size_approx();
+			
+				auto block = q->freeList.head_unsafe();
+				while (block != nullptr) {
+					++stats.allocatedBlocks;
+					++stats.freeBlocks;
+					block = block->freeListNext.load(std::memory_order_relaxed);
+				}
+				
+				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+					stats.implicitProducers += implicit ? 1 : 0;
+					stats.explicitProducers += implicit ? 0 : 1;
+					
+					if (implicit) {
+						auto prod = static_cast<ImplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ImplicitProducer);
+						auto head = prod->headIndex.load(std::memory_order_relaxed);
+						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+						if (hash != nullptr) {
+							for (size_t i = 0; i != hash->capacity; ++i) {
+								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+									++stats.allocatedBlocks;
+									++stats.ownedBlocksImplicit;
+								}
+							}
+							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+							for (; hash != nullptr; hash = hash->prev) {
+								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+							}
+						}
+						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {
+							//auto block = prod->get_block_index_entry_for_index(head);
+							++stats.usedBlocks;
+						}
+					}
+					else {
+						auto prod = static_cast<ExplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ExplicitProducer);
+						auto tailBlock = prod->tailBlock;
+						bool wasNonEmpty = false;
+						if (tailBlock != nullptr) {
+							auto block = tailBlock;
+							do {
+								++stats.allocatedBlocks;
+								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+									++stats.usedBlocks;
+									wasNonEmpty = wasNonEmpty || block != tailBlock;
+								}
+								++stats.ownedBlocksExplicit;
+								block = block->next;
+							} while (block != tailBlock);
+						}
+						auto index = prod->blockIndex.load(std::memory_order_relaxed);
+						while (index != nullptr) {
+							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
+							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+						}
+					}
+				}
+				
+				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+				stats.allocatedBlocks += freeOnInitialPool;
+				stats.freeBlocks += freeOnInitialPool;
+				
+				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+				stats.queueClassBytes += sizeof(ConcurrentQueue);
+				
+				return stats;
+			}
+		};
+		
+		// For debugging only. Not thread-safe.
+		MemStats getMemStats()
+		{
+			return MemStats::getFor(this);
+		}
+	private:
+		friend struct MemStats;
+#endif
+	
+	
+	//////////////////////////////////
+	// Producer list manipulation
+	//////////////////////////////////	
+	
+	ProducerBase* recycle_or_create_producer(bool isExplicit)
+	{
+		bool recycled;
+		return recycle_or_create_producer(isExplicit, recycled);
+	}
+	
+	ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled)
+	{
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// Try to re-use one first
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) {
+				bool expected = true;
+				if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// We caught one! It's been marked as activated, the caller can have it
+					recycled = true;
+					return ptr;
+				}
+			}
+		}
+		
+		recycled = false;
+		return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
+	}
+	
+	ProducerBase* add_producer(ProducerBase* producer)
+	{
+		// Handle failed memory allocation
+		if (producer == nullptr) {
+			return nullptr;
+		}
+		
+		producerCount.fetch_add(1, std::memory_order_relaxed);
+		
+		// Add it to the lock-free list
+		auto prevTail = producerListTail.load(std::memory_order_relaxed);
+		do {
+			producer->next = prevTail;
+		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		if (producer->isExplicit) {
+			auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
+			} while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+		else {
+			auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
+			} while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+#endif
+		
+		return producer;
+	}
+	
+	void reown_producers()
+	{
+		// After another instance is moved-into/swapped-with this one, all the
+		// producers we stole still think their parents are the other queue.
+		// So fix them up!
+		for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
+			ptr->parent = this;
+		}
+	}
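+	// Illustrative sketch (the local names a, b, v, ok are hypothetical; enqueue,
+	// try_dequeue and moodycamel::swap are the queue's public API): why the
+	// re-parenting above matters from a caller's point of view.
+#if 0
+	inline void swap_example()
+	{
+		moodycamel::ConcurrentQueue<int> a, b;
+		a.enqueue(1);            // lazily creates an implicit producer whose parent is `a`
+		moodycamel::swap(a, b);  // the producer is transferred to `b`; reown_producers() re-points its parent
+		int v = 0;
+		bool ok = b.try_dequeue(v);  // ok == true: the element followed its producer into `b`
+		(void)ok;
+	}
+#endif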
+	
+	
+	//////////////////////////////////
+	// Implicit producer hash
+	//////////////////////////////////
+	
+	struct ImplicitProducerKVP
+	{
+		std::atomic<details::thread_id_t> key;
+		ImplicitProducer* value;		// No need for atomicity since it's only read by the thread that sets it in the first place
+		
+		ImplicitProducerKVP() : value(nullptr) { }
+		
+		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+			value = other.value;
+		}
+		
+		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			swap(other);
+			return *this;
+		}
+		
+		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+		{
+			if (this != &other) {
+				details::swap_relaxed(key, other.key);
+				std::swap(value, other.value);
+			}
+		}
+	};
+	
+	template<typename XT, typename XTraits>
+	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+	
+	struct ImplicitProducerHash
+	{
+		size_t capacity;
+		ImplicitProducerKVP* entries;
+		ImplicitProducerHash* prev;
+	};
+	
+	inline void populate_initial_implicit_producer_hash()
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;
+		
+		implicitProducerHashCount.store(0, std::memory_order_relaxed);
+		auto hash = &initialImplicitProducerHash;
+		hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+		hash->entries = &initialImplicitProducerHashEntries[0];
+		for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+			initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+		}
+		hash->prev = nullptr;
+		implicitProducerHash.store(hash, std::memory_order_relaxed);
+	}
+	
+	void swap_implicit_producer_hashes(ConcurrentQueue& other)
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;
+		
+		// Swap (assumes our implicit producer hash is initialized)
+		initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+		initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+		other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+		
+		details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+		
+		details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+		if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+			implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;
+			for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &initialImplicitProducerHash;
+		}
+		if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+			other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;
+			for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &other.initialImplicitProducerHash;
+		}
+	}
+	
+	// Only fails (returns nullptr) if memory allocation fails
+	ImplicitProducer* get_or_add_implicit_producer()
+	{
+		// Note that since the data is essentially thread-local (key is thread ID),
+		// there's a reduced need for fences (memory ordering is already consistent
+		// for any individual thread), except for the current table itself.
+		
+		// Start by looking for the thread ID in the current and all previous hash tables.
+		// If it's not found, it must not be in there yet, since this same thread would
+		// have added it previously to one of the tables that we traversed.
+		
+		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+		
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		
+		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+			// Look for the id in this hash
+			auto index = hashedId;
+			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
+				index &= hash->capacity - 1;
+				
+				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					// Found it! If we had to search several hashes deep, though, we should lazily add it
+					// to the current main hash table to avoid the extended search next time.
+					// Note there's guaranteed to be room in the current hash table since every subsequent
+					// table implicitly reserves space for all previous tables (there's only one
+					// implicitProducerHashCount).
+					auto value = hash->entries[index].value;
+					if (hash != mainHash) {
+						index = hashedId;
+						while (true) {
+							index &= mainHash->capacity - 1;
+							probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+							auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+							auto reusable = details::invalid_thread_id2;
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed)) ||
+								(probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire))) {
+#else
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed))) {
+#endif
+								mainHash->entries[index].value = value;
+								break;
+							}
+							++index;
+						}
+					}
+					
+					return value;
+				}
+				if (probedKey == details::invalid_thread_id) {
+					break;		// Not in this hash table
+				}
+				++index;
+			}
+		}
+		
+		// Insert!
+		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+		while (true) {
+			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+				// We've acquired the resize lock, try to allocate a bigger hash table.
+				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+				// locked block).
+				mainHash = implicitProducerHash.load(std::memory_order_acquire);
+				if (newCount >= (mainHash->capacity >> 1)) {
+					auto newCapacity = mainHash->capacity << 1;
+					while (newCount >= (newCapacity >> 1)) {
+						newCapacity <<= 1;
+					}
+					auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
+					if (raw == nullptr) {
+						// Allocation failed
+						implicitProducerHashCount.fetch_add(-1, std::memory_order_relaxed);
+						implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+						return nullptr;
+					}
+					
+					auto newHash = new (raw) ImplicitProducerHash;
+					newHash->capacity = newCapacity;
+					newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
+					for (size_t i = 0; i != newCapacity; ++i) {
+						new (newHash->entries + i) ImplicitProducerKVP;
+						newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+					}
+					newHash->prev = mainHash;
+					implicitProducerHash.store(newHash, std::memory_order_release);
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+					mainHash = newHash;
+				}
+				else {
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+				}
+			}
+			
+			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
+			// always be true)
+			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+				bool recycled;
+				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false, recycled));
+				if (producer == nullptr) {
+					implicitProducerHashCount.fetch_add(-1, std::memory_order_relaxed);
+					return nullptr;
+				}
+				if (recycled) {
+					implicitProducerHashCount.fetch_add(-1, std::memory_order_relaxed);
+				}
+				
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+				producer->threadExitListener.userData = producer;
+				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+				
+				auto index = hashedId;
+				while (true) {
+					index &= mainHash->capacity - 1;
+					auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+					
+					auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+					auto reusable = details::invalid_thread_id2;
+					if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed)) ||
+						(probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire))) {
+#else
+					if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed))) {
+#endif
+						mainHash->entries[index].value = producer;
+						break;
+					}
+					++index;
+				}
+				return producer;
+			}
+			
+			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+			// we try to allocate ourselves).
+			mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		}
+	}
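+	// Illustrative sketch of the slot-claiming pattern used above, reduced to its core
+	// (Slot, find_or_claim and all parameter names are hypothetical; no resizing or
+	// recycling here): a slot belongs to whichever thread CASes its key from `empty`
+	// to its own id, and a probe that hits an `empty` key knows the id is absent.
+#if 0
+	struct Slot { std::atomic<std::uintptr_t> key; void* value; };
+
+	static void* find_or_claim(Slot* table, std::size_t capacity /* power of two */,
+	                           std::uintptr_t id, void* myValue)
+	{
+		std::size_t index = static_cast<std::size_t>(id);     // stand-in for hash_thread_id()
+		while (true) {
+			index &= capacity - 1;                             // wrap (capacity is a power of two)
+			auto probedKey = table[index].key.load(std::memory_order_relaxed);
+			if (probedKey == id) {
+				return table[index].value;                     // this thread already claimed a slot
+			}
+			std::uintptr_t empty = 0;                          // stand-in for invalid_thread_id
+			if (probedKey == empty &&
+				table[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed)) {
+				table[index].value = myValue;                  // slot claimed; record the value
+				return myValue;
+			}
+			++index;                                           // linear probing
+		}
+	}
+#endif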
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	void implicit_producer_thread_exited(ImplicitProducer* producer)
+	{
+		// Remove from thread exit listeners
+		details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener);
+		
+		// Remove from hash
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		auto hash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		details::thread_id_t probedKey;
+		
+		// We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+		// trying to add an entry thinking there's a free slot (because they reused a producer)
+		for (; hash != nullptr; hash = hash->prev) {
+			auto index = hashedId;
+			do {
+				index &= hash->capacity - 1;
+				probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release);
+					break;
+				}
+				++index;
+			} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+		}
+		
+		// Mark the queue as being recyclable
+		producer->inactive.store(true, std::memory_order_release);
+	}
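+	// Note: once `inactive` is set above, recycle_or_create_producer() can hand this
+	// producer back to the next thread that asks for an implicit producer (via the CAS
+	// on `inactive` there), so producer structures are reused rather than accumulating
+	// as threads come and go.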
+	
+	static void implicit_producer_thread_exited_callback(void* userData)
+	{
+		auto producer = static_cast<ImplicitProducer*>(userData);
+		auto queue = producer->parent;
+		queue->implicit_producer_thread_exited(producer);
+	}
+#endif
+	
+	//////////////////////////////////
+	// Utility functions
+	//////////////////////////////////
+	
+	template<typename U>
+	static inline U* create_array(size_t count)
+	{
+		assert(count > 0);
+		auto p = static_cast<U*>((Traits::malloc)(sizeof(U) * count));
+		if (p == nullptr) {
+			return nullptr;
+		}
+		
+		for (size_t i = 0; i != count; ++i) {
+			new (p + i) U();
+		}
+		return p;
+	}
+	
+	template<typename U>
+	static inline void destroy_array(U* p, size_t count)
+	{
+		if (p != nullptr) {
+			assert(count > 0);
+			for (size_t i = count; i != 0; ) {
+				(p + --i)->~U();
+			}
+			(Traits::free)(p);
+		}
+	}
+	
+	template<typename U>
+	static inline U* create()
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U : nullptr;
+	}
+	
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+	}
+	
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		(Traits::free)(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+	
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+	
+#if !MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+	
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+	
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+	
+#if MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+	
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
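+
+// Illustrative usage sketch (token_example, q, ptok, ctok and the enqueued value are
+// hypothetical; ConcurrentQueue, ProducerToken, ConsumerToken, enqueue and try_dequeue
+// are the public API whose token constructors are defined above).
+#if 0
+inline void token_example()
+{
+	moodycamel::ConcurrentQueue<int> q;
+	moodycamel::ProducerToken ptok(q);       // runs the ProducerToken constructor above
+	q.enqueue(ptok, 42);                     // token-based enqueue reuses ptok's producer
+	moodycamel::ConsumerToken ctok(q);       // runs the ConsumerToken constructor above
+	int item = 0;
+	bool found = q.try_dequeue(ctok, item);  // token-based dequeue; found == true, item == 42
+	(void)found;
+}
+#endif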
diff --git a/include/sparsepp.h b/include/sparsepp.h
new file mode 100644
index 0000000..192b565
--- /dev/null
+++ b/include/sparsepp.h
@@ -0,0 +1,5622 @@
+#if !defined(sparsepp_h_guard_)
+#define sparsepp_h_guard_
+
+
+// ----------------------------------------------------------------------
+// Copyright (c) 2016, Gregory Popovitch - greg7mdp at gmail.com
+// All rights reserved.
+// 
+// This work is derived from Google's sparsehash library
+//
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ----------------------------------------------------------------------
+
+ 
+// ---------------------------------------------------------------------------
+// Compiler detection code (SPP_ preprocessor macros) derived from Boost
+// libraries; the Boost Software License is therefore reproduced below.
+// ---------------------------------------------------------------------------
+// Boost Software License - Version 1.0 - August 17th, 2003
+// 
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+// 
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+// ---------------------------------------------------------------------------
+
+
+// some macros for portability
+// ---------------------------
+#define spp_ spp
+#define SPP_NAMESPACE spp_
+#define SPP_START_NAMESPACE   namespace spp {
+#define SPP_END_NAMESPACE     }
+#define SPP_GROUP_SIZE 32     // must be 32 or 64
+#define SPP_ALLOC_SZ 0        // must be power of 2 (0 = aggressive alloc, 1 = smallest memory usage, 2 = good compromise)
+#define SPP_STORE_NUM_ITEMS 1 // little bit more memory, but faster!!
+
+#if (SPP_GROUP_SIZE == 32)
+    #define SPP_SHIFT_ 5
+    #define SPP_MASK_  0x1F    
+#elif (SPP_GROUP_SIZE == 64)
+    #define SPP_SHIFT_ 6
+    #define SPP_MASK_  0x3F
+#else
+    #error "SPP_GROUP_SIZE must be either 32 or 64"
+#endif
+
+// Boost like configuration
+// ------------------------
+#if defined __clang__ 
+
+    #include <cpuid.h>
+    inline void spp_cpuid(int info[4], int InfoType) {
+        __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
+    }
+
+    #define SPP_POPCNT   __builtin_popcount
+    #define SPP_POPCNT64 __builtin_popcountll
+    
+    #define SPP_HAS_CSTDINT
+
+    #ifndef __has_extension
+        #define __has_extension __has_feature
+    #endif
+
+    #if !__has_feature(cxx_exceptions) && !defined(SPP_NO_EXCEPTIONS)
+        #define SPP_NO_EXCEPTIONS
+    #endif
+
+    #if !__has_feature(cxx_rtti) && !defined(SPP_NO_RTTI)
+      #define SPP_NO_RTTI
+    #endif
+
+    #if !__has_feature(cxx_rtti) && !defined(SPP_NO_TYPEID)
+        #define SPP_NO_TYPEID
+    #endif
+
+    #if defined(__int64) && !defined(__GNUC__)
+        #define SPP_HAS_MS_INT64
+    #endif
+
+    #define SPP_HAS_NRVO
+
+    // Branch prediction hints
+    #if defined(__has_builtin)
+        #if __has_builtin(__builtin_expect)
+             #define SPP_LIKELY(x) __builtin_expect(x, 1)
+             #define SPP_UNLIKELY(x) __builtin_expect(x, 0)
+        #endif
+    #endif
+
+    // Clang supports "long long" in all compilation modes.
+    #define SPP_HAS_LONG_LONG
+
+    #if !__has_feature(cxx_constexpr)
+        #define SPP_NO_CXX11_CONSTEXPR
+    #endif
+
+    #if !__has_feature(cxx_decltype)
+        #define SPP_NO_CXX11_DECLTYPE
+    #endif
+
+    #if !__has_feature(cxx_decltype_incomplete_return_types)
+        #define SPP_NO_CXX11_DECLTYPE_N3276
+    #endif
+
+    #if !__has_feature(cxx_defaulted_functions)
+        #define SPP_NO_CXX11_DEFAULTED_FUNCTIONS
+    #endif
+
+    #if !__has_feature(cxx_deleted_functions)
+        #define SPP_NO_CXX11_DELETED_FUNCTIONS
+    #endif
+
+    #if !__has_feature(cxx_explicit_conversions)
+        #define SPP_NO_CXX11_EXPLICIT_CONVERSION_OPERATORS
+    #endif
+
+    #if !__has_feature(cxx_default_function_template_args)
+        #define SPP_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS
+    #endif
+
+    #if !__has_feature(cxx_generalized_initializers)
+        #define SPP_NO_CXX11_HDR_INITIALIZER_LIST
+    #endif
+
+    #if !__has_feature(cxx_lambdas)
+        #define SPP_NO_CXX11_LAMBDAS
+    #endif
+
+    #if !__has_feature(cxx_local_type_template_args)
+        #define SPP_NO_CXX11_LOCAL_CLASS_TEMPLATE_PARAMETERS
+    #endif
+
+    #if !__has_feature(cxx_nullptr)
+        #define SPP_NO_CXX11_NULLPTR
+    #endif
+
+    #if !__has_feature(cxx_range_for)
+        #define SPP_NO_CXX11_RANGE_BASED_FOR
+    #endif
+
+    #if !__has_feature(cxx_raw_string_literals)
+        #define SPP_NO_CXX11_RAW_LITERALS
+    #endif
+
+    #if !__has_feature(cxx_reference_qualified_functions)
+        #define SPP_NO_CXX11_REF_QUALIFIERS
+    #endif
+
+    #if !__has_feature(cxx_generalized_initializers)
+        #define SPP_NO_CXX11_UNIFIED_INITIALIZATION_SYNTAX
+    #endif
+
+    #if !__has_feature(cxx_rvalue_references)
+        #define SPP_NO_CXX11_RVALUE_REFERENCES
+    #endif
+
+    #if !__has_feature(cxx_strong_enums)
+        #define SPP_NO_CXX11_SCOPED_ENUMS
+    #endif
+
+    #if !__has_feature(cxx_static_assert)
+        #define SPP_NO_CXX11_STATIC_ASSERT
+    #endif
+
+    #if !__has_feature(cxx_alias_templates)
+        #define SPP_NO_CXX11_TEMPLATE_ALIASES
+    #endif
+
+    #if !__has_feature(cxx_unicode_literals)
+        #define SPP_NO_CXX11_UNICODE_LITERALS
+    #endif
+
+    #if !__has_feature(cxx_variadic_templates)
+        #define SPP_NO_CXX11_VARIADIC_TEMPLATES
+    #endif
+
+    #if !__has_feature(cxx_user_literals)
+        #define SPP_NO_CXX11_USER_DEFINED_LITERALS
+    #endif
+
+    #if !__has_feature(cxx_alignas)
+        #define SPP_NO_CXX11_ALIGNAS
+    #endif
+
+    #if !__has_feature(cxx_trailing_return)
+        #define SPP_NO_CXX11_TRAILING_RESULT_TYPES
+    #endif
+
+    #if !__has_feature(cxx_inline_namespaces)
+        #define SPP_NO_CXX11_INLINE_NAMESPACES
+    #endif
+
+    #if !__has_feature(cxx_override_control)
+        #define SPP_NO_CXX11_FINAL
+    #endif
+
+    #if !(__has_feature(__cxx_binary_literals__) || __has_extension(__cxx_binary_literals__))
+        #define SPP_NO_CXX14_BINARY_LITERALS
+    #endif
+
+    #if !__has_feature(__cxx_decltype_auto__)
+        #define SPP_NO_CXX14_DECLTYPE_AUTO
+    #endif
+
+    #if !__has_feature(__cxx_aggregate_nsdmi__)
+        #define SPP_NO_CXX14_AGGREGATE_NSDMI
+    #endif
+
+    #if !__has_feature(__cxx_init_captures__)
+        #define SPP_NO_CXX14_INITIALIZED_LAMBDA_CAPTURES
+    #endif
+
+    #if !__has_feature(__cxx_generic_lambdas__)
+        #define SPP_NO_CXX14_GENERIC_LAMBDAS
+    #endif
+
+
+    #if !__has_feature(__cxx_generic_lambdas__) || !__has_feature(__cxx_relaxed_constexpr__)
+        #define SPP_NO_CXX14_CONSTEXPR
+    #endif
+
+    #if !__has_feature(__cxx_return_type_deduction__)
+        #define SPP_NO_CXX14_RETURN_TYPE_DEDUCTION
+    #endif
+
+    #if !__has_feature(__cxx_variable_templates__)
+        #define SPP_NO_CXX14_VARIABLE_TEMPLATES
+    #endif
+
+    #if __cplusplus < 201400
+        #define SPP_NO_CXX14_DIGIT_SEPARATORS
+    #endif
+
+    #if defined(__has_builtin) && __has_builtin(__builtin_unreachable)
+      #define SPP_UNREACHABLE_RETURN(x) __builtin_unreachable();
+    #endif
+
+    #define SPP_ATTRIBUTE_UNUSED __attribute__((__unused__))
+
+    #ifndef SPP_COMPILER
+        #define SPP_COMPILER "Clang version " __clang_version__
+    #endif
+
+    #define SPP_CLANG 1
+
+
+#elif defined __GNUC__
+
+    #define SPP_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+    //  definition to expand macro then apply to pragma message
+    // #define VALUE_TO_STRING(x) #x
+    // #define VALUE(x) VALUE_TO_STRING(x)
+    // #define VAR_NAME_VALUE(var) #var "="  VALUE(var)
+    // #pragma message(VAR_NAME_VALUE(SPP_GCC_VERSION))
+
+    #include <cpuid.h>
+    inline void spp_cpuid(int info[4], int InfoType) {
+        __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
+    }
+
+    // __POPCNT__ is defined when the code is compiled with popcount support
+    // (e.g. when the -mpopcnt compiler option is given)
+    #ifdef __POPCNT__
+        // slower unless compiled with -mpopcnt
+        #define SPP_POPCNT   __builtin_popcount
+        #define SPP_POPCNT64 __builtin_popcountll
+    #endif
+
+    #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (__cplusplus >= 201103L)
+        #define SPP_GCC_CXX11
+    #endif
+
+    #if __GNUC__ == 3
+        #if defined (__PATHSCALE__)
+             #define SPP_NO_TWO_PHASE_NAME_LOOKUP
+             #define SPP_NO_IS_ABSTRACT
+        #endif
+
+        #if __GNUC_MINOR__ < 4
+             #define SPP_NO_IS_ABSTRACT
+        #endif
+
+        #define SPP_NO_CXX11_EXTERN_TEMPLATE
+    #endif
+
+    #if __GNUC__ < 4
+    //
+    // All problems with gcc-3.x and earlier are handled here:
+    //
+    #define SPP_NO_TWO_PHASE_NAME_LOOKUP
+        #ifdef __OPEN64__
+            #define SPP_NO_IS_ABSTRACT
+        #endif
+    #endif
+
+    // GCC prior to 3.4 had #pragma once too, but it didn't work well with filesystem links
+    #if SPP_GCC_VERSION >= 30400
+        #define SPP_HAS_PRAGMA_ONCE
+    #endif
+
+    #if SPP_GCC_VERSION < 40400
+        // Previous versions of GCC did not completely implement value-initialization:
+        // GCC Bug 30111, "Value-initialization of POD base class doesn't initialize
+        // members", reported by Jonathan Wakely in 2006,
+        // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=30111 (fixed for GCC 4.4)
+        // GCC Bug 33916, "Default constructor fails to initialize array members",
+        // reported by Michael Elizabeth Chastain in 2007,
+        // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33916 (fixed for GCC 4.2.4)
+        // See also: http://www.boost.org/libs/utility/value_init.htm#compiler_issues
+        #define SPP_NO_COMPLETE_VALUE_INITIALIZATION
+    #endif
+
+    #if !defined(__EXCEPTIONS) && !defined(SPP_NO_EXCEPTIONS)
+        #define SPP_NO_EXCEPTIONS
+    #endif
+
+    //
+    // Threading support: Turn this on unconditionally here (except for
+    // those platforms where we can know for sure). It will get turned off again
+    // later if no threading API is detected.
+    //
+    #if !defined(__MINGW32__) && !defined(linux) && !defined(__linux) && !defined(__linux__)
+        #define SPP_HAS_THREADS
+    #endif
+
+    //
+    // gcc has "long long"
+    // Except on Darwin with standard compliance enabled (-pedantic)
+    // Apple gcc helpfully defines this macro we can query
+    //
+    #if !defined(__DARWIN_NO_LONG_LONG)
+        #define SPP_HAS_LONG_LONG
+    #endif
+
+    //
+    // gcc implements the named return value optimization since version 3.1
+    //
+    #define SPP_HAS_NRVO
+
+    // Branch prediction hints
+    #define SPP_LIKELY(x) __builtin_expect(x, 1)
+    #define SPP_UNLIKELY(x) __builtin_expect(x, 0)
+
+    //
+    // Dynamic shared object (DSO) and dynamic-link library (DLL) support
+    //
+    #if __GNUC__ >= 4
+       #if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32)) && !defined(__CYGWIN__)
+            // All Win32 development environments, including 64-bit Windows and MinGW, define
+            // _WIN32 or one of its variant spellings. Note that Cygwin is a POSIX environment,
+            // so does not define _WIN32 or its variants.
+            #define SPP_HAS_DECLSPEC
+            #define SPP_SYMBOL_EXPORT __attribute__((__dllexport__))
+            #define SPP_SYMBOL_IMPORT __attribute__((__dllimport__))
+       #else
+            #define SPP_SYMBOL_EXPORT __attribute__((__visibility__("default")))
+            #define SPP_SYMBOL_IMPORT
+       #endif
+
+       #define SPP_SYMBOL_VISIBLE __attribute__((__visibility__("default")))
+    #else
+       // config/platform/win32.hpp will define SPP_SYMBOL_EXPORT, etc., unless already defined
+       #define SPP_SYMBOL_EXPORT
+    #endif
+
+    //
+    // RTTI and typeinfo detection is possible post gcc-4.3:
+    //
+    #if SPP_GCC_VERSION > 40300
+        #ifndef __GXX_RTTI
+            #ifndef SPP_NO_TYPEID
+                #define SPP_NO_TYPEID
+            #endif
+            #ifndef SPP_NO_RTTI
+                #define SPP_NO_RTTI
+            #endif
+        #endif
+    #endif
+
+    //
+    // Recent GCC versions have __int128 when in 64-bit mode.
+    //
+    // We disable this if the compiler is really nvcc with C++03 as it
+    // doesn't actually support __int128 as of CUDA_VERSION=7500
+    // even though it defines __SIZEOF_INT128__.
+    // See https://svn.boost.org/trac/boost/ticket/8048
+    //     https://svn.boost.org/trac/boost/ticket/11852
+    // Only re-enable this for nvcc if you're absolutely sure
+    // of the circumstances under which it's supported:
+    //
+    #if defined(__CUDACC__)
+        #if defined(SPP_GCC_CXX11)
+            #define SPP_NVCC_CXX11
+        #else
+            #define SPP_NVCC_CXX03
+        #endif
+    #endif
+
+    #if defined(__SIZEOF_INT128__) && !defined(SPP_NVCC_CXX03)
+        #define SPP_HAS_INT128
+    #endif
+    //
+    // Recent GCC versions have a __float128 native type, we need to
+    // include a std lib header to detect this - not ideal, but we'll
+    // be including <cstddef> later anyway when we select the std lib.
+    //
+    // Nevertheless, as of CUDA 7.5, using __float128 with the host
+    // compiler in pre-C++11 mode is still not supported.
+    // See https://svn.boost.org/trac/boost/ticket/11852
+    //
+    #ifdef __cplusplus
+        #include <cstddef>
+    #else
+        #include <stddef.h>
+    #endif
+
+    #if defined(_GLIBCXX_USE_FLOAT128) && !defined(__STRICT_ANSI__) && !defined(SPP_NVCC_CXX03)
+         #define SPP_HAS_FLOAT128
+    #endif
+
+    // C++0x features in 4.3.n and later
+    //
+    #if (SPP_GCC_VERSION >= 40300) && defined(SPP_GCC_CXX11)
+       // C++0x features are only enabled when -std=c++0x or -std=gnu++0x are
+       // passed on the command line, which in turn defines
+       // __GXX_EXPERIMENTAL_CXX0X__.
+       #define SPP_HAS_DECLTYPE
+       #define SPP_HAS_RVALUE_REFS
+       #define SPP_HAS_STATIC_ASSERT
+       #define SPP_HAS_VARIADIC_TMPL
+       #define SPP_HAS_CSTDINT
+    #else
+       #define SPP_NO_CXX11_DECLTYPE
+       #define SPP_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS
+       #define SPP_NO_CXX11_RVALUE_REFERENCES
+       #define SPP_NO_CXX11_STATIC_ASSERT
+    #endif
+
+    // C++0x features in 4.4.n and later
+    //
+    #if (SPP_GCC_VERSION < 40400) || !defined(SPP_GCC_CXX11)
+       #define SPP_NO_CXX11_AUTO_DECLARATIONS
+       #define SPP_NO_CXX11_AUTO_MULTIDECLARATIONS
+       #define SPP_NO_CXX11_CHAR16_T
+       #define SPP_NO_CXX11_CHAR32_T
+       #define SPP_NO_CXX11_HDR_INITIALIZER_LIST
+       #define SPP_NO_CXX11_DEFAULTED_FUNCTIONS
+       #define SPP_NO_CXX11_DELETED_FUNCTIONS
+       #define SPP_NO_CXX11_TRAILING_RESULT_TYPES
+       #define SPP_NO_CXX11_INLINE_NAMESPACES
+       #define SPP_NO_CXX11_VARIADIC_TEMPLATES
+    #endif
+
+    #if SPP_GCC_VERSION < 40500
+       #define SPP_NO_SFINAE_EXPR
+    #endif
+
+    // GCC 4.5 forbids declaration of defaulted functions in private or protected sections
+    #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ == 5) || !defined(SPP_GCC_CXX11)
+       #define SPP_NO_CXX11_NON_PUBLIC_DEFAULTED_FUNCTIONS
+    #endif
+
+    // C++0x features in 4.5.0 and later
+    //
+    #if (SPP_GCC_VERSION < 40500) || !defined(SPP_GCC_CXX11)
+       #define SPP_NO_CXX11_EXPLICIT_CONVERSION_OPERATORS
+       #define SPP_NO_CXX11_LAMBDAS
+       #define SPP_NO_CXX11_LOCAL_CLASS_TEMPLATE_PARAMETERS
+       #define SPP_NO_CXX11_RAW_LITERALS
+       #define SPP_NO_CXX11_UNICODE_LITERALS
+    #endif
+
+    // C++0x features in 4.5.1 and later
+    //
+    #if (SPP_GCC_VERSION < 40501) || !defined(SPP_GCC_CXX11)
+       // scoped enums have a serious bug in 4.4.0, so define SPP_NO_CXX11_SCOPED_ENUMS before 4.5.1
+       // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38064
+       #define SPP_NO_CXX11_SCOPED_ENUMS
+    #endif
+
+    // C++0x features in 4.6.n and later
+    //
+    #if (SPP_GCC_VERSION < 40600) || !defined(SPP_GCC_CXX11)
+        #define SPP_NO_CXX11_CONSTEXPR
+        #define SPP_NO_CXX11_NULLPTR
+        #define SPP_NO_CXX11_RANGE_BASED_FOR
+        #define SPP_NO_CXX11_UNIFIED_INITIALIZATION_SYNTAX
+    #endif
+
+    // C++0x features in 4.7.n and later
+    //
+    #if (SPP_GCC_VERSION < 40700) || !defined(SPP_GCC_CXX11)
+        #define SPP_NO_CXX11_FINAL
+        #define SPP_NO_CXX11_TEMPLATE_ALIASES
+        #define SPP_NO_CXX11_USER_DEFINED_LITERALS
+        #define SPP_NO_CXX11_FIXED_LENGTH_VARIADIC_TEMPLATE_EXPANSION_PACKS
+    #endif
+
+    // C++0x features in 4.8.n and later
+    //
+    #if (SPP_GCC_VERSION < 40800) || !defined(SPP_GCC_CXX11)
+        #define SPP_NO_CXX11_ALIGNAS
+    #endif
+
+    // C++0x features in 4.8.1 and later
+    //
+    #if (SPP_GCC_VERSION < 40801) || !defined(SPP_GCC_CXX11)
+        #define SPP_NO_CXX11_DECLTYPE_N3276
+        #define SPP_NO_CXX11_REF_QUALIFIERS
+        #define SPP_NO_CXX14_BINARY_LITERALS
+    #endif
+
+    // C++14 features in 4.9.0 and later
+    //
+    #if (SPP_GCC_VERSION < 40900) || (__cplusplus < 201300)
+        #define SPP_NO_CXX14_RETURN_TYPE_DEDUCTION
+        #define SPP_NO_CXX14_GENERIC_LAMBDAS
+        #define SPP_NO_CXX14_DIGIT_SEPARATORS
+        #define SPP_NO_CXX14_DECLTYPE_AUTO
+        #if !((SPP_GCC_VERSION >= 40801) && (SPP_GCC_VERSION < 40900) && defined(SPP_GCC_CXX11))
+            #define SPP_NO_CXX14_INITIALIZED_LAMBDA_CAPTURES
+        #endif
+    #endif
+
+
+    // C++ 14:
+    #if !defined(__cpp_aggregate_nsdmi) || (__cpp_aggregate_nsdmi < 201304)
+        #define SPP_NO_CXX14_AGGREGATE_NSDMI
+    #endif
+    #if !defined(__cpp_constexpr) || (__cpp_constexpr < 201304)
+        #define SPP_NO_CXX14_CONSTEXPR
+    #endif
+    #if !defined(__cpp_variable_templates) || (__cpp_variable_templates < 201304)
+        #define SPP_NO_CXX14_VARIABLE_TEMPLATES
+    #endif
+
+    //
+    // Unused attribute:
+    #if __GNUC__ >= 4
+        #define SPP_ATTRIBUTE_UNUSED __attribute__((__unused__))
+    #endif
+    //
+    // __builtin_unreachable:
+    #if SPP_GCC_VERSION >= 40800
+        #define SPP_UNREACHABLE_RETURN(x) __builtin_unreachable();
+    #endif
+
+    #ifndef SPP_COMPILER
+        #define SPP_COMPILER "GNU C++ version " __VERSION__
+    #endif
+
+    // ConceptGCC compiler:
+    //   http://www.generic-programming.org/software/ConceptGCC/
+    #ifdef __GXX_CONCEPTS__
+        #define SPP_HAS_CONCEPTS
+        #define SPP_COMPILER "ConceptGCC version " __VERSION__
+    #endif
+
+
+#elif defined _MSC_VER
+
+    #include <intrin.h>                     // for __popcnt()
+
+    #define SPP_POPCNT_CHECK  // slower when defined, but we have to check!
+    #define spp_cpuid(info, x)    __cpuid(info, x)
+
+    #define SPP_POPCNT __popcnt
+    #if (SPP_GROUP_SIZE == 64 && INTPTR_MAX == INT64_MAX)
+        #define SPP_POPCNT64 __popcnt64
+    #endif
+
+    // Attempt to suppress VC6 warnings about the length of decorated names (obsolete):
+    #pragma warning( disable : 4503 ) // warning: decorated name length exceeded
+
+    #define SPP_HAS_PRAGMA_ONCE
+    #define SPP_HAS_CSTDINT
+
+   //
+    // versions check:
+    // we don't support Visual C++ prior to version 7.1:
+    #if _MSC_VER < 1310
+        #error "Antique compiler not supported"
+    #endif
+
+    #if _MSC_FULL_VER < 180020827
+        #define SPP_NO_FENV_H
+    #endif
+
+    #if _MSC_VER < 1400
+        // although a conforming signature for swprintf exists in VC7.1,
+        // it appears not to actually work:
+        #define SPP_NO_SWPRINTF
+
+        // Our extern template tests also fail for this compiler:
+        #define SPP_NO_CXX11_EXTERN_TEMPLATE
+
+        // Variadic macros do not exist for VC7.1 and lower
+        #define SPP_NO_CXX11_VARIADIC_MACROS
+    #endif
+
+    #if _MSC_VER < 1500  // 140X == VC++ 8.0
+        #undef SPP_HAS_CSTDINT
+        #define SPP_NO_MEMBER_TEMPLATE_FRIENDS
+    #endif
+
+    #if _MSC_VER < 1600  // 150X == VC++ 9.0
+        // A bug in VC9:
+        #define SPP_NO_ADL_BARRIER
+    #endif
+
+
+    // MSVC (including the latest checked version) has not yet completely
+    // implemented value-initialization, as is reported:
+    // "VC++ does not value-initialize members of derived classes without
+    // user-declared constructor", reported in 2009 by Sylvester Hesp:
+    // https://connect.microsoft.com/VisualStudio/feedback/details/484295
+    // "Presence of copy constructor breaks member class initialization",
+    // reported in 2009 by Alex Vakulenko:
+    // https://connect.microsoft.com/VisualStudio/feedback/details/499606
+    // "Value-initialization in new-expression", reported in 2005 by
+    // Pavel Kuznetsov (MetaCommunications Engineering):
+    // https://connect.microsoft.com/VisualStudio/feedback/details/100744
+    // See also: http://www.boost.org/libs/utility/value_init.htm#compiler_issues
+    // (Niels Dekker, LKEB, May 2010)
+    #define SPP_NO_COMPLETE_VALUE_INITIALIZATION
+
+    #ifndef _NATIVE_WCHAR_T_DEFINED
+        #define SPP_NO_INTRINSIC_WCHAR_T
+    #endif
+
+    //
+    // check for exception handling support:
+    #if !defined(_CPPUNWIND) && !defined(SPP_NO_EXCEPTIONS)
+        #define SPP_NO_EXCEPTIONS
+    #endif
+
+    //
+    // __int64 support:
+    //
+    #define SPP_HAS_MS_INT64
+    #if defined(_MSC_EXTENSIONS) || (_MSC_VER >= 1400)
+        #define SPP_HAS_LONG_LONG
+    #else
+        #define SPP_NO_LONG_LONG
+    #endif
+
+    #if (_MSC_VER >= 1400) && !defined(_DEBUG)
+        #define SPP_HAS_NRVO
+    #endif
+
+    #if _MSC_VER >= 1500  // 150X == VC++ 9.0
+        #define SPP_HAS_PRAGMA_DETECT_MISMATCH
+    #endif
+
+    //
+    // disable Win32 API's if compiler extensions are
+    // turned off:
+    //
+    #if !defined(_MSC_EXTENSIONS) && !defined(SPP_DISABLE_WIN32)
+        #define SPP_DISABLE_WIN32
+    #endif
+
+    #if !defined(_CPPRTTI) && !defined(SPP_NO_RTTI)
+        #define SPP_NO_RTTI
+    #endif
+
+    //
+    // TR1 features:
+    //
+    #if _MSC_VER >= 1700
+        //      #define SPP_HAS_TR1_HASH	// don't know if this is true yet.
+        //      #define SPP_HAS_TR1_TYPE_TRAITS	// don't know if this is true yet.
+        #define SPP_HAS_TR1_UNORDERED_MAP
+        #define SPP_HAS_TR1_UNORDERED_SET
+    #endif
+
+    //
+    // C++0x features
+    //
+    //   See above for SPP_NO_LONG_LONG
+
+    // C++ features supported by VC++ 10 (aka 2010)
+    //
+    #if _MSC_VER < 1600
+        #define SPP_NO_CXX11_AUTO_DECLARATIONS
+        #define SPP_NO_CXX11_AUTO_MULTIDECLARATIONS
+        #define SPP_NO_CXX11_LAMBDAS
+        #define SPP_NO_CXX11_RVALUE_REFERENCES
+        #define SPP_NO_CXX11_STATIC_ASSERT
+        #define SPP_NO_CXX11_NULLPTR
+        #define SPP_NO_CXX11_DECLTYPE
+    #endif // _MSC_VER < 1600
+
+    #if _MSC_VER >= 1600
+        #define SPP_HAS_STDINT_H
+    #endif
+
+    // C++11 features supported by VC++ 11 (aka 2012)
+    //
+    #if _MSC_VER < 1700
+        #define SPP_NO_CXX11_FINAL
+        #define SPP_NO_CXX11_RANGE_BASED_FOR
+        #define SPP_NO_CXX11_SCOPED_ENUMS
+    #endif // _MSC_VER < 1700
+
+    // C++11 features supported by VC++ 12 (aka 2013).
+    //
+    #if _MSC_FULL_VER < 180020827
+        #define SPP_NO_CXX11_DEFAULTED_FUNCTIONS
+        #define SPP_NO_CXX11_DELETED_FUNCTIONS
+        #define SPP_NO_CXX11_EXPLICIT_CONVERSION_OPERATORS
+        #define SPP_NO_CXX11_FUNCTION_TEMPLATE_DEFAULT_ARGS
+        #define SPP_NO_CXX11_RAW_LITERALS
+        #define SPP_NO_CXX11_TEMPLATE_ALIASES
+        #define SPP_NO_CXX11_TRAILING_RESULT_TYPES
+        #define SPP_NO_CXX11_VARIADIC_TEMPLATES
+        #define SPP_NO_CXX11_UNIFIED_INITIALIZATION_SYNTAX
+        #define SPP_NO_CXX11_DECLTYPE_N3276
+    #endif
+
+    // C++11 features supported by VC++ 14 (aka 2014) CTP1
+    #if (_MSC_FULL_VER < 190021730)
+        #define SPP_NO_CXX11_REF_QUALIFIERS
+        #define SPP_NO_CXX11_USER_DEFINED_LITERALS
+        #define SPP_NO_CXX11_ALIGNAS
+        #define SPP_NO_CXX11_INLINE_NAMESPACES
+        #define SPP_NO_CXX14_DECLTYPE_AUTO
+        #define SPP_NO_CXX14_INITIALIZED_LAMBDA_CAPTURES
+        #define SPP_NO_CXX14_RETURN_TYPE_DEDUCTION
+        #define SPP_NO_CXX11_HDR_INITIALIZER_LIST
+    #endif
+
+    // C++11 features not supported by any versions
+    #define SPP_NO_CXX11_CHAR16_T
+    #define SPP_NO_CXX11_CHAR32_T
+    #define SPP_NO_CXX11_CONSTEXPR
+    #define SPP_NO_CXX11_UNICODE_LITERALS
+    #define SPP_NO_SFINAE_EXPR
+    #define SPP_NO_TWO_PHASE_NAME_LOOKUP
+
+    // C++ 14:
+    #if !defined(__cpp_aggregate_nsdmi) || (__cpp_aggregate_nsdmi < 201304)
+        #define SPP_NO_CXX14_AGGREGATE_NSDMI
+    #endif
+
+    #if !defined(__cpp_binary_literals) || (__cpp_binary_literals < 201304)
+        #define SPP_NO_CXX14_BINARY_LITERALS
+    #endif
+
+    #if !defined(__cpp_constexpr) || (__cpp_constexpr < 201304)
+        #define SPP_NO_CXX14_CONSTEXPR
+    #endif
+
+    #if (__cplusplus < 201304) // There's no SD6 check for this....
+        #define SPP_NO_CXX14_DIGIT_SEPARATORS
+    #endif
+
+    #if !defined(__cpp_generic_lambdas) || (__cpp_generic_lambdas < 201304)
+        #define SPP_NO_CXX14_GENERIC_LAMBDAS
+    #endif
+
+    #if !defined(__cpp_variable_templates) || (__cpp_variable_templates < 201304)
+         #define SPP_NO_CXX14_VARIABLE_TEMPLATES
+    #endif
+
+#endif
+
+// from boost/config/suffix.hpp
+// ----------------------------
+#ifndef SPP_ATTRIBUTE_UNUSED
+    #define SPP_ATTRIBUTE_UNUSED
+#endif
+
+// includes
+// --------
+#if defined(SPP_HAS_CSTDINT) && (__cplusplus >= 201103)
+    #include <cstdint>
+#else
+    #if defined(__FreeBSD__) || defined(__IBMCPP__) || defined(_AIX)
+        #include <inttypes.h>
+    #else
+        #include <stdint.h>
+    #endif
+#endif
+
+#include <cassert>
+#include <cstring>
+#include <string>
+#include <limits>                           // for numeric_limits
+#include <algorithm>                        // For swap(), eg
+#include <iterator>                         // for iterator tags
+#include <functional>                       // for equal_to<>, select1st<>, std::unary_function, etc
+#include <memory>                           // for alloc, uninitialized_copy, uninitialized_fill
+#include <cstdlib>                          // for malloc/realloc/free
+#include <cstddef>                          // for ptrdiff_t
+#include <new>                              // for placement new
+#include <stdexcept>                        // For length_error
+#include <utility>                          // for pair<>
+#include <cstdio>
+#include <iosfwd>
+#include <ios>
+
+#if !defined(SPP_NO_CXX11_HDR_INITIALIZER_LIST)
+    #include <initializer_list>
+#endif
+
+#if (SPP_GROUP_SIZE == 32)
+    typedef uint32_t group_bm_type;
+#else
+    typedef uint64_t group_bm_type;
+#endif
+
+template<int S, int H> class HashObject; // for Google's benchmark, not in spp namespace!
+
+//  ----------------------------------------------------------------------
+//                  H A S H    F U N C T I O N S
+//                  ----------------------------
+//
+//    Implements spp::spp_hash() and spp::hash_combine()
+//
+//    This is exactly the content of spp_utils.h, except for the copyright 
+//    attributions at the beginning
+//
+//    WARNING: Any change here has to be duplicated in spp_utils.h.
+//  ----------------------------------------------------------------------
+
+#if !defined(spp_utils_h_guard_)
+#define spp_utils_h_guard_
+
+#if defined(_MSC_VER) 
+    #if (_MSC_VER >= 1600 )                      // vs2010 (1900 is vs2015)
+        #include <functional>
+        #define SPP_HASH_CLASS std::hash
+    #else
+        #include  <hash_map>
+        #define SPP_HASH_CLASS stdext::hash_compare
+    #endif
+    #if (_MSC_FULL_VER < 190021730)
+        #define SPP_NO_CXX11_NOEXCEPT
+    #endif
+#elif defined(__GNUC__)
+    #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (__cplusplus >= 201103L)
+        #include <functional>
+        #define SPP_HASH_CLASS std::hash
+
+        #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) < 40600
+            #define SPP_NO_CXX11_NOEXCEPT
+        #endif
+    #else
+        #include <tr1/unordered_map>
+        #define SPP_HASH_CLASS std::tr1::hash
+        #define SPP_NO_CXX11_NOEXCEPT
+    #endif
+#elif defined __clang__
+    #include <functional>
+    #define SPP_HASH_CLASS  std::hash
+
+    #if !__has_feature(cxx_noexcept)
+        #define SPP_NO_CXX11_NOEXCEPT
+    #endif
+#else
+    #include <functional>
+    #define SPP_HASH_CLASS  std::hash
+#endif
+
+#ifdef SPP_NO_CXX11_NOEXCEPT
+    #define SPP_NOEXCEPT
+#else
+    #define SPP_NOEXCEPT noexcept
+#endif
+
+#define SPP_INLINE
+
+#ifndef SPP_NAMESPACE
+    #define SPP_NAMESPACE spp
+#endif
+
+namespace SPP_NAMESPACE
+{
+
+template <class T>
+struct spp_hash
+{
+    SPP_INLINE size_t operator()(const T &__v) const SPP_NOEXCEPT 
+    {
+        SPP_HASH_CLASS<T> hasher;
+        return hasher(__v);
+    }
+};
+
+template <class T>
+struct spp_hash<T *>
+{
+    static size_t spp_log2 (size_t val) SPP_NOEXCEPT 
+    {
+        size_t res = 0;
+        while (val > 1) 
+        {
+            val >>= 1;
+            res++;
+        }
+        return res;
+    }
+
+    SPP_INLINE size_t operator()(const T *__v) const SPP_NOEXCEPT 
+    {
+        static const size_t shift = spp_log2(1 + sizeof(T));
+        return static_cast<size_t>((*(reinterpret_cast<const uintptr_t *>(&__v))) >> shift);
+    }
+};
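+// For example, with T = double the shift above is spp_log2(1 + sizeof(double)) ==
+// spp_log2(9) == 3, so the three low-order bits of the pointer (typically zero for
+// 8-byte-aligned allocations) are discarded before the address is used as the hash value.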
+
+template <>
+struct spp_hash<bool> : public std::unary_function<bool, size_t>
+{
+    SPP_INLINE size_t operator()(bool __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<char> : public std::unary_function<char, size_t>
+{
+    SPP_INLINE size_t operator()(char __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<signed char> : public std::unary_function<signed char, size_t>
+{
+    SPP_INLINE size_t operator()(signed char __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<unsigned char> : public std::unary_function<unsigned char, size_t>
+{
+    SPP_INLINE size_t operator()(unsigned char __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<wchar_t> : public std::unary_function<wchar_t, size_t>
+{
+    SPP_INLINE size_t operator()(wchar_t __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<short> : public std::unary_function<short, size_t>
+{
+    SPP_INLINE size_t operator()(short __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <> 
+struct spp_hash<unsigned short> : public std::unary_function<unsigned short, size_t>
+{
+    SPP_INLINE size_t operator()(unsigned short __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<int> : public std::unary_function<int, size_t>
+{
+    SPP_INLINE size_t operator()(int __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<unsigned int> : public std::unary_function<unsigned int, size_t>
+{
+    SPP_INLINE size_t operator()(unsigned int __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<long> : public std::unary_function<long, size_t>
+{
+    SPP_INLINE size_t operator()(long __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<unsigned long> : public std::unary_function<unsigned long, size_t>
+{
+    SPP_INLINE size_t operator()(unsigned long __v) const SPP_NOEXCEPT {return static_cast<size_t>(__v);}
+};
+
+template <>
+struct spp_hash<float> : public std::unary_function<float, size_t>
+{
+    SPP_INLINE size_t operator()(float __v) const SPP_NOEXCEPT
+    {
+        // -0.0 and 0.0 should return same hash
+        uint32_t *as_int = reinterpret_cast<uint32_t *>(&__v);
+        return (__v == 0) ? static_cast<size_t>(0) : static_cast<size_t>(*as_int);
+    }
+};
+
+#if 0
+// todo: we should not ignore half of the double => see libcxx/include/functional
+template <>
+struct spp_hash<double> : public std::unary_function<double, size_t>
+{
+    SPP_INLINE size_t operator()(double __v) const SPP_NOEXCEPT
+    {
+        // -0.0 and 0.0 should return same hash
+        return (__v == 0) ? (size_t)0 : (size_t)*((uint64_t *)&__v);
+    }
+};
+#endif
+
+template <class T, int sz> struct Combiner
+{
+    inline void operator()(T& seed, T value);
+};
+
+template <class T> struct Combiner<T, 4>
+{
+    inline void  operator()(T& seed, T value)
+    {
+        seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+};
+
+template <class T> struct Combiner<T, 8>
+{
+    inline void  operator()(T& seed, T value)
+    {
+        seed ^= value + T(0xc6a4a7935bd1e995) + (seed << 6) + (seed >> 2);
+    }
+};
+
+template <class T>
+inline void hash_combine(std::size_t& seed, T const& v)
+{
+    spp::spp_hash<T> hasher;
+    Combiner<std::size_t, sizeof(std::size_t)> combiner;
+
+    combiner(seed, hasher(v));
+}
+    
+};
+
+#endif // spp_utils_h_guard_
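+
+// Usage sketch (illustrative only, not part of sparsepp): hash_combine() folds
+// the hash of each member into a running seed, which is the usual way to hash
+// a small user-defined struct.  The Point type below is hypothetical.
+//
+//     struct Point { int x; int y; };
+//
+//     inline std::size_t hash_point(const Point& p)
+//     {
+//         std::size_t seed = 0;
+//         spp::hash_combine(seed, p.x);   // mixes spp_hash<int>(p.x) into seed
+//         spp::hash_combine(seed, p.y);   // then mixes p.y the same way
+//         return seed;
+//     }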
+
+SPP_START_NAMESPACE
+
+//  ----------------------------------------------------------------------
+//                  U T I L    F U N C T I O N S
+//  ----------------------------------------------------------------------
+template <class E>
+inline void throw_exception(const E& exception)
+{
+#if !defined(SPP_NO_EXCEPTIONS)
+    throw exception;
+#else
+    assert(0);
+    abort();
+#endif
+}
+
+//  ----------------------------------------------------------------------
+//              M U T A B L E     P A I R      H A C K
+// turn mutable std::pair<K, V> into correct value_type std::pair<const K, V>
+//  ----------------------------------------------------------------------
+template <class T>
+struct cvt
+{
+    typedef T type;
+};
+
+template <class K, class V>
+struct cvt<std::pair<K, V> >
+{
+    typedef std::pair<const K, V> type;
+};
+
+template <class K, class V>
+struct cvt<const std::pair<K, V> >
+{
+    typedef const std::pair<const K, V> type;
+};
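+
+// For example, cvt<std::pair<int, float> >::type is std::pair<const int, float>,
+// which is the value_type a map-like container must expose, while cvt<int>::type
+// is simply int.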
+
+//  ----------------------------------------------------------------------
+//              M O V E   I T E R A T O R
+//  ----------------------------------------------------------------------
+#ifdef SPP_NO_CXX11_RVALUE_REFERENCES
+    #define MK_MOVE_IT(p) (p)
+#else
+    #define MK_MOVE_IT(p) std::make_move_iterator(p)
+#endif
+
+
+//  ----------------------------------------------------------------------
+//                  A L L O C A T O R     S T U F F 
+//  ----------------------------------------------------------------------
+template<class T>
+class libc_allocator_with_realloc 
+{
+public:
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+
+    typedef T* pointer;
+    typedef const T* const_pointer;
+    typedef T& reference;
+    typedef const T& const_reference;
+
+    libc_allocator_with_realloc() {}
+    libc_allocator_with_realloc(const libc_allocator_with_realloc& /*unused*/) {}
+    ~libc_allocator_with_realloc() {}
+
+    pointer address(reference r) const  { return &r; }
+    const_pointer address(const_reference r) const  { return &r; }
+
+    pointer allocate(size_type n, const_pointer  /*unused*/= 0) 
+    {
+        return static_cast<pointer>(malloc(n * sizeof(value_type)));
+    }
+
+    void deallocate(pointer p, size_type /*unused*/) 
+    {
+        free(p);
+    }
+
+    pointer reallocate(pointer p, size_type n) 
+    {
+        return static_cast<pointer>(realloc(p, n * sizeof(value_type)));
+    }
+
+    size_type max_size() const  
+    {
+        return static_cast<size_type>(-1) / sizeof(value_type);
+    }
+
+    void construct(pointer p, const value_type& val) 
+    {
+        new(p) value_type(val);
+    }
+
+    void destroy(pointer p) { p->~value_type(); }
+
+    template <class U>
+    explicit libc_allocator_with_realloc(const libc_allocator_with_realloc<U>& /*unused*/) {}
+
+    template<class U>
+    struct rebind 
+    {
+        typedef libc_allocator_with_realloc<U> other;
+    };
+};
+
+//  ----------------------------------------------------------------------
+// libc_allocator_with_realloc<void> specialization.
+//  ----------------------------------------------------------------------
+template<>
+class libc_allocator_with_realloc<void> 
+{
+public:
+    typedef void value_type;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+    typedef void* pointer;
+    typedef const void* const_pointer;
+
+    template<class U>
+    struct rebind 
+    {
+        typedef libc_allocator_with_realloc<U> other;
+    };
+};
+
+template<class T>
+inline bool operator==(const libc_allocator_with_realloc<T>& /*unused*/,
+                       const libc_allocator_with_realloc<T>& /*unused*/)
+{
+    return true;
+}
+
+template<class T>
+inline bool operator!=(const libc_allocator_with_realloc<T>& /*unused*/,
+                       const libc_allocator_with_realloc<T>& /*unused*/)
+{
+    return false;
+}
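+
+// Usage sketch (illustrative only): the allocator follows the classic C++03
+// allocator interface, backed by malloc/realloc/free.
+//
+//     spp::libc_allocator_with_realloc<int> alloc;
+//     int* p = alloc.allocate(4);     // malloc(4 * sizeof(int))
+//     alloc.construct(p, 42);         // placement-new of int(42) at p
+//     p = alloc.reallocate(p, 8);     // realloc() - safe only for relocatable T
+//     alloc.destroy(p);               // pseudo-destructor call (no-op for int)
+//     alloc.deallocate(p, 8);         // free(p)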
+
+//  ----------------------------------------------------------------------
+//             I N T E R N A L    S T U F F
+//  ----------------------------------------------------------------------
+#ifdef SPP_NO_CXX11_STATIC_ASSERT
+    template <bool> struct SppCompileAssert { };
+    #define SPP_COMPILE_ASSERT(expr, msg) \
+      SPP_ATTRIBUTE_UNUSED typedef SppCompileAssert<(bool(expr))> spp_bogus_[bool(expr) ? 1 : -1]
+#else
+    #define SPP_COMPILE_ASSERT static_assert
+#endif
+
+namespace sparsehash_internal 
+{
+
+// Adaptor methods for reading/writing data from an INPUT or OUTPUT
+// variable passed to serialize() or unserialize().  For now we
+// have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
+// they are pointers, unlike typical use), or else a pointer to
+// something that supports a Read()/Write() method.
+//
+// For technical reasons, we implement read_data/write_data in two
+// stages.  The actual work is done in *_data_internal, which takes
+// the stream argument twice: once as a template type, and once with
+// normal type information.  (We only use the second version.)  We do
+// this because of how C++ picks what function overload to use.  If we
+// implemented this the naive way:
+//    bool read_data(istream* is, const void* data, size_t length);
+//    template<typename T> read_data(T* fp,  const void* data, size_t length);
+// C++ would prefer the second version for every stream type except
+// istream.  However, we want C++ to prefer the first version for
+// streams that are *subclasses* of istream, such as istringstream.
+// This is not possible given the way template types are resolved.  So
+// we split the stream argument in two, one of which is templated and
+// one of which is not.  The specialized functions (like the istream
+// version above) ignore the template arg and use the second, 'type'
+// arg, getting subclass matching as normal.  The 'catch-all'
+// functions (the second version above) use the template arg to deduce
+// the type, and use a second, void* arg to achieve the desired
+// 'catch-all' semantics.
+
+    // ----- low-level I/O for FILE* ----
+
+    template<typename Ignored>
+    inline bool read_data_internal(Ignored* /*unused*/, FILE* fp,
+                                   void* data, size_t length) 
+    {
+        return fread(data, length, 1, fp) == 1;
+    }
+
+    template<typename Ignored>
+    inline bool write_data_internal(Ignored* /*unused*/, FILE* fp,
+                                    const void* data, size_t length) 
+    {
+        return fwrite(data, length, 1, fp) == 1;
+    }
+
+    // ----- low-level I/O for iostream ----
+
+    // We want the caller to be responsible for #including <iostream>, not
+    // us, because iostream is a big header!  According to the standard,
+    // it's only legal to delay the instantiation the way we want to if
+    // the istream/ostream is a template type.  So we jump through hoops.
+    template<typename ISTREAM>
+    inline bool read_data_internal_for_istream(ISTREAM* fp,
+                                               void* data, size_t length) 
+    {
+        return fp->read(reinterpret_cast<char*>(data), 
+                        static_cast<std::streamsize>(length)).good();
+    }
+    template<typename Ignored>
+    inline bool read_data_internal(Ignored* /*unused*/, std::istream* fp,
+                                   void* data, size_t length) 
+    {
+        return read_data_internal_for_istream(fp, data, length);
+    }
+
+    template<typename OSTREAM>
+    inline bool write_data_internal_for_ostream(OSTREAM* fp,
+                                                const void* data, size_t length) 
+    {
+        return fp->write(reinterpret_cast<const char*>(data), 
+                         static_cast<std::streamsize>(length)).good();
+    }
+    template<typename Ignored>
+    inline bool write_data_internal(Ignored* /*unused*/, std::ostream* fp,
+                                    const void* data, size_t length)
+    {
+        return write_data_internal_for_ostream(fp, data, length);
+    }
+
+    // ----- low-level I/O for custom streams ----
+
+    // The INPUT type needs to support a Read() method that takes a
+    // buffer and a length and returns the number of bytes read.
+    template <typename INPUT>
+    inline bool read_data_internal(INPUT* fp, void* /*unused*/,
+                                   void* data, size_t length) 
+    {
+        return static_cast<size_t>(fp->Read(data, length)) == length;
+    }
+
+    // The OUTPUT type needs to support a Write() operation that takes
+    // a buffer and a length and returns the number of bytes written.
+    template <typename OUTPUT>
+    inline bool write_data_internal(OUTPUT* fp, void* /*unused*/,
+                                    const void* data, size_t length) 
+    {
+        return static_cast<size_t>(fp->Write(data, length)) == length;
+    }
+
+    // ----- low-level I/O: the public API ----
+
+    template <typename INPUT>
+    inline bool read_data(INPUT* fp, void* data, size_t length) 
+    {
+        return read_data_internal(fp, fp, data, length);
+    }
+
+    template <typename OUTPUT>
+    inline bool write_data(OUTPUT* fp, const void* data, size_t length) 
+    {
+        return write_data_internal(fp, fp, data, length);
+    }
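+
+    // Illustrative sketch (not part of sparsepp): any pointer to an object with
+    // a matching Read()/Write() member can be handed to read_data()/write_data();
+    // the catch-all overloads above are selected for it.  MemWriter is a
+    // hypothetical example.
+    //
+    //     struct MemWriter
+    //     {
+    //         std::vector<char> buf;
+    //         size_t Write(const void* data, size_t len)
+    //         {
+    //             const char* p = static_cast<const char*>(data);
+    //             buf.insert(buf.end(), p, p + len);
+    //             return len;
+    //         }
+    //     };
+    //
+    //     MemWriter w;
+    //     uint32_t x = 42;
+    //     bool ok = write_data(&w, &x, sizeof(x));   // appends 4 bytes to w.buf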
+
+    // Uses read_data() and write_data() to read/write an integer.
+    // length is the number of bytes to read/write (which may differ
+    // from sizeof(IntType), allowing us to save on a 32-bit system
+    // and load on a 64-bit system).  Excess bytes are taken to be 0.
+    // INPUT and OUTPUT must match legal inputs to read/write_data (above).
+    // --------------------------------------------------------------------
+    template <typename INPUT, typename IntType>
+    bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) 
+    {
+        *value = 0;
+        unsigned char byte;
+        // We require IntType to be unsigned or else the shifting gets all screwy.
+        SPP_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0), "serializing_int_requires_an_unsigned_type");
+        for (size_t i = 0; i < length; ++i) 
+        {
+            if (!read_data(fp, &byte, sizeof(byte))) 
+                return false;
+            *value |= static_cast<IntType>(byte) << ((length - 1 - i) * 8);
+        }
+        return true;
+    }
+
+    template <typename OUTPUT, typename IntType>
+    bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) 
+    {
+        unsigned char byte;
+        // We require IntType to be unsigned or else the shifting gets all screwy.
+        SPP_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0), "serializing_int_requires_an_unsigned_type");
+        for (size_t i = 0; i < length; ++i) 
+        {
+            byte = (sizeof(value) <= length-1 - i)
+                ? static_cast<unsigned char>(0) : static_cast<unsigned char>((value >> ((length-1 - i) * 8)) & 255);
+            if (!write_data(fp, &byte, sizeof(byte))) return false;
+        }
+        return true;
+    }
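+
+    // Worked example: write_bigendian_number(fp, uint32_t(0x0102), 4) emits the
+    // bytes 00 00 01 02 (most significant first).  Reading them back with
+    // read_bigendian_number(fp, &v, 4) into a uint64_t v yields 0x0102 again,
+    // which is what allows a table saved on a 32-bit system to be loaded on a
+    // 64-bit one.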
+
+    // If your keys and values are simple enough, you can pass this
+    // serializer to serialize()/unserialize().  "Simple enough" means
+    // value_type is a POD type that contains no pointers.  Note,
+    // however, we don't try to normalize endianness.
+    // This is the type used for NopointerSerializer.
+    // ---------------------------------------------------------------
+    template <typename value_type> struct pod_serializer 
+    {
+        template <typename INPUT>
+        bool operator()(INPUT* fp, value_type* value) const 
+        {
+            return read_data(fp, value, sizeof(*value));
+        }
+
+        template <typename OUTPUT>
+        bool operator()(OUTPUT* fp, const value_type& value) const 
+        {
+            return write_data(fp, &value, sizeof(value));
+        }
+    };
+
+
+    // Settings contains parameters for growing and shrinking the table.
+    // It also packages the zero-size functor (i.e. the hasher).
+    //
+    // It does some munging of the hash value in cases where we think
+    // (fear) the original hash function might not be very good.  In
+    // particular, the default hash of pointers is the identity hash,
+    // so probably all the low bits are 0.  We identify when we think
+    // we're hashing a pointer, and chop off the low bits.  Note this
+    // isn't perfect: even when the key is a pointer, we can't tell
+    // for sure that the hash is the identity hash.  If it's not, this
+    // is needless work (and possibly, though not likely, harmful).
+    // ---------------------------------------------------------------
+    template<typename Key, typename HashFunc,
+             typename SizeType, int HT_MIN_BUCKETS>
+    class sh_hashtable_settings : public HashFunc 
+    {
+    private:
+        template <class T, int sz> struct Mixer
+        {
+            inline T operator()(T h) const;
+        };
+
+        template <class T> struct Mixer<T, 4>
+        {
+            inline T operator()(T h) const
+            {
+                return h + (h >> 7) + (h >> 13) + (h >> 23);
+            }
+        };
+
+        template <class T> struct Mixer<T, 8>
+        {
+            inline T operator()(T h) const
+            {
+                return h + (h >> 7) + (h >> 13) + (h >> 23) + (h >> 32);
+            }
+        };
+
+    public:
+        typedef Key key_type;
+        typedef HashFunc hasher;
+        typedef SizeType size_type;
+
+    public:
+        sh_hashtable_settings(const hasher& hf,
+                              const float ht_occupancy_flt,
+                              const float ht_empty_flt)
+            : hasher(hf),
+              enlarge_threshold_(0),
+              shrink_threshold_(0),
+              consider_shrink_(false),
+              num_ht_copies_(0) 
+        {
+            set_enlarge_factor(ht_occupancy_flt);
+            set_shrink_factor(ht_empty_flt);
+        }
+
+        size_t hash(const key_type& v) const 
+        {
+            size_t h = hasher::operator()(v);
+            Mixer<size_t, sizeof(size_t)> mixer;
+
+            return mixer(h);
+        }
+
+        float enlarge_factor() const            { return enlarge_factor_; }
+        void set_enlarge_factor(float f)        { enlarge_factor_ = f;    }
+        float shrink_factor() const             { return shrink_factor_;  }
+        void set_shrink_factor(float f)         { shrink_factor_ = f;     }
+
+        size_type enlarge_threshold() const     { return enlarge_threshold_; }
+        void set_enlarge_threshold(size_type t) { enlarge_threshold_ = t; }
+        size_type shrink_threshold() const      { return shrink_threshold_; }
+        void set_shrink_threshold(size_type t)  { shrink_threshold_ = t; }
+
+        size_type enlarge_size(size_type x) const { return static_cast<size_type>(x * enlarge_factor_); }
+        size_type shrink_size(size_type x) const { return static_cast<size_type>(x * shrink_factor_); }
+
+        bool consider_shrink() const            { return consider_shrink_; }
+        void set_consider_shrink(bool t)        { consider_shrink_ = t; }
+
+        unsigned int num_ht_copies() const      { return num_ht_copies_; }
+        void inc_num_ht_copies()                { ++num_ht_copies_; }
+
+        // Reset the enlarge and shrink thresholds
+        void reset_thresholds(size_type num_buckets) 
+        {
+            set_enlarge_threshold(enlarge_size(num_buckets));
+            set_shrink_threshold(shrink_size(num_buckets));
+            // whatever caused us to reset has already considered shrinking
+            set_consider_shrink(false);
+        }
+
+        // Caller is responsible for calling reset_thresholds() right after
+        // set_resizing_parameters().
+        // ------------------------------------------------------------
+        void set_resizing_parameters(float shrink, float grow) 
+        {
+            assert(shrink >= 0.0);
+            assert(grow <= 1.0);
+            if (shrink > grow/2.0f)
+                shrink = grow / 2.0f;     // otherwise we thrash hashtable size
+            set_shrink_factor(shrink);
+            set_enlarge_factor(grow);
+        }
+
+        // This is the smallest size a hashtable can be without being too crowded.
+        // If you like, you can give a min #buckets as well as a min #elts.
+        // ----------------------------------------------------------------------
+        size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) 
+        {
+            float enlarge = enlarge_factor();
+            size_type sz = HT_MIN_BUCKETS;             // min buckets allowed
+            while (sz < min_buckets_wanted ||
+                   num_elts >= static_cast<size_type>(sz * enlarge))
+            {
+                // This just prevents overflowing size_type, since sz can exceed
+                // max_size() here.
+                // -------------------------------------------------------------
+                if (static_cast<size_type>(sz * 2) < sz)
+                    throw_exception(std::length_error("resize overflow"));  // protect against overflow
+                sz *= 2;
+            }
+            return sz;
+        }
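+
+        // Worked example (assuming HT_MIN_BUCKETS == 4 and an enlarge factor of
+        // 0.8): min_buckets(100, 0) doubles 4 -> 8 -> 16 -> 32 -> 64 -> 128 and
+        // stops there, since 100 < 128 * 0.8; i.e. 128 buckets is the smallest
+        // power-of-two table that holds 100 elements without exceeding the
+        // occupancy threshold.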
+
+    private:
+        size_type enlarge_threshold_;  // table.size() * enlarge_factor
+        size_type shrink_threshold_;   // table.size() * shrink_factor
+        float enlarge_factor_;         // how full before resize
+        float shrink_factor_;          // how empty before resize
+        bool consider_shrink_;         // if we should try to shrink before next insert
+                                       
+        unsigned int num_ht_copies_;   // num_ht_copies is a counter incremented every Copy/Move
+    };
+
+}  // namespace sparsehash_internal
+
+#undef SPP_COMPILE_ASSERT
+
+//  ----------------------------------------------------------------------
+//                    S P A R S E T A B L E
+//  ----------------------------------------------------------------------
+//
+// A sparsetable is a random container that implements a sparse array,
+// that is, an array that uses very little memory to store unassigned
+// indices (in this case, between 1-2 bits per unassigned index).  For
+// instance, if you allocate an array of size 5 and assign a[2] = <big
+// struct>, then a[2] will take up a lot of memory but a[0], a[1],
+// a[3], and a[4] will not.  Array elements that have a value are
+// called "assigned".  Array elements that have no value yet, or have
+// had their value cleared using erase() or clear(), are called
+// "unassigned".
+//
+// Unassigned values seem to have the default value of T (see below).
+// Nevertheless, there is a difference between an unassigned index and
+// one explicitly assigned the value of T().  The latter is considered
+// assigned.
+//
+// Access to an array element is constant time, as is insertion and
+// deletion.  Insertion and deletion may be fairly slow, however:
+// because of this container's memory economy, each insert and delete
+// causes a memory reallocation.
+//
+// NOTE: You should not test(), get(), or set() any index that is
+// greater than sparsetable.size().  If you need to do that, call
+// resize() first.
+//
+// --- Template parameters
+// PARAMETER   DESCRIPTION                           DEFAULT
+// T           The value of the array: the type of   --
+//             object that is stored in the array.
+//
+// Alloc:      Allocator to use to allocate memory.  libc_allocator_with_realloc
+//
+// --- Model of
+// Random Access Container
+//
+// --- Type requirements
+// T must be Copy Constructible. It need not be Assignable.
+//
+// --- Public base classes
+// None.
+//
+// --- Members
+//
+// [*] All iterators are const in a sparsetable (though nonempty_iterators
+//     may not be).  Use get() and set() to assign values, not iterators.
+//
+// [+] iterators are random-access iterators.  nonempty_iterators are
+//     bidirectional iterators.
+
+// [*] If you shrink a sparsetable using resize(), assigned elements
+// past the end of the table are removed using erase().  If you grow
+// a sparsetable, new unassigned indices are created.
+//
+// [+] Note that operator[] returns a const reference.  You must use
+// set() to change the value of a table element.
+//
+// [!] Unassignment also calls the destructor.
+//
+// Iterators are invalidated whenever an item is inserted or
+// deleted (ie set() or erase() is used) or when the size of
+// the table changes (ie resize() or clear() is used).
+
+
+// ---------------------------------------------------------------------------
+//                       type_traits we need
+// ---------------------------------------------------------------------------
+template<class T, T v>
+struct integral_constant { static const T value = v; };
+
+template <class T, T v> const T integral_constant<T, v>::value;
+
+typedef integral_constant<bool, true>  true_type;
+typedef integral_constant<bool, false> false_type;
+
+template<typename T, typename U> struct is_same : public false_type { };
+template<typename T> struct is_same<T, T> : public true_type { };
+
+template<typename T> struct remove_const { typedef T type; };
+template<typename T> struct remove_const<T const> { typedef T type; };
+
+template<typename T> struct remove_volatile { typedef T type; };
+template<typename T> struct remove_volatile<T volatile> { typedef T type; };
+
+template<typename T> struct remove_cv {
+    typedef typename remove_const<typename remove_volatile<T>::type>::type type;
+};
+
+// ---------------- is_integral ----------------------------------------
+template <class T> struct is_integral;
+template <class T> struct is_integral         : false_type { };
+template<> struct is_integral<bool>           : true_type { };
+template<> struct is_integral<char>           : true_type { };
+template<> struct is_integral<unsigned char>  : true_type { };
+template<> struct is_integral<signed char>    : true_type { };
+template<> struct is_integral<short>          : true_type { };
+template<> struct is_integral<unsigned short> : true_type { };
+template<> struct is_integral<int>            : true_type { };
+template<> struct is_integral<unsigned int>   : true_type { };
+template<> struct is_integral<long>           : true_type { };
+template<> struct is_integral<unsigned long>  : true_type { };
+#ifdef SPP_HAS_LONG_LONG
+    template<> struct is_integral<long long>  : true_type { };
+    template<> struct is_integral<unsigned long long> : true_type { };
+#endif
+template <class T> struct is_integral<const T>          : is_integral<T> { };
+template <class T> struct is_integral<volatile T>       : is_integral<T> { };
+template <class T> struct is_integral<const volatile T> : is_integral<T> { };
+
+// ---------------- is_floating_point ----------------------------------------
+template <class T> struct is_floating_point;
+template <class T> struct is_floating_point      : false_type { };
+template<> struct is_floating_point<float>       : true_type { };
+template<> struct is_floating_point<double>      : true_type { };
+template<> struct is_floating_point<long double> : true_type { };
+template <class T> struct is_floating_point<const T> :        is_floating_point<T> { };
+template <class T> struct is_floating_point<volatile T>       : is_floating_point<T> { };
+template <class T> struct is_floating_point<const volatile T> : is_floating_point<T> { };
+
+//  ---------------- is_pointer ----------------------------------------
+template <class T> struct is_pointer;
+template <class T> struct is_pointer     : false_type { };
+template <class T> struct is_pointer<T*> : true_type { };
+template <class T> struct is_pointer<const T>          : is_pointer<T> { };
+template <class T> struct is_pointer<volatile T>       : is_pointer<T> { };
+template <class T> struct is_pointer<const volatile T> : is_pointer<T> { };
+
+//  ---------------- is_reference ----------------------------------------
+template <class T> struct is_reference;
+template<typename T> struct is_reference     : false_type {};
+template<typename T> struct is_reference<T&> : true_type {};
+
+//  ---------------- is_relocatable ----------------------------------------
+// relocatable values can be moved around in memory using memcpy and remain 
+// correct. Most types are relocatable; an example of one that is not is a
+// struct which contains a pointer to a buffer inside itself - this is the
+// case for std::string in gcc 5.
+// ------------------------------------------------------------------------
+template <class T> struct is_relocatable;
+template <class T> struct is_relocatable : 
+     integral_constant<bool, (is_integral<T>::value || is_floating_point<T>::value)> 
+{ };
+
+template<int S, int H> struct is_relocatable<HashObject<S, H> > : true_type { };
+
+template <class T> struct is_relocatable<const T>          : is_relocatable<T> { };
+template <class T> struct is_relocatable<volatile T>       : is_relocatable<T> { };
+template <class T> struct is_relocatable<const volatile T> : is_relocatable<T> { };
+template <class A, int N> struct is_relocatable<A[N]>      : is_relocatable<A> { };
+template <class T, class U> struct is_relocatable<std::pair<T, U> > : 
+     integral_constant<bool, (is_relocatable<T>::value && is_relocatable<U>::value)> 
+{ };
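+
+// For instance, is_relocatable<int>::value and
+// is_relocatable<std::pair<int, float> >::value are both true, whereas a
+// self-referential type (such as gcc 5's std::string, which points into its
+// own small-buffer storage) falls through to the default and is reported as
+// not relocatable.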
+
+// ---------------------------------------------------------------------------
+// Our iterator is as simple as iterators can be: basically it's just
+// the index into our table.  Dereference, the only complicated
+// thing, we punt to the table class.  This just goes to show how
+// much machinery STL requires to do even the most trivial tasks.
+//
+// A NOTE ON ASSIGNING:
+// A sparse table does not actually allocate memory for entries
+// that are not filled.  Because of this, it becomes complicated
+// to have a non-const iterator: we don't know, if the iterator points
+// to a not-filled bucket, whether you plan to fill it with something
+// or whether you plan to read its value (in which case you'll get
+// the default bucket value).  Therefore, while we can define const
+// operations in a pretty 'normal' way, for non-const operations, we
+// define something that returns a helper object with operator= and
+// operator& that allocate a bucket lazily.  We use this for table[]
+// and also for regular table iterators.
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class tabletype>
+class table_element_adaptor 
+{
+public:
+    typedef typename tabletype::value_type value_type;
+    typedef typename tabletype::size_type  size_type;
+    typedef typename tabletype::reference  reference;
+    typedef typename tabletype::pointer    pointer;
+
+    table_element_adaptor(tabletype *tbl, size_type p) :
+        table(tbl), pos(p) 
+    { }
+
+    table_element_adaptor& operator=(const value_type &val)
+    {
+        table->set(pos, val, false);
+        return *this;
+    }
+
+    operator value_type() { return table->get(pos); }   // we look like a value
+
+    pointer operator& () { return &table->mutating_get(pos); }
+
+private:
+    tabletype* table;
+    size_type pos;
+};
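+
+// Illustrative sketch: given a table "t" whose operator[] returns this adaptor,
+//
+//     t[5] = 42;       // operator=  forwards to t.set(5, 42)
+//     int v = t[5];    // value_type conversion forwards to t.get(5)
+//
+// so assignment through [] can allocate the bucket lazily, while a plain read
+// of an empty bucket just yields the default value.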
+
+// Our iterator is as simple as iterators can be: basically it's just
+// the index into our table.  Dereference, the only complicated
+// thing, we punt to the table class.  This just goes to show how
+// much machinery STL requires to do even the most trivial tasks.
+//
+// By templatizing over tabletype, we have one iterator type which
+// we can use for both sparsetables and sparsebins.  In fact it
+// works on any class that allows size() and operator[] (eg vector),
+// as long as it does the standard STL typedefs too (eg value_type).
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class tabletype>
+class table_iterator 
+{
+public:
+    typedef table_iterator iterator;
+
+    typedef std::random_access_iterator_tag      iterator_category;
+    typedef typename tabletype::value_type       value_type;
+    typedef typename tabletype::difference_type  difference_type;
+    typedef typename tabletype::size_type        size_type;
+    typedef table_element_adaptor<tabletype>     reference;
+    typedef table_element_adaptor<tabletype>*    pointer;
+
+    explicit table_iterator(tabletype *tbl = 0, size_type p = 0) : 
+        table(tbl), pos(p) 
+    { }
+
+    // The main thing our iterator does is dereference.  If the table entry
+    // we point to is empty, we return the default value type.
+// This is the big difference from the const iterator.
+    reference operator*()            
+    {
+        return table_element_adaptor<tabletype>(table, pos);
+    }
+
+    pointer operator->()  { return &(operator*()); }
+
+    // Helper function to assert things are ok; eg pos is still in range
+    void check() const 
+    {
+        assert(table);
+        assert(pos <= table->size());
+    }
+
+    // Arithmetic: we just do arithmetic on pos.  We don't even need to
+    // do bounds checking, since STL doesn't consider that its job.  :-)
+    iterator& operator+=(size_type t) { pos += t; check(); return *this; }
+    iterator& operator-=(size_type t) { pos -= t; check(); return *this; }
+    iterator& operator++()            { ++pos; check(); return *this; }
+    iterator& operator--()            { --pos; check(); return *this; }
+    iterator operator++(int)         
+    {
+        iterator tmp(*this);     // for x++
+        ++pos; check(); return tmp;
+    }
+
+    iterator operator--(int)          
+    {
+        iterator tmp(*this);     // for x--
+        --pos; check(); return tmp;
+    }
+
+    iterator operator+(difference_type i) const  
+    {
+        iterator tmp(*this);
+        tmp += i; return tmp; 
+    }
+
+    iterator operator-(difference_type i) const  
+    {
+        iterator tmp(*this);
+        tmp -= i; return tmp;
+    }
+
+    difference_type operator-(iterator it) const 
+    {      // for "x = it2 - it"
+        assert(table == it.table);
+        return pos - it.pos;
+    }
+
+    reference operator[](difference_type n) const 
+    {
+        return *(*this + n);            // simple though not totally efficient
+    }
+
+    // Comparisons.
+    bool operator==(const iterator& it) const 
+    {
+        return table == it.table && pos == it.pos;
+    }
+
+    bool operator<(const iterator& it) const
+    {
+        assert(table == it.table);              // life is bad bad bad otherwise
+        return pos < it.pos;
+    }
+
+    bool operator!=(const iterator& it) const { return !(*this == it); }
+    bool operator<=(const iterator& it) const { return !(it < *this); }
+    bool operator>(const iterator& it) const { return it < *this; }
+    bool operator>=(const iterator& it) const { return !(*this < it); }
+
+    // Here's the info we actually need to be an iterator
+    tabletype *table;              // so we can dereference and bounds-check
+    size_type pos;                 // index into the table
+};
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class tabletype>
+class const_table_iterator 
+{
+public:
+    typedef table_iterator<tabletype> iterator;
+    typedef const_table_iterator const_iterator;
+
+    typedef std::random_access_iterator_tag iterator_category;
+    typedef typename tabletype::value_type value_type;
+    typedef typename tabletype::difference_type difference_type;
+    typedef typename tabletype::size_type size_type;
+    typedef typename tabletype::const_reference reference;  // we're const-only
+    typedef typename tabletype::const_pointer pointer;
+
+    // The "real" constructor
+    const_table_iterator(const tabletype *tbl, size_type p)
+        : table(tbl), pos(p) { }
+
+    // The default constructor, used when I define vars of type table::iterator
+    const_table_iterator() : table(NULL), pos(0) { }
+
+    // The copy constructor, for when I say table::iterator foo = tbl.begin()
+    // Also converts normal iterators to const iterators // not explicit on purpose
+    const_table_iterator(const iterator &from)
+        : table(from.table), pos(from.pos) { }
+
+    // The default destructor is fine; we don't define one
+    // The default operator= is fine; we don't define one
+
+    // The main thing our iterator does is dereference.  If the table entry
+    // we point to is empty, we return the default value type.
+    reference operator*() const       { return (*table)[pos]; }
+    pointer operator->() const        { return &(operator*()); }
+
+    // Helper function to assert things are ok; eg pos is still in range
+    void check() const 
+    {
+        assert(table);
+        assert(pos <= table->size());
+    }
+
+    // Arithmetic: we just do arithmetic on pos.  We don't even need to
+    // do bounds checking, since STL doesn't consider that its job.  :-)
+    const_iterator& operator+=(size_type t) { pos += t; check(); return *this; }
+    const_iterator& operator-=(size_type t) { pos -= t; check(); return *this; }
+    const_iterator& operator++()            { ++pos; check(); return *this; }
+    const_iterator& operator--()            { --pos; check(); return *this; }
+    const_iterator operator++(int)
+    {
+        const_iterator tmp(*this);     // for x++
+        ++pos; check(); return tmp;
+    }
+
+    const_iterator operator--(int)
+    {
+        const_iterator tmp(*this);     // for x--
+        --pos; check(); return tmp;
+    }
+    const_iterator operator+(difference_type i) const  
+    {
+        const_iterator tmp(*this);
+        tmp += i;
+        return tmp; 
+    }
+    const_iterator operator-(difference_type i) const 
+    {
+        const_iterator tmp(*this);
+        tmp -= i; 
+        return tmp; 
+    }
+    difference_type operator-(const_iterator it) const
+    {   // for "x = it2 - it"
+        assert(table == it.table);
+        return pos - it.pos;
+    }
+    reference operator[](difference_type n) const
+    {
+        return *(*this + n);            // simple though not totally efficient
+    }
+
+    // Comparisons.
+    bool operator==(const const_iterator& it) const
+    {
+        return table == it.table && pos == it.pos;
+    }
+
+    bool operator<(const const_iterator& it) const 
+    {
+        assert(table == it.table);              // life is bad bad bad otherwise
+        return pos < it.pos;
+    }
+    bool operator!=(const const_iterator& it) const { return !(*this == it); }
+    bool operator<=(const const_iterator& it) const { return !(it < *this); }
+    bool operator>(const const_iterator& it) const { return it < *this; }
+    bool operator>=(const const_iterator& it) const { return !(*this < it); }
+
+    // Here's the info we actually need to be an iterator
+    const tabletype *table;        // so we can dereference and bounds-check
+    size_type pos;                 // index into the table
+};
+
+// ---------------------------------------------------------------------------
+// This is a 2-D iterator.  You specify a begin and end over a list
+// of *containers*.  We iterate over each container by iterating over
+// it.  It's actually simple:
+// VECTOR.begin() VECTOR[0].begin()  --------> VECTOR[0].end() ---,
+//     |          ________________________________________________/
+//     |          \_> VECTOR[1].begin()  -------->  VECTOR[1].end() -,
+//     |          ___________________________________________________/
+//     v          \_> ......
+// VECTOR.end()
+//
+// It's impossible to do random access on one of these things in constant
+// time, so it's just a bidirectional iterator.
+//
+// Unfortunately, because we need to use this for a non-empty iterator,
+// we use ne_begin() and ne_end() instead of begin() and end()
+// (though only going across, not down).
+// ---------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class T, class row_it, class col_it, class iter_type>
+class Two_d_iterator : public std::iterator<iter_type, T>
+{
+public:
+    typedef Two_d_iterator iterator;
+
+    // T can be std::pair<K, V>, but we need to return std::pair<const K, V>
+    // ---------------------------------------------------------------------
+    typedef typename spp_::cvt<T>::type value_type;
+    typedef value_type&                reference;
+    typedef value_type*                pointer;
+
+    explicit Two_d_iterator(row_it curr) : row_current(curr), col_current(0)
+    {
+        if (row_current && !row_current->is_marked()) 
+        {
+            col_current = row_current->ne_begin();
+            advance_past_end();                 // in case cur->begin() == cur->end()
+        }
+    }
+
+    explicit Two_d_iterator(row_it curr, col_it col) : row_current(curr), col_current(col) 
+    {
+        assert(col);
+    }
+
+    // The default constructor
+    Two_d_iterator() :  row_current(0), col_current(0) { }
+    
+    // Need this explicitly so we can convert normal iterators <=> const iterators
+    // not explicit on purpose
+    // ---------------------------------------------------------------------------
+    template <class T2, class row_it2, class col_it2, class iter_type2>
+    Two_d_iterator(const Two_d_iterator<T2, row_it2, col_it2, iter_type2>& it) :
+        row_current (*(row_it *)&it.row_current),
+        col_current (*(col_it *)&it.col_current)
+    { }
+
+    // The default destructor is fine; we don't define one
+    // The default operator= is fine; we don't define one
+
+    reference operator*() const    { return *(col_current); }
+    pointer operator->() const     { return &(operator*()); }
+
+    // Arithmetic: we just do arithmetic on pos.  We don't even need to
+    // do bounds checking, since STL doesn't consider that its job.  :-)
+    // NOTE: this is not amortized constant time!  What do we do about it?
+    // ------------------------------------------------------------------
+    void advance_past_end() 
+    {   
+        // used when col_current points to end()
+        while (col_current == row_current->ne_end()) 
+        { 
+            // end of current row
+            // ------------------
+            ++row_current;                                // go to beginning of next
+            if (!row_current->is_marked())                // col is irrelevant at end
+                col_current = row_current->ne_begin();
+            else
+                break;                                    // don't go past row_end
+        }
+    }
+
+    friend size_t operator-(iterator l, iterator f)
+    {
+        if (f.row_current->is_marked())
+            return 0;
+
+        size_t diff(0);
+        while (f != l)
+        {
+            ++diff;
+            ++f;
+        }
+        return diff;
+    }
+        
+    iterator& operator++() 
+    {
+        // assert(!row_current->is_marked());               // how to ++ from there?
+        ++col_current;
+        advance_past_end();                              // in case col_current is at end()
+        return *this;
+    }
+
+    iterator& operator--() 
+    {
+        while (row_current->is_marked() ||
+               col_current == row_current->ne_begin()) 
+        {
+            --row_current;
+            col_current = row_current->ne_end();             // this is 1 too far
+        }
+        --col_current;
+        return *this;
+    }
+    iterator operator++(int)       { iterator tmp(*this); ++*this; return tmp; }
+    iterator operator--(int)       { iterator tmp(*this); --*this; return tmp; }
+
+
+    // Comparisons.
+    bool operator==(const iterator& it) const 
+    {
+        return (row_current == it.row_current &&
+                (!row_current || row_current->is_marked() || col_current == it.col_current));
+    }
+
+    bool operator!=(const iterator& it) const { return !(*this == it); }
+
+    // Here's the info we actually need to be an iterator
+    // These need to be public so we convert from iterator to const_iterator
+    // ---------------------------------------------------------------------
+    row_it row_current;
+    col_it col_current;
+};
+
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class T, class row_it, class col_it, class iter_type, class Alloc>
+class Two_d_destructive_iterator : public Two_d_iterator<T, row_it, col_it, iter_type>
+{
+public:
+    typedef Two_d_destructive_iterator iterator;
+
+    Two_d_destructive_iterator(Alloc &alloc, row_it curr) : 
+        _alloc(alloc)
+    {
+        this->row_current = curr;
+        this->col_current = 0;
+        if (this->row_current && !this->row_current->is_marked()) 
+        {
+            this->col_current = this->row_current->ne_begin();
+            advance_past_end();                 // in case cur->begin() == cur->end()
+        }
+    }
+
+    // Arithmetic: we just do arithmetic on pos.  We don't even need to
+    // do bounds checking, since STL doesn't consider that its job.  :-)
+    // NOTE: this is not amortized constant time!  What do we do about it?
+    // ------------------------------------------------------------------
+    void advance_past_end() 
+    {   
+        // used when col_current points to end()
+        while (this->col_current == this->row_current->ne_end()) 
+        { 
+            this->row_current->clear(_alloc, true);  // This is what differs from non-destructive iterators above
+
+            // end of current row
+            // ------------------
+            ++this->row_current;                          // go to beginning of next
+            if (!this->row_current->is_marked())          // col is irrelevant at end
+                this->col_current = this->row_current->ne_begin();
+            else
+                break;                                    // don't go past row_end
+        }
+    }
+
+    iterator& operator++() 
+    {
+        // assert(!this->row_current->is_marked());         // how to ++ from there?
+        ++this->col_current;
+        advance_past_end();                              // in case col_current is at end()
+        return *this;
+    }
+
+private:
+    Two_d_destructive_iterator& operator=(const Two_d_destructive_iterator &o);
+
+    Alloc &_alloc;
+};
+
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+static const char spp_bits_in[256] = {
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+static inline uint32_t s_spp_popcount_default_lut(uint32_t i)
+{
+    uint32_t res = static_cast<uint32_t>(spp_bits_in[i & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 8)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 16) & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[i >> 24]);
+    return res;
+}
+
+static inline uint32_t s_spp_popcount_default_lut(uint64_t i)
+{
+    uint32_t res = static_cast<uint32_t>(spp_bits_in[i & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >>  8)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 16)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 24)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 32)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 40)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[(i >> 48)  & 0xFF]);
+    res += static_cast<uint32_t>(spp_bits_in[i >> 56]);
+    return res;
+}
+
+// faster than the lookup table (LUT)
+// ----------------------------------
+static inline uint32_t s_spp_popcount_default(uint32_t i)
+{
+    i = i - ((i >> 1) & 0x55555555);
+    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+    return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
+}
+
+// faster than the lookup table (LUT)
+// ----------------------------------
+static inline uint32_t s_spp_popcount_default(uint64_t x)
+{
+    const uint64_t m1  = uint64_t(0x5555555555555555); // binary: 0101...
+    const uint64_t m2  = uint64_t(0x3333333333333333); // binary: 00110011..
+    const uint64_t m4  = uint64_t(0x0f0f0f0f0f0f0f0f); // binary:  4 zeros,  4 ones ...
+    const uint64_t h01 = uint64_t(0x0101010101010101); // the sum of 256 to the power of 0,1,2,3...
+
+    x -= (x >> 1) & m1;             // put count of each 2 bits into those 2 bits
+    x = (x & m2) + ((x >> 2) & m2); // put count of each 4 bits into those 4 bits 
+    x = (x + (x >> 4)) & m4;        // put count of each 8 bits into those 8 bits 
+    return (x * h01)>>56;           // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24)+...
+}
+
+#if defined(SPP_POPCNT_CHECK)
+static inline bool spp_popcount_check()
+{
+    int cpuInfo[4] = { -1 };
+    spp_cpuid(cpuInfo, 1);
+    if (cpuInfo[2] & (1 << 23))
+        return true;   // means SPP_POPCNT supported
+    return false;
+}
+#endif
+
+#if defined(SPP_POPCNT_CHECK) && defined(SPP_POPCNT)
+
+static inline uint32_t spp_popcount(uint32_t i)
+{
+    static const bool s_ok = spp_popcount_check(); 
+    return s_ok ? SPP_POPCNT(i) : s_spp_popcount_default(i);
+}
+
+#else
+
+static inline uint32_t spp_popcount(uint32_t i)
+{
+#if defined(SPP_POPCNT)
+    return static_cast<uint32_t>(SPP_POPCNT(i));
+#else
+    return s_spp_popcount_default(i);
+#endif
+}
+
+#endif
+
+#if defined(SPP_POPCNT_CHECK) && defined(SPP_POPCNT64)
+
+static inline uint32_t spp_popcount(uint64_t i)
+{
+    static const bool s_ok = spp_popcount_check(); 
+    return s_ok ? (uint32_t)SPP_POPCNT64(i) : s_spp_popcount_default(i);
+}
+
+#else
+
+static inline uint32_t spp_popcount(uint64_t i)
+{
+#if defined(SPP_POPCNT64)
+    return static_cast<uint32_t>(SPP_POPCNT64(i));
+#else
+    return s_spp_popcount_default(i);
+#endif
+}
+
+#endif
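+
+// e.g. spp_popcount(uint32_t(0xF0F0)) == 8 and spp_popcount(uint64_t(-1)) == 64,
+// whether the hardware intrinsic or the portable fallback above ends up being used.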
+
+// ---------------------------------------------------------------------------
+// SPARSE-TABLE
+// ------------
+// The idea is that a table with (logically) t buckets is divided
+// into t/M *groups* of M buckets each.  (M is a constant, typically
+// 32)  Each group is stored sparsely.
+// Thus, inserting into the table causes some array to grow, which is
+// slow but still constant time.  Lookup involves doing a
+// logical-position-to-sparse-position lookup, which is also slow but
+// constant time.  The larger M is, the slower these operations are
+// but the less overhead (slightly).
+//
+// To store the sparse array, we store a bitmap B, where B[i] = 1 iff
+// bucket i is non-empty.  Then to look up bucket i we really look up
+// array[# of 1s before i in B].  This is constant time for fixed M.
+//
+// Terminology: the position of an item in the overall table (from
+// 1 .. t) is called its "location."  The logical position in a group
+// (from 1 .. M) is called its "position."  The actual location in
+// the array (from 1 .. # of non-empty buckets in the group) is
+// called its "offset."
+// ---------------------------------------------------------------------------
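+
+// Worked example (using M == 8 for brevity): if a group's bitmap is
+// B == 0b00010110, buckets 1, 2 and 4 are non-empty and the dense array holds
+// three items.  Looking up bucket 4 computes popcount(B & 0b00001111) == 2, so
+// bucket 4 is stored at array[2]; bucket 3 has a zero bit in B and is reported
+// empty without touching the array at all.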
+
+template <class T, class Alloc>
+class sparsegroup 
+{
+public:
+    // Basic types
+    typedef typename spp::cvt<T>::type                     value_type;
+    typedef Alloc                                          allocator_type;
+    typedef value_type&                                    reference;
+    typedef const value_type&                              const_reference;
+    typedef value_type*                                    pointer;
+    typedef const value_type*                              const_pointer;
+
+    typedef table_element_adaptor<sparsegroup<T, Alloc> >  element_adaptor;
+    typedef uint8_t                                        size_type;        // max # of buckets
+
+    // These are our special iterators, that go over non-empty buckets in a
+    // group.  These aren't const-only because you can change non-empty buckets.
+    // ---------------------------------------------------------------------
+    typedef pointer                                        ne_iterator;
+    typedef const_pointer                                  const_ne_iterator;
+    typedef std::reverse_iterator<ne_iterator>             reverse_ne_iterator;
+    typedef std::reverse_iterator<const_ne_iterator>       const_reverse_ne_iterator;
+
+    // We'll have versions for our special non-empty iterator too
+    // ----------------------------------------------------------
+    ne_iterator               ne_begin()         { return reinterpret_cast<pointer>(_group); }
+    const_ne_iterator         ne_begin() const   { return reinterpret_cast<pointer>(_group); }
+    const_ne_iterator         ne_cbegin() const  { return reinterpret_cast<pointer>(_group); }
+    ne_iterator               ne_end()           { return reinterpret_cast<pointer>(_group + _num_items()); }
+    const_ne_iterator         ne_end() const     { return reinterpret_cast<pointer>(_group + _num_items()); }
+    const_ne_iterator         ne_cend() const    { return reinterpret_cast<pointer>(_group + _num_items()); }
+    reverse_ne_iterator       ne_rbegin()        { return reverse_ne_iterator(ne_end()); }
+    const_reverse_ne_iterator ne_rbegin() const  { return const_reverse_ne_iterator(ne_cend());  }
+    const_reverse_ne_iterator ne_crbegin() const { return const_reverse_ne_iterator(ne_cend());  }
+    reverse_ne_iterator       ne_rend()          { return reverse_ne_iterator(ne_begin()); }
+    const_reverse_ne_iterator ne_rend() const    { return const_reverse_ne_iterator(ne_cbegin());  }
+    const_reverse_ne_iterator ne_crend() const   { return const_reverse_ne_iterator(ne_cbegin());  }
+
+
+    // This gives us the "default" value to return for an empty bucket.
+    // We just use the default constructor on T, the template type
+    // ----------------------------------------------------------------
+    const_reference default_value() const 
+    {
+        static value_type defaultval = value_type();
+        return defaultval;
+    }
+
+private:
+    // T can be std::pair<K, V>, but we need to return std::pair<const K, V>
+    // ---------------------------------------------------------------------
+    typedef T                                              mutable_value_type;
+    typedef mutable_value_type&                            mutable_reference;
+    typedef const mutable_value_type&                      const_mutable_reference;
+    typedef mutable_value_type*                            mutable_pointer;
+    typedef const mutable_value_type*                      const_mutable_pointer;
+
+#define spp_mutable_ref(x) (*(reinterpret_cast<mutable_pointer>(&(x))))
+#define spp_const_mutable_ref(x) (*(reinterpret_cast<const_mutable_pointer>(&(x))))
+
+    typedef typename Alloc::template rebind<T>::other      value_alloc_type;
+
+    bool _bmtest(size_type i) const   { return !!(_bitmap & (static_cast<group_bm_type>(1) << i)); }
+    void _bmset(size_type i)          { _bitmap |= static_cast<group_bm_type>(1) << i; }
+    void _bmclear(size_type i)        { _bitmap &= ~(static_cast<group_bm_type>(1) << i); }
+
+    bool _bme_test(size_type i) const { return !!(_bm_erased & (static_cast<group_bm_type>(1) << i)); }
+    void _bme_set(size_type i)        { _bm_erased |= static_cast<group_bm_type>(1) << i; }
+    void _bme_clear(size_type i)      { _bm_erased &= ~(static_cast<group_bm_type>(1) << i); }
+
+    bool _bmtest_strict(size_type i) const   
+    { return !!((_bitmap | _bm_erased) & (static_cast<group_bm_type>(1) << i)); }
+
+    
+    static uint32_t _sizing(uint32_t n) 
+    {
+#if !defined(SPP_ALLOC_SZ) || (SPP_ALLOC_SZ == 0)
+        // aggressive allocation first, then decreasing as sparsegroups fill up
+        // --------------------------------------------------------------------
+        static uint8_t s_alloc_batch_sz[SPP_GROUP_SIZE] = { 0 };
+        if (!s_alloc_batch_sz[0])
+        {
+            // 32 bit bitmap
+            // ........ .... .... .. .. .. .. .  .  .  .  .  .  .  .
+            //     8     12   16  18 20 22 24 25 26   ...          32
+            // ------------------------------------------------------
+            uint8_t group_sz          = SPP_GROUP_SIZE / 4;
+            uint8_t group_start_alloc = SPP_GROUP_SIZE / 8; //4;
+            uint8_t alloc_sz          = group_start_alloc;
+            for (int i=0; i<4; ++i)
+            {
+                for (int j=0; j<group_sz; ++j)
+                {
+                    if (j && j % group_start_alloc == 0)
+                        alloc_sz += group_start_alloc;
+                    s_alloc_batch_sz[i * group_sz + j] = alloc_sz;
+                }
+                if (group_start_alloc > 2)
+                    group_start_alloc /= 2;
+                alloc_sz += group_start_alloc;
+            }
+        }
+
+        return n ? static_cast<uint32_t>(s_alloc_batch_sz[n-1]) : 0; // more aggressive alloc at the beginning
+
+#elif (SPP_ALLOC_SZ == 1)
+        // use as little memory as possible - slowest insert/delete in table
+        // -----------------------------------------------------------------
+        return n;
+#else
+        // decent compromise when SPP_ALLOC_SZ == 2
+        // ----------------------------------------
+        static size_type sz_minus_1 = SPP_ALLOC_SZ - 1;
+        return (n + sz_minus_1) & ~sz_minus_1;
+#endif
+    }
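+    // For example, with SPP_GROUP_SIZE == 32 and the default policy above, the
+    // batch table works out to a capacity of 4 for 1-4 items, 8 for 5-8 items,
+    // and then steps of 2 (10, 12, ..., 32), so nearly-empty groups over-allocate
+    // generously while nearly-full groups grow only two slots at a time:
+    //
+    //     _sizing(1) == 4,  _sizing(5) == 8,  _sizing(9) == 10,  _sizing(31) == 32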
+
+    mutable_pointer _allocate_group(Alloc &alloc, uint32_t n /* , bool tight = false */) 
+    {
+        // ignore tight since we don't store num_alloc
+        // num_alloc = (uint8_t)(tight ? n : _sizing(n));
+
+        uint32_t num_alloc = (uint8_t)_sizing(n);
+        _set_num_alloc(num_alloc);
+        mutable_pointer retval = alloc.allocate(static_cast<size_type>(num_alloc));
+        if (retval == NULL) 
+        {
+            // the allocator is supposed to throw an exception if the allocation fails.
+            fprintf(stderr, "sparsehash FATAL ERROR: failed to allocate %d groups\n", num_alloc);
+            exit(1);
+        }
+        return retval;
+    }
+
+    void _free_group(Alloc &alloc, uint32_t num_alloc)
+    {
+        if (_group)  
+        {
+            uint32_t num_buckets = _num_items();
+            if (num_buckets)
+            {
+                mutable_pointer end_it = _group + num_buckets;
+                for (mutable_pointer p = _group; p != end_it; ++p)
+                    p->~mutable_value_type();
+            }
+            alloc.deallocate(_group, (typename allocator_type::size_type)num_alloc);
+            _group = NULL;
+        }
+    }
+
+    // private because should not be called - no allocator!
+    sparsegroup &operator=(const sparsegroup& x);
+
+    static size_type _pos_to_offset(group_bm_type bm, size_type pos)
+    {  
+        //return (size_type)((uint32_t)~((int32_t(-1) + pos) >> 31) & spp_popcount(bm << (SPP_GROUP_SIZE - pos)));
+        //return (size_type)(pos ? spp_popcount(bm << (SPP_GROUP_SIZE - pos)) : 0);
+        return static_cast<size_type>(spp_popcount(bm & ((static_cast<group_bm_type>(1) << pos) - 1)));
+    }
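+    // For example, with bm == 0b101101 (buckets 0, 2, 3 and 5 occupied),
+    // _pos_to_offset(bm, 3) == spp_popcount(0b101) == 2: bucket 3 is stored at
+    // _group[2], i.e. the offset is the number of occupied buckets below pos.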
+
+public:
+    
+    // get_iter() in sparsetable needs it
+    size_type pos_to_offset(size_type pos) const
+    {  
+        return _pos_to_offset(_bitmap, pos); 
+    }
+
+#ifdef _MSC_VER 
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+
+    // Returns the (logical) position in the bm[] array, i, such that
+    // bm[i] is the offset-th set bit in the array.  It is the inverse
+    // of pos_to_offset.  get_pos() uses this function to find the index
+    // of an ne_iterator in the table.  Bit-twiddling from
+    // http://hackersdelight.org/basics.pdf
+    // -----------------------------------------------------------------
+    static size_type offset_to_pos(group_bm_type bm, size_type offset) 
+    {
+        for (; offset > 0; offset--) 
+            bm &= (bm-1);  // remove right-most set bit
+
+        // Clear all bits to the left of the rightmost bit (the &),
+        // and then clear the rightmost bit but set all bits to the
+        // right of it (the -1).
+        // --------------------------------------------------------
+        bm = (bm & -bm) - 1;
+        return  static_cast<size_type>(spp_popcount(bm));
+    }
+
+#ifdef _MSC_VER 
+#pragma warning(pop)
+#endif
+
+    size_type offset_to_pos(size_type offset) const 
+    {
+        return offset_to_pos(_bitmap, offset);
+    }
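+    // Continuing the example above: with bm == 0b101101, offset_to_pos(bm, 2)
+    // strips the two lowest set bits (positions 0 and 2), leaving 0b101000;
+    // (bm & -bm) - 1 then yields 0b111, whose popcount is 3, the position of
+    // the bucket stored at _group[2]. This is the inverse of _pos_to_offset.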
+
+public:
+    // Constructors -- default and copy -- and destructor
+    explicit sparsegroup() :
+        _group(0), _bitmap(0), _bm_erased(0)
+    {
+        _set_num_items(0);
+        _set_num_alloc(0);        
+    }
+
+    sparsegroup(const sparsegroup& x) : 
+        _group(0), _bitmap(x._bitmap), _bm_erased(x._bm_erased)
+    {
+        _set_num_items(0);
+        _set_num_alloc(0);  
+         assert(_group == 0); if (_group) exit(1);
+    }
+
+    sparsegroup(const sparsegroup& x, allocator_type& a) : 
+        _group(0), _bitmap(x._bitmap), _bm_erased(x._bm_erased)
+    {
+        _set_num_items(0);
+        _set_num_alloc(0);  
+
+        uint32_t num_items = x._num_items();
+        if (num_items)
+        {
+            _group = _allocate_group(a, num_items /* , true */);
+            _set_num_items(num_items);
+            std::uninitialized_copy(x._group, x._group + num_items, _group);
+        }
+    }
+
+    ~sparsegroup() { assert(_group == 0); if (_group) exit(1); }
+
+    void destruct(allocator_type& a) { _free_group(a, _num_alloc()); }
+
+    // Many STL algorithms use swap instead of copy constructors
+    void swap(sparsegroup& x) 
+    {
+        using std::swap;
+
+        swap(_group, x._group);
+        swap(_bitmap, x._bitmap);
+        swap(_bm_erased, x._bm_erased);
+#ifdef SPP_STORE_NUM_ITEMS
+        swap(_num_buckets,   x._num_buckets);
+        swap(_num_allocated, x._num_allocated);        
+#endif
+    }
+
+    // It's always nice to be able to clear a table without deallocating it
+    void clear(Alloc &alloc, bool erased) 
+    {
+        _free_group(alloc, _num_alloc());
+        _bitmap = 0;
+        if (erased)
+            _bm_erased = 0;
+        _set_num_items(0);
+        _set_num_alloc(0);
+    }
+
+    // Functions that tell you about size.  Alas, these aren't so useful
+    // because our table is always fixed size.
+    size_type size() const           { return static_cast<size_type>(SPP_GROUP_SIZE); }
+    size_type max_size() const       { return static_cast<size_type>(SPP_GROUP_SIZE); }
+
+    bool empty() const               { return false; }
+
+    // We also may want to know how many *used* buckets there are
+    size_type num_nonempty() const   { return (size_type)_num_items(); }
+
+    // get()/set() are explicitly const/non-const.  You can use [] if
+    // you want something that can be either (potentially more expensive).
+    const_reference get(size_type i) const 
+    {
+        if (_bmtest(i))           // bucket i is occupied
+            return (const_reference)_group[pos_to_offset(i)];
+        else
+            return default_value();  // return the default reference
+    }
+
+    // TODO(csilvers): make protected + friend
+    // This is used by sparse_hashtable to get an element from the table
+    // when we know it exists.
+    reference unsafe_get(size_type i) const
+    {
+        // assert(_bmtest(i));
+        return (reference)_group[pos_to_offset(i)];
+    }
+
+    typedef std::pair<mutable_pointer, bool> SetResult;
+
+    // returns a reference which can be assigned, so we have to create an entry if not 
+    // already there
+    // -------------------------------------------------------------------------------
+    reference mutating_get(Alloc &alloc, size_type i) 
+    {
+        // fills bucket i before getting
+        if (!_bmtest(i))
+        {
+            SetResult sr = set(alloc, i, false);
+            if (!sr.second)
+                ::new (sr.first) mutable_value_type();
+            return *((pointer)sr.first);
+        }
+
+        return _group[pos_to_offset(i)];
+    }
+
+    // Syntactic sugar.  It's easy to return a const reference.  To
+    // return a non-const reference, we need to use the assigner adaptor.
+    const_reference operator[](size_type i) const 
+    {
+        return get(i);
+    }
+
+    element_adaptor operator[](size_type i)
+    {
+        return element_adaptor(this, i);
+    }
+
+private:
+    typedef spp_::integral_constant<bool,
+                                    (spp_::is_relocatable<value_type>::value &&
+                                     spp_::is_same<allocator_type,
+                                                   spp_::libc_allocator_with_realloc<mutable_value_type> >::value)>
+            realloc_and_memmove_ok; 
+
+    // Create space at _group[offset], assuming value_type is relocatable and the
+    // allocator is the default libc_allocator_with_realloc, so we can realloc the
+    // buffer and memmove the tail into place.  Relocatability currently comes from
+    // Google's traits; we should use something like folly::IsRelocatable instead.
+    // return true if the slot was constructed (i.e. contains a valid mutable_value_type)
+    // ----------------------------------------------------------------------------------
+    bool _set_aux(Alloc &alloc, size_type offset, spp_::true_type) 
+    {
+        //static int x=0;  if (++x < 10) printf("x\n"); // check we are getting here
+        
+        uint32_t  num_items = _num_items();
+        uint32_t  num_alloc = _sizing(num_items);
+
+        if (num_items == num_alloc)
+        {
+            num_alloc = _sizing(num_items + 1);
+            _group = alloc.reallocate(_group, num_alloc);
+            _set_num_alloc(num_alloc);
+        }
+
+        for (uint32_t i = num_items; i > offset; --i)
+            memcpy(_group + i, _group + i-1, sizeof(*_group));
+        return false;
+    }
+
+    // Create space at _group[offset], without special assumptions about value_type
+    // and allocator_type, with a default value
+    // return true if the slot was constructed (i.e. contains a valid mutable_value_type)
+    // ---------------------------------------------------------------------------------
+    bool _set_aux(Alloc &alloc, size_type offset, spp_::false_type) 
+    {
+        uint32_t  num_items = _num_items();
+        uint32_t  num_alloc = _sizing(num_items);
+
+        //assert(num_alloc == (uint32_t)_num_allocated);
+        if (num_items < num_alloc)
+        {
+            // create new object at end and rotate it to position
+            ::new (&_group[num_items]) mutable_value_type();
+            std::rotate(_group + offset, _group + num_items, _group + num_items + 1);
+            return true;
+        }
+
+        // This is valid because 0 <= offset <= num_items
+        mutable_pointer p = _allocate_group(alloc, _sizing(num_items + 1));
+        if (offset)
+            std::uninitialized_copy(MK_MOVE_IT(_group), 
+                                    MK_MOVE_IT(_group + offset),
+                                    p);
+        if (num_items > offset)
+            std::uninitialized_copy(MK_MOVE_IT(_group + offset),
+                                    MK_MOVE_IT(_group + num_items),
+                                    p + offset + 1);
+        _free_group(alloc, num_alloc);
+        _group = p;
+        return false;
+    }
+
+public:
+
+    // TODO(austern): Make this exception safe: handle exceptions from
+    // value_type's copy constructor.
+    // return true if the slot was constructed (i.e. contains a valid mutable_value_type)
+    // ----------------------------------------------------------------------------------
+    bool _set(Alloc &alloc, size_type i, size_type offset, bool erased)
+    {
+        if (erased)
+        {
+            // assert(_bme_test(i));
+            _bme_clear(i);
+        }
+
+        if (!_bmtest(i)) 
+        {
+            bool res = _set_aux(alloc, offset, realloc_and_memmove_ok());
+            _incr_num_items();
+            _bmset(i);
+            return res;
+        }
+        return true;
+    }
+
+    // This returns a pair: first is a pointer to the item's location, second is whether
+    // that location is constructed (i.e. contains a valid mutable_value_type)
+    // ---------------------------------------------------------------------------------
+    SetResult set(Alloc &alloc, size_type i, bool erased)
+    {
+        size_type offset = pos_to_offset(i);  
+        bool constructed =  _set(alloc, i, offset, erased); // may change _group pointer
+        return std::make_pair(_group + offset, constructed);
+    }
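+    // Typical caller pattern (see sparsetable::set() below): when `second` is
+    // false the slot is raw memory and the caller must placement-new a value
+    // into it; when it is true the slot already holds a value and can be
+    // assigned to directly:
+    //
+    //     SetResult sr = grp.set(alloc, i, erased);
+    //     if (!sr.second)
+    //         ::new (sr.first) mutable_value_type(val);  // construct in raw slot
+    //     else
+    //         *sr.first = val;                           // overwrite existing value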
+
+    // used in _move_from (where we can move the old value instead of copying it)
+    // -------------------------------------------------------------------------
+    void move(Alloc &alloc, size_type i, reference val)
+    {
+        // assert(!_bmtest(i));
+
+        size_type offset = pos_to_offset(i); 
+        if (!_set(alloc, i, offset, false))
+            ::new (&_group[offset]) mutable_value_type();
+
+        using std::swap;
+        swap(_group[offset], spp_mutable_ref(val)); // called from _move_from, OK to swap
+    }
+    
+    // We let you see if a bucket is non-empty without retrieving it
+    // -------------------------------------------------------------
+    bool test(size_type i) const
+    {
+        return _bmtest(i);
+    }
+
+    // also tests for erased values
+    // ----------------------------
+    bool test_strict(size_type i) const 
+    {
+        return _bmtest_strict(i);
+    }
+
+private:
+    // Shrink the array, assuming value_type has a trivial copy
+    // constructor and destructor, and the allocator_type is the default
+    // libc_allocator_with_realloc.
+    // -----------------------------------------------------------------------
+    void _group_erase_aux(Alloc &alloc, size_type offset, spp_::true_type) 
+    {
+        // static int x=0;  if (++x < 10) printf("Y\n"); // check we are getting here
+        uint32_t  num_items = _num_items();
+        uint32_t  num_alloc = _sizing(num_items);
+
+        if (num_items == 1)
+        {
+            assert(offset == 0);
+            _free_group(alloc, num_alloc);
+            _set_num_alloc(0);
+            return;
+        }
+
+        _group[offset].~mutable_value_type();
+
+        for (size_type i = offset; i < num_items - 1; ++i)
+            memcpy(_group + i, _group + i + 1, sizeof(*_group));
+        
+        if (_sizing(num_items - 1) != num_alloc)
+        {
+            num_alloc = _sizing(num_items - 1);
+            assert(num_alloc);            // because we have at least 1 item left
+            _set_num_alloc(num_alloc);
+            _group = alloc.reallocate(_group, num_alloc);
+        }
+    }
+
+    // Shrink the array, without any special assumptions about value_type and
+    // allocator_type.
+    // --------------------------------------------------------------------------
+    void _group_erase_aux(Alloc &alloc, size_type offset, spp_::false_type) 
+    {
+        uint32_t  num_items = _num_items();
+        uint32_t  num_alloc   = _sizing(num_items);
+
+        if (_sizing(num_items - 1) != num_alloc)
+        {
+            mutable_pointer p = 0;
+            if (num_items > 1)
+            {
+                p = _allocate_group(alloc, num_items - 1);
+                if (offset)
+                    std::uninitialized_copy(MK_MOVE_IT(_group), 
+                                            MK_MOVE_IT(_group + offset), 
+                                            p);
+                if (static_cast<uint32_t>(offset + 1) < num_items)
+                    std::uninitialized_copy(MK_MOVE_IT(_group + offset + 1), 
+                                            MK_MOVE_IT(_group + num_items),
+                                            p + offset);
+            }
+            else
+            {
+                assert(offset == 0);
+                _set_num_alloc(0);
+            }
+            _free_group(alloc, num_alloc);
+            _group = p;
+        }
+        else
+        {
+            std::rotate(_group + offset, _group + offset + 1, _group + num_items);
+            _group[num_items - 1].~mutable_value_type();
+        }
+    }
+
+    void _group_erase(Alloc &alloc, size_type offset)
+    {
+        _group_erase_aux(alloc, offset, realloc_and_memmove_ok());
+    }
+
+public:
+    template <class twod_iter>
+    bool erase_ne(Alloc &alloc, twod_iter &it)
+    {
+        assert(_group && it.col_current != ne_end());
+        size_type offset = (size_type)(it.col_current - ne_begin());
+        size_type pos    = offset_to_pos(offset);
+
+        if (_num_items() <= 1)
+        {
+            clear(alloc, false);
+            it.col_current = 0;
+        }
+        else
+        {
+            _group_erase(alloc, offset);
+            _decr_num_items();
+            _bmclear(pos);
+
+            // in case _group_erase reallocated the buffer
+            it.col_current = reinterpret_cast<pointer>(_group) + offset; 
+        }
+        _bme_set(pos);  // remember that this position has been erased
+        it.advance_past_end();
+        return true;
+    }
+
+
+    // This takes the specified elements out of the group.  This is
+    // "undefining", rather than "clearing".
+    // TODO(austern): Make this exception safe: handle exceptions from
+    // value_type's copy constructor.
+    // ---------------------------------------------------------------
+    void erase(Alloc &alloc, size_type i)
+    {
+        if (_bmtest(i))
+        { 
+            // trivial to erase empty bucket
+            if (_num_items() == 1)
+                clear(alloc, false);
+            else 
+            {
+                _group_erase(alloc, pos_to_offset(i)); 
+                _decr_num_items();
+                _bmclear(i);
+            }
+            _bme_set(i); // remember that this position has been erased
+        }
+    }
+
+    // I/O
+    // We support reading and writing groups to disk.  We don't store
+    // the actual array contents (which we don't know how to store),
+    // just the bitmap and size.  Meant to be used with table I/O.
+    // --------------------------------------------------------------
+    template <typename OUTPUT> bool write_metadata(OUTPUT *fp) const 
+    {
+        // warning: we write 4 or 8 bytes for the bitmap, instead of 6 in the 
+        //          original google sparsehash
+        // ------------------------------------------------------------------
+        if (!sparsehash_internal::write_data(fp, &_bitmap, sizeof(_bitmap)))
+            return false;
+
+        return true;
+    }
+
+    // Reading destroys the old group contents!  Returns true if all was ok.
+    template <typename INPUT> bool read_metadata(Alloc &alloc, INPUT *fp) 
+    {
+        clear(alloc, true);
+
+        if (!sparsehash_internal::read_data(fp, &_bitmap, sizeof(_bitmap)))
+            return false;
+
+        // We'll allocate the space, but we won't fill it: it will be
+        // left as uninitialized raw memory.
+        uint32_t num_items = spp_popcount(_bitmap); // yes, _num_buckets not set
+        _set_num_items(num_items);
+        _group = num_items ? _allocate_group(alloc, num_items/* , true */) : 0;
+        return true;
+    }
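+    // Note: after read_metadata() the group holds the correct bitmap plus
+    // num_items slots of raw, uninitialized memory; the caller is expected to
+    // fill those slots next, e.g. via read_nopointer_data() below or a
+    // value serializer.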
+
+    // Again, only meaningful if value_type is a POD.
+    template <typename INPUT> bool read_nopointer_data(INPUT *fp)
+    {
+        for (ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!sparsehash_internal::read_data(fp, &(*it), sizeof(*it)))
+                return false;
+        return true;
+    }
+
+    // If your keys and values are simple enough, we can write them
+    // to disk for you.  "simple enough" means POD and no pointers.
+    // However, we don't try to normalize endianness.
+    // ------------------------------------------------------------
+    template <typename OUTPUT> bool write_nopointer_data(OUTPUT *fp) const
+    {
+        for (const_ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!sparsehash_internal::write_data(fp, &(*it), sizeof(*it)))
+                return false;
+        return true;
+    }
+
+
+    // Comparisons.  We only need to define == and < -- we get
+    // != > <= >= via relops.h (which we happily included above).
+    // Note the comparisons are pretty arbitrary: we compare
+    // values of the first index that isn't equal (using default
+    // value for empty buckets).
+    // ---------------------------------------------------------
+    bool operator==(const sparsegroup& x) const
+    {
+        return (_bitmap == x._bitmap &&
+                _bm_erased == x._bm_erased && 
+                std::equal(_group, _group + _num_items(), x._group));  
+    }
+
+    bool operator<(const sparsegroup& x) const 
+    {
+        // also from <algorithm>
+        return std::lexicographical_compare(_group, _group + _num_items(), 
+                                            x._group, x._group + x._num_items());
+    }
+
+    bool operator!=(const sparsegroup& x) const { return !(*this == x); }
+    bool operator<=(const sparsegroup& x) const { return !(x < *this); }
+    bool operator> (const sparsegroup& x) const { return x < *this; }
+    bool operator>=(const sparsegroup& x) const { return !(*this < x); }
+
+    void mark()            { _group = (mutable_value_type *)static_cast<uintptr_t>(-1); }
+    bool is_marked() const { return _group == (mutable_value_type *)static_cast<uintptr_t>(-1); }
+
+private:
+    // ---------------------------------------------------------------------------
+    template <class A>
+    class alloc_impl : public A 
+    {
+    public:
+        typedef typename A::pointer pointer;
+        typedef typename A::size_type size_type;
+
+        // Convert a normal allocator to one that has realloc_or_die()
+        explicit alloc_impl(const A& a) : A(a) { }
+
+        // realloc_or_die should only be used when using the default
+        // allocator (libc_allocator_with_realloc).
+        pointer realloc_or_die(pointer /*ptr*/, size_type /*n*/) 
+        {
+            fprintf(stderr, "realloc_or_die is only supported for "
+                    "libc_allocator_with_realloc\n");
+            exit(1);
+            return NULL;
+        }
+    };
+
+    // A template specialization of alloc_impl for
+    // libc_allocator_with_realloc that can handle realloc_or_die.
+    // -----------------------------------------------------------
+    template <class A>
+    class alloc_impl<libc_allocator_with_realloc<A> >
+        : public libc_allocator_with_realloc<A>    
+    {
+    public:
+        typedef typename libc_allocator_with_realloc<A>::pointer pointer;
+        typedef typename libc_allocator_with_realloc<A>::size_type size_type;
+
+        explicit alloc_impl(const libc_allocator_with_realloc<A>& a)
+            : libc_allocator_with_realloc<A>(a) 
+        { }
+
+        pointer realloc_or_die(pointer ptr, size_type n)
+        {
+            pointer retval = this->reallocate(ptr, n);
+            if (retval == NULL) {
+                fprintf(stderr, "sparsehash: FATAL ERROR: failed to reallocate "
+                        "%lu elements for ptr %p", static_cast<unsigned long>(n), ptr);
+                exit(1);
+            }
+            return retval;
+        }
+    };
+
+#ifdef SPP_STORE_NUM_ITEMS
+    uint32_t _num_items() const           { return (uint32_t)_num_buckets; }
+    void     _set_num_items(uint32_t val) { _num_buckets = static_cast<size_type>(val); }
+    void     _incr_num_items()            { ++_num_buckets; }
+    void     _decr_num_items()            { --_num_buckets; }
+    uint32_t _num_alloc() const           { return (uint32_t)_num_allocated; }
+    void     _set_num_alloc(uint32_t val) { _num_allocated = static_cast<size_type>(val); }
+#else
+    uint32_t _num_items() const           { return spp_popcount(_bitmap); }
+    void     _set_num_items(uint32_t )    { }
+    void     _incr_num_items()            { }
+    void     _decr_num_items()            { }
+    uint32_t _num_alloc() const           { return _sizing(_num_items()); }
+    void     _set_num_alloc(uint32_t val) { }
+#endif
+
+    // The actual data
+    // ---------------
+    mutable_value_type * _group;                             // (small) array of T's
+    group_bm_type        _bitmap;
+    group_bm_type        _bm_erased;                         // ones where items have been erased
+
+#ifdef SPP_STORE_NUM_ITEMS
+    size_type            _num_buckets;
+    size_type            _num_allocated;
+#endif
+};
+
+// ---------------------------------------------------------------------------
+// We need a global swap as well
+// ---------------------------------------------------------------------------
+template <class T, class Alloc>
+inline void swap(sparsegroup<T,Alloc> &x, sparsegroup<T,Alloc> &y) 
+{
+    x.swap(y);
+}
+
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+template <class T, class Alloc = libc_allocator_with_realloc<T> >
+class sparsetable 
+{
+private:
+    typedef typename Alloc::template rebind<T>::other     value_alloc_type;
+
+    typedef typename Alloc::template rebind<
+        sparsegroup<T, value_alloc_type> >::other group_alloc_type;
+    typedef typename group_alloc_type::size_type          group_size_type;
+
+    typedef T                                             mutable_value_type;
+    typedef mutable_value_type*                           mutable_pointer;
+    typedef const mutable_value_type*                     const_mutable_pointer;
+
+public:
+    // Basic types
+    // -----------
+    typedef typename spp::cvt<T>::type                    value_type;
+    typedef Alloc                                         allocator_type;
+    typedef typename value_alloc_type::size_type          size_type;
+    typedef typename value_alloc_type::difference_type    difference_type;
+    typedef value_type&                                   reference;
+    typedef const value_type&                             const_reference;
+    typedef value_type*                                   pointer;
+    typedef const value_type*                             const_pointer;
+
+    typedef sparsegroup<T, value_alloc_type>              group_type;
+
+    typedef group_type&                                   GroupsReference;
+    typedef const group_type&                             GroupsConstReference;
+
+    typedef typename group_type::ne_iterator              ColIterator;
+    typedef typename group_type::const_ne_iterator        ColConstIterator;
+
+    typedef table_iterator<sparsetable<T, Alloc> >        iterator;       // defined with index
+    typedef const_table_iterator<sparsetable<T, Alloc> >  const_iterator; // defined with index
+    typedef table_element_adaptor<sparsetable<T, Alloc> > element_adaptor;
+    typedef std::reverse_iterator<const_iterator>         const_reverse_iterator;
+    typedef std::reverse_iterator<iterator>               reverse_iterator;
+
+    // These are our special iterators, that go over non-empty buckets in a
+    // table.  These aren't const only because you can change non-empty buckets.
+    // ----------------------------------------------------------------------
+    typedef Two_d_iterator<T, 
+                           group_type *, 
+                           ColIterator,
+                           std::bidirectional_iterator_tag> ne_iterator;
+
+    typedef Two_d_iterator<const T, 
+                           const group_type *, 
+                           ColConstIterator,
+                           std::bidirectional_iterator_tag> const_ne_iterator;
+
+    // Another special iterator: it frees memory as it iterates (used to resize).
+    // Obviously, you can only iterate over it once, which is why it's an input iterator
+    // ---------------------------------------------------------------------------------
+    typedef Two_d_destructive_iterator<T, 
+                                       group_type *, 
+                                       ColIterator,
+                                       std::input_iterator_tag, 
+                                       allocator_type>       destructive_iterator;
+
+    typedef std::reverse_iterator<ne_iterator>               reverse_ne_iterator;
+    typedef std::reverse_iterator<const_ne_iterator>         const_reverse_ne_iterator;
+
+
+    // Iterator functions
+    // ------------------
+    iterator               begin()         { return iterator(this, 0); }
+    const_iterator         begin() const   { return const_iterator(this, 0); }
+    const_iterator         cbegin() const  { return const_iterator(this, 0); }
+    iterator               end()           { return iterator(this, size()); }
+    const_iterator         end() const     { return const_iterator(this, size()); }
+    const_iterator         cend() const    { return const_iterator(this, size()); }
+    reverse_iterator       rbegin()        { return reverse_iterator(end()); }
+    const_reverse_iterator rbegin() const  { return const_reverse_iterator(cend()); }
+    const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); }
+    reverse_iterator       rend()          { return reverse_iterator(begin()); }
+    const_reverse_iterator rend() const    { return const_reverse_iterator(cbegin()); }
+    const_reverse_iterator crend() const   { return const_reverse_iterator(cbegin()); }
+
+    // Versions for our special non-empty iterator
+    // ------------------------------------------
+    ne_iterator       ne_begin()           { return ne_iterator      (_first_group); }
+    const_ne_iterator ne_begin() const     { return const_ne_iterator(_first_group); }
+    const_ne_iterator ne_cbegin() const    { return const_ne_iterator(_first_group); }
+    ne_iterator       ne_end()             { return ne_iterator      (_last_group); }
+    const_ne_iterator ne_end() const       { return const_ne_iterator(_last_group); }
+    const_ne_iterator ne_cend() const      { return const_ne_iterator(_last_group); }
+
+    reverse_ne_iterator       ne_rbegin()        { return reverse_ne_iterator(ne_end()); }
+    const_reverse_ne_iterator ne_rbegin() const  { return const_reverse_ne_iterator(ne_end());  }
+    const_reverse_ne_iterator ne_crbegin() const { return const_reverse_ne_iterator(ne_end());  }
+    reverse_ne_iterator       ne_rend()          { return reverse_ne_iterator(ne_begin()); }
+    const_reverse_ne_iterator ne_rend() const    { return const_reverse_ne_iterator(ne_begin()); }
+    const_reverse_ne_iterator ne_crend() const   { return const_reverse_ne_iterator(ne_begin()); }
+
+    destructive_iterator destructive_begin()  
+    { 
+        return destructive_iterator(_alloc, _first_group);
+    }
+
+    destructive_iterator destructive_end() 
+    { 
+        return destructive_iterator(_alloc, _last_group); 
+    }
+
+    // How to deal with the proper group
+    static group_size_type num_groups(group_size_type num)
+    {   
+        // how many groups are needed to hold num buckets
+        return num == 0 ? (group_size_type)0 : 
+            (group_size_type)(((num-1) / SPP_GROUP_SIZE) + 1);
+    }
+
+    typename group_type::size_type pos_in_group(size_type i) const 
+    {
+        return static_cast<typename group_type::size_type>(i & SPP_MASK_);
+    }
+    
+    size_type group_num(size_type i) const
+    {
+        return (size_type)(i >> SPP_SHIFT_);
+    }
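+    // For example, assuming SPP_GROUP_SIZE == 32 (so SPP_SHIFT_ == 5 and
+    // SPP_MASK_ == 31), bucket i == 70 lives in group_num(70) == 2 at
+    // pos_in_group(70) == 6, and num_groups(70) == 3 groups are needed to
+    // hold a 70-bucket table.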
+
+    GroupsReference which_group(size_type i) 
+    {
+        return _first_group[group_num(i)];
+    }
+
+    GroupsConstReference which_group(size_type i) const
+    {
+        return _first_group[group_num(i)];
+    }
+
+    void _alloc_group_array(group_size_type sz, group_type *&first, group_type *&last)
+    {
+        if (sz)
+        {
+            first = _group_alloc.allocate((size_type)(sz + 1)); // + 1 for end marker
+            first[sz].mark();                      // for the ne_iterator
+            last = first + sz;
+        }
+    }
+    
+    void _free_group_array(group_type *&first, group_type *&last)
+    {
+        if (first)
+        {
+            _group_alloc.deallocate(first, (group_size_type)(last - first + 1)); // + 1 for end marker
+            first = last = 0;
+        }
+    }
+
+    void _allocate_groups(size_type sz)
+    {
+        if (sz)
+        {
+            _alloc_group_array(sz, _first_group, _last_group);
+            std::uninitialized_fill(_first_group, _last_group, group_type());
+        }
+    }
+
+    void _free_groups()
+    {
+        if (_first_group)
+        {
+            for (group_type *g = _first_group; g != _last_group; ++g)
+                g->destruct(_alloc);
+            _free_group_array(_first_group, _last_group);
+        }
+    }
+
+    void _cleanup()
+    {
+        _free_groups();    // sets _first_group = _last_group = 0
+        _table_size  = 0;
+        _num_buckets = 0;
+    }
+
+    void _init()
+    {
+        _first_group = 0;
+        _last_group  = 0;
+        _table_size  = 0;
+        _num_buckets = 0;
+    }
+
+    void _copy(const sparsetable &o)
+    {
+        _table_size = o._table_size;
+        _num_buckets = o._num_buckets;
+        _alloc = o._alloc;                // todo - copy or move allocator according to...
+        _group_alloc = o._group_alloc;    // http://en.cppreference.com/w/cpp/container/unordered_map/unordered_map
+
+        group_size_type sz = (group_size_type)(o._last_group - o._first_group);
+        if (sz)
+        {
+            _alloc_group_array(sz, _first_group, _last_group);
+            for (group_size_type i=0; i<sz; ++i)
+                new (_first_group + i) group_type(o._first_group[i], _alloc);
+        }
+    }
+
+public:
+    // Constructors -- default, normal (when you specify size), and copy
+    explicit sparsetable(size_type sz = 0, const Alloc &alloc = Alloc()) : 
+        _first_group(0), 
+        _last_group(0),
+        _table_size(sz),
+        _num_buckets(0),
+        _alloc(alloc)  // todo - copy or move allocator according to 
+                       // http://en.cppreference.com/w/cpp/container/unordered_map/unordered_map
+    {
+        _allocate_groups(num_groups(sz));
+    }
+
+    ~sparsetable()
+    {
+        _free_groups();
+    }
+
+    sparsetable(const sparsetable &o) 
+    {
+        _init();
+        _copy(o);
+    }
+
+    sparsetable& operator=(const sparsetable &o)
+    {
+        _cleanup();
+        _copy(o);
+        return *this;
+    }
+
+
+#if !defined(SPP_NO_CXX11_RVALUE_REFERENCES)
+    sparsetable(sparsetable&& o)
+    {
+        _init();
+        this->swap(o);
+    }
+
+    sparsetable(sparsetable&& o, const Alloc &alloc)
+    {
+        _init();
+        this->swap(o);
+        _alloc = alloc; // [gp todo] is this correct?
+    }
+
+    sparsetable& operator=(sparsetable&& o)
+    {
+        _cleanup();
+        this->swap(o);
+        return *this;
+    }
+#endif    
+
+    // Many STL algorithms use swap instead of copy constructors
+    void swap(sparsetable& o) 
+    {
+        using std::swap;
+
+        swap(_first_group, o._first_group);
+        swap(_last_group,  o._last_group);
+        swap(_table_size,  o._table_size);
+        swap(_num_buckets, o._num_buckets);
+        if (_alloc != o._alloc)
+            swap(_alloc, o._alloc);
+        if (_group_alloc != o._group_alloc)
+            swap(_group_alloc, o._group_alloc);
+    }
+
+    // It's always nice to be able to clear a table without deallocating it
+    void clear() 
+    {
+        _free_groups();
+        _num_buckets = 0;
+        _table_size = 0;
+    }
+
+    inline allocator_type get_allocator() const 
+    {
+        return _alloc;
+    }
+
+
+    // Functions that tell you about size.
+    // NOTE: empty() is non-intuitive!  It does not tell you whether the
+    // table holds any non-empty buckets (use num_nonempty() for that).
+    // Instead it says whether any buckets have been allocated at all.
+    // ----------------------------------------------------------------
+    size_type size() const           { return _table_size; }
+    size_type max_size() const       { return _alloc.max_size(); }
+    bool empty() const               { return _table_size == 0; }
+    size_type num_nonempty() const   { return _num_buckets; }
+
+    // OK, we'll let you resize one of these puppies
+    void resize(size_type new_size) 
+    {
+        group_size_type sz = num_groups(new_size);
+        group_size_type old_sz = (group_size_type)(_last_group - _first_group);
+
+        if (sz != old_sz)
+        {
+            // resize group array
+            // ------------------
+            group_type *first = 0, *last = 0;
+            if (sz)
+            {
+                _alloc_group_array(sz, first, last);
+                memcpy(first, _first_group, sizeof(*first) * (std::min)(sz, old_sz));
+            }
+
+            if (sz < old_sz)
+            {
+                for (group_type *g = _first_group + sz; g != _last_group; ++g)
+                    g->destruct(_alloc);
+            }
+            else
+                std::uninitialized_fill(first + old_sz, last, group_type());
+        
+            _free_group_array(_first_group, _last_group);
+            _first_group = first;
+            _last_group  = last;
+        }
+#if 0
+        // used only in test program
+        // todo: fix if sparsetable to be used directly
+        // --------------------------------------------
+        if (new_size < _table_size) 
+        {
+            // lower num_buckets, clear last group
+            if (pos_in_group(new_size) > 0)     // need to clear inside last group
+                groups.back().erase(_alloc, groups.back().begin() + pos_in_group(new_size),
+                                    groups.back().end());
+            _num_buckets = 0;                   // refigure # of used buckets
+            for (const group_type *group = _first_group; group != _last_group; ++group)
+                _num_buckets += group->num_nonempty();
+        }
+#endif
+        _table_size = new_size;
+    }
+
+    // We let you see if a bucket is non-empty without retrieving it
+    // -------------------------------------------------------------
+    bool test(size_type i) const 
+    {
+        // assert(i < _table_size);
+        return which_group(i).test(pos_in_group(i));
+    }
+
+    // also tests for erased values
+    // ----------------------------
+    bool test_strict(size_type i) const 
+    {
+        // assert(i < _table_size);
+        return which_group(i).test_strict(pos_in_group(i));
+    }
+
+    friend struct GrpPos;
+
+    struct GrpPos 
+    { 
+        typedef typename sparsetable::ne_iterator ne_iter;
+        GrpPos(const sparsetable &table, size_type i) :
+            grp(table.which_group(i)), pos(table.pos_in_group(i)) {}
+
+        bool test_strict() const { return grp.test_strict(pos); }
+        bool test() const        { return grp.test(pos); }
+        typename sparsetable::reference unsafe_get() const { return  grp.unsafe_get(pos); }
+        ne_iter get_iter(typename sparsetable::reference ref)
+        {
+            return ne_iter((group_type *)&grp, &ref);
+        }
+
+        void erase(sparsetable &table) // item *must* be present
+        {
+            assert(table._num_buckets);
+            ((group_type &)grp).erase(table._alloc, pos);
+            --table._num_buckets;
+        }
+        
+    private:
+        GrpPos* operator=(const GrpPos&);
+
+        const group_type &grp; 
+        typename group_type::size_type pos; 
+    };
+
+    bool test(iterator pos) const 
+    {
+        return which_group(pos.pos).test(pos_in_group(pos.pos));
+    }
+
+    bool test(const_iterator pos) const 
+    {
+        return which_group(pos.pos).test(pos_in_group(pos.pos));
+    }
+
+    // We only return const_references because it's really hard to
+    // return something settable for empty buckets.  Use set() instead.
+    const_reference get(size_type i) const 
+    {
+        assert(i < _table_size);
+        return which_group(i).get(pos_in_group(i));
+    }
+
+    // TODO(csilvers): make protected + friend
+    // This is used by sparse_hashtable to get an element from the table
+    // when we know it exists (because the caller has called test(i)).
+    // -----------------------------------------------------------------
+    reference unsafe_get(size_type i) const 
+    {
+        assert(i < _table_size);
+        // assert(test(i));
+        return which_group(i).unsafe_get(pos_in_group(i));
+    }
+
+    // TODO(csilvers): make protected + friend element_adaptor
+    reference mutating_get(size_type i) 
+    {   
+        // fills bucket i before getting
+        assert(i < _table_size);
+
+        GroupsReference grp(which_group(i));
+        typename group_type::size_type old_numbuckets = grp.num_nonempty();
+        reference retval = grp.mutating_get(_alloc, pos_in_group(i));
+        _num_buckets += grp.num_nonempty() - old_numbuckets;
+        return retval;
+    }
+
+    // Syntactic sugar.  As in sparsegroup, the non-const version is harder
+    const_reference operator[](size_type i) const 
+    {
+        return get(i);
+    }
+
+    element_adaptor operator[](size_type i) 
+    {
+        return element_adaptor(this, i);
+    }
+
+    // Needed for hashtables; gets the element as an ne_iterator.  Crashes for empty buckets
+    const_ne_iterator get_iter(size_type i) const 
+    {
+        //assert(test(i));    // how can a ne_iterator point to an empty bucket?
+
+        size_type grp_idx = group_num(i);
+
+        return const_ne_iterator(_first_group + grp_idx, 
+                                 (_first_group[grp_idx].ne_begin() +
+                                  _first_group[grp_idx].pos_to_offset(pos_in_group(i))));
+    }
+
+    const_ne_iterator get_iter(size_type i, ColIterator col_it) const
+    {
+        return const_ne_iterator(_first_group + group_num(i), col_it);
+    }
+
+    // For nonempty we can return a non-const version
+    ne_iterator get_iter(size_type i) 
+    {
+        //assert(test(i));    // how can a nonempty_iterator point to an empty bucket?
+        
+        size_type grp_idx = group_num(i);
+
+        return ne_iterator(_first_group + grp_idx,  
+                           (_first_group[grp_idx].ne_begin() +
+                            _first_group[grp_idx].pos_to_offset(pos_in_group(i))));
+    }
+
+    ne_iterator get_iter(size_type i, ColIterator col_it) 
+    {
+        return ne_iterator(_first_group + group_num(i), col_it);
+    }
+
+    // And the reverse transformation.
+    size_type get_pos(const const_ne_iterator& it) const
+    {
+        difference_type current_row = it.row_current - _first_group;
+        difference_type current_col = (it.col_current - _first_group[current_row].ne_begin());
+        return ((current_row * SPP_GROUP_SIZE) +
+                _first_group[current_row].offset_to_pos(current_col));
+    }
+
+    // This returns a reference to the inserted item (which is a copy of val)
+    // The trick is to figure out whether we're replacing or inserting anew
+    // ----------------------------------------------------------------------
+    reference set(size_type i, const_reference val, bool erased = false) 
+    {
+        assert(i < _table_size);
+        group_type &group = which_group(i);
+        typename group_type::size_type old_numbuckets = group.num_nonempty();
+        typename group_type::SetResult sr(group.set(_alloc, pos_in_group(i), erased));
+        if (!sr.second)
+            ::new (sr.first) mutable_value_type(val);
+        else
+            *sr.first = spp_const_mutable_ref(val);
+        _num_buckets += group.num_nonempty() - old_numbuckets;
+        return *((pointer)sr.first);
+    }
+
+    // used in _move_from (where we can move the old value instead of copying it)
+    void move(size_type i, reference val) 
+    {
+        assert(i < _table_size);
+        which_group(i).move(_alloc, pos_in_group(i), val);
+        ++_num_buckets;
+    }
+
+    // This takes the specified elements out of the table. 
+    // --------------------------------------------------
+    void erase(size_type i) 
+    {
+        assert(i < _table_size);
+        
+        GroupsReference grp(which_group(i));
+        typename group_type::size_type old_numbuckets = grp.num_nonempty();
+        grp.erase(_alloc, pos_in_group(i));
+        _num_buckets += grp.num_nonempty() - old_numbuckets;
+    }
+
+    void erase(iterator pos) 
+    {
+        erase(pos.pos);
+    }
+
+    void erase(iterator start_it, iterator end_it)
+    {
+        // This could be more efficient, but then we'd need to figure
+        // out if we spanned groups or not.  Doesn't seem worth it.
+        for (; start_it != end_it; ++start_it)
+            erase(start_it);
+    }
+
+    const_ne_iterator erase(const_ne_iterator it)
+    {
+        ne_iterator res(it);
+        if (res.row_current->erase_ne(_alloc, res))
+            _num_buckets--;
+        return res;
+    }
+
+    const_ne_iterator erase(const_ne_iterator f, const_ne_iterator l)
+    {
+        size_t diff = l - f;
+        while (diff--)
+            f = erase(f);
+        return f;
+    }
+
+    // We support reading and writing tables to disk.  We don't store
+    // the actual array contents (which we don't know how to store),
+    // just the groups and sizes.  Returns true if all went ok.
+
+private:
+    // Every time the disk format changes, this should probably change too
+    typedef unsigned long MagicNumberType;
+    static const MagicNumberType MAGIC_NUMBER = 0x24687531;
+
+    // Old versions of this code wrote all data in 32 bits.  We need to
+    // support those files as well as 64-bit systems.  So we use the
+    // following encoding scheme: values < 2^32-1 are stored in 4 bytes in
+    // big-endian order, while values >= 2^32-1 are stored as 0xFFFFFFFF
+    // followed by 8 bytes in big-endian order.  This causes us to mis-read
+    // old-version files that stored exactly 0xFFFFFFFF, but that is
+    // unlikely to have happened for these particular values.
+    template <typename OUTPUT, typename IntType>
+    static bool write_32_or_64(OUTPUT* fp, IntType value)
+    {
+        if (value < 0xFFFFFFFFULL) {        // fits in 4 bytes
+            if (!sparsehash_internal::write_bigendian_number(fp, value, 4))
+                return false;
+        } 
+        else
+        {
+            if (!sparsehash_internal::write_bigendian_number(fp, 0xFFFFFFFFUL, 4))
+                return false;
+            if (!sparsehash_internal::write_bigendian_number(fp, value, 8))
+                return false;
+        }
+        return true;
+    }
+
+    template <typename INPUT, typename IntType>
+    static bool read_32_or_64(INPUT* fp, IntType *value) 
+    {   // reads into value
+        MagicNumberType first4 = 0;   // a convenient 32-bit unsigned type
+        if (!sparsehash_internal::read_bigendian_number(fp, &first4, 4))
+            return false;
+
+        if (first4 < 0xFFFFFFFFULL) 
+        {
+            *value = first4;
+        } 
+        else
+        {
+            if (!sparsehash_internal::read_bigendian_number(fp, value, 8))
+                return false;
+        }
+        return true;
+    }
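+    // For example, the value 1000 is written as the 4 bytes 00 00 03 e8, while
+    // 2^32 (0x100000000) is written as the marker ff ff ff ff followed by the
+    // 8 bytes 00 00 00 01 00 00 00 00; read_32_or_64() reverses the process by
+    // checking the first 4 bytes for the marker.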
+
+public:
+    // read/write_metadata() and read_write/nopointer_data() are DEPRECATED.
+    // Use serialize() and unserialize(), below, for new code.
+
+    template <typename OUTPUT> 
+    bool write_metadata(OUTPUT *fp) const 
+    {
+        if (!write_32_or_64(fp, MAGIC_NUMBER))  return false;
+        if (!write_32_or_64(fp, _table_size))  return false;
+        if (!write_32_or_64(fp, _num_buckets))  return false;
+
+        for (const group_type *group = _first_group; group != _last_group; ++group)
+            if (group->write_metadata(fp) == false)  
+                return false;
+        return true;
+    }
+
+    // Reading destroys the old table contents!  Returns true if read ok.
+    template <typename INPUT> 
+    bool read_metadata(INPUT *fp)
+    {
+        size_type magic_read = 0;
+        if (!read_32_or_64(fp, &magic_read))  return false;
+        if (magic_read != MAGIC_NUMBER) 
+        {
+            clear();                        // just to be consistent
+            return false;
+        }
+
+        if (!read_32_or_64(fp, &_table_size))  return false;
+        if (!read_32_or_64(fp, &_num_buckets))  return false;
+
+        resize(_table_size);                    // so the vector's sized ok
+        for (group_type *group = _first_group; group != _last_group; ++group)
+            if (group->read_metadata(_alloc, fp) == false)  
+                return false;
+        return true;
+    }
+
+    // This code is identical to that for SparseGroup
+    // If your keys and values are simple enough, we can write them
+    // to disk for you.  "simple enough" means no pointers.
+    // However, we don't try to normalize endianness
+    bool write_nopointer_data(FILE *fp) const 
+    {
+        for (const_ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!fwrite(&*it, sizeof(*it), 1, fp))  
+                return false;
+        return true;
+    }
+
+    // When reading, we have to override the potential const-ness of *it
+    bool read_nopointer_data(FILE *fp) 
+    {
+        for (ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!fread(reinterpret_cast<void*>(&(*it)), sizeof(*it), 1, fp))
+                return false;
+        return true;
+    }
+
+    // INPUT and OUTPUT must be either a FILE, *or* a C++ stream
+    //    (istream, ostream, etc) *or* a class providing
+    //    Read(void*, size_t) and Write(const void*, size_t)
+    //    (respectively), which writes a buffer into a stream
+    //    (which the INPUT/OUTPUT instance presumably owns).
+
+    typedef sparsehash_internal::pod_serializer<value_type> NopointerSerializer;
+
+    // ValueSerializer: a functor.  operator()(OUTPUT*, const value_type&)
+    template <typename ValueSerializer, typename OUTPUT>
+    bool serialize(ValueSerializer serializer, OUTPUT *fp) 
+    {
+        if (!write_metadata(fp))
+            return false;
+        for (const_ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!serializer(fp, *it))  
+                return false;
+        return true;
+    }
+
+    // ValueSerializer: a functor.  operator()(INPUT*, value_type*)
+    template <typename ValueSerializer, typename INPUT>
+    bool unserialize(ValueSerializer serializer, INPUT *fp) 
+    {
+        clear();
+        if (!read_metadata(fp))
+            return false;
+        for (ne_iterator it = ne_begin(); it != ne_end(); ++it) 
+            if (!serializer(fp, &*it))  
+                return false;
+        return true;
+    }
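+    // Illustrative round trip with a POD value_type and FILE-based I/O (the
+    // file name and values here are just an example):
+    //
+    //     sparsetable<int> t(100);
+    //     t.set(3, 42);
+    //     FILE *out = fopen("table.bin", "wb");
+    //     t.serialize(sparsetable<int>::NopointerSerializer(), out);
+    //     fclose(out);
+    //
+    //     sparsetable<int> t2;
+    //     FILE *in = fopen("table.bin", "rb");
+    //     t2.unserialize(sparsetable<int>::NopointerSerializer(), in);
+    //     fclose(in);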
+
+    // Comparisons.  Note the comparisons are pretty arbitrary: we
+    // compare values of the first index that isn't equal (using default
+    // value for empty buckets).
+    bool operator==(const sparsetable& x) const
+    {
+        return (_table_size == x._table_size &&
+                _num_buckets == x._num_buckets &&
+                _first_group == x._first_group);
+    }
+
+    bool operator<(const sparsetable& x) const 
+    {
+        return std::lexicographical_compare(begin(), end(), x.begin(), x.end());
+    }
+    bool operator!=(const sparsetable& x) const { return !(*this == x); }
+    bool operator<=(const sparsetable& x) const { return !(x < *this); }
+    bool operator>(const sparsetable& x) const { return x < *this; }
+    bool operator>=(const sparsetable& x) const { return !(*this < x); }
+
+
+private:
+    // The actual data
+    // ---------------
+    group_type *     _first_group;        
+    group_type *     _last_group;
+    size_type        _table_size;          // how many buckets they want
+    size_type        _num_buckets;         // number of non-empty buckets
+    group_alloc_type _group_alloc;
+    value_alloc_type _alloc;
+};
+
+// We need a global swap as well
+// ---------------------------------------------------------------------------
+template <class T, class Alloc>
+inline void swap(sparsetable<T,Alloc> &x, sparsetable<T,Alloc> &y) 
+{
+    x.swap(y);
+}
+
+
+//  ----------------------------------------------------------------------
+//                  S P A R S E _ H A S H T A B L E
+//  ----------------------------------------------------------------------
+// Hashtable class, used to implement the hashed associative containers
+// hash_set and hash_map.
+//
+// Value: what is stored in the table (each bucket is a Value).
+// Key: something in a 1-to-1 correspondence to a Value, that can be used
+//      to search for a Value in the table (find() takes a Key).
+// HashFcn: Takes a Key and returns an integer, the more unique the better.
+// ExtractKey: given a Value, returns the unique Key associated with it.
+//             Must inherit from unary_function, or at least have a
+//             result_type enum indicating the return type of operator().
+// EqualKey: Given two Keys, says whether they are the same (that is,
+//           if they are both associated with the same Value).
+// Alloc: STL allocator to use to allocate memory.
+//
+//  ----------------------------------------------------------------------
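+// For example, the sparse_hash_map wrapper defined later in this file
+// instantiates this with Value = std::pair<const Key, T> and an ExtractKey
+// functor that returns the pair's first member, while sparse_hash_set uses
+// Value == Key with an identity ExtractKey.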
+
+// The probing method
+// ------------------
+// Linear probing
+// #define JUMP_(key, num_probes)    ( 1 )
+// Quadratic probing
+#define JUMP_(key, num_probes)    ( num_probes )
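+// The probing loop further down adds JUMP_(key, num_probes) to the previous
+// bucket index on each collision, so successive probes for a key hashing to
+// bucket h visit h, h+1, h+3, h+6, h+10, ... (triangular-number offsets)
+// modulo the table size; on a power-of-two table this sequence eventually
+// touches every bucket.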
+
+
+// -------------------------------------------------------------------
+// -------------------------------------------------------------------
+template <class Value, class Key, class HashFcn,
+          class ExtractKey, class SetKey, class EqualKey, class Alloc>
+class sparse_hashtable 
+{
+private:
+    typedef Value                                      mutable_value_type;
+    typedef typename Alloc::template rebind<Value>::other value_alloc_type;
+
+public:
+    typedef Key                                        key_type;
+    typedef typename spp::cvt<Value>::type             value_type;
+    typedef HashFcn                                    hasher;
+    typedef EqualKey                                   key_equal;
+    typedef Alloc                                      allocator_type;
+
+    typedef typename value_alloc_type::size_type       size_type;
+    typedef typename value_alloc_type::difference_type difference_type;
+    typedef value_type&                                reference;
+    typedef const value_type&                          const_reference;
+    typedef value_type*                                pointer;
+    typedef const value_type*                          const_pointer;
+    
+    // Table is the main storage class.
+    typedef sparsetable<mutable_value_type, value_alloc_type> Table;
+    typedef typename Table::ne_iterator               ne_it;
+    typedef typename Table::const_ne_iterator         cne_it;
+    typedef typename Table::destructive_iterator      dest_it;
+    typedef typename Table::ColIterator               ColIterator;
+
+    typedef ne_it                                     iterator;
+    typedef cne_it                                    const_iterator;
+    typedef dest_it                                   destructive_iterator;
+
+    // These come from tr1.  For us they're the same as regular iterators.
+    // -------------------------------------------------------------------
+    typedef iterator                                  local_iterator;
+    typedef const_iterator                            const_local_iterator;
+
+    // How full we let the table get before we resize
+    // ----------------------------------------------
+    static const int HT_OCCUPANCY_PCT; // = 80 (out of 100);
+
+    // How empty we let the table get before we resize lower, by default.
+    // (0.0 means never resize lower.)
+    // It should be less than OCCUPANCY_PCT / 2 or we thrash resizing
+    // ------------------------------------------------------------------
+    static const int HT_EMPTY_PCT; // = 0.4 * HT_OCCUPANCY_PCT;
+
+    // Minimum size we're willing to let hashtables be.
+    // Must be a power of two, and at least 4.
+    // Note, however, that for a given hashtable, the initial size is a
+    // function of the first constructor arg, and may be >HT_MIN_BUCKETS.
+    // ------------------------------------------------------------------
+    static const size_type HT_MIN_BUCKETS = 4;
+
+    // By default, if you don't specify a hashtable size at
+    // construction-time, we use this size.  Must be a power of two, and
+    // at least HT_MIN_BUCKETS.
+    // -----------------------------------------------------------------
+    static const size_type HT_DEFAULT_STARTING_BUCKETS = 32;
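+    // For example, with HT_OCCUPANCY_PCT == 80 (so HT_EMPTY_PCT == 32), a table
+    // with the default 32 buckets is grown once it holds more than about 25
+    // items, and shrunk once it drops below about 10 (unless shrinking has
+    // been disabled).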
+
+    // iterators
+    // ---------
+    iterator       begin()        { return _mk_iterator(table.ne_begin());  }
+    iterator       end()          { return _mk_iterator(table.ne_end());    }
+    const_iterator begin() const  { return _mk_const_iterator(table.ne_cbegin()); }
+    const_iterator end() const    { return _mk_const_iterator(table.ne_cend());   }
+    const_iterator cbegin() const { return _mk_const_iterator(table.ne_cbegin()); }
+    const_iterator cend() const   { return _mk_const_iterator(table.ne_cend());   }
+
+    // These come from tr1 unordered_map.  They iterate over 'bucket' n.
+    // For sparsehashtable, we could consider each 'group' to be a bucket,
+    // I guess, but I don't really see the point.  We'll just consider
+    // bucket n to be the n-th element of the sparsetable, if it's occupied,
+    // or some empty element, otherwise.
+    // ---------------------------------------------------------------------
+    local_iterator begin(size_type i) 
+    {
+        return _mk_iterator(table.test(i) ? table.get_iter(i) : table.ne_end());
+    }
+
+    local_iterator end(size_type i) 
+    {
+        local_iterator it = begin(i);
+        if (table.test(i))
+            ++it;
+        return _mk_iterator(it);
+    }
+
+    const_local_iterator begin(size_type i) const 
+    {
+        return _mk_const_iterator(table.test(i) ? table.get_iter(i) : table.ne_cend());
+    }
+
+    const_local_iterator end(size_type i) const 
+    {
+        const_local_iterator it = begin(i);
+        if (table.test(i))
+            ++it;
+        return _mk_const_iterator(it);
+    }
+
+    const_local_iterator cbegin(size_type i) const { return begin(i); }
+    const_local_iterator cend(size_type i)   const { return end(i); }
+
+    // This is used when resizing
+    // --------------------------
+    destructive_iterator destructive_begin()       { return _mk_destructive_iterator(table.destructive_begin()); }
+    destructive_iterator destructive_end()         { return _mk_destructive_iterator(table.destructive_end());   }
+
+
+    // accessor functions for the things we templatize on, basically
+    // -------------------------------------------------------------
+    hasher hash_funct() const               { return settings; }
+    key_equal key_eq() const                { return key_info; }
+    allocator_type get_allocator() const    { return table.get_allocator(); }
+
+    // Accessor function for statistics gathering.
+    unsigned int num_table_copies() const { return settings.num_ht_copies(); }
+
+private:
+    // This is used as a tag for the copy constructor, saying to destroy its
+    // argument.  We have two ways of destructively copying: with potentially
+    // growing the hashtable as we copy, and without.  To make sure the outside
+    // world can't do a destructive copy, we make the type private.
+    // -----------------------------------------------------------------------
+    enum MoveDontCopyT {MoveDontCopy, MoveDontGrow};
+
+    void _squash_deleted() 
+    {
+        // gets rid of any deleted entries we have
+        // ---------------------------------------
+        if (num_deleted) 
+        {
+            // get rid of deleted before writing
+            sparse_hashtable tmp(MoveDontGrow, *this);
+            swap(tmp);                    // now we are tmp
+        }
+        assert(num_deleted == 0);
+    }
+
+    // creating iterators from sparsetable::ne_iterators
+    // -------------------------------------------------
+    iterator             _mk_iterator(ne_it it) const               { return it; }
+    const_iterator       _mk_const_iterator(cne_it it) const        { return it; }
+    destructive_iterator _mk_destructive_iterator(dest_it it) const { return it; }
+
+public:
+    size_type size() const              { return table.num_nonempty(); }
+    size_type max_size() const          { return table.max_size(); }
+    bool empty() const                  { return size() == 0; }
+    size_type bucket_count() const      { return table.size(); }
+    size_type max_bucket_count() const  { return max_size(); }
+    // These are tr1 methods.  Their idea of 'bucket' doesn't map well to
+    // what we do.  We just say every bucket has 0 or 1 items in it.
+    size_type bucket_size(size_type i) const 
+    {
+        return (size_type)(begin(i) == end(i) ? 0 : 1);
+    }
+
+private:
+    // Because of the above, size_type(-1) is never legal; use it for errors
+    // ---------------------------------------------------------------------
+    static const size_type ILLEGAL_BUCKET = size_type(-1);
+
+    // Used after a string of deletes.  Returns true if we actually shrunk.
+    // TODO(csilvers): take a delta so we can take into account inserts
+    // done after shrinking.  Maybe make part of the Settings class?
+    // --------------------------------------------------------------------
+    bool _maybe_shrink() 
+    {
+        assert((bucket_count() & (bucket_count()-1)) == 0); // is a power of two
+        assert(bucket_count() >= HT_MIN_BUCKETS);
+        bool retval = false;
+
+        // If you construct a hashtable with < HT_DEFAULT_STARTING_BUCKETS,
+        // we'll never shrink until you get relatively big, and we'll never
+        // shrink below HT_DEFAULT_STARTING_BUCKETS.  Otherwise, something
+        // like "dense_hash_set<int> x; x.insert(4); x.erase(4);" will
+        // shrink us down to HT_MIN_BUCKETS buckets, which is too small.
+        // ---------------------------------------------------------------
+        const size_type num_remain = table.num_nonempty();
+        const size_type shrink_threshold = settings.shrink_threshold();
+        if (shrink_threshold > 0 && num_remain < shrink_threshold &&
+            bucket_count() > HT_DEFAULT_STARTING_BUCKETS) 
+        {
+            const float shrink_factor = settings.shrink_factor();
+            size_type sz = (size_type)(bucket_count() / 2);    // find how much we should shrink
+            while (sz > HT_DEFAULT_STARTING_BUCKETS &&
+                   num_remain < static_cast<size_type>(sz * shrink_factor)) 
+            {
+                sz /= 2;                            // stay a power of 2
+            }
+            sparse_hashtable tmp(MoveDontCopy, *this, sz);
+            swap(tmp);                            // now we are tmp
+            retval = true;
+        }
+        settings.set_consider_shrink(false);   // because we just considered it
+        return retval;
+    }
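+
+    // Worked example for _maybe_shrink() (an illustration only, with the
+    // default shrink factor of HT_EMPTY_PCT / 100 == 0.2): a table of 256
+    // buckets left with 20 live elements starts from sz == 256 / 2 == 128,
+    // halves once more to 64 because 20 < 128 * 0.2, and stops there because
+    // 20 >= 64 * 0.2, so the table is rebuilt with 64 buckets.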
+
+    // We'll let you resize a hashtable -- though this makes us copy all!
+    // When you resize, you say, "make it big enough for this many more elements"
+    // Returns true if we actually resized, false if size was already ok.
+    // --------------------------------------------------------------------------
+    bool _resize_delta(size_type delta)
+    {
+        bool did_resize = false;
+        if (settings.consider_shrink()) 
+        {
+            // see if lots of deletes happened
+            if (_maybe_shrink())
+                did_resize = true;
+        }
+        if (table.num_nonempty() >=
+            (std::numeric_limits<size_type>::max)() - delta)
+        {
+            throw_exception(std::length_error("resize overflow"));
+        }
+
+        size_type num_occupied = (size_type)(table.num_nonempty() + num_deleted);
+
+        if (bucket_count() >= HT_MIN_BUCKETS &&
+             (num_occupied + delta) <= settings.enlarge_threshold())
+            return did_resize;                       // we're ok as we are
+
+        // Sometimes, we need to resize just to get rid of all the
+        // "deleted" buckets that are clogging up the hashtable.  So when
+        // deciding whether to resize, count the deleted buckets (which
+        // are currently taking up room).  
+        // -------------------------------------------------------------
+        const size_type needed_size = 
+                  settings.min_buckets((size_type)(num_occupied + delta), (size_type)0);
+
+        if (needed_size <= bucket_count())      // we have enough buckets
+            return did_resize;
+
+        size_type resize_to = settings.min_buckets((size_type)(num_occupied + delta), bucket_count());
+
+        if (resize_to < needed_size &&    // may double resize_to
+            resize_to < (std::numeric_limits<size_type>::max)() / 2) 
+        {
+            // This situation means that we have enough deleted elements,
+            // that once we purge them, we won't actually have needed to
+            // grow.  But we may want to grow anyway: if we just purge one
+            // element, say, we'll have to grow anyway next time we
+            // insert.  Might as well grow now, since we're already going
+            // through the trouble of copying (in order to purge the
+            // deleted elements).
+            const size_type target =
+                static_cast<size_type>(settings.shrink_size((size_type)(resize_to*2)));
+            if (table.num_nonempty() + delta >= target) 
+            {
+                // Good, we won't be below the shrink threshold even if we double.
+                resize_to *= 2;
+            }
+        }
+
+        sparse_hashtable tmp(MoveDontCopy, *this, resize_to);
+        swap(tmp);                             // now we are tmp
+        return true;
+    }
+
+    // Used to actually do the rehashing when we grow/shrink a hashtable
+    // -----------------------------------------------------------------
+    void _copy_from(const sparse_hashtable &ht, size_type min_buckets_wanted)
+    {
+        clear();            // clear table, set num_deleted to 0
+
+        // If we need to change the size of our table, do it now
+        const size_type resize_to = settings.min_buckets(ht.size(), min_buckets_wanted);
+
+        if (resize_to > bucket_count()) 
+        {
+            // we don't have enough buckets
+            table.resize(resize_to);               // sets the number of buckets
+            settings.reset_thresholds(bucket_count());
+        }
+
+        // We use a normal iterator to get the non-empty buckets from ht.
+        // We could use insert() here, but since we know there are
+        // no duplicates, we can be more efficient.
+        assert((bucket_count() & (bucket_count()-1)) == 0);      // a power of two
+        for (const_iterator it = ht.begin(); it != ht.end(); ++it) 
+        {
+            size_type num_probes = 0;              // how many times we've probed
+            size_type bucknum;
+            const size_type bucket_count_minus_one = bucket_count() - 1;
+            for (bucknum = hash(get_key(*it)) & bucket_count_minus_one;
+                 table.test(bucknum);                                   // table.test() OK since no erase()
+                 bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) 
+            {
+                ++num_probes;
+                assert(num_probes < bucket_count()
+                       && "Hashtable is full: an error in key_equal<> or hash<>");
+            }
+            table.set(bucknum, *it, false);               // copies the value to here
+        }
+        settings.inc_num_ht_copies();
+    }
+
+    // Implementation is like _copy_from, but it destroys the table of the
+    // "from" guy by freeing sparsetable memory as we iterate.  This is
+    // useful in resizing, since we're throwing away the "from" guy anyway.
+    // --------------------------------------------------------------------
+    void _move_from(MoveDontCopyT mover, sparse_hashtable &ht,
+                   size_type min_buckets_wanted)
+    {
+        clear(); 
+
+        // If we need to change the size of our table, do it now
+        size_type resize_to;
+        if (mover == MoveDontGrow)
+            resize_to = ht.bucket_count();       // keep same size as old ht
+        else                                     // MoveDontCopy
+            resize_to = settings.min_buckets(ht.size(), min_buckets_wanted);
+        if (resize_to > bucket_count()) 
+        {
+            // we don't have enough buckets
+            table.resize(resize_to);               // sets the number of buckets
+            settings.reset_thresholds(bucket_count());
+        }
+
+        // We use a normal iterator to get the non-empty buckets from ht.
+        // We could use insert() here, but since we know there are
+        // no duplicates, we can be more efficient.
+        assert((bucket_count() & (bucket_count()-1)) == 0);      // a power of two
+        const size_type bucket_count_minus_one = (const size_type)(bucket_count() - 1);
+
+        // THIS IS THE MAJOR LINE THAT DIFFERS FROM COPY_FROM():
+        for (destructive_iterator it = ht.destructive_begin();
+              it != ht.destructive_end(); ++it)
+        {
+            size_type num_probes = 0;
+            size_type bucknum;
+            for (bucknum = hash(get_key(*it)) & bucket_count_minus_one; 
+                 table.test(bucknum);                          // table.test() OK since no erase()
+                 bucknum = (size_type)((bucknum + JUMP_(key, num_probes)) & (bucket_count()-1)))
+            {
+                ++num_probes;
+                assert(num_probes < bucket_count()
+                       && "Hashtable is full: an error in key_equal<> or hash<>");
+            }
+            table.move(bucknum, *it);    // moves the value to here
+        }
+        settings.inc_num_ht_copies();
+    }
+
+
+    // Required by the spec for hashed associative container
+public:
+    // Though the docs say this should be num_buckets, I think it's much
+    // more useful as num_elements.  As a special feature, calling with
+    // req_elements==0 will cause us to shrink if we can, saving space.
+    // -----------------------------------------------------------------
+    void resize(size_type req_elements) 
+    {
+        // resize to this or larger
+        if (settings.consider_shrink() || req_elements == 0)
+            _maybe_shrink();
+        if (req_elements > table.num_nonempty())    // we only grow
+            _resize_delta((size_type)(req_elements - table.num_nonempty()));
+    }
+
+    // Get and change the value of shrink_factor and enlarge_factor.  The
+    // description at the beginning of this file explains how to choose
+    // the values.  Setting the shrink parameter to 0.0 ensures that the
+    // table never shrinks.
+    // ------------------------------------------------------------------
+    void get_resizing_parameters(float* shrink, float* grow) const 
+    {
+        *shrink = settings.shrink_factor();
+        *grow = settings.enlarge_factor();
+    }
+
+    float get_shrink_factor() const  { return settings.shrink_factor(); }
+    float get_enlarge_factor() const { return settings.enlarge_factor(); }
+
+    void set_resizing_parameters(float shrink, float grow) {
+        settings.set_resizing_parameters(shrink, grow);
+        settings.reset_thresholds(bucket_count());
+    }
+
+    void set_shrink_factor(float shrink)
+    {                                           
+        set_resizing_parameters(shrink, get_enlarge_factor());
+    }
+
+    void set_enlarge_factor(float grow)
+    {
+        set_resizing_parameters(get_shrink_factor(), grow);
+    }
+
+    // CONSTRUCTORS -- as required by the specs, we take a size,
+    // but also let you specify a hashfunction, key comparator,
+    // and key extractor.  We also define a copy constructor and =.
+    // DESTRUCTOR -- the default is fine, surprisingly.
+    // ------------------------------------------------------------
+    explicit sparse_hashtable(size_type expected_max_items_in_table = 0,
+                              const HashFcn& hf = HashFcn(),
+                              const EqualKey& eql = EqualKey(),
+                              const ExtractKey& ext = ExtractKey(),
+                              const SetKey& set = SetKey(),
+                              const Alloc& alloc = Alloc())
+        : settings(hf),
+          key_info(ext, set, eql),
+          num_deleted(0),
+          table((expected_max_items_in_table == 0
+                 ? HT_DEFAULT_STARTING_BUCKETS
+                 : settings.min_buckets(expected_max_items_in_table, 0)),
+                value_alloc_type(alloc)) 
+    {
+        settings.reset_thresholds(bucket_count());
+    }
+
+    // As a convenience for resize(), we allow an optional second argument
+    // which lets you make this new hashtable a different size than ht.
+    // We also provide a mechanism of saying you want to "move" the ht argument
+    // into us instead of copying.
+    // ------------------------------------------------------------------------
+    sparse_hashtable(const sparse_hashtable& ht,
+                     size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS)
+        : settings(ht.settings),
+          key_info(ht.key_info),
+          num_deleted(0),
+          table(0)
+    {
+        settings.reset_thresholds(bucket_count());
+        _copy_from(ht, min_buckets_wanted); 
+    }
+
+#if !defined(SPP_NO_CXX11_RVALUE_REFERENCES)
+
+    sparse_hashtable(sparse_hashtable&& o) :
+        settings(std::move(o.settings)),
+        key_info(std::move(o.key_info)),
+        num_deleted(o.num_deleted),
+        table(std::move(o.table))
+    {
+    }
+
+    sparse_hashtable(sparse_hashtable&& o, const Alloc& alloc) :
+        settings(std::move(o.settings)),
+        key_info(std::move(o.key_info)),
+        num_deleted(o.num_deleted),
+        table(std::move(o.table), alloc)
+    {
+    }
+
+    sparse_hashtable& operator=(sparse_hashtable&& o)
+    {
+        using std::swap;
+
+        sparse_hashtable tmp(std::move(o));
+        swap(tmp, *this);
+        return *this;
+    }
+#endif    
+
+    sparse_hashtable(MoveDontCopyT mover, 
+                     sparse_hashtable& ht,
+                     size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS)
+        : settings(ht.settings),
+          key_info(ht.key_info),
+          num_deleted(0),
+          table(min_buckets_wanted, ht.table.get_allocator())
+    {
+        settings.reset_thresholds(bucket_count());
+        _move_from(mover, ht, min_buckets_wanted); 
+    }
+
+    sparse_hashtable& operator=(const sparse_hashtable& ht)
+    {
+        if (&ht == this) 
+            return *this;        // don't copy onto ourselves
+        settings = ht.settings;
+        key_info = ht.key_info;
+        num_deleted = ht.num_deleted;
+
+        // _copy_from() calls clear and sets num_deleted to 0 too
+        _copy_from(ht, HT_MIN_BUCKETS);
+
+        // we purposefully don't copy the allocator, which may not be copyable
+        return *this;
+    }
+
+    // Many STL algorithms use swap instead of copy constructors
+    void swap(sparse_hashtable& ht) 
+    {
+        using std::swap;
+
+        swap(settings, ht.settings);
+        swap(key_info, ht.key_info);
+        swap(num_deleted, ht.num_deleted);
+        table.swap(ht.table);
+        settings.reset_thresholds(bucket_count());  // also resets consider_shrink
+        ht.settings.reset_thresholds(ht.bucket_count());
+        // we purposefully don't swap the allocator, which may not be swap-able
+    }
+
+    // It's always nice to be able to clear a table without deallocating it
+    void clear() 
+    {
+        if (!empty() || num_deleted != 0) 
+        {
+            table.clear();
+            table = Table(HT_DEFAULT_STARTING_BUCKETS);
+        }
+        settings.reset_thresholds(bucket_count());
+        num_deleted = 0;
+    }
+
+    // LOOKUP ROUTINES
+private:
+    
+    enum pos_type { pt_empty = 0, pt_erased, pt_full };
+    // -------------------------------------------------------------------
+    class Position
+    {
+    public:
+
+        Position() : _t(pt_empty) {}
+        Position(pos_type t, size_type idx) : _t(t), _idx(idx) {}
+        
+        pos_type  _t;
+        size_type _idx;
+    };
+
+    // Returns a Position:
+    //   - '_t' is pt_full if the key is already present, pt_empty or pt_erased otherwise.
+    //   - '_idx' is the bucket holding the key, or the bucket where it should go.
+    // Note: because of deletions, where-to-insert is not trivial: it's the
+    // first erased bucket we see, as long as we don't find the key later.
+    // -------------------------------------------------------------------
+    Position _find_position(const key_type &key) const
+    {
+        size_type num_probes = 0;                    // how many times we've probed
+        const size_type bucket_count_minus_one = (const size_type)(bucket_count() - 1);
+        size_type bucknum = hash(key) & bucket_count_minus_one; 
+        Position pos;
+
+        while (1)
+        {    
+            // probe until something happens
+            // -----------------------------
+            typename Table::GrpPos grp_pos(table, bucknum);
+
+            if (!grp_pos.test_strict())
+            {
+                // bucket is empty => key not present
+                return pos._t ? pos : Position(pt_empty, bucknum);
+            } 
+            else if (grp_pos.test())
+            {
+                reference ref(grp_pos.unsafe_get());
+
+                if (equals(key, get_key(ref)))
+                    return Position(pt_full, bucknum);
+            }
+            else if (pos._t == pt_empty)
+            {
+                // first erased position
+                pos._t   = pt_erased;
+                pos._idx = bucknum;
+            }
+            
+            ++num_probes;                        // we're doing another probe
+            bucknum = (size_type)((bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one);
+            assert(num_probes < bucket_count()
+                   && "Hashtable is full: an error in key_equal<> or hash<>");
+        }
+    }
+
+public:
+    // I hate to duplicate find() like that, but it is 
+    // significantly faster to not have the intermediate pair
+    // ------------------------------------------------------------------
+    iterator find(const key_type& key)
+    {
+        size_type num_probes = 0;              // how many times we've probed
+        const size_type bucket_count_minus_one = bucket_count() - 1;
+        size_type bucknum = hash(key) & bucket_count_minus_one;
+        
+        while (1)                        // probe until something happens
+        {            
+            typename Table::GrpPos grp_pos(table, bucknum);
+
+            if (!grp_pos.test_strict())
+                return end();            // bucket is empty
+            if (grp_pos.test())
+            {
+                reference ref(grp_pos.unsafe_get());
+
+                if (equals(key, get_key(ref)))
+                    return grp_pos.get_iter(ref);
+            }
+            ++num_probes;                        // we're doing another probe
+            bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+            assert(num_probes < bucket_count()
+                   && "Hashtable is full: an error in key_equal<> or hash<>");
+        }
+    }
+
+    // Wish I could avoid the duplicate find() const and non-const.
+    // ------------------------------------------------------------
+    const_iterator find(const key_type& key) const
+    {
+        size_type num_probes = 0;              // how many times we've probed
+        const size_type bucket_count_minus_one = bucket_count() - 1;
+        size_type bucknum = hash(key) & bucket_count_minus_one;
+
+        while (1)                        // probe until something happens
+        {         
+            typename Table::GrpPos grp_pos(table, bucknum);
+
+            if (!grp_pos.test_strict())
+                return end();            // bucket is empty
+            else if (grp_pos.test())
+            {
+                reference ref(grp_pos.unsafe_get());
+
+                if (equals(key, get_key(ref)))
+                    return _mk_const_iterator(table.get_iter(bucknum, &ref));
+            }
+            ++num_probes;                        // we're doing another probe
+            bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+            assert(num_probes < bucket_count()
+                   && "Hashtable is full: an error in key_equal<> or hash<>");
+        }
+    }
+
+    // This is a tr1 method: the bucket a given key is in, or what bucket
+    // it would be put in, if it were to be inserted.  Shrug.
+    // ------------------------------------------------------------------
+    size_type bucket(const key_type& key) const 
+    {
+        Position pos = _find_position(key);
+        return pos._idx;
+    }
+
+    // Counts how many elements have key key.  For maps, it's either 0 or 1.
+    // ---------------------------------------------------------------------
+    size_type count(const key_type &key) const
+    {
+        Position pos = _find_position(key);
+        return (size_type)(pos._t == pt_full ? 1 : 0);
+    }
+
+    // Likewise, equal_range doesn't really make sense for us.  Oh well.
+    // -----------------------------------------------------------------
+    std::pair<iterator,iterator> equal_range(const key_type& key) 
+    {
+        iterator pos = find(key);      // either an iterator or end
+        if (pos == end()) 
+            return std::pair<iterator,iterator>(pos, pos);
+        else 
+        {
+            const iterator startpos = pos++;
+            return std::pair<iterator,iterator>(startpos, pos);
+        }
+    }
+
+    std::pair<const_iterator,const_iterator> equal_range(const key_type& key) const 
+    {
+        const_iterator pos = find(key);      // either an iterator or end
+        if (pos == end()) 
+            return std::pair<const_iterator,const_iterator>(pos, pos);
+        else
+        {
+            const const_iterator startpos = pos++;
+            return std::pair<const_iterator,const_iterator>(startpos, pos);
+        }
+    }
+
+
+    // INSERTION ROUTINES
+private:
+    // Private method used by insert_noresize and find_or_insert.
+    reference _insert_at(const_reference obj, size_type pos, bool erased) 
+    {
+        if (size() >= max_size()) 
+        {
+            throw_exception(std::length_error("insert overflow"));
+        }
+        if (erased)
+        {
+            assert(num_deleted);
+            --num_deleted;
+        }
+        return table.set(pos, obj, erased);
+    }
+
+    // If you know *this is big enough to hold obj, use this routine
+    std::pair<iterator, bool> _insert_noresize(const_reference obj) 
+    {
+        Position pos = _find_position(get_key(obj));
+        bool already_there = (pos._t == pt_full);
+
+        if (!already_there)
+        {
+            reference ref(_insert_at(obj, pos._idx, pos._t == pt_erased));
+            return std::pair<iterator, bool>(_mk_iterator(table.get_iter(pos._idx, &ref)), true);
+        }
+        return std::pair<iterator,bool>(_mk_iterator(table.get_iter(pos._idx)), false);
+    }
+
+    // Specializations of insert(it, it) depending on the power of the iterator:
+    // (1) Iterator supports operator-, resize before inserting
+    template <class ForwardIterator>
+    void _insert(ForwardIterator f, ForwardIterator l, std::forward_iterator_tag /*unused*/)
+    {
+        int64_t dist = std::distance(f, l);
+        if (dist < 0 ||  static_cast<size_t>(dist) >= (std::numeric_limits<size_type>::max)()) 
+            throw_exception(std::length_error("insert-range overflow"));
+
+        _resize_delta(static_cast<size_type>(dist));
+
+        for (; dist > 0; --dist, ++f)
+            _insert_noresize(*f);
+    }
+
+    // (2) Arbitrary iterator, can't tell how much to resize
+    template <class InputIterator>
+    void _insert(InputIterator f, InputIterator l, std::input_iterator_tag /*unused*/) 
+    {
+        for (; f != l; ++f)
+            _insert(*f);
+    }
+
+public:
+
+#if 0 && !defined(SPP_NO_CXX11_VARIADIC_TEMPLATES)
+    template <class... Args>
+    pair<iterator, bool> emplace(Args&&... args) 
+    {
+        return rep.emplace_unique(std::forward<Args>(args)...);
+    }
+
+    template <class... Args>
+    iterator emplace_hint(const_iterator p, Args&&... args)
+    {
+        return rep.emplace_unique(std::forward<Args>(args)...).first;
+    }
+#endif
+
+    // This is the normal insert routine, used by the outside world
+    std::pair<iterator, bool> insert(const_reference obj)
+    {
+        _resize_delta(1);                      // adding an object, grow if need be
+        return _insert_noresize(obj);
+    }
+
+    // When inserting a lot at a time, we specialize on the type of iterator
+    template <class InputIterator>
+    void insert(InputIterator f, InputIterator l) 
+    {
+        // specializes on iterator type
+        _insert(f, l,
+               typename std::iterator_traits<InputIterator>::iterator_category());
+    }
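+
+    // Note (an illustration, not part of the original comments): a range with
+    // forward or stronger iterators -- e.g. begin()/end() of a std::vector --
+    // dispatches to the forward_iterator_tag overload above and resizes once
+    // for the whole std::distance(f, l); a pure input-iterator range is
+    // inserted element by element instead.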
+
+    // DefaultValue is a functor that takes a key and returns a value_type
+    // representing the default value to be inserted if none is found.
+    template <class DefaultValue>
+    value_type& find_or_insert(const key_type& key)
+    {
+        size_type num_probes = 0;              // how many times we've probed
+        const size_type bucket_count_minus_one = bucket_count() - 1;
+        size_type bucknum = hash(key) & bucket_count_minus_one;
+        DefaultValue default_value;
+        size_type erased_pos = 0;
+        bool erased = false;
+
+        while (1)                        // probe until something happens
+        {            
+            typename Table::GrpPos grp_pos(table, bucknum);
+
+            if (!grp_pos.test_strict())
+            {
+                // not found
+                if (_resize_delta(1))
+                {
+                    // needed to rehash to make room
+                    // Since we resized, we can't use pos, so recalculate where to insert.
+                    return *(_insert_noresize(default_value(key)).first);
+                } 
+                else 
+                {
+                    // no need to rehash, insert right here
+                    return _insert_at(default_value(key), erased ? erased_pos : bucknum, erased);
+                }
+            }
+            if (grp_pos.test())
+            {
+                reference ref(grp_pos.unsafe_get());
+
+                if (equals(key, get_key(ref)))
+                    return ref;
+            }
+            else if (!erased)
+            {
+                // first erased position
+                erased_pos = bucknum;
+                erased = true;
+            }
+
+            ++num_probes;                        // we're doing another probe
+            bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+            assert(num_probes < bucket_count()
+                   && "Hashtable is full: an error in key_equal<> or hash<>");
+        }
+    }
+
+    size_type erase(const key_type& key) 
+    {
+        size_type num_probes = 0;              // how many times we've probed
+        const size_type bucket_count_minus_one = bucket_count() - 1;
+        size_type bucknum = hash(key) & bucket_count_minus_one;
+        
+        while (1)                        // probe until something happens
+        {            
+            typename Table::GrpPos grp_pos(table, bucknum);
+
+            if (!grp_pos.test_strict())
+                return 0;            // bucket is empty, we deleted nothing
+            if (grp_pos.test())
+            {
+                reference ref(grp_pos.unsafe_get());
+
+                if (equals(key, get_key(ref)))
+                {
+                    grp_pos.erase(table);
+                    ++num_deleted;
+                    settings.set_consider_shrink(true); // will think about shrink after next insert
+                    return 1;                           // because we deleted one thing
+                }
+            }
+            ++num_probes;                        // we're doing another probe
+            bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+            assert(num_probes < bucket_count()
+                   && "Hashtable is full: an error in key_equal<> or hash<>");
+        }
+    }
+
+    const_iterator erase(const_iterator pos)
+    {
+        if (pos == cend()) 
+            return cend();                 // sanity check
+        
+        const_iterator nextpos = table.erase(pos);
+        ++num_deleted;
+        settings.set_consider_shrink(true);
+        return nextpos;
+    }
+
+    const_iterator erase(const_iterator f, const_iterator l) 
+    {
+        if (f == cend()) 
+            return cend();                // sanity check
+
+        size_type num_before = table.num_nonempty();
+        const_iterator nextpos = table.erase(f, l);
+        num_deleted += num_before - table.num_nonempty();
+        settings.set_consider_shrink(true);
+        return nextpos;
+    }
+
+    // Deleted key routines - just to keep the google test framework happy;
+    // we don't actually use the deleted key.
+    // ---------------------------------------------------------------
+    void set_deleted_key(const key_type& key)   
+    {
+        _squash_deleted();
+        key_info.delkey = key;
+    }
+
+    void clear_deleted_key()
+    {
+        _squash_deleted();
+    }
+
+    key_type deleted_key() const 
+    {
+         return key_info.delkey;
+    }
+
+
+    bool operator==(const sparse_hashtable& ht) const 
+    {
+        if (this == &ht) 
+            return true;
+
+        if (size() != ht.size()) 
+            return false;
+
+        for (const_iterator it = begin(); it != end(); ++it) 
+        {
+            const_iterator it2 = ht.find(get_key(*it));
+            if ((it2 == ht.end()) || (*it != *it2)) 
+                return false;
+        }
+
+        return true;
+    }
+
+    bool operator!=(const sparse_hashtable& ht) const
+    {
+        return !(*this == ht);
+    }
+
+
+    // I/O
+    // We support reading and writing hashtables to disk.  NOTE that
+    // this only stores the hashtable metadata, not the stuff you've
+    // actually put in the hashtable!  Alas, since I don't know how to
+    // write a hasher or key_equal, you have to make sure everything
+    // but the table is the same.  We compact before writing.
+    //
+    // The OUTPUT type needs to support a Write() operation. File and
+    // OutputBuffer are appropriate types to pass in.
+    //
+    // The INPUT type needs to support a Read() operation. File and
+    // InputBuffer are appropriate types to pass in.
+    // -------------------------------------------------------------
+    template <typename OUTPUT>
+    bool write_metadata(OUTPUT *fp) 
+    {
+        _squash_deleted();           // so we don't have to worry about delkey
+        return table.write_metadata(fp);
+    }
+
+    template <typename INPUT>
+    bool read_metadata(INPUT *fp) 
+    {
+        num_deleted = 0;            // since we got rid before writing
+        const bool result = table.read_metadata(fp);
+        settings.reset_thresholds(bucket_count());
+        return result;
+    }
+
+    // Only meaningful if value_type is a POD.
+    template <typename OUTPUT>
+    bool write_nopointer_data(OUTPUT *fp)
+    {
+        return table.write_nopointer_data(fp);
+    }
+
+    // Only meaningful if value_type is a POD.
+    template <typename INPUT>
+    bool read_nopointer_data(INPUT *fp)
+    {
+        return table.read_nopointer_data(fp);
+    }
+
+    // INPUT and OUTPUT must be either a FILE, *or* a C++ stream
+    //    (istream, ostream, etc) *or* a class providing
+    //    Read(void*, size_t) and Write(const void*, size_t)
+    //    (respectively), which writes a buffer into a stream
+    //    (which the INPUT/OUTPUT instance presumably owns).
+
+    typedef sparsehash_internal::pod_serializer<value_type> NopointerSerializer;
+
+    // ValueSerializer: a functor.  operator()(OUTPUT*, const value_type&)
+    template <typename ValueSerializer, typename OUTPUT>
+    bool serialize(ValueSerializer serializer, OUTPUT *fp)
+    {
+        _squash_deleted();           // so we don't have to worry about delkey
+        return table.serialize(serializer, fp);
+    }
+
+    // ValueSerializer: a functor.  operator()(INPUT*, value_type*)
+    template <typename ValueSerializer, typename INPUT>
+    bool unserialize(ValueSerializer serializer, INPUT *fp)
+    {
+        num_deleted = 0;            // since we got rid before writing
+        const bool result = table.unserialize(serializer, fp);
+        settings.reset_thresholds(bucket_count());
+        return result;
+    }
+
+private:
+
+    // Package templated functors with the other types to eliminate memory
+    // needed for storing these zero-size operators.  Since ExtractKey and
+    // hasher's operator() might have the same function signature, they
+    // must be packaged in different classes.
+    // -------------------------------------------------------------------------
+    struct Settings :
+        sparsehash_internal::sh_hashtable_settings<key_type, hasher,
+                                                   size_type, HT_MIN_BUCKETS>
+    {
+        explicit Settings(const hasher& hf)
+            : sparsehash_internal::sh_hashtable_settings<key_type, hasher, size_type, 
+              HT_MIN_BUCKETS>
+              (hf, HT_OCCUPANCY_PCT / 100.0f, HT_EMPTY_PCT / 100.0f) {}
+    };
+
+    // KeyInfo stores the deleted key and packages the zero-size functors:
+    // ExtractKey, SetKey and EqualKey.
+    // -------------------------------------------------------------------
+    class KeyInfo : public ExtractKey, public SetKey, public EqualKey
+    {
+    public:
+        KeyInfo(const ExtractKey& ek, const SetKey& sk, const EqualKey& eq)
+            : ExtractKey(ek), SetKey(sk), EqualKey(eq) 
+        {
+        }
+
+        // We want to return the exact same type as ExtractKey: Key or const Key&
+        typename ExtractKey::result_type get_key(const_reference v) const
+        {
+            return ExtractKey::operator()(v);
+        }
+
+        bool equals(const key_type& a, const key_type& b) const 
+        {
+            return EqualKey::operator()(a, b);
+        }
+
+        typename spp_::remove_const<key_type>::type delkey;
+    };
+
+    // Utility functions to access the templated operators
+    size_t hash(const key_type& v) const
+    {
+        return settings.hash(v);
+    }
+
+    bool equals(const key_type& a, const key_type& b) const 
+    {
+        return key_info.equals(a, b);
+    }
+
+    typename ExtractKey::result_type get_key(const_reference v) const 
+    {
+        return key_info.get_key(v);
+    }
+    
+private:
+    // Actual data
+    // -----------
+    Settings  settings;
+    KeyInfo   key_info;
+    size_type num_deleted; 
+    Table     table;         // holds num_buckets and num_elements too
+};
+
+
+// We need a global swap as well
+// -----------------------------
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A>
+inline void swap(sparse_hashtable<V,K,HF,ExK,SetK,EqK,A> &x,
+                 sparse_hashtable<V,K,HF,ExK,SetK,EqK,A> &y) 
+{
+    x.swap(y);
+}
+
+#undef JUMP_
+
+// -----------------------------------------------------------------------------
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A>
+const typename sparse_hashtable<V,K,HF,ExK,SetK,EqK,A>::size_type
+sparse_hashtable<V,K,HF,ExK,SetK,EqK,A>::ILLEGAL_BUCKET;
+
+// How full we let the table get before we resize.  Knuth says .8 is
+// good -- higher causes us to probe too much, though it saves memory;
+// the default here is a more conservative 50%.
+// -----------------------------------------------------------------------------
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A>
+const int sparse_hashtable<V,K,HF,ExK,SetK,EqK,A>::HT_OCCUPANCY_PCT = 50;
+
+// How empty we let the table get before we resize lower.
+// It should be less than OCCUPANCY_PCT / 2 or we thrash resizing
+// -----------------------------------------------------------------------------
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A>
+const int sparse_hashtable<V,K,HF,ExK,SetK,EqK,A>::HT_EMPTY_PCT
+= static_cast<int>(0.4 *
+                   sparse_hashtable<V,K,HF,ExK,SetK,EqK,A>::HT_OCCUPANCY_PCT);
+
+
+
+
+//  ----------------------------------------------------------------------
+//                   S P A R S E _ H A S H _ M A P
+//  ----------------------------------------------------------------------
+template <class Key, class T,
+          class HashFcn = spp_hash<Key>,  
+          class EqualKey = std::equal_to<Key>,
+          class Alloc = libc_allocator_with_realloc<std::pair<const Key, T> > >
+class sparse_hash_map 
+{
+private:
+    // Apparently select1st is not stl-standard, so we define our own
+    struct SelectKey 
+    {
+        typedef const Key& result_type;
+
+        inline const Key& operator()(const std::pair<const Key, T>& p) const 
+        {
+            return p.first;
+        }
+    };
+
+    struct SetKey 
+    {
+        inline void operator()(std::pair<const Key, T>* value, const Key& new_key) const
+        {
+            *const_cast<Key*>(&value->first) = new_key;
+        }
+    };
+
+    // For operator[].
+    struct DefaultValue 
+    {
+        inline std::pair<const Key, T> operator()(const Key& key)  const
+        {
+            return std::make_pair(key, T());
+        }
+    };
+
+    // The actual data
+    typedef sparse_hashtable<std::pair<typename spp_::remove_const<Key>::type, T>, Key, HashFcn, SelectKey,
+                             SetKey, EqualKey, Alloc> ht;
+
+public:
+    typedef typename ht::key_type             key_type;
+    typedef T                                 data_type;
+    typedef T                                 mapped_type;
+    typedef typename std::pair<const Key, T>  value_type;
+    typedef typename ht::hasher               hasher;
+    typedef typename ht::key_equal            key_equal;
+    typedef Alloc                             allocator_type;
+
+    typedef typename ht::size_type            size_type;
+    typedef typename ht::difference_type      difference_type;
+    typedef typename ht::pointer              pointer;
+    typedef typename ht::const_pointer        const_pointer;
+    typedef typename ht::reference            reference;
+    typedef typename ht::const_reference      const_reference;
+
+    typedef typename ht::iterator             iterator;
+    typedef typename ht::const_iterator       const_iterator;
+    typedef typename ht::local_iterator       local_iterator;
+    typedef typename ht::const_local_iterator const_local_iterator;
+
+    // Iterator functions
+    iterator       begin()                         { return rep.begin(); }
+    iterator       end()                           { return rep.end(); }
+    const_iterator begin() const                   { return rep.cbegin(); }
+    const_iterator end() const                     { return rep.cend(); }
+    const_iterator cbegin() const                  { return rep.cbegin(); }
+    const_iterator cend() const                    { return rep.cend(); }
+
+    // These come from tr1's unordered_map. For us, a bucket has 0 or 1 elements.
+    local_iterator begin(size_type i)              { return rep.begin(i); }
+    local_iterator end(size_type i)                { return rep.end(i); }
+    const_local_iterator begin(size_type i) const  { return rep.begin(i); }
+    const_local_iterator end(size_type i) const    { return rep.end(i); }
+    const_local_iterator cbegin(size_type i) const { return rep.cbegin(i); }
+    const_local_iterator cend(size_type i) const   { return rep.cend(i); }
+
+    // Accessor functions
+    // ------------------
+    allocator_type get_allocator() const           { return rep.get_allocator(); }
+    hasher hash_funct() const                      { return rep.hash_funct(); }
+    hasher hash_function() const                   { return hash_funct(); }
+    key_equal key_eq() const                       { return rep.key_eq(); }
+
+
+    // Constructors
+    // ------------
+    explicit sparse_hash_map(size_type n = 0,
+                             const hasher& hf = hasher(),
+                             const key_equal& eql = key_equal(),
+                             const allocator_type& alloc = allocator_type())
+        : rep(n, hf, eql, SelectKey(), SetKey(), alloc) 
+    {
+    }
+
+    explicit sparse_hash_map(const allocator_type& alloc) :
+        rep(0, hasher(), key_equal(), SelectKey(), SetKey(), alloc)
+    {
+    }
+
+    sparse_hash_map(size_type n, const allocator_type& alloc) :
+        rep(n, hasher(), key_equal(), SelectKey(), SetKey(), alloc)
+    {
+    }
+
+    sparse_hash_map(size_type n, const hasher& hf, const allocator_type& alloc) :
+        rep(n, hf, key_equal(), SelectKey(), SetKey(), alloc)
+    {
+    }
+
+    template <class InputIterator>
+    sparse_hash_map(InputIterator f, InputIterator l,
+                    size_type n = 0,
+                    const hasher& hf = hasher(),
+                    const key_equal& eql = key_equal(),
+                    const allocator_type& alloc = allocator_type())
+        : rep(n, hf, eql, SelectKey(), SetKey(), alloc) 
+    {
+        rep.insert(f, l);
+    }
+
+    template <class InputIterator>
+    sparse_hash_map(InputIterator f, InputIterator l,
+                    size_type n, const allocator_type& alloc)
+        : rep(n, hasher(), key_equal(), SelectKey(), SetKey(), alloc) 
+    {
+        rep.insert(f, l);
+    }
+
+    template <class InputIterator>
+    sparse_hash_map(InputIterator f, InputIterator l,
+                    size_type n, const hasher& hf, const allocator_type& alloc)
+        : rep(n, hf, key_equal(), SelectKey(), SetKey(), alloc) 
+    {
+        rep.insert(f, l);
+    }
+
+    sparse_hash_map(const sparse_hash_map &o) : 
+        rep(o.rep) 
+    {}
+
+    sparse_hash_map(const sparse_hash_map &o,
+                    const allocator_type& alloc) : 
+        rep(o.rep, alloc) 
+    {}
+
+#if !defined(SPP_NO_CXX11_RVALUE_REFERENCES)
+    // (take a non-const rvalue reference so that rep is actually moved, not copied)
+    sparse_hash_map(sparse_hash_map &&o) :
+        rep(std::move(o.rep))
+    {}
+
+    sparse_hash_map(sparse_hash_map &&o,
+                    const allocator_type& alloc) :
+        rep(std::move(o.rep), alloc)
+    {}
+#endif
+
+#if !defined(SPP_NO_CXX11_HDR_INITIALIZER_LIST)
+    sparse_hash_map(std::initializer_list<value_type> init,
+                    size_type n = 0,
+                    const hasher& hf = hasher(),
+                    const key_equal& eql = key_equal(),
+                    const allocator_type& alloc = allocator_type())
+        : rep(n, hf, eql, SelectKey(), SetKey(), alloc) 
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_map(std::initializer_list<value_type> init,
+                    size_type n, const allocator_type& alloc) :
+        rep(n, hasher(), key_equal(), SelectKey(), SetKey(), alloc)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_map(std::initializer_list<value_type> init,
+                    size_type n, const hasher& hf, const allocator_type& alloc) :
+        rep(n, hf, key_equal(), SelectKey(), SetKey(), alloc)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_map& operator=(std::initializer_list<value_type> init)
+    {
+        rep.clear();
+        rep.insert(init.begin(), init.end());
+        return *this;
+    }
+
+    void insert(std::initializer_list<value_type> init)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+#endif
+
+    sparse_hash_map& operator=(const sparse_hash_map &o)
+    {
+        rep = o.rep;
+        return *this;
+    }
+
+    void clear()                        { rep.clear(); }
+    void swap(sparse_hash_map& hs)      { rep.swap(hs.rep); }
+
+    // Functions concerning size
+    // -------------------------
+    size_type size() const              { return rep.size(); }
+    size_type max_size() const          { return rep.max_size(); }
+    bool empty() const                  { return rep.empty(); }
+    size_type bucket_count() const      { return rep.bucket_count(); }
+    size_type max_bucket_count() const  { return rep.max_bucket_count(); }
+
+    size_type bucket_size(size_type i) const    { return rep.bucket_size(i); }
+    size_type bucket(const key_type& key) const { return rep.bucket(key); }
+    float     load_factor() const       { return size() * 1.0f / bucket_count(); }
+
+    float max_load_factor() const      { return rep.get_enlarge_factor(); }
+    void  max_load_factor(float grow)  { rep.set_enlarge_factor(grow); }
+
+    float min_load_factor() const      { return rep.get_shrink_factor(); }
+    void  min_load_factor(float shrink){ rep.set_shrink_factor(shrink); }
+
+    void set_resizing_parameters(float shrink, float grow) 
+    {
+        rep.set_resizing_parameters(shrink, grow);
+    }
+
+    void resize(size_type cnt)        { rep.resize(cnt); }
+    void rehash(size_type cnt)        { resize(cnt); } // c++11 name
+    void reserve(size_type cnt)       { resize(cnt); } // c++11 
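+
+    // Usage sketch for the load-factor setters above (an illustration only):
+    //
+    //     sparse_hash_map<int, int> m;
+    //     m.max_load_factor(0.9f);   // let the table fill up more before growing
+    //     m.min_load_factor(0.0f);   // 0.0 means the table never shrinks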
+
+    // Lookup
+    // ------
+    iterator find(const key_type& key)                 { return rep.find(key); }
+    const_iterator find(const key_type& key) const     { return rep.find(key); }
+
+    mapped_type& operator[](const key_type& key) 
+    {
+        return rep.template find_or_insert<DefaultValue>(key).second;
+    }
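+
+    // Illustration only: when the key is absent, operator[] inserts
+    // std::make_pair(key, T()) via DefaultValue and then returns a reference
+    // to the mapped value, e.g.
+    //
+    //     sparse_hash_map<int, int> m;
+    //     ++m[7];    // inserts (7, 0) first, then increments it to 1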
+
+    size_type count(const key_type& key) const         { return rep.count(key); }
+
+    std::pair<iterator, iterator> 
+    equal_range(const key_type& key)             { return rep.equal_range(key); }
+
+    std::pair<const_iterator, const_iterator> 
+    equal_range(const key_type& key) const       { return rep.equal_range(key); }
+
+    mapped_type& at(const key_type& key) 
+    {
+        iterator it = rep.find(key);
+        if (it == rep.end())
+            throw_exception(std::out_of_range("at: key not present"));
+        return it->second;
+    }
+
+    const mapped_type& at(const key_type& key) const
+    {
+        const_iterator it = rep.find(key);
+        if (it == rep.cend())
+            throw_exception(std::out_of_range("at: key not present"));
+        return it->second;
+    }
+
+    // Insert
+    // ------
+    std::pair<iterator, bool> 
+    insert(const value_type& obj)                    { return rep.insert(obj); }
+
+    template <class InputIterator> 
+    void insert(InputIterator f, InputIterator l)    { rep.insert(f, l); }
+
+    void insert(const_iterator f, const_iterator l)  { rep.insert(f, l); }
+    
+    iterator insert(iterator /*unused*/, const value_type& obj) { return insert(obj).first; }
+    iterator insert(const_iterator /*unused*/, const value_type& obj) { return insert(obj).first; }
+
+    // Deleted key routines - just to keep the google test framework happy;
+    // we don't actually use the deleted key.
+    // ---------------------------------------------------------------
+    void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); }
+    void clear_deleted_key()                    { rep.clear_deleted_key();  }
+    key_type deleted_key() const                { return rep.deleted_key(); }
+
+    // Erase
+    // -----
+    size_type erase(const key_type& key)               { return rep.erase(key); }
+    iterator  erase(iterator it)                       { return rep.erase(it); } 
+    iterator  erase(iterator f, iterator l)            { return rep.erase(f, l); }
+    iterator  erase(const_iterator it)                 { return rep.erase(it); }
+    iterator  erase(const_iterator f, const_iterator l){ return rep.erase(f, l); }
+
+    // Comparison
+    // ----------
+    bool operator==(const sparse_hash_map& hs) const   { return rep == hs.rep; }
+    bool operator!=(const sparse_hash_map& hs) const   { return rep != hs.rep; }
+
+
+    // I/O -- this is an add-on for writing metainformation to disk
+    //
+    // For maximum flexibility, this does not assume a particular
+    // file type (though it will probably be a FILE *).  We just pass
+    // the fp through to rep.
+
+    // If your keys and values are simple enough, you can pass this
+    // serializer to serialize()/unserialize().  "Simple enough" means
+    // value_type is a POD type that contains no pointers.  Note,
+    // however, we don't try to normalize endianness.
+    // ---------------------------------------------------------------
+    typedef typename ht::NopointerSerializer NopointerSerializer;
+
+    // serializer: a class providing operator()(OUTPUT*, const value_type&)
+    //    (writing value_type to OUTPUT).  You can specify a
+    //    NopointerSerializer object if appropriate (see above).
+    // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
+    //    pointer to a class providing size_t Write(const void*, size_t),
+    //    which writes a buffer into a stream (which fp presumably
+    //    owns) and returns the number of bytes successfully written.
+    //    Note basic_ostream<not_char> is not currently supported.
+    // ---------------------------------------------------------------
+    template <typename ValueSerializer, typename OUTPUT>
+    bool serialize(ValueSerializer serializer, OUTPUT* fp) 
+    {
+        return rep.serialize(serializer, fp);
+    }
+
+    // serializer: a functor providing operator()(INPUT*, value_type*)
+    //    (reading from INPUT and into value_type).  You can specify a
+    //    NopointerSerializer object if appropriate (see above).
+    // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
+    //    pointer to a class providing size_t Read(void*, size_t),
+    //    which reads into a buffer from a stream (which fp presumably
+    //    owns) and returns the number of bytes successfully read.
+    //    Note basic_istream<not_char> is not currently supported.
+    // NOTE: Since value_type is std::pair<const Key, T>, ValueSerializer
+    // may need to do a const cast in order to fill in the key.
+    // NOTE: if Key or T are not POD types, the serializer MUST use
+    // placement-new to initialize their values, rather than a normal
+    // equals-assignment or similar.  (The value_type* passed into the
+    // serializer points to garbage memory.)
+    // ---------------------------------------------------------------
+    template <typename ValueSerializer, typename INPUT>
+    bool unserialize(ValueSerializer serializer, INPUT* fp)
+    {
+        return rep.unserialize(serializer, fp);
+    }
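+
+    // Usage sketch for serialize()/unserialize() (an illustration only; the
+    // file name "table.bin" is made up): for a POD, pointer-free value_type
+    // the NopointerSerializer above can be used directly with a FILE*.
+    //
+    //     sparse_hash_map<int, int> m;
+    //     m[1] = 2;
+    //     FILE* out = fopen("table.bin", "wb");
+    //     m.serialize(sparse_hash_map<int, int>::NopointerSerializer(), out);
+    //     fclose(out);
+    //
+    //     sparse_hash_map<int, int> m2;
+    //     FILE* in = fopen("table.bin", "rb");
+    //     m2.unserialize(sparse_hash_map<int, int>::NopointerSerializer(), in);
+    //     fclose(in);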
+
+    // The four methods below are DEPRECATED.
+    // Use serialize() and unserialize() for new code.
+    // -----------------------------------------------
+    template <typename OUTPUT>
+    bool write_metadata(OUTPUT *fp)       { return rep.write_metadata(fp); }
+
+    template <typename INPUT>
+    bool read_metadata(INPUT *fp)         { return rep.read_metadata(fp); }
+
+    template <typename OUTPUT>
+    bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); }
+
+    template <typename INPUT>
+    bool read_nopointer_data(INPUT *fp)   { return rep.read_nopointer_data(fp); }
+
+
+private:
+    // The actual data
+    // ---------------
+    ht rep;
+};
+
+// We need a global swap as well
+template <class Key, class T, class HashFcn, class EqualKey, class Alloc>
+inline void swap(sparse_hash_map<Key, T, HashFcn, EqualKey, Alloc>& hm1,
+                 sparse_hash_map<Key, T, HashFcn, EqualKey, Alloc>& hm2) 
+{
+    hm1.swap(hm2);
+}
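+
+// ----------------------------------------------------------------------------
+// Usage sketch (an illustration only, not part of the library proper): a
+// minimal, self-contained example of the sparse_hash_map interface defined
+// above.  The function name spp_example_usage_ is hypothetical and exists
+// only so this sketch is valid at namespace scope.
+// ----------------------------------------------------------------------------
+inline size_t spp_example_usage_()
+{
+    sparse_hash_map<int, int> counts;
+    counts[1] = 10;                           // operator[] default-inserts, then assigns
+    counts.insert(std::make_pair(2, 20));     // insert() returns a pair<iterator, bool>
+    size_t hits = counts.count(1);            // 0 or 1 for a map
+    hits += counts.erase(2);                  // erase-by-key returns the number removed
+    return hits + counts.size();
+}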
+
+//  ----------------------------------------------------------------------
+//                   S P A R S E _ H A S H _ S E T
+//  ----------------------------------------------------------------------
+
+template <class Value,
+          class HashFcn = spp_hash<Value>,
+          class EqualKey = std::equal_to<Value>,
+          class Alloc = libc_allocator_with_realloc<Value> >
+class sparse_hash_set 
+{
+private:
+    // Apparently identity is not stl-standard, so we define our own
+    struct Identity 
+    {
+        typedef const Value& result_type;
+        const Value& operator()(const Value& v) const { return v; }
+    };
+
+    struct SetKey 
+    {
+        void operator()(Value* value, const Value& new_key) const 
+        {
+            *value = new_key;
+        }
+    };
+
+    typedef sparse_hashtable<Value, Value, HashFcn, Identity, SetKey,
+                             EqualKey, Alloc> ht;
+
+public:
+    typedef typename ht::key_type              key_type;
+    typedef typename ht::value_type            value_type;
+    typedef typename ht::hasher                hasher;
+    typedef typename ht::key_equal             key_equal;
+    typedef Alloc                              allocator_type;
+
+    typedef typename ht::size_type             size_type;
+    typedef typename ht::difference_type       difference_type;
+    typedef typename ht::const_pointer         pointer;
+    typedef typename ht::const_pointer         const_pointer;
+    typedef typename ht::const_reference       reference;
+    typedef typename ht::const_reference       const_reference;
+
+    typedef typename ht::const_iterator        iterator;
+    typedef typename ht::const_iterator        const_iterator;
+    typedef typename ht::const_local_iterator  local_iterator;
+    typedef typename ht::const_local_iterator  const_local_iterator;
+
+
+    // Iterator functions -- recall all iterators are const
+    iterator       begin() const             { return rep.begin(); }
+    iterator       end() const               { return rep.end(); }
+    const_iterator cbegin() const            { return rep.cbegin(); }
+    const_iterator cend() const              { return rep.cend(); }
+
+    // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements.
+    local_iterator begin(size_type i) const  { return rep.begin(i); }
+    local_iterator end(size_type i) const    { return rep.end(i); }
+    local_iterator cbegin(size_type i) const { return rep.cbegin(i); }
+    local_iterator cend(size_type i) const   { return rep.cend(i); }
+
+
+    // Accessor functions
+    // ------------------
+    allocator_type get_allocator() const     { return rep.get_allocator(); }
+    hasher         hash_funct() const        { return rep.hash_funct(); }
+    hasher         hash_function() const     { return hash_funct(); }  // tr1 name
+    key_equal      key_eq() const            { return rep.key_eq(); }
+
+
+    // Constructors
+    // ------------
+    explicit sparse_hash_set(size_type n = 0,
+                             const hasher& hf = hasher(),
+                             const key_equal& eql = key_equal(),
+                             const allocator_type& alloc = allocator_type()) :
+        rep(n, hf, eql, Identity(), SetKey(), alloc)
+    {
+    }
+
+    explicit sparse_hash_set(const allocator_type& alloc) :
+        rep(0, hasher(), key_equal(), Identity(), SetKey(), alloc)
+    {
+    }
+
+    sparse_hash_set(size_type n, const allocator_type& alloc) :
+        rep(n, hasher(), key_equal(), Identity(), SetKey(), alloc)
+    {
+    }
+
+    sparse_hash_set(size_type n, const hasher& hf, 
+                    const allocator_type& alloc) :
+        rep(n, hf, key_equal(), Identity(), SetKey(), alloc)
+    {
+    }
+
+    template <class InputIterator>
+    sparse_hash_set(InputIterator f, InputIterator l,
+                    size_type n = 0,
+                    const hasher& hf = hasher(),
+                    const key_equal& eql = key_equal(),
+                    const allocator_type& alloc = allocator_type())
+        : rep(n, hf, eql, Identity(), SetKey(), alloc)
+    {
+        rep.insert(f, l);
+    } 
+
+    template <class InputIterator>
+    sparse_hash_set(InputIterator f, InputIterator l,
+                    size_type n, const allocator_type& alloc)
+        : rep(n, hasher(), key_equal(), Identity(), SetKey(), alloc)
+    {
+        rep.insert(f, l);
+    } 
+
+    template <class InputIterator>
+    sparse_hash_set(InputIterator f, InputIterator l,
+                    size_type n, const hasher& hf, const allocator_type& alloc)
+        : rep(n, hf, key_equal(), Identity(), SetKey(), alloc)
+    {
+        rep.insert(f, l);
+    } 
+
+    sparse_hash_set(const sparse_hash_set &o) : 
+        rep(o.rep)
+    {}
+
+    sparse_hash_set(const sparse_hash_set &o,
+                    const allocator_type& alloc) :
+        rep(o.rep, alloc) 
+    {}
+
+#if !defined(SPP_NO_CXX11_RVALUE_REFERENCES)
+    sparse_hash_set(const sparse_hash_set &&o) : 
+        rep(std::move(o.rep))
+    {}
+
+    sparse_hash_set(const sparse_hash_set &&o,
+                    const allocator_type& alloc) :
+        rep(std::move(o.rep), alloc) 
+    {}
+#endif
+
+#if !defined(SPP_NO_CXX11_HDR_INITIALIZER_LIST)
+    sparse_hash_set(std::initializer_list<value_type> init,
+                    size_type n = 0, 
+                    const hasher& hf = hasher(),
+                    const key_equal& eql = key_equal(),
+                    const allocator_type& alloc = allocator_type()) :
+        rep(n, hf, eql, Identity(), SetKey(), alloc)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_set(std::initializer_list<value_type> init,
+                    size_type n, const allocator_type& alloc) :
+        rep(n, hasher(), key_equal(), Identity(), SetKey(), alloc)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_set(std::initializer_list<value_type> init,
+                    size_type n, const hasher& hf, 
+                    const allocator_type& alloc) :
+        rep(n, hf, key_equal(), Identity(), SetKey(), alloc)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+    sparse_hash_set& operator=(std::initializer_list<value_type> init)
+    {
+        rep.clear();
+        rep.insert(init.begin(), init.end());
+        return *this;
+    }
+
+    void insert(std::initializer_list<value_type> init)
+    {
+        rep.insert(init.begin(), init.end());
+    }
+
+#endif
+    
+    sparse_hash_set& operator=(const sparse_hash_set &o)
+    {
+        rep = o.rep;
+        return *this;
+    }
+
+    void clear()                        { rep.clear(); }
+    void swap(sparse_hash_set& hs)      { rep.swap(hs.rep); }
+
+
+    // Functions concerning size
+    // -------------------------
+    size_type size() const              { return rep.size(); }
+    size_type max_size() const          { return rep.max_size(); }
+    bool empty() const                  { return rep.empty(); }
+    size_type bucket_count() const      { return rep.bucket_count(); }
+    size_type max_bucket_count() const  { return rep.max_bucket_count(); }
+
+    size_type bucket_size(size_type i) const    { return rep.bucket_size(i); }
+    size_type bucket(const key_type& key) const { return rep.bucket(key); }
+
+    float     load_factor() const       { return size() * 1.0f / bucket_count(); }
+
+    float max_load_factor() const      { return rep.get_enlarge_factor(); }
+    void  max_load_factor(float grow)  { rep.set_enlarge_factor(grow); }
+
+    float min_load_factor() const      { return rep.get_shrink_factor(); }
+    void  min_load_factor(float shrink){ rep.set_shrink_factor(shrink); }
+
+    void set_resizing_parameters(float shrink, float grow) 
+    {
+        rep.set_resizing_parameters(shrink, grow);
+    }
+
+    void resize(size_type cnt)        { rep.resize(cnt); }
+    void rehash(size_type cnt)        { resize(cnt); } // c++11 name
+    void reserve(size_type cnt)       { resize(cnt); } // c++11 
+
+    // Lookup
+    // ------
+    iterator find(const key_type& key) const     { return rep.find(key); }
+
+    size_type count(const key_type& key) const   { return rep.count(key); }
+
+    std::pair<iterator, iterator> 
+    equal_range(const key_type& key) const       { return rep.equal_range(key); }
+
+#if 0 && !defined(SPP_NO_CXX11_VARIADIC_TEMPLATES)
+    template <class... Args>
+    pair<iterator, bool> emplace(Args&&... args) 
+    {
+        return rep.emplace_unique(std::forward<Args>(args)...);
+    }
+
+    template <class... Args>
+    iterator emplace_hint(const_iterator p, Args&&... args)
+    {
+        return rep.emplace_unique(std::forward<Args>(args)...).first;
+    }
+#endif
+
+    // Insert
+    // ------
+    std::pair<iterator, bool> insert(const value_type& obj) 
+    {
+        std::pair<typename ht::iterator, bool> p = rep.insert(obj);
+        return std::pair<iterator, bool>(p.first, p.second);   // const to non-const
+    }
+
+    template <class InputIterator>
+    void insert(InputIterator f, InputIterator l)    { rep.insert(f, l); }
+
+    void insert(const_iterator f, const_iterator l)  { rep.insert(f, l); }
+
+    iterator insert(iterator /*unused*/, const value_type& obj) { return insert(obj).first; }
+
+    // Deleted key - do nothing - just to keep google test framework happy
+    // -------------------------------------------------------------------
+    void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); }
+    void clear_deleted_key()                  { rep.clear_deleted_key();  }
+    key_type deleted_key() const              { return rep.deleted_key(); }
+
+    // Erase
+    // -----
+    size_type erase(const key_type& key)      { return rep.erase(key); }
+    iterator  erase(iterator it)              { return rep.erase(it); }
+    iterator  erase(iterator f, iterator l)   { return rep.erase(f, l); }
+
+    // Comparison
+    // ----------
+    bool operator==(const sparse_hash_set& hs) const { return rep == hs.rep; }
+    bool operator!=(const sparse_hash_set& hs) const { return rep != hs.rep; }
+
+
+    // I/O -- this is an add-on for writing metainformation to disk
+    //
+    // For maximum flexibility, this does not assume a particular
+    // file type (though it will probably be a FILE *).  We just pass
+    // the fp through to rep.
+
+    // If your keys and values are simple enough, you can pass this
+    // serializer to serialize()/unserialize().  "Simple enough" means
+    // value_type is a POD type that contains no pointers.  Note,
+    // however, we don't try to normalize endianness.
+    // ---------------------------------------------------------------
+    typedef typename ht::NopointerSerializer NopointerSerializer;
+
+    // serializer: a class providing operator()(OUTPUT*, const value_type&)
+    //    (writing value_type to OUTPUT).  You can specify a
+    //    NopointerSerializer object if appropriate (see above).
+    // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
+    //    pointer to a class providing size_t Write(const void*, size_t),
+    //    which writes a buffer into a stream (which fp presumably
+    //    owns) and returns the number of bytes successfully written.
+    //    Note basic_ostream<not_char> is not currently supported.
+    // ---------------------------------------------------------------
+    template <typename ValueSerializer, typename OUTPUT>
+    bool serialize(ValueSerializer serializer, OUTPUT* fp)
+    {
+        return rep.serialize(serializer, fp);
+    }
+
+    // serializer: a functor providing operator()(INPUT*, value_type*)
+    //    (reading from INPUT and into value_type).  You can specify a
+    //    NopointerSerializer object if appropriate (see above).
+    // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
+    //    pointer to a class providing size_t Read(void*, size_t),
+    //    which reads into a buffer from a stream (which fp presumably
+    //    owns) and returns the number of bytes successfully read.
+    //    Note basic_istream<not_char> is not currently supported.
+    // NOTE: Since value_type is const Key, ValueSerializer
+    // may need to do a const cast in order to fill in the key.
+    // NOTE: if Key is not a POD type, the serializer MUST use
+    // placement-new to initialize its value, rather than a normal
+    // equals-assignment or similar.  (The value_type* passed into
+    // the serializer points to garbage memory.)
+    // ---------------------------------------------------------------
+    template <typename ValueSerializer, typename INPUT>
+    bool unserialize(ValueSerializer serializer, INPUT* fp)
+    {
+        return rep.unserialize(serializer, fp);
+    }
+
+    // The four methods below are DEPRECATED.
+    // Use serialize() and unserialize() for new code.
+    // -----------------------------------------------
+    template <typename OUTPUT>
+    bool write_metadata(OUTPUT *fp)       { return rep.write_metadata(fp); }
+
+    template <typename INPUT>
+    bool read_metadata(INPUT *fp)         { return rep.read_metadata(fp); }
+
+    template <typename OUTPUT>
+    bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); }
+
+    template <typename INPUT>
+    bool read_nopointer_data(INPUT *fp)   { return rep.read_nopointer_data(fp); }
+
+private:
+    // The actual data
+    // ---------------
+    ht rep;
+};
+
+template <class Val, class HashFcn, class EqualKey, class Alloc>
+inline void swap(sparse_hash_set<Val, HashFcn, EqualKey, Alloc>& hs1,
+                 sparse_hash_set<Val, HashFcn, EqualKey, Alloc>& hs2) 
+{
+    hs1.swap(hs2);
+}
+
+
+SPP_END_NAMESPACE
+
+#endif // sparsepp_h_guard_
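For the serialize()/unserialize() interface documented above, a POD, pointer-free element type only needs the bundled NopointerSerializer. Below is a minimal sketch of the round trip through a FILE*; the element type uint64_t and the file name "set.bin" are illustrative only, and the default spp namespace is assumed:

    #include "sparsepp.h"
    #include <cstdio>
    #include <cstdint>

    int main() {
        spp::sparse_hash_set<uint64_t> s{1, 2, 3};

        // POD, pointer-free value_type, so NopointerSerializer is sufficient
        FILE* out = std::fopen("set.bin", "wb");
        s.serialize(spp::sparse_hash_set<uint64_t>::NopointerSerializer(), out);
        std::fclose(out);

        // read back into a default-constructed set
        spp::sparse_hash_set<uint64_t> t;
        FILE* in = std::fopen("set.bin", "rb");
        t.unserialize(spp::sparse_hash_set<uint64_t>::NopointerSerializer(), in);
        std::fclose(in);

        return (t.size() == 3 && t.count(2) == 1) ? 0 : 1;
    }

For non-POD keys, a custom serializer must placement-new into the value_type* it receives during unserialize, as the comments above point out.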
diff --git a/scripts/RunRapMap.sh b/scripts/RunRapMap.sh
new file mode 100755
index 0000000..ded08c9
--- /dev/null
+++ b/scripts/RunRapMap.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+cmd="$@"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+bam_out=`echo $cmd | sed -n 's/.*--bamOut\s\+\(\S\+\)\s*.*/\1/p'`
+bam_compress_threads=`echo $cmd | sed -n 's/.*--bamThreads\s\+\([[:digit:]]\+\)\s*.*/\1/p'`
+
+if [ -z "$bam_out" ]
+then
+    #Run normally in this branch
+    $DIR/rapmap "${@}"
+else
+    # from: http://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script
+    if command -v samtools >/dev/null; then 
+        new_cmd=`echo $cmd | sed 's/--bamOut\s\+\(\S\+\)\s*//'`
+        execmd=""
+        if [ -z "$bam_compress_threads" ]
+        then
+            execmd="${new_cmd} -o | samtools view -Sb - > ${bam_out}"
+        else
+            execmd="${new_cmd} -o | samtools view -Sb -@ ${bam_compress_threads} - > ${bam_out}"
+        fi
+        echo "Running command [$DIR/rapmap ${execmd}]"
+        eval $DIR/rapmap ${execmd} # eval so that the pipe/redirection inside ${execmd} are interpreted
+    else
+        echo >&2 "samtools is required to convert to BAM, but it's not installed.  Aborting."; 
+        exit 1; 
+    fi
+fi
diff --git a/scripts/add-header.sh b/scripts/add-header.sh
new file mode 100644
index 0000000..25bd36e
--- /dev/null
+++ b/scripts/add-header.sh
@@ -0,0 +1,33 @@
+HEADERS=(EnumCoder.hpp HitManager.hpp IndexHeader.hpp JFRaw.hpp PairAlignmentFormatter.hpp RapMapConfig.hpp RapMapFileSystem.hpp RapMapIndex.hpp RapMapSAIndex.hpp RapMapUtils.hpp SACollector.hpp SASearcher.hpp ScopedTimer.hpp SingleAlignmentFormatter.hpp SparseHashSerializer.hpp SpinLock.hpp)
+
+SRCS=(EnumCoder.cpp EnumCoderTest.cpp HitManager.cpp RapMap.cpp RapMapFileSystem.cpp RapMapIndex.cpp RapMapIndexer.cpp RapMapMapper.cpp RapMapSAIndex.cpp RapMapSAIndexer.cpp RapMapSAMapper.cpp RapMapUtils.cpp UtilTest.cpp) 
+
+for h in ${HEADERS[*]};
+do
+copyright-header --add-path ../include/$h \
+                 --license GPL3 \
+                 --copyright-holder 'Rob Patro' \
+                 --copyright-holder 'Avi Srivastava' \
+                 --copyright-holder 'Hirak Sarkar' \
+                 --copyright-software 'RapMap' \
+                 --copyright-software-description "Rapid and accurate mapping of short reads to transcriptomes using quasi-mapping." \
+                 --copyright-year 2015 \
+                 --copyright-year 2016 \
+                 --word-wrap 80 \
+                 --output-dir .
+done
+
+for h in ${SRCS[*]};
+do
+copyright-header --add-path ../src/$h \
+                 --license GPL3 \
+                 --copyright-holder 'Rob Patro' \
+                 --copyright-holder 'Avi Srivastava' \
+                 --copyright-holder 'Hirak Sarkar' \
+                 --copyright-software 'RapMap' \
+                 --copyright-software-description "Rapid and accurate mapping of short reads to transcriptomes using quasi-mapping." \
+                 --copyright-year 2015 \
+                 --copyright-year 2016 \
+                 --word-wrap 80 \
+                 --output-dir .
+done
diff --git a/scripts/compile.sh b/scripts/compile.sh
index 96ba356..782c4a7 100755
--- a/scripts/compile.sh
+++ b/scripts/compile.sh
@@ -73,6 +73,6 @@ make
 make install
 make test
 cd ../scripts
-bash make-release.sh -v ${version} -n CentOS5
+bash make-release.sh -v ${version} -n linux_x86-64 
 cd ../RELEASES
 cp *.tar.gz /io/
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2708677..a3bdfeb 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,6 +9,7 @@ set (RAPMAP_MAIN_SRCS
     RapMapSAIndex.cpp
     RapMapIndex.cpp
     HitManager.cpp
+    FastxParser.cpp
     rank9b.cpp
     stringpiece.cc
     xxhash.c
@@ -52,28 +53,29 @@ ${GAT_SOURCE_DIR}/external/install/lib
 #${BLAS_LIBRARY_DIR}
 )
 
-#message("Boost_LIBRARIES = ${Boost_LIBRARIES}")
-
 # Set the RPATH
-if (APPLE)
-    ## This DOES NOT do what I / any one sane, expects.  Setting the
-    ## linker path on OSX is messed up.  Just tell the user to use
-    ## DYLD_FALLBACK_LIBRARY_PATH for now
-    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-else()
+if (NOT APPLE)
     set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib:$ORIGIN/../../lib:$ORIGIN/:$ORIGIN/../../external/install/lib")
-endif()
+    set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+else()
+  # use, i.e. don't skip the full RPATH for the build tree
+  set(CMAKE_SKIP_BUILD_RPATH  FALSE)
 
-set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+  # when building, don't use the install RPATH already
+  # (but later on when installing)
+  set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 
 
-# Build the rsdic library
-# add_library(rsdic STATIC ${RSDICT_LIB_SRCS} )
+  # the RPATH to be used when installing
+  set(CMAKE_INSTALL_RPATH "")
+
+  # don't add the automatically determined parts of the RPATH
+  # which point to directories outside the build tree to the install RPATH
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
+endif()
 
 # Build the rapmap executable
 add_executable(rapmap ${RAPMAP_MAIN_SRCS})
 
-#set_target_properties(salmon_core salmon PROPERTIES LINK_SEARCH_END_STATIC TRUE)
-
 # our suffix array construction libraries
 set (SUFFARRAY_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort.a)
 set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort64.a)
@@ -81,33 +83,18 @@ set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort64.a)
 # Link the executable
 target_link_libraries(rapmap
     ${PTHREAD_LIB}
-    #${Boost_LIBRARIES}
     ${ZLIB_LIBRARY}
     ${SUFFARRAY_LIB}
     ${SUFFARRAY64_LIB}
     ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a
     m
     ${LIBLZMA_LIBRARIES}
-    #${LIBSALMON_LINKER_FLAGS}
     ${NON_APPLECLANG_LIBS}
     ${FAST_MALLOC_LIB}
 )
 
-#add_dependencies(salmon libbwa)
-
-##
-#  This ensures that the salmon executable should work with or without `make install`
-##
-if (APPLE)
-	add_custom_command(TARGET rapmap
-		POST_BUILD
-		COMMAND install_name_tool -add_rpath ${GAT_SOURCE_DIR}/external/install/lib rapmap
-		COMMAND install_name_tool -add_rpath @executable_path/../lib rapmap
-		)
-endif()
 
 ##### ======================================
-
 IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   SET(CMAKE_INSTALL_PREFIX
     "${GAT_SOURCE_DIR}" CACHE PATH "Default install prefix" FORCE
@@ -121,12 +108,15 @@ set(INSTALL_INCLUDE_DIR include )
 # install(FILES ${Boost_LIBRARIES}
 # 	           DESTINATION ${INSTALL_LIB_DIR})
 
-install(TARGETS rapmap
+install(TARGETS rapmap 
                 RUNTIME DESTINATION bin
                 LIBRARY DESTINATION lib
                 ARCHIVE DESTINATION lib
         )
 
+install(FILES ${GAT_SOURCE_DIR}/scripts/RunRapMap.sh 
+              PERMISSIONS WORLD_EXECUTE WORLD_READ OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE
+              DESTINATION bin)
     #set(POST_INSTALL_SCRIPT ${GAT_SOURCE_DIR}/cmake/PostInstall.cmake)
 
     #install(
diff --git a/src/FastxParser.cpp b/src/FastxParser.cpp
new file mode 100644
index 0000000..7c2faec
--- /dev/null
+++ b/src/FastxParser.cpp
@@ -0,0 +1,306 @@
+#include "FastxParser.hpp"
+
+#include "fcntl.h"
+#include "unistd.h"
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <poll.h>
+#include <thread>
+#include <vector>
+#include <zlib.h>
+
+// STEP 1: declare the type of file handler and the read() function
+KSEQ_INIT(gzFile, gzread)
+
+namespace fastx_parser {
+template <typename T>
+FastxParser<T>::FastxParser(std::vector<std::string> files,
+                            uint32_t numConsumers, uint32_t numParsers,
+                            uint32_t chunkSize)
+    : FastxParser(files, {}, numConsumers, numParsers, chunkSize) {}
+
+template <typename T>
+FastxParser<T>::FastxParser(std::vector<std::string> files,
+                            std::vector<std::string> files2,
+                            uint32_t numConsumers, uint32_t numParsers,
+                            uint32_t chunkSize)
+    : inputStreams_(files), inputStreams2_(files2), numParsing_(0),
+      blockSize_(chunkSize) {
+
+  if (numParsers > files.size()) {
+    std::cerr << "Can't make user of more parsing threads than file (pairs); "
+                 "setting # of parsing threads to "
+              << files.size() << "\n";
+    numParsers = files.size();
+  }
+  numParsers_ = numParsers;
+
+  // nobody is parsing yet
+  numParsing_ = 0;
+
+  readQueue_ = moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>(
+      4 * numConsumers, numParsers, 0);
+
+  seqContainerQueue_ =
+      moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>(
+          4 * numConsumers, 1 + numConsumers, 0);
+
+  workQueue_ = moodycamel::ConcurrentQueue<uint32_t>(numParsers_);
+
+  // push all file ids on the queue
+  for (size_t i = 0; i < files.size(); ++i) {
+    workQueue_.enqueue(i);
+  }
+
+  // every parsing thread gets a consumer token for the seqContainerQueue
+  // and a producer token for the readQueue.
+  for (size_t i = 0; i < numParsers_; ++i) {
+    consumeContainers_.emplace_back(
+        new moodycamel::ConsumerToken(seqContainerQueue_));
+    produceReads_.emplace_back(new moodycamel::ProducerToken(readQueue_));
+  }
+
+  // enqueue the appropriate number of read chunks so that we can start
+  // filling them once the parser has been started.
+  moodycamel::ProducerToken produceContainer(seqContainerQueue_);
+  for (size_t i = 0; i < 4 * numConsumers; ++i) {
+    auto chunk = make_unique<ReadChunk<T>>(blockSize_);
+    seqContainerQueue_.enqueue(produceContainer, std::move(chunk));
+  }
+}
+
+template <typename T> ReadGroup<T> FastxParser<T>::getReadGroup() {
+  return ReadGroup<T>(getProducerToken_(), getConsumerToken_());
+}
+
+template <typename T>
+moodycamel::ProducerToken FastxParser<T>::getProducerToken_() {
+  return moodycamel::ProducerToken(seqContainerQueue_);
+}
+
+template <typename T>
+moodycamel::ConsumerToken FastxParser<T>::getConsumerToken_() {
+  return moodycamel::ConsumerToken(readQueue_);
+}
+
+template <typename T> FastxParser<T>::~FastxParser() {
+  for (auto& t : parsingThreads_) {
+    t->join();
+  }
+}
+
+inline void copyRecord(kseq_t* seq, ReadSeq* s) {
+  // Copy over the sequence and read name
+  s->seq.assign(seq->seq.s, seq->seq.l);
+  s->name.assign(seq->name.s, seq->name.l);
+}
+
+template <typename T>
+void parseReads(
+    std::vector<std::string>& inputStreams, std::atomic<uint32_t>& numParsing,
+    moodycamel::ConsumerToken* cCont, moodycamel::ProducerToken* pRead,
+    moodycamel::ConcurrentQueue<uint32_t>& workQueue,
+    moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>&
+        seqContainerQueue_,
+    moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>& readQueue_) {
+  kseq_t* seq;
+  T* s;
+  uint32_t fn{0};
+  while (workQueue.try_dequeue(fn)) {
+    auto file = inputStreams[fn];
+    std::unique_ptr<ReadChunk<T>> local;
+    while (!seqContainerQueue_.try_dequeue(*cCont, local)) {
+      std::cerr << "couldn't dequeue read chunk\n";
+    }
+    size_t numObtained{local->size()};
+    // open the file and init the parser
+    auto fp = gzopen(file.c_str(), "r");
+
+    // The number of reads we have in the local vector
+    size_t numWaiting{0};
+
+    seq = kseq_init(fp);
+    int ksv = kseq_read(seq);
+
+    while (ksv >= 0) {
+      s = &((*local)[numWaiting++]);
+
+      copyRecord(seq, s);
+
+      // If we've filled the local vector, then dump to the concurrent queue
+      if (numWaiting == numObtained) {
+        while (!readQueue_.try_enqueue(std::move(local))) {
+        }
+        numWaiting = 0;
+        numObtained = 0;
+        // And get more empty reads
+        while (!seqContainerQueue_.try_dequeue(*cCont, local)) {
+        }
+        numObtained = local->size();
+      }
+      ksv = kseq_read(seq);
+    }
+
+    // If we hit the end of the file and have any reads in our local buffer
+    // then dump them here.
+    if (numWaiting > 0) {
+      local->have(numWaiting);
+      while (!readQueue_.try_enqueue(*pRead, std::move(local))) {
+      }
+      numWaiting = 0;
+    }
+    // destroy the parser and close the file
+    kseq_destroy(seq);
+    gzclose(fp);
+  }
+
+  --numParsing;
+}
+
+template <typename T>
+void parseReadPair(
+    std::vector<std::string>& inputStreams,
+    std::vector<std::string>& inputStreams2, std::atomic<uint32_t>& numParsing,
+    moodycamel::ConsumerToken* cCont, moodycamel::ProducerToken* pRead,
+    moodycamel::ConcurrentQueue<uint32_t>& workQueue,
+    moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>&
+        seqContainerQueue_,
+    moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>>& readQueue_) {
+
+  kseq_t* seq;
+  kseq_t* seq2;
+  T* s;
+
+  uint32_t fn{0};
+  while (workQueue.try_dequeue(fn)) {
+    // for (size_t fn = 0; fn < inputStreams.size(); ++fn) {
+    auto& file = inputStreams[fn];
+    auto& file2 = inputStreams2[fn];
+
+    std::unique_ptr<ReadChunk<T>> local;
+    while (!seqContainerQueue_.try_dequeue(*cCont, local)) {
+      std::cerr << "couldn't dequeue read chunk\n";
+    }
+    size_t numObtained{local->size()};
+    // open the file and init the parser
+    auto fp = gzopen(file.c_str(), "r");
+    auto fp2 = gzopen(file2.c_str(), "r");
+
+    // The number of reads we have in the local vector
+    size_t numWaiting{0};
+
+    seq = kseq_init(fp);
+    seq2 = kseq_init(fp2);
+
+    int ksv = kseq_read(seq);
+    int ksv2 = kseq_read(seq2);
+    while (ksv >= 0 and ksv2 >= 0) {
+
+      s = &((*local)[numWaiting++]);
+      copyRecord(seq, &s->first);
+      copyRecord(seq2, &s->second);
+
+      // If we've filled the local vector, then dump to the concurrent queue
+      if (numWaiting == numObtained) {
+        while (!readQueue_.try_enqueue(std::move(local))) {
+        }
+        numWaiting = 0;
+        numObtained = 0;
+        // And get more empty reads
+        while (!seqContainerQueue_.try_dequeue(*cCont, local)) {
+        }
+        numObtained = local->size();
+      }
+      ksv = kseq_read(seq);
+      ksv2 = kseq_read(seq2);
+    }
+
+    // If we hit the end of the file and have any reads in our local buffer
+    // then dump them here.
+    if (numWaiting > 0) {
+      local->have(numWaiting);
+      while (!readQueue_.try_enqueue(*pRead, std::move(local))) {
+      }
+      numWaiting = 0;
+    }
+    // destroy the parser and close the file
+    kseq_destroy(seq);
+    gzclose(fp);
+    kseq_destroy(seq2);
+    gzclose(fp2);
+  }
+
+  --numParsing;
+}
+
+template <> bool FastxParser<ReadSeq>::start() {
+  if (numParsing_ == 0) {
+    for (size_t i = 0; i < numParsers_; ++i) {
+      ++numParsing_;
+      parsingThreads_.emplace_back(new std::thread([this, i]() {
+        parseReads(this->inputStreams_, this->numParsing_,
+                   this->consumeContainers_[i].get(),
+                   this->produceReads_[i].get(), this->workQueue_,
+                   this->seqContainerQueue_, this->readQueue_);
+      }));
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <> bool FastxParser<ReadPair>::start() {
+  if (numParsing_ == 0) {
+
+    // Some basic checking to ensure the read files look "sane".
+    if (inputStreams_.size() != inputStreams2_.size()) {
+      throw std::invalid_argument("There should be the same number "
+                                  "of files for the left and right reads");
+    }
+    for (size_t i = 0; i < inputStreams_.size(); ++i) {
+      auto& s1 = inputStreams_[i];
+      auto& s2 = inputStreams2_[i];
+      if (s1 == s2) {
+        throw std::invalid_argument("You provided the same file " + s1 +
+                                    " as both a left and right file");
+      }
+    }
+    for (size_t i = 0; i < numParsers_; ++i) {
+      ++numParsing_;
+      parsingThreads_.emplace_back(new std::thread([this, i]() {
+        parseReadPair(this->inputStreams_, this->inputStreams2_,
+                      this->numParsing_, this->consumeContainers_[i].get(),
+                      this->produceReads_[i].get(), this->workQueue_,
+                      this->seqContainerQueue_, this->readQueue_);
+      }));
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <typename T> bool FastxParser<T>::refill(ReadGroup<T>& seqs) {
+  finishedWithGroup(seqs);
+  while (numParsing_ > 0) {
+    if (readQueue_.try_dequeue(seqs.consumerToken(), seqs.chunkPtr())) {
+      return true;
+    }
+  }
+  return readQueue_.try_dequeue(seqs.consumerToken(), seqs.chunkPtr());
+}
+
+template <typename T> void FastxParser<T>::finishedWithGroup(ReadGroup<T>& s) {
+  // If this read group is holding a valid chunk, then give it back
+  if (!s.empty()) {
+    seqContainerQueue_.enqueue(s.producerToken(), std::move(s.takeChunkPtr()));
+    s.setChunkEmpty();
+  }
+}
+
+template class FastxParser<ReadSeq>;
+template class FastxParser<ReadPair>;
+}
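The intended consumer-side pattern for this parser (used by RapMapIndexer.cpp and RapMapMapper.cpp below) is: construct it with the input files, call start(), then have each consumer thread obtain a ReadGroup once and repeatedly refill() it until the parser runs dry. A minimal single-threaded sketch, with a hypothetical input file name:

    #include "FastxParser.hpp"
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> reads{"reads.fq.gz"}; // hypothetical input
        // one consumer thread, one parsing thread
        fastx_parser::FastxParser<fastx_parser::ReadSeq> parser(reads, 1, 1);
        parser.start();

        auto rg = parser.getReadGroup(); // obtained once per consumer thread
        size_t numReads{0};
        size_t numBases{0};
        while (parser.refill(rg)) {      // spins until a filled chunk arrives or parsing is done
            for (auto& read : rg) {      // each ReadSeq carries .name and .seq
                ++numReads;
                numBases += read.seq.size();
            }
        }
        std::cerr << "parsed " << numReads << " reads (" << numBases << " bases)\n";
        return 0;
    }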
diff --git a/src/HitManager.cpp b/src/HitManager.cpp
index b5fc9e7..d03c20b 100644
--- a/src/HitManager.cpp
+++ b/src/HitManager.cpp
@@ -1,5 +1,27 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include "HitManager.hpp"
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 #include <type_traits>
 
 namespace rapmap {
@@ -11,7 +33,7 @@ namespace rapmap {
                 uint32_t maxDist,
                 std::vector<QuasiAlignment>& hits,
                 MateStatus mateStatus){
-            bool foundHit{false};
+            //bool foundHit{false};
             // One processed hit per transcript
             for (auto& ph : processedHits) {
                 auto tid = ph.tid;
@@ -41,9 +63,9 @@ namespace rapmap {
                         uint32_t maxDist,
                         std::vector<QuasiAlignment>& hits,
                         MateStatus mateStatus){
-                bool foundHit{false};
+                //bool foundHit{false};
                 // One processed hit per transcript
-	            auto startOffset = hits.size();
+	            //auto startOffset = hits.size();
                 for (auto& ph : processedHits) {
                         // If this is an *active* position list
                         if (ph.second.active) {
@@ -78,7 +100,7 @@ namespace rapmap {
                         uint32_t maxDist,
                         std::vector<QuasiAlignment>& hits,
                         MateStatus mateStatus){
-                bool foundHit{false};
+                //bool foundHit{false};
 
                 // One processed hit per transcript
                 for (auto& ph : processedHits) {
@@ -128,7 +150,7 @@ namespace rapmap {
             // Iterator into, length of and end of the positon list for h2
             auto rightPosIt = posList.begin() + h2.kinfo->offset;
             auto rightPosLen = h2.kinfo->count;
-            auto rightPosEnd = rightPosIt + rightPosLen;
+            // auto rightPosEnd = rightPosIt + rightPosLen;
             // Iterator into, length of and end of the transcript list for h2
             auto rightTxpIt = eqClassLabels.begin() + eqClassRight.txpListStart;
             auto rightTxpListLen = eqClassRight.txpListLen;
@@ -353,7 +375,6 @@ namespace rapmap {
             // Convenient bindings for variables we'll use
             auto& SA = rmi.SA;
             //auto& txpIDs = rmi.positionIDs;
-            auto& rankDict = rmi.rankDict;
             auto& txpStarts = rmi.txpOffsets;
 
             // Walk through every hit in the new interval 'h'
@@ -418,7 +439,7 @@ namespace rapmap {
                 // Iterator into, length of and end of the positon list
                 auto posIt = posList.begin() + minHit->kinfo->offset;
                 auto posLen = minHit->kinfo->count;
-                auto posEnd = posIt + posLen;
+                // auto posEnd = posIt + posLen;
                 // Iterator into, length of and end of the transcript list
                 auto txpIt = eqClassLabels.begin() + eqClass.txpListStart;
                 auto txpListLen = eqClass.txpListLen;
@@ -578,6 +599,7 @@ namespace rapmap {
         SAHitMap intersectSAHits(
                 std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
                 RapMapIndexT& rmi,
+                size_t readLen,
                 bool strictFilter 
                 ) {
             using OffsetT = typename RapMapIndexT::IndexType;
@@ -601,7 +623,6 @@ namespace rapmap {
             auto& SA = rmi.SA;
             auto& txpStarts = rmi.txpOffsets;
             //auto& txpIDs = rmi.positionIDs;
-	    auto& rankDict = rmi.rankDict;
 
             // Start with the smallest interval
             // i.e. interval with the fewest hits.
@@ -640,7 +661,7 @@ namespace rapmap {
             for (auto it = outHits.begin(); it != outHits.end(); ++it) {
                 bool enoughHits = (it->second.numActive >= requiredNumHits);
                 it->second.active = (strictFilter) ? 
-                    (enoughHits and it->second.checkConsistent(requiredNumHits)) :
+                    (enoughHits and it->second.checkConsistent(readLen, requiredNumHits)) :
                     (enoughHits);
             }
             return outHits;
@@ -650,12 +671,12 @@ namespace rapmap {
         /**
         * Need to explicitly instantiate the versions we use
         */
-      using SAIndex32BitDense = RapMapSAIndex<int32_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+      using SAIndex32BitDense = RapMapSAIndex<int32_t, RegHashT<uint64_t, rapmap::utils::SAInterval<int32_t>,
 									     rapmap::utils::KmerKeyHasher>>;
-      using SAIndex64BitDense = RapMapSAIndex<int64_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+      using SAIndex64BitDense = RapMapSAIndex<int64_t, RegHashT<uint64_t, rapmap::utils::SAInterval<int64_t>,
 									     rapmap::utils::KmerKeyHasher>>;
-      using SAIndex32BitPerfect = RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
-      using SAIndex64BitPerfect = RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+      using SAIndex32BitPerfect = RapMapSAIndex<int32_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+      using SAIndex64BitPerfect = RapMapSAIndex<int64_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
 
         template
         void intersectSAIntervalWithOutput<SAIndex32BitDense>(SAIntervalHit<int32_t>& h,
@@ -671,11 +692,11 @@ namespace rapmap {
 
         template
         SAHitMap intersectSAHits<SAIndex32BitDense>(std::vector<SAIntervalHit<int32_t>>& inHits,
-                                                    SAIndex32BitDense& rmi, bool strictFilter);
+                                                    SAIndex32BitDense& rmi, size_t readLen, bool strictFilter);
 
         template
         SAHitMap intersectSAHits<SAIndex64BitDense>(std::vector<SAIntervalHit<int64_t>>& inHits,
-          SAIndex64BitDense& rmi, bool strictFilter);
+                                                    SAIndex64BitDense& rmi, size_t readLen, bool strictFilter);
 
         template
         void intersectSAIntervalWithOutput<SAIndex32BitPerfect>(SAIntervalHit<int32_t>& h,
@@ -691,10 +712,10 @@ namespace rapmap {
 
         template
         SAHitMap intersectSAHits<SAIndex32BitPerfect>(std::vector<SAIntervalHit<int32_t>>& inHits,
-                                                      SAIndex32BitPerfect& rmi, bool strictFilter);
+                                                      SAIndex32BitPerfect& rmi, size_t readLen, bool strictFilter);
 
         template
         SAHitMap intersectSAHits<SAIndex64BitPerfect>(std::vector<SAIntervalHit<int64_t>>& inHits,
-                                                      SAIndex64BitPerfect& rmi, bool strictFilter);
+                                                      SAIndex64BitPerfect& rmi, size_t readLen, bool strictFilter);
     }
 }
diff --git a/src/RapMap.cpp b/src/RapMap.cpp
index 942b970..16b0458 100644
--- a/src/RapMap.cpp
+++ b/src/RapMap.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <iostream>
 #include <fstream>
 #include <vector>
diff --git a/src/RapMapFileSystem.cpp b/src/RapMapFileSystem.cpp
index 66e246b..c73bce9 100644
--- a/src/RapMapFileSystem.cpp
+++ b/src/RapMapFileSystem.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include "RapMapFileSystem.hpp"
 #include <sys/stat.h>
 
diff --git a/src/RapMapIndex.cpp b/src/RapMapIndex.cpp
index 8efb45f..baf71ae 100644
--- a/src/RapMapIndex.cpp
+++ b/src/RapMapIndex.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include "RapMapIndex.hpp"
 
 RapMapIndex::RapMapIndex() {}
diff --git a/src/RapMapIndexer.cpp b/src/RapMapIndexer.cpp
index c8608ca..cbad057 100644
--- a/src/RapMapIndexer.cpp
+++ b/src/RapMapIndexer.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <iostream>
 #include <mutex>
 #include <vector>
@@ -17,10 +38,9 @@
 #include "xxhash.h"
 #include "btree/btree_map.h"
 
+#include "FastxParser.hpp"
 // Jellyfish 2 include
 #include "jellyfish/mer_dna.hpp"
-#include "jellyfish/stream_manager.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
 
 #include "RapMapUtils.hpp"
 #include "RapMapFileSystem.hpp"
@@ -37,8 +57,7 @@
 
 #include <chrono>
 
-using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
-using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
 using TranscriptID = uint32_t;
 using TranscriptIDVector = std::vector<TranscriptID>;
 using KmerIDMap = std::vector<TranscriptIDVector>;
@@ -200,12 +219,13 @@ void processTranscripts(ParserT* parser,
 
     {
         ScopedTimer timer;
-        while(true) {
-            typename ParserT::job j(*parser);
-            if(j.is_empty()) break;
+	// Get the read group by which this thread will
+	// communicate with the parser (*once per-thread*)
+	auto rg = parser->getReadGroup();
 
-            for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
-                std::string& readStr = j->data[i].seq;
+	while (parser->refill(rg)) {
+	  for (auto& read : rg) { // for each sequence
+	    std::string& readStr = read.seq; 
 
 		// Do Kallisto-esque clipping of polyA tails
 		if (readStr.size() > polyAClipLength and
@@ -224,7 +244,7 @@ void processTranscripts(ParserT* parser,
                 uint32_t readLen  = readStr.size();
                 uint32_t txpIndex = n++;
                 transcriptLengths.push_back(readLen);
-                auto& recHeader = j->data[i].header;
+                auto& recHeader = read.name;
                 transcriptNames.emplace_back(recHeader.substr(0, recHeader.find_first_of(" \t")));
 
                 rapmap::utils::my_mer mer;
@@ -267,7 +287,7 @@ void processTranscripts(ParserT* parser,
                         numKmers++;
                     }
                 }
-                transcriptSeqs.push_back(j->data[i].seq);
+                transcriptSeqs.push_back(read.seq);
                 if (n % 10000 == 0) {
                     std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
                 }
@@ -750,13 +770,17 @@ int rapMapIndex(int argc, char* argv[]) {
         rapmap::fs::MakeDir(indexDir.c_str());
     }
 
-    size_t maxReadGroup{1000}; // Number of reads in each "job"
-    size_t concurrentFile{2}; // Number of files to read simultaneously
-    size_t numThreads{2};
-    stream_manager streams(transcriptFiles.begin(), transcriptFiles.end(), concurrentFile);
+    size_t numThreads{1};
+
     std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
-    transcriptParserPtr.reset(new single_parser(4 * numThreads, maxReadGroup,
-                              concurrentFile, streams));
+    //transcriptParserPtr.reset(
+    //    new single_parser(4 * numThreads, maxReadGroup, concurrentFile, streams));
+
+    size_t numProd = 1;
+    transcriptParserPtr.reset(
+			      new single_parser(transcriptFiles, numThreads, numProd));
+
+    transcriptParserPtr->start();
     std::mutex iomutex;
     processTranscripts(transcriptParserPtr.get(), indexDir, iomutex);
     return 0;
diff --git a/src/RapMapMapper.cpp b/src/RapMapMapper.cpp
index 273261c..2e9e0c8 100644
--- a/src/RapMapMapper.cpp
+++ b/src/RapMapMapper.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <iostream>
 #include <mutex>
 #include <vector>
@@ -25,13 +46,13 @@
 #include "xxhash.h"
 
 #include "spdlog/spdlog.h"
-#include "spdlog/details/format.h"
+#include "spdlog/sinks/ostream_sink.h"
+#include "spdlog/fmt/ostr.h"
+#include "spdlog/fmt/fmt.h"
 
+#include "FastxParser.hpp"
 // Jellyfish 2 include
 #include "jellyfish/mer_dna.hpp"
-#include "jellyfish/stream_manager.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
-#include "jellyfish/hash_counter.hpp"
 
 #include "tclap/CmdLine.h"
 
@@ -57,10 +78,8 @@
 
 // STEP 1: declare the type of file handler and the read() function
 // KSEQ_INIT(int, read)
-
-using paired_parser = pair_sequence_parser<char**>;
-using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
-using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using paired_parser = fastx_parser::FastxParser<fastx_parser::ReadPair>;
+using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
 using TranscriptID = uint32_t;
 using TranscriptIDVector = std::vector<TranscriptID>;
 using KmerIDMap = std::vector<TranscriptIDVector>;
@@ -726,15 +745,20 @@ void processReadsSingle(single_parser* parser,
     SingleAlignmentFormatter<RapMapIndex*> formatter(&rmi);
 
     size_t readLen{0};
-
-    while(true) {
-        typename single_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
-        if(j.is_empty()) break;           // If got nothing, quit
-        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
-            readLen = j->data[i].seq.length();
+    // Get the read group by which this thread will
+    // communicate with the parser (*once per-thread*)
+    auto rg = parser->getReadGroup();
+
+    while (parser->refill(rg)) {
+      //while(true) {
+      //  typename single_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
+      //  if(j.is_empty()) break;                 // If we got nothing, then quit.
+      //  for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+      for (auto& read : rg) {
+            readLen = read.seq.length();
             ++hctr.numReads;
             hits.clear();
-            hitCollector(j->data[i].seq, hits, MateStatus::SINGLE_END);
+            hitCollector(read.seq, hits, MateStatus::SINGLE_END);
             /*
                std::set_intersection(leftHits.begin(), leftHits.end(),
                rightHits.begin(), rightHits.end(),
@@ -744,7 +768,7 @@ void processReadsSingle(single_parser* parser,
             hctr.totHits += numHits;
 
              if (hits.size() > 0 and !noOutput and hits.size() <= maxNumHits) {
-                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                rapmap::utils::writeAlignmentsToStream(read, formatter,
                         hctr, hits, sstream);
             }
 
@@ -776,7 +800,7 @@ void processReadsSingle(single_parser* parser,
         // Get rid of last newline
         if (!outStr.empty()) {
             outStr.pop_back();
-            outQueue->info() << std::move(outStr);
+            outQueue->info(std::move(outStr));
         }
 	    sstream.clear();
 	}
@@ -830,19 +854,21 @@ void processReadsPair(paired_parser* parser,
     // 0x3 means "orphaned" alignments for left and right
     // (currently not treated as orphan).
     uint32_t orphanStatus{0};
-    while(true) {
-        typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
-        if(j.is_empty()) break;           // If got nothing, quit
-        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+
+    // Get the read group by which this thread will
+    // communicate with the parser (*once per-thread*)
+    auto rg = parser->getReadGroup();
+    while (parser->refill(rg)) {
+      for (auto& rpair : rg) {
 	    tooManyHits = false;
-            readLen = j->data[i].first.seq.length();
+            readLen = rpair.first.seq.length();
             ++hctr.numReads;
             jointHits.clear();
             leftHits.clear();
             rightHits.clear();
-    	    hitCollector(j->data[i].first.seq,
+    	    hitCollector(rpair.first.seq,
                         leftHits, MateStatus::PAIRED_END_LEFT);
-            hitCollector(j->data[i].second.seq,
+            hitCollector(rpair.second.seq,
                         rightHits, MateStatus::PAIRED_END_RIGHT);
 
             rapmap::utils::mergeLeftRightHits(
@@ -851,7 +877,7 @@ void processReadsPair(paired_parser* parser,
 
 
             if (jointHits.size() > 0 and !noOutput and jointHits.size() <= maxNumHits) {
-                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                rapmap::utils::writeAlignmentsToStream(rpair, formatter,
                                                        hctr, jointHits, sstream);
             }
 
@@ -889,7 +915,7 @@ void processReadsPair(paired_parser* parser,
         // Get rid of last newline
         if (!outStr.empty()){
             outStr.pop_back();
-            outQueue->info() << std::move(outStr);
+            outQueue->info(std::move(outStr));
         }
 	    sstream.clear();
 	}
@@ -1044,17 +1070,9 @@ int rapMapMap(int argc, char* argv[]) {
 		    std::exit(1);
 		}
 
-		size_t numFiles = read1Vec.size() + read2Vec.size();
-		char** pairFileList = new char*[numFiles];
-		for (size_t i = 0; i < read1Vec.size(); ++i) {
-		    pairFileList[2*i] = const_cast<char*>(read1Vec[i].c_str());
-		    pairFileList[2*i+1] = const_cast<char*>(read2Vec[i].c_str());
-		}
-		size_t maxReadGroup{1000}; // Number of reads in each "job"
-		size_t concurrentFile{2}; // Number of files to read simultaneously
-		pairParserPtr.reset(new paired_parser(4 * nthread, maxReadGroup,
-			    concurrentFile,
-			    pairFileList, pairFileList+numFiles));
+		uint32_t nprod = (read1Vec.size() > 1) ? 2 : 1; 
+		pairParserPtr.reset(new paired_parser(read1Vec, read2Vec, nthread, nprod));
+		pairParserPtr->start();
 
 		/** Create the threads depending on the collector type **/
 		if (endCollectorSwitch.getValue()) {
@@ -1086,18 +1104,13 @@ int rapMapMap(int argc, char* argv[]) {
 		}
 
 		for (auto& t : threads) { t.join(); }
-		delete [] pairFileList;
 	    } else {
 		std::vector<std::thread> threads;
 		std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(unmatedReads.getValue(), ',');
-		size_t maxReadGroup{1000}; // Number of reads in each "job"
-		size_t concurrentFile{1};
-		stream_manager streams( unmatedReadVec.begin(), unmatedReadVec.end(),
-			concurrentFile);
-		singleParserPtr.reset(new single_parser(4 * nthread,
-			    maxReadGroup,
-			    concurrentFile,
-			    streams));
+
+		uint32_t nprod = (unmatedReadVec.size() > 1) ? 2 : 1; 
+		singleParserPtr.reset(new single_parser(unmatedReadVec, nthread, nprod));
+		singleParserPtr->start();
 
 		/** Create the threads depending on the collector type **/
 		if (endCollectorSwitch.getValue()) {
diff --git a/src/RapMapSAIndex.cpp b/src/RapMapSAIndex.cpp
index 2e97122..3036e8d 100644
--- a/src/RapMapSAIndex.cpp
+++ b/src/RapMapSAIndex.cpp
@@ -1,4 +1,26 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 #include "RapMapSAIndex.hpp"
 #include "IndexHeader.hpp"
 #include <cereal/types/unordered_map.hpp>
@@ -11,24 +33,51 @@
 #include <future>
 #include <thread>
 
+/*
+void set_empty_key(spp::sparse_hash_map<uint64_t,
+                       rapmap::utils::SAInterval<IndexT>,
+		   rapmap::utils::KmerKeyHasher>& khash,
+		   uint64_t k) {
+}
+
+void set_empty_key(spp::sparse_hash_map<uint64_t,
+                       rapmap::utils::SAInterval<IndexT>,
+		   rapmap::utils::KmerKeyHasher>& khash,
+		   uint64_t k) {
+}
+*/
+
+    // Set the SA and text pointer if this is a perfect hash
+template <typename IndexT>
+void setPerfectHashPointers(RegHashT<uint64_t,
+                            rapmap::utils::SAInterval<IndexT>,
+                            rapmap::utils::KmerKeyHasher>& khash, std::vector<IndexT>& SA, std::string& seq) {
+    // do nothing
+}
+
+template <typename IndexT>
+void setPerfectHashPointers(PerfectHashT<uint64_t,
+                            rapmap::utils::SAInterval<IndexT>>& khash, std::vector<IndexT>& SA, std::string& seq) {
+    khash.setSAPtr(&SA);
+    khash.setTextPtr(seq.c_str(), seq.length());
+}
+
 // These are **free** functions that are used for loading the
 // appropriate type of hash.
 template <typename IndexT>
 bool loadHashFromIndex(const std::string& indexDir,
-                       google::dense_hash_map<uint64_t,
+                       RegHashT<uint64_t,
                        rapmap::utils::SAInterval<IndexT>,
                        rapmap::utils::KmerKeyHasher>& khash) {
-      khash.set_empty_key(std::numeric_limits<uint64_t>::max());
-      std::ifstream hashStream(indexDir + "hash.bin");
-      khash.unserialize(typename google::dense_hash_map<uint64_t,
-                      rapmap::utils::SAInterval<IndexT>,
-                      rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream);
+      std::ifstream hashStream(indexDir + "hash.bin", std::ios::binary);
+      khash.unserialize(typename spp_utils::pod_hash_serializer<uint64_t, rapmap::utils::SAInterval<IndexT>>(),
+			&hashStream);
       return true;
 }
 
 template <typename IndexT>
 bool loadHashFromIndex(const std::string& indexDir,
-		       BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> & h) {
+		       PerfectHashT<uint64_t, rapmap::utils::SAInterval<IndexT>> & h) {
     std::string hashBase = indexDir + "hash_info";
     h.load(hashBase);
     return true;
@@ -58,46 +107,13 @@ bool RapMapSAIndex<IndexT, HashT>::load(const std::string& indDir) {
     }
     indexStream.close();
     uint32_t idxK = h.kmerLen();
+    rapmap::utils::my_mer::k(idxK);
 
     // This part takes the longest, so do it in it's own asynchronous task
     std::future<bool> loadingHash = std::async(std::launch::async, [this, logger, indDir]() -> bool {
-	   if (loadHashFromIndex(indDir, khash)) {
-                logger->info("Successfully loaded position hash");
-                return true;
-            } else {
-                logger->error("Failed to load position hash!");
-                return false;
-            }
-	// If using a google dense hash
-        //this->khash.set_empty_key(std::numeric_limits<uint64_t>::max());
-        //uint32_t k = 31;
-        //std::ifstream hashStream(indDir + "hash.bin");
-        //{
-
-	  //logger->info("Loading Position Hash");
-            //khash.load(hashStream);
-            //cereal::BinaryInputArchive hashArchive(hashStream);
-            //hashArchive(k);
-            //khash.unserialize(typename google::dense_hash_map<uint64_t,
-            //        rapmap::utils::SAInterval<IndexT>,
-            //        rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream);
-            //hashArchive(khash);
-	   //}
-        //hashStream.close();
-        //std::cerr << "had " << khash.size() << " entries\n";
-        //return true;
+	return loadHashFromIndex(indDir, khash);
     });
 
-    /*
-    std::ifstream intervalStream(indDir + "kintervals.bin");
-    {
-        logger->info("Loading k-mer intervals");
-        cereal::BinaryInputArchive intervalArchive(intervalStream);
-        intervalArchive(kintervals);
-    }
-    intervalStream.close();
-    */
-
     std::ifstream saStream(indDir + "sa.bin");
     {
         logger->info("Loading Suffix Array ");
@@ -161,17 +177,17 @@ bool RapMapSAIndex<IndexT, HashT>::load(const std::string& indDir) {
         logger->error("Failed to load hash!");
         std::exit(1);
     }
-    rapmap::utils::my_mer::k(idxK);
-
+    // Set the SA and text pointers if this is a perfect hash
+    setPerfectHashPointers(khash, SA, seq);
     logger->info("Done loading index");
     return true;
 }
 
-template class RapMapSAIndex<int32_t,  google::dense_hash_map<uint64_t,
+template class RapMapSAIndex<int32_t,  RegHashT<uint64_t,
                       rapmap::utils::SAInterval<int32_t>,
                       rapmap::utils::KmerKeyHasher>>;
-template class RapMapSAIndex<int64_t,  google::dense_hash_map<uint64_t,
+template class RapMapSAIndex<int64_t,  RegHashT<uint64_t,
                       rapmap::utils::SAInterval<int64_t>,
                       rapmap::utils::KmerKeyHasher>>;
-template class RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
-template class RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+template class RapMapSAIndex<int32_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+template class RapMapSAIndex<int64_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
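
The RapMapSAIndex.cpp hunks above select the loading path purely by overload
resolution: loadHashFromIndex() and setPerfectHashPointers() each have a
RegHashT overload and a PerfectHashT overload, and only the perfect-hash
variants wire the table to the suffix array and the concatenated text. A
minimal, self-contained sketch of that dispatch pattern, using hypothetical
stand-in types (RegularHash/PerfectHash) rather than the real headers:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in: an ordinary hash table needs no back-pointers.
    template <typename K, typename V> struct RegularHash {};

    // Stand-in: a perfect hash that resolves keys via the suffix array and text.
    template <typename K, typename V> struct PerfectHash {
        const std::vector<int>* sa{nullptr};
        const char* text{nullptr};
        std::size_t textLen{0};
        void setSAPtr(const std::vector<int>* s) { sa = s; }
        void setTextPtr(const char* t, std::size_t l) { text = t; textLen = l; }
    };

    // Overload resolution picks the right behavior at compile time.
    template <typename K, typename V>
    void setPerfectHashPointers(RegularHash<K, V>&, std::vector<int>&, std::string&) {
        // no-op for the regular hash
    }

    template <typename K, typename V>
    void setPerfectHashPointers(PerfectHash<K, V>& h, std::vector<int>& SA, std::string& seq) {
        h.setSAPtr(&SA);
        h.setTextPtr(seq.c_str(), seq.length());
    }

    int main() {
        std::vector<int> SA{4, 0, 1, 2, 3};   // suffix array of "ACGT$"
        std::string seq{"ACGT$"};
        RegularHash<std::uint64_t, int> rh;
        PerfectHash<std::uint64_t, int> ph;
        setPerfectHashPointers(rh, SA, seq);  // does nothing
        setPerfectHashPointers(ph, SA, seq);  // stores the SA and text pointers
        std::cout << (ph.sa == &SA) << '\n';  // prints 1
        return 0;
    }
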
diff --git a/src/RapMapSAIndexer.cpp b/src/RapMapSAIndexer.cpp
index 83a1491..955d04b 100644
--- a/src/RapMapSAIndexer.cpp
+++ b/src/RapMapSAIndexer.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <algorithm>
 #include <cctype>
 #include <cstdio>
@@ -21,14 +42,14 @@
 #include <cereal/types/vector.hpp>
 
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 #include "xxhash.h"
 
 #include "spdlog/spdlog.h"
 
+#include "FastxParser.hpp"
 // Jellyfish 2 include
 #include "jellyfish/mer_dna.hpp"
-#include "jellyfish/stream_manager.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
 
 #include "divsufsort.h"
 #include "divsufsort64.h"
@@ -47,15 +68,11 @@
 #include "jellyfish/thread_exec.hpp"
 #include "rank9b.h"
 
-#include "sparsehash/dense_hash_map"
-
 #include "IndexHeader.hpp"
 
 #include <chrono>
 
-using stream_manager =
-    jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
-using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
 using TranscriptID = uint32_t;
 using TranscriptIDVector = std::vector<TranscriptID>;
 using KmerIDMap = std::vector<TranscriptIDVector>;
@@ -108,7 +125,10 @@ template <typename IndexT>
 bool buildPerfectHash(const std::string& outputDir, std::string& concatText,
                       size_t tlen, uint32_t k, std::vector<IndexT>& SA,
                       uint32_t numHashThreads) {
-  BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
+  //BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
+  PerfectHashT<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
+  intervals.setSAPtr(&SA);
+  intervals.setTextPtr(concatText.data(), concatText.length());
 
   // The start and stop of the current interval
   IndexT start = 0, stop = 0;
@@ -235,11 +255,13 @@ template <typename IndexT>
 bool buildHash(const std::string& outputDir, std::string& concatText,
                size_t tlen, uint32_t k, std::vector<IndexT>& SA) {
   // Now, build the k-mer lookup table
-  google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<IndexT>,
-                         rapmap::utils::KmerKeyHasher>
-      khash;
-  khash.set_empty_key(std::numeric_limits<uint64_t>::max());
-
+    // The base type should always be uint64_t
+    using WordT = rapmap::utils::my_mer::base_type;
+    RegHashT<WordT, rapmap::utils::SAInterval<IndexT>,
+                         rapmap::utils::KmerKeyHasher> khash;
+    //RegHashT<uint64_t, IndexT, rapmap::utils::KmerKeyHasher> overflowhash;
+    //std::cerr << "sizeof(SAInterval<IndexT>) = " << sizeof(rapmap::utils::SAInterval<IndexT>) << '\n';
+    //khash.set_empty_key(std::numeric_limits<uint64_t>::max());
   // The start and stop of the current interval
   IndexT start = 0, stop = 0;
   // An iterator to the beginning of the text
@@ -261,8 +283,8 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
       if (nextKmer != currentKmer) {
         if (currentKmer.length() == k and
             currentKmer.find_first_of('$') == std::string::npos) {
-          mer = rapmap::utils::my_mer(currentKmer);
-          auto bits = mer.get_bits(0, 2 * k);
+          mer = currentKmer;
+          auto bits = mer.word(0);
           auto hashIt = khash.find(bits);
           if (hashIt == khash.end()) {
             if (start > 1) {
@@ -286,14 +308,24 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
             }
 
             khash[bits] = {start, stop};
+            /*
+            IndexT len = stop - start;
+            bool overflow = (len >= std::numeric_limits<uint8_t>::max());
+            uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() : 
+                static_cast<uint8_t>(len);
+            khash[bits] = {start, blen};
+            if (overflow) {
+                overflowhash[bits] = len;
+            }
+            */
           } else {
             std::cerr << "\nERROR (1): trying to add same suffix "
                       << currentKmer << " (len = " << currentKmer.length()
                       << ") multiple times!\n";
             auto prevInt = hashIt->second;
-            std::cerr << "existing interval is [" << prevInt.begin << ", "
-                      << prevInt.end << ")\n";
-            for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+            std::cerr << "existing interval is [" << prevInt.begin() << ", "
+                      << prevInt.end() << ")\n";
+            for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
               auto suff = concatText.substr(SA[x], k);
               for (auto c : suff) {
                 std::cerr << "*" << c << "*";
@@ -320,8 +352,8 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
       // in the hash.
       if (currentKmer.length() == k and
           currentKmer.find_first_of('$') == std::string::npos) {
-        mer = rapmap::utils::my_mer(currentKmer);
-        auto bits = mer.get_bits(0, 2 * k);
+        mer = currentKmer.c_str();
+        auto bits = mer.word(0);
         auto hashIt = khash.find(bits);
         if (hashIt == khash.end()) {
           if (start > 2) {
@@ -336,13 +368,23 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
             }
           }
           khash[bits] = {start, stop};
+          /*
+          IndexT len = stop - start;
+          bool overflow = (len >= std::numeric_limits<uint8_t>::max());
+          uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() : 
+              static_cast<uint8_t>(len);
+          khash[bits] = {start, blen};
+          if (overflow) {
+              overflowhash[bits] = len;
+          }
+          */
         } else {
           std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer
                     << "multiple times!\n";
           auto prevInt = hashIt->second;
-          std::cerr << "existing interval is [" << prevInt.begin << ", "
-                    << prevInt.end << ")\n";
-          for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+          std::cerr << "existing interval is [" << prevInt.begin() << ", "
+                    << prevInt.end() << ")\n";
+          for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
             std::cerr << concatText.substr(SA[x], k) << "\n";
           }
           std::cerr << "new interval is [" << start << ", " << stop << ")\n";
@@ -364,8 +406,18 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
   if (start < tlen) {
     if (currentKmer.length() == k and
         currentKmer.find_first_of('$') != std::string::npos) {
-      mer = rapmap::utils::my_mer(currentKmer);
-      khash[mer.get_bits(0, 2 * k)] = {start, stop};
+      mer = currentKmer.c_str();
+      khash[mer.word(0)] = {start, stop};
+      /*
+      IndexT len = stop - start;
+      bool overflow = (len >= std::numeric_limits<uint8_t>::max());
+      uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() : 
+          static_cast<uint8_t>(len);
+      khash[mer.get_bits(0, 2 * k)] = {start, blen};
+      if (overflow) {
+          overflowhash[mer.get_bits(0, 2 * k)] = len;
+      }
+      */
     }
   }
   std::cerr << "\nkhash had " << khash.size() << " keys\n";
@@ -373,13 +425,8 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
   {
     ScopedTimer timer;
     std::cerr << "saving hash to disk . . . ";
-    cereal::BinaryOutputArchive hashArchive(hashStream);
-    // hashArchive(k);
-    khash.serialize(typename google::dense_hash_map<
-                        uint64_t, rapmap::utils::SAInterval<IndexT>,
-                        rapmap::utils::KmerKeyHasher>::NopointerSerializer(),
+    khash.serialize(typename spp_utils::pod_hash_serializer<WordT, rapmap::utils::SAInterval<IndexT>>(),
                     &hashStream);
-    // hashArchive(khash);
     std::cerr << "done\n";
   }
   hashStream.close();
@@ -392,7 +439,9 @@ bool buildHash(const std::string& outputDir, std::string& concatText,
 template <typename ParserT> //, typename CoverageCalculator>
 void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
                         bool noClipPolyA, bool usePerfectHash,
-                        uint32_t numHashThreads, std::mutex& iomutex,
+                        uint32_t numHashThreads, 
+                        std::string& sepStr,
+                        std::mutex& iomutex,
                         std::shared_ptr<spdlog::logger> log) {
   // Seed with a real random value, if available
   std::random_device rd;
@@ -435,12 +484,13 @@ void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
   fmt::MemoryWriter txpSeqStream;
   {
     ScopedTimer timer;
-    while (true) {
-      typename ParserT::job j(*parser);
-      if (j.is_empty())
-        break;
-      for (size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
-        std::string& readStr = j->data[i].seq;
+    // Get the read group by which this thread will
+    // communicate with the parser (*once per-thread*)
+    auto rg = parser->getReadGroup();
+
+    while (parser->refill(rg)) {
+      for (auto& read : rg) { // for each sequence
+        std::string& readStr = read.seq;
         readStr.erase(
             std::remove_if(readStr.begin(), readStr.end(),
                            [](const char a) -> bool { return !(isprint(a)); }),
@@ -470,7 +520,7 @@ void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
             if (newEndPos == std::string::npos) {
               log->warn("Entry with header [{}] appeared to be all A's; it "
                         "will be removed from the index!",
-                        j->data[i].header);
+                        read.name);
               readStr.resize(0);
             } else {
               readStr.resize(newEndPos + 1);
@@ -482,22 +532,26 @@ void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
         readLen = readStr.size();
         // If the transcript was completely removed during clipping, don't
         // include it in the index.
-        if (readStr.size() >= k) {
+        if (readStr.size() > 0) {
           // If we're suspicious the user has fed in a *genome* rather
           // than a transcriptome, say so here.
           if (readStr.size() >= tooLong) {
             log->warn("Entry with header [{}] was longer than {} nucleotides.  "
                       "Are you certain that "
                       "we are indexing a transcriptome and not a genome?",
-                      j->data[i].header, tooLong);
+                      read.name, tooLong);
+          } else if (readStr.size() < k) {
+            log->warn("Entry with header [{}], had length less than "
+                      "the k-mer length of {} (perhaps after poly-A clipping)",
+                      read.name, k);
           }
 
           uint32_t txpIndex = n++;
 
           // The name of the current transcript
-          auto& recHeader = j->data[i].header;
+          auto& recHeader = read.name;
           transcriptNames.emplace_back(
-              recHeader.substr(0, recHeader.find_first_of(" \t")));
+              recHeader.substr(0, recHeader.find_first_of(sepStr)));
 
           // The position at which this transcript starts
           transcriptStarts.push_back(currIndex);
@@ -507,9 +561,9 @@ void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
           currIndex += readLen + 1;
           onePos.push_back(currIndex - 1);
         } else {
-            log->warn("Discarding entry with header [{}], since it was shorter than "
-                      "the k-mer length of {} (perhaps after poly-A clipping)", 
-                      j->data[i].header, k);
+            log->warn("Discarding entry with header [{}], since it had length 0 "
+                      "(perhaps after poly-A clipping)",
+                      read.name);
         }
       }
       if (n % 10000 == 0) {
@@ -658,13 +712,24 @@ int rapMapSAIndex(int argc, char* argv[]) {
       "path");
   TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index",
                                  false, 31, "positive integer less than 32");
+
+  TCLAP::ValueArg<std::string> customSeps("s", "headerSep", "Instead of a space or tab, break the header at the first "
+                                          "occurrence of any character in this string, and use the token before "
+                                          "that separator as the transcript name", false, " \t", "string");
   TCLAP::SwitchArg noClip(
       "n", "noClip",
       "Don't clip poly-A tails from the ends of target sequences", false);
   TCLAP::SwitchArg perfectHash(
-      "p", "perfectHash", "Use a perfect hash instead of dense hash --- "
+      "p", "perfectHash", "Use a perfect hash instead of sparse hash --- "
                           "somewhat slows construction, but uses less memory",
       false);
+  /*
+  TCLAP::SwitchArg perfectHash(
+      "f", "frugalPerfectHash", "Use a frugal variant of the perfect hash --- "
+                          "this will considerably slow construction, and somewhat slow lookup, but "
+                          "hash construction and the subsequent mapping will require the least memory."
+      false);
+  */
   TCLAP::ValueArg<uint32_t> numHashThreads(
       "x", "numThreads",
       "Use this many threads to build the perfect hash function", false, 4,
@@ -674,6 +739,7 @@ int rapMapSAIndex(int argc, char* argv[]) {
   cmd.add(kval);
   cmd.add(noClip);
   cmd.add(perfectHash);
+  cmd.add(customSeps);
   cmd.add(numHashThreads);
   cmd.parse(argc, argv);
 
@@ -681,6 +747,8 @@ int rapMapSAIndex(int argc, char* argv[]) {
   std::string transcriptFile(transcripts.getValue());
   std::vector<std::string> transcriptFiles({transcriptFile});
 
+  std::string sepStr = customSeps.getValue();
+
   uint32_t k = kval.getValue();
   if (k % 2 == 0) {
     std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
@@ -712,20 +780,19 @@ int rapMapSAIndex(int argc, char* argv[]) {
   auto fileLog = spdlog::create("fileLog", {fileSink});
   auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
 
-  size_t maxReadGroup{1000}; // Number of reads in each "job"
-  size_t concurrentFile{2};  // Number of files to read simultaneously
-  size_t numThreads{2};
-  stream_manager streams(transcriptFiles.begin(), transcriptFiles.end(),
-                         concurrentFile);
+  size_t numThreads{1};
+
   std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
-  transcriptParserPtr.reset(
-      new single_parser(4 * numThreads, maxReadGroup, concurrentFile, streams));
 
+  size_t numProd = 1;
+  transcriptParserPtr.reset(
+			    new single_parser(transcriptFiles, numThreads, numProd));
+  transcriptParserPtr->start();
   bool noClipPolyA = noClip.getValue();
   bool usePerfectHash = perfectHash.getValue();
   uint32_t numPerfectHashThreads = numHashThreads.getValue();
   std::mutex iomutex;
   indexTranscriptsSA(transcriptParserPtr.get(), indexDir, noClipPolyA,
-                     usePerfectHash, numPerfectHashThreads, iomutex, jointLog);
+                     usePerfectHash, numPerfectHashThreads, sepStr, iomutex, jointLog);
   return 0;
 }
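
Both the indexer above and the mapper below replace the Jellyfish
stream_manager/whole_sequence_parser pair with FastxParser. The consumption
pattern visible in the hunks is: construct the parser with the input files and
the consumer/producer counts, call start(), then have each worker obtain a read
group once via getReadGroup() and loop on refill(). A condensed sketch of a
single consumer, assuming the FastxParser.hpp shipped with this release and a
hypothetical input file transcripts.fa (shutdown and error handling omitted):

    #include <cstddef>
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    #include "FastxParser.hpp"

    using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;

    int main() {
        std::vector<std::string> files{"transcripts.fa"};  // hypothetical input
        std::size_t numConsumers{1}, numProducers{1};
        std::unique_ptr<single_parser> parser(
            new single_parser(files, numConsumers, numProducers));
        parser->start();

        // One read group per consumer thread; refill() returns false once the
        // producers have exhausted the input.
        auto rg = parser->getReadGroup();
        std::size_t bases{0};
        while (parser->refill(rg)) {
            for (auto& read : rg) {  // ReadSeq exposes .name and .seq
                bases += read.seq.size();
            }
        }
        std::cout << "total bases: " << bases << '\n';
        return 0;
    }
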
diff --git a/src/RapMapSAMapper.cpp b/src/RapMapSAMapper.cpp
index 7b265d2..ac2d506 100644
--- a/src/RapMapSAMapper.cpp
+++ b/src/RapMapSAMapper.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <iostream>
 #include <mutex>
 #include <vector>
@@ -32,13 +53,12 @@
 
 #include "spdlog/spdlog.h"
 #include "spdlog/sinks/ostream_sink.h"
-#include "spdlog/details/format.h"
+#include "spdlog/fmt/ostr.h"
+#include "spdlog/fmt/fmt.h"
 
 // Jellyfish 2 include
+#include "FastxParser.hpp"
 #include "jellyfish/mer_dna.hpp"
-#include "jellyfish/stream_manager.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
-#include "jellyfish/hash_counter.hpp"
 
 #include "tclap/CmdLine.h"
 
@@ -46,8 +66,10 @@
 #include "kseq.h"
 }
 */
+
 #include "stringpiece.h"
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 #include "PairSequenceParser.hpp"
 #include "PairAlignmentFormatter.hpp"
 #include "SingleAlignmentFormatter.hpp"
@@ -63,9 +85,8 @@
 
 //#define __TRACK_CORRECT__
 
-using paired_parser = pair_sequence_parser<char**>;
-using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
-using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using paired_parser = fastx_parser::FastxParser<fastx_parser::ReadPair>;
+using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
 using TranscriptID = uint32_t;
 using TranscriptIDVector = std::vector<TranscriptID>;
 using KmerIDMap = std::vector<TranscriptIDVector>;
@@ -91,21 +112,42 @@ using ProcessedHit = rapmap::utils::ProcessedHit;
 using QuasiAlignment = rapmap::utils::QuasiAlignment;
 using FixedWriter = rapmap::utils::FixedWriter;
 
+struct MappingOpts {
+    std::string index;
+    std::string read1;
+    std::string read2;
+    std::string unmatedReads;
+    uint32_t numThreads{1};
+    uint32_t maxNumHits{200};
+    std::string outname;
+    double quasiCov{0.0};
+    bool pairedEnd{false};
+    bool noOutput{true};
+    bool sensitive{false};
+    bool strictCheck{false};
+    bool fuzzy{false};
+    bool consistentHits{false};
+    bool quiet{false};
+};
 
-
-template <typename RapMapIndexT, typename CollectorT, typename MutexT>
+template <typename RapMapIndexT, typename MutexT>
 void processReadsSingleSA(single_parser * parser,
                           RapMapIndexT& rmi,
-                          CollectorT& hitCollector,
                           MutexT* iomutex,
                           std::shared_ptr<spdlog::logger> outQueue,
                           HitCounters& hctr,
-                          uint32_t maxNumHits,
-                          bool noOutput,
-                          bool strictCheck,
-                          bool consistentHits) {
-
+                          MappingOpts* mopts) {
     using OffsetT = typename RapMapIndexT::IndexType;
+
+    SACollector<RapMapIndexT> hitCollector(&rmi);
+    if (mopts->sensitive) {
+        hitCollector.disableNIP();
+    }
+    hitCollector.setStrictCheck(mopts->strictCheck);
+    if (mopts->quasiCov > 0.0) {
+        hitCollector.setCoverageRequirement(mopts->quasiCov);
+    }
+
     auto& txpNames = rmi.txpNames;
     auto& txpLens = rmi.txpLens;
     uint32_t n{0};
@@ -125,31 +167,37 @@ void processReadsSingleSA(single_parser * parser,
     SASearcher<RapMapIndexT> saSearcher(&rmi);
 
     uint32_t orphanStatus{0};
-    while(true) {
-        typename single_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
-        if(j.is_empty()) break;                 // If we got nothing, then quit.
-        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
-            readLen = j->data[i].seq.length();
+    // Get the read group by which this thread will
+    // communicate with the parser (*once per-thread*)
+    auto rg = parser->getReadGroup();
+
+    while (parser->refill(rg)) {
+      //while(true) {
+      //  typename single_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
+      //  if(j.is_empty()) break;                 // If we got nothing, then quit.
+      //  for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+      for (auto& read : rg) {
+	    readLen = read.seq.length();//j->data[i].seq.length();
             ++hctr.numReads;
             hits.clear();
-            hitCollector(j->data[i].seq, hits, saSearcher, MateStatus::SINGLE_END, strictCheck, consistentHits);
+            hitCollector(read.seq, hits, saSearcher, MateStatus::SINGLE_END, mopts->consistentHits);
             auto numHits = hits.size();
             hctr.totHits += numHits;
 
-	    if (hits.size() > 0 and !noOutput and hits.size() <= maxNumHits) {
+	    if (hits.size() > 0 and !mopts->noOutput and hits.size() <= mopts->maxNumHits) {
                 /*
                 std::sort(hits.begin(), hits.end(),
                             [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                                 return a.tid < b.tid;
                             });
                 */
-                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                rapmap::utils::writeAlignmentsToStream(read, formatter,
                                                        hctr, hits, sstream);
             }
 
             if (hctr.numReads > hctr.lastPrint + 1000000) {
         		hctr.lastPrint.store(hctr.numReads.load());
-                if (iomutex->try_lock()){
+                if (!mopts->quiet and iomutex->try_lock()){
                     if (hctr.numReads > 0) {
 #if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
                         std::cerr << "\033[F\033[F\033[F";
@@ -171,12 +219,12 @@ void processReadsSingleSA(single_parser * parser,
         } // for all reads in this job
 
         // DUMP OUTPUT
-        if (!noOutput) {
+        if (!mopts->noOutput) {
             std::string outStr(sstream.str());
             // Get rid of last newline
             if (!outStr.empty()) {
                 outStr.pop_back();
-                outQueue->info() << std::move(outStr);
+                outQueue->info(std::move(outStr));
             }
             sstream.clear();
             /*
@@ -195,21 +243,24 @@ void processReadsSingleSA(single_parser * parser,
 /**
  *  Map reads from a collection of paired-end files.
  */
-template <typename RapMapIndexT, typename CollectorT, typename MutexT>
+template <typename RapMapIndexT, typename MutexT>
 void processReadsPairSA(paired_parser* parser,
                         RapMapIndexT& rmi,
-                        CollectorT& hitCollector,
                         MutexT* iomutex,
                         std::shared_ptr<spdlog::logger> outQueue,
                         HitCounters& hctr,
-                        uint32_t maxNumHits,
-                        bool noOutput,
-                        bool strictCheck,
-                        bool nonStrictMerge,
-                        bool consistentHits) {
-
+                        MappingOpts* mopts) {
     using OffsetT = typename RapMapIndexT::IndexType;
 
+    SACollector<RapMapIndexT> hitCollector(&rmi);
+    if (mopts->sensitive) {
+        hitCollector.disableNIP();
+    }
+    hitCollector.setStrictCheck(mopts->strictCheck);
+    if (mopts->quasiCov > 0.0) {
+        hitCollector.setCoverageRequirement(mopts->quasiCov);
+    }
+
     auto& txpNames = rmi.txpNames;
     auto& txpLens = rmi.txpLens;
     uint32_t n{0};
@@ -232,50 +283,57 @@ void processReadsPairSA(paired_parser* parser,
     SASearcher<RapMapIndexT> saSearcher(&rmi);
 
     uint32_t orphanStatus{0};
-    while(true) {
-        typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
-        if(j.is_empty()) break;                 // If we got nothing, quit
-        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
-		    tooManyHits = false;
-            readLen = j->data[i].first.seq.length();
+
+    // Get the read group by which this thread will
+    // communicate with the parser (*once per-thread*)
+    auto rg = parser->getReadGroup();
+
+    while (parser->refill(rg)) {
+      //while(true) {
+      //typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
+      //if(j.is_empty()) break;                 // If we got nothing, quit
+      //  for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+      for (auto& rpair : rg) {
+	tooManyHits = false;
+	    readLen = rpair.first.seq.length();
             ++hctr.numReads;
             jointHits.clear();
             leftHits.clear();
             rightHits.clear();
 
-            bool lh = hitCollector(j->data[i].first.seq,
+            bool lh = hitCollector(rpair.first.seq,
                                    leftHits, saSearcher,
                                    MateStatus::PAIRED_END_LEFT,
-                                   strictCheck,
-                                   consistentHits);
+                                   mopts->consistentHits);
 
-            bool rh = hitCollector(j->data[i].second.seq,
+            bool rh = hitCollector(rpair.second.seq,
                                    rightHits, saSearcher,
                                    MateStatus::PAIRED_END_RIGHT,
-                                   strictCheck,
-                                   consistentHits);
+                                   mopts->consistentHits);
 
-            if (nonStrictMerge) {
+            if (mopts->fuzzy) {
                 rapmap::utils::mergeLeftRightHitsFuzzy(
                         lh, rh,
                         leftHits, rightHits, jointHits,
-                        readLen, maxNumHits, tooManyHits, hctr);
+                        readLen, mopts->maxNumHits, tooManyHits, hctr);
 
             } else {
                 rapmap::utils::mergeLeftRightHits(
                         leftHits, rightHits, jointHits,
-                        readLen, maxNumHits, tooManyHits, hctr);
+                        readLen, mopts->maxNumHits, tooManyHits, hctr);
             }
 
+            hctr.totHits += jointHits.size();
+
             // If we have reads to output, and we're writing output.
-            if (jointHits.size() > 0 and !noOutput and jointHits.size() <= maxNumHits) {
-                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+            if (jointHits.size() > 0 and !mopts->noOutput and jointHits.size() <= mopts->maxNumHits) {
+                rapmap::utils::writeAlignmentsToStream(rpair, formatter,
                                                        hctr, jointHits, sstream);
             }
 
             if (hctr.numReads > hctr.lastPrint + 1000000) {
         		hctr.lastPrint.store(hctr.numReads.load());
-                if (iomutex->try_lock()) {
+                if (!mopts->quiet and iomutex->try_lock()) {
                     if (hctr.numReads > 0) {
                         std::cerr << "\r\r";
                     }
@@ -292,12 +350,12 @@ void processReadsPairSA(paired_parser* parser,
         } // for all reads in this job
 
         // DUMP OUTPUT
-        if (!noOutput) {
+        if (!mopts->noOutput) {
             std::string outStr(sstream.str());
             // Get rid of last newline
             if (!outStr.empty()) {
                 outStr.pop_back();
-                outQueue->info() << std::move(outStr);
+                outQueue->info(std::move(outStr));
             }
             sstream.clear();
 	        /*
@@ -320,27 +378,18 @@ bool spawnProcessReadsThreads(
                               MutexT& iomutex,
                               std::shared_ptr<spdlog::logger> outQueue,
                               HitCounters& hctr,
-                              uint32_t maxNumHits,
-                              bool noOutput,
-                              bool strictCheck,
-                              bool fuzzy,
-                              bool consistentHits) {
+                              MappingOpts* mopts) {
 
             std::vector<std::thread> threads;
-            SACollector<RapMapIndexT> saCollector(&rmi);
+
             for (size_t i = 0; i < nthread; ++i) {
-                threads.emplace_back(processReadsPairSA<RapMapIndexT, SACollector<RapMapIndexT>, MutexT>,
+                threads.emplace_back(processReadsPairSA<RapMapIndexT, MutexT>,
                                      parser,
                                      std::ref(rmi),
-                                     std::ref(saCollector),
                                      &iomutex,
                                      outQueue,
                                      std::ref(hctr),
-                                     maxNumHits,
-                                     noOutput,
-                                     strictCheck,
-                                     fuzzy,
-                                     consistentHits);
+                                     mopts);
             }
 
             for (auto& t : threads) { t.join(); }
@@ -355,25 +404,16 @@ bool spawnProcessReadsThreads(
                               MutexT& iomutex,
                               std::shared_ptr<spdlog::logger> outQueue,
                               HitCounters& hctr,
-                              uint32_t maxNumHits,
-                              bool noOutput,
-                              bool strictCheck,
-                              bool consistentHits) {
-
+                              MappingOpts* mopts) {
             std::vector<std::thread> threads;
-            SACollector<RapMapIndexT> saCollector(&rmi);
             for (size_t i = 0; i < nthread; ++i) {
-                threads.emplace_back(processReadsSingleSA<RapMapIndexT, SACollector<RapMapIndexT>, MutexT>,
+                threads.emplace_back(processReadsSingleSA<RapMapIndexT, MutexT>,
                                      parser,
                                      std::ref(rmi),
-                                     std::ref(saCollector),
                                      &iomutex,
                                      outQueue,
                                      std::ref(hctr),
-                                     maxNumHits,
-                                     noOutput,
-                                     strictCheck, 
-                                     consistentHits);
+                                     mopts);
             }
             for (auto& t : threads) { t.join(); }
             return true;
@@ -382,30 +422,19 @@ bool spawnProcessReadsThreads(
 template <typename RapMapIndexT>
 bool mapReads(RapMapIndexT& rmi,
 	      std::shared_ptr<spdlog::logger> consoleLog,
-	      TCLAP::ValueArg<std::string>& index,
-	      TCLAP::ValueArg<std::string>& read1,
-	      TCLAP::ValueArg<std::string>& read2,
-	      TCLAP::ValueArg<std::string>& unmatedReads,
-	      TCLAP::ValueArg<uint32_t>& numThreads,
-	      TCLAP::ValueArg<uint32_t>& maxNumHits,
-	      TCLAP::ValueArg<std::string>& outname,
-	      TCLAP::SwitchArg& noout,
-	      TCLAP::SwitchArg& strict,
-          TCLAP::SwitchArg& fuzzy, 
-          TCLAP::SwitchArg& consistent) {
-
-	std::cerr << "\n\n\n\n";
-
-	bool pairedEnd = (read1.isSet() or read2.isSet());
+          MappingOpts* mopts) {
+	if (!mopts->quiet) { std::cerr << "\n\n\n\n"; }
+
+	bool pairedEnd = mopts->pairedEnd;//(read1.isSet() or read2.isSet());
 	// from: http://stackoverflow.com/questions/366955/obtain-a-stdostream-either-from-stdcout-or-stdofstreamfile
 	// set either a file or cout as the output stream
 	std::streambuf* outBuf;
 	std::ofstream outFile;
 	bool haveOutputFile{false};
-	if (outname.getValue() == "") {
+	if (mopts->outname == "") {
 	    outBuf = std::cout.rdbuf();
 	} else {
-	    outFile.open(outname.getValue());
+	    outFile.open(mopts->outname);
 	    outBuf = outFile.rdbuf();
 	    haveOutputFile = true;
 	}
@@ -417,28 +446,27 @@ bool mapReads(RapMapIndexT& rmi,
 	size_t queueSize{268435456};
 	spdlog::set_async_mode(queueSize);
 	auto outputSink = std::make_shared<spdlog::sinks::ostream_sink_mt>(outStream);
-	std::shared_ptr<spdlog::logger> outLog = std::make_shared<spdlog::logger>("outLog", outputSink);
+	std::shared_ptr<spdlog::logger> outLog = std::make_shared<spdlog::logger>("rapmap::outLog", outputSink);
 	outLog->set_pattern("%v");
 
-	uint32_t nthread = numThreads.getValue();
+	uint32_t nthread = mopts->numThreads;
 	std::unique_ptr<paired_parser> pairParserPtr{nullptr};
 	std::unique_ptr<single_parser> singleParserPtr{nullptr};
 
-	if (!noout.getValue()) {
+	if (!mopts->noOutput) {
 	  rapmap::utils::writeSAMHeader(rmi, outLog);
 	}
 
-    bool strictCheck = strict.getValue();
-    bool fuzzyIntersection = fuzzy.getValue();
-    bool consistentHits = consistent.getValue();
+    //for the parser
+    size_t chunkSize{10000};
 	SpinLockT iomutex;
 	{
-	    ScopedTimer timer;
+	    ScopedTimer timer(!mopts->quiet);
 	    HitCounters hctrs;
 	    consoleLog->info("mapping reads . . . \n\n\n");
         if (pairedEnd) {
-            std::vector<std::string> read1Vec = rapmap::utils::tokenize(read1.getValue(), ',');
-            std::vector<std::string> read2Vec = rapmap::utils::tokenize(read2.getValue(), ',');
+            std::vector<std::string> read1Vec = rapmap::utils::tokenize(mopts->read1, ',');
+            std::vector<std::string> read2Vec = rapmap::utils::tokenize(mopts->read2, ',');
 
             if (read1Vec.size() != read2Vec.size()) {
                 consoleLog->error("The number of provided files for "
@@ -446,39 +474,23 @@ bool mapReads(RapMapIndexT& rmi,
                 std::exit(1);
             }
 
-            size_t numFiles = read1Vec.size() + read2Vec.size();
-            char** pairFileList = new char*[numFiles];
-            for (size_t i = 0; i < read1Vec.size(); ++i) {
-                pairFileList[2*i] = const_cast<char*>(read1Vec[i].c_str());
-                pairFileList[2*i+1] = const_cast<char*>(read2Vec[i].c_str());
-            }
-            size_t maxReadGroup{1000}; // Number of reads in each "job"
-            size_t concurrentFile{2}; // Number of files to read simultaneously
-            pairParserPtr.reset(new paired_parser(4 * nthread, maxReadGroup,
-                        concurrentFile,
-                        pairFileList, pairFileList+numFiles));
-
+	    uint32_t nprod = (read1Vec.size() > 1) ? 2 : 1; 
+	    pairParserPtr.reset(new paired_parser(read1Vec, read2Vec, nthread, nprod, chunkSize));
+	    pairParserPtr->start();
             spawnProcessReadsThreads(nthread, pairParserPtr.get(), rmi, iomutex,
-                                     outLog, hctrs, maxNumHits.getValue(), noout.getValue(), strictCheck, 
-                                     fuzzyIntersection, consistentHits);
-            delete [] pairFileList;
+                                     outLog, hctrs, mopts);
         } else {
-            std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(unmatedReads.getValue(), ',');
-            size_t maxReadGroup{1000}; // Number of reads in each "job"
-            size_t concurrentFile{1};
-            stream_manager streams( unmatedReadVec.begin(), unmatedReadVec.end(),
-                    concurrentFile);
-            singleParserPtr.reset(new single_parser(4 * nthread,
-                        maxReadGroup,
-                        concurrentFile,
-                        streams));
+            std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(mopts->unmatedReads, ',');
 
+
+	    uint32_t nprod = (unmatedReadVec.size() > 1) ? 2 : 1; 
+	    singleParserPtr.reset(new single_parser(unmatedReadVec, nthread, nprod, chunkSize));
+	    singleParserPtr->start();
             /** Create the threads depending on the collector type **/
             spawnProcessReadsThreads(nthread, singleParserPtr.get(), rmi, iomutex,
-                                      outLog, hctrs, maxNumHits.getValue(), noout.getValue(), 
-                                     strictCheck, consistentHits);
+                                      outLog, hctrs, mopts);
         }
-	std::cerr << "\n\n";
+	if (!mopts->quiet) { std::cerr << "\n\n"; }
 
 
     consoleLog->info("Done mapping reads.");
@@ -499,10 +511,32 @@ bool mapReads(RapMapIndexT& rmi,
 	return true;
 }
 
+void displayOpts(MappingOpts& mopts, spdlog::logger* log) {
+        fmt::MemoryWriter optWriter;
+        optWriter.write("\ncommand line options\n"
+                        "====================\n");
+        optWriter.write("index: {}\n", mopts.index);
+        if (mopts.pairedEnd) {
+            optWriter.write("read(s) 1: {}\n", mopts.read1);
+            optWriter.write("read(s) 2: {}\n", mopts.read2);
+        } else {
+            optWriter.write("unmated read(s): {}\n", mopts.unmatedReads);
+        }
+        optWriter.write("output: {}\n", mopts.outname); 
+        optWriter.write("num. threads: {}\n", mopts.numThreads); 
+        optWriter.write("max num. hits: {}\n", mopts.maxNumHits); 
+        optWriter.write("quasi-coverage: {}\n", mopts.quasiCov); 
+        optWriter.write("no output: {}\n", mopts.noOutput); 
+        optWriter.write("sensitive: {}\n", mopts.sensitive); 
+        optWriter.write("strict check: {}\n", mopts.strictCheck); 
+        optWriter.write("fuzzy intersection: {}\n", mopts.fuzzy); 
+        optWriter.write("consistent hits: {}\n", mopts.consistentHits); 
+        optWriter.write("====================");
+        log->info(optWriter.str());
+}
 
-int rapMapSAMap(int argc, char* argv[]) {
-  std::cerr << "RapMap Mapper (SA-based)\n";
 
+int rapMapSAMap(int argc, char* argv[]) {
   std::string versionString = rapmap::version;
   TCLAP::CmdLine cmd(
 		     "RapMap Mapper",
@@ -517,10 +551,13 @@ int rapMapSAMap(int argc, char* argv[]) {
   TCLAP::ValueArg<uint32_t> numThreads("t", "numThreads", "Number of threads to use", false, 1, "positive integer");
   TCLAP::ValueArg<uint32_t> maxNumHits("m", "maxNumHits", "Reads mapping to more than this many loci are discarded", false, 200, "positive integer");
   TCLAP::ValueArg<std::string> outname("o", "output", "The output file (default: stdout)", false, "", "path");
+  TCLAP::ValueArg<double> quasiCov("z", "quasiCoverage", "Require that this fraction of a read is covered by MMPs before it is considered mappable.", false, 0.0, "double in [0,1]");
   TCLAP::SwitchArg noout("n", "noOutput", "Don't write out any alignments (for speed testing purposes)", false);
+  TCLAP::SwitchArg sensitive("e", "sensitive", "Perform a more sensitive quasi-mapping by disabling NIP skipping", false);
   TCLAP::SwitchArg strict("s", "strictCheck", "Perform extra checks to try and assure that only equally \"best\" mappings for a read are reported", false);
   TCLAP::SwitchArg fuzzy("f", "fuzzyIntersection", "Find paired-end mapping locations using fuzzy intersection", false);
   TCLAP::SwitchArg consistent("c", "consistentHits", "Ensure that the hits collected are consistent (co-linear)", false);
+  TCLAP::SwitchArg quiet("q", "quiet", "Disable all console output apart from warnings and errors", false);
   cmd.add(index);
   cmd.add(noout);
 
@@ -530,16 +567,26 @@ int rapMapSAMap(int argc, char* argv[]) {
   cmd.add(outname);
   cmd.add(numThreads);
   cmd.add(maxNumHits);
+  cmd.add(quasiCov);
+  cmd.add(sensitive);
   cmd.add(strict);
   cmd.add(fuzzy);
   cmd.add(consistent);
-
-  auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+  cmd.add(quiet);
+  
+  auto rawConsoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+  auto consoleSink =
+      std::make_shared<spdlog::sinks::ansicolor_sink>(rawConsoleSink);
   auto consoleLog = spdlog::create("stderrLog", {consoleSink});
-
+  
   try {
 
     cmd.parse(argc, argv);
+    // If we're supposed to be quiet, only print out warnings and above
+    if (quiet.getValue()) {
+        consoleLog->set_level(spdlog::level::warn);
+    }
+
     bool pairedEnd = (read1.isSet() or read2.isSet());
     if (pairedEnd and (read1.isSet() != read2.isSet())) {
       consoleLog->error("You must set both the -1 and -2 arguments to align "
@@ -570,6 +617,32 @@ int rapMapSAMap(int argc, char* argv[]) {
 			"doesn't exist", indexPrefix);
       std::exit(1);
     }
+    
+    MappingOpts mopts;
+    if (pairedEnd) {
+        mopts.read1 = read1.getValue();
+        mopts.read2 = read2.getValue();
+        mopts.pairedEnd = true;
+    } else {
+        mopts.unmatedReads = unmatedReads.getValue();
+    }
+    mopts.numThreads = numThreads.getValue();
+    mopts.maxNumHits = maxNumHits.getValue();
+    mopts.outname = (outname.isSet()) ? outname.getValue() : "";
+    mopts.quasiCov = quasiCov.getValue();
+    mopts.noOutput = noout.getValue();
+    mopts.sensitive = sensitive.getValue();
+    mopts.strictCheck = strict.getValue();
+    mopts.consistentHits = consistent.getValue();
+    mopts.fuzzy = fuzzy.getValue();
+    mopts.quiet = quiet.getValue();
+
+    if (quasiCov.isSet() and !sensitive.isSet()) {
+        consoleLog->info("The --quasiCoverage option is set to {}, but the --sensitive flag was not set. The former implies the later. Enabling sensitive mode.", quasiCov.getValue());
+        mopts.sensitive = true;
+    }
+
+    displayOpts(mopts, consoleLog.get());
 
     IndexHeader h;
     std::ifstream indexStream(indexPrefix + "header.json");
@@ -594,38 +667,30 @@ int rapMapSAMap(int argc, char* argv[]) {
       //BigSAIdxPtr.reset(new RapMapSAIndex<int64_t>);
       //BigSAIdxPtr->load(indexPrefix, h.kmerLen());
       if (h.perfectHash()) {
-          RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>> rmi;
+          RapMapSAIndex<int64_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int64_t>>> rmi;
           rmi.load(indexPrefix);
-          success = mapReads(rmi, consoleLog, index, read1, read2,
-                             unmatedReads, numThreads, maxNumHits,
-                             outname, noout, strict, fuzzy, consistent);
+          success = mapReads(rmi, consoleLog, &mopts);
       } else {
           RapMapSAIndex<int64_t,
-                        google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+                        RegHashT<uint64_t, rapmap::utils::SAInterval<int64_t>,
                                                rapmap::utils::KmerKeyHasher>> rmi;
           rmi.load(indexPrefix);
-          success = mapReads(rmi, consoleLog, index, read1, read2,
-                             unmatedReads, numThreads, maxNumHits,
-                             outname, noout, strict, fuzzy, consistent);
+          success = mapReads(rmi, consoleLog, &mopts);
       }
     } else {
         //std::cerr << "Loading 32-bit suffix array index: \n";
       //SAIdxPtr.reset(new RapMapSAIndex<int32_t>);
       //SAIdxPtr->load(indexPrefix, h.kmerLen());
         if (h.perfectHash()) {
-            RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>> rmi;
+            RapMapSAIndex<int32_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int32_t>>> rmi;
             rmi.load(indexPrefix);
-            success = mapReads(rmi, consoleLog, index, read1, read2,
-                               unmatedReads, numThreads, maxNumHits,
-                               outname, noout, strict, fuzzy, consistent);
+            success = mapReads(rmi, consoleLog, &mopts);
         } else {
             RapMapSAIndex<int32_t,
-                          google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+                          RegHashT<uint64_t, rapmap::utils::SAInterval<int32_t>,
                                                  rapmap::utils::KmerKeyHasher>> rmi;
             rmi.load(indexPrefix);
-            success = mapReads(rmi, consoleLog, index, read1, read2,
-                               unmatedReads, numThreads, maxNumHits,
-                               outname, noout, strict, fuzzy, consistent);
+            success = mapReads(rmi, consoleLog, &mopts);
         }
     }
 
diff --git a/src/RapMapUtils.cpp b/src/RapMapUtils.cpp
index 7b0dee2..d0e384c 100644
--- a/src/RapMapUtils.cpp
+++ b/src/RapMapUtils.cpp
@@ -1,3 +1,24 @@
+//
+// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
+// quasi-mapping.
+// Copyright (C) 2015, 2016 Rob Patro, Avi Srivastava, Hirak Sarkar
+//
+// This file is part of RapMap.
+//
+// RapMap is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RapMap is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <cereal/types/vector.hpp>
 #include <cereal/types/unordered_map.hpp>
 #include <cereal/archives/binary.hpp>
@@ -7,8 +28,10 @@
 #include "RapMapIndex.hpp"
 #include "PairAlignmentFormatter.hpp"
 #include "SingleAlignmentFormatter.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
+//#include "jellyfish/whole_sequence_parser.hpp"
+#include "FastxParser.hpp"
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 
 namespace rapmap {
     namespace utils {
@@ -77,6 +100,38 @@ namespace rapmap {
             //std::swap(qual, qualWork);
         }
 
+        // Adapted from
+        // https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/blob/8c9933a1685e0ab50c7d8b7926c9068bc0c9d7d2/src/main.c#L36
+        // Don't modify the qual
+        void reverseRead(std::string& seq,
+                std::string& readWork) {
+
+            readWork.resize(seq.length(), 'A');
+            int32_t end = seq.length()-1, start = 0;
+            //readWork[end] = '\0';
+            //qualWork[end] = '\0';
+            while (LIKELY(start < end)) {
+                readWork[start] = (char)rc_table[(int8_t)seq[end]];
+                readWork[end] = (char)rc_table[(int8_t)seq[start]];
+                ++ start;
+                -- end;
+            }
+            // If odd # of bases, we still have to complement the middle
+            if (start == end) {
+                readWork[start] = (char)rc_table[(int8_t)seq[start]];
+                // but don't need to mess with quality
+                // qualWork[start] = qual[start];
+            }
+            //std::swap(seq, readWork);
+            //std::swap(qual, qualWork);
+        }
+
+        std::string reverseComplement(std::string& seq) {
+            std::string work;
+            reverseRead(seq, work);
+            return work;
+        }
+
         template <typename ReadT, typename IndexT>
         uint32_t writeAlignmentsToStream(
                 ReadT& r,
@@ -90,12 +145,12 @@ namespace rapmap {
                 auto& txpLens = formatter.index->txpLens;
 
                 auto& readTemp = formatter.readTemp;
-                auto& qualTemp = formatter.qualTemp;
+                //auto& qualTemp = formatter.qualTemp;
                 auto& cigarStr = formatter.cigarStr;
 
                 uint16_t flags;
 
-                auto& readName = r.header;
+                auto& readName = r.name;
 #if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
                 auto before = readName.find_first_of(':');
                 before = readName.find_first_of(':', before+1);
@@ -122,16 +177,15 @@ namespace rapmap {
                     }
 
                     std::string* readSeq = &(r.seq);
-                    std::string* qstr = &(r.qual);
+                    //std::string* qstr = &(r.qual);
 
                     if (!qa.fwd) {
                         if (!haveRev) {
-                            rapmap::utils::reverseRead(*readSeq, *qstr,
-                                                       readTemp, qualTemp);
+                            rapmap::utils::reverseRead(*readSeq, readTemp);
                             haveRev = true;
                         }
                         readSeq = &(readTemp);
-                        qstr = &(qualTemp);
+                        //qstr = &(qualTemp);
                     }
 
                    rapmap::utils::adjustOverhang(qa.pos, qa.readLen, txpLens[qa.tid], cigarStr);
@@ -146,7 +200,7 @@ namespace rapmap {
                         << 0 << '\t' // MATE POS
                         << qa.fragLen << '\t' // TLEN
                         << *readSeq << '\t' // SEQ
-                        << *qstr << '\t' // QSTR
+                        << "*\t" // QSTR
                         << numHitFlag << '\n';
                     ++alnCtr;
                     // === SAM
@@ -172,14 +226,14 @@ namespace rapmap {
 
                 auto& read1Temp = formatter.read1Temp;
                 auto& read2Temp = formatter.read2Temp;
-                auto& qual1Temp = formatter.qual1Temp;
-                auto& qual2Temp = formatter.qual2Temp;
+                //auto& qual1Temp = formatter.qual1Temp;
+                //auto& qual2Temp = formatter.qual2Temp;
                 auto& cigarStr1 = formatter.cigarStr1;
                 auto& cigarStr2 = formatter.cigarStr2;
 
                 uint16_t flags1, flags2;
 
-                auto& readName = r.first.header;
+                auto& readName = r.first.name;
                 // If the read name contains multiple space-separated parts,
                 // print only the first
                 size_t splitPos = readName.find(' ');
@@ -194,7 +248,7 @@ namespace rapmap {
                     readName[splitPos - 2] = '\0';
                 }
 
-                auto& mateName = r.second.header;
+                auto& mateName = r.second.name;
                 // If the read name contains multiple space-separated parts,
                 // print only the first
                 splitPos = mateName.find(' ');
@@ -238,7 +292,7 @@ namespace rapmap {
                         rapmap::utils::getSamFlags(qa, true, flags1, flags2);
                         if (alnCtr != 0) {
                             flags1 |= 0x100; flags2 |= 0x100;
-                        }
+                        } 
 
                         auto txpLen = txpLens[qa.tid];
                         rapmap::utils::adjustOverhang(qa, txpLens[qa.tid], cigarStr1, cigarStr2);
@@ -246,27 +300,25 @@ namespace rapmap {
                         // Reverse complement the read and reverse
                         // the quality string if we need to
                         std::string* readSeq1 = &(r.first.seq);
-                        std::string* qstr1 = &(r.first.qual);
+                        //std::string* qstr1 = &(r.first.qual);
                         if (!qa.fwd) {
                             if (!haveRev1) {
-                                rapmap::utils::reverseRead(*readSeq1, *qstr1,
-                                        read1Temp, qual1Temp);
+                                rapmap::utils::reverseRead(*readSeq1, read1Temp);
                                 haveRev1 = true;
                             }
                             readSeq1 = &(read1Temp);
-                            qstr1 = &(qual1Temp);
+                            //qstr1 = &(qual1Temp);
                         }
 
                         std::string* readSeq2 = &(r.second.seq);
-                        std::string* qstr2 = &(r.second.qual);
+                        //std::string* qstr2 = &(r.second.qual);
                         if (!qa.mateIsFwd) {
                             if (!haveRev2) {
-                                rapmap::utils::reverseRead(*readSeq2, *qstr2,
-                                        read2Temp, qual2Temp);
+                                rapmap::utils::reverseRead(*readSeq2, read2Temp);
                                 haveRev2 = true;
                             }
                             readSeq2 = &(read2Temp);
-                            qstr2 = &(qual2Temp);
+                            //qstr2 = &(qual2Temp);
                         }
 
                         // If the fragment overhangs the right end of the transcript
@@ -291,7 +343,7 @@ namespace rapmap {
                                 << qa.matePos + 1 << '\t' // PNEXT
                                 << ((read1First) ? fragLen : -fragLen) << '\t' // TLEN
                                 << *readSeq1 << '\t' // SEQ
-                                << *qstr1 << '\t' // QUAL
+                                << "*\t" // QUAL
                                 << numHitFlag << '\n';
 
                         sstream << mateName.c_str() << '\t' // QNAME
@@ -304,13 +356,14 @@ namespace rapmap {
                                 << qa.pos + 1 << '\t' // PNEXT
                                 << ((read1First) ? -fragLen : fragLen) << '\t' // TLEN
                                 << *readSeq2 << '\t' // SEQ
-                                << *qstr2 << '\t' // QUAL
+                                << "*\t" // QUAL
                                 << numHitFlag << '\n';
                     } else {
                         rapmap::utils::getSamFlags(qa, true, flags1, flags2);
                         if (alnCtr != 0) {
                             flags1 |= 0x100; flags2 |= 0x100;
                         }
+
 			/*
 			else {
                             // If this is the first alignment for this read
@@ -328,12 +381,12 @@ namespace rapmap {
                         std::string* unalignedSeq{nullptr};
 
                         uint32_t flags, unalignedFlags;
-                        std::string* qstr{nullptr};
-                        std::string* unalignedQstr{nullptr};
+                        //std::string* qstr{nullptr};
+                        //std::string* unalignedQstr{nullptr};
                         std::string* alignedName{nullptr};
                         std::string* unalignedName{nullptr};
                         std::string* readTemp{nullptr};
-                        std::string* qualTemp{nullptr};
+                        //std::string* qualTemp{nullptr};
 
                         rapmap::utils::FixedWriter* cigarStr;
                         if (qa.mateStatus == MateStatus::PAIRED_END_LEFT) { // left read
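
The branch above dispatches on which mate of the pair actually mapped. Only MateStatus::PAIRED_END_LEFT is visible in this hunk; the sketch below shows the general shape such a mate-status enum takes, and every enumerator name other than PAIRED_END_LEFT is an assumption, not something taken from this commit:

    #include <cstdint>

    // Shape of a mate-status enum; only PAIRED_END_LEFT appears in the diff,
    // the other enumerator names here are assumptions.
    enum class MateStatus : std::uint8_t {
        SINGLE_END,
        PAIRED_END_LEFT,
        PAIRED_END_RIGHT,
        PAIRED_END_PAIRED
    };
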
@@ -343,8 +396,8 @@ namespace rapmap {
                             readSeq = &(r.first.seq);
                             unalignedSeq = &(r.second.seq);
 
-                            qstr = &(r.first.qual);
-                            unalignedQstr = &(r.second.qual);
+                            //qstr = &(r.first.qual);
+                            //unalignedQstr = &(r.second.qual);
 
                             flags = flags1;
                             unalignedFlags = flags2;
@@ -353,7 +406,7 @@ namespace rapmap {
 
                             haveRev = &haveRev1;
                             readTemp = &read1Temp;
-                            qualTemp = &qual1Temp;
+                            //qualTemp = &qual1Temp;
                         } else { // right read
                             alignedName = &mateName;
                             unalignedName = &readName;
@@ -361,8 +414,8 @@ namespace rapmap {
                             readSeq = &(r.second.seq);
                             unalignedSeq = &(r.first.seq);
 
-                            qstr = &(r.second.qual);
-                            unalignedQstr = &(r.first.qual);
+                            //qstr = &(r.second.qual);
+                            //unalignedQstr = &(r.first.qual);
 
                             flags = flags2;
                             unalignedFlags = flags1;
@@ -370,19 +423,17 @@ namespace rapmap {
                             cigarStr = &cigarStr2;
                             haveRev = &haveRev2;
                             readTemp = &read2Temp;
-                            qualTemp = &qual2Temp;
+                            //qualTemp = &qual2Temp;
                         }
 
                         // Reverse complement the read and reverse
                         // the quality string if we need to
                         if (!qa.fwd) {
                             if (!(*haveRev)) {
-                                rapmap::utils::reverseRead(*readSeq, *qstr,
-                                        *readTemp, *qualTemp);
+                                rapmap::utils::reverseRead(*readSeq, *readTemp);
                                 *haveRev = true;
                             }
                             readSeq = readTemp;
-                            qstr = qualTemp;
                         }
 
                         /*
@@ -403,7 +454,7 @@ namespace rapmap {
                                 << qa.pos+1 << '\t' // PNEXT (only 1 read in template)
                                 << 0 << '\t' // TLEN (spec says 0, not read len)
                                 << *readSeq << '\t' // SEQ
-                                << *qstr << '\t' // QUAL
+                                << "*\t" // QUAL
                                 << numHitFlag << '\n';
 
 
@@ -413,12 +464,12 @@ namespace rapmap {
                             << transcriptName << '\t' // RNAME (same as mate)
                             << qa.pos + 1 << '\t' // POS (same as mate)
                             << 0 << '\t' // MAPQ
-                            << unalignedSeq->length() << 'S' << '\t' // CIGAR
+                            << "*\t" // CIGAR
                             << '=' << '\t' // RNEXT
                             << qa.pos + 1 << '\t' // PNEXT (only 1 read in template)
                             << 0 << '\t' // TLEN (spec says 0, not read len)
                             << *unalignedSeq << '\t' // SEQ
-                            << *unalignedQstr << '\t' // QUAL
+                            << "*\t" // QUAL
                             << numHitFlag << '\n';
                     }
                     ++alnCtr;
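
For the unaligned mate, the record now carries '*' in both the CIGAR and QUAL columns, the SAM placeholder for "not available", where the old code emitted a full-length soft clip and the quality string. A small illustration of the resulting column layout, with made-up field values and an assumed header path for the bundled fmt library that provides fmt::MemoryWriter:

    #include "format.h"  // assumed path for the bundled fmt; provides fmt::MemoryWriter

    // Made-up values, showing only the column layout and the '*' placeholders
    // used for the unaligned mate.
    void writeUnalignedMateExample(fmt::MemoryWriter& sstream) {
        sstream << "read_1" << '\t'        // QNAME
                << 133 << '\t'             // FLAG (example value only)
                << "transcript_A" << '\t'  // RNAME (same as the mapped mate)
                << 101 << '\t'             // POS  (same as the mapped mate)
                << 0 << '\t'               // MAPQ
                << "*\t"                   // CIGAR: not available
                << "=\t"                   // RNEXT
                << 101 << '\t'             // PNEXT
                << 0 << '\t'               // TLEN
                << "ACGTACGTACGT" << '\t'  // SEQ
                << "*\t"                   // QUAL: not available
                << "NH:i:1" << '\n';
    }
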
@@ -471,41 +522,42 @@ namespace rapmap {
     }
 }
 
-using SAIndex32BitDense = RapMapSAIndex<int32_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+
+using SAIndex32BitDense = RapMapSAIndex<int32_t, RegHashT<uint64_t, rapmap::utils::SAInterval<int32_t>,
 								       rapmap::utils::KmerKeyHasher>>;
-using SAIndex64BitDense = RapMapSAIndex<int64_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+using SAIndex64BitDense = RapMapSAIndex<int64_t, RegHashT<uint64_t, rapmap::utils::SAInterval<int64_t>,
 								       rapmap::utils::KmerKeyHasher>>;
-using SAIndex32BitPerfect = RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
-using SAIndex64BitPerfect = RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+using SAIndex32BitPerfect = RapMapSAIndex<int32_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+using SAIndex64BitPerfect = RapMapSAIndex<int64_t, PerfectHashT<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
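
The google::dense_hash_map and BooMap spellings give way to the RegHashT and PerfectHashT aliases, which are defined in a header not shown in this hunk. A sketch of the kind of aliases involved, assuming the sparsepp and FrugalBooMap headers added in this release; the real definitions may differ:

    #include "FrugalBooMap.hpp"   // perfect-hash-backed map added in this release
    #include "sparsepp.h"         // provides spp::sparse_hash_map

    // Assumed definitions -- the real aliases are declared elsewhere in the
    // tree and may differ from this sketch.
    template <typename KeyT, typename ValueT, typename HasherT>
    using RegHashT = spp::sparse_hash_map<KeyT, ValueT, HasherT>;

    template <typename KeyT, typename ValueT>
    using PerfectHashT = FrugalBooMap<KeyT, ValueT>;
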
 
 // Explicit instantiations
 // pair parser, 32-bit, dense hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex32BitDense*>(
-                std::pair<header_sequence_qual, header_sequence_qual>& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadPair, SAIndex32BitDense*>(
+                fastx_parser::ReadPair& r,
                 PairAlignmentFormatter<SAIndex32BitDense*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
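
The explicit instantiations now use the fastx_parser::ReadPair and fastx_parser::ReadSeq record types from the new FastxParser in place of the jellyfish parser types. Judging only from the accesses visible in this file (r.first.seq, r.second.seq, and the fact that the new code no longer reads a .qual field), they look roughly like the sketch below; the real definitions in include/FastxParser.hpp may carry additional fields:

    #include <string>

    // Rough shape inferred from usage in this diff; not copied from
    // include/FastxParser.hpp.
    namespace fastx_parser {
      struct ReadSeq {
        std::string seq;   // read sequence
        std::string name;  // read name
      };
      struct ReadPair {
        ReadSeq first;     // left mate
        ReadSeq second;    // right mate
      };
    }
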
 
 // pair parser, 64-bit, dense hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex64BitDense*>(
-                std::pair<header_sequence_qual, header_sequence_qual>& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadPair, SAIndex64BitDense*>(
+                fastx_parser::ReadPair& r,
                 PairAlignmentFormatter<SAIndex64BitDense*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 // pair parser, 32-bit, perfect hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex32BitPerfect*>(
-                std::pair<header_sequence_qual, header_sequence_qual>& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadPair, SAIndex32BitPerfect*>(
+                fastx_parser::ReadPair& r,
                 PairAlignmentFormatter<SAIndex32BitPerfect*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 // pair parser, 64-bit, perfect hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex64BitPerfect*>(
-                std::pair<header_sequence_qual, header_sequence_qual>& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadPair, SAIndex64BitPerfect*>(
+                fastx_parser::ReadPair& r,
                 PairAlignmentFormatter<SAIndex64BitPerfect*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
@@ -513,48 +565,48 @@ template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequen
 
 
 // single parser, 32-bit, dense hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex32BitDense*>(
-		jellyfish::header_sequence_qual& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadSeq, SAIndex32BitDense*>(
+		fastx_parser::ReadSeq& r,
                 SingleAlignmentFormatter<SAIndex32BitDense*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 // single parser, 64-bit, dense hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex64BitDense*>(
-		jellyfish::header_sequence_qual& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadSeq, SAIndex64BitDense*>(
+		fastx_parser::ReadSeq& r,
                 SingleAlignmentFormatter<SAIndex64BitDense*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 // single parser, 32-bit, perfect hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex32BitPerfect*>(
- 		jellyfish::header_sequence_qual& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadSeq, SAIndex32BitPerfect*>(
+ 		fastx_parser::ReadSeq& r,
                 SingleAlignmentFormatter<SAIndex32BitPerfect*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 // single parser, 64-bit, perfect hash
-template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex64BitPerfect*>(
-		jellyfish::header_sequence_qual& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadSeq, SAIndex64BitPerfect*>(
+		fastx_parser::ReadSeq& r,
                 SingleAlignmentFormatter<SAIndex64BitPerfect*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream);
 
 
-template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, RapMapIndex*>(
-                std::pair<header_sequence_qual, header_sequence_qual>& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadPair, RapMapIndex*>(
+                fastx_parser::ReadPair& r,
                 PairAlignmentFormatter<RapMapIndex*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,
                 fmt::MemoryWriter& sstream
                 );
 
-template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, RapMapIndex*>(
-                jellyfish::header_sequence_qual& r,
+template uint32_t rapmap::utils::writeAlignmentsToStream<fastx_parser::ReadSeq, RapMapIndex*>(
+                fastx_parser::ReadSeq& r,
                 SingleAlignmentFormatter<RapMapIndex*>& formatter,
                 rapmap::utils::HitCounters& hctr,
                 std::vector<rapmap::utils::QuasiAlignment>& jointHits,

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rapmap.git


