[med-svn] [asmlib] 01/02: Imported Upstream version 0.1

Jorge Soares jssoares-guest at moszumanska.debian.org
Tue Nov 4 10:54:09 UTC 2014


This is an automated email from the git hooks/post-receive script.

jssoares-guest pushed a commit to branch master
in repository asmlib.

commit b012ac3d8f1b55857bf74364b00d79e57261ea84
Author: Jorge Soares <j.s.soares at gmail.com>
Date:   Tue Nov 4 11:21:52 2014 +0100

    Imported Upstream version 0.1
---
 LICENSE                           |  675 +++++++++++++++++
 README.md                         |    5 +
 asmlibSrc/MakeAsmlib.bat          |   22 +
 asmlibSrc/asmlib.make             |  252 +++++++
 asmlibSrc/cachesize32.asm         |  335 +++++++++
 asmlibSrc/cachesize64.asm         |  333 +++++++++
 asmlibSrc/cpuid32.asm             |   38 +
 asmlibSrc/cpuid64.asm             |   53 ++
 asmlibSrc/cputype32.asm           |  139 ++++
 asmlibSrc/cputype64.asm           |  125 ++++
 asmlibSrc/debugbreak32.asm        |   31 +
 asmlibSrc/debugbreak64.asm        |   31 +
 asmlibSrc/dispatchpatch32.asm     |  311 ++++++++
 asmlibSrc/dispatchpatch64.asm     |  328 +++++++++
 asmlibSrc/divfixedi32.asm         |  152 ++++
 asmlibSrc/divfixedi64.asm         |  171 +++++
 asmlibSrc/divfixedv32.asm         |  490 +++++++++++++
 asmlibSrc/divfixedv64.asm         |  496 +++++++++++++
 asmlibSrc/instrset32.asm          |  244 +++++++
 asmlibSrc/instrset64.asm          |  173 +++++
 asmlibSrc/libad32.asm             |   14 +
 asmlibSrc/libad32.def             |   44 ++
 asmlibSrc/libad64.asm             |   13 +
 asmlibSrc/libad64.def             |   42 ++
 asmlibSrc/memcmp32.asm            |  366 ++++++++++
 asmlibSrc/memcmp64.asm            |  293 ++++++++
 asmlibSrc/memcpy32.asm            | 1460 +++++++++++++++++++++++++++++++++++++
 asmlibSrc/memcpy64.asm            | 1313 +++++++++++++++++++++++++++++++++
 asmlibSrc/memmove32.asm           | 1238 +++++++++++++++++++++++++++++++
 asmlibSrc/memmove64.asm           | 1073 +++++++++++++++++++++++++++
 asmlibSrc/memset32.asm            |  487 +++++++++++++
 asmlibSrc/memset64.asm            |  368 ++++++++++
 asmlibSrc/mersenne32.asm          |  821 +++++++++++++++++++++
 asmlibSrc/mersenne64.asm          |  614 ++++++++++++++++
 asmlibSrc/mother32.asm            |  370 ++++++++++
 asmlibSrc/mother64.asm            |  250 +++++++
 asmlibSrc/physseed32.asm          |  334 +++++++++
 asmlibSrc/physseed64.asm          |  394 ++++++++++
 asmlibSrc/popcount32.asm          |  137 ++++
 asmlibSrc/popcount64.asm          |  110 +++
 asmlibSrc/procname32.asm          |  186 +++++
 asmlibSrc/procname64.asm          |  143 ++++
 asmlibSrc/randomah.asi            |  290 ++++++++
 asmlibSrc/rdtsc32.asm             |   51 ++
 asmlibSrc/rdtsc64.asm             |   51 ++
 asmlibSrc/round32.asm             |   41 ++
 asmlibSrc/round64.asm             |   38 +
 asmlibSrc/sfmt32.asm              | 1265 ++++++++++++++++++++++++++++++++
 asmlibSrc/sfmt64.asm              |  908 +++++++++++++++++++++++
 asmlibSrc/strcat32.asm            |   60 ++
 asmlibSrc/strcat64.asm            |   68 ++
 asmlibSrc/strcmp32.asm            |  177 +++++
 asmlibSrc/strcmp64.asm            |  162 ++++
 asmlibSrc/strcountset32.asm       |  194 +++++
 asmlibSrc/strcountset64.asm       |  175 +++++
 asmlibSrc/strcountutf832.asm      |  162 ++++
 asmlibSrc/strcountutf864.asm      |  127 ++++
 asmlibSrc/strcpy32.asm            |   53 ++
 asmlibSrc/strcpy64.asm            |   64 ++
 asmlibSrc/stricmp32.asm           |   70 ++
 asmlibSrc/stricmp64.asm           |   84 +++
 asmlibSrc/strlen32.asm            |  182 +++++
 asmlibSrc/strlen64.asm            |   84 +++
 asmlibSrc/strspn32.asm            |  338 +++++++++
 asmlibSrc/strspn64.asm            |  304 ++++++++
 asmlibSrc/strstr32.asm            |  251 +++++++
 asmlibSrc/strstr64.asm            |  218 ++++++
 asmlibSrc/strtouplow32.asm        |  285 ++++++++
 asmlibSrc/strtouplow64.asm        |  213 ++++++
 asmlibSrc/substring32.asm         |   61 ++
 asmlibSrc/substring64.asm         |   73 ++
 asmlibSrc/testalib.cpp            |  151 ++++
 asmlibSrc/testmem.cpp             |  396 ++++++++++
 asmlibSrc/testrandom.cpp          |  130 ++++
 asmlibSrc/unalignedisfaster32.asm |  178 +++++
 asmlibSrc/unalignedisfaster64.asm |  186 +++++
 76 files changed, 21564 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6b156fe
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,675 @@
+GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    {one line to give the program's name and a brief idea of what it does.}
+    Copyright (C) {year}  {name of author}
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    {project}  Copyright (C) {year}  {fullname}
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a90beb8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+asmlib
+======
+
+This is a library of optimized subroutines coded in assembly language. The functions in this library can be called from C, C++ and other compiled high-level languages. It supports many different compilers under the Windows, Linux, BSD and Mac OS X operating systems, in both 32-bit and 64-bit modes. The library contains faster versions of common C/C++ memory and string functions, fast functions for string search and string parsing, fast integer division and integer vector division, as well as several useful fun [...]
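
For reference, the calling interface this README describes looks like the
following minimal C++ sketch. The A_memcpy and A_strlen prototypes are
assumptions about the asmlib.h header (which ships with the binary package,
not with this source diff); DataCacheSize is the routine implemented in
cachesize32.asm/cachesize64.asm further down in this commit:

    // Hypothetical test program; link against the matching library,
    // e.g. lib/libaelf64.a on 64-bit Linux.
    #include <cstdio>
    #include <cstddef>

    extern "C" {
        // Assumed prototypes, normally provided by asmlib.h:
        void * A_memcpy(void * dest, const void * src, size_t count);
        size_t A_strlen(const char * str);
        // Implemented in cachesize32.asm / cachesize64.asm in this commit:
        size_t DataCacheSize(int level);   // level 1-4, or 0 for largest
    }

    int main() {
        char buf[16];
        A_memcpy(buf, "asmlib", 7);                  // copy incl. terminating zero
        std::printf("strlen: %zu\n", A_strlen(buf)); // prints 6
        std::printf("L1 data cache: %zu bytes\n", DataCacheSize(1));
        return 0;
    }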
+
diff --git a/asmlibSrc/MakeAsmlib.bat b/asmlibSrc/MakeAsmlib.bat
new file mode 100755
index 0000000..95ca00f
--- /dev/null
+++ b/asmlibSrc/MakeAsmlib.bat
@@ -0,0 +1,22 @@
+rem           MakeAsmlib.bat                   2011-07-01 Agner Fog
+
+rem  Make function library from assembly source with multiple
+rem  versions for different operating systems using objconv.
+
+
+rem  Set path to assembler and objconv:
+rem  You need to modify this path to fit your installation
+rem set path=C:\Program Files\Microsoft Visual Studio 9.0\VC\bin;C:\Program Files\Microsoft Visual Studio 9.0\Common7\IDE;C:\Program Files\Microsoft Visual Studio 9.0\VC\bin\x86_amd64;E:\Program Files\Microsoft SDKs\Windows\v6.1\Bin\x64;%path%
+
+rem  Path to nmake:
+set mspath=C:\Program Files (x86)\Microsoft Visual Studio 11.0
+
+set path=%mspath%\VC\bin;%mspath%\Common7\IDE;%mspath%\VC\bin\amd64;%path%
+
+
+rem  Make everything according to makefile asmlib.make
+nmake /Fasmlib.make
+
+wzzip asmlibbak.zip asmlib.zip asmlib-instructions.doc *.cpp
+
+pause
\ No newline at end of file
diff --git a/asmlibSrc/asmlib.make b/asmlibSrc/asmlib.make
new file mode 100755
index 0000000..a083f8c
--- /dev/null
+++ b/asmlibSrc/asmlib.make
@@ -0,0 +1,252 @@
+#                       ASMLIB.MAKE                        2013-09-11 Agner Fog
+
+# Makefile for ASMLIB function library, YASM version 
+# See asmlib-instructions.doc for a description
+
+# The following tools are required for building this library package:
+# Microsoft nmake or other make utility
+# Microsoft link
+# YASM assembler yasm.exe
+# (Works with NASM assembler as well, except for position-independent versions)
+# Object file converter objconv.exe (www.agner.org/optimize)
+# Winzip command line version (www.winzip.com) or other zip utility
+
+libpath64="C:\Program Files\Microsoft Visual Studio 9.0\VC\lib\amd64"
+
+# Main target is zip file
+# Using wzzip, which is the command line version of Winzip (www.winzip.com)
+asmlib.zip: lib/libacof32.lib lib/libacof32o.lib lib/libacof64.lib lib/libacof64o.lib \
+lib/libaomf32.lib lib/libaomf32o.lib \
+lib/libaelf32.a lib/libaelf32o.a lib/libaelf32p.a lib/libaelf32op.a \
+lib/libaelf64.a lib/libaelf64o.a \
+lib/libamac32.a lib/libamac32o.a lib/libamac32p.a lib/libamac32op.a \
+lib/libamac64.a lib/libamac64o.a \
+lib/libad32.dll lib/libad32.lib lib/libad64.dll lib/libad64.lib \
+asmlib.h asmlibran.h asmlib-instructions.pdf license.txt \
+asmlibSrc.zip inteldispatchpatch.zip
+  wzzip $@ $?
+  
+# Make zip archive of source code  
+asmlibSrc.zip: makeasmlib.bat asmlib.make \
+asm/instrset32.asm asm/instrset64.asm asm/procname32.asm asm/procname64.asm \
+asm/rdtsc32.asm asm/rdtsc64.asm asm/round32.asm asm/round64.asm \
+asm/libad32.asm asm/libad32.def asm/libad64.asm asm/libad64.def \
+asm/memcpy32.asm asm/memmove32.asm asm/memcpy64.asm asm/memmove64.asm \
+asm/memset32.asm asm/memset64.asm asm/memcmp32.asm asm/memcmp64.asm \
+asm/strlen32.asm asm/strlen64.asm \
+asm/strcpy32.asm asm/strcpy64.asm asm/strcat32.asm asm/strcat64.asm \
+asm/strcmp32.asm asm/strcmp64.asm asm/stricmp32.asm asm/stricmp64.asm \
+asm/strtouplow32.asm asm/strtouplow64.asm asm/strstr32.asm asm/strstr64.asm \
+asm/substring32.asm asm/substring64.asm asm/strspn32.asm asm/strspn64.asm \
+asm/strcountutf832.asm asm/strcountutf864.asm \
+asm/strcountset32.asm asm/strcountset64.asm \
+asm/divfixedi32.asm asm/divfixedi64.asm \
+asm/divfixedv32.asm asm/divfixedv64.asm \
+asm/popcount32.asm asm/popcount64.asm \
+asm/cpuid32.asm asm/cpuid64.asm asm/cputype32.asm asm/cputype64.asm \
+asm/physseed32.asm asm/physseed64.asm \
+asm/mother32.asm asm/mother64.asm asm/mersenne32.asm asm/mersenne64.asm \
+asm/randomah.asi asm/sfmt32.asm asm/sfmt64.asm \
+asm/debugbreak32.asm asm/debugbreak64.asm \
+asm/unalignedisfaster32.asm asm/unalignedisfaster64.asm \
+asm/cachesize32.asm asm/cachesize64.asm \
+asm/dispatchpatch32.asm asm/dispatchpatch64.asm \
+testalib.cpp testrandom.cpp testmem.cpp
+  wzzip $@ $?
+  
+# Make zip archive of inteldispatchpatch
+inteldispatchpatch.zip: patch/dispatchpatch.txt \
+patch/dispatchpatch32.obj patch/dispatchpatch32.o patch/dispatchpatch32.mac.o \
+patch/dispatchpatch64.obj patch/dispatchpatch64.o patch/dispatchpatch64.mac.o \
+patch/intel_cpu_feature_patch.c patch/intel_mkl_feature_patch.c
+  wzzip $@ $?
+
+
+# Build each library version:
+
+# 32 bit Windows/COFF library
+lib/libacof32.lib: obj/instrset32.obj32 obj/procname32.obj32 \
+obj/cpuid32.obj32 obj/rdtsc32.obj32 obj/round32.obj32 \
+obj/memcpy32.obj32 obj/memmove32.obj32 obj/memset32.obj32 obj/memcmp32.obj32 \
+obj/strlen32.obj32 obj/strcpy32.obj32 obj/strcat32.obj32 \
+obj/strstr32.obj32 obj/strcmp32.obj32 obj/stricmp32.obj32 \
+obj/strtouplow32.obj32 obj/substring32.obj32 obj/strspn32.obj32 \
+obj/strcountutf832.obj32 obj/strcountset32.obj32 \
+obj/divfixedi32.obj32 obj/divfixedv32.obj32 obj/popcount32.obj32 \
+obj/physseed32.obj32 obj/mother32.obj32 obj/mersenne32.obj32 \
+obj/sfmt32.obj32 \
+obj/cputype32.obj32 obj/debugbreak32.obj32 obj/unalignedisfaster32.obj32 \
+obj/cachesize32.obj32
+  objconv -fcof32 -wex -lib $@ $?
+
+# 32 bit ELF library, position dependent
+lib/libaelf32.a: obj/instrset32.o32 obj/procname32.o32 \
+obj/cpuid32.o32 obj/rdtsc32.o32 obj/round32.o32 \
+obj/memcpy32.o32 obj/memmove32.o32 obj/memset32.o32 obj/memcmp32.o32 \
+obj/strlen32.o32 obj/strcpy32.o32 obj/strcat32.o32 \
+obj/strstr32.o32 obj/strcmp32.o32 obj/stricmp32.o32 \
+obj/strtouplow32.o32 obj/substring32.o32 obj/strspn32.o32 \
+obj/strcountutf832.o32 obj/strcountset32.o32 \
+obj/divfixedi32.o32 obj/divfixedv32.o32 obj/popcount32.o32 \
+obj/physseed32.o32 obj/mother32.o32 obj/mersenne32.o32 \
+obj/sfmt32.o32 \
+obj/cputype32.o32 obj/debugbreak32.o32 obj/unalignedisfaster32.o32 \
+obj/cachesize32.o32
+  objconv -felf32 -nu -wex -lib $@ $?
+
+# 32 bit ELF library, position independent
+lib/libaelf32p.a: obj/instrset32.o32pic obj/procname32.o32pic \
+obj/cpuid32.o32pic obj/rdtsc32.o32pic obj/round32.o32pic \
+obj/memcpy32.o32pic obj/memmove32.o32pic obj/memset32.o32pic obj/memcmp32.o32pic \
+obj/strlen32.o32pic obj/strcpy32.o32pic obj/strcat32.o32pic \
+obj/strstr32.o32pic obj/strcmp32.o32pic obj/stricmp32.o32pic \
+obj/strtouplow32.o32pic obj/substring32.o32pic obj/strspn32.o32pic \
+obj/strcountutf832.o32pic obj/strcountset32.o32pic \
+obj/divfixedi32.o32pic obj/divfixedv32.o32pic obj/popcount32.o32pic \
+obj/physseed32.o32pic obj/mother32.o32pic obj/mersenne32.o32pic \
+obj/sfmt32.o32pic \
+obj/cputype32.o32pic obj/debugbreak32.o32 obj/unalignedisfaster32.o32 \
+obj/cachesize32.o32pic
+  objconv -felf32 -nu -wex -lib $@ $?
+
+# 64 bit COFF library Windows
+lib/libacof64.lib: obj/instrset64.obj64 obj/procname64.obj64 \
+obj/cpuid64.obj64 obj/rdtsc64.obj64 obj/round64.obj64 \
+obj/memcpy64.obj64 obj/memmove64.obj64 obj/memset64.obj64 obj/memcmp64.obj64 \
+obj/strlen64.obj64 obj/strcpy64.obj64 obj/strcat64.obj64 \
+obj/strstr64.obj64 obj/strcmp64.obj64 obj/stricmp64.obj64 \
+obj/strtouplow64.obj64 obj/substring64.obj64 obj/strspn64.obj64 \
+obj/strcountutf864.obj64 obj/strcountset64.obj64 \
+obj/divfixedi64.obj64 obj/divfixedv64.obj64 obj/popcount64.obj64 \
+obj/physseed64.obj64 obj/mother64.obj64 obj/mersenne64.obj64 \
+obj/sfmt64.obj64 \
+obj/cputype64.obj64 obj/debugbreak64.obj64 obj/unalignedisfaster64.obj64 \
+obj/cachesize64.obj64
+  objconv -fcof64 -wex -lib $@ $?
+
+# 64 bit ELF library Unix
+lib/libaelf64.a: obj/instrset64.o64 obj/procname64.o64 \
+obj/cpuid64.o64 obj/rdtsc64.o64 obj/round64.o64 \
+obj/memcpy64.o64 obj/memmove64.o64 obj/memset64.o64 obj/memcmp64.o64 \
+obj/strlen64.o64 obj/strcpy64.o64 obj/strcat64.o64 \
+obj/strstr64.o64 obj/strcmp64.o64 obj/stricmp64.o64 \
+obj/strtouplow64.o64 obj/substring64.o64 obj/strspn64.o64 \
+obj/strcountutf864.o64 obj/strcountset64.o64 \
+obj/divfixedi64.o64 obj/divfixedv64.o64 obj/popcount64.o64 \
+obj/physseed64.o64 obj/mother64.o64 obj/mersenne64.o64 \
+obj/sfmt64.o64 \
+obj/cputype64.o64 obj/debugbreak64.o64 obj/unalignedisfaster64.o64 \
+obj/cachesize64.o64
+  objconv -felf64 -nu -wex -wd1029 -lib $@ $?
+
+# Convert these libraries to other versions:
+  
+# 32 bit COFF library, override version
+lib/libacof32o.lib: lib/libacof32.lib
+  objconv -fcof32 -np:?OVR_:_ -wex $** $@
+
+# 32 bit OMF library
+lib/libaomf32.lib: lib/libacof32.lib  
+  objconv -fomf32 -nu -wex $** $@
+  
+# 32 bit OMF library, override version
+lib/libaomf32o.lib: lib/libacof32o.lib  
+  objconv -fomf32 -nu -wex $** $@
+  
+# 32 bit ELF library, override, position dependent
+lib/libaelf32o.a: lib/libaelf32.a
+  objconv -felf32 -np:?OVR_: -wex $** $@
+
+# 32 bit ELF library, override, position independent
+lib/libaelf32op.a: lib/libaelf32p.a
+  objconv -felf32 -np:?OVR_: -wex $** $@
+
+# 32 bit Mach-O library, position dependent
+lib/libamac32.a: lib/libaelf32.a
+  objconv -fmac32 -nu -wex -wd1050 $** $@
+  
+# 32 bit Mach-O library, position independent
+lib/libamac32p.a: lib/libaelf32p.a
+  objconv -fmac32 -nu -wex -wd1050 $** $@
+  
+# 32 bit Mach-O library, override
+lib/libamac32o.a: lib/libaelf32o.a
+  objconv -fmac32 -nu -wex -wd1050 $** $@
+  
+# 32 bit Mach-O library, override, position independent
+lib/libamac32op.a: lib/libaelf32op.a
+  objconv -fmac32 -nu -wex -wd1050 $** $@  
+
+# Make 64 bit COFF library, override
+lib/libacof64o.lib: lib/libacof64.lib
+  objconv -fcof64 -np:?OVR_: -wex $** $@
+
+# 64 bit ELF library, override
+lib/libaelf64o.a: lib/libaelf64.a
+  objconv -felf64 -np:?OVR_: -wex -wd1029 $** $@
+
+# 64 bit Mach-O library
+lib/libamac64.a: lib/libaelf64.a
+  objconv -fmac64 -nu -wex $** $@
+  
+# 64 bit Mach-O library, override
+lib/libamac64o.a: lib/libaelf64o.a
+  objconv -fmac64 -nu -wex $** $@
+
+# Convert 32 bit COFF library to DLL
+lib/libad32.dll: lib/libacof32.lib obj/libad32.obj32 asm/libad32.def
+  link /DLL /DEF:asm\libad32.def /SUBSYSTEM:WINDOWS /NODEFAULTLIB /ENTRY:DllEntry obj\libad32.obj32 lib/libacof32.lib
+  move libad32.* lib\
+
+# Convert 64 bit COFF library to DLL
+lib/libad64.dll: lib/libacof64.lib obj/libad64.obj64 asm/libad64.def
+  link /DLL /DEF:asm\libad64.def /SUBSYSTEM:WINDOWS /LIBPATH:$(libpath64) /ENTRY:DllEntry obj\libad64.obj64 lib/libacof64.lib
+  move libad64.* lib\
+
+  
+# Object files for inteldispatchpatch.zip:
+
+patch/dispatchpatch32.obj: obj/dispatchpatch32.obj32
+  copy obj\dispatchpatch32.obj32 patch\dispatchpatch32.obj
+# Note: copy must have '\', not '/'
+  
+patch/dispatchpatch32.o: obj/dispatchpatch32.o32pic
+  copy obj\dispatchpatch32.o32pic patch\dispatchpatch32.o
+  
+patch/dispatchpatch32.mac.o: obj/dispatchpatch32.o32pic  
+  objconv -fmac32 -nu -wex -wd1050 $** $@  
+
+patch/dispatchpatch64.obj: obj/dispatchpatch64.obj64
+  copy obj\dispatchpatch64.obj64 patch\dispatchpatch64.obj
+  
+patch/dispatchpatch64.o: obj/dispatchpatch64.o64
+  copy obj\dispatchpatch64.o64 patch\dispatchpatch64.o
+  
+patch/dispatchpatch64.mac.o: obj/dispatchpatch64.o64
+  objconv -fmac64 -nu -wex $** $@  
+
+
+# Generic rules for assembling
+
+# Generic rule for assembling 32-bit code for Windows (position dependent)
+{asm\}.asm{obj\}.obj32:
+  yasm -fwin32 -DWINDOWS -Worphan-labels -Werror -o $*.obj32 $<
+# ML /c /Cx /W3 /coff /Fl /Fo$*.obj32
+
+# Generic rule for assembling 32-bit for Unix, position-dependent
+{asm\}.asm{obj\}.o32:
+  yasm -felf32 -DUNIX -Worphan-labels -Werror -o $*.o32 $<
+  objconv -felf32 -nu- -wd2005 $*.o32 $*.o32
+  
+# Generic rule for assembling 32-bit for Unix, position-independent
+{asm\}.asm{obj\}.o32pic:
+  yasm -felf32 -DUNIX -DPOSITIONINDEPENDENT -Worphan-labels -Werror -o $*.o32pic $<
+  objconv -felf32 -nu- -wd2005 $*.o32pic $*.o32pic
+  
+# Generic rule for assembling 64-bit code for Windows
+{asm\}.asm{obj\}.obj64:
+  yasm -fwin64 -DWINDOWS -Worphan-labels -Werror -o $*.obj64 $<
+
+# Generic rule for assembling 64-bit code for Linux, BSD, Mac
+{asm\}.asm{obj\}.o64:
+  yasm -felf64 -DUNIX -Worphan-labels -Werror -o $*.o64 $<
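
Stripped of the nmake machinery, each generic rule above is a single yasm
invocation per source file. For example, the 64-bit ELF objects could be
produced by hand on a Unix host like this (the final ar step is a sketch of
one way to archive them; the makefile itself archives with objconv -lib):

    yasm -felf64 -DUNIX -Worphan-labels -Werror -o obj/memcpy64.o64 asm/memcpy64.asm
    yasm -felf64 -DUNIX -Worphan-labels -Werror -o obj/strlen64.o64 asm/strlen64.asm
    ar rcs lib/libaelf64.a obj/*.o64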
diff --git a/asmlibSrc/cachesize32.asm b/asmlibSrc/cachesize32.asm
new file mode 100755
index 0000000..c50b957
--- /dev/null
+++ b/asmlibSrc/cachesize32.asm
@@ -0,0 +1,335 @@
+;*************************  cachesize32.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2011-07-11
+; Last modified:    2013-08-14
+; Description:
+; Determines the size of the data caches 
+;
+; extern "C" size_t DataCacheSize(int level);
+; Input: 
+; level: n = 1 - 4: level n data cache
+;        0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _DataCacheSize: function
+
+; Imported from cputype32.asm
+extern _CpuType                 ; near. Determine CPU vendor
+
+; data are referenced as [esi+structuremember] rather than [esi+label-dataref] because
+; of a bug in yasm v. 1.1.0.2352:
+
+struc   data_layout
+ok:     resd    1
+level1: resd    1
+level2: resd    1
+level3: resd    1
+level4: resd    1
+descriptortable: resd 60
+endstruc
+
+struc   descriptor_record              ; record for table of cache descriptors
+d_key:          resb 1                 ; key from cpuid instruction
+d_level:        resb 1                 ; cache level
+d_sizem:        resb 1                 ; size multiplier
+d_2pow:         resb 1                 ; power of 2. size = d_sizem << d_2pow
+endstruc
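+
+; Example of the encoding: size = d_sizem << d_2pow, so a table entry
+; "db 2Ch, 1, 1, 15" below means descriptor key 2Ch, level 1 data cache,
+; 1 << 15 = 32768 bytes (32 kb).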
+
+SECTION .data
+
+dataref:                                ; reference point
+ok_:     DD      0                      ; 1 when values are determined
+level1_: DD      0                      ; level 1 data cache size
+level2_: DD      0                      ; level 2 data cache size
+level3_: DD      0                      ; level 3 data cache size
+level4_: DD      0                      ; level 4 data cache size
+numlevels   equ  4                      ; max level
+
+; From "Intel Processor Identification and the CPUID Instruction", Application Note 485
+descriptortable_:                      ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13                       ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14                       ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14                       ; 16 kb L1 data cache
+db 21h, 2, 1, 18                       ; 256 kb L2 data cache
+db 22h, 3, 1, 19                       ; 512 kb L3 data cache
+db 23h, 3, 1, 20                       ; 1 Mb L3 data cache
+db 25h, 3, 1, 21                       ; 2 Mb L3 data cache
+db 29h, 3, 1, 22                       ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15                       ; 32 kb L1 data cache
+db 39h, 2, 1, 17                       ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16                       ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17                       ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18                       ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17                       ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19                       ; 512 kb L2 data cache
+db 41h, 2, 1, 17                       ; 128 kb L2 data cache
+db 42h, 2, 1, 18                       ; 256 kb L2 data cache
+db 43h, 2, 1, 19                       ; 512 kb L2 data cache
+db 44h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 45h, 2, 1, 21                       ; 2 Mb L2 data cache
+db 46h, 3, 1, 22                       ; 4 Mb L3 data cache
+db 47h, 3, 1, 23                       ; 8 Mb L3 data cache
+db 48h, 2, 3, 20                       ; 3 Mb L2 data cache
+db 49h, 2, 1, 22                       ; 4 Mb L2 or L3 data cache
+db 4Ah, 3, 3, 21                       ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23                       ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22                       ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24                       ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21                       ; 6 Mb L2 data cache
+db 60h, 1, 1, 14                       ; 16 kb L1 data cache
+db 66h, 1, 1, 13                       ; 8 kb L1 data cache
+db 67h, 1, 1, 14                       ; 16 kb L1 data cache
+db 68h, 1, 1, 15                       ; 32 kb L1 data cache
+db 78h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 79h, 2, 1, 17                       ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18                       ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19                       ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20                       ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21                       ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19                       ; 512 kb L2 data cache
+db 82h, 2, 1, 18                       ; 256 kb L2 data cache
+db 83h, 2, 1, 19                       ; 512 kb L2 data cache
+db 84h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 85h, 2, 1, 21                       ; 2 Mb L2 data cache
+db 86h, 2, 1, 19                       ; 512 kb L2 data cache
+db 87h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19                      ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20                      ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20                      ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22                      ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19                      ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20                      ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21                      ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22                      ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23                      ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22                      ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21                      ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23                      ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
+
+
+SECTION .text
+
+; extern "C" site_t _DataCacheSize(int level);
+
+; Function entry:
+_DataCacheSize:
+        push    ebx
+        push    esi
+        push    edi
+        push    ebp
+        mov     edi, [esp+20]       ; level
+%IFDEF  POSITIONINDEPENDENT
+        call    get_thunk_esi
+        add     esi, dataref - $  ; point to dataref
+%ELSE
+        mov     esi, dataref      ; point to dataref
+%ENDIF
+        ; check if called before
+        cmp     dword [esi + ok], 1
+        je      D800
+        
+        ; find cpu vendor
+        push    0
+        mov     eax, esp
+        push    0
+        push    0
+        push    eax
+        call    _CpuType
+        add     esp, 12
+        pop     eax                    ; eax = vendor
+        dec     eax
+        jz      Intel
+        dec     eax
+        jz      AMD
+        dec     eax
+        jz      VIA
+        ; unknown vendor, try all methods
+        call    IntelNewMethod
+        jnc     D800                   ; not carry = success
+        call    AMDMethod
+        jnc     D800                   ; not carry = success
+        call    IntelOldMethod
+        jmp     D800                   ; return whether success or not
+        
+Intel:  call    IntelNewMethod
+        jnc     D800                   ; not carry = success
+        call    IntelOldMethod
+        jmp     D800                   ; return whether success or not
+
+AMD:    ; AMD and VIA use same method
+VIA:    call    AMDMethod
+        
+D800:   ; cache data known, get desired return value
+        xor     eax, eax
+        cmp     edi, numlevels
+        ja      D900
+        cmp     edi, 0
+        je      D820
+        ; level = 1 .. numlevels
+        mov     eax, [esi + edi*4]     ; size of selected cache
+        jmp     D850
+D820:   ; level = 0. Get size of largest level cache
+        mov     eax, [esi + level3]
+        test    eax, eax
+        jnz     D850
+        mov     eax, [esi + level2]
+        test    eax, eax
+        jnz     D850
+        mov     eax, [esi + level1]
+D850:   mov     dword [esi + ok], 1  ; remember called, whether success or not
+D900:   pop     ebp
+        pop     edi
+        pop     esi
+        pop     ebx
+        ret
+
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_esi:
+        mov     esi, [esp]
+        ret
+%ENDIF
+
+
+; Determine cache sizes by CPUID function 4
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
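+;
+; The code below restates the standard CPUID leaf 4 formula: each reported
+; field is one less than the true value, so
+;   size = (sets+1) * (ways+1) * (partitions+1) * (line size+1)
+; with sets in ecx and ways, partitions and line size as bit fields of ebx.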
+IntelNewMethod:
+        xor     eax, eax
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 4
+        jb      I900                   ; fail
+        xor     ebp, ebp               ; loop counter
+I100:   mov     eax, 4
+        mov     ecx, ebp
+        cpuid                          ; get cache parameters
+        mov     edx, eax
+        and     edx, 11111b            ; cache type
+        jz      I500                   ; no more caches
+        cmp     edx, 2
+        je      I200                   ; code cache, ignore
+        inc     ecx                    ; sets
+        mov     edx, ebx
+        shr     edx, 22
+        inc     edx                    ; ways
+        imul    ecx, edx
+        mov     edx, ebx
+        shr     edx, 12
+        and     edx, 1111111111b
+        inc     edx                    ; partitions
+        imul    ecx, edx
+        and     ebx, 111111111111b        
+        inc     ebx                    ; line size
+        imul    ecx, ebx               ; calculated cache size
+        shr     eax, 5
+        and     eax, 111b              ; cache level
+        cmp     eax, numlevels
+        jna     I180
+        mov     eax, numlevels         ; limit higher levels
+I180:   mov     [esi+eax*4], ecx       ; store size of data cache level eax
+I200:   inc     ebp
+        cmp     ebp, 100h              ; avoid infinite loop
+        jb      I100                   ; next cache
+I500:   ; loop finished
+        ; check if OK
+        mov     eax, [esi+level1]
+        cmp     eax, 1024
+I900:   ret                            ; carry flag set if fail
+
+; Determine cache sizes by CPUID function 2
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelOldMethod:
+        xor     eax, eax
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 2
+        jb      J900                   ; fail
+        mov     eax, 2
+        xor     ecx, ecx
+        cpuid                          ; get 16 descriptor bytes in eax, ebx, ecx, edx
+        mov     al, 0                  ; al does not contain a descriptor
+        push    eax                    ; save all descriptors
+        push    ebx
+        push    ecx
+        push    edx                    ; now esp points to descriptors
+        mov     edx, 15                ; loop counter
+        ; loop to read 16 descriptor bytes
+J100:   mov     al, byte [esp+edx]
+        ; find in table
+        mov     ebx, descriptortablelength-1  ; loop counter
+        ; loop to search in descriptortable
+J200:   cmp     al, [esi + descriptortable + ebx*descriptor_record_size + d_key]
+        jne     J300
+        ; descriptor found
+;       YASM v. 1.1.0 fails if there are too many of (label-dataref): !
+;       movzx   eax, byte [esi + ebx*4 + (descriptortable_-dataref) + d_sizem]
+        movzx   eax, byte [esi + ebx*4 + descriptortable + d_sizem]
+        mov     cl,  [esi + ebx*4 + descriptortable + d_2pow]
+        shl     eax, cl                ; compute size
+        movzx   ecx, byte [esi + descriptortable + ebx*4 + d_level]
+        ; check that level = 1-3
+        cmp     ecx, 3
+        ja      J300
+        mov     [esi+ecx*4], eax       ; store size eax of data cache level ecx
+J300:   dec     ebx
+        jns     J200                   ; inner loop
+        dec     edx
+        jns     J100                   ; outer loop
+        add     esp, 16                ; remove from stack
+        ; check if OK
+        mov     eax, [esi+level1]
+        cmp     eax, 1024
+J900:   ret                            ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: esi = pointer to dataref
+; output: values returned in dataref
+; carry flag = 0 on success
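+;
+; Worked example for the L3 field (illustration): CPUID 80000006H reports
+; the L3 size in edx bits 31-18 in units of 512 kb, so a field value of 16
+; gives 16 << 19 = 8388608 bytes = 8 Mb.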
+AMDMethod:
+        mov     eax, 80000000H
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 6
+        jb      K900                   ; fail
+        mov     eax, 80000005H
+        cpuid                          ; get L1 cache size
+        shr     ecx, 24                ; L1 data cache size in kbytes
+        shl     ecx, 10                ; L1 data cache size in bytes
+        mov     [esi+level1], ecx      ; store L1 data cache size
+        mov     eax, 80000006H
+        cpuid                          ; get L2 and L3 cache sizes
+        shr     ecx, 16                ; L2 data cache size in kbytes
+        shl     ecx, 10                ; L2 data cache size in bytes
+        mov     [esi+level2], ecx      ; store L2 data cache size
+        mov     ecx, edx
+        shr     ecx, 18                ; L3 data cache size / 512 kbytes
+        shl     ecx, 19                ; L3 data cache size in bytes
+%if 0   ; AMD manual is unclear: 
+        ; do we have to increase the value if the number of ways is not a power of 2?
+        shr     edx, 12
+        and     edx, 1111b             ; L3 associativity
+        cmp     edx, 3
+        jb      K100
+        test    edx, 1
+        jz      K100
+        ; number of ways is not a power of 2, multiply by 1.5 ?
+        mov     eax, ecx
+        shr     eax, 1
+        add     ecx, eax
+%endif
+K100:   mov     [esi+level3], ecx      ; store L3 data cache size
+        ; check if OK
+        mov     eax, [esi+level1]
+        cmp     eax, 1024
+K900:   ret                            ; carry flag set if fail
diff --git a/asmlibSrc/cachesize64.asm b/asmlibSrc/cachesize64.asm
new file mode 100755
index 0000000..b8c07b8
--- /dev/null
+++ b/asmlibSrc/cachesize64.asm
@@ -0,0 +1,333 @@
+;*************************  cachesize64.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2011-07-11
+; Last modified:    2013-08-14
+; Description:
+; Determines the size of the data caches 
+;
+; extern "C" site_t DataCacheSize(int level);
+; Input: 
+; level: n = 1 - 4: level n data cache
+;        0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global DataCacheSize: function
+
+; Imported from cputype64.asm
+extern CpuType                         ; near. Determine CPU vendor
+
+struc   data_layout
+ok:     resd    2
+level1: resq    1
+level2: resq    1
+level3: resq    1
+level4: resq    1
+descriptortable: resd 60
+endstruc
+
+struc   descriptor_record              ; record for table of cache descriptors
+d_key:          resb 1                 ; key from cpuid instruction
+d_level:        resb 1                 ; cache level
+d_sizem:        resb 1                 ; size multiplier
+d_2pow:         resb 1                 ; power of 2. size = d_sizem << d_2pow
+endstruc
+
+SECTION .data
+
+dataref:                               ; reference point
+ok_:       DD      0, 0                ; 1 when values are determined
+level1_:   DQ      0                   ; level 1 data cache size
+level2_:   DQ      0                   ; level 2 data cache size
+level3_:   DQ      0                   ; level 3 data cache size
+level4_:   DQ      0                   ; level 4 data cache size
+numlevels  equ     4                   ; max level
+
+; From "Intel Processor Identification and the CPUID Instruction, Application note 485
+descriptortable_:                      ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13                       ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14                       ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14                       ; 16 kb L1 data cache
+db 21h, 2, 1, 18                       ; 256 kb L2 data cache
+db 22h, 3, 1, 19                       ; 512 kb L3 data cache
+db 23h, 3, 1, 20                       ; 1 Mb L3 data cache
+db 25h, 3, 1, 21                       ; 2 Mb L3 data cache
+db 29h, 3, 1, 22                       ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15                       ; 32 kb L1 data cache
+db 39h, 2, 1, 17                       ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16                       ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17                       ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18                       ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17                       ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19                       ; 512 kb L2 data cache
+db 41h, 2, 1, 17                       ; 128 kb L2 data cache
+db 42h, 2, 1, 18                       ; 256 kb L2 data cache
+db 43h, 2, 1, 19                       ; 512 kb L2 data cache
+db 44h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 45h, 2, 1, 21                       ; 2 Mb L2 data cache
+db 46h, 3, 1, 22                       ; 4 Mb L3 data cache
+db 47h, 3, 1, 23                       ; 8 Mb L3 data cache
+db 48h, 2, 3, 20                       ; 3 Mb L2 data cache
+db 49h, 2, 1, 22                       ; 4 Mb L2 or L3 data cache
+db 4Ah, 3, 3, 21                       ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23                       ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22                       ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24                       ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21                       ; 6 Mb L2 data cache
+db 60h, 1, 1, 14                       ; 16 kb L1 data cache
+db 66h, 1, 1, 13                       ; 8 kb L1 data cache
+db 67h, 1, 1, 14                       ; 16 kb L1 data cache
+db 68h, 1, 1, 15                       ; 32 kb L1 data cache
+db 78h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 79h, 2, 1, 17                       ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18                       ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19                       ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20                       ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21                       ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19                       ; 512 kb L2 data cache
+db 82h, 2, 1, 18                       ; 256 kb L2 data cache
+db 83h, 2, 1, 19                       ; 512 kb L2 data cache
+db 84h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 85h, 2, 1, 21                       ; 2 Mb L2 data cache
+db 86h, 2, 1, 19                       ; 512 kb L2 data cache
+db 87h, 2, 1, 20                       ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19                      ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20                      ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20                      ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22                      ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19                      ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20                      ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21                      ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21                      ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22                      ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23                      ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22                      ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21                      ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23                      ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
+
+
+SECTION .text
+
+; extern "C" site_t DataCacheSize(int level);
+
+; Function entry:
+DataCacheSize:
+        push    rbx
+        push    r14
+%ifdef  WINDOWS
+        push    rsi
+        push    rdi
+        mov     r14d, ecx              ; level
+%else   ; UNIX
+        mov     r14d, edi              ; level
+%endif
+        ; check if called before
+        lea     r9, [dataref]
+        cmp     dword [r9+ok], 1       ; ok
+        je      D800
+        
+        ; find cpu vendor
+        push    0
+%ifdef  WINDOWS
+        mov     rcx, rsp
+        xor     edx, edx
+        xor     r8d, r8d
+%else   ; UNIX
+        mov     rdi, rsp
+        xor     esi, esi
+        xor     edx, edx
+%endif        
+        call    CpuType
+        lea     r9, [dataref]
+        pop     rax                    ; eax = vendor
+        dec     eax
+        jz      Intel
+        dec     eax
+        jz      AMD
+        dec     eax
+        jz      VIA
+        ; unknown vendor, try all methods
+        call    IntelNewMethod
+        jnc     D800                   ; not carry = success
+        call    AMDMethod
+        jnc     D800                   ; not carry = success
+        call    IntelOldMethod
+        jmp     D800                   ; return whether success or not
+        
+Intel:  call    IntelNewMethod
+        jnc     D800                   ; not carry = success
+        call    IntelOldMethod
+        jmp     D800                   ; return whether success or not
+
+AMD:    ; AMD and VIA use same method
+VIA:    call    AMDMethod
+        
+D800:   ; cache data known, get desired return value
+        xor     eax, eax
+        cmp     r14d, numlevels
+        ja      D900
+        cmp     r14d, 0
+        je      D820
+        ; level = 1 .. numlevels
+        mov     rax, [r9 + r14*8]      ; size of selected cache
+        jmp     D850
+D820:   ; level = 0. Get size of largest level cache
+        mov     rax, [r9 + level3]     ; level3
+        test    rax, rax
+        jnz     D850
+        mov     rax, [r9 + level2]     ; level2
+        test    rax, rax
+        jnz     D850
+        mov     eax, [r9 + level1]     ; level1
+D850:   mov     dword [r9 + ok], 1     ; remember called, whether success or not
+D900:   
+%ifdef  WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        pop     r14
+        pop     rbx
+        ret
+
+
+; Determine cache sizes by CPUID function 4
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelNewMethod:
+        xor     eax, eax
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 4
+        jb      I900                   ; fail
+        xor     esi, esi               ; loop counter
+I100:   mov     eax, 4
+        mov     ecx, esi
+        cpuid                          ; get cache parameters
+        mov     edx, eax
+        and     edx, 11111b            ; cache type
+        jz      I500                   ; no more caches
+        cmp     edx, 2
+        je      I200                   ; code cache, ignore
+        inc     ecx                    ; sets
+        mov     edx, ebx
+        shr     edx, 22
+        inc     edx                    ; ways
+        imul    ecx, edx
+        mov     edx, ebx
+        shr     edx, 12
+        and     edx, 1111111111b
+        inc     edx                    ; partitions
+        imul    ecx, edx
+        and     ebx, 111111111111b        
+        inc     ebx                    ; line size
+        imul    rcx, rbx               ; calculated cache size (64 bit)
+        shr     eax, 5
+        and     eax, 111b              ; cache level
+        cmp     eax, numlevels
+        jna     I180
+        mov     eax, numlevels         ; limit higher levels
+I180:   mov     [r9+rax*8], rcx        ; store size of data cache level eax
+I200:   inc     esi
+        cmp     esi, 100h              ; avoid infinite loop
+        jb      I100                   ; next cache
+I500:   ; loop finished
+        ; check if OK
+        mov     eax, [r9+level1]       ; level1
+        cmp     eax, 1024
+I900:   ret                            ; carry flag set if fail
+
+; Determine cache sizes by CPUID function 2
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelOldMethod:
+        xor     eax, eax
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 2
+        jb      J900                   ; fail
+        mov     eax, 2
+        xor     ecx, ecx
+        cpuid                          ; get 16 descriptor bytes in eax, ebx, ecx, edx
+        mov     al, 0                  ; al does not contain a descriptor
+        sub     rsp, 16
+        mov     [rsp],    eax          ; save all descriptors
+        mov     [rsp+4],  ebx
+        mov     [rsp+8],  ecx
+        mov     [rsp+12], edx
+        mov     edx, 15                ; loop counter
+        ; loop to read 16 descriptor bytes
+J100:   mov     al, byte [rsp+rdx]
+        ; find in table
+        mov     ebx, descriptortablelength-1  ; loop counter
+        ; loop to search in descriptortable
+J200:   cmp     al, [r9 + descriptortable + rbx*4 + d_key]
+        jne     J300
+        ; descriptor found
+        movzx   eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
+        mov     cl,  [r9 + descriptortable + rbx*4 + d_2pow]
+        shl     eax, cl                ; compute size
+        movzx   ecx, byte [r9 + descriptortable + rbx*4 + d_level]
+        ; check that level = 1-3
+        cmp     ecx, 3
+        ja      J300
+        mov     [r9+rcx*8], rax        ; store size rax of data cache level rcx
+J300:   dec     ebx
+        jns     J200                   ; inner loop
+        dec     edx
+        jns     J100                   ; outer loop
+        add     rsp, 16                ; remove from stack
+        ; check if OK
+        mov     eax, [r9 + level1]
+        cmp     eax, 1024
+J900:   ret                            ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: r9 = pointer to dataref
+; output: values returned in dataref
+; carry flag = 0 on success
+AMDMethod:
+        mov     eax, 80000000H
+        cpuid                          ; get number of CPUID functions
+        cmp     eax, 6
+        jb      K900                   ; fail
+        mov     eax, 80000005H
+        cpuid                          ; get L1 cache size
+        shr     ecx, 24                ; L1 data cache size in kbytes
+        shl     ecx, 10                ; L1 data cache size in bytes
+        mov     [r9 + level1], ecx     ; store L1 data cache size
+        mov     eax, 80000006H
+        cpuid                          ; get L2 and L3 cache sizes
+        shr     ecx, 16                ; L2 data cache size in kbytes
+        shl     ecx, 10                ; L2 data cache size in bytes
+        mov     [r9 + level2], ecx     ; store L2 data cache size
+        mov     ecx, edx
+        shr     ecx, 18                ; L3 data cache size / 512 kbytes
+        shl     rcx, 19                ; L3 data cache size in bytes
+%if 0   ; AMD manual is unclear: 
+        ; do we have to increase the value if the number of ways is not a power of 2?
+        shr     edx, 12
+        and     edx, 1111b             ; L3 associativity
+        cmp     edx, 3
+        jb      K100
+        test    edx, 1
+        jz      K100
+        ; number of ways is not a power of 2, multiply by 1.5 ?
+        mov     rax, rcx
+        shr     rax, 1
+        add     rcx, rax
+%endif
+K100:   mov     [r9 + level3], rcx     ; store L3 data cache size
+        ; check if OK
+        mov     eax, [r9 + level1]
+        cmp     eax, 1024
+K900:   ret                            ; carry flag set if fail
diff --git a/asmlibSrc/cpuid32.asm b/asmlibSrc/cpuid32.asm
new file mode 100755
index 0000000..ec601a3
--- /dev/null
+++ b/asmlibSrc/cpuid32.asm
@@ -0,0 +1,38 @@
+;*************************  cpuid32.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2008-12-14
+; Last modified:    2011-07-01
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _cpuid_ex: function
+
+SECTION .text  align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int eax, int ecx);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
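+;
+; Hypothetical usage from C (illustration only):
+;     int abcd[4];
+;     cpuid_ex(abcd, 0, 0);  // leaf 0: abcd[0] = highest function,
+;                            // abcd[1], abcd[3], abcd[2] = vendor string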
+
+
+_cpuid_ex:
+        push    ebx
+        push    edi
+        mov     edi, [esp+12]          ; abcd out
+        mov     eax, [esp+16]          ; eax in
+        mov     ecx, [esp+20]          ; ecx in
+        cpuid                          ; input eax, ecx. output eax, ebx, ecx, edx
+        mov     [edi],    eax
+        mov     [edi+4],  ebx
+        mov     [edi+8],  ecx
+        mov     [edi+12], edx
+        pop     edi
+        pop     ebx
+        ret
+;_cpuid_ex END
diff --git a/asmlibSrc/cpuid64.asm b/asmlibSrc/cpuid64.asm
new file mode 100755
index 0000000..80cd249
--- /dev/null
+++ b/asmlibSrc/cpuid64.asm
@@ -0,0 +1,53 @@
+;*************************  cpuid64.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2008-12-14
+; Last modified:    2011-07-01
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global cpuid_ex: function
+
+SECTION .text  align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int a, int c);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
+
+
+cpuid_ex:
+
+%IFDEF   WINDOWS
+; parameters: rcx = abcd, edx = a, r8d = c
+        push    rbx
+        xchg    rcx, r8
+        mov     eax, edx
+        cpuid                          ; input eax, ecx. output eax, ebx, ecx, edx
+        mov     [r8],    eax
+        mov     [r8+4],  ebx
+        mov     [r8+8],  ecx
+        mov     [r8+12], edx
+        pop     rbx
+%ENDIF        
+%IFDEF   UNIX
+; parameters: rdi = abcd, esi = a, edx = c
+        push    rbx
+        mov     eax, esi
+        mov     ecx, edx
+        cpuid                          ; input eax, ecx. output eax, ebx, ecx, edx
+        mov     [rdi],    eax
+        mov     [rdi+4],  ebx
+        mov     [rdi+8],  ecx
+        mov     [rdi+12], edx
+        pop     rbx
+%ENDIF        
+        ret
+;cpuid_ex END
diff --git a/asmlibSrc/cputype32.asm b/asmlibSrc/cputype32.asm
new file mode 100755
index 0000000..0ab02e2
--- /dev/null
+++ b/asmlibSrc/cputype32.asm
@@ -0,0 +1,139 @@
+;*************************  cputype32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2011-07-09
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor: 
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model:  This is the model + (extended model << 4)
+;
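+; Hypothetical usage from C (illustration only, not in the original header):
+;     int vendor, family, model;
+;     CpuType(&vendor, &family, &model);
+;     if (vendor == 1) { /* GenuineIntel */ }
+;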
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+
+global _CpuType: function
+
+
+SECTION .text
+
+_CpuType:
+        push    ebx
+        push    esi
+        push    edi
+        
+; parameters
+%define vendor  esp+16
+%define family  esp+20
+%define model   esp+24
+
+        xor     esi, esi               ; vendor
+        xor     edi, edi               ; family
+
+        ; detect if CPUID instruction supported by microprocessor:
+        pushfd
+        pop     eax
+        btc     eax, 21                ; check if CPUID bit can toggle
+        push    eax
+        popfd
+        pushfd
+        pop     ebx
+        xor     ebx, eax
+        bt      ebx, 21
+        jc      C900                   ; CPUID not supported
+        
+        xor     eax, eax
+        cpuid                          ; get number of CPUID functions
+        
+        ; get vendor
+        ; ecx = last  4 characters of vendor string
+        ; ebx = first 4 characters of vendor string
+        cmp     ecx, 'ntel'            ; 'GenuineIntel'
+        je      C110
+        cmp     ecx, 'cAMD'            ; 'AuthenticAMD'
+        je      C120
+        cmp     ebx, 'Cent'            ; 'CentaurHauls'
+        je      C130
+        cmp     ebx, 'VIA '            ; 'VIA VIA VIA '
+        je      C130
+        cmp     ebx, 'Cyri'            ; 'CyrixInstead'
+        je      C140
+        cmp     ebx, 'NexG'            ; 'NexGenDriven'
+        je      C150
+        jmp     C200                   ; other
+C110:   or      esi, 1
+        jmp     C200
+C120:   or      esi, 2
+        jmp     C200
+C130:   or      esi, 3
+        jmp     C200
+C140:   or      esi, 4
+        jmp     C200
+C150:   or      esi, 5
+        ;jmp     C200
+C200:   
+        test    eax, eax
+        jz      C900                   ; function 1 not supported
+
+        ; Get family and model
+        mov     eax, 1
+        cpuid                          
+        mov     ebx, eax
+        mov     edi, eax
+        shr     ebx, 8
+        and     ebx, 0FH               ; Family
+        shr     edi, 20
+        and     edi, 0FFH              ; Extended family
+        add     edi, ebx               ; Family + extended family
+        
+        mov     ebx, eax
+        shr     ebx, 4
+        and     ebx, 0FH               ; Model
+        mov     ecx, eax
+        shr     ecx, 12
+        and     ecx, 0F0H              ; Extended model
+        or      ebx, ecx               ; extended model | model
+        
+C300:   ; return esi = vendor, edi = family, ebx = model
+        mov     eax, [vendor]
+        test    eax, eax
+        jz      C310
+        mov     [eax], esi
+C310:   mov     eax, [family]
+        test    eax, eax
+        jz      C320
+        mov     [eax], edi
+C320:   mov     eax, [model]
+        test    eax, eax
+        jz      C330
+        mov     [eax], ebx
+C330:   xor     eax, eax
+        ; return
+        pop     edi
+        pop     esi
+        pop     ebx
+        ret
+        
+C900:   ; no cpuid
+        xor     ebx, ebx
+        jmp     C300
+;_CpuType ENDP
diff --git a/asmlibSrc/cputype64.asm b/asmlibSrc/cputype64.asm
new file mode 100755
index 0000000..c74c9d2
--- /dev/null
+++ b/asmlibSrc/cputype64.asm
@@ -0,0 +1,125 @@
+;*************************  cputype64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2011-07-09
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor: 
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model:  This is the model + (extended model << 4)
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+
+global CpuType: function
+
+
+SECTION .text
+
+CpuType:
+        push    rbx
+%ifdef  UNIX
+        mov     r8, rdx
+%endif
+%ifdef  WINDOWS        
+        push    rsi
+        push    rdi
+        mov     rdi, rcx
+        mov     rsi, rdx
+%endif
+        
+; parameters
+; vendor  rdi
+; family  rsi
+; model   r8
+
+        xor     r9d,  r9d              ; vendor
+        xor     r10d, r10d             ; family
+        xor     r11d, r11d             ; model
+
+        xor     eax, eax
+        cpuid                          ; get vendor
+        ; ecx = last  4 characters of vendor string
+        ; ebx = first 4 characters of vendor string
+        cmp     ecx, 'ntel'            ; 'GenuineIntel'
+        je      C110
+        cmp     ecx, 'cAMD'            ; 'AuthenticAMD'
+        je      C120
+        cmp     ebx, 'Cent'            ; 'CentaurHauls'
+        je      C130
+        cmp     ebx, 'VIA '            ; 'VIA VIA VIA '
+        je      C130
+        cmp     ebx, 'Cyri'            ; 'CyrixInstead'
+        je      C140
+        cmp     ebx, 'NexG'            ; 'NexGenDriven'
+        je      C150
+        jmp     C200                   ; other
+C110:   or      r9d, 1
+        jmp     C200
+C120:   or      r9d, 2
+        jmp     C200
+C130:   or      r9d, 3
+        jmp     C200
+C140:   or      r9d, 4
+        jmp     C200
+C150:   or      r9d, 5
+        ;jmp     C200
+C200:   
+
+        ; Get family and model
+        mov     eax, 1
+        cpuid                          
+        mov     ebx, eax
+        mov     r10d, eax
+        shr     ebx, 8
+        and     ebx, 0FH               ; Family
+        shr     r10d, 20
+        and     r10d, 0FFH             ; Extended family
+        add     r10d, ebx              ; Family + extended family
+        
+        mov     r11d, eax
+        shr     r11d, 4
+        and     r11d, 0FH              ; Model
+        shr     eax, 12
+        and     eax, 0F0H              ; Extended model
+        or      r11d, eax              ; extended model | Model
+        
+C300:   ; return r9d = vendor, r10d = family, r11d = model
+        test    rdi, rdi
+        jz      C310
+        mov     [rdi], r9d
+C310:   test    rsi, rsi
+        jz      C320
+        mov     [rsi], r10d
+C320:   test    r8, r8
+        jz      C330
+        mov     [r8], r11d
+C330:   xor     eax, eax
+        ; return
+%ifdef  WINDOWS 
+        pop     rdi
+        pop     rsi
+%endif
+        pop     rbx
+        ret
+;CpuType ENDP
diff --git a/asmlibSrc/debugbreak32.asm b/asmlibSrc/debugbreak32.asm
new file mode 100755
index 0000000..17a3ec9
--- /dev/null
+++ b/asmlibSrc/debugbreak32.asm
@@ -0,0 +1,31 @@
+;*************************  debugbreak32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2011-07-09
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global _A_DebugBreak: function
+
+
+SECTION .text
+
+_A_DebugBreak:
+        int3
+        nop
+        ret
+;_A_DebugBreak ENDP
diff --git a/asmlibSrc/debugbreak64.asm b/asmlibSrc/debugbreak64.asm
new file mode 100755
index 0000000..bbb32ef
--- /dev/null
+++ b/asmlibSrc/debugbreak64.asm
@@ -0,0 +1,31 @@
+;*************************  debugbreak64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2011-07-09
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global A_DebugBreak: function
+
+
+SECTION .text
+
+A_DebugBreak:
+        int3
+        nop
+        ret
+;A_DebugBreak ENDP
diff --git a/asmlibSrc/dispatchpatch32.asm b/asmlibSrc/dispatchpatch32.asm
new file mode 100755
index 0000000..ef03b69
--- /dev/null
+++ b/asmlibSrc/dispatchpatch32.asm
@@ -0,0 +1,311 @@
+;***********************  dispatchpatch32.asm  ********************************
+; Author:           Agner Fog
+; Date created:     2007-07-20
+; Last modified:    2014-07-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int  __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve 
+; compatibility of Intel function libraries with non-Intel processors.
+; In Windows, use static link libraries (*.lib), not dynamic libraries
+; (*.dll). Linking in this as an object file will override the functions
+; with the same name in the library.
+; 
+; Copyright (c) 2007-2014 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
+
+; extern _InstructionSet: function
+%include "instrset32.asm"              ; include code for _InstructionSet function
+
+; InstructionSet function return value:
+;  0 =  80386 instruction set only
+;  1 or above = MMX instructions supported
+;  2 or above = conditional move and FCOMI supported
+;  3 or above = SSE (XMM) supported by processor and operating system
+;  4 or above = SSE2 supported
+;  5 or above = SSE3 supported
+;  6 or above = Supplementary SSE3
+;  8 or above = SSE4.1 supported
+;  9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512F
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Dispatcher for Intel standard libraries and SVML library,
+;  old versions
+;
+;  __intel_cpu_indicator is for older versions of Intel compiler
+;  version 14.0 uses __intel_cpu_features_init_x() instead
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global ___intel_cpu_indicator
+global ___intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@:                  ; local name
+___intel_cpu_indicator: dd 0
+; table of indicator values
+itable  DD      1                      ; 0: generic version, 80386 instruction set
+        DD      8, 8                   ; 1,   2: MMX
+        DD      0x80                   ; 3:      SSE
+        DD      0x200                  ; 4:      SSE2
+        DD      0x800                  ; 5:      SSE3
+        DD      0x1000,  0x1000        ; 6,   7: SSSE3
+        DD      0x2000,  0x2000        ; 8,   9: SSE4.1
+        DD      0x8000,  0x8000        ; 10, 11: SSE4.2 and popcnt
+        DD      0x20000, 0x20000       ; 12, 13: AVX, pclmul, aes
+        DD      0x400000               ; 14:     AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+        DD      0x400000               ; 15:     AVX512F (same indicator value as 14)
+        
+itablelen equ ($ - itable) / 4         ; length of table
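+
+; Example of the mapping (illustration): on a CPU where _InstructionSet
+; returns 10 (SSE4.2), itable[10] = 0x8000 is stored in
+; ___intel_cpu_indicator, which the Intel library reads as SSE4.2 support.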
+
+SECTION .text
+
+; This is already in instrset.asm file
+;%IFDEF POSITIONINDEPENDENT
+; Local function for reading instruction pointer into edi
+;GetThunkEDX:
+;        mov     edx, [esp]
+;        ret
+;%ENDIF  ; POSITIONINDEPENDENT
+
+
+___intel_cpu_indicator_init:
+        pushad                         ; Must save registers
+        call    _InstructionSet
+        cmp     eax, itablelen
+        jb      L100
+        mov     eax, itablelen - 1     ; limit to table length
+L100:   
+%IFDEF POSITIONINDEPENDENT
+        ; Position-independent code for ELF and Mach-O shared objects:
+        call    GetThunkEDX
+        add     edx, intel_cpu_indicator@ - $
+%ELSE
+        lea     edx, [intel_cpu_indicator@]
+%ENDIF  
+        mov     eax, [edx + (itable - intel_cpu_indicator@) + 4*eax]
+        mov     [edx], eax             ; store in ___intel_cpu_indicator
+        popad
+        ret
+;___intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for Math Kernel Library (MKL),
+;     version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _mkl_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+mkltab  DD      0, 0, 0, 0             ; 0-3: generic version, 80386 instruction set
+        DD      2                      ; 4:      SSE2
+        DD      3                      ; 5:      SSE3
+        DD      4                      ; 6:      SSSE3
+        DD      4                      ; 7:      unused
+        DD      4                      ; 8:      SSE4.1
+        DD      4                      ; 9:      POPCNT
+        DD      5                      ; 10:     SSE4.2
+        DD      6                      ; 11:     AVX
+        DD      6                      ; 12:     PCLMUL, AES
+        DD      6                      ; 13:     AVX2
+        DD      7                      ; 14:     FMA3, BMI1/2, LZCNT
+;        DD      7                      ; 15:     AVX512F
+        
+mkltablen equ ($ - mkltab) / 4         ; length of table
+
+SECTION .text
+
+_mkl_serv_cpu_detect:
+        push    ecx                    ; Perhaps not needed
+        push    edx
+        call    _InstructionSet
+        cmp     eax, mkltablen
+        jb      M100
+        mov     eax, mkltablen - 1     ; limit to table length
+M100:   
+%IFDEF POSITIONINDEPENDENT
+        ; Position-independent code for ELF and Mach-O shared objects:
+        call    GetThunkEDX
+        add     edx, mkltab - $
+%ELSE
+        lea     edx, [mkltab]
+%ENDIF  
+        mov     eax, [edx + 4*eax]
+        pop     edx
+        pop     ecx
+        ret
+; end _mkl_serv_cpu_detect        
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for Vector Math Library (VML)
+;     version 14.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _mkl_vml_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+vmltab  DD      0, 0, 0                ; 0-2: generic version, 80386 instruction set
+        DD      2                      ; 3:      SSE
+        DD      3                      ; 4:      SSE2
+        DD      4                      ; 5:      SSE3
+        DD      5                      ; 6:      SSSE3
+        DD      5                      ; 7:      unused
+        DD      6                      ; 8:      SSE4.1
+        DD      6                      ; 9:      POPCNT
+        DD      7                      ; 10:     SSE4.2
+        DD      8                      ; 11:     AVX
+        DD      8                      ; 12:     PCLMUL, AES
+        DD      8                      ; 13:     AVX2
+        DD      9                      ; 14:     FMA3, BMI1/2, LZCNT
+;        DD      9                      ; 15:     AVX512F
+
+vmltablen equ ($ - vmltab) / 4         ; length of table
+
+SECTION .text
+
+_mkl_vml_serv_cpu_detect:
+        push    ecx                    ; Perhaps not needed
+        push    edx
+        call    _InstructionSet
+        cmp     eax, vmltablen
+        jb      V100
+        mov     eax, vmltablen - 1     ; limit to table length
+V100:   
+%IFDEF POSITIONINDEPENDENT
+        ; Position-independent code for ELF and Mach-O shared objects:
+        call    GetThunkEDX
+        add     edx, vmltab - $
+%ELSE
+        lea     edx, [vmltab]
+%ENDIF  
+        mov     eax, [edx + 4*eax]
+        pop     edx
+        pop     ecx
+        ret
+; end _mkl_vml_serv_cpu_detect 
+
+       
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for __intel_cpu_feature_indicator 
+;     version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if 0   ; Don't include this!
+
+; __intel_cpu_features_init and __intel_cpu_features_init_x are 
+; identical, except that the former checks the CPU brand, the
+; latter does not. Don't override this function. Instead, set
+; the indicator variables to 0 to force a re-evaluation,
+; and call __intel_cpu_features_init_x.
+; If you do want to override these functions then you must
+; save all registers.
+
+
+global __intel_cpu_features_init
+global __intel_cpu_feature_indicator
+global __intel_cpu_fms_indicator
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x  DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x:     DD 0, 0
+
+
+feattab DD  1                ; 0 default
+        DD  0BH              ; 1 MMX
+        DD  0FH              ; 2 conditional move and FCOMI supported
+        DD  3FH              ; 3 SSE
+        DD  7FH              ; 4 SSE2
+        DD  0FFH             ; 5 SSE3
+        DD  1FFH, 1FFH       ; 6 Supplementary SSE3
+        DD  3FFH             ; 8 SSE4.1
+        DD  0BFFH            ; 9 POPCNT 
+        DD  0FFFH            ; 10 SSE4.2 
+        DD  10FFFH           ; 11 AVX 
+        DD  16FFFH           ; 12 PCLMUL and AES 
+        DD  816FFFH          ; 13 AVX2 
+        DD  9DEFFFH          ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+;        DD  0FDEFFFH         ; 15 HLE, RTM 
+
+feattablen equ ($ - feattab) / 4  ; length of table
+
+SECTION .text
+
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+        push    ecx
+        push    edx
+        call    _InstructionSet
+        cmp     eax, feattablen
+        jb      F100
+        mov     eax, feattablen - 1    ; limit to table length
+F100:   
+        lea     edx, [feattab]
+        mov     ebx, [edx + 4*eax]     ; look up in table        
+        push    ebx
+        mov     eax, 1
+        cpuid
+        pop     ebx
+        bt      ecx, 22                ; MOVBE
+        jnc     F200
+        or      ebx, 1000H
+F200:   mov     [intel_cpu_feature_indicator@], ebx
+
+        ; get family and model
+        mov     edx, eax
+        and     eax, 0FH               ; stepping bit 0-3
+        mov     ecx, edx
+        shr     ecx, 4
+        and     ecx, 0FH               ; model
+        mov     ebx, edx
+        shr     ebx, 12
+        and     ebx, 0F0H              ; x model
+        or      ecx, ebx               ; full model
+        mov     ah,  cl                ; model bit 8 - 15
+        mov     ecx, edx
+        shr     ecx, 8
+        and     ecx, 0FH               ; family
+        mov     ebx, edx
+        shr     ebx, 20
+        and     ebx, 0FFH              ; x family
+        add     ecx, ebx               ; full family
+        shl     ecx, 16
+        or      eax, ecx               ; full family bit 16 - 23
+        mov     [intel_cpu_fms_indicator@], eax
+        
+        pop     edx
+        pop     ecx
+        ret
+; end __intel_cpu_features_init        
+
+%endif
diff --git a/asmlibSrc/dispatchpatch64.asm b/asmlibSrc/dispatchpatch64.asm
new file mode 100755
index 0000000..8f9457a
--- /dev/null
+++ b/asmlibSrc/dispatchpatch64.asm
@@ -0,0 +1,328 @@
+;***********************  dispatchpatch64.asm  ********************************
+; Author:           Agner Fog
+; Date created:     2007-07-20
+; Last modified:    2014-07-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int  __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve 
+; compatibility of Intel function libraries with non-Intel processors.
+; Only works with static link libraries (*.lib, *.a), not dynamic libraries
+; (*.dll, *.so). Linking in this as an object file will override the functions
+; with the same name in the library.
+; 
+; Copyright (c) 2007-2014 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
+
+; extern InstructionSet: function
+%include "instrset64.asm"              ; include code for InstructionSet function
+
+; InstructionSet function return value:
+;  4 or above = SSE2 supported
+;  5 or above = SSE3 supported
+;  6 or above = Supplementary SSE3
+;  8 or above = SSE4.1 supported
+;  9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512F
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Dispatcher for Intel standard libraries and SVML library,
+;  old versions
+;
+;  __intel_cpu_indicator is for older versions of Intel compiler
+;  version 14.0 uses __intel_cpu_features_init_x() instead
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global __intel_cpu_indicator
+global __intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@:                  ; local name
+__intel_cpu_indicator: dd 0
+
+; table of indicator values
+itable  DD      1                      ; 0: generic version, 80386 instruction set
+        DD      8, 8                   ; 1,   2: MMX
+        DD      0x80                   ; 3:      SSE
+        DD      0x200                  ; 4:      SSE2
+        DD      0x800                  ; 5:      SSE3
+        DD      0x1000,  0x1000        ; 6,   7: SSSE3
+        DD      0x2000,  0x2000        ; 8,   9: SSE4.1
+        DD      0x8000,  0x8000        ; 10, 11: SSE4.2 and popcnt
+        DD      0x20000, 0x20000       ; 12, 13: AVX, pclmul, aes
+        DD      0x400000               ; 14:     AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+;        DD      0x800000               ; 15:     HLE, RTM
+itablelen equ ($ - itable) / 4         ; length of table
+
+SECTION .text
+
+__intel_cpu_indicator_init:
+        push    rax                    ; registers must be pushed
+        push    rcx
+        push    rdx
+        push    r8
+        push    r9
+        push    r10
+        push    r11
+        push    rsi
+        push    rdi
+        call    InstructionSet
+        cmp     eax, itablelen
+        jb      L100
+        mov     eax, itablelen - 1     ; limit to table length
+L100:   lea     rdx, [rel itable]
+        mov     eax, [rdx + 4*rax]
+        mov     [rel intel_cpu_indicator@], eax             ; store in __intel_cpu_indicator
+        pop     rdi
+        pop     rsi
+        pop     r11
+        pop     r10
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        pop     rax
+        ret
+
+;__intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for Math Kernel Library (MKL),
+;     version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global mkl_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+mkltab  DD      0, 0, 0, 0             ; 0-3: generic version, 80386 instruction set
+        DD      0                      ; 4:      SSE2
+        DD      1                      ; 5:      SSE3
+        DD      2                      ; 6:      SSSE3
+        DD      2                      ; 7:      unused
+        DD      2                      ; 8:      SSE4.1
+        DD      2                      ; 9:      POPCNT
+        DD      3                      ; 10:     SSE4.2
+        DD      4                      ; 11:     AVX
+        DD      4                      ; 12:     PCLMUL, AES
+        DD      4                      ; 13:     AVX2
+        DD      5                      ; 14:     FMA3, BMI1/2, LZCNT
+;        DD      5                      ; 15:     AVX512F
+mkltablen equ ($ - mkltab) / 4         ; length of table
+
+SECTION .text
+
+mkl_serv_cpu_detect:
+        push    rcx                    ; Perhaps not needed
+        push    rdx
+        push    r8
+        push    r9
+%ifdef WINDOWS
+        push    rsi
+        push    rdi
+%endif
+        call    InstructionSet
+        cmp     eax, mkltablen
+        jb      M100
+        mov     eax, mkltablen - 1     ; limit to table length
+M100:   
+        lea     rdx, [rel mkltab]
+        mov     eax, [rdx + 4*rax]
+%ifdef WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        ret
+; end mkl_serv_cpu_detect 
+       
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for Vector Math Library (VML)
+;     version 14.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global mkl_vml_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+vmltab  DD      0, 0, 0, 0             ; 0-3: generic version, 80386 instruction set
+        DD      1                      ; 4:      SSE2
+        DD      2                      ; 5:      SSE3
+        DD      3                      ; 6:      SSSE3
+        DD      3                      ; 7:      unused
+        DD      4                      ; 8:      SSE4.1
+        DD      4                      ; 9:      POPCNT
+        DD      5                      ; 10:     SSE4.2
+        DD      6                      ; 11:     AVX
+        DD      6                      ; 12:     PCLMUL, AES
+        DD      6                      ; 13:     AVX2
+        DD      7                      ; 14:     FMA3, BMI1/2, LZCNT
+;        DD      7                      ; 15:     AVX512F
+vmltablen equ ($ - vmltab) / 4         ; length of table
+
+SECTION .text
+
+mkl_vml_serv_cpu_detect:
+        push    rcx                    ; Perhaps not needed
+        push    rdx
+        push    r8
+        push    r9
+%ifdef WINDOWS
+        push    rsi
+        push    rdi
+%endif
+        call    InstructionSet
+        cmp     eax, vmltablen
+        jb      V100
+        mov     eax, vmltablen - 1     ; limit to table length
+V100:   
+        lea     rdx, [rel vmltab]
+        mov     eax, [rdx + 4*rax]
+%ifdef WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        ret
+; end mkl_vml_serv_cpu_detect        
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;     Dispatcher for __intel_cpu_feature_indicator 
+;     version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if 0   ; Don't include this!
+
+; __intel_cpu_features_init and __intel_cpu_features_init_x are 
+; identical, except that the former checks the CPU brand, the
+; latter does not. Don't override this function. Instead, set
+; the indicator variables to 0 to force a re-evaluation,
+; and call __intel_cpu_features_init_x.
+; If you do want to override these functions then you must
+; save all registers.
+
+
+global __intel_cpu_features_init
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x  DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x:     DD 0, 0
+
+
+feattab DD  1                ; 0 default
+        DD  0BH              ; 1 MMX
+        DD  0FH              ; 2 conditional move and FCOMI supported
+        DD  3FH              ; 3 SSE
+        DD  7FH              ; 4 SSE2
+        DD  0FFH             ; 5 SSE3
+        DD  1FFH, 1FFH       ; 6-7 Supplementary SSE3
+        DD  3FFH             ; 8 SSE4.1
+        DD  0BFFH            ; 9 POPCNT 
+        DD  0FFFH            ; 10 SSE4.2 
+        DD  10FFFH           ; 11 AVX 
+        DD  16FFFH           ; 12 PCLMUL and AES 
+        DD  816FFFH          ; 13 AVX2 
+        DD  9DEFFFH          ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+        DD  0FDEFFFH         ; 15 HLE, RTM 
+
+feattablen equ ($ - feattab) / 4  ; length of table
+
+SECTION .text
+
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+        push    rcx 
+        push    rdx
+        push    r8
+        push    r9
+%ifdef WINDOWS
+        push    rsi
+        push    rdi
+%endif
+        call    InstructionSet
+        cmp     eax, feattablen
+        jb      F100
+        mov     eax, feattablen - 1    ; limit to table length
+F100:   
+        lea     rdx, [rel feattab]
+        mov     ebx, [rdx + 4*rax]     ; look up in table        
+        push    rbx
+        mov     eax, 1
+        cpuid
+        pop     rbx
+        bt      ecx, 22                ; MOVBE
+        jnc     F200
+        or      ebx, 1000H
+F200:   mov     [intel_cpu_feature_indicator@], rbx
+
+        ; get family and model
+        mov     edx, eax
+        and     eax, 0FH               ; stepping bit 0-3
+        mov     ecx, edx
+        shr     ecx, 4
+        and     ecx, 0FH               ; model
+        mov     ebx, edx
+        shr     ebx, 12
+        and     ebx, 0F0H              ; x model
+        or      ecx, ebx               ; full model
+        mov     ah,  cl                ; model bit 8 - 15
+        mov     ecx, edx
+        shr     ecx, 8
+        and     ecx, 0FH               ; family
+        mov     ebx, edx
+        shr     ebx, 20
+        and     ebx, 0FFH              ; x family
+        add     ecx, ebx               ; full family
+        shl     ecx, 16
+        or      eax, ecx               ; full family bit 16 - 23
+        mov     [intel_cpu_fms_indicator@], eax
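+        ; e.g. CPUID.1 EAX = 000306C3H packs to 00063C03H
+        ; (family 06H in bits 16-23, model 3CH in bits 8-15, stepping 3)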
+        
+%ifdef WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        ret
+; end __intel_cpu_features_init        
+
+%endif
diff --git a/asmlibSrc/divfixedi32.asm b/asmlibSrc/divfixedi32.asm
new file mode 100755
index 0000000..ebb85a7
--- /dev/null
+++ b/asmlibSrc/divfixedi32.asm
@@ -0,0 +1,152 @@
+;*************************  divfixedi32.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-22
+; Last modified:    2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed 
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n                    [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
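+;
+; Worked example (added illustration): d = 7, n = 32 gives L = 3,
+; m = 1 + 2^32 * (8-7) / 7 = 613566757, sh1 = 1, sh2 = 2.
+; For x = 100: t = (m*100) >> 32 = 14, and (((100-14) >> 1) + 14) >> 2 = 14.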
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n)              [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed 
+
+global _setdivisori32: function
+_setdivisori32:
+        push    ebx
+        mov     ebx, [esp+12]          ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0 (assuming bsr leaves the destination unchanged when the source is 0; this works on Intel, AMD, and VIA processors)
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        js      H120                   ; d < 0. Generate error
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     ebx, edx
+        je      H110                   ; avoid division overflow when d = 1
+        shl     edx, cl
+        div     ebx
+H110:   inc     eax
+        mov     ebx, [esp+8]           ; buffer
+        mov     [ebx], eax             ; multiplier
+        mov     [ebx+4], ecx           ; shift count
+        pop     ebx
+        ret
+        
+H120:   ; d <= 0 not supported. Generate error
+        mov     edx, 1
+        div     edx
+        ud2
+
+        
+; extern "C" int dividefixedi32(int buffer[2], int x);
+global _dividefixedi32: function
+_dividefixedi32:
+        push    ebx
+        mov     eax, [esp+12]          ; x
+        mov     ecx, [esp+8]           ; buffer
+        mov     ebx, eax
+        imul    dword [ecx]            ; m
+        lea     eax, [edx+ebx]
+        mov     ecx, [ecx+4]           ; shift count
+        sar     eax, cl
+        sar     ebx, 31                ; sign(x)
+        sub     eax, ebx
+        pop     ebx
+        ret
+
+
+;extern "C" void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; 32 bit unsigned 
+
+global _setdivisoru32: function
+_setdivisoru32:
+        push    ebx
+        mov     ebx, [esp+12]          ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     edx, cl                ; 2^L
+        cmp     cl, 20h
+        adc     edx, -1                ; fix cl overflow, must give edx = 0
+        sub     edx, ebx
+        xor     eax, eax
+        div     ebx
+        inc     eax
+        mov     ebx, [esp+8]           ; buffer
+        mov     [ebx], eax             ; multiplier
+        sub     ecx, 1                 ; L-1 (carry set if L = 0)
+        setae   dl                     ; sh1 = min(L,1)
+        movzx   edx, dl                ; shift1
+        seta    al                     ; al = 1 if L > 1
+        neg     al                     ; al = 0FFH if L > 1, else 0
+        and     al,cl                  ; sh2 = max(L-1,0)
+        movzx   eax, al                ; shift 2
+        shl     eax, 8
+        or      eax, edx
+        mov     [ebx+4], eax           ; shift 1 and shift 2
+        pop     ebx
+        ret
+        
+;extern "C" uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+global _dividefixedu32: function       ; unsigned
+_dividefixedu32:
+        mov     eax, [esp+8]           ; x
+        mov     ecx, [esp+4]           ; buffer
+        mul     dword [ecx]            ; m
+        mov     eax, [esp+8]           ; x
+        sub     eax, edx               ; x-t
+        mov     ecx, [ecx+4]           ; shift 1 and shift 2
+        shr     eax, cl
+        add     eax, edx
+        shr     ecx, 8
+        shr     eax, cl
+        ret
diff --git a/asmlibSrc/divfixedi64.asm b/asmlibSrc/divfixedi64.asm
new file mode 100755
index 0000000..4e52d31
--- /dev/null
+++ b/asmlibSrc/divfixedi64.asm
@@ -0,0 +1,171 @@
+;*************************  divfixedi64.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-22
+; Last modified:    2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed 
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n                    [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n)              [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
+; x/d = q
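+;
+; Worked example (added illustration): d = 7, n = 32 gives L = 3,
+; m = 1 + 2^34/7 - 2^32 = -1840700269 (as signed int32), sh1 = 2.
+; For x = -100: q = -100 + ((m*-100) >> 32) = -100 + 42 = -58, and
+; (-58 >> 2) - (-1) = -15 + 1 = -14, the quotient truncated toward zero.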
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+%IFDEF  WINDOWS
+%define par1   rcx                     ; function parameter 1
+%define par2   edx                     ; function parameter 2
+%define buf    r9                      ; copy of function parameter 1: buffer
+%define rx     r8
+%define rxd    r8d                     ; d or x
+%ELSE   ; UNIX
+%define par1   rdi                     ; function parameter 1
+%define par2   esi                     ; function parameter 2
+%define buf    rdi                     ; function parameter 1: buffer
+%define rx     rsi
+%define rxd    esi                     ; d or x
+%ENDIF
+
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed 
+
+global setdivisori32: function
+setdivisori32:
+%IFDEF  WINDOWS
+        mov     rxd, edx               ; d
+        mov     buf, rcx               ; buffer
+%ENDIF        
+        dec     rxd                    ; rxd = r8d or esi
+        mov     ecx, -1                ; value for bsr if rxd = 0 (assuming bsr leaves the destination unchanged when the source is 0; this works on Intel, AMD, and VIA processors)
+        bsr     ecx, rxd               ; floor(log2(d-1))
+        inc     rxd
+        js      H120                   ; d < 0. Generate error
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     rxd, edx
+        je      H110                   ; avoid overflow when d = 1
+        shl     edx, cl
+        div     rxd
+H110:   inc     eax
+        mov     [buf], eax             ; multiplier
+        mov     [buf+4], ecx           ; shift count
+        ret
+        
+H120:   ; d <= 0 not supported. Generate error
+        mov     edx, 1
+        div     edx                    ; will overflow
+        ud2
+
+        
+; extern "C" int dividefixedi32(int buffer[2], int x);
+global dividefixedi32: function
+dividefixedi32:
+%IFDEF  WINDOWS
+        mov     eax, edx
+        mov     rxd, edx               ; x
+        mov     buf, rcx               ; buffer
+%ELSE
+        mov     eax, esi
+%ENDIF        
+        imul    dword [buf]            ; m
+        lea     eax, [rdx+rx]          ; rx = r8 or rsi
+        mov     ecx, [buf+4]           ; shift count
+        sar     eax, cl
+        sar     rxd, 31                ; sign(x)
+        sub     eax, rxd
+        ret
+
+
+;extern "C" void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; 32 bit unsigned 
+
+global setdivisoru32: function
+setdivisoru32:
+%IFDEF  WINDOWS
+        mov     rxd, edx               ; d
+        mov     buf, rcx               ; buffer
+%ENDIF        
+        dec     rxd                    ; rxd = r8d or esi
+        mov     ecx, -1                ; value for bsr if r8d = 0
+        bsr     ecx, rxd               ; floor(log2(d-1))
+        inc     rxd
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     rdx, cl                ; 2^L (64 bit shift because cl may be 32)
+        sub     edx, rxd
+        xor     eax, eax
+        div     rxd
+        inc     eax
+        mov     [buf], eax             ; multiplier
+        sub     ecx, 1
+        setae   dl
+        movzx   edx, dl                ; shift1
+        seta    al
+        neg     al
+        and     al,cl
+        movzx   eax, al                ; shift 2
+        shl     eax, 8
+        or      eax, edx
+        mov     [buf+4], eax           ; shift 1 and shift 2
+        ret
+        
+;extern "C" uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+global dividefixedu32: function       ; unsigned
+dividefixedu32:
+%IFDEF  WINDOWS
+        mov     eax, edx
+        mov     rxd, edx               ; x
+        mov     buf, rcx               ; buffer
+%ELSE
+        mov     eax, esi
+%ENDIF        
+        mul     dword [buf]            ; m
+        sub     rxd, edx               ; x-t
+        mov     ecx, [buf+4]           ; shift 1 and shift 2
+        shr     rxd, cl
+        lea     eax, [rx+rdx]
+        shr     ecx, 8
+        shr     eax, cl
+        ret
diff --git a/asmlibSrc/divfixedv32.asm b/asmlibSrc/divfixedv32.asm
new file mode 100755
index 0000000..c3c6294
--- /dev/null
+++ b/asmlibSrc/divfixedv32.asm
@@ -0,0 +1,490 @@
+;*************************  divfixedv32.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-25
+; Last modified:    2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed 
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the 
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers. 
+;
+; The divisor must be positive. A zero divisor generates a divide-by-zero
+; error. A negative divisor generates a division overflow error. To divide
+; by a negative divisor, change the sign of the divisor and of the result.
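+;
+; Usage sketch in C (hedged illustration; only the prototypes listed above
+; are from this file, the variable names are made up):
+;   __m128i buf[2];
+;   setdivisorV4i32(buf, 7);               /* precompute for divisor 7  */
+;   __m128i q = dividefixedV4i32(buf, x);  /* four x[i]/7 in one call   */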
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n                    [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n)              [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text  align = 16
+
+;******************************************************************************
+;                    16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global _setdivisor8s: function
+_setdivisor8s:
+        push    ebx
+        movsx   ebx, word [esp+8]      ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        js      H120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     ebx, edx
+        je      H110                   ; avoid division overflow when d = 1
+        shl     edx, cl
+        div     bx                     ; 2^(16+L-1)/d
+H110:   inc     eax
+        movd    xmm0, eax
+        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
+        movd    xmm1, ecx              ; shift count
+        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
+        pop     ebx
+        ret
+        
+H120:   ; d < 0 not supported. Generate error
+        mov     edx, 1
+        div     edx
+        ud2
+; _setdivisor8s end
+
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global _setdivisorV8i16: function
+_setdivisorV8i16:
+        mov     eax, dword [esp+8]     ; d
+        push    eax
+        call    _setdivisor8s
+        pop     ecx
+        mov     eax, dword [esp+4]     ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [eax], xmm0            ; multiplier
+        movdqa  [eax+16], xmm1         ; shift count is still in xmm1
+        ret
+; _setdivisorV8i16 end
+
+        
+; extern "C" __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+global _dividefixedV8i16: function
+
+align 16
+_dividefixedV8i16:
+        mov     ecx, [esp+4]           ; buffer
+        movdqa  xmm1, xmm0             ; x
+        pmulhw  xmm0, [ecx]            ; multiply high signed words
+        paddw   xmm0, xmm1
+        movd    xmm2, [ecx+16]         ; shift count
+        psraw   xmm0, xmm2             ; shift right arithmetic
+        psraw   xmm1, 15               ; sign of x
+        psubw   xmm0, xmm1
+        ret
+;_dividefixedV8i16 end
+
+
+
+;******************************************************************************
+;                    16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+
+global _setdivisor8us: function
+_setdivisor8us:
+        push    ebx
+        movzx   ebx, word [esp+8]      ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     edx, cl                ; 2^L  [32-bit shift to allow overflow]
+        sub     edx, ebx
+        xor     eax, eax
+        div     bx
+        inc     eax
+        movd    xmm0, eax
+        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
+        sub     ecx, 1
+        setae   dl
+        movzx   edx, dl                ; shift1
+        seta    al
+        neg     al
+        and     al,cl
+        movzx   eax, al                ; shift 2
+        movd    xmm1, edx              ; shift 1
+        movd    xmm2, eax              ; shift 2
+        punpckldq  xmm1, xmm2          ; combine into two dwords
+        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
+        pop     ebx
+        ret
+; _setdivisor8us
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned 
+
+global _setdivisorV8u16: function
+_setdivisorV8u16:
+        mov     eax, dword [esp+8]    ; d
+        push    eax
+        call    _setdivisor8us
+        pop     ecx
+        mov     eax, dword [esp+4]     ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [eax], xmm0            ; multiplier
+        movdqa  [eax+16], xmm1         ; shift counts are still in xmm1
+        ret
+; _setdivisorV8u16 end
+
+        
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+global _dividefixedV8u16: function
+
+align 16
+_dividefixedV8u16:
+        mov     ecx, [esp+4]           ; buffer
+        movdqa  xmm1, xmm0             ; x
+        pmulhuw xmm0, [ecx]            ; multiply high unsigned words
+        psubw   xmm1, xmm0
+        movd    xmm2, [ecx+16]         ; shift1
+        psrlw   xmm1, xmm2
+        paddw   xmm0, xmm1
+        movd    xmm2, [ecx+20]         ; shift2
+        psrlw   xmm0, xmm2
+        ret
+; _dividefixedV8u16 end
+
+
+
+;******************************************************************************
+;                    32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+
+align 16
+global _setdivisor4i: function
+_setdivisor4i:
+        push    ebx
+        mov     ebx, [esp+8]           ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        js      K120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     ebx, edx
+        je      K110                   ; avoid division overflow when d = 1
+        shl     edx, cl
+        div     ebx                    ; 2^(32+L-1)/d
+K110:   inc     eax
+        movd    xmm0, eax              ; multiplier
+        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
+        movd    xmm1, ecx              ; shift count
+        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
+        pop     ebx
+        ret
+        
+K120:   ; d < 0 not supported. Generate error
+        mov     edx, 1
+        div     edx
+        ud2
+; _setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+
+global _setdivisorV4i32: function
+_setdivisorV4i32:
+        mov     eax, dword [esp+8]     ; d
+        push    eax
+        call    _setdivisor4i
+        pop     ecx
+        mov     eax, dword [esp+4]     ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [eax], xmm0            ; multiplier
+        movdqa  [eax+16], xmm1         ; shift counts are still in xmm1
+        ret
+; _setdivisorV4i32 end
+
+        
+; extern "C" __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+global _dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global _dividefixedV4i32SSE2:  function
+global _dividefixedV4i32SSE41: function
+
+align 8
+_dividefixedV4i32: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+%ELSE   ; Position-independent code
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+; Make the following instruction with address relative to RP1:
+        jmp     near [edx+dividefixedV4i32Dispatch-RP1]
+%ENDIF
+
+align 16
+_dividefixedV4i32SSE41: 
+        mov     ecx, [esp+4]           ; buffer
+        movdqa  xmm1, xmm0             ; x
+        movdqa  xmm2, xmm0             ; x        
+        movdqa  xmm3, [ecx]            ; multiplier
+        pmuldq  xmm0, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+        psrlq   xmm0, 32               ; high dword of result 0 and 2
+        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
+        pmuldq  xmm1, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+        pcmpeqd xmm3, xmm3
+        psllq   xmm3, 32               ; generate mask of dword 1 and 3
+        pand    xmm1, xmm3             ; high dword of result 1 and 3
+        por     xmm0, xmm1             ; combine all four results into one vector
+        paddd   xmm0, xmm2
+        movd    xmm3, [ecx+16]         ; shift count
+        psrad   xmm0, xmm3             ; shift right arithmetic
+        psrad   xmm2, 31               ; sign of x
+        psubd   xmm0, xmm2
+        ret
+;_dividefixedV4i32SSE41 end
+
+
+_dividefixedV4i32SSE2:
+; I have tried to change the sign and use pmuludq, but that gives a rounding error (9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway, despite the store forwarding stall.
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 16
+        and     esp, -16               ; make aligned stack space
+        movdqa  [esp], xmm0            ; store x
+        movdqa  xmm2, xmm0             ; x        
+        mov     ecx, [ebp+8]           ; buffer
+        mov     ecx, [ecx]             ; multiplier
+        ; do four signed high multiplications
+        mov     eax, [esp]
+        imul    ecx
+        mov     [esp], edx
+        mov     eax, [esp+4]
+        imul    ecx
+        mov     [esp+4], edx
+        mov     eax, [esp+8]
+        imul    ecx
+        mov     [esp+8], edx
+        mov     eax, [esp+12]
+        imul    ecx
+        mov     [esp+12], edx
+        movdqa  xmm0, [esp]            ; x*m vector
+        mov     ecx, [ebp+8]           ; buffer
+        paddd   xmm0, xmm2
+        movd    xmm3, [ecx+16]         ; shift count
+        psrad   xmm0, xmm3             ; shift right arithmetic
+        psrad   xmm2, 31               ; sign of x
+        psubd   xmm0, xmm2
+        mov     esp, ebp
+        pop     ebp        
+        ret
+;_dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for _dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+dividefixedV4i32CPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version
+        mov     ecx, _dividefixedV4i32SSE2
+        cmp     eax, 8                ; check if PMULDQ supported
+        jb      Q100
+        ; SSE4.1 supported
+        ; Point to SSE4.1 version of dividefixedV4i32
+        mov     ecx, _dividefixedV4i32SSE41
+Q100:   mov     [dividefixedV4i32Dispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP10:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_dividefixedV4i32SSE2-RP10]
+        cmp     eax, 8                ; check if PMULDQ supported
+        jb      Q100
+        ; SSE4.1 supported
+        ; Point to SSE4.1 version of dividefixedV4i32
+        lea     ecx, [edx+_dividefixedV4i32SSE41-RP10]
+Q100:   mov     [edx+dividefixedV4i32Dispatch-RP10], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+        
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret        
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch DD dividefixedV4i32CPUDispatch
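+; The same first-call-patching pattern in C (hedged sketch; the names here
+; are illustrative, not from this file):
+;   static __m128i (*dispatch)(const __m128i*, __m128i) = detect_and_patch;
+;   /* detect_and_patch stores the SSE2 or SSE4.1 routine into "dispatch" */
+;   /* and tail-calls it, so every later call goes straight through       */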
+
+section .text
+
+
+
+;******************************************************************************
+;                    32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+align 16
+global _setdivisor4ui: function
+_setdivisor4ui:
+        push    ebx
+        mov     ebx, [esp+8]           ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     edx, cl                ; 2^L
+        cmp     cl, 20h
+        adc     edx, -1                ; fix cl overflow, must give edx = 0
+        sub     edx, ebx
+        xor     eax, eax
+        div     ebx
+        inc     eax
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
+        sub     ecx, 1
+        setae   dl
+        movzx   edx, dl                ; shift1
+        seta    al
+        neg     al
+        and     al,cl
+        movzx   eax, al                ; shift 2
+        movd    xmm1, edx              ; shift 1
+        movd    xmm2, eax              ; shift 2
+        punpckldq  xmm1, xmm2          ; combine into two dwords
+        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
+        pop     ebx
+        ret
+; _setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned 
+
+global _setdivisorV4u32: function
+_setdivisorV4u32:
+        mov     eax, dword [esp+8]     ; d
+        push    eax
+        call    _setdivisor4ui
+        pop     ecx
+        mov     eax, dword [esp+4]     ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [eax], xmm0            ; multiplier
+        movdqa  [eax+16], xmm1         ; shift counts are still in xmm1
+        ret
+; _setdivisorV4u32 end
+        
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global _dividefixedV4u32: function
+
+align 16
+_dividefixedV4u32:
+        mov     ecx, [esp+4]           ; buffer
+        movdqa  xmm1, xmm0             ; x
+        movdqa  xmm2, xmm0             ; x
+        movdqa  xmm3, [ecx]            ; multiplier
+        pmuludq xmm0, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+        psrlq   xmm0, 32               ; high dword of result 0 and 2
+        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
+        pmuludq xmm1, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+        pcmpeqd xmm3, xmm3
+        psllq   xmm3, 32               ; generate mask of dword 1 and 3
+        pand    xmm1, xmm3             ; high dword of result 1 and 3
+        por     xmm0, xmm1             ; combine all four results into one vector
+        psubd   xmm2, xmm0
+        movd    xmm3, [ecx+16]         ; shift1
+        psrld   xmm2, xmm3
+        paddd   xmm0, xmm2
+        movd    xmm3, [ecx+20]         ; shift2
+        psrld   xmm0, xmm3
+        ret
+;_dividefixedV4u32 end
diff --git a/asmlibSrc/divfixedv64.asm b/asmlibSrc/divfixedv64.asm
new file mode 100755
index 0000000..145b125
--- /dev/null
+++ b/asmlibSrc/divfixedv64.asm
@@ -0,0 +1,496 @@
+;*************************  divfixedv64.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-25
+; Last modified:    2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed 
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the 
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers. 
+;
+; The divisor must be positive. A zero divisor generates a divide-by-zero
+; error. A negative divisor generates a division overflow error. To divide
+; by a negative divisor, change the sign of the divisor and of the result.
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n                    [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n)              [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%IFDEF  WINDOWS
+%define par1   rcx                     ; function parameter 1
+%define par1d  ecx
+%define par1w   cx
+%define par2   rdx                     ; function parameter 2
+%define par2d  edx
+%define par2w  dx 
+%define buf    r8                      ; pointer to buffer
+%ENDIF
+%IFDEF  UNIX
+%define par1   rdi                     ; function parameter 1
+%define par1d  edi
+%define par1w  di 
+%define par2   rsi                     ; function parameter 2
+%define par2d  esi
+%define par2w  si
+%define buf    rdi                     ; pointer to buffer
+%ENDIF
+
+
+; Imported from instrset64.asm:
+extern InstructionSet                  ; Instruction set for CPU dispatcher
+
+section .text  align = 16
+
+;******************************************************************************
+;                    16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisor8s: function
+setdivisor8s:
+        push    rbx
+        movsx   ebx, par1w             ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        js      H120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     ebx, edx
+        je      H110                   ; avoid division overflow when d = 1
+        shl     edx, cl
+        div     bx                     ; 2^(16+L-1)/d
+H110:   inc     eax
+        movd    xmm0, eax              ; multiplier
+        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
+        movd    xmm1, ecx              ; shift count
+        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
+        pop     rbx
+        ret
+H120:   ; d < 0 not supported. Generate error
+        mov     edx, 1
+        div     edx
+        ud2
+; setdivisor8s end        
+
+        
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisorV8i16: function
+setdivisorV8i16:
+        push    par1                   ; buf
+        mov     par1d, par2d           ; d
+        call    setdivisor8s
+        pop     rax                    ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [rax], xmm0            ; multiplier
+        movdqa  [rax+16], xmm1         ; shift count is still in xmm1
+        ret
+; setdivisorV8i16 end
+
+
+; extern "C" __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+global dividefixedV8i16: function
+
+dividefixedV8i16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF  WINDOWS
+        movdqa  xmm0, [par2]           ; x
+%ENDIF
+        movdqa  xmm1, xmm0             ; x
+        pmulhw  xmm0, [par1]           ; multiply high signed words
+        paddw   xmm0, xmm1
+        movd    xmm2, [par1+16]        ; shift count
+        psraw   xmm0, xmm2             ; shift right arithmetic
+        psraw   xmm1, 15               ; sign of x
+        psubw   xmm0, xmm1
+        ret
+;dividefixedV8i16 end
+
+
+
+;******************************************************************************
+;                    16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+
+align 16
+global setdivisor8us: function
+setdivisor8us:
+        push    rbx
+        movzx   ebx, par1w             ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     edx, cl                ; 2^L  [32-bit shift to allow overflow]
+        sub     edx, ebx
+        xor     eax, eax
+        div     bx
+        inc     eax
+        movd    xmm0, eax
+        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
+        sub     ecx, 1
+        setae   dl
+        movzx   edx, dl                ; shift 1
+        seta    al
+        neg     al
+        and     al,cl
+        movzx   eax, al                ; shift 2
+        movd    xmm1, edx              ; shift 1
+        movd    xmm2, eax              ; shift 2
+        punpckldq  xmm1, xmm2          ; combine into two dwords
+        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
+        pop     rbx
+        ret
+; setdivisor8us end
+
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned 
+
+global setdivisorV8u16: function
+setdivisorV8u16:
+        push    par1                   ; buf
+        mov     par1d, par2d           ; d
+        call    setdivisor8us
+        pop     rax                    ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [rax], xmm0            ; multiplier
+        movdqa  [rax+16], xmm1         ; shift counts are still in xmm1
+        ret
+; setdivisorV8u16 end
+
+        
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+global dividefixedV8u16: function
+
+align 16
+dividefixedV8u16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF  WINDOWS
+        movdqa  xmm0, [par2]           ; x
+%ENDIF
+        movdqa  xmm1, xmm0             ; x
+        pmulhuw xmm0, [par1]           ; multiply high unsigned words
+        psubw   xmm1, xmm0
+        movd    xmm2, [par1+16]        ; shift1
+        psrlw   xmm1, xmm2
+        paddw   xmm0, xmm1
+        movd    xmm2, [par1+20]        ; shift2
+        psrlw   xmm0, xmm2
+        ret
+;dividefixedV8u16 end
+
+
+
+;******************************************************************************
+;                    32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+
+align 16
+global setdivisor4i: function
+setdivisor4i:
+        push    rbx
+        mov     ebx, par1d             ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        js      K120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+        inc     ecx                    ; L = ceil(log2(d))        
+        sub     ecx, 1                 ; shift count = L - 1
+        adc     ecx, 0                 ; avoid negative shift count
+        xor     eax, eax
+        mov     edx, 1
+        cmp     ebx, edx
+        je      K110                   ; avoid division overflow when d = 1
+        shl     edx, cl
+        div     ebx                    ; 2^(32+L-1)/d
+K110:   inc     eax
+        movd    xmm0, eax              ; multiplier
+        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
+        movd    xmm1, ecx              ; shift count
+        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
+        pop     rbx
+        ret
+        
+K120:   ; d < 0 not supported. Generate error
+        mov     edx, 1
+        div     edx
+        ud2
+; setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+
+global setdivisorV4i32: function
+setdivisorV4i32:
+        push    par1                   ; buf
+        mov     par1d, par2d           ; d
+        call    setdivisor4i
+        pop     rax                    ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [rax], xmm0            ; multiplier
+        movdqa  [rax+16], xmm1         ; shift count is still in xmm1
+        ret
+; setdivisorV4i32 end
+
+        
+; extern "C" __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+global dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global dividefixedV4i32SSE2:  function
+global dividefixedV4i32SSE41: function
+
+align 8
+dividefixedV4i32: ; function dispatching
+        jmp     near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+dividefixedV4i32SSE41: 
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF  WINDOWS
+        movdqa  xmm0,[par2]            ; x
+%ENDIF
+        movdqa  xmm1, xmm0             ; x
+        movdqa  xmm2, xmm0             ; x        
+        movdqa  xmm3, [par1]           ; multiplier
+        pmuldq  xmm0, xmm3             ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
+        psrlq   xmm0, 32               ; high dword of result 0 and 2
+        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
+        pmuldq  xmm1, xmm3             ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
+        pcmpeqd xmm3, xmm3
+        psllq   xmm3, 32               ; generate mask of dword 1 and 3
+        pand    xmm1, xmm3             ; high dword of result 1 and 3
+        por     xmm0, xmm1             ; combine all four results into one vector
+        paddd   xmm0, xmm2
+        movd    xmm3, [par1+16]        ; shift count
+        psrad   xmm0, xmm3             ; shift right arithmetic
+        psrad   xmm2, 31               ; sign of x
+        psubd   xmm0, xmm2
+        ret
+;dividefixedV4i32SSE41 end
+
+dividefixedV4i32SSE2:
+; I have tried to change the sign and use pmuludq, but that gives a rounding error (9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway, despite the store forwarding stall.
+        push    rbp
+        mov     rbp, rsp
+%IFDEF  WINDOWS
+        movdqa  xmm0,[par2]            ; x
+        mov     buf, par1
+%ENDIF
+        sub     rsp, 16                ; allocate stack space
+        and     rsp, -16               ; stack should be aligned already. align anyway to be safe
+        movdqa  [rsp], xmm0            ; store x
+        movdqa  xmm2, xmm0             ; x        
+        mov     ecx, [buf]             ; multiplier
+        ; do four signed high multiplications
+        mov     eax, [rsp]
+        imul    ecx
+        mov     [rsp], edx
+        mov     eax, [rsp+4]
+        imul    ecx
+        mov     [rsp+4], edx
+        mov     eax, [rsp+8]
+        imul    ecx
+        mov     [rsp+8], edx
+        mov     eax, [rsp+12]
+        imul    ecx
+        mov     [rsp+12], edx
+        movdqa  xmm0, [rsp]            ; x*m vector
+        paddd   xmm0, xmm2
+        movd    xmm3, [buf+16]         ; shift count
+        psrad   xmm0, xmm3             ; shift right arithmetic
+        psrad   xmm2, 31               ; sign of x
+        psubd   xmm0, xmm2
+        mov     rsp, rbp
+        pop     rbp        
+        ret
+;dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+dividefixedV4i32CPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version
+        lea     r8, [dividefixedV4i32SSE2]
+        cmp     eax, 8                ; check if PMULDQ supported
+        jb      Q100
+        ; SSE4.1 supported
+        ; Point to SSE4.1 version of dividefixedV4i32
+        lea     r8, [dividefixedV4i32SSE41]
+Q100:   mov     [dividefixedV4i32Dispatch], r8
+        ; Continue in appropriate version 
+        jmp     r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch DQ dividefixedV4i32CPUDispatch
+
+section .text
+
+
+;******************************************************************************
+;                    32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+align 16
+global setdivisor4ui: function
+setdivisor4ui:
+        push    rbx
+        mov     ebx, par1d             ; d
+        dec     ebx
+        mov     ecx, -1                ; value for bsr if ebx = 0
+        bsr     ecx, ebx               ; floor(log2(d-1))
+        inc     ebx
+        inc     ecx                    ; L = ceil(log2(d))
+        mov     edx, 1
+        shl     rdx, cl                ; 2^L     [64 bit shift to allow overflow]
+        sub     edx, ebx
+        xor     eax, eax
+        div     ebx
+        inc     eax
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
+        sub     ecx, 1
+        setae   dl
+        movzx   edx, dl                ; shift1
+        seta    al
+        neg     al
+        and     al,cl
+        movzx   eax, al                ; shift 2
+        movd    xmm1, edx              ; shift 1
+        movd    xmm2, eax              ; shift 2
+        punpckldq  xmm1, xmm2          ; combine into two dwords
+        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
+        pop     rbx
+        ret
+; setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned 
+
+global setdivisorV4u32: function
+setdivisorV4u32:
+        push    par1                   ; buf
+        mov     par1d, par2d           ; d
+        call    setdivisor4ui
+        pop     rax                    ; buf
+        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
+        movdqa  [rax], xmm0            ; multiplier
+        movdqa  [rax+16], xmm1         ; shift counts are still in xmm1
+        ret
+; setdivisorV4u32 end
+        
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global dividefixedV4u32: function
+
+align 16
+dividefixedV4u32:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF  WINDOWS
+        movdqa  xmm0,[par2]            ; x
+%ENDIF
+        movdqa  xmm1, xmm0             ; x
+        movdqa  xmm2, xmm0             ; x
+        movdqa  xmm3, [par1]           ; multiplier
+        pmuludq xmm0, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+        psrlq   xmm0, 32               ; high dword of result 0 and 2
+        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
+        pmuludq xmm1, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+        pcmpeqd xmm3, xmm3
+        psllq   xmm3, 32               ; generate mask of dword 1 and 3
+        pand    xmm1, xmm3             ; high dword of result 1 and 3
+        por     xmm0, xmm1             ; combine all four results into one vector
+        psubd   xmm2, xmm0
+        movd    xmm3, [par1+16]        ; shift1
+        psrld   xmm2, xmm3
+        paddd   xmm0, xmm2
+        movd    xmm3, [par1+20]        ; shift2
+        psrld   xmm0, xmm3
+        ret
+;dividefixedV4u32 end
diff --git a/asmlibSrc/instrset32.asm b/asmlibSrc/instrset32.asm
new file mode 100755
index 0000000..994f725
--- /dev/null
+++ b/asmlibSrc/instrset32.asm
@@ -0,0 +1,244 @@
+;*************************  instrset32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2003-12-12
+; Last modified:    2014-07-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the 
+; ability of the operating system to catch invalid opcode exceptions. The
+; method used here has been thoroughly tested on many different versions of
+; Intel and AMD microprocessors, and is believed to work reliably. For further
+; discussion of this method, see my manual "Optimizing subroutines in assembly
+; language" (www.agner.org/optimize/).
+; 
+; Copyright (c) 2003-2014 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+;  0 =  80386 instruction set only
+;  1 or above = MMX instructions supported
+;  2 or above = conditional move and FCOMI supported
+;  3 or above = SSE (XMM) supported by processor and operating system
+;  4 or above = SSE2 supported
+;  5 or above = SSE3 supported
+;  6 or above = Supplementary SSE3
+;  8 or above = SSE4.1 supported
+;  9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512F supported
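+;
+; Dispatch sketch (hedged illustration of using the values above; the
+; kernel function names are hypothetical):
+;   int level = InstructionSet();
+;   if      (level >= 11) use_avx_kernel();   /* AVX by CPU and OS */
+;   else if (level >=  4) use_sse2_kernel();
+;   else                  use_generic_kernel();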
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _InstructionSet: function
+global _IInstrSet
+
+
+SECTION .data
+align 16
+_IInstrSet:
+_IInstrSet@: dd    -1                  ; local name
+
+SECTION .text  align=16
+
+%IFDEF POSITIONINDEPENDENT
+
+; Local function for reading instruction pointer into edx
+GetThunkEDX:
+        mov     edx, [esp]
+        ret
+
+%ENDIF  ; POSITIONINDEPENDENT
+
+
+_InstructionSet:
+        
+%IFDEF POSITIONINDEPENDENT
+        ; Position-independent code for ELF and Mach-O shared objects:
+        call    GetThunkEDX
+        add     edx, _IInstrSet@ - $
+        mov     eax, [edx]
+%ELSE
+        mov     eax, [_IInstrSet@]
+%ENDIF        
+        ; Check if this function has been called before
+        test    eax, eax
+        js      FirstTime              ; Negative means first time
+        ret                            ; Early return. Has been called before
+
+FirstTime:                             ; Function has not been called before
+        push    ebx
+
+%IFNDEF POSITIONINDEPENDENT
+        mov     edx, _IInstrSet@       ; make edx point to _IInstrSet
+%ENDIF
+        push    edx                    ; save address of _IInstrSet
+        
+        ; detect if CPUID instruction supported by microprocessor:
+        pushfd
+        pop     eax
+        btc     eax, 21                ; check if CPUID bit can toggle
+        push    eax
+        popfd
+        pushfd
+        pop     ebx
+        xor     ebx, eax
+        xor     eax, eax               ; 0
+        bt      ebx, 21
+        jc      ISEND                  ; CPUID not supported
+        
+        cpuid                          ; get number of CPUID functions
+        test    eax, eax
+        jz      ISEND                  ; function 1 not supported
+        mov     eax, 1
+        cpuid                          ; get features
+        xor     eax, eax               ; 0
+        
+        test    edx, 1                 ; floating point support
+        jz      ISEND
+        bt      edx, 23                ; MMX support        
+        jnc     ISEND
+        inc     eax                    ; 1
+        
+        bt      edx, 15                ; conditional move support
+        jnc     ISEND
+        inc     eax                    ; 2
+
+        ; check OS support for XMM registers (SSE)
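+        ; Method (summary of the code below): flip a test pattern in the
+        ; XMM6 image of an FXSAVE buffer, load it with FXRSTOR, then save
+        ; again with FXSAVE; the pattern reads back only if the XMM state
+        ; is actually operational, i.e. enabled by the operating system.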
+        bt      edx, 24                ; FXSAVE support by microprocessor
+        jnc     ISEND
+        push    ecx
+        push    edx
+        mov     ebx, esp               ; save stack pointer
+        sub     esp, 200H              ; allocate space for FXSAVE
+        and     esp, -10H              ; align by 16
+TESTDATA EQU 0D95A34BEH                ; random test value
+TESTPS   EQU 10CH                      ; position to write TESTDATA = upper part of XMM6 image
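+        ; OS-support probe: flip a word in the XMM6 image of the FXSAVE
+        ; buffer and reload it with FXRSTOR. If the OS has enabled FXSAVE/
+        ; FXRSTOR of XMM state (CR4.OSFXSR), the flipped value lands in XMM6
+        ; and reappears after the second FXSAVE; otherwise the XMM image is
+        ; ignored and the difference computed below is zero, not TESTDATA.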
+        fxsave  [esp]                  ; save FP/MMX and XMM registers
+        mov     ecx,[esp+TESTPS]       ; read part of XMM6 register
+        xor     DWORD [esp+TESTPS],TESTDATA  ; change value
+        fxrstor [esp]                  ; load changed value into XMM6
+        mov     [esp+TESTPS],ecx       ; restore old value in buffer
+        fxsave  [esp]                  ; save again
+        mov     edx,[esp+TESTPS]       ; read changed XMM6 register
+        mov     [esp+TESTPS],ecx       ; restore old value
+        fxrstor [esp]                  ; load old value into XMM6
+        xor     ecx, edx               ; get difference between old and new value
+        mov     esp, ebx               ; restore stack pointer
+        cmp     ecx, TESTDATA          ; test if XMM6 was changed correctly
+        pop     edx
+        pop     ecx
+        jne     ISEND
+        
+        bt      edx, 25                ; SSE support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 3
+        
+        bt      edx, 26                ; SSE2 support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 4
+        
+        test    ecx, 1                 ; SSE3 support by microprocessor
+        jz      ISEND
+        inc     eax                    ; 5
+        
+        bt      ecx, 9                 ; Suppl-SSE3 support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 6
+        
+        bt      ecx, 19                ; SSE4.1 support by microprocessor
+        jnc     ISEND
+        mov     al, 8                  ; 8
+        
+        bt      ecx, 23                ; POPCNT support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 9
+        
+        bt      ecx, 20                ; SSE4.2 support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 10
+
+        ; check OS support for YMM registers (AVX)
+        bt      ecx, 27                ; OSXSAVE: XGETBV supported
+        jnc     ISEND
+        pushad
+        xor     ecx, ecx
+        db      0FH, 01H, 0D0H         ; XGETBV (raw opcode bytes, for assemblers lacking the mnemonic)
+        and     eax, 6
+        cmp     eax, 6                 ; AVX support by OS
+        popad
+        jne     ISEND
+        
+        bt      ecx, 28                ; AVX support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 11
+        
+        bt      ecx, 1                 ; PCLMUL support
+        jnc     ISEND
+        bt      ecx, 25                ; AES support
+        jnc     ISEND
+        inc     eax                    ; 12
+        
+        push    eax
+        push    ecx
+        mov     eax, 7
+        xor     ecx, ecx
+        cpuid                          ; check for AVX2
+        bt      ebx, 5
+        pop     ecx
+        pop     eax
+        jnc     ISEND
+        inc     eax                    ; 13
+        
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+        bt      ecx, 12                ; FMA3
+        jnc     ISEND
+        bt      ecx, 29                ; F16C
+        jnc     ISEND
+        bt      ebx, 3                 ; BMI1
+        jnc     ISEND
+        bt      ebx, 8                 ; BMI2
+        jnc     ISEND
+                
+        push    eax
+        push    ebx
+        push    ecx
+        mov     eax, 80000001H
+        cpuid
+        bt      ecx, 5                 ; LZCNT
+        pop     ecx
+        pop     ebx
+        pop     eax
+        jnc     ISEND        
+        inc     eax                    ; 14
+
+        bt      ebx, 16                ; AVX512f
+        jnc     ISEND
+        inc     eax                    ; 15
+        
+ISEND:  pop     edx                    ; address of _IInstrSet
+        mov     [edx], eax             ; save value in public variable _IInstrSet
+        pop     ebx
+        ret                            ; return value is in eax
+
+;_InstructionSet ENDP
diff --git a/asmlibSrc/instrset64.asm b/asmlibSrc/instrset64.asm
new file mode 100755
index 0000000..d40938e
--- /dev/null
+++ b/asmlibSrc/instrset64.asm
@@ -0,0 +1,173 @@
+;*************************  instrset64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2003-12-12
+; Last modified:    2014-07-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the 
+; ability of the operating system to catch invalid opcode exceptions. For 
+; further discussion of this method, see my manual "Optimizing subroutines
+; in assembly language" (www.agner.org/optimize/).
+; 
+; Copyright (c) 2003-2014 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+;  0 =  80386 instruction set only
+;  1 or above = MMX instructions supported
+;  2 or above = conditional move and FCOMI supported
+;  3 or above = SSE (XMM) supported by processor and operating system
+;  4 or above = SSE2 supported
+;  5 or above = SSE3 supported
+;  6 or above = Supplementary SSE3 supported
+;  8 or above = SSE4.1 supported (the value 7 is not used)
+;  9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512f supported
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global InstructionSet: function
+global IInstrSet
+
+
+SECTION .data
+align 16
+
+IInstrSet@:                            ; local name to avoid problems in shared objects
+IInstrSet:  dd      -1                 ; this global variable is valid after first call
+
+
+SECTION .text  align=16
+
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+
+
+InstructionSet:
+        ; Check if this function has been called before
+        mov     eax, [IInstrSet@]
+        test    eax, eax
+        js      FirstTime              ; Negative means first time
+        ; Early return. Has been called before
+        ret                            ; Return value is in eax
+
+FirstTime:
+        push    rbx
+
+        mov     eax, 1
+        cpuid                          ; get features into edx and ecx
+        
+        mov     eax, 4                 ; at least SSE2 supported in 64 bit mode
+        test    ecx, 1                 ; SSE3 support by microprocessor
+        jz      ISEND
+        inc     eax                    ; 5
+        
+        bt      ecx, 9                 ; Suppl-SSE3 support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 6
+        
+        bt      ecx, 19                ; SSE4.1 support by microprocessor
+        jnc     ISEND
+        mov     al, 8                  ; 8        
+        
+        bt      ecx, 23                ; POPCNT support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 9
+        
+        bt      ecx, 20                ; SSE4.2 support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 10
+
+        ; check OS support for YMM registers (AVX)
+        bt      ecx, 27                ; OSXSAVE: XGETBV supported
+        jnc     ISEND
+        push    rax
+        push    rcx
+        push    rdx
+        xor     ecx, ecx
+        db      0FH, 01H, 0D0H         ; XGETBV (raw opcode bytes, for assemblers lacking the mnemonic)
+        and     eax, 6
+        cmp     eax, 6                 ; AVX support by OS
+        pop     rdx
+        pop     rcx
+        pop     rax
+        jne     ISEND
+
+        bt      ecx, 28                ; AVX support by microprocessor
+        jnc     ISEND
+        inc     eax                    ; 11
+        
+        bt      ecx, 1                 ; PCLMUL support
+        jnc     ISEND
+        bt      ecx, 25                ; AES support
+        jnc     ISEND
+        inc     eax                    ; 12
+        
+        push    rax
+        push    rcx
+        mov     eax, 7
+        xor     ecx, ecx
+        cpuid                          ; check for AVX2
+        bt      ebx, 5
+        pop     rcx
+        pop     rax
+        jnc     ISEND
+        inc     eax                    ; 13
+        
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+        bt      ecx, 12                ; FMA3
+        jnc     ISEND
+        bt      ecx, 29                ; F16C
+        jnc     ISEND
+        bt      ebx, 3                 ; BMI1
+        jnc     ISEND
+        bt      ebx, 8                 ; BMI2
+        jnc     ISEND
+        
+        push    rax
+        push    rbx
+        push    rcx
+        mov     eax, 80000001H
+        cpuid
+        bt      ecx, 5                 ; LZCNT
+        pop     rcx
+        pop     rbx
+        pop     rax
+        jnc     ISEND
+        inc     eax                    ; 14
+
+        bt      ebx, 16                ; AVX512f
+        jnc     ISEND
+        inc     eax                    ; 15
+       
+ISEND:  mov     [IInstrSet@], eax      ; save value in global variable
+
+        pop     rbx
+        ret                            ; return value is in eax
+
+;InstructionSet ENDP
diff --git a/asmlibSrc/libad32.asm b/asmlibSrc/libad32.asm
new file mode 100755
index 0000000..96bf994
--- /dev/null
+++ b/asmlibSrc/libad32.asm
@@ -0,0 +1,14 @@
+; ----------------------------- LIBAD.ASM ---------------------------
+; DLL entry function for LIBAD32.DLL
+
+
+SECTION .text  align=16
+
+GLOBAL _DllEntry@12: function
+
+_DllEntry@12:       ; proc hInstance:DWORD, reason:DWORD, reserved1:DWORD
+        mov     eax, 1
+        ret     12
+;_DllEntry@12 endp
+
+; END  _DllEntry@12
diff --git a/asmlibSrc/libad32.def b/asmlibSrc/libad32.def
new file mode 100755
index 0000000..bfba973
--- /dev/null
+++ b/asmlibSrc/libad32.def
@@ -0,0 +1,44 @@
+LIBRARY libad32
+
+VERSION 2013.0913
+
+EXPORTS 
+        InstructionSet
+        ProcessorName
+        ReadTSC
+        RoundF
+        RoundD
+        A_strcmp
+        A_stricmp
+        A_strstr
+        A_strtolower
+        A_strtoupper
+        A_strspn
+        A_strcspn
+        strCountInSet
+        strcount_UTF8
+        CpuType
+        A_DebugBreak
+        cpuid_ex
+        setdivisori32
+        setdivisoru32
+        dividefixedi32
+        dividefixedu32
+        PhysicalSeedD
+        MersenneRandomInitD
+        MersenneRandomInitByArrayD
+        MersenneRandomD
+        MersenneIRandomD
+        MersenneIRandomXD
+        MersenneBRandomD
+        MotherRandomInitD
+        MotherIRandomD
+        MotherRandomD
+        MotherBRandomD        
+        SFMTgenRandomInitD
+        SFMTgenRandomInitByArrayD
+        SFMTgenIRandomD
+        SFMTgenIRandomXD
+        SFMTgenRandomD
+        SFMTgenBRandomD
+        
diff --git a/asmlibSrc/libad64.asm b/asmlibSrc/libad64.asm
new file mode 100755
index 0000000..25c5208
--- /dev/null
+++ b/asmlibSrc/libad64.asm
@@ -0,0 +1,13 @@
+; ----------------------------- LIBAD64.ASM ---------------------------
+; DLL entry function for LIBAD64.DLL
+
+default rel
+
+global DllEntry: function
+
+SECTION .text  align=16
+
+DllEntry:
+        mov     eax, 1
+        ret
+;DllEntry endp
diff --git a/asmlibSrc/libad64.def b/asmlibSrc/libad64.def
new file mode 100755
index 0000000..5948615
--- /dev/null
+++ b/asmlibSrc/libad64.def
@@ -0,0 +1,42 @@
+LIBRARY libad64
+
+VERSION 2013.0913
+
+EXPORTS InstructionSet
+        ProcessorName
+        ReadTSC
+        RoundF
+        RoundD
+        A_strcmp
+        A_stricmp
+        A_strstr
+        A_strtolower
+        A_strtoupper
+        A_strspn
+        A_strcspn
+        strCountInSet
+        strcount_UTF8
+        CpuType
+        A_DebugBreak
+        cpuid_ex
+        setdivisori32
+        setdivisoru32
+        dividefixedi32
+        dividefixedu32
+        PhysicalSeedD
+        MersenneRandomInitD
+        MersenneRandomInitByArrayD
+        MersenneRandomD
+        MersenneIRandomD
+        MersenneIRandomXD
+        MersenneBRandomD        
+        MotherRandomInitD
+        MotherIRandomD
+        MotherRandomD
+        MotherBRandomD        
+        SFMTgenRandomInitD
+        SFMTgenRandomInitByArrayD
+        SFMTgenIRandomD
+        SFMTgenIRandomXD
+        SFMTgenRandomD
+        SFMTgenBRandomD
diff --git a/asmlibSrc/memcmp32.asm b/asmlibSrc/memcmp32.asm
new file mode 100755
index 0000000..8e4cc00
--- /dev/null
+++ b/asmlibSrc/memcmp32.asm
@@ -0,0 +1,366 @@
+;*************************  memcmp32.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2013-10-03
+; Last modified:    2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size count.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal.
+; The return value is positive if the first differing byte of ptr1 is bigger 
+; than the corresponding byte of ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller 
+; than the corresponding byte of ptr2 when compared as unsigned bytes.
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
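+
+; Illustrative C++ sketch (hedged, not part of this file): the result follows
+; the usual memcmp sign convention, so only its sign and zeroness are
+; meaningful. The arrays below are hypothetical example data.
+;
+;   extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+;   const char a[4] = {'a','b','c','d'};
+;   const char b[4] = {'a','b','c','e'};
+;   // A_memcmp(a, b, 4) < 0  : first differing byte 'd' < 'e' (unsigned)
+;   // A_memcmp(a, b, 3) == 0 : the first three bytes are equal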
+
+global _A_memcmp: function             ; Function memcmp
+global ?OVR_memcmp: function           ; ?OVR removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global _memcmp386:  function           ; version for old CPUs without SSE
+global _memcmpSSE2: function           ; SSE2 version
+global _memcmpAVX2: function           ; AVX2 version
+
+; Imported from instrset32.asm
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+
+SECTION .text  align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+_A_memcmp:
+?OVR_memcmp:
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP:                                    ; reference point edx = offset RP
+; Make the following instruction with address relative to RP:
+        jmp     dword [edx+memcmpDispatch-RP]
+%ENDIF
+
+
+align 16
+_memcmpAVX2:   ; AVX2 version. Use ymm register
+memcmpAVX2@:   ; internal reference
+        push    esi
+        push    edi
+        mov     esi, [esp+12]                    ; ptr1
+        mov     edi, [esp+16]                    ; ptr2
+        mov     ecx, [esp+20]                    ; size
+        add     esi, ecx                         ; use negative index from end of memory block
+        add     edi, ecx
+        neg     ecx
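+        ; esi and edi now point past the end of the blocks; ecx runs from
+        ; -count up toward zero, so [esi+ecx] scans forward and one ADD/JNZ
+        ; closes the loop without a separate counter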
+        jz      A900
+        mov     edx, 0FFFFH 
+        cmp     ecx, -32
+        ja      A100
+        
+A000:   ; loop comparing 32 bytes
+        vmovdqu   ymm1, [esi+ecx]
+        vpcmpeqb  ymm0, ymm1, [edi+ecx]          ; compare 32 bytes
+        vpmovmskb eax, ymm0                      ; get byte mask
+        xor     eax, -1                          ; invert bits ('not eax' would not set flags)
+        jnz     A700                             ; difference found
+        add     ecx, 32
+        jz      A900                             ; finished, equal
+        cmp     ecx, -32
+        jna     A000                             ; next 32 bytes
+        vzeroupper                               ; end ymm state
+        
+A100:   ; less than 32 bytes left
+        cmp     ecx, -16
+        ja      A200
+        movdqu  xmm1, [esi+ecx]
+        movdqu  xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     A701                             ; difference found
+        add     ecx, 16
+        jz      A901                             ; finished, equal
+        
+A200:   ; less than 16 bytes left
+        cmp     ecx, -8
+        ja      A300
+        ; compare 8 bytes
+        movq    xmm1, [esi+ecx]
+        movq    xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     A701                             ; difference found
+        add     ecx, 8
+        jz      A901 
+        
+A300:   ; less than 8 bytes left
+        cmp     ecx, -4
+        ja      A400
+        ; compare 4 bytes
+        movd    xmm1, [esi+ecx]
+        movd    xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     A701                             ; difference found
+        add     ecx, 4
+        jz      A901 
+
+A400:   ; less than 4 bytes left
+        cmp     ecx, -2
+        ja      A500
+        movzx   eax, word [esi+ecx]
+        movzx   edx, word [edi+ecx]
+        sub     eax, edx
+        jnz     A800                             ; difference in byte 0 or 1
+        add     ecx, 2
+        jz      A901 
+        
+A500:   ; less than 2 bytes left
+        test    ecx, ecx
+        jz      A901                             ; no bytes left
+        
+A600:   ; one byte left
+        movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+A700:   ; difference found. find position
+        vzeroupper
+A701:   
+        bsf     eax, eax
+        add     ecx, eax
+        movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+A800:   ; difference in byte 0 or 1
+        neg     al
+        sbb     ecx, -1                          ; add 1 to ecx if al == 0
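+        ; (NEG sets CF iff al was nonzero; SBB subtracts -1 and the borrow,
+        ; i.e. ecx += 1 - CF, so ecx steps to byte 1 only when byte 0 matched)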
+        movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+A900:   ; equal
+        vzeroupper
+A901:   xor     eax, eax        
+        pop     edi
+        pop     esi
+        ret
+        
+
+_memcmpSSE2:   ; SSE2 version. Use xmm register
+memcmpSSE2@:   ; internal reference
+
+        push    esi
+        push    edi
+        mov     esi, [esp+12]                    ; ptr1
+        mov     edi, [esp+16]                    ; ptr2
+        mov     ecx, [esp+20]                    ; size
+        add     esi, ecx                         ; use negative index from end of memory block
+        add     edi, ecx
+        neg     ecx
+        jz      S900 
+        mov     edx, 0FFFFH
+        cmp     ecx, -16
+        ja      S200
+        
+S100:   ; loop comparing 16 bytes
+        movdqu  xmm1, [esi+ecx]
+        movdqu  xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     S700                             ; difference found
+        add     ecx, 16
+        jz      S900                             ; finished, equal
+        cmp     ecx, -16
+        jna     S100                             ; next 16 bytes
+        
+S200:   ; less than 16 bytes left
+        cmp     ecx, -8
+        ja      S300
+        ; compare 8 bytes
+        movq    xmm1, [esi+ecx]
+        movq    xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     S700                             ; difference found
+        add     ecx, 8
+        jz      S900 
+        
+S300:   ; less than 8 bytes left
+        cmp     ecx, -4
+        ja      S400
+        ; compare 4 bytes
+        movd    xmm1, [esi+ecx]
+        movd    xmm2, [edi+ecx]
+        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, edx                         ; not ax
+        jnz     S700                             ; difference found
+        add     ecx, 4
+        jz      S900 
+
+S400:   ; less than 4 bytes left
+        cmp     ecx, -2
+        ja      S500
+        movzx   eax, word [esi+ecx]
+        movzx   edx, word [edi+ecx]
+        sub     eax, edx
+        jnz     S800                             ; difference in byte 0 or 1
+        add     ecx, 2
+        jz      S900 
+        
+S500:   ; less than 2 bytes left
+        test    ecx, ecx
+        jz      S900                             ; no bytes left
+        
+        ; one byte left
+        movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+S700:   ; difference found. find position
+        bsf     eax, eax
+        add     ecx, eax
+        movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+S800:   ; difference in byte 0 or 1
+        neg     al
+        sbb     ecx, -1                          ; add 1 to ecx if al == 0
+S820:   movzx   eax, byte [esi+ecx]
+        movzx   edx, byte [edi+ecx]
+        sub     eax, edx                         ; return result
+        pop     edi
+        pop     esi
+        ret
+
+S900:   ; equal
+        xor     eax, eax        
+        pop     edi
+        pop     esi
+        ret
+
+
+_memcmp386:    ; 80386 version
+memcmp386@:    ; internal reference
+        ; This is not perfectly optimized because it is unlikely to ever be used
+        push    esi
+        push    edi
+        mov     esi, [esp+12]                    ; ptr1
+        mov     edi, [esp+16]                    ; ptr2
+        mov     ecx, [esp+20]                    ; size
+        mov     edx, ecx
+        shr     ecx, 2                           ; size/4 = number of dwords
+        repe    cmpsd                            ; compare dwords
+        jnz     M700
+        mov     ecx, edx
+        and     ecx, 3                           ; remainder
+M600:   repe    cmpsb                            ; compare bytes
+        je      M800                             ; equal
+        movzx   eax, byte [esi-1]                ; esi, edi point past the differing byte. find difference
+        movzx   edx, byte [edi-1]
+        sub     eax, edx                         ; calculate return value
+        pop     edi
+        pop     esi
+        ret
+        
+M700:   ; dwords differ. search in last 4 bytes
+        mov     ecx, 4
+        sub     esi, ecx
+        sub     edi, ecx
+        jmp     M600
+        
+M800:   ; equal. return zero
+        xor     eax, eax        
+        pop     edi
+        pop     esi
+        ret
+        
+        
+; CPU dispatching for memcmp. This is executed only once
+memcmpCPUDispatch:
+
+%IFNDEF POSITIONINDEPENDENT
+        call    _InstructionSet                         ; get supported instruction set
+        ; Point to generic version of memcmp
+        mov     dword [memcmpDispatch],  memcmp386@
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        mov     dword [memcmpDispatch],  memcmpSSE2@
+        cmp     eax, 13                ; check AVX2
+        jb      Q100
+        ; AVX2 supported
+        mov     dword [memcmpDispatch],  memcmpAVX2@
+Q100:   ; Continue in appropriate version of memcmp
+        jmp     dword [memcmpDispatch]
+
+%ELSE   ; Position-independent version
+        push    edx
+        call    _InstructionSet 
+        pop     edx
+                
+        ; Point to generic version of memcmp
+        lea     ecx, [edx+memcmp386@-RP]
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; Point to SSE2 version of memcmp
+        lea     ecx, [edx+memcmpSSE2@-RP]
+        cmp     eax, 13                ; check AVX2
+        jb      Q100
+        ; Point to AVX2 version of memcmp
+        lea     ecx, [edx+memcmpAVX2@-RP]
+Q100:   mov     [edx+memcmpDispatch-RP], ecx
+        ; Continue in appropriate version of memcmp
+        jmp     ecx
+        
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov     edx, [esp]
+        ret        
+%ENDIF
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DD memcmpCPUDispatch
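+
+; The same one-shot dispatch pattern, sketched in C++ for illustration only
+; (hypothetical names; the real mechanism is the indirect jump above):
+;
+;   typedef int (*memcmp_ptr)(const void*, const void*, size_t);
+;   extern "C" int InstructionSet(void);
+;   int memcmp386_impl(const void*, const void*, size_t);   // hypothetical
+;   int memcmpSSE2_impl(const void*, const void*, size_t);  // hypothetical
+;   int memcmpAVX2_impl(const void*, const void*, size_t);  // hypothetical
+;
+;   static int first_call(const void* p, const void* q, size_t n);
+;   static memcmp_ptr dispatch = first_call;     // like memcmpDispatch above
+;
+;   static int first_call(const void* p, const void* q, size_t n) {
+;       int is = InstructionSet();               // choose best version once
+;       dispatch = is >= 13 ? memcmpAVX2_impl
+;                : is >= 4  ? memcmpSSE2_impl : memcmp386_impl;
+;       return dispatch(p, q, n);                // later calls go direct
+;   }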
+
diff --git a/asmlibSrc/memcmp64.asm b/asmlibSrc/memcmp64.asm
new file mode 100755
index 0000000..c7f14c9
--- /dev/null
+++ b/asmlibSrc/memcmp64.asm
@@ -0,0 +1,293 @@
+;*************************  memcmp64.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2013-10-03
+; Last modified:    2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size count.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal.
+; The return value is positive if the first differing byte of ptr1 is bigger 
+; than the corresponding byte of ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller 
+; than the corresponding byte of ptr2 when compared as unsigned bytes.
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global A_memcmp: function              ; Function memcmp
+global ?OVR_memcmp: function           ; ?OVR_ removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global memcmpSSE2: function            ; SSE2 version
+global memcmpAVX2: function            ; AVX2 version
+
+; Imported from instrset64.asm
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+default rel
+
+; define registers used for parameters
+%IFDEF  WINDOWS
+%define par1   rcx                     ; function parameter 1
+%define par2   rdx                     ; function parameter 2
+%define par3   r8                      ; function parameter 3
+%define par4   r9                      ; scratch register
+%define par4d  r9d                     ; scratch register
+%ENDIF
+%IFDEF  UNIX
+%define par1   rdi                     ; function parameter 1
+%define par2   rsi                     ; function parameter 2
+%define par3   rdx                     ; function parameter 3
+%define par4   rcx                     ; scratch register
+%define par4d  ecx                     ; scratch register
+%ENDIF
+
+
+
+SECTION .text  align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+A_memcmp:
+?OVR_memcmp:
+        jmp     qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+
+align 16
+memcmpAVX2:    ; AVX2 version. Use ymm register
+memcmpAVX2@:   ; internal reference
+
+        add     par1, par3                       ; use negative index from end of memory block
+        add     par2, par3
+        neg     par3
+        jz      A900
+        mov     par4d, 0FFFFH 
+        cmp     par3, -32
+        ja      A100
+        
+A000:   ; loop comparing 32 bytes
+        vmovdqu   ymm1, [par1+par3]
+        vpcmpeqb  ymm0, ymm1, [par2+par3]        ; compare 32 bytes
+        vpmovmskb eax, ymm0                      ; get byte mask
+        xor     eax, -1                          ; invert bits ('not eax' would not set flags)
+        jnz     A700                             ; difference found
+        add     par3, 32
+        jz      A900                             ; finished, equal
+        cmp     par3, -32
+        jna     A000                             ; next 32 bytes
+        vzeroupper                               ; end ymm state
+        
+A100:   ; less than 32 bytes left
+        cmp     par3, -16
+        ja      A200
+        movdqu  xmm1, [par1+par3]
+        movdqu  xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d                       ; invert lower 16 bits
+        jnz     A701                             ; difference found
+        add     par3, 16
+        jz      A901                             ; finished, equal
+        
+A200:   ; less than 16 bytes left
+        cmp     par3, -8
+        ja      A300
+        ; compare 8 bytes
+        movq    xmm1, [par1+par3]
+        movq    xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d
+        jnz     A701                             ; difference found
+        add     par3, 8
+        jz      A901 
+        
+A300:   ; less than 8 bytes left
+        cmp     par3, -4
+        ja      A400
+        ; compare 4 bytes
+        movd    xmm1, [par1+par3]
+        movd    xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d                         ; not ax
+        jnz     A701                             ; difference found
+        add     par3, 4
+        jz      A901 
+
+A400:   ; less than 4 bytes left
+        cmp     par3, -2
+        ja      A500
+        movzx   eax, word [par1+par3]
+        movzx   par4d, word [par2+par3]
+        sub     eax, par4d
+        jnz     A800                             ; difference in byte 0 or 1
+        add     par3, 2
+        jz      A901 
+        
+A500:   ; less than 2 bytes left
+        test    par3, par3
+        jz      A901                             ; no bytes left
+        
+A600:   ; one byte left
+        movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+A700:   ; difference found. find position
+        vzeroupper
+A701:   
+        bsf     eax, eax
+        add     par3, rax
+        movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+A800:   ; difference in byte 0 or 1
+        neg     al
+        sbb     par3, -1                           ; add 1 to par3 if al == 0
+        movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+A900:   ; equal
+        vzeroupper
+A901:   xor     eax, eax        
+        ret
+        
+
+memcmpSSE2:    ; SSE2 version. Use xmm register
+memcmpSSE2@:   ; internal reference
+
+        add     par1, par3                         ; use negative index from end of memory block
+        add     par2, par3
+        neg     par3
+        jz      S900 
+        mov     par4d, 0FFFFH
+        cmp     par3, -16
+        ja      S200
+        
+S100:   ; loop comparing 16 bytes
+        movdqu  xmm1, [par1+par3]
+        movdqu  xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d                         ; not ax
+        jnz     S700                             ; difference found
+        add     par3, 16
+        jz      S900                             ; finished, equal
+        cmp     par3, -16
+        jna     S100                             ; next 16 bytes
+        
+S200:   ; less than 16 bytes left
+        cmp     par3, -8
+        ja      S300
+        ; compare 8 bytes
+        movq    xmm1, [par1+par3]
+        movq    xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d                         ; not ax
+        jnz     S700                             ; difference found
+        add     par3, 8
+        jz      S900 
+        
+S300:   ; less than 8 bytes left
+        cmp     par3, -4
+        ja      S400
+        ; compare 4 bytes
+        movd    xmm1, [par1+par3]
+        movd    xmm2, [par2+par3]
+        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
+        pmovmskb eax, xmm1                       ; get byte mask
+        xor     eax, par4d                         ; not ax
+        jnz     S700                             ; difference found
+        add     par3, 4
+        jz      S900 
+
+S400:   ; less than 4 bytes left
+        cmp     par3, -2
+        ja      S500
+        movzx   eax, word [par1+par3]
+        movzx   par4d, word [par2+par3]
+        sub     eax, par4d
+        jnz     S800                             ; difference in byte 0 or 1
+        add     par3, 2
+        jz      S900 
+        
+S500:   ; less than 2 bytes left
+        test    par3, par3
+        jz      S900                             ; no bytes left
+        
+        ; one byte left
+        movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+S700:   ; difference found. find position
+        bsf     eax, eax
+        add     par3, rax
+        movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+S800:   ; difference in byte 0 or 1
+        neg     al
+        sbb     par3, -1                          ; add 1 to par3 if al == 0
+S820:   movzx   eax, byte [par1+par3]
+        movzx   par4d, byte [par2+par3]
+        sub     eax, par4d                         ; return result
+        ret
+
+S900:   ; equal
+        xor     eax, eax        
+        ret
+
+        
+; CPU dispatching for memcmp. This is executed only once
+memcmpCPUDispatch:
+        push    par1
+        push    par2
+        push    par3        
+        call    InstructionSet                         ; get supported instruction set
+        ; SSE2 always supported
+        lea     par4, [memcmpSSE2@]
+        cmp     eax, 13                ; check AVX2
+        jb      Q100
+        ; AVX2 supported
+        lea     par4, [memcmpAVX2@]        
+Q100:   ; save pointer
+        mov     qword [memcmpDispatch], par4
+; Continue in appropriate version of memcmp
+        pop     par3
+        pop     par2
+        pop     par1
+        jmp     par4
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DQ memcmpCPUDispatch
+
diff --git a/asmlibSrc/memcpy32.asm b/asmlibSrc/memcpy32.asm
new file mode 100755
index 0000000..257fe2c
--- /dev/null
+++ b/asmlibSrc/memcpy32.asm
@@ -0,0 +1,1460 @@
+;*************************  memcpy32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-18
+; Last modified:    2013-09-11
+
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is 
+; bigger than half the size of the largest_level cache. This limit can be
+; read with _GetMemcpyCacheLimit and changed with _SetMemcpyCacheLimit (in 
+; memmove32.asm). C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit();  // in memcpy32.asm
+; extern "C" void SetMemcpyCacheLimit();    // in memmove32.asm
+; extern "C" void SetMemcpyCacheLimit1();   // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386, SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
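+
+; Hedged usage note (illustration only, not part of this file):
+;
+;   extern "C" size_t GetMemcpyCacheLimit();
+;
+;   size_t limit = GetMemcpyCacheLimit();  // defaults to half the size of
+;                                          // the largest-level cache
+;   // A_memcpy calls larger than 'limit' use non-temporal stores that
+;   // bypass the cache; SetMemcpyCacheLimit (in memmove32.asm) changes it.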
+
+global _A_memcpy: function             ; Function A_memcpy
+global ?OVR_memcpy: function           ; ?OVR removed if standard function memcpy overridden
+
+; Direct entries to CPU-specific versions
+global _memcpy386: function            ; Generic version for processors without SSE2
+global _memcpySSE2: function           ; Version for processors with SSE2
+global _memcpySSSE3: function          ; Version for processors with SSSE3
+global _memcpyU: function              ; Alternative version for processors with fast unaligned read
+global _memcpyU256: function            ; Version for processors with fast 256-bit read/write
+
+global _GetMemcpyCacheLimit: function  ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global _SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _UnalignedIsFaster              ; Tells if unaligned read is faster than PALIGNR
+extern _Store256BitIsFaster            ; Tells if a 256 bit store is faster than two 128 bit stores
+
+
+; Imported from cachesize32.asm:
+extern _DataCacheSize                  ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO  PROLOGM  0
+        push    esi
+        push    edi
+        mov     edi, [esp+12]          ; dest
+        mov     esi, [esp+16]          ; src
+        mov     ecx, [esp+20]          ; count
+%IFDEF  POSITIONINDEPENDENT
+        push    ebx
+        mov     ebx, edx               ; pointer to reference point RP
+%ENDIF
+%ENDM
+
+
+; Define return from this function
+%MACRO  RETURNM 0
+%IFDEF  POSITIONINDEPENDENT
+        pop     ebx
+%ENDIF
+        pop     edi
+        pop     esi
+        mov     eax, [esp+4]           ; Return value = dest
+        ret
+%ENDMACRO
+
+
+SECTION .text  align=16
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                          Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+_A_memcpy:
+?OVR_memcpy:
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+RP      equ     0                      ; RP = 0 if not position-independent
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP:                                    ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+        jmp     dword [edx+memcpyDispatch-RP]
+
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memcpyU256:   ; global label
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memcpyU256@:
+        PROLOGM
+        cmp     ecx, 40H
+        jb      A1000                  ; Use simpler code if count < 64
+
+        ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 1FH
+        jz      B3100                    ; Skip if dest aligned by 32
+        
+        ; edx = size of first partial block, 1 - 31 bytes
+        test    dl, 3
+        jz      B3030
+        test    dl, 1
+        jz      B3020
+        ; move 1 byte
+        movzx   eax, byte [esi]
+        mov     [edi], al
+        inc     esi
+        inc     edi
+B3020:  test    dl, 2
+        jz      B3030
+        ; move 2 bytes
+        movzx   eax, word [esi]
+        mov     [edi], ax
+        add     esi, 2
+        add     edi, 2
+B3030:  test    dl, 4
+        jz      B3040
+        ; move 4 bytes
+        mov     eax, [esi]
+        mov     [edi], eax
+        add     esi, 4
+        add     edi, 4
+B3040:  test    dl, 8
+        jz      B3050
+        ; move 8 bytes
+        movq    xmm0, qword [esi]
+        movq    qword [edi], xmm0
+        add     esi, 8
+        add     edi, 8
+B3050:  test    dl, 16
+        jz      B3060        
+        ; move 16 bytes
+        movups  xmm0, [esi]
+        movaps  [edi], xmm0
+        add     esi, 16
+        add     edi, 16
+B3060:  sub     ecx, edx
+
+B3100:  ; Now dest is aligned by 32. Any partial block has been moved        
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        add     esi, ecx               ; Point to the end
+        add     edi, ecx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     ecx, [_CacheBypassLimit]
+%ELSE
+        cmp     ecx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+        ja      I3100                   ; Use non-temporal store if count > CacheBypassLimit
+        neg     ecx                    ; Negative index from the end      
+
+H3100:  ; copy -ecx bytes in blocks of 32 bytes.
+
+        ; Check for false memory dependence: The CPU may falsely assume
+        ; a partial overlap between the written destination and the following
+        ; read source if source is unaligned and
+        ; (src-dest) modulo 4096 is close to 4096
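+        ; Example: if src = dest + 0F80H, a 32-byte read aliases (modulo 4096)
+        ; a store made only 4 iterations (128 bytes) earlier, so the store-to-
+        ; load disambiguator may falsely predict overlap and stall the reads;
+        ; copying backwards removes this false dependence.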
+        test    esi, 1FH
+        jz      H3110                  ; aligned
+        mov     eax, esi
+        sub     eax, edi
+        and     eax, 0FFFH             ; modulo 4096
+        cmp     eax, 1000H - 200H
+        ja      J3100
+        
+H3110:  ; main copy loop, 32 bytes at a time
+        ; ecx has negative index from the end, counting up to zero
+        vmovups ymm0, [esi+ecx]
+        vmovaps [edi+ecx], ymm0
+        add     ecx, 20H
+        jnz     H3110
+        vzeroupper                      ; end of AVX mode        
+        
+        ; Move the remaining edx bytes (0 - 31):
+H3120:  add     esi, edx
+        add     edi, edx
+        neg     edx
+        jz      H3500                   ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      H3200
+        ; move 16 bytes
+        movups  xmm0, [esi+edx]
+        movaps  [edi+edx], xmm0
+        add     edx, 10H
+H3200:  cmp     edx, -8
+        jg      H3210        
+        ; move 8 bytes
+        movq    xmm0, qword [esi+edx]
+        movq    qword [edi+edx], xmm0
+        add     edx, 8 
+        jz      H3500                   ; Early skip if count divisible by 8       
+H3210:  cmp     edx, -4
+        jg      H3220        
+        ; move 4 bytes
+        mov     eax, [esi+edx]
+        mov     [edi+edx], eax
+        add     edx, 4        
+H3220:  cmp     edx, -2
+        jg      H3230        
+        ; move 2 bytes
+        movzx   eax, word [esi+edx]
+        mov     [edi+edx], ax
+        add     edx, 2
+H3230:  cmp     edx, -1
+        jg      H3500        
+        ; move 1 byte
+        movzx   eax, byte [esi+edx]
+        mov     [edi+edx], al
+H3500:  ; finished     
+        RETURNM
+        
+I3100:  ; non-temporal move
+        neg     ecx                    ; Negative index from the end
+align 16
+I3110:  ; main copy loop, 32 bytes at a time
+        ; ecx has negative index from the end, counting up to zero
+        vmovups ymm0, [esi+ecx]
+        vmovntps [edi+ecx], ymm0
+        add     ecx, 20H
+        jnz     I3110
+        vzeroupper                     ; end of AVX mode
+        jmp     H3120                  ; Move the remaining edx bytes (0 - 31):
+
+align 16
+J3100:  ; There is a false memory dependence.
+        ; check if src and dest overlap, if not then it is safe 
+        ; to copy backwards to avoid false memory dependence
+%if 1
+        ; Use this version if you want consistent behavior in the case
+        ; where dest > src and overlap. However, this case is undefined
+        ; anyway because part of src is overwritten before copying     
+        push    edx
+        mov     eax, esi
+        sub     eax, edi
+        cdq
+        xor     eax, edx
+        sub     eax, edx   ; abs(src-dest)
+        neg     ecx        ; size
+        pop     edx        ; restore edx
+        cmp     eax, ecx
+        jnb     J3110
+        neg     ecx        ; restore ecx
+        jmp     H3110       ; overlap between src and dest. Can't copy backwards
+%else
+        ; save time by not checking the case that is undefined anyway         
+        mov     eax, esi
+        sub     eax, edi
+        neg     ecx        ; size
+        cmp     eax, ecx
+        jnb     J3110       ; OK to copy backwards
+        ; must copy forwards
+        neg     ecx        ; restore ecx
+        jmp     H3110       ; copy forwards
+%endif
+        
+J3110:  ; copy backwards, ecx = size. esi, edi = end of src, dest        
+        push    esi
+        push    edi
+        sub     esi, ecx
+        sub     edi, ecx
+J3120:  ; loop backwards
+        vmovups ymm1, [esi+ecx-20H]
+        vmovaps [edi+ecx-20H], ymm1
+        sub     ecx, 20H
+        jnz     J3120
+        vzeroupper
+        pop     edi
+        pop     esi
+        jmp     H3120
+
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        ; multiple CPU versions (SSSE3 and later)
+A1000:  add     esi, ecx               ; end of src
+        add     edi, ecx               ; end of dest
+        neg     ecx                    ; negative index from the end
+        cmp     ecx, -20H
+        jg      A1100        
+        ; move 32 bytes
+        ; movdqu is faster than movq on all processors with SSSE3
+        movups  xmm0, oword [esi+ecx]
+        movups  xmm1, oword [esi+ecx+10H]
+        movups  oword [edi+ecx], xmm0
+        movups  oword [edi+ecx+10H], xmm1
+        add     ecx, 20H
+A1100:  cmp     ecx, -10H        
+        jg      A1200
+        ; move 16 bytes
+        movups  xmm0, oword [esi+ecx]
+        movups  oword [edi+ecx], xmm0
+        add     ecx, 10H
+A1200:  cmp     ecx, -8        
+        jg      A1300
+        ; move 8 bytes
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+        add     ecx, 8
+A1300:  cmp     ecx, -4        
+        jg      A1400
+        ; move 4 bytes
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        add     ecx, 4
+        jz      A1900                 ; early out if count divisible by 4
+A1400:  cmp     ecx, -2        
+        jg      A1500
+        ; move 2 bytes
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+        add     ecx, 2
+A1500:  cmp     ecx, -1
+        jg      A1900        
+        ; move 1 byte
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al
+A1900:  ; finished
+        RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with fast unaligned read and fast 16-byte write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+                
+align 16
+_memcpyU:   ; global label
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memcpyU@:   ; local label
+        PROLOGM
+        cmp     ecx, 40H
+        jb      A1000                  ; Use simpler code if count < 64
+
+        ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B2100                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B2030
+        test    dl, 1
+        jz      B2020
+        ; move 1 byte
+        movzx   eax, byte [esi]
+        mov     [edi], al
+        inc     esi
+        inc     edi
+B2020:  test    dl, 2
+        jz      B2030
+        ; move 2 bytes
+        movzx   eax, word [esi]
+        mov     [edi], ax
+        add     esi, 2
+        add     edi, 2
+B2030:  test    dl, 4
+        jz      B2040
+        ; move 4 bytes
+        mov     eax, [esi]
+        mov     [edi], eax
+        add     esi, 4
+        add     edi, 4
+B2040:  test    dl, 8
+        jz      B2050
+        ; move 8 bytes
+        movq    xmm0, qword [esi]
+        movq    qword [edi], xmm0
+        add     esi, 8
+        add     edi, 8
+B2050:  sub     ecx, edx
+B2100:  ; Now dest is aligned by 16. Any partial block has been moved        
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        add     esi, ecx               ; Point to the end
+        add     edi, ecx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     ecx, [_CacheBypassLimit]
+%ELSE
+        cmp     ecx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+        ja      I100                   ; Use non-temporal store if count > CacheBypassLimit
+        neg     ecx                    ; Negative index from the end      
+
+H100:   ; copy -ecx bytes in blocks of 32 bytes.
+
+        ; Check for false memory dependence: The CPU may falsely assume
+        ; a partial overlap between the written destination and the following
+        ; read source if source is unaligned and
+        ; (src-dest) modulo 4096 is close to 4096
+        test    esi, 0FH
+        jz      H110                   ; aligned
+        mov     eax, esi
+        sub     eax, edi
+        and     eax, 0FFFH             ; modulo 4096
+        cmp     eax, 1000H - 200H
+        ja      J100
+        
+H110:   ; main copy loop, 32 bytes at a time
+        ; ecx has negative index from the end, counting up to zero
+        movups  xmm0, [esi+ecx]
+        movups  xmm1, [esi+ecx+10H]
+        movaps  [edi+ecx], xmm0
+        movaps  [edi+ecx+10H], xmm1
+        add     ecx, 20H
+        jnz     H110
+        
+        ; Move the remaining edx bytes (0 - 31):
+H120:   add     esi, edx
+        add     edi, edx
+        neg     edx
+        jz      H500                   ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      H200
+        ; move 16 bytes
+        movups  xmm0, [esi+edx]
+        movaps  [edi+edx], xmm0
+        add     edx, 10H
+H200:   cmp     edx, -8
+        jg      H210        
+        ; move 8 bytes
+        movq    xmm0, qword [esi+edx]
+        movq    qword [edi+edx], xmm0
+        add     edx, 8 
+        jz      H500                   ; Early skip if count divisible by 8       
+H210:   cmp     edx, -4
+        jg      H220        
+        ; move 4 bytes
+        mov     eax, [esi+edx]
+        mov     [edi+edx], eax
+        add     edx, 4        
+H220:   cmp     edx, -2
+        jg      H230        
+        ; move 2 bytes
+        movzx   eax, word [esi+edx]
+        mov     [edi+edx], ax
+        add     edx, 2
+H230:   cmp     edx, -1
+        jg      H500        
+        ; move 1 byte
+        movzx   eax, byte [esi+edx]
+        mov     [edi+edx], al
+H500:   ; finished     
+        RETURNM
+        
+I100:   ; non-temporal move
+        neg     ecx                    ; Negative index from the end
+align 16
+I110:   ; main copy loop, 32 bytes at a time
+        ; ecx has negative index from the end, counting up to zero
+        movups  xmm0, [esi+ecx]
+        movups  xmm1, [esi+ecx+10H]
+        movntps [edi+ecx], xmm0
+        movntps [edi+ecx+10H], xmm1
+        add     ecx, 20H
+        jnz     I110
+        jmp     H120                  ; Move the remaining edx bytes (0 - 31):
+
+align 16
+J100:   ; There is a false memory dependence.
+        ; check if src and dest overlap, if not then it is safe 
+        ; to copy backwards to avoid false memory dependence
+%if 1
+        ; Use this version if you want consistent behavior in the case
+        ; where dest > src and overlap. However, this case is undefined
+        ; anyway because part of src is overwritten before copying     
+        push    edx
+        mov     eax, esi
+        sub     eax, edi
+        cdq
+        xor     eax, edx
+        sub     eax, edx   ; abs(src-dest)
+        neg     ecx        ; size
+        pop     edx        ; restore edx
+        cmp     eax, ecx
+        jnb     J110
+        neg     ecx        ; restore ecx
+        jmp     H110       ; overlap between src and dest. Can't copy backwards
+%else
+        ; save time by not checking the case that is undefined anyway         
+        mov     eax, esi
+        sub     eax, edi
+        neg     ecx        ; size
+        cmp     eax, ecx
+        jnb     J110       ; OK to copy backwards
+        ; must copy forwards
+        neg     ecx        ; restore ecx
+        jmp     H110       ; copy forwards
+%endif
+        
+J110:   ; copy backwards, ecx = size. esi, edi = end of src, dest        
+        push    esi
+        push    edi
+        sub     esi, ecx
+        sub     edi, ecx
+J120:   ; loop backwards
+        movups  xmm1, [esi+ecx-20H]
+        movups  xmm0, [esi+ecx-10H]
+        movaps  [edi+ecx-20H], xmm1
+        movaps  [edi+ecx-10H], xmm0
+        sub     ecx, 20H
+        jnz     J120
+        pop     edi
+        pop     esi
+        jmp     H120
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+_memcpySSSE3:   ; global label
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memcpySSSE3@:   ; local label
+        PROLOGM
+        cmp     ecx, 40H
+        jb      A1000                  ; Use simpler code if count < 64        
+        
+        ; count >= 64
+        ; This part will not always work if count < 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B1200                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B1120
+        test    dl, 1
+        jz      B1110
+        ; move 1 byte
+        movzx   eax, byte [esi]
+        mov     [edi], al
+        inc     esi
+        inc     edi
+B1110:  test    dl, 2
+        jz      B1120
+        ; move 2 bytes
+        movzx   eax, word [esi]
+        mov     [edi], ax
+        add     esi, 2
+        add     edi, 2
+B1120:  test    dl, 4
+        jz      B1130
+        ; move 4 bytes
+        mov     eax, [esi]
+        mov     [edi], eax
+        add     esi, 4
+        add     edi, 4
+B1130:  test    dl, 8
+        jz      B1140
+        ; move 8 bytes
+        movq    xmm0, qword [esi]
+        movq    qword [edi], xmm0
+        add     esi, 8
+        add     edi, 8
+B1140:  sub     ecx, edx
+
+B1200:   ; Now dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of src modulo 16 at this point:
+        mov     eax, esi
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        add     esi, ecx               ; Point to the end
+        add     edi, ecx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop
+        sub     esi, eax               ; Nearest preceding aligned block of src
+
+%IFNDEF POSITIONINDEPENDENT
+        ; Check if count very big
+        cmp     ecx, [_CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > _CacheBypassLimit
+        neg     ecx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment
+        jmp     dword [AlignmentDispatchSSSE3+eax*4]
+
+B1400:   neg     ecx
+        ; Dispatch to different codes depending on src alignment
+        jmp     dword [AlignmentDispatchNT+eax*4]
+
+%ELSE   ; Position-independent code
+
+        ; Check if count very big
+        ; Make the following instruction with address relative to RP:
+        cmp     ecx, [ebx-RP+_CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > _CacheBypassLimit
+        neg     ecx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment        
+
+        ; AlignmentDispatch table contains addresses relative to RP
+        ; Add table entry to ebx=RP to get jump address.
+
+        ; Make the following instruction with address relative to RP:
+        add     ebx, [ebx-RP+AlignmentDispatchSSSE3+eax*4]
+        jmp     ebx
+        
+B1400:   neg     ecx
+
+        ; Same with AlignmentDispatchNT:        
+        add     ebx, [ebx-RP+AlignmentDispatchNT+eax*4]
+        jmp     ebx        
+%ENDIF
+
+align   16
+C100:   ; Code for aligned src. SSE2 and later instruction set
+        ; The nice case, src and dest have same alignment.
+
+        ; Loop. ecx has negative index from the end, counting up to zero
+        movaps  xmm0, [esi+ecx]
+        movaps  xmm1, [esi+ecx+10H]
+        movaps  [edi+ecx], xmm0
+        movaps  [edi+ecx+10H], xmm1
+        add     ecx, 20H
+        jnz     C100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        jz      C500                   ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      C200
+        ; move 16 bytes
+        movaps  xmm0, [esi+edx]
+        movaps  [edi+edx], xmm0
+        add     edx, 10H
+C200:   cmp     edx, -8
+        jg      C210        
+        ; move 8 bytes
+        movq    xmm0, qword [esi+edx]
+        movq    qword [edi+edx], xmm0
+        add     edx, 8 
+        jz      C500                   ; Early skip if count divisible by 8       
+C210:   cmp     edx, -4
+        jg      C220        
+        ; move 4 bytes
+        mov     eax, [esi+edx]
+        mov     [edi+edx], eax
+        add     edx, 4        
+C220:   cmp     edx, -2
+        jg      C230        
+        ; move 2 bytes
+        movzx   eax, word [esi+edx]
+        mov     [edi+edx], ax
+        add     edx, 2
+C230:   cmp     edx, -1
+        jg      C500        
+        ; move 1 byte
+        movzx   eax, byte [esi+edx]
+        mov     [edi+edx], al
+C500:   ; finished     
+        RETURNM
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+_memcpySSE2:    ; global label
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memcpySSE2@:    ; local label
+        PROLOGM
+        cmp     ecx, 40H
+        jae     B100                   ; Use complex code if count >= 64; simpler code below
+        
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        add     esi, ecx               ; end of src
+        add     edi, ecx               ; end of dest
+        neg     ecx                    ; negative index from the end
+        cmp     ecx, -20H
+        jg      A100        
+        ; move 32 bytes
+        ; movq is faster than movdqu on Intel Pentium M and Core 1
+        ; movdqu is fast on Nehalem and later
+        movq    xmm0, qword [esi+ecx]
+        movq    xmm1, qword [esi+ecx+8]
+        movq    xmm2, qword [esi+ecx+10H]
+        movq    xmm3, qword [esi+ecx+18H]
+        movq    qword [edi+ecx], xmm0
+        movq    qword [edi+ecx+8], xmm1
+        movq    qword [edi+ecx+10H], xmm2
+        movq    qword [edi+ecx+18H], xmm3
+        add     ecx, 20H
+A100:   cmp     ecx, -10H        
+        jg      A200
+        ; move 16 bytes
+        movq    xmm0, qword [esi+ecx]
+        movq    xmm1, qword [esi+ecx+8]
+        movq    qword [edi+ecx], xmm0
+        movq    qword [edi+ecx+8], xmm1
+        add     ecx, 10H
+A200:   cmp     ecx, -8        
+        jg      A300
+        ; move 8 bytes
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+        add     ecx, 8
+A300:   cmp     ecx, -4        
+        jg      A400
+        ; move 4 bytes
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        add     ecx, 4
+        jz      A900                     ; early out if count divisible by 4
+A400:   cmp     ecx, -2        
+        jg      A500
+        ; move 2 bytes
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+        add     ecx, 2
+A500:   cmp     ecx, -1
+        jg      A900        
+        ; move 1 byte
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al
+A900:   ; finished
+        RETURNM        
+        
+B100:   ; count >= 64
+        ; This part will not always work if count < 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B200                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B120
+        test    dl, 1
+        jz      B110
+        ; move 1 byte
+        movzx   eax, byte [esi]
+        mov     [edi], al
+        inc     esi
+        inc     edi
+B110:   test    dl, 2
+        jz      B120
+        ; move 2 bytes
+        movzx   eax, word [esi]
+        mov     [edi], ax
+        add     esi, 2
+        add     edi, 2
+B120:   test    dl, 4
+        jz      B130
+        ; move 4 bytes
+        mov     eax, [esi]
+        mov     [edi], eax
+        add     esi, 4
+        add     edi, 4
+B130:   test    dl, 8
+        jz      B140
+        ; move 8 bytes
+        movq    xmm0, qword [esi]
+        movq    qword [edi], xmm0
+        add     esi, 8
+        add     edi, 8
+B140:   sub     ecx, edx
+
+B200:   ; Now dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of src modulo 16 at this point:
+        mov     eax, esi
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        add     esi, ecx               ; Point to the end
+        add     edi, ecx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop
+        sub     esi, eax               ; Nearest preceding aligned block of src
+
+%IFNDEF POSITIONINDEPENDENT
+        ; Check if count very big
+        cmp     ecx, [_CacheBypassLimit]
+        ja      B400                   ; Use non-temporal store if count > _CacheBypassLimit
+        neg     ecx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment
+        jmp     dword [AlignmentDispatchSSE2+eax*4]
+
+B400:   neg     ecx
+        ; Dispatch to different codes depending on src alignment
+        jmp     dword [AlignmentDispatchNT+eax*4]
+
+%ELSE   ; Position-independent code
+
+        ; Check if count very big
+        ; Make the following instruction with address relative to RP:
+        cmp     ecx, [ebx-RP+_CacheBypassLimit]
+        ja      B400                   ; Use non-temporal store if count > _CacheBypassLimit
+        neg     ecx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment        
+
+        ; AlignmentDispatch tables contain addresses relative to RP
+        ; Add table entry to ebx=RP to get jump address.
+
+        ; Make the following instruction with address relative to RP:
+        add     ebx, [ebx-RP+AlignmentDispatchSSE2+eax*4]
+        jmp     ebx
+        
+B400:   neg     ecx
+
+        ; Same with AlignmentDispatchNT:        
+        add     ebx, [ebx-RP+AlignmentDispatchNT+eax*4]
+        jmp     ebx        
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSE2  2
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [esi+ecx]        ; Read from nearest preceding 16B boundary
+%%L1:   ; Loop. ecx has negative index from the end, counting up to zero
+        movdqa  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movdqa  xmm2, [esi+ecx+20H]
+        movdqa  xmm3, xmm1             ; Copy because used twice
+        psrldq  xmm0, %1               ; shift right
+        pslldq  xmm1, 16-%1            ; shift left
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [edi+ecx], xmm0        ; non-temporal save
+        %ENDIF
+        movdqa  xmm0, xmm2             ; Save for next iteration
+        psrldq  xmm3, %1               ; shift right
+        pslldq  xmm2, 16-%1            ; shift left
+        por     xmm3, xmm2             ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+ecx+10H], xmm3    ; Save aligned
+        %ELSE
+        movntdq [edi+ecx+10H], xmm3    ; non-temporal save
+        %ENDIF
+        add     ecx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movdqa  xmm1, [esi+edx+10H]
+        psrldq  xmm0, %1               ; shift right
+        pslldq  xmm1, 16-%1            ; shift left
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+edx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [edi+edx], xmm0        ; non-temporal save
+        %ENDIF        
+        add     edx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
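+
+; One iteration of the loop above in C intrinsics, shown for u = 5
+; (illustrative sketch only; the shift immediates must be compile-time
+; constants, which is why a separate instance is generated per alignment):
+;     #include <emmintrin.h>
+;     __m128i lo = _mm_load_si128((__m128i*)(src + i));       /* aligned */
+;     __m128i hi = _mm_load_si128((__m128i*)(src + i + 16));  /* aligned */
+;     __m128i r  = _mm_or_si128(_mm_srli_si128(lo, 5),
+;                               _mm_slli_si128(hi, 16 - 5));
+;     _mm_store_si128((__m128i*)(dst + i), r);                /* aligned */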
+
+%MACRO  MOVE_UNALIGNED_SSE2_4  1
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [esi+ecx]        ; Read from nearest preceding 16B boundary
+%%L1:   ; Loop. ecx has negative index from the end, counting up to zero
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movss   xmm0, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
+       ;pshufd  xmm0, xmm0, 00111001B
+        shufps  xmm0, xmm0, 00111001B
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm0        ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [esi+ecx+20H]
+        movss   xmm1, xmm0
+        shufps  xmm1, xmm1, 00111001B
+        %IF %1 == 0
+        movaps  [edi+ecx+10H], xmm1    ; Save aligned
+        %ELSE
+        movntps [edi+ecx+10H], xmm1    ; Non-temporal save
+        %ENDIF
+        add     ecx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [esi+edx+10H]    ; Read next two blocks aligned
+        movss   xmm0, xmm1
+        shufps  xmm0, xmm0, 00111001B
+        %IF %1 == 0
+        movaps  [edi+edx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+edx], xmm0        ; Non-temporal save
+        %ENDIF
+        add     edx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
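+
+; The u = 4 special case above in intrinsics form (illustrative sketch):
+; movss merges the low dword of the next block, shufps then rotates the
+; four dwords right by one:
+;     __m128 next = _mm_load_ps(src + i + 16);     /* aligned           */
+;     cur = _mm_move_ss(cur, next);                /* replace low dword */
+;     cur = _mm_shuffle_ps(cur, cur, 0x39);        /* = 00111001B       */
+;     _mm_store_ps(dst + i, cur);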
+
+%MACRO  MOVE_UNALIGNED_SSE2_8  1
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [esi+ecx]        ; Read from nearest preceding 16B boundary
+%%L1:  ; Loop. ecx has negative index from the end, counting up to zero
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movsd   xmm0, xmm1             ; Moves 8 bytes, leaves remaining bytes unchanged
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm0        ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [esi+ecx+20H]
+        movsd   xmm1, xmm0
+        shufps  xmm1, xmm1, 01001110B
+        %IF %1 == 0
+        movaps  [edi+ecx+10H], xmm1    ; Save aligned
+        %ELSE
+        movntps [edi+ecx+10H], xmm1    ; Non-temporal save
+        %ENDIF
+        add     ecx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [esi+edx+10H]    ; Read next two blocks aligned
+        movsd   xmm0, xmm1
+        shufps  xmm0, xmm0, 01001110B
+        %IF %1 == 0
+        movaps  [edi+edx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+edx], xmm0        ; Non-temporal save
+        %ENDIF
+        add     edx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+%MACRO  MOVE_UNALIGNED_SSE2_12  1
+; %1 = 1 if non-temporal store desired
+; Special case for u = 12
+        movaps  xmm0, [esi+ecx]        ; Read from nearest preceding 16B boundary
+        shufps  xmm0, xmm0, 10010011B
+%%L1:   ; Loop. ecx has negative index from the end, counting up to zero
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movaps  xmm2, [esi+ecx+20H]
+        shufps  xmm1, xmm1, 10010011B
+        shufps  xmm2, xmm2, 10010011B
+        movaps  xmm3, xmm2
+        movss   xmm2, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
+        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged       
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm1        ; Save aligned
+        movaps  [edi+ecx+10H], xmm2    ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm1        ; Non-temporal save
+        movntps [edi+ecx+10H], xmm2    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, xmm3             ; Save for next iteration        
+        add     ecx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [esi+edx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 10010011B
+        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged       
+        %IF %1 == 0
+        movaps  [edi+edx], xmm1        ; Save aligned
+        %ELSE
+        movntps [edi+edx], xmm1        ; Non-temporal save
+        %ENDIF
+        add     edx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO  MOVE_UNALIGNED_SSSE3  1
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [esi+ecx]        ; Read from nearest preceding 16B boundary
+        
+%%L1:   ; Loop. ecx has negative index from the end, counting up to zero
+        movdqa  xmm2, [esi+ecx+10H]    ; Read next two blocks
+        movdqa  xmm3, [esi+ecx+20H]
+        movdqa  xmm1, xmm0             ; Save xmm0
+        movdqa  xmm0, xmm3             ; Save for next iteration
+        palignr xmm3, xmm2, %1         ; Combine parts into aligned block
+        palignr xmm2, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [edi+ecx], xmm2        ; Save aligned
+        movdqa  [edi+ecx+10H], xmm3    ; Save aligned
+        add     ecx, 20H
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movdqa  xmm2, [esi+edx+10H]
+        palignr xmm2, xmm0, %1
+        movdqa  [edi+edx], xmm2
+        add     edx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes
+        jmp     C200
+%ENDMACRO
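+
+; The same combine done with a single SSSE3 instruction, in intrinsics
+; form (illustrative sketch; u must be an immediate constant here too):
+;     #include <tmmintrin.h>
+;     __m128i lo = _mm_load_si128((__m128i*)(src + i));       /* aligned */
+;     __m128i hi = _mm_load_si128((__m128i*)(src + i + 16));
+;     __m128i r  = _mm_alignr_epi8(hi, lo, u);                /* palignr */
+;     _mm_store_si128((__m128i*)(dst + i), r);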
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+
+; (aligns are inserted manually to minimize the number of 16-bytes
+; boundaries inside loops in the most common cases)
+
+align   16
+D104:   MOVE_UNALIGNED_SSE2_4    0
+D108:   MOVE_UNALIGNED_SSE2_8    0
+align 8
+D10C:   MOVE_UNALIGNED_SSE2_12   0
+D101:   MOVE_UNALIGNED_SSE2 1,   0
+D102:   MOVE_UNALIGNED_SSE2 2,   0
+D103:   MOVE_UNALIGNED_SSE2 3,   0
+D105:   MOVE_UNALIGNED_SSE2 5,   0
+D106:   MOVE_UNALIGNED_SSE2 6,   0
+D107:   MOVE_UNALIGNED_SSE2 7,   0
+D109:   MOVE_UNALIGNED_SSE2 9,   0
+D10A:   MOVE_UNALIGNED_SSE2 0AH, 0
+D10B:   MOVE_UNALIGNED_SSE2 0BH, 0
+D10D:   MOVE_UNALIGNED_SSE2 0DH, 0
+D10E:   MOVE_UNALIGNED_SSE2 0EH, 0
+D10F:   MOVE_UNALIGNED_SSE2 0FH, 0
+        
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align   16
+times 11 nop
+E104:   MOVE_UNALIGNED_SSSE3 4
+times 5 nop
+E108:   MOVE_UNALIGNED_SSSE3 8
+times 5 nop
+E10C:   MOVE_UNALIGNED_SSSE3 0CH
+times 5 nop
+E101:   MOVE_UNALIGNED_SSSE3 1
+times 5 nop
+E102:   MOVE_UNALIGNED_SSSE3 2
+times 5 nop
+E103:   MOVE_UNALIGNED_SSSE3 3
+times 5 nop
+E105:   MOVE_UNALIGNED_SSSE3 5
+times 5 nop
+E106:   MOVE_UNALIGNED_SSSE3 6
+times 5 nop
+E107:   MOVE_UNALIGNED_SSSE3 7
+times 5 nop
+E109:   MOVE_UNALIGNED_SSSE3 9
+times 5 nop
+E10A:   MOVE_UNALIGNED_SSSE3 0AH
+times 5 nop
+E10B:   MOVE_UNALIGNED_SSSE3 0BH
+times 5 nop
+E10D:   MOVE_UNALIGNED_SSSE3 0DH
+times 5 nop
+E10E:   MOVE_UNALIGNED_SSSE3 0EH
+times 5 nop
+E10F:   MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align   8
+F100:   ; Non-temporal move, src and dest have same alignment.
+        ; Loop. ecx has negative index from the end, counting up to zero
+        movaps  xmm0, [esi+ecx]        ; Read
+        movaps  xmm1, [esi+ecx+10H]
+        movntps [edi+ecx], xmm0        ; Write non-temporal (bypass cache)
+        movntps [edi+ecx+10H], xmm1
+        add     ecx, 20H
+        jnz     F100                   ; Loop through negative ecx up to zero
+                
+        ; Move the remaining edx bytes (0 - 31):
+        add     esi, edx
+        add     edi, edx
+        neg     edx
+        jz      C500                   ; Skip if no more data
+        ; Check if we can move one more 16-bytes block
+        cmp     edx, -10H
+        jg      C200
+        ; move 16 bytes, aligned
+        movaps  xmm0, [esi+edx]
+        movntps [edi+edx], xmm0
+        add     edx, 10H
+        ; move the remaining 0 - 15 bytes
+        jmp     C200
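+
+        ; Non-temporal stores in intrinsics form (illustrative sketch):
+        ;     #include <xmmintrin.h>
+        ;     __m128 x = _mm_load_ps(src + i);         /* aligned read      */
+        ;     _mm_stream_ps(dst + i, x);               /* movntps: bypass   */
+        ;                                              /* the cache         */
+        ; (A writer would normally issue _mm_sfence() afterwards if other
+        ;  agents must observe the stores in order.)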
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of 
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104:   MOVE_UNALIGNED_SSE2_4    1
+F108:   MOVE_UNALIGNED_SSE2_8    1
+F10C:   MOVE_UNALIGNED_SSE2_12   1
+F101:   MOVE_UNALIGNED_SSE2 1,   1
+F102:   MOVE_UNALIGNED_SSE2 2,   1
+F103:   MOVE_UNALIGNED_SSE2 3,   1
+F105:   MOVE_UNALIGNED_SSE2 5,   1
+F106:   MOVE_UNALIGNED_SSE2 6,   1
+F107:   MOVE_UNALIGNED_SSE2 7,   1
+F109:   MOVE_UNALIGNED_SSE2 9,   1
+F10A:   MOVE_UNALIGNED_SSE2 0AH, 1
+F10B:   MOVE_UNALIGNED_SSE2 0BH, 1
+F10D:   MOVE_UNALIGNED_SSE2 0DH, 1
+F10E:   MOVE_UNALIGNED_SSE2 0EH, 1
+F10F:   MOVE_UNALIGNED_SSE2 0FH, 1
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
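+; (The call leaves the caller's return address on the stack; loading it
+;  from [esp] is the standard way for 32-bit code, which cannot read EIP
+;  directly, to obtain an address for position-independent addressing.)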
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for old processors without SSE2
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 8
+; 80386 version used when SSE2 not supported:
+_memcpy386:  ; global label
+memcpy386@:  ; local label
+        PROLOGM
+; edi = dest
+; esi = src
+; ecx = count
+        cld
+        cmp     ecx, 8
+        jb      G500
+G100:   test    edi, 1
+        jz      G200
+        movsb
+        dec     ecx
+G200:   test    edi, 2
+        jz      G300
+        movsw
+        sub     ecx, 2
+G300:   ; edi is aligned now
+        mov     edx, ecx
+        shr     ecx, 2
+        rep     movsd                  ; move 4 bytes at a time
+        mov     ecx, edx
+        and     ecx, 3
+        rep     movsb                  ; move remaining 0-3 bytes
+        RETURNM
+        
+G500:   ; count < 8. Move one byte at a time
+        rep     movsb                  ; move count bytes
+        RETURNM
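+
+        ; Rough C equivalent of the 80386 strategy above (illustrative
+        ; sketch; d, s, n are hypothetical names for dest, src, count):
+        ;     if (n >= 8) {
+        ;         if ((uintptr_t)d & 1) { *d++ = *s++; n--; }
+        ;         if ((uintptr_t)d & 2) { *d++ = *s++; *d++ = *s++; n -= 2; }
+        ;         for (; n >= 4; n -= 4) {             /* rep movsd */
+        ;             *(uint32_t*)d = *(const uint32_t*)s; d += 4; s += 4;
+        ;         }
+        ;     }
+        ;     while (n--) *d++ = *s++;                 /* rep movsb */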
+        
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CPU dispatching for memcpy. This is executed only once
+memcpyCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        pushad
+        ; set _CacheBypassLimit to half the size of the largest level cache
+        call    GetMemcpyCacheLimit@
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of memcpy
+        mov     esi, memcpy386@
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memcpy
+        mov     esi, memcpySSE2@
+        cmp     eax, 6                 ; check Suppl-SSE3
+        jb      Q100
+        ; Suppl-SSE3 supported
+        ; Point to SSSE3 version of memcpy
+        mov     esi, memcpySSSE3@
+        call    _UnalignedIsFaster     ; Test if unaligned read is faster than aligned read and shift
+        test    eax, eax
+        jz      Q100
+        ; Point to unaligned version of memcpy
+        mov     esi, memcpyU@
+        call    _Store256BitIsFaster   ; Test if 256-bit read/write is available and faster than 128-bit read/write
+        test    eax, eax
+        jz      Q100
+        mov     esi, memcpyU256@
+Q100:   
+        mov     [memcpyDispatch], esi
+        popad
+        ; Continue in appropriate version of memcpy
+        jmp     [memcpyDispatch]
+
+%ELSE   ; Position-independent version
+        pushad
+        mov     ebx, edx               ; reference point
+        ; set _CacheBypassLimit to half the size of the largest level cache
+        call    GetMemcpyCacheLimit@
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of memcpy
+        lea     esi, [ebx+memcpy386@-RP]
+        cmp     eax, 4                  ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memcpy
+        lea     esi, [ebx+memcpySSE2@-RP]
+        cmp     eax, 6                  ; check Suppl-SSE3
+        jb      Q100
+        ; Suppl-SSE3 supported
+        ; Point to SSSE3 version of memcpy
+        lea     esi, [ebx+memcpySSSE3@-RP]
+        call    _UnalignedIsFaster      ; Test if unaligned read is faster than aligned read and shift
+        test    eax, eax
+        jz      Q100
+        ; Point to unaligned version of memcpy
+        lea     esi, [ebx+memcpyU@-RP]
+        call    _Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
+        test    eax, eax
+        jz      Q100
+        lea     esi, [ebx+memcpyU256@-RP]
+Q100:   ; insert appropriate pointer
+        mov     dword [ebx+memcpyDispatch-RP], esi
+        popad
+        ; Continue in appropriate version of memcpy
+        jmp     [edx+memcpyDispatch-RP]        
+%ENDIF
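+
+; The dispatch-once pattern above, sketched in C (illustrative; the names
+; are hypothetical, not the library's):
+;     static void * memcpy_dispatch(void *d, const void *s, size_t n);
+;     static void * (*memcpy_ptr)(void *, const void *, size_t)
+;             = memcpy_dispatch;
+;     static void * memcpy_dispatch(void *d, const void *s, size_t n) {
+;         memcpy_ptr = choose_best_version();  /* runs only once         */
+;         return memcpy_ptr(d, s, n);          /* continue in chosen one */
+;     }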
+
+; extern "C" size_t GetMemcpyCacheLimit();
+_GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@:  ; local label
+        push    ebx
+%ifdef  POSITIONINDEPENDENT
+        call    get_thunk_edx
+        lea     ebx, [edx + _CacheBypassLimit - $]
+%else
+        mov     ebx, _CacheBypassLimit
+%endif
+        mov     eax, [ebx]
+        test    eax, eax
+        jnz     U200
+        ; Get half the size of the largest level cache
+        push    0                      ; 0 means largest level cache
+        call    _DataCacheSize         ; get cache size
+        pop     ecx
+        shr     eax, 1                 ; half the size
+        jnz     U100
+        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
+U100:   mov     [ebx], eax
+U200:   pop     ebx
+        ret
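+
+; In C terms (illustrative sketch; DataCacheSize(0) returns the size of
+; the largest-level cache):
+;     size_t GetMemcpyCacheLimit(void) {
+;         if (_CacheBypassLimit == 0) {
+;             _CacheBypassLimit = DataCacheSize(0) / 2;
+;             if (_CacheBypassLimit == 0)
+;                 _CacheBypassLimit = 0x400000;    /* default: 4 Mbytes */
+;         }
+;         return _CacheBypassLimit;
+;     }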
+        
+; Called internally from _SetMemcpyCacheLimit defined in memmove32.asm
+; Must return the value set
+_SetMemcpyCacheLimit1:
+        push    ebx
+%ifdef  POSITIONINDEPENDENT
+        call    get_thunk_edx
+        lea     ebx, [edx + _CacheBypassLimit - $]
+%else
+        mov     ebx, _CacheBypassLimit
+%endif
+        mov     eax, [esp+8]
+        test    eax, eax
+        jnz     U400
+        ; zero, means default
+        mov     [ebx], eax
+        call    GetMemcpyCacheLimit@
+U400:   
+        mov     [ebx], eax
+        pop     ebx
+        ret
+        
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                   getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+        mov     eax, [memcpyDispatch]
+        ret
+
+global getDispatch
+                
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU dispatcher replaces AlignmentDispatchSSE2 with 
+; AlignmentDispatchSSSE3 if Suppl-SSE3 is supported
+; RP = reference point if position-independent code, otherwise RP = 0
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DD C100-RP, D101-RP, D102-RP, D103-RP, D104-RP, D105-RP, D106-RP, D107-RP
+DD D108-RP, D109-RP, D10A-RP, D10B-RP, D10C-RP, D10D-RP, D10E-RP, D10F-RP
+
+; Code pointer for each alignment for Suppl.SSE3 instruction set
+AlignmentDispatchSSSE3:
+DD C100-RP, E101-RP, E102-RP, E103-RP, E104-RP, E105-RP, E106-RP, E107-RP
+DD E108-RP, E109-RP, E10A-RP, E10B-RP, E10C-RP, E10D-RP, E10E-RP, E10F-RP
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DD F100-RP, F101-RP, F102-RP, F103-RP, F104-RP, F105-RP, F106-RP, F107-RP
+DD F108-RP, F109-RP, F10A-RP, F10B-RP, F10C-RP, F10D-RP, F10E-RP, F10F-RP
+
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch: DD memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > _CacheBypassLimit
+; The optimal value of _CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+_CacheBypassLimit: DD 0
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
diff --git a/asmlibSrc/memcpy64.asm b/asmlibSrc/memcpy64.asm
new file mode 100755
index 0000000..e112153
--- /dev/null
+++ b/asmlibSrc/memcpy64.asm
@@ -0,0 +1,1313 @@
+;*************************  memcpy64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2013-09-11
+;
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is 
+; bigger than half the size of the largest-level cache. This limit can be
+; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
+; C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit();  // in memcpy64.asm
+; extern "C" size_t SetMemcpyCacheLimit(size_t limit); // in memmove64.asm
+; extern "C" void SetMemcpyCacheLimit1();   // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching includes SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
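+
+; Example use from C (illustrative sketch; assumes the library's header,
+; here called asmlib.h, declares the prototype shown above):
+;     #include "asmlib.h"
+;     char dst[64], src[64];
+;     A_memcpy(dst, src, sizeof dst);    /* returns dst */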
+
+default rel
+
+global A_memcpy: function              ; Function A_memcpy
+global ?OVR_memcpy: function           ; ?OVR removed if standard function memcpy overridden
+global memcpySSE2: function            ; Version for processors with only SSE2
+global memcpySSSE3: function           ; Version for processors with SSSE3
+global memcpyU: function               ; Version for processors with fast unaligned read
+global memcpyU256: function            ; Version for processors with fast 256-bit read/write
+
+global GetMemcpyCacheLimit: function   ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global SetMemcpyCacheLimit1: function  ; Set the size limit for bypassing cache when copying with memcpy
+
+
+; Imported from instrset64.asm
+extern InstructionSet                  ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster               ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from cachesize64.asm:
+extern DataCacheSize                   ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO  PROLOGM  0
+%IFDEF  WINDOWS
+        push    rsi
+        push    rdi
+        mov     rdi, rcx               ; dest
+        mov     r9,  rcx               ; dest
+        mov     rsi, rdx               ; src
+        mov     rcx, r8                ; count
+%ELSE   ; Unix
+        mov     rcx, rdx               ; count
+        mov     r9,  rdi               ; dest
+%ENDIF
+%ENDM
+
+; Define return from this function
+%MACRO  RETURNM  0
+%IFDEF  WINDOWS
+        pop     rdi
+        pop     rsi
+%ENDIF
+        mov     rax, r9                ; Return value = dest
+        ret
+%ENDM
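+
+; (Win64 passes dest, src, count in rcx, rdx, r8; System V x86-64 passes
+;  them in rdi, rsi, rdx. The macros above normalize both conventions to
+;  rdi = dest, rsi = src, rcx = count, with r9 holding the return value.)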
+
+
+SECTION .text  align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                          Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+A_memcpy:
+?OVR_memcpy:
+        jmp     qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+memcpyU256:   ; global label
+memcpyU256@:  ; local label
+        PROLOGM
+        cmp     rcx, 40H
+        jb      A1000                  ; Use simpler code if count < 64
+
+        ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 1FH
+        jz      B3100                    ; Skip if dest aligned by 32
+        
+        ; edx = size of first partial block, 1 - 31 bytes
+        test    dl, 3
+        jz      B3030
+        test    dl, 1
+        jz      B3020
+        ; move 1 byte
+        movzx   eax, byte [rsi]
+        mov     [rdi], al
+        inc     rsi
+        inc     rdi
+B3020:  test    dl, 2
+        jz      B3030
+        ; move 2 bytes
+        movzx   eax, word [rsi]
+        mov     [rdi], ax
+        add     rsi, 2
+        add     rdi, 2
+B3030:  test    dl, 4
+        jz      B3040
+        ; move 4 bytes
+        mov     eax, [rsi]
+        mov     [rdi], eax
+        add     rsi, 4
+        add     rdi, 4
+B3040:  test    dl, 8
+        jz      B3050
+        ; move 8 bytes
+        mov     rax, [rsi]
+        mov     [rdi], rax
+        add     rsi, 8
+        add     rdi, 8
+B3050:  test    dl, 16
+        jz      B3060        
+        ; move 16 bytes
+        movups  xmm0, [rsi]
+        movaps  [rdi], xmm0
+        add     rsi, 16
+        add     rdi, 16
+B3060:  sub     rcx, rdx
+
+B3100:  ; Now dest is aligned by 32. Any partial block has been moved        
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     rdx, rcx               ; Save count
+        and     rcx, -20H              ; Round down to nearest multiple of 32
+        add     rsi, rcx               ; Point to the end
+        add     rdi, rcx               ; Point to the end
+        sub     rdx, rcx               ; Remaining data after loop
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      I3100                  ; Use non-temporal store if count > CacheBypassLimit
+        neg     rcx                    ; Negative index from the end        
+
+H3100:  ; copy -rcx bytes in blocks of 32 bytes.
+
+        ; Check for false memory dependence: The CPU may falsely assume
+        ; a partial overlap between the written destination and the following
+        ; read source if source is unaligned and
+        ; (src-dest) modulo 4096  is close to 4096
+        test    sil, 1FH
+        jz      H3110                  ; aligned
+        mov     eax, esi
+        sub     eax, edi
+        and     eax, 0FFFH             ; modulo 4096
+        cmp     eax, 1000H - 200H
+        ja      J3100
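+        ; The test above in C form (illustrative sketch):
+        ;     if ((((uintptr_t)src - (uintptr_t)dest) & 0xFFF) > 0xE00)
+        ;         goto copy_backwards;   /* likely 4K false dependence */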
+
+align 16
+H3110:  ; main copy loop, 32 bytes at a time
+        ; rcx has negative index from the end, counting up to zero
+        vmovups ymm0, [rsi+rcx]
+        vmovaps [rdi+rcx], ymm0
+        add     rcx, 20H
+        jnz     H3110
+        vzeroupper                     ; end of AVX mode
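+        ; (vzeroupper clears the upper halves of the YMM registers to
+        ;  avoid AVX-to-SSE transition penalties in the SSE code below.)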
+        
+H3120:  ; Move the remaining edx bytes (0 - 31):
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        jz      H3500                  ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      H3200
+        ; move 16 bytes
+        movups  xmm0, [rsi+rdx]
+        movaps  [rdi+rdx], xmm0
+        add     rdx, 10H
+H3200:  cmp     edx, -8
+        jg      H3210        
+        ; move 8 bytes
+        movq    xmm0, qword [rsi+rdx]
+        movq    qword [rdi+rdx], xmm0
+        add     rdx, 8 
+        jz      H3500                  ; Early skip if count divisible by 8
+H3210:  cmp     edx, -4
+        jg      H3220        
+        ; move 4 bytes
+        mov     eax, [rsi+rdx]
+        mov     [rdi+rdx], eax
+        add     rdx, 4        
+H3220:  cmp     edx, -2
+        jg      H3230        
+        ; move 2 bytes
+        movzx   eax, word [rsi+rdx]
+        mov     [rdi+rdx], ax
+        add     rdx, 2
+H3230:  cmp     edx, -1
+        jg      H3500        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rdx]
+        mov     [rdi+rdx], al
+H3500:  ; finished     
+        RETURNM
+        
+I3100:   ; non-temporal move
+        neg     rcx                    ; Negative index from the end      
+
+align 16
+I3110:  ; main copy loop, 32 bytes at a time
+        ; rcx has negative index from the end, counting up to zero
+        vmovups ymm0, [rsi+rcx]
+        vmovntps [rdi+rcx], ymm0
+        add     rcx, 20H
+        jnz     I3110
+        vzeroupper                      ; end of AVX mode
+        jmp     H3120                  ; Move the remaining edx bytes (0 - 31)
+                
+
+align 16
+J3100:  ; There is a false memory dependence.
+        ; check if src and dest overlap, if not then it is safe 
+        ; to copy backwards to avoid false memory dependence
+%if 1   
+        ; Use this version if you want consistent behavior in the case
+        ; where dest > src and overlap. However, this case is undefined
+        ; anyway because part of src is overwritten before copying     
+        push    rdx
+        mov     rax, rsi
+        sub     rax, rdi
+        cqo
+        xor     rax, rdx
+        sub     rax, rdx   ; abs(src-dest)
+        neg     rcx        ; size
+        pop     rdx        ; restore rdx
+        cmp     rax, rcx
+        jnb     J3110
+        neg     rcx        ; restore rcx
+        jmp     H3110      ; overlap between src and dest. Can't copy backwards
+%else
+        ; save time by not checking the case that is undefined anyway         
+        mov     rax, rsi
+        sub     rax, rdi
+        neg     rcx        ; size
+        cmp     rax, rcx
+        jnb     J3110      ; OK to copy backwards
+        ; must copy forwards
+        neg     rcx        ; restore rcx
+        jmp     H3110      ; copy forwards
+
+%endif
+        
+J3110:   ; copy backwards, rcx = size. rsi, rdi = end of src, dest        
+        push    rsi
+        push    rdi
+        sub     rsi, rcx
+        sub     rdi, rcx
+J3120:  ; loop backwards
+        vmovups ymm0, [rsi+rcx-20H]
+        vmovaps [rdi+rcx-20H], ymm0
+        sub     rcx, 20H
+        jnz     J3120
+        vzeroupper
+        pop     rdi
+        pop     rsi
+        jmp     H3120
+        
+align 16
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        ; multiple CPU versions (SSSE3 and above)
+A1000:  add     rsi, rcx               ; end of src
+        add     rdi, rcx               ; end of dest
+        neg     rcx                    ; negative index from the end
+        cmp     ecx, -20H
+        jg      A1100        
+        ; move 32 bytes
+        ; movdqu is faster than 64-bit moves on processors with SSSE3
+        movups  xmm0, [rsi+rcx]
+        movups  xmm1, [rsi+rcx+10H]
+        movups  [rdi+rcx], xmm0
+        movups  [rdi+rcx+10H], xmm1
+        add     rcx, 20H
+A1100:  cmp     ecx, -10H        
+        jg      A1200
+        ; move 16 bytes
+        movups  xmm0, [rsi+rcx]
+        movups  [rdi+rcx], xmm0
+        add     rcx, 10H
+A1200:  cmp     ecx, -8        
+        jg      A1300
+        ; move 8 bytes
+        mov     rax, qword [rsi+rcx]
+        mov     qword [rdi+rcx], rax
+        add     rcx, 8
+A1300:  cmp     ecx, -4        
+        jg      A1400
+        ; move 4 bytes
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        add     rcx, 4
+        jz      A1900                     ; early out if count divisible by 4
+A1400:  cmp     ecx, -2        
+        jg      A1500
+        ; move 2 bytes
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+        add     rcx, 2
+A1500:  cmp     ecx, -1
+        jg      A1900        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al
+A1900:  ; finished
+        RETURNM        
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+memcpyU:   ; global label
+memcpyU@:  ; local label
+        PROLOGM
+        cmp     rcx, 40H
+        jb      A1000                  ; Use simpler code if count < 64
+
+        ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B2100                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B2030
+        test    dl, 1
+        jz      B2020
+        ; move 1 byte
+        movzx   eax, byte [rsi]
+        mov     [rdi], al
+        inc     rsi
+        inc     rdi
+B2020:  test    dl, 2
+        jz      B2030
+        ; move 2 bytes
+        movzx   eax, word [rsi]
+        mov     [rdi], ax
+        add     rsi, 2
+        add     rdi, 2
+B2030:  test    dl, 4
+        jz      B2040
+        ; move 4 bytes
+        mov     eax, [rsi]
+        mov     [rdi], eax
+        add     rsi, 4
+        add     rdi, 4
+B2040:  test    dl, 8
+        jz      B2050
+        ; move 8 bytes
+        mov     rax, [rsi]
+        mov     [rdi], rax
+        add     rsi, 8
+        add     rdi, 8
+B2050:  sub     rcx, rdx
+B2100:  ; Now dest is aligned by 16. Any partial block has been moved        
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     rdx, rcx               ; Save count
+        and     rcx, -20H              ; Round down to nearest multiple of 32
+        add     rsi, rcx               ; Point to the end
+        add     rdi, rcx               ; Point to the end
+        sub     rdx, rcx               ; Remaining data after loop
+      
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      I100                   ; Use non-temporal store if count > CacheBypassLimit
+        neg     rcx                    ; Negative index from the end      
+
+H100:   ; copy -rcx bytes in blocks of 32 bytes.
+
+        ; Check for false memory dependence: The CPU may falsely assume
+        ; a partial overlap between the written destination and the following
+        ; read source if source is unaligned and
+        ; (src-dest) modulo 4096 is close to 4096
+        test    sil, 0FH
+        jz      H110                   ; aligned
+        mov     eax, esi
+        sub     eax, edi
+        and     eax, 0FFFH             ; modulo 4096
+        cmp     eax, 1000H - 200H
+        ja      J100
+
+H110:   ; main copy loop, 32 bytes at a time
+        ; rcx has negative index from the end, counting up to zero
+        movups  xmm0, [rsi+rcx]
+        movups  xmm1, [rsi+rcx+10H]
+        movaps  [rdi+rcx], xmm0
+        movaps  [rdi+rcx+10H], xmm1
+        add     rcx, 20H
+        jnz     H110
+        
+H120:   ; Move the remaining edx bytes (0 - 31):
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        jz      H500                   ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      H200
+        ; move 16 bytes
+        movups  xmm0, [rsi+rdx]
+        movaps  [rdi+rdx], xmm0
+        add     rdx, 10H
+H200:   cmp     edx, -8
+        jg      H210        
+        ; move 8 bytes
+        movq    xmm0, qword [rsi+rdx]
+        movq    qword [rdi+rdx], xmm0
+        add     rdx, 8 
+        jz      H500                   ; Early skip if count divisible by 8       
+H210:   cmp     edx, -4
+        jg      H220        
+        ; move 4 bytes
+        mov     eax, [rsi+rdx]
+        mov     [rdi+rdx], eax
+        add     rdx, 4        
+H220:   cmp     edx, -2
+        jg      H230        
+        ; move 2 bytes
+        movzx   eax, word [rsi+rdx]
+        mov     [rdi+rdx], ax
+        add     rdx, 2
+H230:   cmp     edx, -1
+        jg      H500        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rdx]
+        mov     [rdi+rdx], al
+H500:   ; finished     
+        RETURNM
+        
+I100:   ; non-temporal move
+        neg     rcx                    ; Negative index from the end      
+
+align 16
+I110:   ; main copy loop, 32 bytes at a time
+        ; rcx has negative index from the end, counting up to zero
+        movups  xmm0, [rsi+rcx]
+        movups  xmm1, [rsi+rcx+10H]
+        movntps [rdi+rcx], xmm0
+        movntps [rdi+rcx+10H], xmm1
+        add     rcx, 20H
+        jnz     I110
+        jmp     H120                  ; Move the remaining edx bytes (0 - 31):
+        
+
+align 16
+J100:   ; There is a false memory dependence.
+        ; check if src and dest overlap, if not then it is safe 
+        ; to copy backwards to avoid false memory dependence
+%if 1   
+        ; Use this version if you want consistent behavior in the case
+        ; where dest > src and overlap. However, this case is undefined
+        ; anyway because part of src is overwritten before copying     
+        push    rdx
+        mov     rax, rsi
+        sub     rax, rdi
+        cqo
+        xor     rax, rdx
+        sub     rax, rdx   ; abs(src-dest)
+        neg     rcx        ; size
+        pop     rdx        ; restore rdx
+        cmp     rax, rcx
+        jnb     J110
+        neg     rcx        ; restore rcx
+        jmp     H110       ; overlap between src and dest. Can't copy backwards
+%else
+        ; save time by not checking the case that is undefined anyway         
+        mov     rax, rsi
+        sub     rax, rdi
+        neg     rcx        ; size
+        cmp     rax, rcx
+        jnb     J110       ; OK to copy backwards
+        ; must copy forwards
+        neg     rcx        ; restore rcx
+        jmp     H110       ; copy forwards
+
+%endif
+        
+J110:   ; copy backwards, rcx = size. rsi, rdi = end of src, dest        
+        push    rsi
+        push    rdi
+        sub     rsi, rcx
+        sub     rdi, rcx
+J120:   ; loop backwards
+        movups  xmm1, [rsi+rcx-20H]
+        movups  xmm0, [rsi+rcx-10H]
+        movaps  [rdi+rcx-20H], xmm1
+        movaps  [rdi+rcx-10H], xmm0
+        sub     rcx, 20H
+        jnz     J120
+        pop     rdi
+        pop     rsi
+        jmp     H120
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpySSSE3:     ; global label
+memcpySSSE3@:    ; local label
+        PROLOGM
+        cmp     rcx, 40H
+        jb      A1000                  ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B1200                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B1030
+        test    dl, 1
+        jz      B1020
+        ; move 1 byte
+        movzx   eax, byte [rsi]
+        mov     [rdi], al
+        inc     rsi
+        inc     rdi
+B1020:  test    dl, 2
+        jz      B1030
+        ; move 2 bytes
+        movzx   eax, word [rsi]
+        mov     [rdi], ax
+        add     rsi, 2
+        add     rdi, 2
+B1030:  test    dl, 4
+        jz      B1040
+        ; move 4 bytes
+        mov     eax, [rsi]
+        mov     [rdi], eax
+        add     rsi, 4
+        add     rdi, 4
+B1040:  test    dl, 8
+        jz      B1050
+        ; move 8 bytes
+        mov     rax, [rsi]
+        mov     [rdi], rax
+        add     rsi, 8
+        add     rdi, 8
+B1050:  sub     rcx, rdx
+B1200:  ; Now dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of src modulo 16 at this point:
+        mov     eax, esi
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count (lower 32 bits)
+        and     rcx, -20H              ; Round down count to nearest multiple of 32
+        add     rsi, rcx               ; Point to the end
+        add     rdi, rcx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop (0-31)
+        sub     rsi, rax               ; Nearest preceding aligned block of src
+
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > CacheBypassLimit
+        neg     rcx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [AlignmentDispatchSSSE3]
+        jmp     near [r8+rax*8]
+
+B1400:  neg     rcx
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [AlignmentDispatchNT]
+        jmp     near [r8+rax*8]
+
+
+align   16
+C100:   ; Code for aligned src. SSE2 and SSSE3 versions
+        ; The nice case, src and dest have same alignment.
+
+        ; Loop. rcx has negative index from the end, counting up to zero
+        movaps  xmm0, [rsi+rcx]
+        movaps  xmm1, [rsi+rcx+10H]
+        movaps  [rdi+rcx], xmm0
+        movaps  [rdi+rcx+10H], xmm1
+        add     rcx, 20H
+        jnz     C100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        jz      C500                   ; Skip if no more data
+        ; move 16-8-4-2-1 bytes, aligned
+        cmp     edx, -10H
+        jg      C200
+        ; move 16 bytes
+        movaps  xmm0, [rsi+rdx]
+        movaps  [rdi+rdx], xmm0
+        add     rdx, 10H
+C200:   cmp     edx, -8
+        jg      C210        
+        ; move 8 bytes
+        mov     rax, [rsi+rdx]
+        mov     [rdi+rdx], rax
+        add     rdx, 8 
+        jz      C500                   ; Early skip if count divisible by 8       
+C210:   cmp     edx, -4
+        jg      C220        
+        ; move 4 bytes
+        mov     eax, [rsi+rdx]
+        mov     [rdi+rdx], eax
+        add     rdx, 4        
+C220:   cmp     edx, -2
+        jg      C230        
+        ; move 2 bytes
+        movzx   eax, word [rsi+rdx]
+        mov     [rdi+rdx], ax
+        add     rdx, 2
+C230:   cmp     edx, -1
+        jg      C500        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rdx]
+        mov     [rdi+rdx], al
+C500:   ; finished     
+        RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpySSE2:     ; global label
+memcpySSE2@:    ; local label
+        PROLOGM
+        cmp     rcx, 40H
+        jae     B0100                   ; Use complex code if count >= 64; simpler code below
+        
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        add     rsi, rcx               ; end of src
+        add     rdi, rcx               ; end of dest
+        neg     rcx                    ; negative index from the end
+        cmp     ecx, -20H
+        jg      A100        
+        ; move 32 bytes
+        ; mov r64 is faster than movdqu on older Intel processors
+        ; movdqu is fast on Nehalem and later
+        mov     rax, [rsi+rcx]
+        mov     rdx, [rsi+rcx+8]
+        mov     [rdi+rcx], rax
+        mov     [rdi+rcx+8], rdx
+        mov     rax, qword [rsi+rcx+10H]
+        mov     rdx, qword [rsi+rcx+18H]
+        mov     qword [rdi+rcx+10H], rax
+        mov     qword [rdi+rcx+18H], rdx
+        add     rcx, 20H
+A100:   cmp     ecx, -10H        
+        jg      A200
+        ; move 16 bytes
+        mov     rax, [rsi+rcx]
+        mov     rdx, [rsi+rcx+8]
+        mov     [rdi+rcx], rax
+        mov     [rdi+rcx+8], rdx
+        add     rcx, 10H
+A200:   cmp     ecx, -8        
+        jg      A300
+        ; move 8 bytes
+        mov     rax, qword [rsi+rcx]
+        mov     qword [rdi+rcx], rax
+        add     rcx, 8
+A300:   cmp     ecx, -4        
+        jg      A400
+        ; move 4 bytes
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        add     rcx, 4
+        jz      A900                     ; early out if count divisible by 4
+A400:   cmp     ecx, -2        
+        jg      A500
+        ; move 2 bytes
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+        add     rcx, 2
+A500:   cmp     ecx, -1
+        jg      A900        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al
+A900:   ; finished
+        RETURNM        
+        
+B0100:  ; count >= 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B0200                    ; Skip if dest aligned by 16
+        
+        ; edx = size of first partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B0030
+        test    dl, 1
+        jz      B0020
+        ; move 1 byte
+        movzx   eax, byte [rsi]
+        mov     [rdi], al
+        inc     rsi
+        inc     rdi
+B0020:  test    dl, 2
+        jz      B0030
+        ; move 2 bytes
+        movzx   eax, word [rsi]
+        mov     [rdi], ax
+        add     rsi, 2
+        add     rdi, 2
+B0030:  test    dl, 4
+        jz      B0040
+        ; move 4 bytes
+        mov     eax, [rsi]
+        mov     [rdi], eax
+        add     rsi, 4
+        add     rdi, 4
+B0040:  test    dl, 8
+        jz      B0050
+        ; move 8 bytes
+        mov     rax, [rsi]
+        mov     [rdi], rax
+        add     rsi, 8
+        add     rdi, 8
+B0050:  sub     rcx, rdx
+B0200:  ; Now dest is aligned by 16. Any partial block has been moved        
+
+        ; This part will not always work if count < 64
+        ; Calculate size of first block up to first regular boundary of dest
+        mov     edx, edi
+        neg     edx
+        and     edx, 0FH
+        jz      B300                    ; Skip if dest aligned by 16
+        
+        ; rdx = size of first partial block, 1 - 15 bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        sub     rcx, rdx
+        neg     rdx
+        cmp     edx, -8
+        jg      B200
+        ; move 8 bytes
+        mov     rax, [rsi+rdx]
+        mov     [rdi+rdx], rax
+        add     rdx, 8
+B200:   cmp     edx, -4        
+        jg      B210
+        ; move 4 bytes
+        mov     eax, [rsi+rdx]
+        mov     [rdi+rdx], eax
+        add     rdx, 4
+        jz      B300              ; early out if aligned by 4
+B210:   cmp     edx, -2        
+        jg      B220
+        ; move 2 bytes
+        movzx   eax, word [rsi+rdx]
+        mov     [rdi+rdx], ax
+        add     rdx, 2
+B220:   cmp     edx, -1
+        jg      B300
+        ; move 1 byte
+        movzx   eax, byte [rsi+rdx]
+        mov     [rdi+rdx], al
+        
+B300:   ; Now dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of src modulo 16 at this point:
+        mov     eax, esi
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count (lower 32 bits)
+        and     rcx, -20H              ; Round down count to nearest multiple of 32
+        add     rsi, rcx               ; Point to the end
+        add     rdi, rcx               ; Point to the end
+        sub     edx, ecx               ; Remaining data after loop (0-31)
+        sub     rsi, rax               ; Nearest preceding aligned block of src
+
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      B400                   ; Use non-temporal store if count > CacheBypassLimit
+        neg     rcx                    ; Negative index from the end
+        
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [AlignmentDispatchSSE2]
+        jmp     near [r8+rax*8]
+
+B400:   neg     rcx
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [AlignmentDispatchNT]
+        jmp     near [r8+rax*8]
+
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+       
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO  MOVE_UNALIGNED_SSE2  2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
+%%L1:  ; Loop. rcx has negative index from the end, counting up to zero
+        movdqa  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movdqa  xmm2, [rsi+rcx+20H]
+        movdqa  xmm3, xmm1             ; Copy because used twice
+        psrldq  xmm0, %1               ; shift right
+        pslldq  xmm1, 16-%1            ; shift left
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [rdi+rcx], xmm0        ; non-temporal save
+        %ENDIF
+        movdqa  xmm0, xmm2             ; Save for next iteration
+        psrldq  xmm3, %1               ; shift right
+        pslldq  xmm2, 16-%1            ; shift left
+        por     xmm3, xmm2             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rcx+10H], xmm3    ; Save aligned
+        %ELSE
+        movntdq [rdi+rcx+10H], xmm3    ; non-temporal save
+        %ENDIF
+        add     rcx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movdqa  xmm1, [rsi+rdx+10H]
+        psrldq  xmm0, %1               ; shift right
+        pslldq  xmm1, 16-%1            ; shift left
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rdx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [rdi+rdx], xmm0        ; non-temporal save
+        %ENDIF        
+        add     rdx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
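+
+; As an aside, a minimal C sketch of the combine step in the loop above
+; (illustration only, not part of the library; assumes u is known at
+; compile time and dest is 16-bytes aligned):
+;
+;   #include <emmintrin.h>
+;   enum { u = 5 };                        /* example shift count */
+;   /* boundary = src rounded down to the preceding 16-bytes boundary */
+;   static void combine_block(char *dest, const char *boundary)
+;   {
+;       __m128i lo = _mm_load_si128((const __m128i *)boundary);
+;       __m128i hi = _mm_load_si128((const __m128i *)(boundary + 16));
+;       lo = _mm_srli_si128(lo, u);        /* like psrldq: drop first u bytes */
+;       hi = _mm_slli_si128(hi, 16 - u);   /* like pslldq: keep first u bytes */
+;       _mm_store_si128((__m128i *)dest,
+;                       _mm_or_si128(lo, hi));   /* like por + aligned store  */
+;   }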
+
+
+%MACRO  MOVE_UNALIGNED_SSE2_4  1 ; nt
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
+%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movss   xmm0, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
+        shufps  xmm0, xmm0, 00111001B  ; Rotate
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm0        ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [rsi+rcx+20H]
+        movss   xmm1, xmm0
+        shufps  xmm1, xmm1, 00111001B
+        %IF %1 == 0
+        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
+        %ELSE
+        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
+        %ENDIF
+        add     rcx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [rsi+rdx+10H]    ; Read next two blocks aligned
+        movss   xmm0, xmm1
+        shufps  xmm0, xmm0, 00111001B
+        %IF %1 == 0
+        movaps  [rdi+rdx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rdx], xmm0        ; Non-temporal save
+        %ENDIF
+        add     rdx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
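+
+; A minimal C sketch of the u = 4 trick above (illustration only): with the
+; current aligned block a = (a0,a1,a2,a3) and the next block b = (b0,b1,b2,b3)
+; as dword lanes, low lane first:
+;
+;   #include <xmmintrin.h>
+;   static __m128 combine_u4(__m128 a, __m128 b)
+;   {
+;       a = _mm_move_ss(a, b);              /* movss:  (b0,a1,a2,a3) */
+;       return _mm_shuffle_ps(a, a,
+;                  _MM_SHUFFLE(0,3,2,1));   /* shufps: (a1,a2,a3,b0) */
+;   }
+;
+; The result is source bytes 4 - 19 relative to the 16-bytes boundary, i.e.
+; one aligned destination block; the u = 8 and u = 12 cases below use movsd
+; and a pre-rotation in the same way.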
+
+
+%MACRO  MOVE_UNALIGNED_SSE2_8  1 ; nt
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
+%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movsd   xmm0, xmm1             ; Moves 8 bytes, leaves remaining bytes unchanged
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm0        ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [rsi+rcx+20H]
+        movsd   xmm1, xmm0
+        shufps  xmm1, xmm1, 01001110B
+        %IF %1 == 0
+        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
+        %ELSE
+        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
+        %ENDIF
+        add     rcx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [rsi+rdx+10H]    ; Read next two blocks aligned
+        movsd   xmm0, xmm1
+        shufps  xmm0, xmm0, 01001110B
+        %IF %1 == 0
+        movaps  [rdi+rdx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rdx], xmm0        ; Non-temporal save
+        %ENDIF
+        add     rdx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+%MACRO  MOVE_UNALIGNED_SSE2_12  1 ; nt
+; Special case for u = 12
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
+        shufps  xmm0, xmm0, 10010011B
+%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movaps  xmm2, [rsi+rcx+20H]
+        shufps  xmm1, xmm1, 10010011B
+        shufps  xmm2, xmm2, 10010011B
+        movaps  xmm3, xmm2
+        movss   xmm2, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
+        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged       
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm1        ; Save aligned
+        movaps  [rdi+rcx+10H], xmm2    ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm1        ; Non-temporal save
+        movntps [rdi+rcx+10H], xmm2    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, xmm3             ; Save for next iteration        
+        add     rcx, 20H               ; Loop through negative values up to zero
+        jnz     %%L1        
+        ; Set up for edx remaining bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movaps  xmm1, [rsi+rdx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 10010011B
+        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged       
+        %IF %1 == 0
+        movaps  [rdi+rdx], xmm1        ; Save aligned
+        %ELSE
+        movntps [rdi+rdx], xmm1        ; Non-temporal save
+        %ENDIF
+        add     rdx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3  1 ; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
+        
+%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
+        movdqa  xmm2, [rsi+rcx+10H]    ; Read next two blocks
+        movdqa  xmm3, [rsi+rcx+20H]
+        movdqa  xmm1, xmm0             ; Save xmm0
+        movdqa  xmm0, xmm3             ; Save for next iteration
+        palignr xmm3, xmm2, %1         ; Combine parts into aligned block
+        palignr xmm2, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [rdi+rcx], xmm2        ; Save aligned
+        movdqa  [rdi+rcx+10H], xmm3    ; Save aligned
+        add     rcx, 20H
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        cmp     edx, -10H
+        jg      %%L2
+        ; One more 16-bytes block to move
+        movdqa  xmm2, [rsi+rdx+10H]
+        palignr xmm2, xmm0, %1
+        movdqa  [rdi+rdx], xmm2
+        add     rdx, 10H        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes
+        jmp     C200
+%ENDMACRO
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+; (alignments and fillers are inserted manually to minimize the number 
+; of 16-bytes boundaries inside loops)
+
+align 16
+D104:   MOVE_UNALIGNED_SSE2_4    0
+times 4 nop
+D108:   MOVE_UNALIGNED_SSE2_8    0
+times 4 nop
+D10C:   MOVE_UNALIGNED_SSE2_12   0
+times 1 nop
+D101:   MOVE_UNALIGNED_SSE2 1,   0
+D102:   MOVE_UNALIGNED_SSE2 2,   0
+D103:   MOVE_UNALIGNED_SSE2 3,   0
+D105:   MOVE_UNALIGNED_SSE2 5,   0
+D106:   MOVE_UNALIGNED_SSE2 6,   0
+D107:   MOVE_UNALIGNED_SSE2 7,   0
+D109:   MOVE_UNALIGNED_SSE2 9,   0
+times 1 nop
+D10A:   MOVE_UNALIGNED_SSE2 0AH, 0
+D10B:   MOVE_UNALIGNED_SSE2 0BH, 0
+D10D:   MOVE_UNALIGNED_SSE2 0DH, 0
+D10E:   MOVE_UNALIGNED_SSE2 0EH, 0
+D10F:   MOVE_UNALIGNED_SSE2 0FH, 0
+        
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align   16
+E104:   MOVE_UNALIGNED_SSSE3 4
+E108:   MOVE_UNALIGNED_SSSE3 8
+E10C:   MOVE_UNALIGNED_SSSE3 0CH
+E101:   MOVE_UNALIGNED_SSSE3 1
+E102:   MOVE_UNALIGNED_SSSE3 2
+E103:   MOVE_UNALIGNED_SSSE3 3
+E105:   MOVE_UNALIGNED_SSSE3 5
+E106:   MOVE_UNALIGNED_SSSE3 6
+E107:   MOVE_UNALIGNED_SSSE3 7
+E109:   MOVE_UNALIGNED_SSSE3 9
+times 1 nop
+E10A:   MOVE_UNALIGNED_SSSE3 0AH
+E10B:   MOVE_UNALIGNED_SSSE3 0BH
+E10D:   MOVE_UNALIGNED_SSSE3 0DH
+E10E:   MOVE_UNALIGNED_SSSE3 0EH
+E10F:   MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 16
+F100:   ; Non-temporal move, src and dest have same alignment.
+        ; Loop. rcx has negative index from the end, counting up to zero
+        movaps  xmm0, [rsi+rcx]        ; Read
+        movaps  xmm1, [rsi+rcx+10H]
+        movntps [rdi+rcx], xmm0        ; Write non-temporal (bypass cache)
+        movntps [rdi+rcx+10H], xmm1
+        add     rcx, 20H
+        jnz     F100                   ; Loop through negative rcx up to zero
+                
+        ; Move the remaining edx bytes (0 - 31):
+        add     rsi, rdx
+        add     rdi, rdx
+        neg     rdx
+        jz      C500                   ; Skip if no more data
+        ; Check if we can move one more 16-bytes block
+        cmp     edx, -10H
+        jg      C200
+        ; move 16 bytes, aligned
+        movaps  xmm0, [rsi+rdx]
+        movntps [rdi+rdx], xmm0
+        add     rdx, 10H
+        ; move the remaining 0 - 15 bytes
+        jmp     C200
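+
+; A minimal C sketch of the cached vs. non-temporal store choice above
+; (illustration only; dst is assumed 16-bytes aligned):
+;
+;   #include <xmmintrin.h>
+;   static void store_block(float *dst, __m128 v, int bypass_cache)
+;   {
+;       if (bypass_cache)
+;           _mm_stream_ps(dst, v);   /* like movntps: write around the cache */
+;       else
+;           _mm_store_ps(dst, v);    /* like movaps: normal cached store */
+;   }
+;   /* streaming stores are weakly ordered; _mm_sfence() orders them */
+;
+; Bypassing the cache pays off only when the count exceeds the cache size,
+; which is why the dispatch code above compares the count against
+; CacheBypassLimit.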
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of 
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104:   MOVE_UNALIGNED_SSE2_4    1
+F108:   MOVE_UNALIGNED_SSE2_8    1
+F10C:   MOVE_UNALIGNED_SSE2_12   1
+F101:   MOVE_UNALIGNED_SSE2 1,   1
+F102:   MOVE_UNALIGNED_SSE2 2,   1
+F103:   MOVE_UNALIGNED_SSE2 3,   1
+F105:   MOVE_UNALIGNED_SSE2 5,   1
+F106:   MOVE_UNALIGNED_SSE2 6,   1
+F107:   MOVE_UNALIGNED_SSE2 7,   1
+F109:   MOVE_UNALIGNED_SSE2 9,   1
+F10A:   MOVE_UNALIGNED_SSE2 0AH, 1
+F10B:   MOVE_UNALIGNED_SSE2 0BH, 1
+F10D:   MOVE_UNALIGNED_SSE2 0DH, 1
+F10E:   MOVE_UNALIGNED_SSE2 0EH, 1
+F10F:   MOVE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                   CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+memcpyCPUDispatch:   ; CPU dispatcher, check for instruction sets and which method is fastest
+        ; This part is executed only once
+        push    rbx
+        push    rcx
+        push    rdx
+        push    rsi
+        push    rdi
+        push    r8        
+        ; set CacheBypassLimit to half the size of the largest level cache
+        call    GetMemcpyCacheLimit@
+        mov     eax, 1
+        cpuid                          ; Get feature flags
+        lea     rbx, [memcpySSE2@]
+        bt      ecx, 9                 ; Test bit for SupplSSE3
+        jnc     Q100
+        lea     rbx, [memcpySSSE3@]
+        call    UnalignedIsFaster      ; Test if unaligned read is faster than aligned read and shift
+        test    eax, eax
+        jz      Q100
+        lea     rbx, [memcpyU@]
+        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
+        test    eax, eax
+        jz      Q100
+        lea     rbx, [memcpyU256@]
+Q100:   
+        ; Insert appropriate pointer
+        mov     [memcpyDispatch], rbx
+        mov     rax, rbx
+        pop     r8
+        pop     rdi
+        pop     rsi
+        pop     rdx
+        pop     rcx
+        pop     rbx
+        ; Jump according to the replaced function pointer
+        jmp     rax
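+
+; The dispatching above is a lazily initialized function pointer; a minimal
+; C sketch of the pattern (illustration only; pick_best_version stands in
+; for the cpuid/feature tests and is hypothetical):
+;
+;   #include <stddef.h>
+;   typedef void *(*memcpy_fn)(void *, const void *, size_t);
+;   memcpy_fn pick_best_version(void);           /* hypothetical CPU probe */
+;   static void *dispatch(void *, const void *, size_t);
+;   static memcpy_fn memcpy_ptr = dispatch;      /* like memcpyDispatch */
+;
+;   void *A_memcpy_sketch(void *d, const void *s, size_t n)
+;   {
+;       return memcpy_ptr(d, s, n);              /* one indirect call */
+;   }
+;   static void *dispatch(void *d, const void *s, size_t n)
+;   {
+;       memcpy_ptr = pick_best_version();        /* executed only once */
+;       return memcpy_ptr(d, s, n);
+;   }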
+        
+; extern "C" size_t GetMemcpyCacheLimit();
+GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@:  ; local label
+        mov     rax, [CacheBypassLimit]
+        test    rax, rax
+        jnz     U200
+        ; Get half the size of the largest level cache
+%ifdef  WINDOWS
+        xor     ecx, ecx               ; 0 means largest level cache
+%else
+        xor     edi, edi               ; 0 means largest level cache
+%endif
+        call    DataCacheSize          ; get cache size
+        shr     rax, 1                 ; half the size
+        jnz     U100
+        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
+U100:   mov     [CacheBypassLimit], rax
+U200:   ret
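+
+; Equivalent C sketch of the lazy initialization above (illustration only;
+; DataCacheSize is the asmlib function called above):
+;
+;   #include <stddef.h>
+;   size_t DataCacheSize(int level);         /* asmlib, 0 = largest level */
+;   static size_t limit;                     /* like CacheBypassLimit DQ 0 */
+;   size_t GetMemcpyCacheLimit_sketch(void)
+;   {
+;       if (limit == 0) {
+;           limit = DataCacheSize(0) / 2;    /* half the largest cache */
+;           if (limit == 0)
+;               limit = 0x400000;            /* unknown: use 4 Mbytes */
+;       }
+;       return limit;
+;   }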
+        
+; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
+SetMemcpyCacheLimit1:
+%ifdef  WINDOWS
+        mov     rax, rcx
+%else
+        mov     rax, rdi
+%endif
+        test    rax, rax
+        jnz     U400
+        ; zero, means default
+        mov     [CacheBypassLimit], rax
+        call    GetMemcpyCacheLimit@
+U400:   mov     [CacheBypassLimit], rax
+        ret
+        
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                   getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+        mov     rax, [memcpyDispatch]
+        ret
+
+global getDispatch
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU-specific versions dispatch through AlignmentDispatchSSE2,
+; AlignmentDispatchSSSE3 or AlignmentDispatchNT, depending on the
+; instruction set and the count.
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+AlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch DQ memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DQ 0
diff --git a/asmlibSrc/memmove32.asm b/asmlibSrc/memmove32.asm
new file mode 100755
index 0000000..9113e3d
--- /dev/null
+++ b/asmlibSrc/memmove32.asm
@@ -0,0 +1,1238 @@
+;*************************  memmove32.asm  ***********************************
+; Author:           Agner Fog
+; Date created:     2008-07-18
+; Last modified:    2013-09-11
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_memmove: function            ; Function A_memmove
+global ?OVR_memmove: function          ; ?OVR removed if standard function memmove overridden
+
+; Direct entries to CPU-specific versions
+global _memmove386: function           ; Version for processors without SSE2
+global _memmoveSSE2: function          ; Version for processors with SSE2
+global _memmoveSSSE3: function         ; Version for processors with SSSE3
+global _memmoveU: function             ; Version for processors with fast unaligned read
+global _memmoveU256: function          ; Version for processors with fast 256-bit read/write
+global _SetMemcpyCacheLimit            ; Change limit for bypassing cache
+
+; Imported from memcpy32.asm:
+extern _A_memcpy                       ; function entry
+extern _memcpy386                      ; CPU specific function entry
+extern _memcpySSE2                     ; CPU specific function entry
+extern _memcpySSSE3                    ; CPU specific function entry
+extern _memcpyU                        ; CPU specific function entry
+extern _memcpyU256                     ; CPU specific function entry
+
+; Imported from instrset32.asm
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _UnalignedIsFaster              ; Tells if unaligned read is faster than PALIGNR
+extern _Store256BitIsFaster            ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy32.asm
+extern _GetMemcpyCacheLimit            ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern _SetMemcpyCacheLimit1           ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;      Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+%MACRO  PROLOGM  1
+        ; Check if dest overlaps src
+        mov     eax, [esp+4]           ; dest
+        sub     eax, [esp+8]           ; src
+        cmp     eax, [esp+12]          ; count
+        ; We can avoid testing for dest < src by using unsigned compare:
+        ; (Assume that the memory block cannot span across address 0)
+        ; Must move backwards if unsigned(dest-src) < count
+        jae     %1                     ; Jump to memcpy if we can move forwards
+        
+        push    esi
+        push    edi
+        mov     edi, [esp+12]          ; dest
+        mov     esi, [esp+16]          ; src
+        mov     ecx, [esp+20]          ; count
+%IFDEF  POSITIONINDEPENDENT
+        push    ebx
+        mov     ebx, edx               ; pointer to reference point RP
+%ENDIF
+        
+%ENDM
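+
+; The unsigned compare above decides the copy direction with a single test;
+; a minimal C sketch (illustration only; copy_backwards is hypothetical and
+; stands in for the backward-moving code below):
+;
+;   #include <stdint.h>
+;   #include <stddef.h>
+;   void *A_memcpy(void *, const void *, size_t);     /* asmlib forward copy */
+;   void *copy_backwards(void *, const void *, size_t); /* hypothetical */
+;   void *A_memmove_sketch(void *dest, const void *src, size_t count)
+;   {
+;       if ((uintptr_t)dest - (uintptr_t)src >= count)
+;           return A_memcpy(dest, src, count);   /* forwards is safe */
+;       return copy_backwards(dest, src, count); /* dest overlaps top of src */
+;   }
+;
+; The subtraction wraps around when dest < src, so the one compare covers
+; both the dest < src case and the non-overlapping dest > src case.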
+
+
+; Define return from this function
+%MACRO  RETURNM 0
+%IFDEF  POSITIONINDEPENDENT
+        pop     ebx
+%ENDIF
+        pop     edi
+        pop     esi
+        mov     eax, [esp+4]           ; Return value = dest
+        ret
+%ENDMACRO
+
+
+SECTION .text  align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                          Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+_A_memmove:
+?OVR_memmove:
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+RP      equ     0                      ; RP = 0 if not position-independent
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP:                                    ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+        jmp     dword [edx+memmoveDispatch-RP]
+
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memmoveU256:  ; Version for processors with fast 256-bit read/write
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memmoveU256@:
+        PROLOGM _memcpyU256
+      
+        cmp     ecx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [edi+ecx]         ; end of dest
+        and     edx, 1FH
+        jz      B4300                   ; Skip if end of dest aligned by 32
+        
+        ; edx = size of last partial block, 1 - 31 bytes
+        test    dl, 3
+        jz      B4210
+        test    dl, 1
+        jz      B4201      ; bit 1 must be set here, skip the test at B4200
+        ; move 1 byte
+        dec     ecx
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al        
+B4200:  test    dl, 2
+        jz      B4210
+B4201:  ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax        
+B4210:  test    dl, 4
+        jz      B4220
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+B4220:  test    dl, 8
+        jz      B4230
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0        
+B4230:  test    dl, 16
+        jz      B4300
+        ; move 16 bytes
+        sub     ecx, 16
+        movups  xmm0, [esi+ecx]
+        movaps  [edi+ecx], xmm0
+        
+B4300:  ; Now end of dest is aligned by 32. Any partial block has been moved        
+        mov     edx, ecx
+        and     ecx, 1FH               ; remaining size after 32 bytes blocks moved
+        and     edx, -20H              ; number of 32 bytes blocks
+        jz      H4100
+        add     esi, ecx
+        add     edi, ecx
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     edx, [_CacheBypassLimit]
+%ELSE
+        cmp     edx, [ebx-RP+_CacheBypassLimit]
+%ENDIF
+        ja      H4800                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+align 16
+H4000:  ; 32 bytes move loop
+        vmovups  ymm0, [esi+edx-20H]
+        vmovaps  [edi+edx-20H], ymm0
+        sub      edx, 20H
+        jnz      H4000
+        vzeroupper
+        
+H4090:  sub      esi, ecx
+        sub      edi, ecx
+
+H4100:  ; remaining 0-31 bytes
+        test    ecx, ecx
+        jz      H4600        
+        test    cl, 10H
+        jz      H4200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups  xmm0, [esi+ecx]
+        movaps  [edi+ecx], xmm0
+        jz      H4600                     ; early out if count divisible by 16
+H4200:  test    cl, 8
+        jz      H4300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+H4300:  test    cl, 4
+        jz      H4400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        jz      H4600                     ; early out if count divisible by 4
+H4400:  test    cl, 2
+        jz      H4500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+H4500:  test    cl, 1
+        jz      H4600
+        ; move 1 byte
+        movzx   eax, byte [esi]   ; ecx-1 = 0
+        mov     [edi], al
+H4600:  ; finished
+        RETURNM
+
+align 16
+H4800:  ; 32 bytes move loop, bypass cache
+        vmovups  ymm0, [esi+edx-20H]
+        vmovntps [edi+edx-20H], ymm0
+        sub      edx, 20H
+        jnz      H4800
+        vzeroupper
+        jmp      H4090
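+; (vzeroupper above avoids the AVX-SSE state-transition penalty that would
+; otherwise be incurred when subsequent code uses legacy SSE instructions.)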
+        
+        
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        ; multiple CPU versions, SSSE3 and later
+A1000:  test    cl, 20H
+        jz      A1100
+        ; move 32 bytes
+        ; movups is faster than 64-bit moves on processors with SSSE3
+        sub     ecx, 20H
+        movups  xmm0, [esi+ecx+10H]
+        movups  xmm1, [esi+ecx]
+        movups  [edi+ecx+10H], xmm0
+        movups  [edi+ecx], xmm1
+A1100:  test    cl, 10H
+        jz      A1200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups  xmm0, [esi+ecx]
+        movups  [edi+ecx], xmm0
+A1200:  test    cl, 8
+        jz      A1300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+A1300:  test    cl, 4
+        jz      A1400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        jz      A1900                     ; early out if count divisible by 4
+A1400:  test    cl, 2
+        jz      A1500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+A1500:  test    cl, 1
+        jz      A1900
+        ; move 1 byte
+        movzx   eax, byte [esi]    ; ecx-1 = 0
+        mov     [edi], al
+A1900:  ; finished
+        RETURNM
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+                
+align 16
+_memmoveU:  ; Version for processors with fast unaligned read
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memmoveU@:
+        PROLOGM _memcpyU
+      
+        cmp     ecx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [edi+ecx]         ; end of dest
+        and     edx, 0FH
+        jz      B3300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B3210
+        test    dl, 1
+        jz      B3201      ; bit 1 must be set here, skip the test at B3200
+        ; move 1 byte
+        dec     ecx
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al        
+B3200:  test    dl, 2
+        jz      B3210
+B3201:  ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax        
+B3210:  test    dl, 4
+        jz      B3220
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+B3220:  test    dl, 8
+        jz      B3300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0        
+        
+B3300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        mov      edx, ecx
+        and      ecx, 1FH              ; remaining size after 32 bytes blocks moved
+        and      edx, -20H             ; number of 32 bytes blocks
+        jz       H1100
+        add      esi, ecx
+        add      edi, ecx
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     edx, [_CacheBypassLimit]
+%ELSE
+        cmp     edx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+        ja      H1800                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+align 16
+H1000:  ; 32 bytes move loop
+        movups   xmm1, [esi+edx-20H]
+        movups   xmm0, [esi+edx-10H]
+        movaps   [edi+edx-20H], xmm1
+        movaps   [edi+edx-10H], xmm0
+        sub      edx, 20H
+        jnz      H1000
+        
+H1090:  sub      esi, ecx
+        sub      edi, ecx
+
+H1100:  ; remaining 0-31 bytes
+        test    ecx, ecx
+        jz      H1600        
+        test    cl, 10H
+        jz      H1200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups  xmm0, [esi+ecx]
+        movaps  [edi+ecx], xmm0
+        jz      H1600                     ; early out if count divisible by 16
+H1200:  test    cl, 8
+        jz      H1300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+H1300:  test    cl, 4
+        jz      H1400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        jz      H1600                     ; early out if count divisible by 4
+H1400:  test    cl, 2
+        jz      H1500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+H1500:  test    cl, 1
+        jz      H1600
+        ; move 1 byte
+        movzx   eax, byte [esi]   ; ecx-1 = 0
+        mov     [edi], al
+H1600:  ; finished
+        RETURNM
+
+align 16
+H1800:  ; 32 bytes move loop, bypass cache
+        movups   xmm1, [esi+edx-20H]
+        movups   xmm0, [esi+edx-10H]
+        movntps  [edi+edx-20H], xmm1
+        movntps  [edi+edx-10H], xmm0
+        sub      edx, 20H
+        jnz      H1800        
+        jmp      H1090
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+_memmoveSSSE3:  ; SSSE3 version begins here
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memmoveSSSE3@:
+        PROLOGM    _memcpySSSE3
+      
+        cmp     ecx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [edi+ecx]         ; end of dest
+        and     edx, 0FH
+        jz      B1300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B1210
+        test    dl, 1
+        jz      B1201      ; bit 1 must be set here, skip the test at B1200
+        ; move 1 byte
+        dec     ecx
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al        
+B1200:  test    dl, 2
+        jz      B1210
+B1201:  ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax        
+B1210:  test    dl, 4
+        jz      B1220
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+B1220:  test    dl, 8
+        jz      B1300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0        
+        
+B1300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of end of src modulo 16 at this point:
+        lea     eax, [esi+ecx]
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        sub     edx, ecx               ; Remaining data after loop
+        sub     esi, eax               ; Nearest preceding aligned block of src
+        ; Add the same to esi and edi as we have subtracted from ecx
+        add     esi, edx
+        add     edi, edx
+        
+%IFNDEF POSITIONINDEPENDENT
+        ; Check if count very big
+        cmp     ecx, [_CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        jmp     [MAlignmentDispatchSSSE3+eax*4]
+
+B1400:  ; Dispatch to different codes depending on src alignment
+        jmp     [MAlignmentDispatchNT+eax*4]
+
+%ELSE   ; Position-independent code
+
+        ; Check if count very big
+        ; Make the following instruction with address relative to RP:
+        cmp     ecx, [ebx-RP+_CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        ; MAlignmentDispatch table contains addresses relative to RP
+        ; Add table entry to ebx=RP to get jump address.
+
+        ; Make the following instruction with address relative to RP:
+        add     ebx,[ebx-RP+MAlignmentDispatchSSSE3+eax*4]
+        jmp     ebx
+        
+B1400:  ; Same with MAlignmentDispatchNT:        
+        add     ebx,[ebx-RP+MAlignmentDispatchNT+eax*4]
+        jmp     ebx        
+%ENDIF
+
+
+align   16
+C100:   ; Code for aligned src. SSE2 or later instruction set
+        ; The nice case, src and dest have same alignment.
+
+        ; Loop. ecx has positive index from the beginning, counting down to zero
+        movaps  xmm0, [esi+ecx-10H]
+        movaps  xmm1, [esi+ecx-20H]
+        movaps  [edi+ecx-10H], xmm0
+        movaps  [edi+ecx-20H], xmm1
+        sub     ecx, 20H
+        jnz     C100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        ; move 16-8-4-2-1 bytes, aligned
+        test    edx, edx
+        jz      C500                   ; Early out if no more data
+        test    dl, 10H
+        jz      C200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movaps  xmm0, [esi+ecx]
+        movaps  [edi+ecx], xmm0
+        
+C200:   ; Other branches come in here, ecx may contain arbitrary offset
+        test    edx, edx
+        jz      C500                   ; Early out if no more data
+        test    dl, 8
+        jz      C210        
+        ; move 8 bytes
+        sub     ecx, 8 
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+C210:   test    dl, 4
+        jz      C220        
+        ; move 4 bytes
+        sub     ecx, 4        
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        jz      C500                   ; Early out if count divisible by 4
+C220:   test    dl, 2
+        jz      C230        
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+C230:   test    dl, 1
+        jz      C500        
+        ; move 1 byte
+        movzx   eax, byte [esi+ecx-1]    ; ecx-1 not always 0
+        mov     [edi+ecx-1], al
+C500:   ; finished     
+        RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+_memmoveSSE2:   ; SSE2 version begins here
+%IFDEF POSITIONINDEPENDENT
+        call    get_thunk_edx
+        add     edx, RP-$
+%ENDIF
+memmoveSSE2@:
+        PROLOGM _memcpySSE2
+      
+        cmp     ecx, 40H
+        jae     B0100                    ; Use main code if count >= 64
+        
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        test    cl, 20H
+        jz      A100
+        ; move 32 bytes
+        ; movq is faster than movdqu on Intel Pentium M and Core 1
+        ; movdqu is faster on later processors
+        sub     ecx, 20H
+        movq    xmm0, qword [esi+ecx+18H]
+        movq    xmm1, qword [esi+ecx+10H]
+        movq    xmm2, qword [esi+ecx+8]
+        movq    xmm3, qword [esi+ecx]
+        movq    qword [edi+ecx+18H], xmm0
+        movq    qword [edi+ecx+10H], xmm1
+        movq    qword [edi+ecx+8], xmm2
+        movq    qword [edi+ecx], xmm3
+A100:   test    cl, 10H
+        jz      A200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movq    xmm0, qword [esi+ecx+8]
+        movq    xmm1, qword [esi+ecx]
+        movq    qword [edi+ecx+8], xmm0
+        movq    qword [edi+ecx], xmm1
+A200:   test    cl, 8
+        jz      A300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0
+A300:   test    cl, 4
+        jz      A400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+        jz      A900                     ; early out if count divisible by 4
+A400:   test    cl, 2
+        jz      A500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax
+A500:   test    cl, 1
+        jz      A900
+        ; move 1 byte
+        movzx   eax, byte [esi]    ; ecx-1 = 0
+        mov     [edi], al
+A900:   ; finished
+        RETURNM
+        
+B0100:  ; count >= 64
+        ; This part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [edi+ecx]         ; end of dest
+        and     edx, 0FH
+        jz      B0300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B0210
+        test    dl, 1
+        jz      B0201      ; bit 1 must be set here, skip the test at B0200
+        ; move 1 byte
+        dec     ecx
+        movzx   eax, byte [esi+ecx]
+        mov     [edi+ecx], al        
+B0200:  test    dl, 2
+        jz      B0210
+B0201:  ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [esi+ecx]
+        mov     [edi+ecx], ax        
+B0210:  test    dl, 4
+        jz      B0220
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [esi+ecx]
+        mov     [edi+ecx], eax
+B0220:  test    dl, 8
+        jz      B0300
+        ; move 8 bytes
+        sub     ecx, 8
+        movq    xmm0, qword [esi+ecx]
+        movq    qword [edi+ecx], xmm0        
+        
+B0300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of end of src modulo 16 at this point:
+        lea     eax, [esi+ecx]
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     ecx, -20H              ; Round down to nearest multiple of 32
+        sub     edx, ecx               ; Remaining data after loop
+        sub     esi, eax               ; Nearest preceding aligned block of src
+        ; Add the same to esi and edi as we have subtracted from ecx
+        add     esi, edx
+        add     edi, edx
+        
+%IFNDEF POSITIONINDEPENDENT
+        ; Check if count very big
+        cmp     ecx, [_CacheBypassLimit]
+        ja      B0400                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        jmp     [MAlignmentDispatchSSE2+eax*4]
+
+B0400:   ; Dispatch to different codes depending on src alignment
+        jmp     [MAlignmentDispatchNT+eax*4]
+
+%ELSE   ; Position-independent code
+
+        ; Check if count very big
+        ; Make the following instruction with address relative to RP:
+        cmp     ecx, [ebx-RP+_CacheBypassLimit]
+        ja      B0400                   ; Use non-temporal store if count > _CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        ; MAlignmentDispatch table contains addresses relative to RP
+        ; Add table entry to ebx=RP to get jump address.
+
+        ; Make the following instruction with address relative to RP:
+        add     ebx,[ebx-RP+MAlignmentDispatchSSE2+eax*4]
+        jmp     ebx
+        
+B0400:   ; Same with MAlignmentDispatchNT:        
+        add     ebx,[ebx-RP+MAlignmentDispatchNT+eax*4]
+        jmp     ebx        
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [esi+ecx]        ; Read from nearest following 16B boundary        
+%%L1:   ; Loop. ecx has positive index from the beginning, counting down to zero
+        sub     ecx, 20H
+        movdqa  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movdqa  xmm2, [esi+ecx]
+        movdqa  xmm3, xmm1             ; Copy because used twice
+        pslldq  xmm0, 16-%1             ; shift left
+        psrldq  xmm1, %1                ; shift right
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+ecx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntdq [edi+ecx+10H], xmm0    ; Non-temporal save
+        %ENDIF
+        movdqa  xmm0, xmm2             ; Save for next iteration
+        pslldq  xmm3, 16-%1             ; shift left
+        psrldq  xmm2, %1                ; shift right
+        por     xmm3, xmm2             ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+ecx], xmm3        ; Save aligned
+        %ELSE
+        movntdq [edi+ecx], xmm3        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     ecx, 10H
+        movdqa  xmm1, [esi+ecx]
+        pslldq  xmm0, 16-%1             ; shift left
+        psrldq  xmm1, %1                ; shift right
+        por     xmm0, xmm1              ; combine blocks
+        %IF %2 == 0
+        movdqa  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [edi+ecx], xmm0        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1
+; Special case: u = 4
+        movaps  xmm0, [esi+ecx]        ; Read from nearest following 16B boundary
+%%L1:   ; Loop. ecx has positive index from the beginning, counting down to zero
+        sub     ecx, 20H
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        movaps  xmm2, [esi+ecx]
+        movaps  xmm3, xmm0
+        movaps  xmm0, xmm2        
+        movss   xmm2, xmm1
+        shufps  xmm2, xmm2, 00111001B  ; Rotate right
+        movss   xmm1, xmm3
+        shufps  xmm1, xmm1, 00111001B  ; Rotate right
+        %IF %1 == 0
+        movaps  [edi+ecx+10H], xmm1    ; Save aligned
+        movaps  [edi+ecx], xmm2        ; Save aligned
+        %ELSE
+        movntps [edi+ecx+10H], xmm1    ; Non-temporal save
+        movntps [edi+ecx], xmm2        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     ecx, 10H
+        movaps  xmm1, [esi+ecx]
+        movss   xmm1, xmm0
+        shufps  xmm1, xmm1, 00111001B  ; Rotate right
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm1        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm1        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_8  1
+; Special case: u = 8
+        movaps  xmm0, [esi+ecx]        ; Read from nearest following 16B boundary
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+%%L1:   ; Loop. ecx has positive index from the beginning, counting down to zero
+        sub     ecx, 20H
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 01001110B  ; Rotate
+        movsd   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [edi+ecx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntps [edi+ecx+10H], xmm0    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [esi+ecx]
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+        movsd   xmm1, xmm0
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm1        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm1        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     ecx, 10H
+        movaps  xmm1, [esi+ecx]
+        shufps  xmm1, xmm1, 01001110B  ; Rotate 
+        movsd   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm0        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_12  1
+; Special case: u = 12
+        movaps  xmm0, [esi+ecx]        ; Read from nearest following 16B boundary
+        shufps  xmm0, xmm0, 10010011B  ; Rotate left
+%%L1:   ; Loop. ecx has positive index from the beginning, counting down to zero
+        sub     ecx, 20H
+        movaps  xmm1, [esi+ecx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 10010011B  ; Rotate left
+        movss   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [edi+ecx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntps [edi+ecx+10H], xmm0    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [esi+ecx]
+        shufps  xmm0, xmm0, 10010011B  ; Rotate left
+        movss   xmm1, xmm0
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm1        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm1        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     ecx, 10H
+        movaps  xmm1, [esi+ecx]
+        shufps  xmm1, xmm1, 10010011B  ; Rotate left
+        movss   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [edi+ecx], xmm0        ; Save aligned
+        %ELSE
+        movntps [edi+ecx], xmm0        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSSE3  1
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [esi+ecx]        ; Read from nearest following 16B boundary
+        
+%%L1:   ; Loop. ecx has positive index from the beginning, counting down to zero
+        movdqa  xmm1, [esi+ecx-10H]    ; Read next two blocks        
+        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [edi+ecx-10H], xmm0    ; Save aligned
+        movdqa  xmm0, [esi+ecx-20H]
+        palignr xmm1, xmm0, %1         ; Combine parts into aligned block
+        movdqa  [edi+ecx-20H], xmm1    ; Save aligned
+        sub     ecx, 20H
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     ecx, 10H
+        movdqa  xmm1, [esi+ecx]        ; Read next two blocks        
+        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [edi+ecx], xmm0        ; Save aligned
+        
+%%L2:   ; Get src pointer back to misaligned state
+        add     esi, eax
+        ; Move remaining 0 - 15 bytes
+        jmp     C200
+%ENDMACRO
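+
+; A minimal C sketch of the palignr combine above (illustration only; the
+; byte count must be a compile-time constant, hence the 15 macro instances):
+;
+;   #include <tmmintrin.h>
+;   /* lo, hi: two consecutive aligned 16-bytes source blocks; u = 5 here */
+;   static __m128i combine_palignr(__m128i lo, __m128i hi)
+;   {
+;       return _mm_alignr_epi8(hi, lo, 5);   /* bytes 5..20 of the pair */
+;   }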
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the 
+;  number of 16-bytes boundaries inside loops)
+
+align 16
+D104:   MOVE_REVERSE_UNALIGNED_SSE2_4    0
+D108:   MOVE_REVERSE_UNALIGNED_SSE2_8    0
+D10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   0
+D101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   0
+D102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   0
+D103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   0
+D105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   0
+D106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   0
+D107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   0
+D109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   0
+D10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align 16
+E104:   MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108:   MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C:   MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101:   MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102:   MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103:   MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105:   MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106:   MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107:   MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109:   MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A:   MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B:   MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D:   MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E:   MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F:   MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+
+align 16        
+F100:   ; Non-temporal move, src and dest have same alignment.
+        ; Loop. ecx has positive index from the beginning, counting down to zero
+        sub     ecx, 20H
+        movaps  xmm0, [esi+ecx+10H]
+        movaps  xmm1, [esi+ecx]
+        movntps [edi+ecx+10H], xmm0
+        movntps [edi+ecx], xmm1
+        jnz     F100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        ; move 16-8-4-2-1 bytes, aligned
+        test    dl, 10H
+        jz      C200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movaps  xmm0, [esi+ecx]
+        movntps  [edi+ecx], xmm0
+        ; move the remaining 0 - 15 bytes
+        jmp     C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+
+align 16
+F104:   MOVE_REVERSE_UNALIGNED_SSE2_4    1
+F108:   MOVE_REVERSE_UNALIGNED_SSE2_8    1
+F10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   1
+F101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   1
+F102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   1
+F103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   1
+F105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   1
+F106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   1
+F107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   1
+F109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   1
+F10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
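+; (32-bit x86 has no EIP-relative addressing, so position-independent code
+;  reads its own return address to get a reference point; see the use of
+;  get_thunk_edx in _SetMemcpyCacheLimit below.)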
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for old processors without SSE2
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 8
+; 80386 version used when SSE2 not supported:
+_memmove386:
+memmove386@:
+        PROLOGM _memcpy386
+; edi = dest
+; esi = src
+; ecx = count
+        std                            ; Move backwards
+        lea     edi, [edi+ecx-1]       ; Point to last byte of dest
+        lea     esi, [esi+ecx-1]       ; Point to last byte of src
+        cmp     ecx, 8
+        jb      G500
+G100:   test    edi, 3                 ; Test if unaligned
+        jz      G200
+        movsb
+        dec     ecx
+        jmp     G100                   ; Repeat while edi unaligned
+        
+G200:   ; edi is aligned now. Move 4 bytes at a time
+        sub     edi, 3                 ; Point to last dword of dest
+        sub     esi, 3                 ; Point to last dword of src
+        mov     edx, ecx
+        shr     ecx, 2
+        rep     movsd                  ; move 4 bytes at a time
+        mov     ecx, edx
+        and     ecx, 3
+        add     edi, 3                 ; Point to last byte of dest
+        add     esi, 3                 ; Point to last byte of src        
+        rep     movsb                  ; move remaining 0-3 bytes
+        cld
+        RETURNM
+        
+G500:   ; count < 8. Move one byte at a time
+        rep     movsb                  ; move count bytes
+        cld
+        RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CPU dispatching for memmove. This is executed only once
+memmoveCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        pushad
+        ; set _CacheBypassLimit to half the size of the largest level cache
+        push    0
+        call    SetMemcpyCacheLimit@
+        pop     ecx
+        call    _InstructionSet
+        ; Point to generic version of memmove
+        mov     esi, memmove386@
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memmove
+        mov     esi, memmoveSSE2@
+        cmp     eax, 6                 ; check Suppl-SSE3
+        jb      Q100
+        ; Suppl-SSE3 supported
+        ; Point to SSSE3 version of memmove
+        mov     esi, memmoveSSSE3@
+        call    _UnalignedIsFaster
+        test    eax, eax
+        jz      Q100
+        ; Point to unaligned version of memmove
+        mov     esi, memmoveU@
+        call    _Store256BitIsFaster
+        test    eax, eax
+        jz      Q100        
+        ; Point to 256 bit move version of memmove
+        mov     esi, memmoveU256@       
+        
+Q100:   mov     [memmoveDispatch], esi
+        popad
+        ; Continue in appropriate version of memmove
+        jmp     [memmoveDispatch]
+
+%ELSE   ; Position-independent version
+        pushad
+        mov     ebx, edx               ; reference point
+        ; set _CacheBypassLimit to half the size of the largest level cache
+        push    0
+        call    SetMemcpyCacheLimit@
+        pop     ecx
+        call    _InstructionSet
+        ; Point to generic version of memmove
+        lea     esi, [ebx+memmove386@-RP]
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memmove
+        lea     esi, [ebx+memmoveSSE2@-RP]
+        cmp     eax, 6                 ; check Suppl-SSE3
+        jb      Q100
+        ; Suppl-SSE3 supported
+        ; Point to SSSE3 version of memmove
+        lea     esi, [ebx+memmoveSSSE3@-RP]
+        call    _UnalignedIsFaster
+        test    eax, eax
+        jz      Q100
+        ; Point to unaligned version of memmove
+        lea     esi, [ebx+memmoveU@-RP]
+        call    _Store256BitIsFaster
+        test    eax, eax
+        jz      Q100
+        ; Point to 256 bit move version of memmove
+        lea     esi, [ebx+memmoveU256@-RP]
+        
+Q100:   ; insert appropriate pointer
+        mov     dword [ebx+memmoveDispatch-RP], esi
+        popad
+        ; Continue in appropriate version of memmove
+        jmp     [edx+memmoveDispatch-RP]        
+%ENDIF
+
+
+; Note: Must call _SetMemcpyCacheLimit1 defined in memcpy32.asm
+_SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@:  ; local label
+        mov     eax, [esp+4]
+        push    eax
+        call    _SetMemcpyCacheLimit1
+        pop     ecx
+%ifdef  POSITIONINDEPENDENT
+        call    get_thunk_edx
+        mov     [edx + _CacheBypassLimit - $], eax
+%else
+        mov     [_CacheBypassLimit], eax
+%endif
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU dispatcher replaces MAlignmentDispatchSSE2 with 
+; MAlignmentDispatchSSSE3 if Suppl-SSE3 is supported
+; RP = reference point if position-independent code, otherwise RP = 0
+
+MAlignmentDispatchSSE2:
+DD C100-RP, D101-RP, D102-RP, D103-RP, D104-RP, D105-RP, D106-RP, D107-RP
+DD D108-RP, D109-RP, D10A-RP, D10B-RP, D10C-RP, D10D-RP, D10E-RP, D10F-RP
+
+MAlignmentDispatchSSSE3:
+DD C100-RP, E101-RP, E102-RP, E103-RP, E104-RP, E105-RP, E106-RP, E107-RP
+DD E108-RP, E109-RP, E10A-RP, E10B-RP, E10C-RP, E10D-RP, E10E-RP, E10F-RP
+
+MAlignmentDispatchNT:
+DD F100-RP, F101-RP, F102-RP, F103-RP, F104-RP, F105-RP, F106-RP, F107-RP
+DD F108-RP, F109-RP, F10A-RP, F10B-RP, F10C-RP, F10D-RP, F10E-RP, F10F-RP
+
+; Pointer to appropriate version.
+; This initially points to memmoveCPUDispatch. memmoveCPUDispatch will
+; change this to the appropriate version of memmove, so that
+; memmoveCPUDispatch is only executed once:
+memmoveDispatch: DD memmoveCPUDispatch
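+; (Roughly the same pattern as this C sketch - illustrative only; the names
+;  'dispatch' and 'first_call' are hypothetical:
+;
+;     static void * (*dispatch)(void *, const void *, size_t) = first_call;
+;     void * A_memmove(void * d, const void * s, size_t n)
+;     {   return dispatch(d, s, n);
+;     }
+;
+;  where first_call() stores the best version in 'dispatch' and then calls
+;  it, so CPU detection runs only on the first call.)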
+
+; Bypass cache by using non-temporal moves if count > _CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+_CacheBypassLimit: DD 0
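+; (Example: with an 8 MB last-level cache the default limit becomes 4 MB,
+;  so only moves bigger than 4 MB bypass the cache.)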
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
diff --git a/asmlibSrc/memmove64.asm b/asmlibSrc/memmove64.asm
new file mode 100755
index 0000000..a09c95a
--- /dev/null
+++ b/asmlibSrc/memmove64.asm
@@ -0,0 +1,1073 @@
+;*************************  memmove64.asm  ***********************************
+; Author:           Agner Fog
+; Date created:     2008-07-18
+; Last modified:    2013-09-11
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memmove: function             ; Function A_memmove
+global ?OVR_memmove: function          ; ?OVR removed if standard function memmove overridden
+global memmoveSSE2: function           ; Version for processors with only SSE2
+global memmoveSSSE3: function          ; Version for processors with SSSE3
+global memmoveU: function              ; Version for processors with fast unaligned read
+global memmoveU256: function           ; Version for processors with fast 256-bit read/write
+global SetMemcpyCacheLimit             ; Change limit for bypassing cache
+
+; Imported from memcpy64.asm:
+extern A_memcpy                        ; function entry
+extern memcpySSE2                      ; CPU specific function entry
+extern memcpySSSE3                     ; CPU specific function entry
+extern memcpyU                         ; CPU specific function entry
+extern memcpyU256                      ; CPU specific function entry
+
+; Imported from instrset64.asm
+extern InstructionSet                  ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster               ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy64.asm
+extern GetMemcpyCacheLimit             ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern SetMemcpyCacheLimit1            ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;      Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+%MACRO  PROLOGM  1
+%IFDEF  WINDOWS
+        ; Check if dest overlaps src
+        mov     rax, rcx
+        sub     rax, rdx
+        cmp     rax, r8
+        ; We can avoid testing for dest < src by using unsigned compare:
+        ; (Assume that the memory block cannot span across address 0)
+        ; Must move backwards if unsigned(dest-src) < count
+        jae     %1                     ; Jump to memcpy if we can move forwards
+        push    rsi
+        push    rdi
+        mov     rdi, rcx               ; dest
+        mov     r9,  rcx               ; dest
+        mov     rsi, rdx               ; src
+        mov     rcx, r8                ; count
+%ELSE   ; Unix
+        ; Check if dest overlaps src
+        mov     rax, rdi
+        sub     rax, rsi
+        cmp     rax, rdx
+        ; Must move backwards if unsigned(dest-src) < count
+        jae     %1                     ; Jump to memcpy if we can move forwards
+        mov     rcx, rdx               ; count
+        mov     r9,  rdi               ; dest
+%ENDIF
+%ENDM
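+
+; A rough C equivalent of the overlap test above (an illustrative sketch
+; only; 'can_move_forwards' is a hypothetical helper):
+;
+;   #include <stddef.h>
+;   #include <stdint.h>
+;   static int can_move_forwards(void * dest, const void * src, size_t count)
+;   {   /* unsigned wraparound makes the dest < src cases pass the test too */
+;       return (uintptr_t)dest - (uintptr_t)src >= count;
+;   }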
+
+
+; Define return from this function
+%MACRO  RETURNM  0
+%IFDEF  WINDOWS
+        pop     rdi
+        pop     rsi
+%ENDIF
+        mov     rax, r9                ; Return value = dest
+        ret
+%ENDMACRO
+
+
+SECTION .text  align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;                          Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+A_memmove:
+?OVR_memmove:
+        jmp     qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveU256:   ; Version for processors with fast 256-bit read/write
+memmoveU256@:  ; local label
+        PROLOGM memcpyU256
+      
+        cmp     rcx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [rdi+rcx]          ; end of dest
+        and     edx, 1FH
+        jz      B4300                   ; Skip if end of dest aligned by 32
+        
+        ; edx = size of last partial block, 1 - 31 bytes
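+        ; (each set bit of edx selects one move below: e.g. a remainder of
+        ;  13 = 8 + 4 + 1 is done as a 1-byte, a 4-byte and an 8-byte move,
+        ;  with no loop)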
+        test    dl, 3
+        jz      B4210
+        test    dl, 1
+        jz      B4201      ; bit 0 clear but dl & 3 nonzero, so bit 1 is set: skip test at B4200
+        ; move 1 byte
+        dec     rcx
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al        
+B4200:  test    dl, 2
+        jz      B4210
+B4201:  ; move 2 bytes
+        sub     rcx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax        
+B4210:  test    dl, 4
+        jz      B4220
+        ; move 4 bytes
+        sub     rcx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+B4220:  test    dl, 8
+        jz      B4230
+        ; move 8 bytes
+        sub     rcx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+B4230:  test    dl, 16
+        jz      B4300
+        ; move 16 bytes
+        sub     rcx, 16
+        movups  xmm0, [rsi+rcx]
+        movaps  [rdi+rcx], xmm0
+        
+B4300:  ; Now end of dest is aligned by 32. Any partial block has been moved        
+        mov     rdx, rcx
+        and     ecx, 1FH              ; remaining size after 32 bytes blocks moved
+        and     rdx, -20H             ; number of 32 bytes blocks
+        jz      H4100
+        add     rsi, rcx
+        add     rdi, rcx
+        
+        ; Check if count very big
+        cmp     rdx, [CacheBypassLimit]
+        ja      H4800                   ; Use non-temporal store if count > CacheBypassLimit
+
+align   16 
+H4000:  ; 32 bytes move loop
+        vmovups  ymm0, [rsi+rdx-20H]
+        vmovaps  [rdi+rdx-20H], ymm0
+        sub      rdx, 20H
+        jnz      H4000
+        vzeroupper
+        
+H4090:  sub      rsi, rcx
+        sub      rdi, rcx
+
+H4100:  ; remaining 0-31 bytes
+        test    ecx, ecx
+        jz      H4600        
+        test    cl, 10H
+        jz      H4200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups  xmm0, [rsi+rcx]
+        movaps  [rdi+rcx], xmm0
+        jz      H4600                     ; early out if count divisible by 16
+H4200:  test    cl, 8
+        jz      H4300
+        ; move 8 bytes
+        sub     ecx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+H4300:  test    cl, 4
+        jz      H4400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        jz      H4600                     ; early out if count divisible by 4
+H4400:  test    cl, 2
+        jz      H4500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+H4500:  test    cl, 1
+        jz      H4600
+        ; move 1 byte
+        movzx   eax, byte [rsi]   ; rcx-1 = 0
+        mov     [rdi], al
+H4600:  ; finished
+        RETURNM
+
+align 16
+H4800:  ; 32 bytes move loop, bypass cache
+        vmovups  ymm0, [rsi+rdx-20H]
+        vmovntps [rdi+rdx-20H], ymm0
+        sub      rdx, 20H
+        jnz      H4800        
+        vzeroupper
+        jmp      H4090
+        
+A1000:  ; count < 64. Move 32-16-8-4-2-1 bytes
+        test    cl, 20H
+        jz      A1100
+        ; move 32 bytes
+        ; movups is faster on processors with SSSE3
+        sub     ecx, 20H
+        movups     xmm0, [rsi+rcx+10H]
+        movups     xmm1, [rsi+rcx]
+        movups     [rdi+rcx+10H], xmm0
+        movups     [rdi+rcx], xmm1
+A1100:  test    cl, 10H
+        jz      A1200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups     xmm0, [rsi+rcx]
+        movups     [rdi+rcx], xmm0
+A1200:  test    cl, 8
+        jz      A1300
+        ; move 8 bytes
+        sub     ecx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+A1300:  test    cl, 4
+        jz      A1400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        jz      A1900                     ; early out if count divisible by 4
+A1400:  test    cl, 2
+        jz      A1500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+A1500:  test    cl, 1
+        jz      A1900
+        ; move 1 byte
+        movzx   eax, byte [rsi]   ; rcx-1 = 0
+        mov     [rdi], al
+A1900:  ; finished
+        RETURNM
+        
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+                
+align 16
+memmoveU:  ; Version for processors with fast unaligned read
+memmoveU@: ; local label
+        PROLOGM memcpyU
+      
+        cmp     rcx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [rdi+rcx]          ; end of dest
+        and     edx, 0FH
+        jz      B3300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B3210
+        test    dl, 1
+        jz      B3201      ; bit 0 clear but dl & 3 nonzero, so bit 1 is set: skip test at B3200
+        ; move 1 byte
+        dec     rcx
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al        
+B3200:  test    dl, 2
+        jz      B3210
+B3201:  ; move 2 bytes
+        sub     rcx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax        
+B3210:  test    dl, 4
+        jz      B3220
+        ; move 4 bytes
+        sub     rcx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+B3220:  test    dl, 8
+        jz      B3300
+        ; move 8 bytes
+        sub     rcx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax        
+        
+B3300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        mov     rdx, rcx
+        and     ecx, 1FH              ; remaining size after 32 bytes blocks moved
+        and     rdx, -20H             ; number of 32 bytes blocks
+        jz      H1100
+        add     rsi, rcx
+        add     rdi, rcx
+        
+        ; Check if count very big
+        cmp     rdx, [CacheBypassLimit]
+        ja      H1800                   ; Use non-temporal store if count > CacheBypassLimit
+
+align   16    ; minimize 16-bytes boundaries in H1000 loop
+H1000:  ; 32 bytes move loop
+        movups   xmm1, [rsi+rdx-20H]
+        movups   xmm0, [rsi+rdx-10H]
+        movaps   [rdi+rdx-20H], xmm1
+        movaps   [rdi+rdx-10H], xmm0
+        sub      rdx, 20H
+        jnz      H1000
+        
+H1090:  sub      rsi, rcx
+        sub      rdi, rcx
+
+H1100:  ; remaining 0-31 bytes
+        test    ecx, ecx
+        jz      H1600        
+        test    cl, 10H
+        jz      H1200
+        ; move 16 bytes
+        sub     ecx, 10H
+        movups  xmm0, [rsi+rcx]
+        movaps  [rdi+rcx], xmm0
+        jz      H1600                     ; early out if count divisible by 16
+H1200:  test    cl, 8
+        jz      H1300
+        ; move 8 bytes
+        sub     ecx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+H1300:  test    cl, 4
+        jz      H1400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        jz      H1600                     ; early out if count divisible by 4
+H1400:  test    cl, 2
+        jz      H1500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+H1500:  test    cl, 1
+        jz      H1600
+        ; move 1 byte
+        movzx   eax, byte [rsi]   ; rcx-1 = 0
+        mov     [rdi], al
+H1600:  ; finished
+        RETURNM
+
+align 16
+H1800:  ; 32 bytes move loop, bypass cache
+        movups   xmm1, [rsi+rdx-20H]
+        movups   xmm0, [rsi+rdx-10H]
+        movntps  [rdi+rdx-20H], xmm1
+        movntps  [rdi+rdx-10H], xmm0
+        sub      rdx, 20H
+        jnz      H1800        
+        jmp      H1090
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+align 16
+memmoveSSSE3:   ; SSSE3 version begins here
+memmoveSSSE3@:  ; local label
+        PROLOGM memcpySSSE3
+
+        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+        cmp     rcx, 40H
+        jb      A1000                    ; Use simpler code if count < 64
+        ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [rdi+rcx]         ; end of dest
+        and     edx, 0FH
+        jz      B1300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B1210
+        test    dl, 1
+        jz      B1201      ; bit 0 clear but dl & 3 nonzero, so bit 1 is set: skip test at B1200
+        ; move 1 byte
+        dec     rcx
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al        
+B1200:  test    dl, 2
+        jz      B1210
+B1201:  ; move 2 bytes
+        sub     rcx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax        
+B1210:  test    dl, 4
+        jz      B1220
+        ; move 4 bytes
+        sub     rcx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+B1220:  test    dl, 8
+        jz      B1300
+        ; move 8 bytes
+        sub     rcx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+              
+B1300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of end of src modulo 16 at this point:
+        lea     eax, [rsi+rcx]
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     rcx, -20H              ; Round down to nearest multiple of 32
+        sub     edx, ecx               ; Remaining data after loop
+        sub     rsi, rax               ; Nearest preceding aligned block of src
+        ; Add the same to rsi and rdi as we have subtracted from rcx
+        add     rsi, rdx
+        add     rdi, rdx
+        
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      B1400                   ; Use non-temporal store if count > CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [MAlignmentDispatchSSSE3]
+        jmp     near [r8+rax*8]
+
+B1400:  ; Dispatch to different codes depending on src alignment
+        lea     r8, [MAlignmentDispatchNT]
+        jmp     near [r8+rax*8]
+        
+
+align   16
+C100:   ; Code for aligned src. SSE2 and later CPUs
+        ; The nice case, src and dest have same alignment.
+
+        ; Loop. rcx has positive index from the beginning, counting down to zero
+        movaps  xmm0, [rsi+rcx-10H]
+        movaps  xmm1, [rsi+rcx-20H]
+        movaps  [rdi+rcx-10H], xmm0
+        movaps  [rdi+rcx-20H], xmm1
+        sub     rcx, 20H
+        jnz     C100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        ; move 16-8-4-2-1 bytes, aligned
+        test    edx, edx
+        jz      C500                   ; Early out if no more data
+        test    dl, 10H
+        jz      C200
+        ; move 16 bytes
+        sub     rcx, 10H
+        movaps  xmm0, [rsi+rcx]
+        movaps  [rdi+rcx], xmm0
+        
+C200:   ; Other branches come in here, rcx may contain arbitrary offset
+        test    edx, edx
+        jz      C500                   ; Early out if no more data
+        test    dl, 8
+        jz      C210        
+        ; move 8 bytes
+        sub     rcx, 8 
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+C210:   test    dl, 4
+        jz      C220        
+        ; move 4 bytes
+        sub     rcx, 4        
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        jz      C500                   ; Early out if count divisible by 4
+C220:   test    dl, 2
+        jz      C230        
+        ; move 2 bytes
+        sub     rcx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+C230:   test    dl, 1
+        jz      C500        
+        ; move 1 byte
+        movzx   eax, byte [rsi+rcx-1]   ; rcx-1 is not always 0 here
+        mov     [rdi+rcx-1], al
+C500:   ; finished     
+        RETURNM
+        
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        
+memmoveSSE2:   ; SSE2 version begins here
+memmoveSSE2@:  ; local label
+        PROLOGM  memcpySSE2
+
+        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+        cmp     rcx, 40H
+        jae     B0100                    ; Jump if count >= 64; else use simpler code below
+        
+        ; count < 64. Move 32-16-8-4-2-1 bytes
+        test    cl, 20H
+        jz      A100
+        ; move 32 bytes
+        ; mov is faster than movdqu on SSE2 processors,
+        ; movdqu is faster on later processors
+        sub     ecx, 20H
+        mov     rax, [rsi+rcx+18H]
+        mov     rdx, [rsi+rcx+10H]
+        mov     [rdi+rcx+18H], rax
+        mov     [rdi+rcx+10H], rdx
+        mov     rax, [rsi+rcx+8]
+        mov     rdx, [rsi+rcx]
+        mov     [rdi+rcx+8], rax
+        mov     [rdi+rcx], rdx
+A100:   test    cl, 10H
+        jz      A200
+        ; move 16 bytes
+        sub     ecx, 10H
+        mov     rax, [rsi+rcx+8]
+        mov     rdx, [rsi+rcx]
+        mov     [rdi+rcx+8], rax
+        mov     [rdi+rcx], rdx
+A200:   test    cl, 8
+        jz      A300
+        ; move 8 bytes
+        sub     ecx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+A300:   test    cl, 4
+        jz      A400
+        ; move 4 bytes
+        sub     ecx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+        jz      A900                     ; early out if count divisible by 4
+A400:   test    cl, 2
+        jz      A500
+        ; move 2 bytes
+        sub     ecx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax
+A500:   test    cl, 1
+        jz      A900
+        ; move 1 byte
+        movzx   eax, byte [rsi]       ; rcx-1 = 0
+        mov     [rdi], al
+A900:   ; finished
+        RETURNM
+        
+B0100:  ; count >= 64
+        ; Note: this part will not always work if count < 64
+        ; Calculate size of last block after last regular boundary of dest
+        lea     edx, [rdi+rcx]         ; end of dest
+        and     edx, 0FH
+        jz      B0300                   ; Skip if end of dest aligned by 16
+        
+        ; edx = size of last partial block, 1 - 15 bytes
+        test    dl, 3
+        jz      B0210
+        test    dl, 1
+        jz      B0201      ; bit 0 clear but dl & 3 nonzero, so bit 1 is set: skip test at B0200
+        ; move 1 byte
+        dec     rcx
+        movzx   eax, byte [rsi+rcx]
+        mov     [rdi+rcx], al        
+B0200:  test    dl, 2
+        jz      B0210
+B0201:  ; move 2 bytes
+        sub     rcx, 2
+        movzx   eax, word [rsi+rcx]
+        mov     [rdi+rcx], ax        
+B0210:  test    dl, 4
+        jz      B0220
+        ; move 4 bytes
+        sub     rcx, 4
+        mov     eax, [rsi+rcx]
+        mov     [rdi+rcx], eax
+B0220:  test    dl, 8
+        jz      B0300
+        ; move 8 bytes
+        sub     rcx, 8
+        mov     rax, [rsi+rcx]
+        mov     [rdi+rcx], rax
+              
+B0300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
+        ; Find alignment of end of src modulo 16 at this point:
+        lea     eax, [rsi+rcx]
+        and     eax, 0FH
+        
+        ; Set up for loop moving 32 bytes per iteration:
+        mov     edx, ecx               ; Save count
+        and     rcx, -20H              ; Round down to nearest multiple of 32
+        sub     edx, ecx               ; Remaining data after loop
+        sub     rsi, rax               ; Nearest preceding aligned block of src
+        ; Add the same to rsi and rdi as we have subtracted from rcx
+        add     rsi, rdx
+        add     rdi, rdx
+        
+        ; Check if count very big
+        cmp     rcx, [CacheBypassLimit]
+        ja      B0400                   ; Use non-temporal store if count > CacheBypassLimit
+        
+        ; Dispatch to different codes depending on src alignment
+        lea     r8, [MAlignmentDispatchSSE2]
+        jmp     near [r8+rax*8]
+
+B0400:   ; Dispatch to different codes depending on src alignment
+        lea     r8, [MAlignmentDispatchNT]
+        jmp     near [r8+rax*8]
+        
+        
+        
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;  Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
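+; One combine step in C intrinsics terms (an illustrative sketch only, not
+; part of the library; the helper name 'combine_sse2' is hypothetical, and
+; the shift counts must likewise be compile-time constants):
+;
+;   #include <emmintrin.h>
+;   static __m128i combine_sse2(__m128i following, __m128i preceding)
+;   {   /* u = 4 shown; the counts must be literals */
+;       return _mm_or_si128(_mm_slli_si128(following, 16 - 4),
+;                           _mm_srli_si128(preceding, 4));
+;   }
+;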
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2  2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest multiple of 32
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary        
+%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
+        sub     rcx, 20H
+        movdqa  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movdqa  xmm2, [rsi+rcx]
+        movdqa  xmm3, xmm1             ; Copy because used twice
+        pslldq  xmm0, 16-%1            ; shift left
+        psrldq  xmm1, %1               ; shift right
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rcx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntdq [rdi+rcx+10H], xmm0    ; Save aligned
+        %ENDIF
+        movdqa  xmm0, xmm2             ; Save for next iteration
+        pslldq  xmm3, 16-%1            ; shift left
+        psrldq  xmm2, %1                ; shift right
+        por     xmm3, xmm2             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rcx], xmm3        ; Save aligned
+        %ELSE
+        movntdq [rdi+rcx], xmm3        ; Save aligned
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     rcx, 10H
+        movdqa  xmm1, [rsi+rcx]
+        pslldq  xmm0, 16-%1            ; shift left
+        psrldq  xmm1, %1               ; shift right
+        por     xmm0, xmm1             ; combine blocks
+        %IF %2 == 0
+        movdqa  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntdq [rdi+rcx], xmm0        ; Save aligned
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_4  1 ; nt
+; Special case: u = 4
+; %1 = 1 if non-temporal store desired
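+; (One step in intrinsics terms, as a sketch only:
+;    t = _mm_move_ss(block, carry);         /* insert carried low dword */
+;    t = _mm_shuffle_ps(t, t, 0x39);        /* 0x39 = 00111001B, rotate */
+; )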
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
+%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
+        sub     rcx, 20H
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        movaps  xmm2, [rsi+rcx]
+        movaps  xmm3, xmm0
+        movaps  xmm0, xmm2        
+        movss   xmm2, xmm1
+        shufps  xmm2, xmm2, 00111001B  ; Rotate right
+        movss   xmm1, xmm3
+        shufps  xmm1, xmm1, 00111001B  ; Rotate right
+        %IF %1 == 0
+        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
+        movaps  [rdi+rcx], xmm2        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
+        movntps [rdi+rcx], xmm2        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     rcx, 10H
+        movaps  xmm1, [rsi+rcx]
+        movss   xmm1, xmm0
+        shufps  xmm1, xmm1, 00111001B  ; Rotate right
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm1        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm1        ; Non-temporal save
+        %ENDIF        
+%%L2:     ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_8  1 ; nt
+; Special case: u = 8
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
+        sub     rcx, 20H
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 01001110B  ; Rotate
+        movsd   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [rsi+rcx]
+        shufps  xmm0, xmm0, 01001110B  ; Rotate
+        movsd   xmm1, xmm0
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm1        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm1        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     rcx, 10H
+        movaps  xmm1, [rsi+rcx]
+        shufps  xmm1, xmm1, 01001110B  ; Rotate 
+        movsd   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm0        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_12  1 ; nt
+; Special case: u = 12
+; %1 = 1 if non-temporal store desired
+        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
+        shufps  xmm0, xmm0, 10010011B  ; Rotate left
+%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
+        sub     rcx, 20H
+        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
+        shufps  xmm1, xmm1, 10010011B  ; Rotate left
+        movss   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
+        %ELSE
+        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
+        %ENDIF
+        movaps  xmm0, [rsi+rcx]
+        shufps  xmm0, xmm0, 10010011B  ; Rotate left
+        movss   xmm1, xmm0
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm1        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm1        ; Non-temporal save
+        %ENDIF
+        jnz     %%L1
+                
+        ; Move edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     rcx, 10H
+        movaps  xmm1, [rsi+rcx]
+        shufps  xmm1, xmm1, 10010011B  ; Rotate left
+        movss   xmm0, xmm1
+        %IF %1 == 0
+        movaps  [rdi+rcx], xmm0        ; Save aligned
+        %ELSE
+        movntps [rdi+rcx], xmm0        ; Non-temporal save
+        %ENDIF        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes, unaligned
+        jmp     C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO  MOVE_REVERSE_UNALIGNED_SSSE3  1; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest multiple of 32
+; edx = remaining bytes to move after loop
+        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
+        
+%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
+        movdqa  xmm1, [rsi+rcx-10H]    ; Read next two blocks        
+        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [rdi+rcx-10H], xmm0    ; Save aligned
+        movdqa  xmm0, [rsi+rcx-20H]
+        palignr xmm1, xmm0, %1         ; Combine parts into aligned block
+        movdqa  [rdi+rcx-20H], xmm1    ; Save aligned
+        sub     rcx, 20H
+        jnz     %%L1
+        
+        ; Set up for edx remaining bytes
+        test    dl, 10H
+        jz      %%L2
+        ; One more 16-bytes block to move
+        sub     rcx, 10H
+        movdqa  xmm1, [rsi+rcx]        ; Read next two blocks        
+        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
+        movdqa  [rdi+rcx], xmm0        ; Save aligned
+        
+%%L2:   ; Get src pointer back to misaligned state
+        add     rsi, rax
+        ; Move remaining 0 - 15 bytes
+        jmp     C200
+%ENDMACRO
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the 
+;  number of 16-bytes boundaries inside loops)
+
+align   16
+D104:   MOVE_REVERSE_UNALIGNED_SSE2_4    0
+D108:   MOVE_REVERSE_UNALIGNED_SSE2_8    0
+D10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   0
+D101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   0
+D102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   0
+D103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   0
+D105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   0
+D106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   0
+D107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   0
+D109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   0
+D10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align   16
+E104:   MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108:   MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C:   MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101:   MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102:   MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103:   MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105:   MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106:   MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107:   MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109:   MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A:   MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B:   MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D:   MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E:   MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F:   MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+        
+align   16
+F100:   ; Non-temporal move, src and dest have same alignment.
+        ; Loop. rcx has positive index from the beginning, counting down to zero
+        sub     rcx, 20H
+        movaps  xmm0, [rsi+rcx+10H]
+        movaps  xmm1, [rsi+rcx]
+        movntps [rdi+rcx+10H], xmm0
+        movntps [rdi+rcx], xmm1
+        jnz     F100
+        
+        ; Move the remaining edx bytes (0 - 31):
+        ; move 16-8-4-2-1 bytes, aligned
+        test    dl, 10H
+        jz      C200
+        ; move 16 bytes
+        sub     rcx, 10H
+        movaps  xmm0, [rsi+rcx]
+        movntps  [rdi+rcx], xmm0
+        ; move the remaining 0 - 15 bytes
+        jmp     C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+
+align 16
+F101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   1
+F102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   1
+F103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   1
+F104:   MOVE_REVERSE_UNALIGNED_SSE2_4    1
+F105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   1
+F106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   1
+F107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   1
+F108:   MOVE_REVERSE_UNALIGNED_SSE2_8    1
+F109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   1
+F10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   1
+F10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
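+; The selection below, as a rough C sketch (illustrative only;
+; UnalignedIsFaster and Store256BitIsFaster are the library's own runtime
+; tests, __builtin_cpu_supports is a GCC builtin):
+;
+;   void * best = memmoveSSE2;
+;   if (__builtin_cpu_supports("ssse3")) {
+;       best = memmoveSSSE3;
+;       if (UnalignedIsFaster()) {
+;           best = memmoveU;
+;           if (Store256BitIsFaster()) best = memmoveU256;
+;       }
+;   }
+;   memmoveDispatch = best;
+;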
+memmoveCPUDispatch:   ; CPU dispatcher, check for Suppl-SSE3 instruction set
+        ; This part is executed only once
+        push    rbx
+        push    rcx
+        push    rdx
+        push    rsi
+        push    rdi
+        push    r8        
+
+        ; set CacheBypassLimit to half the size of the largest level cache
+%ifdef  WINDOWS
+        xor     ecx, ecx               ; 0 means default
+%else
+        xor     edi, edi
+%endif
+        call    SetMemcpyCacheLimit@
+        mov     eax, 1
+        cpuid                          ; Get feature flags
+        lea     rbx, [memmoveSSE2@]
+        bt      ecx, 9                 ; Test bit for SupplSSE3
+        jnc     Q100
+        lea     rbx, [memmoveSSSE3@]
+        call    UnalignedIsFaster
+        test    eax, eax
+        jz      Q100
+        lea     rbx, [memmoveU@]
+        call    Store256BitIsFaster
+        test    eax, eax
+        jz      Q100
+        lea     rbx, [memmoveU256@]
+        
+Q100:   ; Insert appropriate pointer
+        mov     [memmoveDispatch], rbx
+        mov     rax, rbx
+        pop     r8
+        pop     rdi
+        pop     rsi
+        pop     rdx
+        pop     rcx
+        pop     rbx
+        ; Jump according to the replaced function pointer
+        jmp     rax
+        
+; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
+SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@:
+        call    SetMemcpyCacheLimit1
+        mov     [CacheBypassLimit], rax
+        ret 
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;    data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The SSE2, Suppl-SSE3, and non-temporal versions of the code each
+; dispatch through their own table:
+; MAlignmentDispatchSSE2, MAlignmentDispatchSSSE3, MAlignmentDispatchNT.
+
+; Code pointer for each alignment for SSE2 instruction set
+MAlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+MAlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+MAlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+memmoveDispatch: DQ memmoveCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DQ 0                 ; must be a qword: read and written with 64-bit moves above
diff --git a/asmlibSrc/memset32.asm b/asmlibSrc/memset32.asm
new file mode 100755
index 0000000..f4d025f
--- /dev/null
+++ b/asmlibSrc/memset32.asm
@@ -0,0 +1,487 @@
+;*************************  memset32.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2013-09-11
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void   SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
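+
+; Typical use from C (an illustrative sketch; declare the prototype as
+; above and link with the library):
+;
+;   char buf[100];
+;   A_memset(buf, 0, sizeof(buf));         /* returns buf */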
+
+global _A_memset: function             ; Function memset
+global ?OVR_memset: function           ; ?OVR removed if standard function memset overridden
+global _GetMemsetCacheLimit: function  ; Data blocks bigger than this will be stored uncached by memset
+global _SetMemsetCacheLimit: function  ; Change limit in GetMemsetCacheLimit
+; Direct entries to CPU-specific versions
+global _memset386:  function           ; version for old CPUs without SSE
+global _memsetSSE2: function           ; SSE2 version
+global _memsetAVX:  function           ; version for CPUs with fast 256-bit store
+
+
+; Imported from cachesize32.asm:
+extern _DataCacheSize                  ; Get size of data cache
+
+; Imported from instrset32.asm
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _Store256BitIsFaster            ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define return from this function
+%MACRO  RETURNM  0
+%IFDEF  POSITIONINDEPENDENT
+        pop     ebx
+%ENDIF
+        mov     eax, [esp+4]           ; return dest
+        ret
+%ENDMACRO
+
+
+SECTION .text  align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+_A_memset:
+?OVR_memset:
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [memsetDispatch] ; Go to appropriate version, depending on instruction set
+RP      equ     0                      ; RP = 0 if not position-independent
+
+%ELSE   ; Position-independent code
+        push    ebx
+        call    get_thunk_ebx          ; get reference point for position-independent code
+RP:                                    ; reference point ebx = offset RP
+
+; Make the following instruction with address relative to RP:
+        jmp     dword [ebx+memsetDispatch-RP]
+
+%ENDIF
+
+_memsetAVX:  ; AVX version. Use ymm register
+%IFDEF POSITIONINDEPENDENT
+        push    ebx
+        call    get_thunk_ebx          ; get reference point for position-independent code
+        add     ebx, RP - $
+memsetAVX@: ; local label
+        mov     edx, [esp+4+4]           ; dest
+        movzx   eax, byte [esp+4+8]      ; c
+        mov     ecx, [esp+4+12]          ; count
+%ELSE
+memsetAVX@: ; local label
+        mov     edx, [esp+4]           ; dest
+        movzx   eax, byte [esp+8]      ; c
+        mov     ecx, [esp+12]          ; count
+%ENDIF        
+        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
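+        ; (e.g. c = 5AH: 5AH * 01010101H = 5A5A5A5AH; in C terms:
+        ;  v = (uint8_t)c * 0x01010101u)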
+        cmp     ecx, 16
+        ja      B100
+        
+B050:   ; count <= 16, both SSE2 and AVX version
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [MemsetJTab+ecx*4]
+%ELSE
+        jmp     dword [MemsetJTab-RP+ebx+ecx*4]
+%ENDIF        
+        
+; Separate code for each count from 0 to 16:
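+; (The table enters at Mnn for count nn and the entries fall through:
+;  e.g. count = 12 enters at M12 and stores dwords at offsets 8, 4 and 0,
+;  so small counts need no loop and no branches.)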
+M16:    mov     [edx+12], eax
+M12:    mov     [edx+8],  eax
+M08:    mov     [edx+4],  eax
+M04:    mov     [edx],    eax
+M00:    RETURNM
+
+M15:    mov     [edx+11], eax
+M11:    mov     [edx+7],  eax
+M07:    mov     [edx+3],  eax
+M03:    mov     [edx+1],  ax
+M01:    mov     [edx],    al
+        RETURNM
+       
+M14:    mov     [edx+10], eax
+M10:    mov     [edx+6],  eax
+M06:    mov     [edx+2],  eax
+M02:    mov     [edx],    ax
+        RETURNM
+
+M13:    mov     [edx+9],  eax
+M09:    mov     [edx+5],  eax
+M05:    mov     [edx+1],  eax
+        mov     [edx],    al
+        RETURNM
+
+align   16
+B100:   ; count > 16.
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0        
+        lea     eax, [edx+ecx]         ; point to end
+        
+        cmp     ecx, 20H
+        jbe     K600                   ; faster to use xmm registers if small
+
+        ; Store the first possibly unaligned 16 bytes
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the subsequent regular part, than to make possibly mispredicted
+        ; branches depending on the size of the first part.
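+        ; (Illustrative C sketch of the idea, where store16 is a
+        ;  hypothetical unaligned 16-byte store:
+        ;     store16(dest, v);                               /* may overlap */
+        ;     store16((char*)(((uintptr_t)dest + 16) & -16), v);
+        ;  after which the aligned part needs no branches.)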
+        movups  oword [edx], xmm0
+        
+        ; store another 16 bytes, aligned        
+        add     edx, 10H
+        and     edx, -10H
+        movaps  oword [edx], xmm0
+        
+        ; go to next 32 bytes boundary
+        add     edx, 10H
+        and     edx, -20H
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     ecx, [_MemsetCacheLimit]        
+%ELSE   ; position-independent code
+        cmp     ecx, [ebx+_MemsetCacheLimit-RP]
+%ENDIF
+        ja      K300                   ; Use non-temporal store if count > MemsetCacheLimit
+        
+        ; find last 32 bytes boundary
+        mov     ecx, eax
+        and     ecx, -20H
+        
+        ; - size of 32-bytes blocks
+        sub     edx, ecx
+        jnb     K200                   ; Jump if not negative
+        
+        ; extend value to 256 bits
+        vinsertf128 ymm0,ymm0,xmm0,1
+        
+K100:   ; Loop through 32-bytes blocks
+        ; ecx = end of 32-bytes blocks part
+        ; edx = negative index from the end, counting up to zero
+        vmovaps [ecx+edx], ymm0
+        add     edx, 20H
+        jnz     K100
+        vzeroupper
+        
+K200:   ; the last part from ecx to eax is < 32 bytes. write last 32 bytes with overlap
+        movups  [eax-20H], xmm0
+        movups  [eax-10H], xmm0
+        RETURNM
+        
+K300:   ; Use non-temporal moves, same code as above:
+
+        ; find last 32 bytes boundary
+        mov     ecx, eax
+        and     ecx, -20H
+
+        ; - size of 32-bytes blocks
+        sub     edx, ecx
+        jnb     K500                   ; Jump if not negative
+        
+        ; extend value to 256 bits
+        vinsertf128 ymm0,ymm0,xmm0,1
+
+align   16        
+K400:   ; Loop through 32-bytes blocks
+        ; ecx = end of 32-bytes blocks part
+        ; edx = negative index from the end, counting up to zero
+        vmovntps [ecx+edx], ymm0
+        add     edx, 20H
+        jnz     K400
+        vzeroupper
+        
+K500:   ; the last part from ecx to eax is < 32 bytes. write last 32 bytes with overlap
+        movups  [eax-20H], xmm0
+        movups  [eax-10H], xmm0
+        RETURNM
+        
+K600:   ; 16 < count <= 32
+        movups  [edx], xmm0
+        movups  [eax-10H], xmm0
+        RETURNM        
+
+
+align 16
+_memsetSSE2:  ; SSE2 version. Use xmm register
+%IFDEF POSITIONINDEPENDENT
+        push    ebx
+        call    get_thunk_ebx          ; get reference point for position-independent code
+        add     ebx, RP - $
+memsetSSE2@: ; local label
+        mov     edx, [esp+4+4]           ; dest
+        movzx   eax, byte [esp+4+8]      ; c
+        mov     ecx, [esp+4+12]          ; count
+%ELSE
+memsetSSE2@: ; local label
+        mov     edx, [esp+4]           ; dest
+        movzx   eax, byte [esp+8]      ; c
+        mov     ecx, [esp+12]          ; count
+%ENDIF        
+        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
+        cmp     ecx, 16
+        jna     B050                   ; small counts: same as AVX version
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
+        
+        ; Store the first unaligned part.
+        ; The size of this part is 1 - 16 bytes.
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the subsequent regular part, than to make possibly mispredicted
+        ; branches depending on the size of the first part.
+        movq    qword [edx],   xmm0
+        movq    qword [edx+8], xmm0
+        
+        ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+        cmp     ecx, [_MemsetCacheLimit]        
+%ELSE   ; position-independent code
+        cmp     ecx, [ebx+_MemsetCacheLimit-RP]
+%ENDIF
+        ja      M500                   ; Use non-temporal store if count > MemsetCacheLimit
+        
+        ; Point to end of regular part:
+        ; Round down dest+count to nearest preceding 16-bytes boundary
+        lea     ecx, [edx+ecx-1]
+        and     ecx, -10H
+        
+        ; Point to start of regular part:
+        ; Round up dest to next 16-bytes boundary
+        add     edx, 10H
+        and     edx, -10H
+        
+        ; -(size of regular part)
+        sub     edx, ecx
+        jnb     M300                   ; Jump if not negative
+
+align 16        
+M200:   ; Loop through regular part
+        ; ecx = end of regular part
+        ; edx = negative index from the end, counting up to zero
+        movdqa  [ecx+edx], xmm0
+        add     edx, 10H
+        jnz     M200
+        
+M300:   ; Do the last irregular part
+        ; The size of this part is 1 - 16 bytes.
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the preceding regular part, than to make possibly mispredicted
+        ; branches depending on the size of the last part.
+%IFDEF  POSITIONINDEPENDENT            ; (ebx is pushed)
+        mov     eax, [esp+4+4]         ; dest
+        mov     ecx, [esp+4+12]        ; count
+%ELSE
+        mov     eax, [esp+4]           ; dest
+        mov     ecx, [esp+12]          ; count
+%ENDIF
+        movq    qword [eax+ecx-10H], xmm0
+        movq    qword [eax+ecx-8], xmm0
+        RETURNM
+   
+M500:   ; Use non-temporal moves, same code as above:
+        ; End of regular part:
+        ; Round down dest+count to nearest preceding 16-bytes boundary
+        lea     ecx, [edx+ecx-1]
+        and     ecx, -10H
+        
+        ; Start of regular part:
+        ; Round up dest to next 16-bytes boundary
+        add     edx, 10H
+        and     edx, -10H
+        
+        ; -(size of regular part)
+        sub     edx, ecx
+        jnb     M700                   ; Jump if not negative
+
+align 16        
+M600:   ; Loop through regular part
+        ; ecx = end of regular part
+        ; edx = negative index from the end, counting up to zero
+        movntdq [ecx+edx], xmm0
+        add     edx, 10H
+        jnz     M600
+        
+M700:   ; Do the last irregular part (same as M300)
+%IFDEF  POSITIONINDEPENDENT            ; (ebx is pushed)
+        mov     eax, [esp+4+4]         ; dest
+        mov     ecx, [esp+4+12]        ; count
+%ELSE
+        mov     eax, [esp+4]           ; dest
+        mov     ecx, [esp+12]          ; count
+%ENDIF
+        movq    qword [eax+ecx-10H], xmm0
+        movq    qword [eax+ecx-8], xmm0
+        RETURNM
+     
+        
+
+_memset386:  ; 80386 version
+%IFDEF POSITIONINDEPENDENT
+        push    ebx
+        call    get_thunk_ebx          ; get reference point for position-independent code
+        add     ebx, RP - $
+memset386@: ; local label
+        mov     edx, [esp+4+4]           ; dest
+        xor     eax, eax
+        mov     al,  byte [esp+4+8]      ; c
+        mov     ecx, [esp+4+12]          ; count
+%ELSE
+memset386@: ; local label
+        mov     edx, [esp+4]           ; dest
+        xor     eax, eax
+        mov     al,  byte [esp+8]      ; c
+        mov     ecx, [esp+12]          ; count
+%ENDIF        
+        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
+        push    edi
+        mov     edi, edx
+        cmp     ecx, 4
+        jb      N400
+N200:   test    edi, 3
+        jz      N300
+        ; unaligned
+N210:   mov     [edi], al              ; store 1 byte until edi aligned
+        inc     edi
+        dec     ecx
+        test    edi, 3
+        jnz     N210
+N300:   ; aligned
+        mov     edx, ecx
+        shr     ecx, 2
+        cld
+        rep     stosd                  ; store 4 bytes at a time
+        mov     ecx, edx
+        and     ecx, 3
+N400:   rep     stosb                  ; store any remaining bytes
+        pop     edi
+        RETURNM
+        
+        
+; CPU dispatching for memset. This is executed only once
+memsetCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        pushad
+        call    GetMemsetCacheLimit@                    ; calculate cache limit
+        call    _InstructionSet                         ; get supported instruction set
+        ; Point to generic version of memset
+        mov     dword [memsetDispatch],  memset386@
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memset
+        mov     dword [memsetDispatch],  memsetSSE2@
+        call    _Store256BitIsFaster                    ; check if 256-bit stores are available and faster
+        test    eax, eax
+        jz      Q100
+        mov     dword [memsetDispatch],  memsetAVX@
+        
+Q100:   popad
+        ; Continue in appropriate version of memset
+        jmp     dword [memsetDispatch]
+
+%ELSE   ; Position-independent version
+        pushad
+        call    GetMemsetCacheLimit@
+        call    _InstructionSet 
+                
+        ; Point to generic version of memset
+        lea     esi, [ebx+memset386@-RP]
+        cmp     eax, 4                 ; check SSE2
+        jb      Q100
+        ; SSE2 supported
+        ; Point to SSE2 version of memset
+        lea     esi, [ebx+memsetSSE2@-RP]
+        call    _Store256BitIsFaster                    ; check if 256-bit stores are available and faster
+        test    eax, eax
+        jz      Q100
+        lea     esi, [ebx+memsetAVX@-RP]
+Q100:   mov     [ebx+memsetDispatch-RP], esi
+        popad
+        ; Continue in appropriate version of memset
+        jmp     [ebx+memsetDispatch-RP]        
+        
+get_thunk_ebx: ; load caller address into ebx for position-independent code
+        mov     ebx, [esp]
+        ret        
+%ENDIF
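+
+; The self-replacing dispatch above, as a C sketch (illustrative only;
+; InstructionSet and Store256BitIsFaster are the library functions used above):
+;   static void * memsetCPUDispatch(void * d, int c, size_t n);
+;   static void * (*memsetDispatch)(void *, int, size_t) = memsetCPUDispatch;
+;   static void * memsetCPUDispatch(void * d, int c, size_t n) {
+;       if (InstructionSet() >= 4)        // SSE2 supported
+;           memsetDispatch = Store256BitIsFaster() ? memsetAVX : memsetSSE2;
+;       else
+;           memsetDispatch = memset386;
+;       return memsetDispatch(d, c, n);   // later calls go straight through
+;   }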
+
+
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+_GetMemsetCacheLimit:
+GetMemsetCacheLimit@:  ; local label
+        push    ebx
+%ifdef  POSITIONINDEPENDENT
+        call    get_thunk_ebx
+        add     ebx, _MemsetCacheLimit - $
+%else
+        mov     ebx, _MemsetCacheLimit
+%endif
+        mov     eax, [ebx]
+        test    eax, eax
+        jnz     U200
+        ; Get half the size of the largest level cache
+        push    0                      ; 0 means largest level cache
+        call    _DataCacheSize         ; get cache size
+        pop     ecx
+        shr     eax, 1                 ; half the size
+        jnz     U100
+        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
+U100:   mov     [ebx], eax
+U200:   pop     ebx
+        ret
+
+; extern "C" void   SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit
+_SetMemsetCacheLimit:
+        push    ebx
+%ifdef  POSITIONINDEPENDENT
+        call    get_thunk_ebx
+        add     ebx, _MemsetCacheLimit - $
+%else
+        mov     ebx, _MemsetCacheLimit
+%endif
+        mov     eax, [esp+8]
+        test    eax, eax
+        jnz     U400
+        ; zero, means default
+        mov     [ebx], eax
+        call    GetMemsetCacheLimit@
+U400:   
+        mov     [ebx], eax
+        pop     ebx
+        ret
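+
+; Usage sketch (illustrative values):
+;   SetMemsetCacheLimit(8 << 20);          // non-temporal stores above 8 MB
+;   size_t lim = GetMemsetCacheLimit();    // read the current limit
+;   SetMemsetCacheLimit(0);                // restore default: half the largest
+;                                          // cache, or 4 MB if size unknown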
+
+
+SECTION .data
+align 16
+
+; Jump table for count from 0 to 16:
+MemsetJTab DD M00, M01, M02, M03, M04, M05, M06, M07
+           DD M08, M09, M10, M11, M12, M13, M14, M15, M16
+
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch DD memsetCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+_MemsetCacheLimit: DD 0
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
diff --git a/asmlibSrc/memset64.asm b/asmlibSrc/memset64.asm
new file mode 100755
index 0000000..6fb0490
--- /dev/null
+++ b/asmlibSrc/memset64.asm
@@ -0,0 +1,368 @@
+;*************************  memset64.asm  *************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2013-08-04
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void   SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned. The AVX version
+; uses YMM registers to set 32 bytes at a time.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
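+
+; Usage sketch (illustrative; A_memset is a drop-in replacement for memset):
+;   char buf[4096];
+;   void * p = A_memset(buf, 0x5A, sizeof(buf));   // fills buf, returns buf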
+
+default rel
+
+global A_memset: function              ; Function memset
+global ?OVR_memset: function           ; ?OVR removed if standard function memset overridden
+global memsetSSE2: function            ; SSE2 version
+global memsetAVX: function             ; version for CPUs with fast 256-bit store
+global GetMemsetCacheLimit: function   ; Data blocks bigger than this will be stored uncached by memset
+global SetMemsetCacheLimit: function   ; Change limit in GetMemsetCacheLimit
+
+; Imported from cachesize64.asm:
+extern DataCacheSize                   ; Get size of data cache
+
+; Imported from unalignedisfaster64.asm:
+extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define prolog for this function
+%MACRO  PROLOGM  0
+%IFDEF  WINDOWS
+%define Rdest   rcx                    ; dest
+        movzx   eax, dl                ; c
+        mov     rdx, r8                ; count
+%define Rcount  rdx                    ; count
+%define Rdest2  r9                     ; copy of dest
+%define Rcount2 r8                     ; copy of count
+
+%ELSE   ; Unix
+%define Rdest   rdi                    ; dest
+        movzx   eax, sil               ; c
+%define Rcount  rdx                    ; count
+%define Rdest2  rcx                    ; copy of dest
+%define Rcount2 rsi                    ; copy of count
+        mov     Rcount2, Rcount        ; copy count
+%ENDIF
+%ENDMACRO
+
+
+SECTION .text  align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+A_memset:
+?OVR_memset:
+        jmp     [memsetDispatch]       ; CPU dispatch table
+        
+memsetAVX:  ; AVX version. Use ymm register
+memsetAVX@: ; local label
+        PROLOGM
+        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
+        mov     Rdest2, Rdest          ; save dest
+        cmp     Rcount, 16
+        ja      B100
+B050:   lea     r10, [MemsetJTab]      ; SSE2 version comes in here
+        jmp     qword [r10+Rcount*8]   ; jump table for small counts
+        
+; Separate code for each count from 0 to 16:
+M16:    mov     [Rdest+12], eax
+M12:    mov     [Rdest+8],  eax
+M08:    mov     [Rdest+4],  eax
+M04:    mov     [Rdest],    eax
+M00:    mov     rax, Rdest2            ; return dest
+        ret
+
+M15:    mov     [Rdest+11], eax
+M11:    mov     [Rdest+7],  eax
+M07:    mov     [Rdest+3],  eax
+M03:    mov     [Rdest+1],  ax
+M01:    mov     [Rdest],    al
+        mov     rax, Rdest2            ; return dest
+        ret
+       
+M14:    mov     [Rdest+10], eax
+M10:    mov     [Rdest+6],  eax
+M06:    mov     [Rdest+2],  eax
+M02:    mov     [Rdest],    ax
+        mov     rax, Rdest2            ; return dest
+        ret
+
+M13:    mov     [Rdest+9],  eax
+M09:    mov     [Rdest+5],  eax
+M05:    mov     [Rdest+1],  eax
+        mov     [Rdest],    al
+        mov     rax, Rdest2            ; return dest
+        ret
+        
+B100:   ; AVX version, Rcount > 16
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
+        
+        lea     rax, [Rdest+Rcount]    ; point to end
+        
+        cmp     Rcount, 20H
+        jbe     K600                   ; faster to use xmm registers if small
+        
+        ; Store the first possibly unaligned 16 bytes
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the subsequent regular part, than to make possibly mispredicted
+        ; branches depending on the size of the first part.
+        movups  oword [Rdest], xmm0
+        
+        ; store another 16 bytes, aligned        
+        add     Rdest, 10H
+        and     Rdest, -10H
+        movaps  oword [Rdest], xmm0
+        
+        ; go to next 32 bytes boundary
+        add     Rdest, 10H
+        and     Rdest, -20H
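+        ; e.g. dest = xxx07H: the unaligned store covered 07H..16H, the
+        ; aligned store 10H..1FH, and Rdest now sits at the 20H boundary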
+        
+        ; Check if count very big
+        cmp     Rcount, [MemsetCacheLimit]        
+        ja      K300                   ; Use non-temporal store if count > MemsetCacheLimit
+        
+        ; find last 32 bytes boundary
+        mov     Rcount, rax
+        and     Rcount, -20H
+        
+        ; - size of 32-bytes blocks
+        sub     Rdest, Rcount
+        jnb     K200                   ; Jump if not negative
+        
+        ; extend value to 256 bits
+        vinsertf128 ymm0,ymm0,xmm0,1
+        
+align   16        
+K100:   ; Loop through 32-bytes blocks. Register use is swapped
+        ; Rcount = end of 32-bytes blocks part
+        ; Rdest = negative index from the end, counting up to zero
+        vmovaps [Rcount+Rdest], ymm0
+        add     Rdest, 20H
+        jnz     K100
+        vzeroupper
+        
+K200:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+        movups  [rax-20H], xmm0
+        movups  [rax-10H], xmm0
+        mov     rax, Rdest2            ; return dest
+        ret
+        
+K300:   ; Use non-temporal moves, same code as above:
+
+        ; find last 32 bytes boundary
+        mov     Rcount, rax
+        and     Rcount, -20H
+        
+        ; - size of 32-bytes blocks
+        sub     Rdest, Rcount
+        jnb     K500                   ; Jump if not negative
+        
+        ; extend value to 256 bits
+        vinsertf128 ymm0,ymm0,xmm0,1
+        
+align   16        
+K400:   ; Loop through 32-bytes blocks. Register use is swapped
+        ; Rcount = end of 32-bytes blocks part
+        ; Rdest = negative index from the end, counting up to zero
+        vmovntps [Rcount+Rdest], ymm0
+        add     Rdest, 20H
+        jnz     K400
+        vzeroupper
+        
+K500:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+        movups  [rax-20H], xmm0
+        movups  [rax-10H], xmm0
+        mov     rax, Rdest2            ; return dest
+        ret
+        
+K600:   ; 16 < count <= 32
+        movups [Rdest], xmm0
+        movups [rax-10H], xmm0
+        mov     rax, Rdest2            ; return dest
+        ret
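+
+; The K100/K400 loops above walk a negative offset up to zero; a C sketch
+; (illustrative, store32 is a stand-in for the 32-byte vector store):
+;   ptrdiff_t i = start - end;              // minus the size of the aligned part
+;   do { store32(end + i); i += 32; } while (i != 0);
+; One register serves as both index and loop counter, and the add sets the
+; flags for the branch.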
+        
+
+memsetSSE2:  ; count > 16. Use SSE2 instruction set
+memsetSSE2@: ; local label
+        PROLOGM
+        imul    eax, 01010101H         ; Broadcast c into all bytes of eax
+        mov     Rdest2, Rdest          ; save dest
+        cmp     Rcount, 16
+        jna     B050
+
+        movd    xmm0, eax
+        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
+        
+        ; Store the first unaligned part.
+        ; The size of this part is 1 - 16 bytes.
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the subsequent regular part, than to make possibly mispredicted
+        ; branches depending on the size of the first part.
+        movq    qword [Rdest],   xmm0
+        movq    qword [Rdest+8], xmm0
+        
+        ; Check if count very big
+M150:   mov     rax, [MemsetCacheLimit]        
+        cmp     Rcount, rax
+        ja      M500                   ; Use non-temporal store if count > MemsetCacheLimit
+        
+        ; Point to end of regular part:
+        ; Round down dest+count to nearest preceding 16-bytes boundary
+        lea     Rcount, [Rdest+Rcount-1]
+        and     Rcount, -10H
+        
+        ; Point to start of regular part:
+        ; Round up dest to next 16-bytes boundary
+        add     Rdest, 10H
+        and     Rdest, -10H
+        
+        ; -(size of regular part)
+        sub     Rdest, Rcount
+        jnb     M300                   ; Jump if not negative
+        
+align 16
+M200:   ; Loop through regular part
+        ; Rcount = end of regular part
+        ; Rdest = negative index from the end, counting up to zero
+        movdqa  [Rcount+Rdest], xmm0
+        add     Rdest, 10H
+        jnz     M200
+        
+M300:   ; Do the last irregular part
+        ; The size of this part is 1 - 16 bytes.
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the preceding regular part, than to make possibly mispredicted
+        ; branches depending on the size of the last part.
+        mov     rax, Rdest2                          ; dest
+        movq    qword [rax+Rcount2-10H], xmm0
+        movq    qword [rax+Rcount2-8], xmm0
+        ret
+
+        
+M500:   ; Use non-temporal moves, same code as above:
+        ; End of regular part:
+        ; Round down dest+count to nearest preceding 16-bytes boundary
+        lea     Rcount, [Rdest+Rcount-1]
+        and     Rcount, -10H
+        
+        ; Start of regular part:
+        ; Round up dest to next 16-bytes boundary
+        add     Rdest, 10H
+        and     Rdest, -10H
+        
+        ; -(size of regular part)
+        sub     Rdest, Rcount
+        jnb     M700                   ; Jump if not negative
+
+align 16        
+M600:   ; Loop through regular part
+        ; Rcount = end of regular part
+        ; Rdest = negative index from the end, counting up to zero
+        movntdq [Rcount+Rdest], xmm0
+        add     Rdest, 10H
+        jnz     M600
+        
+M700:   ; Do the last irregular part
+        ; The size of this part is 1 - 16 bytes.
+        ; It is faster to always write 16 bytes, possibly overlapping
+        ; with the preceding regular part, than to make possibly mispredicted
+        ; branches depending on the size of the last part.
+        mov     rax, Rdest2            ; dest
+        movq    qword [rax+Rcount2-10H], xmm0
+        movq    qword [rax+Rcount2-8], xmm0
+        ret
+        
+        
+memsetCPUDispatch:    ; CPU dispatcher, check for instruction sets and which method is fastest        
+        ; This part is executed only once
+        push    rbx
+        push    rcx
+        push    rdx
+        push    rsi
+        push    rdi
+        push    r8
+        ; set MemsetCacheLimit to half the size of the largest level cache
+        call    GetMemsetCacheLimit@
+        lea     rbx, [memsetSSE2@]
+        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
+        test    eax, eax
+        jz      Q100
+        lea     rbx, [memsetAVX@]
+Q100:
+        ; Insert appropriate pointer
+        mov     [memsetDispatch], rbx
+        mov     rax, rbx
+        pop     r8
+        pop     rdi
+        pop     rsi
+        pop     rdx
+        pop     rcx
+        pop     rbx
+        ; Jump according to the replaced function pointer
+        jmp     rax
+
+        
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+GetMemsetCacheLimit:
+GetMemsetCacheLimit@:
+        mov     rax, [MemsetCacheLimit]
+        test    rax, rax
+        jnz     U200
+        ; Get half the size of the largest level cache
+%ifdef  WINDOWS
+        xor     ecx, ecx               ; 0 means largest level cache
+%else
+        xor     edi, edi               ; 0 means largest level cache
+%endif
+        call    DataCacheSize          ; get cache size
+        shr     eax, 1                 ; half the size
+        jnz     U100
+        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
+U100:   mov     [MemsetCacheLimit], eax
+U200:   ret
+
+; extern "C" void   SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit
+SetMemsetCacheLimit:
+%ifdef  WINDOWS
+        mov     rax, rcx
+%else
+        mov     rax, rdi
+%endif
+        test    rax, rax
+        jnz     U400
+        ; zero, means default
+        mov     [MemsetCacheLimit], rax
+        call    GetMemsetCacheLimit@
+U400:   mov     [MemsetCacheLimit], rax
+        ret
+        
+   
+SECTION .data
+align 16
+; Jump table for count from 0 to 16:
+MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
+           DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
+           
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch: DQ memsetCPUDispatch           
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+MemsetCacheLimit: DQ 0
diff --git a/asmlibSrc/mersenne32.asm b/asmlibSrc/mersenne32.asm
new file mode 100755
index 0000000..ed1a100
--- /dev/null
+++ b/asmlibSrc/mersenne32.asm
@@ -0,0 +1,821 @@
+; ----------------------------- MERSENNE32.ASM ---------------------------
+; Author:           Agner Fog
+; Date created:     1998
+; Last modified:    2013-09-13
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+;  This random number generator is described in the article by
+;  M. Matsumoto & T. Nishimura, in:
+;  ACM Transactions on Modeling and Computer Simulation,
+;  vol. 8, no. 1, 1998, pp. 3-30. See also:
+;  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+;  Initialization:
+;  MersRandomInit must be called before the first call to any of the other
+;  random number functions. The seed is any 32-bit integer.
+;  You may use MersRandomInitByArray instead if you want more
+;  than 32 bits for seed. length is the number of integers in seeds[].
+;  length must be > 0, there is no upper limit for length.
+;
+;  Generating random numbers:
+;  MersRandom returns a floating point number in the interval 0 <= x < 1 with
+;  a resolution of 32 bits.
+;  MersIRandom returns an integer in the interval defined by min and max with
+;  a resolution of 32 bits.
+;  MersIRandomX returns an integer in the interval defined by min and max with
+;  exactly equal probabilities of all values in the interval.
+;  MersBRandom returns 32 random bits.
+;
+;  Error conditions:
+;  If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+;  and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+;  MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+;  C++ prototypes in randoma.h, 32-bit Windows:
+;
+;  Thread-safe static link versions for Mersenne Twister
+;  extern "C" void   MersRandomInit(void * Pthis, int seed);         // Re-seed
+;  extern "C" void   MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+;  extern "C" int    MersIRandom (void * Pthis, int min, int max);   // Output random integer
+;  extern "C" int    MersIRandomX(void * Pthis, int min, int max);   // Output random integer, exact
+;  extern "C" double MersRandom(void * Pthis);                       // Output random float
+;  extern "C" unsigned int MersBRandom(void * Pthis);                // Output random bits
+;
+;  Single-threaded static link versions for Mersenne Twister, Windows only
+;  extern "C" void   MersenneRandomInit(int seed);                   // Re-seed
+;  extern "C" void   MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+;  extern "C" int    MersenneIRandom (int min, int max);             // Output random integer
+;  extern "C" int    MersenneIRandomX(int min, int max);             // Output random integer, exact
+;  extern "C" double MersenneRandom();                               // Output random float
+;  extern "C" unsigned int MersenneBRandom();                        // Output random bits
+;
+;  Single threaded dynamic link versions for Mersenne Twister, Windows only
+;  extern "C" void   __stdcall MersenneRandomInitD(int seed);              // Re-seed
+;  extern "C" void   __stdcall MersenneRandomInitByArrayD(unsigned int seeds[], int length); // Seed by more than 32 bits
+;  extern "C" int    __stdcall MersenneIRandomD (int min, int max);  // Output random integer
+;  extern "C" int    __stdcall MersenneIRandomXD(int min, int max);  // Output random integer, exact
+;  extern "C" double __stdcall MersenneRandomD();                    // Output random float
+;  extern "C" unsigned int __stdcall MersenneBRandomD();             // Output random bits
+;
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
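+
+;  Usage sketch for the thread-safe versions (illustrative only; randoma.h
+;  defines the real interface, and 'store' must be big enough for the
+;  CRandomMersenneA state plus 16 bytes of alignment slack):
+;    static char store[5000];
+;    MersRandomInit(store, 1234);               // seed
+;    int    i = MersIRandom (store, 1, 6);      // 1..6, 32-bit resolution
+;    int    j = MersIRandomX(store, 1, 6);      // 1..6, exactly uniform
+;    double x = MersRandom  (store);            // 0 <= x < 1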
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global _MersRandomInit, _MersRandomInitByArray
+global _MersBRandom, _MersRandom, _MersIRandom, _MersIRandomX
+global _MersenneRandomInitByArray,_MersenneRandomInit
+global _MersenneRandom, _MersenneIRandom, _MersenneIRandomX, _MersenneBRandom
+%IFDEF WINDOWS
+global _MersenneRandomInitByArrayD@8, _MersenneRandomInitD@4
+global _MersenneRandomD@0, _MersenneIRandomD@8, _MersenneIRandomXD@8, _MersenneBRandomD@0
+%ENDIF
+
+
+SECTION .data
+align 16
+; Data for single instance of random number generator
+MersenneInstance: 
+ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE ALIGN=16
+
+extern _InstructionSet
+
+
+; ---------------------------------------------------------------
+;  Thread-safe static link versions for Mersenne Twister
+; ---------------------------------------------------------------
+
+;  extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+
+_MersRandomInit: ; PROC NEAR
+        mov     ecx, [esp+4]                               ; Pthis
+        mov     eax, [esp+8]                               ; seed
+        and     ecx, -16                                   ; align buffer
+        
+MersRandomInit_reg:                                        ; Entry for register parameters, used internally
+        call    Mers_init0                                 ; initialize mt buffer with seeds
+        
+        ; Number of premade numbers that are lost in the initialization when the  
+        ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3        
+   PREMADELOST equ (MERS_N & 3)
+%ELSE
+   PREMADELOST equ 4
+%ENDIF
+        ; We want the C++ and the assembly implementation to give exactly the same
+        ; sequence. The C++ version discards 37 random numbers after initialization.
+        ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+        ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
+        ; SSE2 is supported, otherwise 37 + 1.
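+        ; E.g. for MT19937 (MERS_N = 624, divisible by 4): PREMADELOST = 4,
+        ; so the SSE2 path discards 37+4+1 = 42 numbers and the generic path
+        ; discards 37+1 = 38.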
+        
+        push    edi
+        mov     edi, 37+PREMADELOST+1
+        cmp     dword [ecx+CRandomMersenneA.Instset], 4    ; can we use XMM registers and SSE2 ?
+        jae     M110
+        sub     edi, PREMADELOST       ; SSE2 not supported
+        mov     dword [ecx+CRandomMersenneA.PreInx], 0     ; reset index to premade list
+M110:   ; loop
+M120:   call    MersBRandom_reg
+        dec     edi
+        jnz     M120
+        pop     edi
+        ret
+;_MersRandomInit ENDP
+        
+
+Mers_init0:                                                ; make random seeds from eax and put them into MT buffer
+; Input parameters: 
+; eax: seed
+; ecx points to CRandomMersenneA
+
+        push    ebx
+        push    edi
+        mov     ebx, eax                                   ; seed
+        
+        ; clear my buffer
+        push    ecx
+        mov     edi, ecx                                   ; Pthis
+        add     edi, 16                                    ; skip alignment filler
+        mov     ecx, (MersenneSize - 16) / 4
+        xor     eax, eax
+        cld
+        rep     stosd
+        pop     ecx                                        ; Pthis
+        
+        ; initialize CRandomMersenneA structure
+        mov     dword [ecx+CRandomMersenneA.PreInx], 4*4
+        push    ecx
+        call    _InstructionSet                            ; detect instruction set
+        pop     ecx
+        mov     [ecx+CRandomMersenneA.Instset], eax
+        mov     eax, MERS_B
+        mov     [ecx+CRandomMersenneA.TMB], eax
+        mov     [ecx+CRandomMersenneA.TMB+4], eax
+        mov     [ecx+CRandomMersenneA.TMB+8], eax
+        mov     [ecx+CRandomMersenneA.TMB+12], eax
+        mov     eax, MERS_C
+        mov     [ecx+CRandomMersenneA.TMC], eax
+        mov     [ecx+CRandomMersenneA.TMC+4], eax
+        mov     [ecx+CRandomMersenneA.TMC+8], eax
+        mov     [ecx+CRandomMersenneA.TMC+12], eax
+        mov     eax, 3FF00000H                             ; upper dword of 1.0, double precision
+        mov     dword [ecx+CRandomMersenneA.one+4], eax
+        mov     dword [ecx+CRandomMersenneA.one+12], eax        
+        mov     dword [ecx+CRandomMersenneA.LMASK], LOWER_MASK
+        mov     dword [ecx+CRandomMersenneA.UMASK], UPPER_MASK
+        mov     dword [ecx+CRandomMersenneA.MATA],  MERS_A
+
+        ; put random numbers into MT buffer
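+        ; C equivalent of the M210 loop below (illustrative):
+        ;   for (i = 0; i < MERS_N; i++) {
+        ;       mt[i] = seed;
+        ;       seed = 1812433253 * (seed ^ (seed >> 30)) + i + 1;
+        ;   }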
+        xor     edi, edi        
+M210:   mov     [ecx+edi*4+CRandomMersenneA.MT], ebx
+        mov     edx, ebx
+        shr     ebx, 30
+        xor     ebx, edx
+        imul    ebx, 1812433253
+        inc     edi
+        add     ebx, edi        
+        cmp     edi, MERS_N
+        jb      M210
+        
+        ; Set index MTI to end of list, (scaled by 4)
+        ; Round up to multiple of 4 to avoid alignment error
+        mov     dword [ecx+CRandomMersenneA.MTI], ((MERS_N+3) & -4) * 4
+        
+        pop     edi
+        pop     ebx       
+        ret      
+;Mers_init0   ENDP
+
+
+;  extern "C" void   MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersRandomInitByArray: ; PROC NEAR
+
+        push    ebx
+        push    esi
+        push    edi
+        push    ebp
+        mov     ecx, [esp+20]                              ; Pthis
+        mov     ebx, [esp+24]                              ; seeds
+        mov     ebp, [esp+28]                              ; length
+        and     ecx, -16                                   ; align buffer
+        
+MersRandomInitByArray_reg:                                 ; Entry for register parameters, used internally
+
+        push    ebp                                        ; save length
+        mov     eax, 19650218
+        call    Mers_init0                                 ; init0(19650218);
+        
+        test    ebp, ebp
+        jle     M380                                       ; error: length <= 0
+        xor     edi, edi                                   ; j = 0
+        lea     esi, [edi+1]                               ; i = 1
+        cmp     ebp, MERS_N
+        ja      M310
+        mov     ebp, MERS_N                                ; k = max (MERS_N,length)
+M310:                                                    
+
+        ; for (; k; k--) {
+M320:   mov     eax, [ecx+esi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
+        imul    eax, 1664525                               ; * 1664525
+        xor     eax, [ecx+esi*4+CRandomMersenneA.MT]       ; ^ mt[i]
+        add     eax, [ebx+edi*4]                           ; + seeds[j]
+        add     eax, edi                                   ; + j
+        mov     [ecx+esi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
+        inc     esi                                        ; i++
+        inc     edi                                        ; j++
+        cmp     esi, MERS_N
+        jb      M330                                       ; if (i>=MERS_N)
+        mov     eax, [ecx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+        mov     [ecx+CRandomMersenneA.MT], eax
+        mov     esi, 1                                     ; i=1;
+M330:
+        cmp     edi, [esp]                                 ; length
+        jb      M340          ; if (j>=length)
+        xor     edi, edi                                   ; j = 0;
+M340:
+        dec     ebp                                        ; k--
+        jnz     M320                                       ; first k loop
+M350:
+        mov     ebp, MERS_N-1                              ; k
+M360:   mov     eax, [ecx+esi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
+        imul    eax, 1566083941                            ; * 1566083941
+        xor     eax, [ecx+esi*4+CRandomMersenneA.MT]       ; ^ mt[i]
+        sub     eax, esi                                   ; - i
+        mov     [ecx+esi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
+        inc     esi                                        ; i++
+        cmp     esi, MERS_N
+        jb      M370                                       ; if (i>=MERS_N)
+        mov     eax, [ecx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+        mov     [ecx+CRandomMersenneA.MT], eax
+        mov     esi, 1                                     ; i=1;
+M370:
+        dec     ebp                                        ; k--
+        jnz     M360                                       ; second k loop
+        mov     dword [ecx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+        mov     dword [ecx+CRandomMersenneA.MTI], 0
+        mov     dword [ecx+CRandomMersenneA.PreInx], 0
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+        mov     edi, MERS_N + PREMADELOST+1
+        CMP     dword [ecx+CRandomMersenneA.Instset], 4    ; can we use XMM registers and SSE2 ?
+        jae     M390
+        sub     edi, PREMADELOST                           ; SSE2 not supported
+        mov     dword [ecx+CRandomMersenneA.PreInx], 0     ; reset index to premade list
+M390:   ; loop
+M391:   call    MersBRandom_reg
+        dec     edi
+        jnz     M391
+
+        pop     ecx                                        ; remove local copy of length
+        pop     ebp                                        ; restore registers
+        pop     edi
+        pop     esi
+        pop     ebx
+        ret
+;_MersRandomInitByArray ENDP
+
+;  extern "C" unsigned int MersBRandom(void * Pthis);      // Output random bits
+
+_MersBRandom: ; PROC NEAR                                  ; generate random bits
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align buffer
+
+MersBRandom_reg:                                           ; Entry for register parameters, used internally
+
+        cmp     dword [ecx+CRandomMersenneA.Instset], 4    ; can we use XMM registers and SSE2 ?
+        jb      M500
+
+        ; this version uses XMM registers and SSE2 instructions:
+        mov     edx, [ecx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        mov     eax, [ecx+edx*1+CRandomMersenneA.PreInt]   ; fetch premade random number
+        add     edx, 4
+        mov     [ecx+CRandomMersenneA.PreInx], edx
+        cmp     edx, 4*4
+        jnb     M410
+        ret                                                ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+        mov     edx, [ecx+CRandomMersenneA.MTI]            ; fetch 4 numbers from MT buffer
+        movdqa  xmm0, oword [ecx+edx*1+CRandomMersenneA.MT]
+        
+%IF TEMPERING                                              ; optional tempering algorithm
+        movdqa  xmm1, xmm0
+        psrld   xmm0, MERS_U
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        pslld   xmm0, MERS_S
+        pand    xmm0, oword [ecx+CRandomMersenneA.TMB]
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        pslld   xmm0, MERS_T
+        pand    xmm0, oword [ecx+CRandomMersenneA.TMC]
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        psrld   xmm0, MERS_L
+        pxor    xmm0, xmm1
+%ENDIF   ; tempering
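+        ; (The above is the standard MT tempering, done 4-wide; in C:
+        ;    y ^= y >> MERS_U;
+        ;    y ^= (y << MERS_S) & MERS_B;
+        ;    y ^= (y << MERS_T) & MERS_C;
+        ;    y ^= y >> MERS_L;  )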
+
+        ; save four premade integers
+        movdqa  oword [ecx+CRandomMersenneA.PreInt], xmm0
+        ; premake four floating point numbers
+        pxor    xmm1, xmm1
+        pxor    xmm2, xmm2
+        punpckldq xmm1, xmm0                               ; get first two numbers into bits 32-63 and 96-127
+        punpckhdq xmm2, xmm0                               ; get next  two numbers into bits 32-63 and 96-127
+        psrlq   xmm1, 12                                   ; get bits into mantissa position
+        psrlq   xmm2, 12                                   ; get bits into mantissa position
+        por     xmm1, oword [ecx+CRandomMersenneA.one]     ; set exponent for interval [1,2)
+        por     xmm2, oword [ecx+CRandomMersenneA.one]     ; set exponent for interval [1,2)
+        movdqa  oword [ecx+CRandomMersenneA.PreFlt], xmm1  ; store two premade numbers
+        movdqa  oword [ecx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers        
+        mov     dword [ecx+CRandomMersenneA.PreInx], 0     ; index to premade numbers 
+        add     edx, 4*4                                   ; increment MTI index into MT buffer by 4
+        mov     [ecx+CRandomMersenneA.MTI], edx
+        cmp     edx, MERS_N*4
+        jae     M420
+        ret                                                ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420:                                                      ; eax is the random number to return
+%IF     MERS_N & 3                                         ; if MERS_N is not divisible by 4
+%ASSIGN NVALID MERS_N & 3                          ; only NVALID of the 4 premade numbers are valid
+        ; Move premade numbers (4-NVALID) positions forward
+        movdqa  xmm0, [ecx+CRandomMersenneA.PreInt]
+        movdqa  xmm1, [ecx+CRandomMersenneA.PreFlt]
+        movdqa  xmm2, [ecx+CRandomMersenneA.PreFlt+16]
+        movdqu  [ecx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+        movdqu  [ecx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3        
+        movq    [ecx+CRandomMersenneA.PreFlt+16 + 8], xmm2
+%ENDIF        
+        ; save index to first valid premade number
+        mov     dword [ecx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+        
+        ; MT buffer is empty. Fill it up
+        push    ebx
+        movd    xmm3, [ecx+CRandomMersenneA.UMASK]         ; load constants
+        movd    xmm4, [ecx+CRandomMersenneA.LMASK]
+        movd    xmm5, [ecx+CRandomMersenneA.MATA]
+        pshufd  xmm3, xmm3, 0                              ; broadcast constants
+        pshufd  xmm4, xmm4, 0
+        pshufd  xmm5, xmm5, 0
+        xor     ebx,  ebx                                  ; kk = 0
+        mov     edx,  MERS_M*4                             ; km
+        
+; change ecx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+        add     ecx, CRandomMersenneA.MT
+
+M430:   ; kk loop
+        movdqa  xmm2, oword [ecx+ebx]                      ; mt[kk]
+        movd    xmm6, [ecx+ebx+16]
+        movdqa  xmm1, oword [ecx+ebx]                      ; mt[kk]        
+        movss   xmm2, xmm6                                 ; faster than movdqu xmm2, [ecx+ebx+4] ?
+        pshufd  xmm2, xmm2, 00111001B                      ; mt[kk+1]        
+        movdqu  xmm0, oword [ecx+edx]                      ; mt[km]
+        ;movq   xmm0, qword [ecx+edx]                      ; mt[km]
+        ;movhps xmm0, qword [ecx+edx+8]                    ; this is faster than movdqu on older processors
+        pand    xmm1, xmm3                                 ; mt[kk] & UPPER_MASK
+        pand    xmm2, xmm4                                 ; mt[kk+1] & LOWER_MASK
+        por     xmm1, xmm2                                 ; y        
+        movdqa  xmm2, xmm1                                 ; y
+        pslld   xmm1, 31                                   ; copy bit 0 into all bits
+        psrad   xmm1, 31                                   ; -(y & 1)
+        pand    xmm1, xmm5                                 ; & MERS_A
+        psrld   xmm2, 1                                    ; y >> 1
+        pxor    xmm0, xmm1
+        pxor    xmm0, xmm2
+        movdqa  oword [ecx+ebx], xmm0                      ; result into mt[kk]
+        cmp     ebx, (MERS_N-4)*4
+        jae     M440                                       ; exit loop when kk past end of buffer
+        add     ebx, 16                                    ; kk += 4
+        add     edx, 16                                    ; km += 4
+        cmp     edx, (MERS_N-4)*4
+        jbe     M430                                       ; skip unless km wraparound
+        sub     edx, MERS_N*4                              ; km wraparound
+        movdqu  xmm0, oword [ecx+(MERS_N-4)*4]             ; copy end to before begin for km wraparound
+        movdqa  oword [ecx-4*4], xmm0        
+        movdqa  xmm0, oword [ecx]                          ; copy begin to after end for kk wraparound
+        movdqu  oword [ecx+MERS_N*4], xmm0
+        jmp     M430
+
+M440:   ; loop finished. discard excess part of last result
+
+; change ecx back to pointing to CRandomMersenneA
+        sub     ecx, CRandomMersenneA.MT        
+        mov     dword [ecx+CRandomMersenneA.MTI], 0
+        pop     ebx
+        ret                                                ; random number is still in eax
+        
+; Generic version        
+; this version is for old processors without XMM support:
+M500:
+        mov     edx, [ecx+CRandomMersenneA.MTI]
+        cmp     edx, MERS_N*4
+        jnb     short M520                                 ; buffer is empty, fill it   
+M510:   mov     eax, [ecx+edx*1+CRandomMersenneA.MT]
+        add     edx, 4
+        mov     [ecx+CRandomMersenneA.MTI], edx
+        
+%IF TEMPERING   
+        mov     edx, eax
+        shr     eax, MERS_U
+        xor     eax, edx
+        mov     edx, eax
+        shl     eax, MERS_S
+        and     eax, MERS_B
+        xor     eax, edx
+        mov     edx, eax
+        shl     eax, MERS_T
+        and     eax, MERS_C
+        xor     eax, edx
+        mov     edx, eax
+        shr     eax, MERS_L
+        xor     eax, edx
+%ENDIF   ; tempering
+
+        mov     edx, [ecx+CRandomMersenneA.PreInt]         ; previously premade number
+        mov     [ecx+CRandomMersenneA.PreInt], eax         ; store number for next call
+        shl     eax, 20                                    ; convert to float
+        mov     dword [ecx+CRandomMersenneA.PreFlt], eax
+        mov     eax, [ecx+CRandomMersenneA.PreInt]
+        shr     eax, 12
+        or      eax, 3FF00000H
+        mov     dword [ecx+CRandomMersenneA.PreFlt+4], eax
+        mov     eax, edx                                   ; return value is premade integer
+        ret
+
+        ; fill buffer with random numbers
+M520:   push    ebx
+        push    esi
+        xor     esi, esi                                   ; kk
+        mov     ebx, MERS_M*4                              ; km
+; change ecx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+        add     ecx, CRandomMersenneA.MT
+        
+        ; kk loop
+M530:   mov     eax, [ecx+esi]
+        mov     edx, [ecx+esi+4]
+        and     eax, UPPER_MASK
+        and     edx, LOWER_MASK
+        or      eax, edx
+        shr     eax, 1
+        sbb     edx, edx
+        and     edx, MERS_A
+        xor     eax, edx
+        xor     eax, [ecx+ebx]
+        mov     [ecx+esi], eax
+        add     ebx, 4
+        cmp     ebx, MERS_N*4
+        jb      short M540
+        ; copy begin of table to after end to simplify kk+1 wraparound
+        mov     eax, [ecx]
+        mov     [ecx+ebx], eax 
+        xor     ebx, ebx
+M540:   add     esi, 4
+        cmp     esi, MERS_N*4
+        jb      M530                                       ; loop end        
+        
+; change ecx back to pointing to CRandomMersenneA
+        sub     ecx, CRandomMersenneA.MT        
+        xor     edx, edx
+        mov     [ecx+CRandomMersenneA.MTI], edx
+        pop     esi
+        pop     ebx
+        jmp     M510        
+        
+;_MersBRandom ENDP
+
+;  extern "C" double MersRandom(void * Pthis); // Output random float
+
+_MersRandom:; PROC NEAR                                    ; generate random float with 32 bits resolution
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align buffer
+        mov     edx, [ecx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        fld     qword [ecx+edx*2+CRandomMersenneA.PreFlt]  ; fetch premade floating point random number
+        fsub    qword [ecx+CRandomMersenneA.one]           ; subtract 1.0
+        jmp     MersBRandom_reg                            ; random bits
+;_MersRandom ENDP
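+
+; The premade floats use the usual exponent trick; in C (illustrative):
+;   uint64_t u = 0x3FF0000000000000ULL | ((uint64_t)bits << 20);
+;   double d;  memcpy(&d, &u, 8);          // 1.0 <= d < 2.0, 32-bit resolution
+;   return d - 1.0;                        // 0 <= result < 1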
+
+
+;  extern "C" int MersIRandom (void * Pthis, int min, int max);  // Output random integer
+
+_MersIRandom: ; PROC   NEAR
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align buffer
+        call    MersBRandom_reg                            ; random bits
+        mov     edx, [esp+12]                              ; max
+        mov     ecx, [esp+8]                               ; min
+        sub     edx, ecx
+        js      short M720                                 ; max < min
+        add     edx, 1                                     ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret
+M720:   mov     eax, 80000000H                             ; error exit
+        ret
+;_MersIRandom ENDP
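+
+; C sketch of the truncation method above (illustrative):
+;   return (int)(((uint64_t)BRandom() * (uint32_t)(max - min + 1)) >> 32) + min;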
+
+
+;  extern "C" int MersIRandomX (void * Pthis, int min, int max);        // Output random integer
+
+_MersIRandomX: ; PROC   NEAR
+        push    edi
+        mov     ecx, [esp+8]                               ; Pthis
+        mov     edx, [esp+12]                              ; min
+        mov     edi, [esp+16]                              ; max
+        and     ecx, -16                                   ; align buffer
+        sub     edi, edx                                   ; max - min
+        jle     short M830                                 ; max <= min (signed)
+        inc     edi                                        ; interval = max - min + 1
+        
+        ; if (interval != LastInterval) {
+        cmp     edi, [ecx+CRandomMersenneA.LastInterval]
+        je      M810
+        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+        xor     eax, eax                                   ; 0
+        lea     edx, [eax+1]                               ; 1
+        div     edi                                        ; (would give overflow if interval = 1)
+        mul     edi
+        dec     eax
+        mov     [ecx+CRandomMersenneA.RLimit], eax
+        mov     [ecx+CRandomMersenneA.LastInterval], edi
+M810:
+M820:   ; do { // Rejection loop
+        call    MersBRandom_reg                            ; random bits (ecx is preserved)
+        ; longran  = (uint64)BRandom() * interval;
+        mul     edi
+        ; } while (remainder > RLimit);
+        cmp     eax, [ecx+CRandomMersenneA.RLimit]
+        ja      M820
+        
+        ; return (int32)iran + min
+        mov     eax, [esp+12]                              ; min
+        add     eax, edx
+        pop     edi
+        ret
+        
+M830:   jl      M840
+        ; max = min. Return min
+        mov     eax, edx
+        pop     edi
+        ret                                                ; max = min exit
+        
+M840:   ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     edi
+        ret
+;_MersIRandomX ENDP
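+
+; Why the rejection loop gives exactly equal probabilities (illustrative C):
+;   uint64_t p = (uint64_t)BRandom() * interval;     // edx:eax above
+;   while ((uint32_t)p > RLimit)                     // reject and redraw
+;       p = (uint64_t)BRandom() * interval;
+;   return (int)(p >> 32) + min;
+; RLimit is chosen so that every outcome keeps exactly floor(2^32 / interval)
+; of the 2^32 possible random values.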
+
+
+; -------------------------------------------------------------------------
+;  Single-threaded static link versions of Mersenne Twister
+; -------------------------------------------------------------------------
+
+%IFDEF POSITIONINDEPENDENT
+; Get ecx = eip for self-relative addressing
+GetThunkECX:
+        mov     ecx, [esp]
+        ret
+        
+; Get address of MersenneInstance into ecx, position independent
+; This works only in YASM, not in NASM:
+%macro GetMersenneInstanceAddress  0
+        call    GetThunkECX
+        add     ecx, MersenneInstance - $
+%endmacro
+
+%ELSE
+
+; Get address of MersenneInstance into ecx, position dependent
+; This works only in YASM, not in NASM:
+%macro GetMersenneInstanceAddress  0
+        mov     ecx, MersenneInstance
+%endmacro
+
+%ENDIF
+
+;  extern "C" void   MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersenneRandomInitByArray: ; PROC NEAR
+        push    ebx
+        push    esi
+        push    edi
+        push    ebp
+        mov     ebx, [esp+20]                              ; seeds
+        mov     ebp, [esp+24]                              ; length
+        GetMersenneInstanceAddress                         ; Macro different for position-dependent and -independent version
+        jmp     MersRandomInitByArray_reg                  ; jump to function in mersenne32.asm
+;_MersenneRandomInitByArray ENDP        
+
+
+;  extern "C" void   MersenneRandomInit(int seed);  // Re-seed
+_MersenneRandomInit: ; PROC NEAR
+        mov     eax, [esp+4]                               ; seed
+        GetMersenneInstanceAddress
+        jmp     MersRandomInit_reg                         ; jump to function in mersenne32.asm
+;_MersenneRandomInit ENDP
+
+
+;  extern "C" double MersenneRandom(); // Output random float
+_MersenneRandom: ; PROC NEAR                               ; generate random float with 32 bits resolution
+        GetMersenneInstanceAddress
+        mov     edx, [ecx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        fld     qword [ecx+edx*2+CRandomMersenneA.PreFlt]  ; fetch premade floating point random number
+        fsub    qword [ecx+CRandomMersenneA.one]           ; subtract 1.0
+        jmp     MersBRandom_reg                            ; random bits
+;_MersenneRandom ENDP
+
+
+;  extern "C" int MersenneIRandom (int min, int max); // Output random integer
+_MersenneIRandom: ; PROC   NEAR
+        GetMersenneInstanceAddress
+        call    MersBRandom_reg                            ; random bits
+        mov     edx, [esp+8]                               ; max
+        mov     ecx, [esp+4]                               ; min
+        sub     edx, ecx
+        js      short S410                                 ; max < min
+        add     edx, 1                                     ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret
+S410:   mov     eax, 80000000H                             ; error exit
+        ret
+;_MersenneIRandom ENDP
+
+
+;  extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+
+_MersenneIRandomX: ; PROC   NEAR
+        push    edi
+        GetMersenneInstanceAddress
+        mov     edx, [esp+8]                               ; min
+        mov     edi, [esp+12]                              ; max
+        sub     edi, edx                                   ; max - min
+        jle     short S530                                 ; max <= min (signed)
+        inc     edi                                        ; interval = max - min + 1
+        cmp     edi, [ecx+CRandomMersenneA.LastInterval]
+        je      S510
+        xor     eax, eax                                   ; 0
+        lea     edx, [eax+1]                               ; 1
+        div     edi                                        ; (would give overflow if interval = 1)
+        mul     edi
+        dec     eax
+        mov     [ecx+CRandomMersenneA.RLimit], eax
+        mov     [ecx+CRandomMersenneA.LastInterval], edi
+S510:
+S520:   call    MersBRandom_reg                            ; random bits (ecx is preserved)
+        mul     edi
+        cmp     eax, [ecx+CRandomMersenneA.RLimit]
+        ja      S520        
+        mov     eax, [esp+8]                               ; min
+        add     eax, edx
+        pop     edi
+        ret     
+        
+S530:   jl      S540
+        ; max = min. Return min
+        mov     eax, edx
+        pop     edi
+        ret                                                ; max = min exit
+        
+S540:   ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     edi
+        ret     
+;_MersenneIRandomX ENDP
+
+
+;  extern "C" unsigned int MersenneBRandom();              // Output random bits
+_MersenneBRandom: ; PROC NEAR                              ; output 32 random bits
+        GetMersenneInstanceAddress
+        jmp     MersBRandom_reg                            ; random bits
+;_MersenneBRandom ENDP
+
+
+; -----------------------------------------------------------------
+;  Single-threaded DLL versions for Mersenne Twister, Windows only
+; -----------------------------------------------------------------
+%IFDEF WINDOWS
+
+;  extern "C" void __stdcall MersenneRandomInitByArrayD(unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersenneRandomInitByArrayD@8: ; PROC NEAR
+        ; translate __cdecl to __stdcall calling
+        mov     eax, [esp+4]                               ; seeds
+        mov     edx, [esp+8]                               ; length
+        push    edx                                       
+        push    eax
+        call    _MersenneRandomInitByArray
+        pop     ecx
+        pop     ecx
+        ret     8
+;_MersenneRandomInitByArrayD@8 ENDP
+
+
+;  extern "C" void __stdcall MersenneRandomInitD(int seed); // Re-seed
+_MersenneRandomInitD@4: ; PROC NEAR
+        ; remove parameter from stack
+        pop     edx                                        ; return address
+        pop     eax                                        ; seed
+        push    edx                                        ; put return address back in        
+        mov     ecx, MersenneInstance
+        ; eax = seed, ecx = Pthis
+        jmp     MersRandomInit_reg                         ; jump to function in mersenne32.asm
+;_MersenneRandomInitD@4 ENDP
+
+
+;  extern "C" double __stdcall MersenneRandomD(); // Output random float
+_MersenneRandomD@0: ; PROC NEAR                            ; generate random float with 32 bits resolution
+        mov     ecx, MersenneInstance
+        mov     edx, [ecx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        fld     qword [ecx+edx*2+CRandomMersenneA.PreFlt]  ; fetch premade floating point random number
+        fsub    qword [ecx+CRandomMersenneA.one]           ; subtract 1.0
+        jmp     MersBRandom_reg                            ; random bits
+;_MersenneRandomD@0 ENDP
+
+
+;  extern "C" int __stdcall MersenneIRandomD (int min, int max); // Output random integer
+_MersenneIRandomD@8: ; PROC   NEAR
+        mov     ecx, MersenneInstance
+        call    MersBRandom_reg                            ; random bits
+        mov     edx, [esp+8]                               ; max
+        mov     ecx, [esp+4]                               ; min
+        sub     edx, ecx
+        js      short S710                                 ; max < min
+        add     edx, 1                                     ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret     8
+S710:   mov     eax, 80000000H                             ; error exit
+        ret     8
+;_MersenneIRandomD@8 ENDP
+
+
+;  extern "C" int __stdcall MersenneIRandomXD(int min, int max); // Output random integer, exact
+
+_MersenneIRandomXD@8: ; PROC   NEAR
+        push    edi
+        mov     ecx, MersenneInstance
+        mov     edx, [esp+8]                               ; min
+        mov     edi, [esp+12]                              ; max
+        sub     edi, edx                                   ; max - min
+        jle     short S830                                 ; max <= min (signed)
+        inc     edi                                        ; interval = max - min + 1
+        cmp     edi, [ecx+CRandomMersenneA.LastInterval]
+        je      S810
+        xor     eax, eax                                   ; 0
+        lea     edx, [eax+1]                               ; 1
+        div     edi                                        ; (would give overflow if interval = 1)
+        mul     edi
+        dec     eax
+        mov     [ecx+CRandomMersenneA.RLimit], eax
+        mov     [ecx+CRandomMersenneA.LastInterval], edi
+S810:
+S820:   call    MersBRandom_reg                            ; random bits (ecx is preserved)
+        mul     edi
+        cmp     eax, [ecx+CRandomMersenneA.RLimit]
+        ja      S820        
+        mov     eax, [esp+8]                               ; min
+        add     eax, edx
+        pop     edi
+        ret     8
+        
+S830:   jl      S840
+        ; max = min. Return min
+        mov     eax, edx
+        pop     edi
+        ret     8                                          ; max = min exit
+        
+S840:   ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     edi
+        ret     8
+;_MersenneIRandomXD@8 ENDP
+
+
+;  extern "C" unsigned int __stdcall MersenneBRandomD();   // Output random bits
+_MersenneBRandomD@0: ; PROC NEAR                           ; output 32 random bits
+        mov     ecx, MersenneInstance
+        jmp     MersBRandom_reg                            ; random bits
+;_MersenneBRandomD@0 ENDP
+
+%ENDIF ; WINDOWS
diff --git a/asmlibSrc/mersenne64.asm b/asmlibSrc/mersenne64.asm
new file mode 100755
index 0000000..f510e7c
--- /dev/null
+++ b/asmlibSrc/mersenne64.asm
@@ -0,0 +1,614 @@
+; ----------------------------- MERSENNE64.ASM ---------------------------
+; Author:           Agner Fog
+; Date created:     1998
+; Last modified:    2013-09-13
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+;
+;  This random number generator is described in the article by
+;  M. Matsumoto & T. Nishimura, in:
+;  ACM Transactions on Modeling and Computer Simulation,
+;  vol. 8, no. 1, 1998, pp. 3-30. See also:
+;  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+;  Initialization:
+;  MersRandomInit must be called before the first call to any of the other
+;  random number functions. The seed is any 32-bit integer.
+;  You may use MersRandomInitByArray instead if you want more
+;  than 32 bits for seed. length is the number of integers in seeds[].
+;  length must be > 0, there is no upper limit for length.
+;
+;  Generating random numbers:
+;  MersRandom returns a floating point number in the interval 0 <= x < 1 with
+;  a resolution of 32 bits.
+;  MersIRandom returns an integer in the interval defined by min and max with
+;  a resolution of 32 bits.
+;  MersIRandomX returns an integer in the interval defined by min and max with
+;  exactly equal probabilities of all values in the interval.
+;  MersBRandom returns 32 random bits.
+;
+;  Error conditions:
+;  If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+;  and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+;  MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+;  C++ prototypes in randoma.h:
+;  Thread-safe versions:
+;  extern "C" void   MersRandomInit(void * Pthis, int seed);         // Re-seed
+;  extern "C" void   MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+;  extern "C" int    MersIRandom (void * Pthis, int min, int max);   // Output random integer
+;  extern "C" int    MersIRandomX(void * Pthis, int min, int max);   // Output random integer, exact
+;  extern "C" double MersRandom(void * Pthis);                       // Output random float
+;  extern "C" unsigned int MersBRandom(void * Pthis);                // Output random bits
+;
+;  Single-threaded versions:
+;  extern "C" void   MersenneRandomInit(int seed);                   // Re-seed
+;  extern "C" void   MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+;  extern "C" int    MersenneIRandom (int min, int max);             // Output random integer
+;  extern "C" int    MersenneIRandomX(int min, int max);             // Output random integer, exact
+;  extern "C" double MersenneRandom();                               // Output random float
+;  extern "C" unsigned int MersenneBRandom();                        // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
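+; Example (illustrative sketch, not part of the original source): typical use
+; of the single-threaded interface from C++, assuming randoma.h is on the
+; include path and asmlib is linked:
+;
+;   #include "randoma.h"
+;   #include <cstdio>
+;   int main() {
+;       MersenneRandomInit(1234);               // seed with a 32-bit integer
+;       double x = MersenneRandom();            // uniform in [0,1)
+;       int d = MersenneIRandomX(1, 6);         // exact uniform integer in [1,6]
+;       unsigned int b = MersenneBRandom();     // 32 random bits
+;       std::printf("%f %d %u\n", x, d, b);
+;   }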
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
+global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
+global MersenneBRandom, MersenneBRandomD, MersBRandom
+global MersenneRandom, MersenneRandomD, MersRandom
+global MersenneIRandom, MersenneIRandomD, MersIRandom
+global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MersenneInstance: ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE  ALIGN=16
+
+MersenneRandomInit: ; PROC
+%IFDEF UNIX
+        mov     edx, edi                                   ; seed
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     ?Windows_MersRandomInit
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitD:                                       ; alias
+        mov     edx, ecx                                   ; seed
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        ;jmp     ?Windows_MersRandomInit
+%ENDIF
+;MersenneRandomInit ENDP
+
+        
+; Thread-safe version:
+;  extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+MersRandomInit: ;   PROC
+%IFDEF UNIX
+        ; translate calling convention
+        mov     edx, esi                                   ; seed
+        mov     rcx, rdi                                   ; Pthis
+%ENDIF
+        ; parameters: rcx = Pthis, edx = seed
+        and     rcx, -16                                   ; align buffer
+        ?Windows_MersRandomInit:
+        call    Mers_init0                                 ; initialize mt buffer with seeds
+        
+        ; Number of premade numbers that are lost in the initialization when the  
+        ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3        
+   PREMADELOST equ (MERS_N & 3)
+%ELSE
+   PREMADELOST equ 4
+%ENDIF
+        ; We want the C++ and the assembly implementation to give exactly the same
+        ; sequence. The C++ version discards 37 random numbers after initialization.
+        ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+        ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
+        ; SSE2 is supported, otherwise 37 + 1.
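+        ; (Worked example, assuming the usual constants in randomah.asi:
+        ; MT19937 has MERS_N = 624, divisible by 4, so PREMADELOST = 4 and
+        ; 37+4+1 = 42 numbers are discarded; MT11213A has MERS_N = 351, so
+        ; PREMADELOST = MERS_N & 3 = 3 and 41 numbers are discarded.)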
+        
+        push    rbx
+        mov     ebx, 37+PREMADELOST+1
+        ; CMP     dword [rcx+CRandomMersenneA.Instset], 4  ; can we use XMM registers and SSE2 ?
+        ; jae     M110
+        ; sub     ebx, PREMADELOST                         ; SSE2 not supported
+        ; mov     dword [rcx+CRandomMersenneA.PreInx], 0   ; reset index to premade list
+M110:   ; loop
+M120:   call    ?Windows_MersBRandom
+        dec     ebx
+        jnz     M120
+        pop     rbx
+        ret
+;MersRandomInit ENDP
+        
+
+Mers_init0:                                                ; make random seeds from eax and put them into MT buffer
+; Input parameters: 
+; rcx points to CRandomMersenneA
+; edx: seed
+; rcx unchanged by procedure
+
+        push    rdi
+        ; clear my buffer
+        push    rcx
+        mov     rdi, rcx                                   ; Pthis
+        add     rdi, 16
+        mov     ecx, (MersenneSize - 16) / 4
+        xor     eax, eax
+        cld
+        rep     stosd
+        pop     rcx                                        ; Pthis
+        mov     edi, edx                                   ; seed
+        
+        ; initialize CRandomMersenneA structure
+        mov     dword [rcx+CRandomMersenneA.PreInx], 4*4
+        mov     dword [rcx+CRandomMersenneA.Instset], 4
+        mov     eax, MERS_B
+        mov     [rcx+CRandomMersenneA.TMB], eax
+        mov     [rcx+CRandomMersenneA.TMB+4], eax
+        mov     [rcx+CRandomMersenneA.TMB+8], eax
+        mov     [rcx+CRandomMersenneA.TMB+12], eax
+        mov     eax, MERS_C
+        mov     [rcx+CRandomMersenneA.TMC], eax
+        mov     [rcx+CRandomMersenneA.TMC+4], eax
+        mov     [rcx+CRandomMersenneA.TMC+8], eax
+        mov     [rcx+CRandomMersenneA.TMC+12], eax
+        mov     eax, 3FF00000H                             ; upper dword of 1.0, double precision
+        mov     [rcx+CRandomMersenneA.one+4], eax
+        mov     [rcx+CRandomMersenneA.one+12], eax        
+        mov     dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
+        mov     dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
+        mov     dword [rcx+CRandomMersenneA.MATA],  MERS_A
+
+        ; put random numbers into MT buffer
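+        ; (same seeding recurrence as the C++ version:
+        ;  mt[i] = 1812433253 * (mt[i-1] ^ (mt[i-1] >> 30)) + i)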
+        xor     eax, eax
+M210:   mov     [rcx+rax*4+CRandomMersenneA.MT], edi
+        mov     edx, edi
+        shr     edi, 30
+        xor     edi, edx
+        imul    edi, 1812433253
+        inc     eax
+        add     edi, eax
+        cmp     eax, MERS_N
+        jb      M210
+        
+        ; Set index MTI to end of list, (scaled by 4)
+        ; Round up to multiple of 4 to avoid alignment error
+        mov     dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
+        
+        pop     rdi
+        ret      
+
+
+; Single threaded version:
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
+
+MersenneRandomInitByArray: ; PROC                          ; entry for Linux call
+%IFDEF UNIX
+        mov     r8d, esi                                   ; length
+        mov     rdx, rdi                                   ; seeds
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     ?Windows_MersRandomInitByArray
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitByArrayD: ; LABEL NEAR                   ; alias
+        mov     r8d, edx                                   ; length
+        mov     rdx, rcx                                   ; seeds
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     ?Windows_MersRandomInitByArray
+%ENDIF        
+;MersenneRandomInitByArray ENDP       
+
+; Thread-safe version:
+; extern "C" int MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
+MersRandomInitByArray: ; PROC
+%IFDEF UNIX
+        ; translate calling convention
+        mov     r8d, edx                                   ; length
+        mov     rdx, rsi                                   ; seeds
+        mov     rcx, rdi                                   ; Pthis
+%ENDIF
+        
+?Windows_MersRandomInitByArray:
+; parameters: rcx = Pthis, rdx = seeds, r8d = length
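+
+; C equivalent of the two loops below (a sketch following Matsumoto's
+; init_by_array; the asm additionally treats length <= 0 as an error):
+;   i = 1; j = 0;
+;   for (k = max(MERS_N, length); k; k--) {
+;       mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525)) + seeds[j] + j;
+;       if (++i >= MERS_N) { mt[0] = mt[MERS_N-1]; i = 1; }
+;       if (++j >= length) j = 0;
+;   }
+;   for (k = MERS_N - 1; k; k--) {
+;       mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941)) - i;
+;       if (++i >= MERS_N) { mt[0] = mt[MERS_N-1]; i = 1; }
+;   }
+;   mt[0] = 0x80000000;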
+
+        and     rcx, -16                                   ; align buffer
+        push    rbx
+        push    rsi
+        push    rdi
+        push    rbp
+        mov     rbx, rdx                                   ; seeds
+        mov     ebp, r8d                                   ; length
+        
+        mov     edx, 19650218
+        call    Mers_init0                                 ; init0(19650218); (rcx unchanged)
+        
+        mov     r8d, ebp                                   ; r8d = length, ebp = k
+        test    ebp, ebp
+        jle     M380                                       ; error: length <= 0
+        xor     edi, edi                                   ; j = 0
+        lea     esi, [rdi+1]                               ; i = 1
+        cmp     ebp, MERS_N
+        ja      M310
+        mov     ebp, MERS_N                                ; k = max (MERS_N,length)
+M310:
+
+        ; for (; k; k--) {
+M320:   mov     eax, [rcx+rsi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
+        imul    eax, 1664525                               ; * 1664525
+        xor     eax, [rcx+rsi*4+CRandomMersenneA.MT]       ; ^ mt[i]
+        add     eax, [rbx+rdi*4]                           ; + seeds[j]
+        add     eax, edi                                   ; + j
+        mov     [rcx+rsi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
+        inc     esi                                        ; i++
+        inc     edi                                        ; j++
+        cmp     esi, MERS_N
+        jb      M330                                       ; if (i>=MERS_N)
+        mov     eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+        mov     [rcx+CRandomMersenneA.MT], eax
+        mov     esi, 1                                     ; i=1;
+M330:
+        cmp     edi, r8d                                   ; length
+        jb      M340                                       ; if (j>=length)
+        xor     edi, edi                                   ; j = 0;
+M340:
+        dec     ebp                                        ; k--
+        jnz     M320                                       ; first k loop
+M350:
+        mov     ebp, MERS_N-1                              ; k
+M360:   mov     eax, [rcx+rsi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
+        imul    eax, 1566083941                            ; * 1566083941
+        xor     eax, [rcx+rsi*4+CRandomMersenneA.MT]       ; ^ mt[i]
+        sub     eax, esi                                   ; - i
+        mov     [rcx+rsi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
+        inc     esi                                        ; i++
+        cmp     esi, MERS_N
+        jb      M370                                       ; if (i>=MERS_N)
+        mov     eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+        mov     [rcx+CRandomMersenneA.MT], eax
+        mov     esi, 1                                     ; i=1;
+M370:
+        dec     ebp                                        ; k--
+        jnz     M360                                       ; second k loop
+        mov     dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+        mov     dword [rcx+CRandomMersenneA.MTI], 0
+        mov     dword [rcx+CRandomMersenneA.PreInx], 0
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+        mov     edi, MERS_N + PREMADELOST+1
+M391:   call    ?Windows_MersBRandom
+        dec     edi
+        jnz     M391
+
+        pop     rbp                                        ; restore registers
+        pop     rdi
+        pop     rsi
+        pop     rbx
+        ret
+;MersRandomInitByArray ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+
+MersenneBRandom: ; PROC                                    ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneBRandomD: ; LABEL NEAR                             ; alias
+%ENDIF
+        lea     rcx, [MersenneInstance]                    ; Point to instance
+        jmp     ?Windows_MersBRandom
+;MersenneBRandom ENDP       
+
+; Thread-safe version:
+; extern "C" unsigned int MersBRandom(void * Pthis);       // Output random bits
+
+MersBRandom: ; PROC
+%IFDEF UNIX
+        mov     rcx, rdi                                   ; translate calling convention
+%ENDIF
+
+?Windows_MersBRandom: ; LABEL NEAR                         ; Label used internally
+        and     rcx, -16                                   ; align buffer
+        mov     edx, [rcx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        mov     eax, [rcx+rdx*1+CRandomMersenneA.PreInt]   ; fetch premade random number
+        add     edx, 4
+        mov     [rcx+CRandomMersenneA.PreInx], edx
+        cmp     edx, 4*4
+        jnb     M410
+        ret                                                ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+        mov     edx, [rcx+CRandomMersenneA.MTI]            ; fetch 4 numbers from MT buffer
+        movdqa  xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
+        
+%IF TEMPERING                                              ; optional tempering algorithm
+        movdqa  xmm1, xmm0
+        psrld   xmm0, MERS_U
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        pslld   xmm0, MERS_S
+        pand    xmm0, oword [rcx+CRandomMersenneA.TMB]
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        pslld   xmm0, MERS_T
+        pand    xmm0, oword [rcx+CRandomMersenneA.TMC]
+        pxor    xmm0, xmm1
+        movdqa  xmm1, xmm0        
+        psrld   xmm0, MERS_L
+        pxor    xmm0, xmm1
+%ENDIF   ; tempering
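+        ; (the xmm sequence above is the vectorized form of the scalar
+        ; Mersenne Twister tempering:
+        ;  y ^= y >> MERS_U;
+        ;  y ^= (y << MERS_S) & MERS_B;
+        ;  y ^= (y << MERS_T) & MERS_C;
+        ;  y ^= y >> MERS_L; )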
+
+        ; save four premade integers
+        movdqa  oword [rcx+CRandomMersenneA.PreInt], xmm0
+        ; premake four floating point numbers
+        pxor    xmm1, xmm1
+        pxor    xmm2, xmm2
+        punpckldq xmm1, xmm0                               ; get first two numbers into bits 32-63 and 96-127
+        punpckhdq xmm2, xmm0                               ; get next  two numbers into bits 32-63 and 96-127
+        psrlq   xmm1, 12                                   ; get bits into mantissa position
+        psrlq   xmm2, 12                                   ; get bits into mantissa position
+        por     xmm1,oword[rcx+CRandomMersenneA.one]       ; set exponent for interval [1,2)
+        por     xmm2,oword[rcx+CRandomMersenneA.one]       ; set exponent for interval [1,2)
+        movdqa  oword [rcx+CRandomMersenneA.PreFlt], xmm1  ; store two premade numbers
+        movdqa  oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers        
+        mov     dword [rcx+CRandomMersenneA.PreInx], 0     ; index to premade numbers 
+        add     edx, 4*4                                   ; increment MTI index into MT buffer by 4
+        mov     [rcx+CRandomMersenneA.MTI], edx
+        cmp     edx, MERS_N*4
+        jae     M420
+        ret                                                ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420:                                                      ; eax is the random number to return
+%IF     MERS_N & 3                                         ; if MERS_N is not divisible by 4
+        NVALID equ MERS_N & 3                              ; only NVALID of the 4 premade numbers are valid
+        ; Move premade numbers (4-NVALID) positions forward
+        movdqa  xmm0, [rcx+CRandomMersenneA.PreInt]
+        movdqa  xmm1, [rcx+CRandomMersenneA.PreFlt]
+        movdqa  xmm2, [rcx+CRandomMersenneA.PreFlt+16]
+        movdqu  [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+        movdqu  [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3        
+        movq    [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2
+%ENDIF        
+        ; save index to first valid premade number
+        mov     dword [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+        
+; MT buffer is empty. Fill it up
+        push    rbx
+        movd    xmm3, [rcx+CRandomMersenneA.UMASK]         ; load constants
+        movd    xmm4, [rcx+CRandomMersenneA.LMASK]
+        movd    xmm5, [rcx+CRandomMersenneA.MATA]
+        pshufd  xmm3, xmm3, 0                              ; broadcast constants
+        pshufd  xmm4, xmm4, 0
+        pshufd  xmm5, xmm5, 0
+        xor     rbx,  rbx                                  ; kk = 0
+        mov     edx,  MERS_M*4                             ; km
+        
+; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+        add     rcx, CRandomMersenneA.MT
+
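+; Scalar equivalent of one element of the vectorized loop below, which steps
+; the standard MT recurrence four elements at a time:
+;   y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);
+;   mt[kk] = mt[kk+MERS_M] ^ (y >> 1) ^ ((y & 1) ? MERS_A : 0);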
+M430:   ; kk loop
+        movdqa  xmm2, [rcx+rbx]                            ; mt[kk]
+        movd    xmm0, dword [rcx+rbx+16]
+        movdqa  xmm1, [rcx+rbx]                            ; mt[kk]        
+        movss   xmm2, xmm0                                 ; faster than movdqu xmm2,[]
+        pshufd  xmm2, xmm2, 00111001B                      ; mt[kk+1]
+        movdqu  xmm0, oword [rcx+rdx]                      ; mt[km]        
+        ;movq   xmm0, qword [rcx+rdx]                      ; mt[km]
+        ;movhps xmm0, qword [rcx+rdx+8]                    ; faster than movdqu on older processors        
+        pand    xmm1, xmm3                                 ; mt[kk] & UPPER_MASK
+        pand    xmm2, xmm4                                 ; mt[kk+1] & LOWER_MASK
+        por     xmm1, xmm2                                 ; y        
+        movdqa  xmm2, xmm1                                 ; y
+        pslld   xmm1, 31                                   ; copy bit 0 into all bits
+        psrad   xmm1, 31                                   ; -(y & 1)
+        pand    xmm1, xmm5                                 ; & MERS_A
+        psrld   xmm2, 1                                    ; y >> 1
+        pxor    xmm0, xmm1
+        pxor    xmm0, xmm2
+        movdqa  [rcx+rbx], xmm0                            ; result into mt[kk]
+        cmp     ebx, (MERS_N-4)*4
+        jae     M440                                       ; exit loop when kk past end of buffer
+        add     ebx, 16                                    ; kk += 4
+        add     rdx, 16                                    ; km += 4 (signed)
+        cmp     edx, (MERS_N-4)*4
+        jbe     M430                                       ; skip unless km wraparound
+        sub     rdx, MERS_N*4                              ; km wraparound (signed)
+        movdqu  xmm0, [rcx+(MERS_N-4)*4]                   ; copy end to before begin for km wraparound
+        movdqa  [rcx-4*4], xmm0        
+        movdqa  xmm0, [rcx]                                ; copy begin to after end for kk wraparound
+        movdqu  [rcx+MERS_N*4], xmm0
+        jmp     M430
+
+M440:   ; loop finished. discard excess part of last result
+
+; change ecx back to pointing to CRandomMersenneA
+        sub     rcx, CRandomMersenneA.MT        
+
+        mov     dword [rcx+CRandomMersenneA.MTI], 0
+        pop     rbx
+        ret                                                ; random number is still in eax
+        
+;MersBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneRandom();  // Get floating point random number
+
+MersenneRandom: ; PROC                                     ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneRandomD:                                           ; alias
+        lea     rcx, [MersenneInstance]                    ; Point to instance
+        ; continue in next function
+%ENDIF
+%IFDEF UNIX
+        lea     rdi, [MersenneInstance]                    ; Point to instance
+        ; continue in next function
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MersRandom(void * Pthis);  // Get floating point random number
+MersRandom: 
+%IFDEF UNIX
+        mov     rcx, rdi                                   ; translate calling convention
+%ENDIF
+        mov     edx, [rcx+CRandomMersenneA.PreInx]         ; index into premade numbers
+        movsd   xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt]  ; fetch premade floating point random number
+        subsd   xmm0, [rcx+CRandomMersenneA.one]           ; subtract 1.0
+        movsd   [rcx+CRandomMersenneA.TmpFlt], xmm0        ; store random number
+        call    ?Windows_MersBRandom                       ; prepare next random number
+        movsd   xmm0, [rcx+CRandomMersenneA.TmpFlt]        ; recall random number
+        ret        
+;MersenneRandom ENDP       
+
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandom(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandom: ; PROC 
+%IFDEF UNIX
+        push    rsi                                        ; max
+        push    rdi                                        ; min
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     MersIRandom_max_min_on_stack
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomD:                                          ; Alias
+        push    rdx                                        ; max
+        push    rcx                                        ; min
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     MersIRandom_max_min_on_stack
+%ENDIF
+;MersenneIRandom ENDP       
+
+; Thread-safe version:
+; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandom: ; PROC
+%IFDEF UNIX
+        ; translate calling convention
+        mov     r8d, edx                                   ; max
+        mov     edx, esi                                   ; min
+        mov     rcx, rdi                                   ; Pthis
+%ENDIF
+        push    r8                                         ; max
+        push    rdx                                        ; min
+MersIRandom_max_min_on_stack:
+        
+        call    ?Windows_MersBRandom                       ; random bits
+        pop     rcx                                        ; min
+        pop     rdx                                        ; max
+        sub     edx, ecx
+        js      short M720                                 ; max < min
+        add     edx, 1                                     ; interval = max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [rdx+rcx]                             ; add min
+        ret
+M720:   mov     eax, 80000000H                             ; error exit
+        ret
+;MersIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandomX: ; PROC
+%IFDEF UNIX
+        mov     r8d, esi                                   ; max
+        mov     edx, edi                                   ; min
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     ?Windows_MersIRandomX
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomXD:                                         ; alias
+        mov     r8d, edx                                   ; max
+        mov     edx, ecx                                   ; min
+        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
+        jmp     ?Windows_MersIRandomX
+%ENDIF
+;MersenneIRandomX ENDP       
+
+; Thread-safe version:
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandomX: ; PROC
+%IFDEF UNIX
+        ; translate calling convention
+        mov     r8d, edx                                   ; max
+        mov     edx, esi                                   ; min
+        mov     rcx, rdi                                   ; Pthis
+%ENDIF
+        
+?Windows_MersIRandomX:
+; parameters: rcx = Pthis, edx = min, r8d = max
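+; Rejection sampling: the product BRandom() * interval is rejected when its low
+; dword (the remainder) exceeds RLimit, where
+;   RLimit = (uint32)(((uint64)1 << 32) / interval) * interval - 1,
+; so every value in [min,max] is returned with exactly equal probability.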
+
+        and     rcx, -16                                   ; align buffer
+        push    rdi
+        mov     edi, r8d                                   ; max
+
+        sub     edi, edx                                   ; max - min
+        jle     short M830                                 ; max <= min (signed)
+        inc     edi                                        ; interval = max - min + 1
+        push    rdx                                        ; save min
+        
+        ; if (interval != LastInterval) {
+        cmp     edi, [rcx+CRandomMersenneA.LastInterval]
+        je      M810
+        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+        xor     eax, eax                                   ; 0
+        lea     edx, [rax+1]                               ; 1
+        div     edi                                        ; (would give overflow if interval = 1)
+        mul     edi
+        dec     eax
+        mov     [rcx+CRandomMersenneA.RLimit], eax
+        mov     [rcx+CRandomMersenneA.LastInterval], edi
+M810:
+M820:   ; do { // Rejection loop
+        call    ?Windows_MersBRandom                       ; random bits (rcx is preserved)
+        ; longran  = (uint64)BRandom() * interval;
+        mul     edi
+        ; } while (remainder > RLimit);
+        cmp     eax, [rcx+CRandomMersenneA.RLimit]
+        ja      M820
+        
+        ; return (int32)iran + min
+        pop     rax                                        ; min
+        add     eax, edx
+        pop     rdi
+        ret
+        
+M830:   jl      M840
+        ; max = min. Return min
+        mov     eax, edx
+        pop     rdi
+        ret                                                ; max = min exit
+        
+M840:   ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     rdi
+        ret
+;MersIRandomX ENDP
diff --git a/asmlibSrc/mother32.asm b/asmlibSrc/mother32.asm
new file mode 100755
index 0000000..af5cf6f
--- /dev/null
+++ b/asmlibSrc/mother32.asm
@@ -0,0 +1,370 @@
+; ----------------------------- MOTHER32.ASM -----------------------------
+; Author:           Agner Fog
+; Date created:     1998
+; Last modified:    2013-09-11
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+; Description:
+;
+; Mother-of-All random number generator by Agner Fog 1998 - 2008
+; 32-bit mode version for 80x86 and compatible microprocessors
+;
+;  This is a multiply-with-carry type of random number generator
+;  invented by George Marsaglia.  The algorithm is:             
+;  S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+;  X[n] = S modulo 2^32
+;  C = floor(S / 2^32) 
+;
+; C++ prototypes:
+;
+; Thread-safe versions:
+; extern "C" void         MotRandomInit(void * Pthis, int seed);      // Initialization
+; extern "C" int          MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double       MotRandom(void * Pthis);                    // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis);                   // Output random bits
+;
+; Single-threaded static link versions
+; extern "C" void         MotherRandomInit(int seed);      // Initialization
+; extern "C" int          MotherIRandom(int min, int max); // Get integer random number in desired interval
+; extern "C" double       MotherRandom();                  // Get floating point random number
+; extern "C" unsigned int MotherBRandom();                 // Output random bits
+;
+; Single-threaded dynamic link versions
+; extern "C" void         __stdcall MotherRandomInitD(int seed);      // Initialization
+; extern "C" int          __stdcall MotherIRandomD(int min, int max); // Get integer random number in desired interval
+; extern "C" double       __stdcall MotherRandomD();                  // Get floating point random number
+; extern "C" unsigned int __stdcall MotherBRandomD();                 // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
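+; Reference sketch of one generator step in C++ (illustrative only; M0..M3 are
+; the four newest history words X[n-1]..X[n-4] and MC is the carry, as in the
+; CRandomMotherA structure):
+;
+;   uint64_t S = 2111111111ull*M3 + 1492ull*M2 + 1776ull*M1 + 5115ull*M0 + MC;
+;   M3 = M2;  M2 = M1;  M1 = M0;
+;   M0 = (uint32_t)S;            // X[n] = S modulo 2^32
+;   MC = (uint32_t)(S >> 32);    // C = floor(S / 2^32)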
+
+global _MotBRandom, _MotRandom, _MotIRandom, _MotRandomInit
+global _MotherRandomInit, _MotherRandom, _MotherIRandom, _MotherBRandom
+%IFDEF WINDOWS
+global _MotherRandomInitD@4, _MotherRandomD@0, _MotherIRandomD@8, _MotherBRandomD@0
+%ENDIF
+
+extern _InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; dummy offset operator
+%define offset
+
+section .data
+align 16
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+; Size of structure
+IEND
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE align=16   ; code segment
+
+; extern "C" unsigned int MotherBRandom(void * Pthis);     // Output random bits
+
+_MotBRandom:  ; PROC NEAR
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align
+MotBRandom_reg:                                            ; Alternative entry for Pthis in ecx
+        
+        ; CPU dispatch:
+        cmp     dword [ecx+CRandomMotherA.Instset], 4
+        jb      MotBRandomGeneric
+
+; SSE2 version        
+        ; ecx = Pthis
+        movdqa  xmm1, oword [ecx+CRandomMotherA.M3]        ; load M3,M2,M1,M0
+        mov     eax,  [ecx+CRandomMotherA.M0]              ; Retrieve previous random number
+        movdqa  xmm2, xmm1                                 ; copy
+        movdqa  xmm3, oword [ecx+CRandomMotherA.MF3]       ; factors
+        psrlq   xmm2, 32                                   ; move M2,M0 down
+        movq    qword [ecx+CRandomMotherA.M4], xmm1        ; M4=M3, M3=M2
+        movhps  qword [ecx+CRandomMotherA.M2], xmm1        ; M2=M1, M1=M0
+        pmuludq xmm1, xmm3                                 ; M3*MF3, M1*MF1
+        psrlq   xmm3, 32                                   ; move MF2,MF0 down
+        pmuludq xmm2, xmm3                                 ; M2*MF2, M0*MF0
+        paddq   xmm1, xmm2                                 ; P2+P3, P0+P1
+        movhlps xmm2, xmm1                                 ; Get high qword
+        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
+        paddq   xmm1, [ecx+CRandomMotherA.MC]              ; +carry
+        movq    qword [ecx+CRandomMotherA.M0], xmm1        ; Store new M0 and carry
+        ; convert to double precision float
+        psllq   xmm1, 32                                   ; Discard carry bits
+        psrlq   xmm1, 12                                   ; Get bits into mantissa position
+        por     xmm1, oword [ecx+CRandomMotherA.one]       ; Add exponent bits to get number in interval [1,2)
+        movq    [ecx+CRandomMotherA.RanP1], xmm1           ; Store floating point number
+        ret
+        
+        
+; Generic version for old processors
+MotBRandomGeneric:                                         ; Generic version for old processors
+        ; ecx = Pthis
+        push    esi
+        push    edi
+        ; recall previous random number
+        push    dword [ecx+CRandomMotherA.M0]
+        ; prepare new random number
+        mov     eax, [ecx+CRandomMotherA.MF3]
+        mul     dword [ecx+CRandomMotherA.M3]              ; x[n-4]
+        mov     esi,eax
+        mov     eax, [ecx+CRandomMotherA.M2]               ; x[n-3]
+        mov     edi,edx
+        mov     [ecx+CRandomMotherA.M3],eax
+        mul     dword [ecx+CRandomMotherA.MF2]
+        add     esi,eax
+        mov     eax, [ecx+CRandomMotherA.M1]               ; x[n-2]
+        adc     edi,edx
+        mov     [ecx+CRandomMotherA.M2],eax
+        mul     dword [ecx+CRandomMotherA.MF1]
+        add     esi,eax
+        mov     eax,[ecx+CRandomMotherA.M0]                ; x[n-1]
+        adc     edi,edx
+        mov     [ecx+CRandomMotherA.M1],eax
+        mul     dword [ecx+CRandomMotherA.MF0]
+        add     eax,esi
+        adc     edx,edi
+        add     eax,[ecx+CRandomMotherA.MC]
+        adc     edx,0
+        ; store next random number and carry
+        mov     [ecx+CRandomMotherA.M0],eax
+        mov     [ecx+CRandomMotherA.MC],edx
+        ; convert to float in case next call needs a float
+        mov     edx, eax
+        shr     eax, 12
+        or      eax, 3ff00000h
+        shl     edx, 20
+        mov     dword [ecx+CRandomMotherA.RanP1+4], eax
+        mov     dword [ecx+CRandomMotherA.RanP1], edx
+        ; retrieve previous random number
+        pop     eax
+        pop     edi
+        pop     esi
+        ret
+;CRandomMotherA ENDP
+
+        
+; extern "C" double MotRandom(void * Pthis);  // Get floating point random number
+_MotRandom: ; PROC NEAR
+
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align
+        ; get previously prepared random number
+        fld     qword [ecx+CRandomMotherA.RanP1]
+        fsub    qword [ecx+CRandomMotherA.one]
+
+        ; make new random number ready for next time
+        call    MotBRandom_reg                             ; random bits
+        ret
+;_MotRandom ENDP
+
+
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+_MotIRandom: ; PROC NEAR                                   ; make random integer in desired interval
+
+        mov     ecx, [esp+4]                               ; Pthis
+        and     ecx, -16                                   ; align
+        call    MotBRandom_reg                             ; make random number
+        mov     edx, [esp+12]                              ; max
+        mov     ecx, [esp+8]                               ; min
+        sub     edx, ecx
+        js      short rerror                               ; max < min
+        inc     edx                                        ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret                                                ; ret 8 if not _cdecl calling
+
+rerror: mov     eax, 80000000h                             ; error exit   
+        ret                                                ; ret 8 if not _cdecl calling
+;_MotIRandom ENDP
+
+
+; extern "C" void MotRandomInit(void * Pthis, int seed);  // Initialization
+_MotRandomInit: ; PROC NEAR
+MotRandomInit@:  ; local alias
+
+        ; clear my buffer
+        push    edi
+        mov     edi, [esp+8]                               ; Pthis
+        and     edi, -16                                   ; align
+        add     edi, 16
+        mov     ecx, (MotherSize - 16) / 4
+        xor     eax, eax
+        cld
+        rep     stosd
+        
+        ; insert constants
+        mov     ecx, [esp+8]                               ; Pthis
+        and     ecx, -16                                   ; align
+        mov     dword [ecx+CRandomMotherA.one+4],3FF00000H ; high dword of 1.0
+        mov     dword [ecx+CRandomMotherA.MF0], 5115       ; factors
+        mov     dword [ecx+CRandomMotherA.MF1], 1776
+        mov     dword [ecx+CRandomMotherA.MF2], 1492
+        mov     dword [ecx+CRandomMotherA.MF3], 2111111111
+        
+        ; get instruction set
+        push    ecx
+        call    _InstructionSet
+        pop     ecx
+        mov     [ecx+CRandomMotherA.Instset], eax
+        
+        ; initialize from seed
+        mov     eax, [esp+12]                              ; seed        
+        ; make random numbers and put them into buffer
+        mov     edx, 29943829
+        imul    eax, edx
+        dec     eax
+        mov     [ecx+CRandomMotherA.M0], eax
+        imul    eax, edx
+        dec     eax
+        mov     [ecx+CRandomMotherA.M1], eax
+        imul    eax, edx
+        dec     eax
+        mov     [ecx+CRandomMotherA.M2], eax
+        imul    eax, edx
+        dec     eax
+        mov     [ecx+CRandomMotherA.M3], eax
+        imul    eax, edx
+        dec     eax
+        mov     [ecx+CRandomMotherA.MC], eax
+
+        ; randomize some more
+        mov     edi, 20                                    ; loop counter
+r90:    call    MotBRandom_reg
+        dec     edi
+        jnz     r90
+        pop     edi
+        ret     0                                          ; ret 4 if not _cdecl calling
+;_MotRandomInit ENDP
+
+
+; ------------------------------------------------------------------
+; Single-threaded static link versions of Mother-of-all generator
+; ------------------------------------------------------------------
+
+%IFDEF POSITIONINDEPENDENT
+; Get ecx = eip for self-relative addressing
+GetThunkECX:
+        mov     ecx, [esp]
+        ret
+        
+; Get address of MotherInstance into ecx, position independent
+; This works only in YASM, not in NASM:
+%macro GetMotherInstanceAddress  0
+        call    GetThunkECX
+        add     ecx, MotherInstance - $
+%endmacro
+
+%ELSE
+
+; Get address of MotherInstance into ecx, position dependent
+; This works only in YASM, not in NASM:
+%macro GetMotherInstanceAddress  0
+        mov     ecx, MotherInstance
+%endmacro
+
+%ENDIF
+
+
+; extern "C" void MotherRandomInit(int seed); // Initialization
+_MotherRandomInit: ; PROC NEAR
+        push    dword [esp+4]                              ; seed
+        GetMotherInstanceAddress
+        push    ecx
+        call    MotRandomInit@
+        pop     ecx
+        pop     ecx
+        ret
+;_MotherRandomInit ENDP
+
+
+; extern "C" double MotherRandom(); // Get floating point random number
+_MotherRandom: ; PROC NEAR
+        GetMotherInstanceAddress
+        fld     qword [ecx+CRandomMotherA.RanP1]
+        fsub    qword [ecx+CRandomMotherA.one]
+        call    MotBRandom_reg                             ; random bits
+        ret
+;_MotherRandom ENDP
+
+
+; extern "C" int MotherIRandom(int min, int max); // Get integer random number in desired interval
+_MotherIRandom: ; PROC  NEAR                               ; make random integer in desired interval
+        GetMotherInstanceAddress
+        call    MotBRandom_reg                             ; make random number
+        mov     edx, [esp+8]                               ; max
+        mov     ecx, [esp+4]                               ; min
+        sub     edx, ecx
+        jl      RR100                                      ; max < min
+        inc     edx                                        ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret                                                ; ret 8 if not _cdecl calling
+        
+RR100:  mov     eax, 80000000H                             ; error exit   
+        ret                                                ; ret 8 if not _cdecl calling
+;_MotherIRandom ENDP
+
+
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+_MotherBRandom: ; PROC NEAR
+        GetMotherInstanceAddress
+        jmp     MotBRandom_reg
+;_MotherBRandom ENDP
+       
+
+; ------------------------------------------------------------------
+; Single-threaded dynamic link versions
+; ------------------------------------------------------------------
+
+%IFDEF WINDOWS
+
+; extern "C" void __stdcall MotherRandomInitD(int seed); // Initialization
+_MotherRandomInitD@4: ; PROC NEAR
+        push    dword [esp+4]                              ; seed
+        push    offset MotherInstance
+        call    MotRandomInit@
+        pop     ecx
+        pop     ecx
+        ret     4
+;_MotherRandomInitD@4 ENDP
+
+
+; extern "C" double __stdcall MotherRandomD(); // Get floating point random number
+_MotherRandomD@0: ; PROC NEAR
+        mov     ecx, offset MotherInstance
+        fld     qword [ecx+CRandomMotherA.RanP1]
+        fsub    qword [ecx+CRandomMotherA.one]
+        call    MotBRandom_reg                             ; random bits
+        ret
+;_MotherRandomD@0 ENDP
+
+
+; extern "C" int __stdcall MotherIRandomD(int min, int max); // Get integer random number in desired interval
+_MotherIRandomD@8: ; PROC NEAR                             ; make random integer in desired interval
+        mov     ecx, offset MotherInstance
+        call    MotBRandom_reg                             ; make random number
+        mov     edx, [esp+8]                               ; max
+        mov     ecx, [esp+4]                               ; min
+        sub     edx, ecx
+        js      RR200                                      ; max < min
+        inc     edx                                        ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret     8
+
+RR200:  mov     eax, 80000000h                             ; error exit   
+        ret     8
+;_MotherIRandomD@8 ENDP
+
+
+; extern "C" unsigned int __stdcall MotherBRandomD(); // Output random bits
+_MotherBRandomD@0: ; PROC NEAR
+        mov     ecx, offset MotherInstance
+        jmp     MotBRandom_reg
+;_MotherBRandomD@0 ENDP
+
+%ENDIF ; WINDOWS      
diff --git a/asmlibSrc/mother64.asm b/asmlibSrc/mother64.asm
new file mode 100755
index 0000000..83b6c50
--- /dev/null
+++ b/asmlibSrc/mother64.asm
@@ -0,0 +1,250 @@
+; ----------------------------- MOTHER64.ASM -----------------------------
+; Author:           Agner Fog
+; Date created:     1998
+; Last modified:    2013-12-15
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Mother-of-All random number generator by Agner Fog
+; 64-bit mode version for x86-64 compatible microprocessors.
+;
+;  This is a multiply-with-carry type of random number generator
+;  invented by George Marsaglia.  The algorithm is:             
+;  S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+;  X[n] = S modulo 2^32
+;  C = floor(S / 2^32) 
+;
+; C++ prototypes:
+; extern "C" void         MotRandomInit(void * Pthis, int seed);      // Initialization
+; extern "C" int          MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double       MotRandom(void * Pthis);                    // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis);                   // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; publics:
+global MotherBRandom, MotBRandom, ?Windows_MotBRandom
+global MotherRandom, MotRandom, MotherIRandom, MotIRandom
+global MotherRandomInit, MotRandomInit
+%IFDEF WINDOWS
+global MotherRandomInitD, MotherRandomD, MotherIRandomD, MotherBRandomD
+%ENDIF
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+IEND
+; Size of structure
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE ALIGN=16   ; code segment
+
+; Single threaded version:
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+
+MotherBRandom: ; PROC                         ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MotherBRandomD:
+%ENDIF
+        lea     rcx, [MotherInstance]         ; Point to instance
+        jmp     ?Windows_MotBRandom
+;MotherBRandom ENDP       
+
+; Thread-safe version:
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+
+MotBRandom: ; PROC 
+%IFDEF UNIX
+        mov     rcx, rdi                    ; translate calling convention
+%ENDIF
+?Windows_MotBRandom:
+        and     rcx, -16                    ; align
+        movdqa  xmm1, oword [rcx+CRandomMotherA.M3]  ; load M3,M2,M1,M0
+        mov     eax,  [rcx+CRandomMotherA.M0]              ; Retrieve previous random number
+        movdqa  xmm2, xmm1                                 ; copy
+        movdqa  xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
+        psrlq   xmm2, 32                                   ; move M2,M0 down
+        movq    qword [rcx+CRandomMotherA.M4], xmm1    ; M4=M3, M3=M2
+        movhps  qword [rcx+CRandomMotherA.M2], xmm1    ; M2=M1, M1=M0
+        pmuludq xmm1, xmm3                                 ; M3*MF3, M1*MF1
+        psrlq   xmm3, 32                                   ; move MF2,MF0 down
+        pmuludq xmm2, xmm3                                 ; M2*MF2, M0*MF0
+        paddq   xmm1, xmm2                                 ; P2+P3, P0+P1
+        movhlps xmm2, xmm1                                 ; Get high qword
+        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
+        paddq   xmm1, oword [rcx+CRandomMotherA.MC]    ; +carry
+        movq    qword [rcx+CRandomMotherA.M0], xmm1    ; Store new M0 and carry
+        ; convert to double precision float
+        psllq   xmm1, 32                                   ; Discard carry bits
+        psrlq   xmm1, 12                                   ; Get bits into mantissa position
+        por     xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
+        movq    [rcx+CRandomMotherA.RanP1], xmm1           ; Store floating point number
+        ret
+        
+;MotBRandom ENDP
+
+        
+; Single threaded version:
+; extern "C" unsigned int MotherRandom();  // Get floating point random number
+
+MotherRandom:
+%IFDEF UNIX
+        lea     rdi, [MotherInstance]         ; Point to instance
+%ENDIF
+%IFDEF WINDOWS
+MotherRandomD:
+        lea     rcx, [MotherInstance]         ; Point to instance
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MotRandom(void * Pthis);  // Get floating point random number
+MotRandom:
+%IFDEF UNIX
+        mov     rcx, rdi                                   ; translate calling convention
+%ENDIF
+        and     rcx, -16                    ; align
+        ; get previously prepared random number
+        movsd   xmm0, [rcx+CRandomMotherA.RanP1]
+        subsd   xmm0, [rcx+CRandomMotherA.one]
+
+        ; make new random number ready for next time
+        call    ?Windows_MotBRandom
+        ret
+;MotherRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherIRandom(int min, int max); // Get integer random number in desired interval
+
+MotherIRandom: ; PROC
+%IFDEF UNIX
+        mov     r8d, esi                    ; max
+        mov     edx, edi                    ; min
+        lea     rcx, [MotherInstance]       ; Pthis = point to instance
+        jmp     ?Windows_MotIRandom
+%ENDIF
+%IFDEF WINDOWS
+MotherIRandomD:
+        mov     r8d, edx                    ; max
+        mov     edx, ecx                    ; min
+        lea     rcx, [MotherInstance]       ; Pthis = point to instance
+        jmp     ?Windows_MotIRandom
+%ENDIF
+; MotherIRandom ENDP       
+
+; Thread-safe version:
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MotIRandom:
+%IFDEF UNIX
+        ; translate calling convention
+        mov     r8d, edx                    ; max
+        mov     edx, esi                    ; min
+        mov     rcx, rdi                    ; Pthis
+%ENDIF
+        
+?Windows_MotIRandom: ;   LABEL NEAR         ; entry for Windows call
+        and     rcx, -16                    ; align
+        push    r8
+        push    rdx
+        call    ?Windows_MotBRandom         ; make random number
+        pop     rcx                         ; min
+        pop     r8                          ; max
+        sub     r8d, ecx
+        js      short rerror                ; max < min
+        inc     r8d                         ; interval = max - min + 1
+        mul     r8d                         ; multiply random number eax by interval and truncate
+        lea     eax, [rdx+rcx]              ; add min to interval*BRandom >> 32
+        ret                                 ; ret 8 if not _cdecl calling
+
+rerror: mov     eax, 80000000h              ; error exit   
+        ret                                 ; ret 8 if not _cdecl calling
+;MotIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherRandomInit(int seed);  // Initialization
+
+MotherRandomInit: ; PROC
+%IFDEF UNIX
+        mov     edx, edi                    ; seed
+        lea     rcx, [MotherInstance]       ; Pthis = point to instance
+        jmp     ?Windows_MotRandomInit
+%ENDIF
+%IFDEF WINDOWS
+MotherRandomInitD:
+        mov     edx, ecx                    ; seed
+        lea     rcx, [MotherInstance]       ; Pthis = point to instance
+        jmp     ?Windows_MotRandomInit
+%ENDIF
+;MotherRandomInit ENDP       
+
+; Thread-safe version:
+; extern "C" void MotRandomInit(void * Pthis, int seed);  // Initialization
+MotRandomInit: ; PROC
+%IFDEF UNIX
+        ; translate calling convention
+        mov     edx, esi                    ; seed
+        mov     rcx, rdi                    ; Pthis
+%ENDIF
+        
+?Windows_MotRandomInit: ;   LABEL NEAR         ; entry for Windows call
+        and     rcx, -16                    ; align
+        ; clear my buffer
+        push    rdi
+        push    rcx
+        mov     rdi, rcx                    ; Pthis
+        add     rdi, 16
+        mov     ecx, (MotherSize - 16) / 4
+        xor     eax, eax
+        cld
+        rep     stosd
+        pop     rcx
+        
+        ; insert constants
+        mov     dword [rcx+CRandomMotherA.one+4], 3FF00000H  ; high dword of 1.0       
+        mov     dword [rcx+CRandomMotherA.MF0], 5115             ; factors
+        mov     dword [rcx+CRandomMotherA.MF1], 1776
+        mov     dword [rcx+CRandomMotherA.MF2], 1492
+        mov     dword [rcx+CRandomMotherA.MF3], 2111111111
+        
+        ; initialize from seed
+        mov     eax, edx                                   ; seed        
+        ; make random numbers and put them into buffer
+        mov     edx, 29943829
+        imul    eax, edx
+        dec     eax
+        mov     [rcx+CRandomMotherA.M0], eax
+        imul    eax, edx
+        dec     eax
+        mov     [rcx+CRandomMotherA.M1], eax
+        imul    eax, edx
+        dec     eax
+        mov     [rcx+CRandomMotherA.M2], eax
+        imul    eax, edx
+        dec     eax
+        mov     [rcx+CRandomMotherA.M3], eax
+        imul    eax, edx
+        dec     eax
+        mov     [rcx+CRandomMotherA.MC], eax
+
+        ; randomize some more
+        mov     edi, 20                                    ; loop counter
+r90:    call    ?Windows_MotBRandom                        ; (rcx and rdi unchanged)
+        dec     edi
+        jnz     r90
+        pop     rdi
+        ret
+;MotRandomInit ENDP
+
+ ;       END
diff --git a/asmlibSrc/physseed32.asm b/asmlibSrc/physseed32.asm
new file mode 100755
index 0000000..304b137
--- /dev/null
+++ b/asmlibSrc/physseed32.asm
@@ -0,0 +1,334 @@
+;*************************  physseed32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2010-08-03
+; Last modified:    2013-09-13
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator 
+; which is available on some processors. 
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[]       An array which will be filled with random numbers
+; int NumSeeds      Indicates the desired number of 32-bit random numbers
+;
+; Return value:     0   Failure. No suitable instruction available (processor older than Pentium)
+;                   1   No physical random number generator. Used time stamp counter instead
+;                   2   Success. VIA physical random number generator used
+;                   3   Success. Intel physical random number generator used
+;                   4   Success. Intel physical seed generator used
+; 
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
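+; Example (illustrative sketch): using PhysicalSeed from C++ to seed another
+; generator, with a check for the no-hardware-source case:
+;
+;   int seeds[2];
+;   int source = PhysicalSeed(seeds, 2);   // 0..4, see return values above
+;   if (source == 0) { /* processor too old; handle error */ }
+;   MersenneRandomInitByArray((unsigned int*)seeds, 2);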
+
+%define NUM_TRIES   20                 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING     0                  ; 1 for test only
+
+global _PhysicalSeed
+
+; Direct entries to CPU-specific versions
+global _PhysicalSeedNone: function
+global _PhysicalSeedRDTSC: function
+global _PhysicalSeedVIA: function
+global _PhysicalSeedRDRand: function
+global _PhysicalSeedRDSeed: function
+
+
+SECTION .text  align=16
+
+_PhysicalSeed:
+
+%IFNDEF POSITIONINDEPENDENT
+
+        jmp     near [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:    ; Address the following memory operand relative to RP1:
+        jmp     near [edx+PhysicalSeedDispatch-RP1]
+
+%ENDIF
+
+_PhysicalSeedRDSeed:
+        push    ebx
+        mov     edx, [esp+8]           ; seeds
+        mov     ecx, [esp+12]          ; NumSeeds
+        jecxz   S300
+        ; do 32 bits at a time
+S100:   mov     ebx, NUM_TRIES
+S110:   ; rdseed eax
+%if     TESTING
+        mov     eax, ecx
+        stc
+%ELSE
+        db 0Fh, 0C7h, 0F8h             ; rdseed eax
+%ENDIF        
+        jc      S120
+        ; failed. try again
+        dec     ebx
+        jz      S900
+        jmp     S110
+S120:   mov     [edx], eax
+        add     edx, 4
+        dec     ecx
+        jnz     S100                   ; loop 32 bits
+S300:   mov     eax, 4                 ; return value
+        pop     ebx
+        ret        
+S900:   ; failure 
+        xor     eax, eax               ; return 0
+        pop     ebx
+        ret
+
+_PhysicalSeedRDRand:
+        push    ebx
+        mov     edx, [esp+8]           ; seeds
+        mov     ecx, [esp+12]          ; NumSeeds
+        jecxz   R300
+        ; do 32 bits at a time
+R100:   mov     ebx, NUM_TRIES
+R110:   ; rdrand eax
+%if     TESTING
+        mov     eax, ecx
+        stc
+%ELSE
+        db 0Fh, 0C7h, 0F0h             ; rdrand eax
+%ENDIF
+        jc      R120
+        ; failed. try again
+        dec     ebx
+        jz      R900
+        jmp     R110
+R120:   mov     [edx], eax
+        add     edx, 4
+        dec     ecx
+        jnz     R100                   ; loop 32 bits
+R300:   mov     eax, 3                 ; return value
+        pop     ebx
+        ret        
+R900:   ; failure 
+        xor     eax, eax               ; return 0
+        pop     ebx
+        ret
+
+
+_PhysicalSeedVIA:
+;       VIA XSTORE  supported
+        push    ebx
+        push    esi
+        push    edi
+        mov     edi, [esp+16]          ; seeds
+        mov     ecx, [esp+20]          ; NumSeeds
+        mov     ebx, ecx
+        and     ecx, -2                ; round down to nearest even
+        jz      T200                   ; NumSeeds <= 1
+        ; make an even number of random dwords
+        shl     ecx, 2                 ; number of bytes (divisible by 8)
+        mov     edx, 3                 ; quality factor
+%if     TESTING
+        mov     eax, 1
+        rep     stosb
+%ELSE        
+        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instruction
+%ENDIF
+T200:
+        test    ebx, 1
+        jz      T300
+        ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+        mov     esi, edi               ; current output pointer
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 8                 ; make temporary space on stack
+        and     esp, -8                ; align by 8
+        mov     edi, esp
+        mov     ecx, 4                 ; Will generate 4 or 8 bytes, depending on CPU
+        mov     edx, 3                 ; quality factor
+%if     TESTING
+        mov     eax, 1
+        rep     stosb
+%ELSE
+        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instruction
+%ENDIF
+        mov     eax, [esp]
+        mov     [esi], eax             ; store the last 4 bytes
+        mov     esp, ebp
+        pop     ebp
+T300:
+        mov     eax, 2                 ; return value
+        pop     edi
+        pop     esi
+        pop     ebx
+        ret
+
+
+_PhysicalSeedRDTSC:
+        push    ebx
+        xor     eax, eax
+        cpuid                          ; serialize
+        rdtsc                          ; get time stamp counter
+        mov     ebx, [esp+8]           ; seeds
+        mov     ecx, [esp+12]          ; NumSeeds
+        test    ecx, ecx
+        jz      U300                   ; zero seeds
+        js      U900                   ; failure
+        mov     [ebx], eax             ; store time stamp counter as seeds[0]
+        add     ebx, 4
+        dec     ecx
+        jz      U300
+        mov     [ebx], edx             ; store upper part of time stamp counter as seeds[1]
+        add     ebx, 4
+        dec     ecx
+        jz      U300
+        xor     eax, eax
+U100:   mov     [ebx], eax             ; store 0 for the rest
+        add     ebx, 4
+        dec     ecx
+        jnz     U100
+U300:   mov     eax, 1                 ; return value        
+        pop     ebx
+        ret
+U900:   ; failure         
+        xor     eax, eax               ; return 0
+        pop     ebx
+        ret
+
+
+_PhysicalSeedNone:                     ; no possible generation
+        mov     edx, [esp+4]           ; seeds
+        mov     ecx, [esp+8]           ; NumSeeds
+        xor     eax, eax
+        jecxz   N200
+N100:   mov     [edx], eax
+        add     edx, 4
+        dec     ecx
+        jnz     N100
+N200:   ret                            ; return 0
+
+
+PhysicalSeedDispatcher:
+        push    ebx
+        pushfd
+        pop     eax
+        btc     eax, 21                ; check if CPUID bit can toggle
+        push    eax
+        popfd
+        pushfd
+        pop     ebx
+        xor     ebx, eax
+        bt      ebx, 21
+        jc      FAILURE                ; CPUID not supported
+
+        xor     eax, eax               ; 0
+        cpuid                          ; get number of CPUID functions
+        test    eax, eax
+        jz      FAILURE                ; function 1 not supported
+
+        ; test if RDSEED supported
+        xor     eax, eax
+        cpuid
+        cmp     eax, 7
+        jb      P200                   ; RDSEED not supported
+        mov     eax, 7
+        xor     ecx, ecx
+        cpuid
+        bt      ebx, 18
+       ; jc      USE_RDSEED             ; not tested yet!!
+
+P200:   ; test if RDRAND supported
+        mov     eax, 1
+        cpuid
+        bt      ecx, 30
+        jc      USE_RDRAND
+
+        ; test if VIA xstore instruction supported
+        mov     eax, 0C0000000H
+        push    eax
+        cpuid
+        pop     ebx
+        cmp     eax, ebx
+        jna     P300                   ; not a VIA processor
+        lea     eax, [ebx+1]
+        cpuid
+        bt      edx, 3
+        jc      VIA_METHOD
+
+P300:   ; test if RDTSC supported
+        mov     eax, 1
+        cpuid
+        bt      edx, 4
+        jc      USE_RDTSC              ; XSTORE instruction not supported or not enabled
+        
+FAILURE: ; No useful instruction supported
+        mov     edx, _PhysicalSeedNone
+        jmp     P800
+
+USE_RDRAND:     ; Use RDRAND instruction        
+        mov     edx, _PhysicalSeedRDRand
+        jmp     P800
+
+USE_RDSEED:     ; Use RDSEED instruction (not tested yet)
+        mov     edx, _PhysicalSeedRDSeed
+        jmp     P800
+
+VIA_METHOD:     ; Use VIA xstore instructions   
+        mov     edx, _PhysicalSeedVIA
+        jmp     P800
+        
+USE_RDTSC:
+        mov     edx, _PhysicalSeedRDTSC
+        ;jmp     P800
+        
+P800:   mov     [PhysicalSeedDispatch], edx
+        pop     ebx
+        jmp     edx                    ; continue in dispatched version
+        
+        
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF        
+        
+
+; -----------------------------------------------------------------
+;  DLL version, Windows only
+; -----------------------------------------------------------------
+%IFDEF WINDOWS
+
+_PhysicalSeedD@8:
+global _PhysicalSeedD@8
+        ; translate __cdecl to __stdcall calling
+        mov     eax, [esp+4]
+        mov     edx, [esp+8]
+        push    edx                                       
+        push    eax
+        call    _PhysicalSeed
+        pop     ecx
+        pop     ecx
+        ret     8
+
+%ENDIF ; WINDOWS
+
+
+; -----------------------------------------------------------------
+;  Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch  DD PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
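
For reference, a caller's view of the function above. This is a minimal C++
sketch (a hypothetical test program; nothing is assumed beyond the prototype
and return codes documented in the file header):

    #include <cstdio>
    extern "C" int PhysicalSeed(int seeds[], int NumSeeds);

    int main() {
        int seeds[2] = {0, 0};
        int source = PhysicalSeed(seeds, 2);
        // Per the header: 0 = no usable instruction, 1 = RDTSC fallback,
        // 2 = VIA xstore, 3 = Intel RDRAND, 4 = Intel RDSEED
        if (source == 0) {
            std::printf("no physical seed source available\n");
            return 1;
        }
        std::printf("source=%d seeds=%08X %08X\n",
                    source, (unsigned)seeds[0], (unsigned)seeds[1]);
        return 0;
    }
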
diff --git a/asmlibSrc/physseed64.asm b/asmlibSrc/physseed64.asm
new file mode 100755
index 0000000..7dcecf4
--- /dev/null
+++ b/asmlibSrc/physseed64.asm
@@ -0,0 +1,394 @@
+;*************************  physseed64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2010-08-03
+; Last modified:    2013-09-13
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator 
+; which is available on some processors. 
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[]       An array which will be filled with random numbers
+; int NumSeeds      Indicates the desired number of 32-bit random numbers
+;
+; Return value:     0   Failure. No suitable instruction available (processor older than Pentium)
+;                   1   No physical random number generator. Used time stamp counter instead
+;                   2   Success. VIA physical random number generator used
+;                   3   Success. Intel physical random number generator used
+;                   4   Success. Intel physical seed generator used
+; 
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+%define NUM_TRIES   20                 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING     0                  ; 1 for test only
+
+global PhysicalSeed
+
+; Direct entries to CPU-specific versions
+global PhysicalSeedNone: function
+global PhysicalSeedRDTSC: function
+global PhysicalSeedVIA: function
+global PhysicalSeedRDRand: function
+global PhysicalSeedRDSeed: function
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+ 
+%IFDEF WINDOWS
+  %define par1     rcx
+  %define par2     rdx
+  %define par3     r8
+  %define par1d    ecx
+  %define par2d    edx
+  %define par3d    r8d
+%ENDIF
+  
+%IFDEF UNIX
+  %define par1     rdi
+  %define par2     rsi
+  %define par3     rdx
+  %define par1d    edi
+  %define par2d    esi
+  %define par3d    edx
+%ENDIF 
+
+
+SECTION .text  align=16
+
+%IFDEF WINDOWS
+global PhysicalSeedD@8                 ; DLL version
+PhysicalSeedD@8:
+%ENDIF
+
+PhysicalSeed:
+        jmp     [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+
+PhysicalSeedRDSeed:
+        push    rbx
+        test    par2d, par2d           ; NumSeeds
+        jz      S300 
+        js      S900
+        mov     par3d, par2d           ; NumSeeds
+        shr     par3d, 1
+        jz      S150
+        ; do 64 bits at a time
+S100:   mov     ebx, NUM_TRIES
+S110:   ; rdseed rax
+%if     TESTING
+        mov     eax, par3d
+        stc
+%ELSE
+        db 48h, 0Fh, 0C7h, 0F8h        ; rdseed rax
+%ENDIF
+        jc      S120
+        ; failed. try again
+        dec     ebx
+        jz      S900
+        jmp     S110
+S120:   mov     [par1], rax
+        add     par1, 8
+        dec     par3d
+        jnz     S100                   ; loop 64 bits
+S150:
+        and     par2d, 1
+        jz      S300
+        ; an odd 32 bit remains
+S200:   mov     ebx, NUM_TRIES
+S210:   ; rdseed eax
+%if     TESTING
+        mov     eax, par3d
+        stc
+%ELSE
+        db 0Fh, 0C7h, 0F8h             ; rdseed eax
+%ENDIF
+        jc      S220
+        ; failed. try again
+        dec     ebx
+        jz      S900
+        jmp     S210
+S220:   mov     [par1], eax
+S300:   mov     eax, 4                 ; return value
+        pop     rbx
+        ret
+S900:   ; failure 
+        xor     eax, eax               ; return 0
+        pop     rbx
+        ret
+                
+
+PhysicalSeedRDRand:
+        push    rbx
+        test    par2d, par2d           ; NumSeeds
+        jz      R300
+        js      R900         
+        mov     par3d, par2d           ; NumSeeds
+        shr     par3d, 1               ; NumSeeds/2
+        jz      R150
+        ; do 64 bits at a time
+R100:   mov     ebx, NUM_TRIES
+R110:   ; rdrand rax
+%if     TESTING
+        mov     eax, par3d
+        stc
+%ELSE
+        db 48h, 0Fh, 0C7h, 0F0h        ; rdrand rax
+%ENDIF
+        jc      R120
+        ; failed. try again
+        dec     ebx
+        jz      R900
+        jmp     R110
+R120:   mov     [par1], rax
+        add     par1, 8
+        dec     par3d
+        jnz     R100                   ; loop 64 bits
+R150:
+        and     par2d, 1
+        jz      R300
+        ; an odd 32 bit remains
+R200:   mov     ebx, NUM_TRIES
+R210:   ; rdrand eax
+%if     TESTING
+        mov     eax, par3d
+        stc
+%ELSE
+        db 0Fh, 0C7h, 0F0h             ; rdrand eax
+%ENDIF
+        jc      R220
+        ; failed. try again
+        dec     ebx
+        jz      R900
+        jmp     R210
+R220:   mov     [par1], eax
+R300:   mov     eax, 3                 ; return value (RDRAND used)
+        pop     rbx
+        ret
+R900:   ; failure 
+        xor     eax, eax               ; return 0
+        pop     rbx
+        ret
+
+
+PhysicalSeedVIA:
+;       VIA XSTORE  supported
+        push    rbx
+%IFDEF WINDOWS
+        push    rsi
+        push    rdi
+        mov     rdi, rcx               ; seeds
+        mov     esi, edx               ; NumSeeds
+%ENDIF        
+        mov     ecx, esi               ; NumSeeds
+        and     ecx, -2                ; round down to nearest even
+        jz      T200                   ; NumSeeds <= 1
+        ; make an even number of random dwords
+        shl     ecx, 2                 ; number of bytes (divisible by 8)
+        mov     edx, 3                 ; quality factor
+%if     TESTING
+        mov     eax, 1
+        rep stosb
+%ELSE        
+        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instruction
+%ENDIF
+T200:        
+        test    esi, 1
+        jz      T300
+        ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+        mov     rbx, rdi               ; current output pointer
+        mov     ecx, 4                 ; Will generate 4 or 8 bytes, depending on CPU
+        mov     edx, 3                 ; quality factor
+        push    rcx                    ; make temporary space on stack
+        mov     rdi, rsp               ; point to buffer on stack
+%if     TESTING
+        mov     eax, 1
+        rep stosb
+%ELSE        
+        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instruction
+%ENDIF
+        pop     rax
+        mov     [rbx], eax             ; store the last 4 bytes
+T300:
+        mov     eax, 2                 ; return value        
+%IFDEF WINDOWS
+        pop     rdi
+        pop     rsi
+%ENDIF  
+        pop     rbx      
+        ret        
+
+
+PhysicalSeedRDTSC:
+%IFDEF WINDOWS
+        push    rbx
+        push    rcx
+        push    rdx
+        xor     eax, eax
+        cpuid                          ; serialize
+        rdtsc                          ; get time stamp counter
+        pop     rbx                    ; numseeds
+        pop     rcx                    ; seeds
+        test    ebx, ebx
+        jz      U300                   ; zero seeds
+        js      U900                   ; failure
+        mov     [rcx], eax             ; store time stamp counter as seeds[0]
+        add     rcx, 4
+        dec     ebx
+        jz      U300
+        mov     [rcx], edx             ; store upper part of time stamp counter as seeds[1]
+        add     rcx, 4
+        dec     ebx
+        jz      U300
+        xor     eax, eax
+U100:   mov     [rcx], eax             ; store 0 for the rest
+        add     rcx, 4
+        dec     ebx
+        jnz     U100
+U300:   mov     eax, 1                 ; return value        
+        pop     rbx
+        ret
+U900:   ; failure         
+        xor     eax, eax               ; return 0
+        pop     rbx
+        ret
+        
+%ELSE   ; UNIX
+
+        push    rbx
+        xor     eax, eax
+        cpuid                          ; serialize
+        rdtsc                          ; get time stamp counter
+        test    esi, esi               ; numseeds
+        jz      U300                   ; zero seeds
+        js      U900                   ; failure
+        mov     [rdi], eax             ; store time stamp counter as seeds[0]
+        add     rdi, 4
+        dec     esi
+        jz      U300
+        mov     [rdi], edx             ; store upper part of time stamp counter as seeds[1]
+        add     rdi, 4
+        dec     esi
+        jz      U300
+        xor     eax, eax
+U100:   mov     [rdi], eax             ; store 0 for the rest
+        add     rdi, 4
+        dec     esi
+        jnz     U100
+U300:   mov     eax, 1                 ; return value        
+        pop     rbx
+        ret
+U900:   ; failure         
+        xor     eax, eax               ; return 0
+        pop     rbx
+        ret 
+
+%ENDIF  
+
+
+PhysicalSeedNone:                      ; no possible generation
+        xor     eax, eax
+        test    par2d, par2d           ; numseeds
+        jz      N200
+N100:   mov     [par1], eax
+        add     par1, 4
+        dec     par2d
+        jnz     N100
+N200:   ret                            ; return 0
+
+
+PhysicalSeedDispatcher:
+        push    rbx
+%IFDEF WINDOWS
+        push    rcx
+        push    rdx
+%ENDIF
+        ; test if RDSEED supported
+        xor     eax, eax
+        cpuid
+        cmp     eax, 7
+        jb      P200                   ; RDSEED not supported
+        mov     eax, 7
+        xor     ecx, ecx
+        cpuid
+        bt      ebx, 18
+       ; jc      USE_RDSEED             ; not tested yet!!
+
+P200:   ; test if RDRAND supported
+        mov     eax, 1
+        cpuid
+        bt      ecx, 30
+        jc      USE_RDRAND
+
+        ; test if VIA xstore instruction supported
+        mov     eax, 0C0000000H
+        push    rax
+        cpuid
+        pop     rbx
+        cmp     eax, ebx
+        jna     P300                   ; not a VIA processor
+        lea     eax, [rbx+1]
+        cpuid
+        bt      edx, 3
+        jc      VIA_METHOD
+
+P300:   ; test if RDTSC supported
+        mov     eax, 1
+        cpuid
+        bt      edx, 4
+        jc      USE_RDTSC              ; XSTORE instruction not supported or not enabled
+        
+FAILURE: ; No useful instruction supported
+        lea     rax, [PhysicalSeedNone]
+        jmp     P800
+
+USE_RDRAND:     ; Use RDRAND instruction        
+        lea     rax, [PhysicalSeedRDRand]
+        jmp     P800
+
+USE_RDSEED:     ; Use RDSEED instruction (not tested yet)
+        lea     rax, [PhysicalSeedRDSeed]
+        jmp     P800
+
+VIA_METHOD:     ; Use VIA xstore instructions   
+        lea     rax, [PhysicalSeedVIA]
+        jmp     P800
+        
+USE_RDTSC:
+        lea     rax, [PhysicalSeedRDTSC]
+        ;jmp     P800
+        
+P800:   mov     [PhysicalSeedDispatch], rax
+%IFDEF WINDOWS
+        pop     rdx
+        pop     rcx
+%ENDIF
+        pop     rbx
+        jmp     rax                    ; continue in dispatched version
+        
+
+; -----------------------------------------------------------------
+;  Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch  DQ PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
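
The NUM_TRIES loops above implement the architectural retry protocol for
RDRAND/RDSEED: the carry flag signals success, and a failed read is simply
retried. A hedged C++ equivalent of the same idea, using the GCC/Clang/ICC
intrinsic (assumes <immintrin.h> and compilation with RDRAND enabled,
e.g. -mrdrnd; everything except _rdrand32_step is illustrative):

    #include <immintrin.h>   // _rdrand32_step

    // Retry loop mirroring the asm: the intrinsic returns 1 on success
    // (carry flag set) and 0 on failure, in which case we try again.
    bool rdrand_retry(unsigned int* out, int tries = 20) {  // 20 = NUM_TRIES
        while (tries-- > 0) {
            if (_rdrand32_step(out)) return true;   // success
        }
        return false;   // give up, as the asm does after NUM_TRIES failures
    }
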
diff --git a/asmlibSrc/popcount32.asm b/asmlibSrc/popcount32.asm
new file mode 100755
index 0000000..29b137d
--- /dev/null
+++ b/asmlibSrc/popcount32.asm
@@ -0,0 +1,137 @@
+;*************************  popcount32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2011-08-21
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_popcount: function
+
+; Direct entries to CPU-specific versions
+global _popcountGeneric: function
+global _popcountSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               popcount function
+;******************************************************************************
+
+
+_A_popcount: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+
+; Address the following memory operand relative to RP1:
+        jmp     near [edx+popcountDispatch-RP1]
+
+%ENDIF
+
+align 16
+_popcountSSE42: ; SSE4.2 version
+        popcnt  eax, dword [esp+4]
+        ret
+
+
+;******************************************************************************
+;                               popcount function generic
+;******************************************************************************
+
+_popcountGeneric: ; Generic version
+        mov     eax, [esp+4]           ; x
+        mov     edx, eax
+        shr     eax, 1
+        and     eax, 55555555h         ; odd bits in eax, even bits in edx
+        and     edx, 55555555h
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 2
+        and     eax, 33333333h
+        and     edx, 33333333h
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 4
+        add     eax, edx
+        and     eax, 0F0F0F0Fh
+        mov     edx, eax
+        shr     eax, 8
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 16
+        add     eax, edx
+        and     eax, 03FH
+        ret
+;_popcountGeneric end
+
+; ********************************************************************************
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+popcountCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version
+        mov     ecx, _popcountGeneric
+        cmp     eax, 9                ; check popcnt supported
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of popcount
+        mov     ecx, _popcountSSE42
+Q100:   mov     [popcountDispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP10:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_popcountGeneric-RP10]
+        cmp     eax, 9                ; check popcnt supported
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of popcount
+        lea     ecx, [edx+_popcountSSE42-RP10]
+Q100:   mov     [edx+popcountDispatch-RP10], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch  DD popcountCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
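
The generic version above is the classic SWAR population count. The same
steps as a direct C++ transliteration of the register sequence, for
readability:

    // Direct transliteration of _popcountGeneric above.
    unsigned int popcount_generic(unsigned int x) {
        x = (x & 0x55555555u) + ((x >> 1) & 0x55555555u);  // 2-bit sums
        x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // 4-bit sums
        x = (x + (x >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
        x += x >> 8;                                       // fold bytes
        x += x >> 16;
        return x & 0x3Fu;                                  // max 32, fits in 6 bits
    }
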
diff --git a/asmlibSrc/popcount64.asm b/asmlibSrc/popcount64.asm
new file mode 100755
index 0000000..f05e0c9
--- /dev/null
+++ b/asmlibSrc/popcount64.asm
@@ -0,0 +1,110 @@
+;*************************  popcount64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2011-07-20
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global A_popcount: function
+
+; Direct entries to CPU-specific versions
+global popcountGeneric: function
+global popcountSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               popcount function
+;******************************************************************************
+
+
+A_popcount: ; function dispatching
+        jmp     near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+popcountSSE42: ; SSE4.2 version
+%ifdef  WINDOWS
+        popcnt  eax, ecx
+%else
+        popcnt  eax, edi
+%endif        
+        ret
+
+
+;******************************************************************************
+;                               popcount function generic
+;******************************************************************************
+
+popcountGeneric: ; Generic version
+%ifdef  WINDOWS
+        mov     eax, ecx
+%else
+        mov     eax, edi
+%endif        
+        mov     edx, eax
+        shr     eax, 1
+        and     eax, 55555555h         ; odd bits in eax, even bits in edx
+        and     edx, 55555555h
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 2
+        and     eax, 33333333h
+        and     edx, 33333333h
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 4
+        add     eax, edx
+        and     eax, 0F0F0F0Fh
+        mov     edx, eax
+        shr     eax, 8
+        add     eax, edx
+        mov     edx, eax
+        shr     eax, 16
+        add     eax, edx
+        and     eax, 03FH
+        ret
+;popcountGeneric end
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+%ifdef  WINDOWS
+%define par1      rcx                  ; parameter 1, the integer x
+%else
+%define par1      rdi                  ; parameter 1, the integer x
+%endif
+
+popcountCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        call    InstructionSet
+        pop     par1
+        ; Point to generic version of popcount
+        lea     rdx, [popcountGeneric]
+        cmp     eax, 9                ; check popcnt supported
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of popcount
+        lea     rdx, [popcountSSE42]
+Q100:   mov     [popcountDispatch], rdx
+        ; Continue in appropriate version 
+        jmp     rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch  DQ popcountCPUDispatch
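
The dispatch pattern used by both popcount files (and the physseed files
above) keeps a function pointer that initially targets a one-time resolver;
the resolver picks a version, patches the pointer, and delegates. A C++
rendering of the idea (a sketch only: instruction_set() and the two version
functions are stand-ins, not asmlib's C API; __builtin_popcount is
GCC/Clang-specific):

    #include <cstdio>

    using popcount_fn = unsigned int (*)(unsigned int);

    static unsigned int popcount_generic_ver(unsigned int x) {
        unsigned int n = 0;
        while (x) { n += x & 1u; x >>= 1; }   // naive stand-in
        return n;
    }
    static unsigned int popcount_sse42_ver(unsigned int x) {
        return (unsigned int)__builtin_popcount(x);   // stand-in for POPCNT
    }
    static int instruction_set() { return 9; }   // stand-in for InstructionSet

    static unsigned int popcount_resolver(unsigned int x);
    static popcount_fn popcount_dispatch = popcount_resolver;  // resolver first

    static unsigned int popcount_resolver(unsigned int x) {
        // Runs only on the first call: pick a version, patch the pointer,
        // then delegate, exactly like popcountCPUDispatch above.
        popcount_dispatch = (instruction_set() >= 9) ? popcount_sse42_ver
                                                     : popcount_generic_ver;
        return popcount_dispatch(x);
    }

    int main() {
        std::printf("%u\n", popcount_dispatch(0xF0F0F0F0u));   // prints 16
        return 0;
    }
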
diff --git a/asmlibSrc/procname32.asm b/asmlibSrc/procname32.asm
new file mode 100755
index 0000000..23f16bf
--- /dev/null
+++ b/asmlibSrc/procname32.asm
@@ -0,0 +1,186 @@
+;                   procname32.asm
+;
+; Author:           Agner Fog
+; Date created:     2007
+; Last modified:    2013-09-11
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+; 
+; Copyright (c) 2007-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _ProcessorName: function
+
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0              ; Static buffer to contain name
+
+
+SECTION .text  align=16
+
+%IFDEF POSITIONINDEPENDENT
+; Local function for reading instruction pointer into edi
+GetThunkEDI:
+        mov     edi, [esp]
+        ret
+%ENDIF  ; End IF POSITIONINDEPENDENT
+
+
+; ********** ProcessorName function **********
+; C++ prototype:
+; extern "C" char * ProcessorName ();
+
+; This function finds the name of the microprocessor. The name is written
+; to a static buffer (NameBuffer) and a pointer to that buffer is returned.
+
+_ProcessorName:
+        push    ebx
+        push    edi
+        
+; Make edi point to NameBuffer:
+        
+%IFDEF POSITIONINDEPENDENT
+        ; Position-independent code. Get edi = eip for reference point
+        call    GetThunkEDI
+        add     edi, NameBuffer - $
+%ELSE
+        ; Normal code requiring base relocation:
+        mov     edi, NameBuffer        
+%ENDIF
+        
+; detect if CPUID instruction supported by microprocessor:
+        pushfd
+        pop     eax
+        xor     eax, 1 << 21           ; Check if CPUID bit can toggle
+        push    eax
+        popfd
+        pushfd
+        pop     ebx
+        xor     eax, ebx
+        and     eax, 1 << 21
+        jnz     NOID                   ; CPUID not supported
+        xor     eax, eax
+        cpuid                          ; Get number of CPUID functions
+        test    eax, eax
+        jnz     IDENTIFYABLE           ; Function 1 supported
+        
+NOID:
+        ; processor has no CPUID
+        mov     DWORD [edi], '8038'    ; Write text '80386 or 80486'
+        mov     DWORD [edi+4], '6 or'
+        mov     DWORD [edi+8], ' 804'
+        mov     DWORD [edi+12], '86'   ; End with 0
+        jmp     PNEND
+        
+IDENTIFYABLE:
+        mov     eax, 80000000H
+        cpuid
+        cmp     eax, 80000004H         ; Test if extended vendor string available
+        jb      no_ext_vendor_string
+
+        ; Has extended vendor string
+        mov     eax, 80000002H
+        cpuid
+        mov     [edi], eax             ; Store 16 bytes of extended vendor string
+        mov     [edi+4], ebx
+        mov     [edi+8], ecx
+        mov     [edi+0CH], edx
+        mov     eax, 80000003H
+        cpuid
+        mov     [edi+10H], eax         ; Next 16 bytes
+        mov     [edi+14H], ebx
+        mov     [edi+18H], ecx
+        mov     [edi+1CH], edx
+        mov     eax, 80000004H
+        cpuid
+        mov     [edi+20H], eax         ; Next 16 bytes
+        mov     [edi+24H], ebx
+        mov     [edi+28H], ecx
+        mov     [edi+2CH], edx
+        jmp     get_family_and_model
+        
+no_ext_vendor_string:
+        ; No extended vendor string. Get short vendor string
+        xor     eax, eax
+        cpuid
+        mov     [edi],ebx              ; Store short vendor string
+        mov     [edi+4],edx
+        mov     [edi+8],ecx
+        mov     byte [edi+12],0        ; Terminate string
+        
+get_family_and_model:
+        push    edi                    ; Save string address
+        xor     eax, eax
+        mov     ecx, 30H
+        cld
+        repne   scasb                  ; Find end of text
+        dec     edi
+        mov     dword [edi], ' Fam'    ; Append text " Family "
+        mov     dword [edi+4], 'ily '
+        add     edi, 8
+
+        mov     eax, 1
+        cpuid                          ; Get family and model
+        mov     ebx, eax
+        mov     ecx, eax
+        shr     eax, 8
+        and     eax, 0FH               ; Family
+        shr     ecx, 20
+        and     ecx, 0FFH              ; Extended family
+        add     eax, ecx               ; Family + extended family
+        call    WriteHex               ; Write as hexadecimal
+
+        mov     dword [edi], 'H Mo'    ; Write text "H Model "
+        mov     dword [edi+4], 'del '
+        add     edi, 8
+        
+        mov     eax, ebx
+        shr     eax, 4
+        and     eax, 0FH               ; Model
+        mov     ecx, ebx
+        shr     ecx, 12
+        and     ecx, 0F0H              ; Extended model
+        or      eax, ecx               ; Model | extended model
+        call    WriteHex               ; Write as hexadecimal
+
+        mov     dword [edi], 'H'       ; Write text "H"
+        pop     edi                    ; Restore string address
+        
+PNEND:  ; finished
+        mov     eax, edi               ; Pointer to result
+        pop     edi
+        pop     ebx
+        ret
+;_ProcessorName ENDP
+
+WriteHex:                              ; Local function: Write 2 hexadecimal digits
+        ; Parameters: AL = number to write, EDI = text destination
+        mov     ecx, eax
+        shr     ecx, 4
+        and     ecx, 0FH               ; most significant digit first
+        cmp     ecx, 10
+        jnb     W1
+        ; 0 - 9
+        add     ecx, '0'
+        jmp     W2
+W1:     ; A - F
+        add     ecx, 'A' - 10
+W2:     mov     [edi], cl              ; write digit
+                
+        mov     ecx, eax
+        and     ecx, 0FH               ; next digit
+        cmp     ecx, 10
+        jnb     W3
+        ; 0 - 9
+        add     ecx, '0'
+        jmp     W4
+W3:     ; A - F
+        add     ecx, 'A' - 10
+W4:     mov     [edi+1], cl            ; write digit
+        add     edi, 2                 ; advance string pointer
+        ret
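
The family/model arithmetic above follows the CPUID leaf 1 encoding:
displayed family = base family + extended family, and displayed model =
base model with the extended model shifted in above it. The same bit
slicing in C++ (eax stands for the hypothetical result of CPUID leaf 1):

    // Same decoding as the asm above, given eax from CPUID function 1.
    unsigned int family(unsigned int eax) {
        return ((eax >> 8) & 0x0Fu)     // base family, bits 11:8
             + ((eax >> 20) & 0xFFu);   // extended family, bits 27:20
    }
    unsigned int model(unsigned int eax) {
        return ((eax >> 4) & 0x0Fu)     // base model, bits 7:4
             | ((eax >> 12) & 0xF0u);   // extended model, bits 19:16, pre-shifted
    }
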
diff --git a/asmlibSrc/procname64.asm b/asmlibSrc/procname64.asm
new file mode 100755
index 0000000..e65c384
--- /dev/null
+++ b/asmlibSrc/procname64.asm
@@ -0,0 +1,143 @@
+;                   procname64.asm 
+;
+; Author:           Agner Fog
+; Date created:     2007
+; Last modified:    2011-07-02
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+; 
+; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global  ProcessorName: function
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0              ; Static buffer to contain name
+
+
+SECTION .text  align=16
+
+; ********** ProcessorName function **********
+; C++ prototype:
+; extern "C" char * ProcessorName ();
+
+; This function finds the name of the microprocessor. The name is written
+; to a static buffer (NameBuffer) and a pointer to that buffer is returned.
+
+ProcessorName:
+        push    rbx
+        push    rdi
+        lea     rdi, [NameBuffer]      ; text pointer
+        
+        mov     eax, 80000000H
+        cpuid
+        cmp     eax, 80000004H         ; test if extended vendor string available
+        jb      no_ext_vendor_string
+
+        ; Has extended vendor string
+        mov     eax, 80000002H
+        cpuid
+        mov     [rdi], eax             ; store 16 bytes of extended vendor string
+        mov     [rdi+4], ebx
+        mov     [rdi+8], ecx
+        mov     [rdi+0CH], edx
+        mov     eax, 80000003H
+        cpuid
+        mov     [rdi+10H], eax         ; next 16 bytes
+        mov     [rdi+14H], ebx
+        mov     [rdi+18H], ecx
+        mov     [rdi+1CH], edx
+        mov     eax, 80000004H
+        cpuid
+        mov     [rdi+20H], eax         ; next 16 bytes
+        mov     [rdi+24H], ebx
+        mov     [rdi+28H], ecx
+        mov     [rdi+2CH], edx
+        jmp     get_family_and_model
+        
+no_ext_vendor_string:
+        ; No extended vendor string. Get short vendor string
+        xor     eax, eax
+        cpuid
+        mov     [rdi],ebx              ; store short vendor string
+        mov     [rdi+4],edx
+        mov     [rdi+8],ecx
+        mov     byte [rdi+12],0    ; terminate string
+        
+get_family_and_model:
+        xor     eax, eax
+        mov     ecx, 30H
+        cld
+        repne   scasb                  ; find end of text
+        dec     rdi
+        
+        mov     dword [rdi], ' Fam'   ; Append text " Family "
+        mov     dword [rdi+4], 'ily '
+        add     rdi, 8
+
+        mov     eax, 1
+        cpuid                          ; Get family and model
+        mov     ebx, eax
+        mov     ecx, eax
+        shr     eax, 8
+        and     eax, 0FH               ; Family
+        shr     ecx, 20
+        and     ecx, 0FFH              ; Extended family
+        add     eax, ecx               ; Family + extended family
+        call    WriteHex               ; Write as hexadecimal
+
+        mov     dword [rdi], 'H Mo' ; Write text "H Model "
+        mov     dword [rdi+4], 'del '
+        add     rdi, 8
+        
+        mov     eax, ebx
+        shr     eax, 4
+        and     eax, 0FH               ; Model
+        mov     ecx, ebx
+        shr     ecx, 12
+        and     ecx, 0F0H              ; Extended model
+        or      eax, ecx               ; Model | extended model
+        call    WriteHex               ; Write as hexadecimal
+
+        mov     dword [rdi], 'H'       ; Write text "H"
+        
+PNEND:  ; finished
+        lea     rax, [NameBuffer]      ; Pointer to result
+        pop     rdi
+        pop     rbx
+        ret
+;ProcessorName ENDP
+
+WriteHex:                              ; Local function: Write 2 hexadecimal digits
+        ; Parameters: AL = number to write, RDI = text destination
+        mov     ecx, eax
+        shr     ecx, 4
+        and     ecx, 0FH               ; most significant digit first
+        cmp     ecx, 10
+        jnb     W1
+        ; 0 - 9
+        add     ecx, '0'
+        jmp     W2
+W1:     ; A - F
+        add     ecx, 'A' - 10
+W2:     mov     [rdi], cl              ; write digit
+                
+        mov     ecx, eax
+        and     ecx, 0FH               ; next digit
+        cmp     ecx, 10
+        jnb     W3
+        ; 0 - 9
+        add     ecx, '0'
+        jmp     W4
+W3:     ; A - F
+        add     ecx, 'A' - 10
+W4:     mov     [rdi+1], cl            ; write digit
+        add     rdi, 2                 ; advance string pointer
+        ret
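
Caller's view, for completeness (a minimal sketch; the returned pointer
addresses the static NameBuffer, so the string must not be freed):

    #include <cstdio>
    extern "C" char * ProcessorName();

    int main() {
        std::printf("%s\n", ProcessorName());  // e.g. "... Family 6H Model 2AH"
        return 0;
    }
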
diff --git a/asmlibSrc/randomah.asi b/asmlibSrc/randomah.asi
new file mode 100755
index 0000000..ed7a018
--- /dev/null
+++ b/asmlibSrc/randomah.asi
@@ -0,0 +1,290 @@
+; ----------------------------- RANDOMAH.ASI ---------------------------
+;
+;  Author:           Agner Fog
+;  Date created:     1998
+;  Last modified:    2013-09-09
+;  Description:
+;  Assembly include file containing
+;  structure/class definitions for random number generators
+;
+; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Definitions for Mersenne Twister:
+
+TEMPERING EQU 1              ; set to 0 if no tempering (improves speed by 25%)
+
+%if 0
+; define constants for MT11213A:
+MERS_N    EQU 351
+MERS_M    EQU 175
+MERS_R    EQU 19
+MERS_A    EQU 0E4BD75F5H
+MERS_U    EQU 11
+MERS_S    EQU 7
+MERS_T    EQU 15
+MERS_L    EQU 17
+MERS_B    EQU 655E5280H
+MERS_C    EQU 0FFD58000H
+
+%ELSE
+; or constants for MT19937:
+MERS_N    EQU 624
+MERS_M    EQU 397
+MERS_R    EQU 31
+MERS_A    EQU 09908B0DFH
+MERS_U    EQU 11
+MERS_S    EQU 7
+MERS_T    EQU 15
+MERS_L    EQU 18
+MERS_B    EQU 9D2C5680H
+MERS_C    EQU 0EFC60000H
+
+%ENDIF
+
+LOWER_MASK EQU (1 << MERS_R) - 1             ; lower MERS_R bits
+UPPER_MASK EQU -1 << MERS_R                  ; upper 32-MERS_R bits
+
+; Define class CRandomMersenneA member data
+; Must be aligned by 16.
+
+STRUC CRandomMersenneA
+.Fill1    RESD      4        ; Alignment filler
+.PreInt:  RESD      4        ; premade tempered integer numbers, ready to use
+.PreFlt:  RESQ      4        ; premade floating point numbers, ready to use (subtract 1.0)
+          RESQ      1        ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
+.TmpFlt:  RESQ      1        ; temporary storage of floating point random number
+.PreInx:  RESD      1        ; index to next PreInt and PreFlt number
+.Instset: RESD      1        ; Instruction set
+.LastInterval: RESD 1        ; Last interval length for IRandomX
+.RLimit:  RESD      1        ; Rejection limit used by IRandomX
+.TMB:     RESD      4        ; 4 copies of MERS_B constant
+.TMC:     RESD      4        ; 4 copies of MERS_C constant
+.one:     RESQ      2        ; 2 copies of 1.0 constant
+.MTI:     RESD      1        ; index into MT buffer
+.UMASK:   RESD      1        ; UPPER_MASK
+.LMASK:   RESD      1        ; LOWER_MASK             ; constants
+.MATA:    RESD      1        ; MERS_A
+.wrap1:   RESD      4        ; MT buffer km wraparound
+.MT:      RESD      MERS_N   ; MT history buffer (aligned by 16)
+.wrap2:   RESD      4        ; MT buffer kk wraparound
+%if MERS_N & 3
+         ; MERS_N not divisible by 4. align by 4
+          RESD      (4 - (MERS_N & 3))
+%ENDIF        
+endstruc ; CRandomMersenneA
+
+
+; Definitions for Mother-of-all generator:
+
+; Define class CRandomMotherA member data
+; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
+STRUC   CRandomMotherA 
+.Fill2   RESD      4         ; Alignment filler
+.one     RESQ      1         ; 1.0
+.Instset RESD      1         ; Instruction set
+.M4      RESD      1         ; x[n-4]
+.M3      RESD      1         ; x[n-3] (aligned)
+.M2      RESD      1         ; x[n-2]
+.M1      RESD      1         ; x[n-1]
+.M0      RESD      1         ; x[n]
+.MC      RESD      1         ; Carry (aligned)
+.zero    RESD      1         ; Zero-extension of carry
+.RanP1   RESQ      1         ; Double random number in interval [1,2)
+.MF3     RESD      1         ; 2111111111 (aligned)
+.MF2     RESD      1         ; 1492
+.MF1     RESD      1         ; 1776
+.MF0     RESD      1         ; 5115
+endstruc ; CRandomMotherA
+
+MOTHERF0 EQU 5115            ; factor 0
+MOTHERF1 EQU 1776            ; factor 1
+MOTHERF2 EQU 1492            ; factor 2
+MOTHERF3 EQU 2111111111      ; factor 3
+
+
+; ***************************************************************************
+; Definitions for SFMT generator
+; ***************************************************************************
+
+; Choose Mersenne exponent.
+; Higher values give longer cycle length and use more memory:
+; MEXP equ    607
+; MEXP equ   1279
+; MEXP equ   2281
+; MEXP equ   4253
+  MEXP equ  11213
+; MEXP equ  19937
+; MEXP equ  44497
+
+%if MEXP == 44497
+SFMT_N      equ  348         ; Size of state vector
+SFMT_M      equ  330         ; Position of intermediate feedback
+SFMT_SL1    equ    5         ; Left shift of W[N-1], 32-bit words
+SFMT_SL2    equ    3         ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1    equ    9         ; Right shift of W[M], 32-bit words
+SFMT_SR2    equ	   3         ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ  0effffffbH  ;first DWORD of AND mask
+; AND mask:
+%define SFMT_MASK   0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
+; Period certification vector
+%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
+
+%elif MEXP == 19937
+SFMT_N      equ  156         ; Size of state vector
+SFMT_M      equ  122         ; Position of intermediate feedback
+SFMT_SL1    equ   18         ; Left shift of W[N-1], 32-bit words
+SFMT_SL2    equ    1         ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1    equ   11         ; Right shift of W[M], 32-bit words
+SFMT_SR2    equ	   1         ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ  0dfffffefH  ;first DWORD of AND mask
+%define SFMT_MASK   0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
+%define SFMT_PARITY 1,0,0,013c9e684H
+
+%elif MEXP == 11213
+SFMT_N      equ  88          ; Size of state vector
+SFMT_M      equ  68          ; Position of intermediate feedback
+SFMT_SL1	equ  14          ; Left shift of W[N-1], 32-bit words
+SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1	equ   7          ; Right shift of W[M], 32-bit words
+SFMT_SR2	equ   3          ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ  0effff7fbH  ;first DWORD of AND mask
+%define SFMT_MASK	0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
+%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
+
+%elif MEXP == 4253
+SFMT_N      equ  34          ; Size of state vector
+SFMT_M      equ  17          ; Position of intermediate feedback
+SFMT_SL1	equ  20          ; Left shift of W[N-1], 32-bit words
+SFMT_SL2	equ  1           ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1	equ  7           ; Right shift of W[M], 32-bit words
+SFMT_SR2	equ  1           ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ  09f7bffffH  ;first DWORD of AND mask
+%define SFMT_MASK	09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
+%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
+
+%elif MEXP == 2281
+SFMT_N      equ  18          ; Size of state vector
+SFMT_M      equ  12          ; Position of intermediate feedback
+SFMT_SL1	equ  19          ; Left shift of W[N-1], 32-bit words
+SFMT_SL2	equ   1          ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1	equ   5          ; Right shift of W[M], 32-bit words
+SFMT_SR2	equ   1          ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ   0bff7ffbfH ;first DWORD of AND mask
+%define SFMT_MASK	0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
+%define SFMT_PARITY 1,0,0,041dfa600H
+
+%elif MEXP == 1279
+SFMT_N      equ  10          ; Size of state vector
+SFMT_M      equ   7          ; Position of intermediate feedback
+SFMT_SL1	equ  14          ; Left shift of W[N-1], 32-bit words
+SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1	equ   5          ; Right shift of W[M], 32-bit words
+SFMT_SR2	equ   1          ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ   0f7fefffdH ;first DWORD of AND mask
+%define SFMT_MASK	0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
+%define SFMT_PARITY 1,0,0,020000000H
+
+%elif MEXP == 607
+SFMT_N      equ   5          ; Size of state vector
+SFMT_M      equ   2          ; Position of intermediate feedback
+SFMT_SL1	equ  15          ; Left shift of W[N-1], 32-bit words
+SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1	equ  13          ; Right shift of W[M], 32-bit words
+SFMT_SR2	equ   3          ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1  equ   0fdff37ffH ;first DWORD of AND mask
+%define SFMT_MASK	0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
+%define SFMT_PARITY 1,0,0,05986f054H
+
+%ELSE
+%error MEXP must have one of the predefined values
+%ENDIF
+
+STRUC CRandomSFMTA 
+.Fill3         RESD     4    ; Alignment filler
+
+; Parameters for Mother-Of-All generator:
+.M3:           RESD     1    ; x[n-3] (aligned)
+               RESD     1    ; unused filler to fit the pmuludq instruction
+.M2:           RESD     1    ; x[n-2]
+               RESD     1    ; unused filler to fit the pmuludq instruction
+.M1:           RESD     1    ; x[n-1]
+               RESD     1    ; unused filler to fit the pmuludq instruction
+.M0:           RESD     1    ; x[n]
+.MC:           RESD     1    ; Carry (zero-extends into one)
+.one:          RESQ     1    ; 1.0 (low dword = zero-extension of carry) (aligned)
+.TempRan:      RESQ     1    ; Temporary random number
+.MF3:          RESD     1    ; 2111111111 (aligned)
+.Instset:      RESD     1    ; Instruction set
+.MF2:          RESD     1    ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
+               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF1:          RESD     1    ; 1776
+               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF0:          RESD     1    ; 5115
+               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
+
+; Parameters for IRandomX:
+.LASTINTERVAL: RESD     1    ; Last interval length for IRandomX
+.RLIMIT:       RESD     1    ; Rejection limit used by IRandomX
+
+; Parameters for SFMT generator:
+.USEMOTHER:    RESD     1    ; 1 if combine with Mother-Of-All generator
+.IX:           RESD     1    ; Index into state buffer for SFMT
+
+.AMASK:        RESD     4    ; AND mask (aligned)
+.STATE:        RESD SFMT_N*4 ; State vector (aligned)
+endstruc ; CRandomSFMTA 
+
+
+; Load offset of TARGET into ecx. Use position-independent method if necessary
+%macro LOADOFFSET2ECX 1
+%IFNDEF  POSITIONINDEPENDENT
+        mov     ecx, %1
+%ELSE
+        ; get position-independent address of TARGET
+        call    get_thunk_ecx
+        add ecx, %1 - $
+%ENDIF
+%endmacro
+
+; Load offset of TARGET into edi. Use position-independent method if necessary
+%macro LOADOFFSET2EDI 1
+%IFNDEF  POSITIONINDEPENDENT
+        mov     edi, %1
+%ELSE
+        ; get position-independent address of TARGET
+        call    get_thunk_edi
+        add edi, %1 - $
+%ENDIF
+%endmacro
+
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+ 
+%IFDEF WINDOWS
+  %define par1     rcx
+  %define par2     rdx
+  %define par3     r8
+  %define par4     r9
+  %define par5     qword [rsp+32+8]   ; stack offset including shadow space
+  %define par1d    ecx
+  %define par2d    edx
+  %define par3d    r8d
+  %define par4d    r9d
+  %define par5d    dword [rsp+32+8]
+%ENDIF
+  
+%IFDEF UNIX
+  %define par1     rdi
+  %define par2     rsi
+  %define par3     rdx
+  %define par4     rcx
+  %define par5     r8
+  %define par1d    edi
+  %define par2d    esi
+  %define par3d    edx
+  %define par4d    ecx
+  %define par5d    r8d
+%ENDIF 
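
For orientation, the recurrence behind the MOTHERF0..MOTHERF3 factors above
is Marsaglia's Mother-Of-All generator: x[n] = (5115*x[n-1] + 1776*x[n-2]
+ 1492*x[n-3] + 2111111111*x[n-4] + carry) mod 2^32, with the high 32 bits
of the sum becoming the new carry. A scalar C++ sketch (the asm versions
vectorize the four multiplies with pmuludq, as the struct comments note):

    #include <cstdint>

    // One Mother-Of-All step. State: x[0]=x[n-1] ... x[3]=x[n-4], c = carry.
    uint32_t mother_step(uint32_t x[4], uint32_t &c) {
        uint64_t sum = 5115ull       * x[0]
                     + 1776ull       * x[1]
                     + 1492ull       * x[2]
                     + 2111111111ull * x[3]
                     + c;
        x[3] = x[2]; x[2] = x[1]; x[1] = x[0];   // shift history
        c    = (uint32_t)(sum >> 32);            // new carry
        x[0] = (uint32_t)sum;                    // new x[n]
        return x[0];
    }
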
diff --git a/asmlibSrc/rdtsc32.asm b/asmlibSrc/rdtsc32.asm
new file mode 100755
index 0000000..1ada795
--- /dev/null
+++ b/asmlibSrc/rdtsc32.asm
@@ -0,0 +1,51 @@
+;          RDTSC32.ASM
+;
+; Author:           Agner Fog
+; Date created:     2003
+; Last modified:    2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _ReadTSC: function
+
+SECTION .text  align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" int ReadTSC (void);
+; or:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts 
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; Rdtsc before and after the code to measure and calculate the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2:   730
+; Pentium 4:  700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver as the one
+; included with www.agner.org/optimize/testp.zip.
+
+_ReadTSC:
+        push    ebx                    ; ebx is modified by cpuid
+        sub     eax, eax               ; 0
+        cpuid                          ; serialize
+        rdtsc                          ; read time stamp counter
+        push    eax
+        push    edx
+        sub     eax, eax
+        cpuid                ; serialize
+        pop     edx
+        pop     eax
+        pop     ebx
+        ret
+;_ReadTSC ENDP
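
Typical use, as the comments suggest: read the counter before and after the
code under test and take the difference. A minimal C++ sketch (using the
64-bit prototype form; the measured count includes the ReadTSC overhead
listed above):

    #include <cstdio>
    extern "C" long long ReadTSC();

    int main() {
        long long t0 = ReadTSC();
        volatile int sink = 0;
        for (int i = 0; i < 1000; ++i) sink += i;   // code under measurement
        long long t1 = ReadTSC();
        std::printf("clocks: %lld\n", t1 - t0);
        return 0;
    }
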
diff --git a/asmlibSrc/rdtsc64.asm b/asmlibSrc/rdtsc64.asm
new file mode 100755
index 0000000..cfa30a4
--- /dev/null
+++ b/asmlibSrc/rdtsc64.asm
@@ -0,0 +1,51 @@
+;          RDTSC64.ASM
+;
+; Author:           Agner Fog
+; Date created:     2003
+; Last modified:    2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global  ReadTSC: function
+
+SECTION .text  align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts 
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; ReadTSC before and after the code to be measured, and take the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2:   730
+; Pentium 4:  700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver as the one
+; included with www.agner.org/optimize/testp.zip.
+
+ReadTSC:
+        push    rbx                    ; rbx is modified by cpuid
+        sub     eax, eax               ; 0
+        cpuid                          ; serialize
+        rdtsc                          ; read time stamp counter into edx:eax
+        shl     rdx, 32
+        or      rax, rdx               ; combine into 64 bit register        
+        push    rax
+        sub     eax, eax
+        cpuid                          ; serialize
+        pop     rax                    ; return value
+        pop     rbx
+        ret
+;ReadTSC ENDP
diff --git a/asmlibSrc/round32.asm b/asmlibSrc/round32.asm
new file mode 100755
index 0000000..eaa3cd4
--- /dev/null
+++ b/asmlibSrc/round32.asm
@@ -0,0 +1,41 @@
+;          ROUND32.ASM
+
+; Author:           Agner Fog
+; Date created:     2003
+; Last modified:    2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _RoundD: function
+global _RoundF: function
+
+SECTION .text  align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float  x);
+
+; This function converts a single or double precision floating point number 
+; to an integer, rounding to nearest (ties to even). Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
+
+_RoundD:
+        fld     qword [esp+4]          ; Load x
+        push    eax                    ; Make temporary space on stack
+        fistp   dword [esp]            ; Round. Store in temporary stack space
+        pop     eax                    ; Read from temporary stack space
+        ret
+;_RoundD  ENDP
+
+_RoundF:
+        fld     dword [esp+4]
+        push    eax
+        fistp   dword [esp]
+        pop     eax
+        ret
+;_RoundF ENDP
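
The difference from the default C++ conversion shows at the halfway cases:
FISTP rounds to nearest with ties to even, while a cast truncates toward
zero. A small sketch (assumes the prototype above is linked in):

    #include <cstdio>
    extern "C" int RoundD(double x);

    int main() {
        std::printf("%d %d\n", RoundD(1.5), (int)1.5);  // 2 1
        std::printf("%d %d\n", RoundD(2.5), (int)2.5);  // 2 2 (ties to even)
        std::printf("%d %d\n", RoundD(1.9), (int)1.9);  // 2 1
        return 0;
    }
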
diff --git a/asmlibSrc/round64.asm b/asmlibSrc/round64.asm
new file mode 100755
index 0000000..826d4d9
--- /dev/null
+++ b/asmlibSrc/round64.asm
@@ -0,0 +1,38 @@
+;          ROUND64.ASM 
+
+; Author:           Agner Fog
+; Date created:     2007-06-15
+; Last modified:    2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global RoundD: function
+global RoundF: function
+
+
+SECTION .text  align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float  x);
+
+; This function converts a single or double precision floating point number 
+; to an integer, rounding to nearest or even. Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
+
+RoundD:
+        cvtsd2si eax, xmm0             ; Round xmm0 to eax
+        ret
+;RoundD  ENDP
+
+RoundF:
+        cvtss2si eax, xmm0             ; Round xmm0 to eax
+        ret
+;RoundF ENDP
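+
+; Note: cvtsd2si and cvtss2si round according to the rounding control in
+; MXCSR, which defaults to round-to-nearest (ties to even), so the results
+; match the 32-bit x87 version. A C++ intrinsic sketch of the same
+; operation (an illustration, assuming SSE2 and <emmintrin.h>):
+;
+;    #include <emmintrin.h>
+;    int RoundD_intrin (double x) {
+;        return _mm_cvtsd_si32(_mm_set_sd(x));   // rounds per MXCSR
+;    }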
diff --git a/asmlibSrc/sfmt32.asm b/asmlibSrc/sfmt32.asm
new file mode 100755
index 0000000..f20140a
--- /dev/null
+++ b/asmlibSrc/sfmt32.asm
@@ -0,0 +1,1265 @@
+; ----------------------------- SFMT32.ASM ---------------------------
+; Author:        Agner Fog
+; Date created:  2008-11-01
+; Last modified: 2013-09-13
+; Source URL:    www.agner.org/optimize
+; Project:       asmlib.zip
+; Language:      assembly, NASM/YASM syntax, 32 bit
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte 
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 32-bit mode version for x86 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global _SFMTRandomInit, _SFMTRandomInitByArray, _SFMTBRandom, _SFMTRandom
+global _SFMTRandomL, _SFMTIRandom, _SFMTIRandomX, _SFMTgenRandomInit
+global _SFMTgenRandomInitByArray, _SFMTgenRandom, _SFMTgenRandomL
+global _SFMTgenIRandom, _SFMTgenIRandomX, _SFMTgenBRandom
+
+%ifdef   WINDOWS
+global _SFMTgenRandomInitD@8, _SFMTgenRandomInitByArrayD@12, _SFMTgenRandomD@0
+global _SFMTgenRandomLD@0, _SFMTgenIRandomD@8, _SFMTgenIRandomXD@8, _SFMTgenIRandomDX@8
+global _SFMTgenBRandomD@0
+%endif
+
+
+extern _InstructionSet
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+
+; Initialization Mask for SFMT:
+InitMask   DD SFMT_MASK
+
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16   ; code segment
+
+; ---------------------------------------------------------------
+;  Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; [esp+4]  = Pthis
+; [esp+8]  = ThisSize
+; [esp+12] = seed
+; [esp+16] = IncludeMother
+
+_SFMTRandomInit:
+        mov     ecx, [esp+4]                               ; Pthis
+        cmp     dword [esp+8], SFMTSize
+        jb      Error                                      ; Error exit if buffer too small
+        push    edi
+
+        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
+        and     ecx, -16
+        xor     eax, eax
+        cmp     dword [esp+16+4], eax                      ; IncludeMother
+        setnz   al                                         ; convert any nonzero value to 1
+        ; Store USEMOTHER
+        mov     [ecx+CRandomSFMTA.USEMOTHER], eax
+        
+        mov     eax, [esp+12+4]                            ; seed
+        xor     edi, edi                                   ; loop counter i
+        jmp     L002                                       ; go into seeding loop
+
+L001:   ; seeding loop for SFMT
+        ; y = factor * (y ^ (y >> 30)) + (++i);
+        call    InitSubf0                                  ; randomization subfunction
+L002:   mov     [ecx+edi*4+CRandomSFMTA.STATE],eax         ; initialize state
+        cmp     edi, SFMT_N*4 - 1
+        jb      L001
+
+        ; Put 5 more values into Mother-Of-All generator
+        call    InitSubf0
+        mov     [ecx+CRandomSFMTA.M0], eax
+        call    InitSubf0
+        mov     [ecx+CRandomSFMTA.M1], eax
+        call    InitSubf0
+        mov     [ecx+CRandomSFMTA.M2], eax
+        call    InitSubf0
+        mov     [ecx+CRandomSFMTA.M3], eax
+        call    InitSubf0
+        mov     [ecx+CRandomSFMTA.MC], eax
+        
+        ; more initialization and period certification
+        call    InitAndPeriod
+        
+        pop     edi
+        ret
+;_SFMTRandomInit ENDP
+        
+Error:                       ; Error exit
+        xor     eax, eax
+        div     eax                                        ; Divide by 0
+        ret
+        
+; Subfunction used by _SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; edi = i
+; output:
+; eax = new y
+; edi = i+1
+; edx modified
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx
+        imul    eax, 1812433253
+        inc     edi
+        add     eax, edi
+        ret
+;InitSubf0 endp 
+       
+; Subfunction used by _SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; edx modified
+        mov     edx, eax
+        shr     eax, 27
+        xor     eax, edx
+        imul    eax, 1664525
+        ret
+;InitSubf1 endp
+
+; Subfunction used by _SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; edx modified
+        mov     edx, eax
+        shr     eax, 27
+        xor     eax, edx
+        imul    eax, 1566083941
+        ret
+;InitSubf2 endp
+
+; Subfunction for initialization and period certification, except seeding
+; ecx = aligned pointer to CRandomSFMTA
+InitAndPeriod: ; private
+        push    ebx
+        push    edi
+        ; initialize constants for Mother-Of-All and SFMT
+        LOADOFFSET2EDI InitMother                          ; edi points to InitMother
+        
+        xor     edx, edx
+L101:  ; Loop fills MF3 - MF0
+        mov     eax, [edi+edx]                             ; load from InitMother
+        mov     [ecx+edx+CRandomSFMTA.MF3], eax
+        add     edx, 4
+        cmp     edx, 32
+        jb      L101
+        xor     edx, edx
+L102:   ; Loop fills AMASK
+        mov     eax, [edi+edx+32]                          ; load from InitMask
+        mov     [ecx+edx+CRandomSFMTA.AMASK], eax
+        add     edx, 4
+        cmp     edx, 4*4
+        jb      L102        
+        
+        ; get instruction set
+        push    ecx
+        call    _InstructionSet
+        pop     ecx
+        mov     [ecx+CRandomSFMTA.Instset], eax
+        xor     eax, eax
+        mov     dword [ecx+CRandomSFMTA.one], eax
+        mov     dword [ecx+4+CRandomSFMTA.one], 3FF00000H        
+        
+        ; Period certification
+        ; Compute parity of STATE[0-4] & InitParity
+        xor     edx, edx               ; parity
+        xor     ebx, ebx               ; loop counter
+L104:   mov     eax, [ecx+ebx*4+CRandomSFMTA.STATE]
+        and     eax, [edi+(InitParity-InitMother)+ebx*4]   ; and InitParity[i]
+        xor     edx, eax
+        inc     ebx
+        cmp     ebx, 4
+        jb      L104
+        
+        ; get parity of edx
+        mov     eax, edx
+        shr     edx, 16
+        xor     eax, edx
+        xor     al, ah
+        jpo     L108                                       ; parity odd: period OK
+        
+        ; parity even: period not OK
+        ; Find a nonzero dword in period certification vector
+        xor     ebx, ebx                                   ; loop counter
+L105:   mov     eax, [edi+(InitParity-InitMother)+ebx*4]   ; InitParity[i]
+        test    eax, eax
+        jnz     L106
+        inc     ebx
+        ; assume that there is a nonzero dword in InitParity
+        jmp     L105                                       ; loop until nonzero found
+        
+L106:     ; find first nonzero bit in eax
+        bsf     edx, eax
+        ; flip the corresponding bit in STATE
+        btc     [ecx+ebx*4+CRandomSFMTA.STATE], edx
+
+L108:   cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        je      L109
+        call    Mother_Next                                ; Make first random number ready
+
+L109:   ; Generate first random numbers and set IX = 0
+        call    SFMT_Generate
+        pop     edi
+        pop     ebx
+        ret
+;InitAndPeriod   endp
+
+
+;  extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+_SFMTRandomInitByArray:
+; Parameters
+; [esp+ 4] = Pthis
+; [esp+ 8] = ThisSize
+; [esp+12] = seeds
+; [esp+16] = NumSeeds
+; [esp+20] = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4                                     ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+   SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+   SFMT_LAG equ  7
+%ELIF SFMT_SIZE >= 39
+   SFMT_LAG equ  5
+%ELSE
+   SFMT_LAG equ  3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+        push    ebx
+        push    esi
+        push    edi
+        push    ebp
+        cmp     dword [esp+8+16], SFMTSize
+        jb      Error                                      ; Error exit if buffer too small
+        mov     ecx, [esp+4+16]                            ; Pthis
+        mov     ebx, [esp+12+16]                           ; seeds
+        mov     ebp, [esp+16+16]                           ; NumSeeds
+
+        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
+        and     ecx, -16        
+        xor     eax, eax
+        cmp     dword [esp+20+16], eax                     ; IncludeMother
+        setnz   al                                         ; convert any nonzero value to 1
+        ; Store USEMOTHER
+        mov     [ecx+CRandomSFMTA.USEMOTHER], eax
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+;    r = factor * (r ^ (r >> 30)) + i;
+;    sta[i] = r;}
+
+        mov     eax, ebp                                   ; r = NumSeeds
+        xor     esi, esi                                   ; i
+L200:   mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx
+        imul    eax, 1812433253
+        add     eax, esi
+        mov     [ecx+esi*4+CRandomSFMTA.STATE], eax
+        inc     esi
+        cmp     esi, SFMT_SIZE
+        jb      L200        
+
+        ; count = max(NumSeeds,size-1)
+        mov     eax, SFMT_SIZE - 1
+        cmp     ebp, eax
+        cmovb   ebp, eax
+        push    ebp                                        ; save count as local variable
+        
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+;    r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+;    sta[(i + mid) % size] += r;
+;    if (j < NumSeeds) r += seeds[j]
+;    r += i;
+;    sta[(i + mid + lag) % size] += r;
+;    sta[i] = r;
+;    i = (i + 1) % size;
+; }
+        xor     edi, edi
+        lea     esi, [edi+1]
+
+        ; ecx = Pthis
+        ; ebx = seeds
+        ; esi = i
+        ; edi = j
+        ; eax = r
+        ; [esp] = count
+        ; [esp+20+16] = NumSeeds
+
+L201:   ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+        mov     eax, [ecx+esi*4+CRandomSFMTA.STATE]        ; sta[i]
+        lea     ebp, [esi+SFMT_MID]
+        cmp     ebp, SFMT_SIZE
+        jb      L202
+        sub     ebp, SFMT_SIZE
+L202:   xor     eax, [ecx+ebp*4+CRandomSFMTA.STATE]        ; sta[(i + mid) % size]
+        lea     edx, [esi+SFMT_SIZE-1]
+        cmp     edx, SFMT_SIZE
+        jb      L203
+        sub     edx, SFMT_SIZE
+L203:   xor     eax, [ecx+edx*4+CRandomSFMTA.STATE]        ; sta[(i + size - 1) % size]
+
+        ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+        call    InitSubf1
+        
+        ; sta[(i + mid) % size] += r;
+        add     [ecx+ebp*4+CRandomSFMTA.STATE], eax
+        
+        ; if (j < NumSeeds) r += seeds[j]
+        cmp     edi, [esp+20+16]
+        jnb     L204
+        add     eax, [ebx+edi*4]        
+L204:
+        ; r += i;
+        add     eax, esi
+        
+        ; sta[(i + mid + lag) % size] += r;
+        lea     edx, [esi+SFMT_MID+SFMT_LAG]
+        cmp     edx, SFMT_SIZE
+        jb      L205
+        sub     edx, SFMT_SIZE
+L205:   add     [ecx+edx*4+CRandomSFMTA.STATE], eax
+        
+        ;sta[i] = r;
+        mov     [ecx+esi*4+CRandomSFMTA.STATE], eax
+        
+        ; i = (i + 1) % size;
+        inc     esi
+        cmp     esi, SFMT_SIZE
+        jb      L206
+        sub     esi, SFMT_SIZE
+L206:
+        ; j++, loop while j < count
+        inc     edi
+        cmp     edi, [esp]
+        jb      L201
+        
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+;   r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+;   sta[(i + mid) % size] ^= r;
+;   r -= i;
+;   sta[(i + mid + lag) % size] ^= r;
+;   sta[i] = r;
+;   i = (i + 1) % size;
+; }
+        ; j = 0
+        xor     edi, edi
+
+L210:   ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+        mov     eax, [ecx+esi*4+CRandomSFMTA.STATE]        ; sta[i]
+        lea     ebp, [esi+SFMT_MID]
+        cmp     ebp, SFMT_SIZE
+        jb      L211
+        sub     ebp, SFMT_SIZE
+L211:   add     eax, [ecx+ebp*4+CRandomSFMTA.STATE]        ; sta[(i + mid) % size]
+        lea     edx, [esi+SFMT_SIZE-1]
+        cmp     edx, SFMT_SIZE
+        jb      L212
+        sub     edx, SFMT_SIZE
+L212:   add     eax, [ecx+edx*4+CRandomSFMTA.STATE]        ; sta[(i + size - 1) % size]
+
+        ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+        call    InitSubf2
+        
+        ; sta[(i + mid) % size] ^= r;
+        xor     [ecx+ebp*4+CRandomSFMTA.STATE], eax
+        
+        ; r -= i;
+        sub     eax, esi
+        
+        ; sta[(i + mid + lag) % size] ^= r;
+        lea     edx, [esi+SFMT_MID+SFMT_LAG]
+        cmp     edx, SFMT_SIZE
+        jb      L213
+        sub     edx, SFMT_SIZE
+L213:   xor     [ecx+edx*4+CRandomSFMTA.STATE], eax
+
+        ; sta[i] = r;
+        mov     [ecx+esi*4+CRandomSFMTA.STATE], eax
+        
+        ; i = (i + 1) % size;
+        inc     esi
+        cmp     esi, SFMT_SIZE
+        jb      L214
+        sub     esi, SFMT_SIZE
+L214:
+        ; j++, loop while j < size
+        inc     edi
+        cmp     edi, SFMT_SIZE
+        jb      L210
+    
+        pop     ebp                                        ; remove local variable count
+
+        ; if (UseMother) {
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      L220
+        
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+;    r = func2(r) + j;
+;    MotherState[j] = r + sta[2*j];
+; }
+        call    InitSubf2
+        mov     edx, [ecx+CRandomSFMTA.STATE]
+        add     edx, eax
+        mov     [ecx+CRandomSFMTA.M0], edx
+        call    InitSubf2
+        inc     eax
+        mov     edx, [ecx+8+CRandomSFMTA.STATE]
+        add     edx, eax
+        mov     [ecx+CRandomSFMTA.M1], edx
+        call    InitSubf2
+        add     eax, 2
+        mov     edx, [ecx+16+CRandomSFMTA.STATE]
+        add     edx, eax        
+        mov     [ecx+CRandomSFMTA.M2], edx
+        call    InitSubf2
+        add     eax, 3
+        mov     edx, [ecx+24+CRandomSFMTA.STATE]
+        add     edx, eax        
+        mov     [ecx+CRandomSFMTA.M3], edx
+        call    InitSubf2
+        add     eax, 4
+        mov     edx, [ecx+32+CRandomSFMTA.STATE]
+        add     edx, eax        
+        mov     [ecx+CRandomSFMTA.MC], edx
+        
+L220:   ; More initialization and period certification
+        call    InitAndPeriod
+        
+        pop     ebp
+        pop     edi
+        pop     esi
+        pop     ebx
+        ret
+;_SFMTRandomInitByArray ENDP
+
+
+align 16
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; ecx = pointer to structure CRandomSFMTA
+; eax, ecx, xmm0 unchanged
+        cmp     dword [ecx+CRandomSFMTA.Instset], 4
+        jb      Mother_Next_386
+        movdqa  xmm1, oword [ecx+CRandomSFMTA.M3]          ; load M3,M2
+        movdqa  xmm2, oword [ecx+CRandomSFMTA.M1]          ; load M1,M0
+        movhps  qword [ecx+CRandomSFMTA.M3], xmm1          ; M3=M2
+        movq    qword [ecx+CRandomSFMTA.M2], xmm2          ; M2=M1
+        movhps  qword [ecx+CRandomSFMTA.M1], xmm2          ; M1=M0
+        pmuludq xmm1, oword [ecx+CRandomSFMTA.MF3]         ; M3*MF3, M2*MF2
+        pmuludq xmm2, oword [ecx+CRandomSFMTA.MF1]         ; M1*MF1, M0*MF0
+        paddq   xmm1, xmm2                                 ; P3+P1, P2+P0
+        movhlps xmm2, xmm1                                 ; Get high qword
+        movq    xmm3, qword [ecx+CRandomSFMTA.MC]          ; +carry
+        paddq   xmm1, xmm3
+        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
+        movq    qword [ecx+CRandomSFMTA.M0], xmm1          ; Store new M0 and carry
+        ret
+        
+Mother_Next_386: ; same, no SSE2
+        push    eax
+        push    esi
+        push    edi
+        ; prepare new random number
+        mov     eax, [ecx+CRandomSFMTA.MF3]
+        mul     dword [ecx+CRandomSFMTA.M3]                ; x[n-4]
+        mov     esi, eax
+        mov     eax, [ecx+CRandomSFMTA.M2]                 ; x[n-3]
+        mov     edi, edx
+        mov     [ecx+CRandomSFMTA.M3], eax
+        mul     dword [ecx+CRandomSFMTA.MF2]
+        add     esi, eax
+        mov     eax, [ecx+CRandomSFMTA.M1]                 ; x[n-2]
+        adc     edi, edx
+        mov     [ecx+CRandomSFMTA.M2], eax
+        mul     dword [ecx+CRandomSFMTA.MF1]
+        add     esi, eax
+        mov     eax,[ecx+CRandomSFMTA.M0]                  ; x[n-1]
+        adc     edi, edx
+        mov     [ecx+CRandomSFMTA.M1], eax
+        mul     dword [ecx+CRandomSFMTA.MF0]
+        add     eax, esi
+        adc     edx, edi
+        add     eax, [ecx+CRandomSFMTA.MC]
+        adc     edx, 0
+        ; store next random number and carry
+        mov     [ecx+CRandomSFMTA.M0], eax
+        mov     [ecx+CRandomSFMTA.MC], edx
+        pop     edi
+        pop     esi
+        pop     eax
+        ret
+
+;Mother_Next endp
+
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+
+; check if SSE2 instruction set supported
+        cmp     dword [ecx+CRandomSFMTA.Instset], 4
+        jb      SFMT_Generate_386
+        push    ebx
+        
+        ; register use
+        ; ecx = Pthis
+        ; edx = i*16 + offset state
+        ; eax, ebx = loop end
+        ; xmm1 = r1
+        ; xmm2 = r2 = r
+        ; xmm0, xmm3 = scratch
+        
+        ; r1 = state[SFMT_N*16 - 2];
+        ; r2 = state[SFMT_N*16 - 1];
+        movdqa  xmm1, oword [ecx+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+        movdqa  xmm2, oword [ecx+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+        mov     edx, CRandomSFMTA.STATE        
+        
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b, 
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+;    __m128i a1, b1, c1, d1, z1, z2;
+;    b1 = _mm_srli_epi32(b, SFMT_SR1);
+;    a1 = _mm_slli_si128(a, SFMT_SL2);
+;    c1 = _mm_srli_si128(c, SFMT_SR2);
+;    d1 = _mm_slli_epi32(d, SFMT_SL1);
+;    b1 = _mm_and_si128(b1, mask);
+;    z1 = _mm_xor_si128(a, a1);
+;    z2 = _mm_xor_si128(b1, d1);
+;    z1 = _mm_xor_si128(z1, c1);
+;    z2 = _mm_xor_si128(z1, z2);
+;    return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+;    r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+;    state[i] = r;
+;    r1 = r2;
+;    r2 = r;
+; }
+
+        mov      eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+        mov      ebx, SFMT_N*16 + CRandomSFMTA.STATE          ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L301:   movdqa   xmm0, oword [ecx+edx+SFMT_M*16]           ; b
+        psrld    xmm0, SFMT_SR1                            ; b1
+        pand     xmm0, oword [ecx+CRandomSFMTA.AMASK]      ; b1
+        movdqa   xmm3, oword [ecx+edx]                     ; a
+        pxor     xmm0, xmm3
+        pslldq   xmm3, SFMT_SL2                            ; a1
+        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
+        pxor     xmm0, xmm3
+        pxor     xmm0, xmm1
+        movdqa   xmm1, xmm2                                ; r1 = r2
+        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
+        pxor     xmm2, xmm0                                ; r2 = r
+        ; state[i] = r;
+        movdqa   oword [ecx+edx], xmm2
+        
+        ; i++ while i < SFMT_N - SFMT_M
+        add      edx, 16
+        cmp      edx, eax
+        jb       L301
+        
+;align 16
+L302:   ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
+        movdqa   xmm0, oword [ecx+edx+(SFMT_M-SFMT_N)*16]  ; b
+        psrld    xmm0, SFMT_SR1                            ; b1
+        pand     xmm0, oword [ecx+CRandomSFMTA.AMASK ]     ; b1
+        movdqa   xmm3, oword [ecx+edx]                     ; a
+        pxor     xmm0, xmm3
+        pslldq   xmm3, SFMT_SL2                            ; a1
+        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
+        pxor     xmm0, xmm3
+        pxor     xmm0, xmm1
+        movdqa   xmm1, xmm2                                ; r1 = r2
+        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
+        pxor     xmm2, xmm0                                ; r2 = r
+        ; state[i] = r;
+        movdqa   oword [ecx+edx], xmm2
+        
+        ; i++ while i < SFMT_N
+        add      edx, 16
+        cmp      edx, ebx
+        jb       L302
+        
+        ; Check if initialized
+L308:   cmp     dword [ecx+CRandomSFMTA.AMASK], SFMT_MASK1
+        jne     Error                  ; Make error if not initialized
+
+        ; ix = 0;
+        mov      dword [ecx+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+        pop      ebx
+        ret
+
+; Same, SSE2 instruction set not supported:
+SFMT_Generate_386:
+        push    ebx
+        push    esi
+        push    edi
+        push    ebp
+        sub     esp, 32
+
+        ; register use
+        ; ecx = Pthis
+        ; edx = i*16
+        ; ebx = ((i+SFMT_M) mod SFMT_N) * 16
+        ; ebp = accumulator
+        ; eax = temporary
+        ; esi, edi = previous state[i]
+        
+        %define RR1   esp               ; r1
+        %define RR2   esp+16            ; r2 = r
+        
+        ; r1 = state[SFMT_N - 2];
+        ; r2 = state[SFMT_N - 1];
+        lea     esi, [ecx+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+        mov     edi, esp
+        push    ecx
+        mov     ecx, 8
+        rep     movsd
+        pop     ecx
+        
+; The two loops from i = 0 to SFMT_N - SFMT_M - 1 and 
+; from SFMT_N - SFMT_M to SFMT_N - 1 are joined together here:
+; for (i = 0; i < SFMT_N; i++) {
+;    r = sfmt_recursion(state[i], state[(i+SFMT_M)%SFMT_N], r1, r2, mask);
+;    state[i] = r;
+;    r1 = r2;
+;    r2 = r;
+
+        xor     edx, edx                                   ; i = 0
+        mov     ebx, SFMT_M * 16                           ; j = ((i+SFMT_M)%SFMT_N)*16
+        
+M1:     ; loop start
+        ; 1. dword:
+        mov     ebp, [ecx+ebx+CRandomSFMTA.STATE+0]
+        shr     ebp, SFMT_SR1                              ; 32-bit shifts right
+        and     ebp, [ecx+CRandomSFMTA.AMASK+0]
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE+0]
+        xor     ebp, eax
+        mov     esi, eax                                   ; save for 2. dword
+        shl     eax, SFMT_SL2*8                            ; 128-bit shift left
+        xor     ebp, eax
+        mov     eax, [RR1+0]
+        mov     edi, [RR1+4]
+        shrd    eax, edi, SFMT_SR2*8 ; 128-bit shift right
+        xor     ebp, eax
+        mov     eax, [RR2+0]
+        mov     [RR1+0], eax                               ; r1 = r2
+        shl     eax, SFMT_SL1                              ; 32-bit shifts left
+        xor     ebp, eax
+        mov     [RR2+0], ebp                               ; r2 = r
+        mov     [ecx+edx+CRandomSFMTA.STATE+0], ebp        ; state[i] = r
+        
+        ; 2. dword:
+        mov     ebp, [ecx+ebx+CRandomSFMTA.STATE+4]
+        shr     ebp, SFMT_SR1                              ; 32-bit shifts right
+        and     ebp, [ecx+CRandomSFMTA.AMASK+4]
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE+4]
+        xor     ebp, eax        
+        mov     edi, eax                                   ; save for 3. dword
+        ; esi = [ecx+edx].STATE[0] before change
+        shld    eax, esi, SFMT_SL2*8                       ; 128-bit shift left
+        xor     ebp, eax
+        mov     eax, [RR1+4]
+        mov     esi, [RR1+8]
+        shrd    eax, esi, SFMT_SR2*8 ; 128-bit shift right
+        xor     ebp, eax
+        mov     eax, [RR2+4]
+        mov     [RR1+4], eax                               ; r1 = r2
+        shl     eax, SFMT_SL1                              ; 32-bit shifts left
+        xor     ebp, eax
+        mov     [RR2+4], ebp                               ; r2 = r
+        mov     [ecx+edx+CRandomSFMTA.STATE+4], ebp        ; state[i] = r
+        
+        ; 3. dword:
+        mov     ebp, [ecx+ebx+CRandomSFMTA.STATE+8]
+        shr     ebp, SFMT_SR1                              ; 32-bit shifts right
+        and     ebp, [ecx+CRandomSFMTA.AMASK+8]
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE+8]
+        mov     esi, eax                                   ; save for 4. dword
+        xor     ebp, eax
+        ; edi = [ecx+edx+CRandomSFMTA.STATE+4] before change
+        shld    eax, edi, SFMT_SL2*8                       ; 128-bit shift left
+        xor     ebp, eax
+        mov     eax, [RR1+8]
+        mov     edi, [RR1+12]
+        shrd    eax, edi, SFMT_SR2*8                       ; 128-bit shift right
+        xor     ebp, eax
+        mov     eax, [RR2+8]
+        mov     [RR1+8], eax                               ; r1 = r2
+        shl     eax, SFMT_SL1                              ; 32-bit shifts left
+        xor     ebp, eax
+        mov     [RR2+8], ebp                               ; r2 = r
+        mov     [ecx+edx+CRandomSFMTA.STATE+8], ebp        ; state[i] = r
+        
+        ; 4. dword:
+        mov     ebp, [ecx+ebx+CRandomSFMTA.STATE+12]
+        shr     ebp, SFMT_SR1                              ; 32-bit shifts right
+        and     ebp, [ecx+CRandomSFMTA.AMASK+12]
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE+12]
+        xor     ebp, eax
+        ; esi = [ecx+edx+CRandomSFMTA.STATE+8] before change        
+        shld    eax, esi, SFMT_SL2*8                       ; 128-bit shift left
+        xor     ebp, eax
+        mov     eax, [RR1+12]
+        shr     eax, SFMT_SR2*8                            ; 128-bit shift right
+        xor     ebp, eax
+        mov     eax, [RR2+12]
+        mov     [RR1+12], eax                              ; r1 = r2
+        shl     eax, SFMT_SL1                              ; 32-bit shifts left
+        xor     ebp, eax
+        mov     [RR2+12], ebp                              ; r2 = r
+        mov     [ecx+edx+CRandomSFMTA.STATE+12], ebp       ; state[i] = r
+        
+        ; increment i, j
+        add     ebx, 16
+        cmp     ebx, SFMT_N*16
+        jb      M4
+        sub     ebx, SFMT_N*16                             ; modulo SFMT_N
+M4:     add     edx, 16
+        cmp     edx, SFMT_N*16
+        jb      M1
+        
+        ; free r1, r2 from stack
+        add     esp, 32
+        pop     ebp
+        pop     edi
+        pop     esi
+      ; pop     ebx                                    ; ebx is popped at L308 after the jump
+        jmp     L308
+
+;SFMT_Generate endp
+
+
+;  extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
+
+_SFMTBRandom: ; generate random bits
+        mov     ecx, [esp+4]                               ; Pthis
+        ; Align by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     ecx, -16        
+
+SFMTBRandom_reg:                                           ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+        mov     edx, [ecx+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16
+        jnb     NeedGenerate
+        
+; y = ((uint32_t*)state)[ix++];
+        mov     eax, dword [ecx+edx+CRandomSFMTA.STATE]
+        add     edx, 4
+        mov     [ecx+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      NoMother
+        
+        ; add mother bits
+        add     eax,  [ecx+CRandomSFMTA.M0]                ; Add Mother random number        
+        call    Mother_Next                                ; Make next Mother random number ready
+        
+NoMother: ; return y;
+        ret
+        
+NeedGenerate: 
+        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
+        mov     eax, [ecx+CRandomSFMTA.STATE]
+        mov     dword [ecx+CRandomSFMTA.IX], 4
+        jmp     AfterGenerate
+        
+;_SFMTBRandom ENDP
+
+
+;  extern "C" double SFMTRandom  (void * Pthis); // Output random float
+
+_SFMTRandom:                                               ; generate random float with 52 bits resolution
+        mov     ecx, [esp+4]                               ; Pthis
+        ; Align by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     ecx, -16
+        
+SFMTRandom_reg:                                            ; internal entry point        
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+        mov     edx, [ecx+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16-4
+        jnb     L403  
+
+        ; check instruction set
+L401:   cmp     dword [ecx+CRandomSFMTA.Instset], 4
+        jb      L404
+        
+        ; read 64 random bits
+        movq    xmm0, qword [ecx+edx+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [ecx+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      L402
+        
+        ; add mother bits
+        movq    xmm1, qword [ecx+CRandomSFMTA.M0]          ; Mother random number MC and M0
+        pshuflw xmm1, xmm1, 01001011B                      ; Put M0 before MC, and swap the words in MC
+        paddq   xmm0, xmm1                                 ; Add SFMT and Mother outputs
+        call    Mother_Next                                ; Make next Mother random number ready
+        
+L402:   ; ConvertToFloat
+        psrlq	xmm0, 12			                       ; align with mantissa field of double precision float
+        movsd   xmm1, [ecx+CRandomSFMTA.one]   ; 1.0 double precision
+        por     xmm0, xmm1                                 ; insert exponent to get 1.0 <= x < 2.0
+        subsd   xmm0, xmm1                                 ; subtract 1.0 to get 0.0 <= x < 1.0
+        movsd   [ecx+CRandomSFMTA.TempRan], xmm0
+        fld     qword [ecx+CRandomSFMTA.TempRan]           ; transfer to st(0) register
+        ret                                                ; return value        
+        
+L403:   ;NeedGenerateR
+        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
+        xor     edx, edx
+        jmp     L401
+
+L404:   ;NoSSE2 ; Use old 386 instruction set:
+        push    ebx
+        ; read 64 random bits
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE]
+        mov     ebx, [ecx+edx+4+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [ecx+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      L405
+        
+        ; add mother bits
+        mov     edx, [ecx+CRandomSFMTA.MC]                 ; Mother random number MC
+        ror     edx, 16                                    ; rotate
+        add     eax, edx                                   ; 64 bit add
+        adc     ebx, [ecx+CRandomSFMTA.M0]                 ; Mother random number M0
+        call    Mother_Next                                ; next Mother. eax, ebx unchanged
+        
+L405:   ;ToFloatNoSSE2
+        shrd    eax, ebx, 12		                       ; align with mantissa field of double precision float
+        shr     ebx, 12
+        or      ebx, 3FF00000H                             ; insert exponent to get 1.0 <= x < 2.0
+        mov     dword [ecx+CRandomSFMTA.TempRan], eax
+        mov     dword [ecx+4+CRandomSFMTA.TempRan], ebx
+        fld     qword [ecx+CRandomSFMTA.TempRan]           ; transfer to st(0) register
+        fsub    qword [ecx+CRandomSFMTA.one]               ; subtract 1.0 to get 0.0 <= x < 1.0
+        pop     ebx
+        ret                                                ; return value        
+        
+;_SFMTRandom ENDP
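+
+; For reference, a C++ sketch of the bit manipulation used above to turn
+; 64 random bits into a double in [0,1) (BitsToDouble is a hypothetical
+; helper; r holds the random bits):
+;
+;    #include <cstdint>
+;    #include <cstring>
+;    double BitsToDouble (uint64_t r) {
+;        uint64_t u = (r >> 12) | 0x3FF0000000000000ull; // mantissa | bits of 1.0
+;        double x;
+;        std::memcpy(&x, &u, sizeof(x));                 // reinterpret: 1.0 <= x < 2.0
+;        return x - 1.0;                                 // 0.0 <= x < 1.0
+;    }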
+
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+
+_SFMTRandomL:                                              ; generate random float with 63 bits resolution
+        mov     ecx, [esp+4]                               ; Pthis
+        ; Align by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     ecx, -16
+        
+SFMTRandomL_reg:                                           ; internal entry point        
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+        mov     edx, [ecx+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16-4
+        jnb     L505
+
+        ; check instruction set
+L501:   cmp     dword [ecx+CRandomSFMTA.Instset], 4
+        jb      L506
+        
+        ; read 64 random bits
+        movq    xmm0, qword [ecx+edx+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [ecx+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      L502
+        
+        ; add mother bits
+        movq    xmm1, qword  [ecx+CRandomSFMTA.M0]         ; Mother random number MC and M0
+        pshuflw xmm1, xmm1, 01001011B                      ; Put M0 before MC, and swap the words in MC
+        paddq   xmm0, xmm1                                 ; Add SFMT and Mother outputs
+        call    Mother_Next                                ; Make next Mother random number ready
+        
+L502:   ;ConvertToFloat
+        sub     esp, 16                                    ; make space for long double
+        psrlq	xmm0, 1                                    ; align with mantissa field of long double
+        pcmpeqw xmm1, xmm1                                 ; all 1's
+        psllq   xmm1, 63                                   ; create a 1 in bit 63
+        por     xmm0, xmm1                                 ; bit 63 is always 1 in long double
+        movq    qword  [esp], xmm0                         ; store mantissa
+L504:   mov     dword  [esp+8], 3FFFH                      ; exponent
+        fld     tword  [esp]                               ; load long double
+        fsub    qword [ecx+CRandomSFMTA.one]               ; subtract 1.0 to get 0.0 <= x < 1.0
+        add     esp, 16
+        ret                                                ; return value        
+        
+L505:   ; NeedGenerateR
+        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
+        xor     edx, edx
+        jmp     L501
+
+L506:   ;NoSSE2 ; Use old 386 instruction set:
+        push    ebx
+        ; read 64 random bits
+        mov     eax, [ecx+edx+CRandomSFMTA.STATE]
+        mov     ebx, [ecx+edx+4+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [ecx+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [ecx+CRandomSFMTA.USEMOTHER], 0
+        jz      L507
+        
+        ; add mother bits
+        mov     edx, [ecx+CRandomSFMTA.MC]                 ; Mother random number MC
+        ror     edx, 16                                    ; rotate
+        add     eax, edx                                   ; 64 bit add
+        adc     ebx, [ecx+CRandomSFMTA.M0]                 ; Mother random number M0
+        call    Mother_Next                                ; next Mother. eax, ebx unchanged
+        
+L507:   ;ToFloatNoSSE2
+        mov     edx, ebx                                   ; now random bits are in edx:eax
+        pop     ebx                                        ; clean stack
+        sub     esp, 16                                    ; make room for long double
+        shrd    eax, edx, 1                                ; align with mantissa field of long double
+        stc
+        rcr     edx, 1                                     ; bit 63 is always 1
+        mov     [esp], eax
+        mov     [esp+4], edx
+        jmp     L504                                       ; the rest is the same as above
+        
+;_SFMTRandomL ENDP
+
+
+;  extern "C" int SFMTIRandom (void * Pthis, int min, int max);  // Output random integer
+
+_SFMTIRandom:
+        mov     ecx, [esp+4]                               ; Pthis
+        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
+        and     ecx, -16        
+        call    SFMTBRandom_reg                            ; random bits
+        mov     edx, [esp+12]                              ; max
+        mov     ecx, [esp+8]                               ; min
+        sub     edx, ecx
+        jl      short WrongInterval                        ; max < min
+        add     edx, 1                                     ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [edx+ecx]                             ; add min
+        ret
+WrongInterval:
+        mov     eax, 80000000H                             ; error exit
+        ret
+;_SFMTIRandom ENDP
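+
+; The interval scaling above in C++ (a sketch; BRandom() stands for the
+; 32 random bits delivered by SFMTBRandom):
+;
+;    uint32_t interval = (uint32_t)(max - min + 1);
+;    uint64_t longran  = (uint64_t)BRandom() * interval; // scale to interval
+;    int      iran     = (int)(longran >> 32) + min;     // high 32 bits + min
+;
+; This method has a small bias when the interval length does not divide
+; 2^32; SFMTIRandomX below removes the bias by rejection sampling.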
+
+
+;  extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+
+_SFMTIRandomX:
+        push    edi
+        mov     ecx, [esp+8]                               ; Pthis
+        mov     edx, [esp+12]                              ; min
+        mov     edi, [esp+16]                              ; max
+        ; Align by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     ecx, -16        
+        sub     edi, edx                                   ; max - min
+        jle     short M30                                  ; max <= min (signed)
+        inc     edi                                        ; interval = max - min + 1
+        
+        ; if (interval != LastInterval) {
+        cmp     edi, [ecx+CRandomSFMTA.LASTINTERVAL]
+        je      M10
+        ; need to calculate new rejection limit
+        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+        xor     eax, eax                                   ; 0
+        lea     edx, [eax+1]                               ; 1
+        div     edi                                        ; (would give overflow if interval = 1)
+        mul     edi
+        dec     eax
+        mov     [ecx+CRandomSFMTA.RLIMIT], eax       
+        mov     [ecx+CRandomSFMTA.LASTINTERVAL], edi
+M10:
+M20:    ; do { // Rejection loop
+        call    SFMTBRandom_reg                            ; random bits (ecx is preserved)
+        ; longran  = (uint64)BRandom() * interval;
+        mul     edi
+        ; } while (remainder > RLimit);
+        cmp     eax, [ecx+CRandomSFMTA.RLIMIT]
+        ja      M20
+        
+        ; return (int32)iran + min
+        mov     eax, [esp+12]                              ; min
+        add     eax, edx
+        pop     edi
+        ret
+        
+M30:    jl      M40
+        ; max = min. Return min
+        mov     eax, edx
+        pop     edi
+        ret                                                ; max = min exit
+        
+M40:    ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     edi
+        ret
+;_SFMTIRandomX ENDP
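+
+; The rejection method above in C++ (a sketch; BRandom() stands for the
+; 32 random bits delivered by SFMTBRandom):
+;
+;    uint32_t interval = (uint32_t)(max - min + 1);
+;    // RLimit = (largest multiple of interval that fits in 2^32) - 1
+;    uint32_t rlimit = (uint32_t)((1ull << 32) / interval) * interval - 1;
+;    uint64_t longran;
+;    do {                                      // rejection loop
+;        longran = (uint64_t)BRandom() * interval;
+;    } while ((uint32_t)longran > rlimit);     // reject biased remainders
+;    int iran = (int)(longran >> 32) + min;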
+
+
+
+; -------------------------------------------------------------------------
+;  Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+;  extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0); 
+_SFMTgenRandomInit:
+        mov     eax, [esp+4]                               ; seed
+        mov     edx, [esp+8]                               ; IncludeMother
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        
+        ; call _SFMTRandomInit with Pthis pointing to SFMTInstance
+        push    edx					                       ; IncludeMother
+        push    eax					                       ; seed
+        push    SFMTSize                                   ; ThisSize
+        push    ecx					                       ; Pthis
+        call    _SFMTRandomInit
+        add     esp, 16
+        ret
+;_SFMTgenRandomInit ENDP
+
+
+;  extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+_SFMTgenRandomInitByArray:
+        mov     eax, [esp+4]                               ; seeds
+        mov     ecx, [esp+8]                               ; NumSeeds
+        mov     edx, [esp+12]                              ; IncludeMother
+        push    edx
+        push    ecx
+        push    eax
+        push    SFMTSize                                   ; ThisSize
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        push    ecx
+        call	_SFMTRandomInitByArray
+        add     esp, 20
+        ret
+;_SFMTgenRandomInitByArray ENDP  
+
+
+;  extern "C" double SFMTgenRandom();
+_SFMTgenRandom:                                            ; generate random float with 52 bits resolution
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        jmp     SFMTRandom_reg                             ; random bits
+;_SFMTgenRandom ENDP
+
+
+;  extern "C" double SFMTgenRandom();
+_SFMTgenRandomL:                                           ; generate random float with 63 bits resolution
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        jmp     SFMTRandomL_reg                            ; random bits
+;_SFMTgenRandomL ENDP
+
+
+;  extern "C" int SFMTgenIRandom (int min, int max);
+_SFMTgenIRandom:
+        mov     eax, [esp+4]                               ; min
+        mov     edx, [esp+8]                               ; max
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        push    edx
+        push    eax
+        push    ecx                                        ; Pthis
+        call	_SFMTIRandom				               ; continue in _SFMTIRandom
+        add     esp, 12
+        ret
+;_SFMTgenIRandom ENDP
+
+
+;  extern "C" int SFMTgenIRandomX (int min, int max);
+_SFMTgenIRandomX:
+        mov     eax, [esp+4]                               ; min
+        mov     edx, [esp+8]                               ; max
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        push    edx
+        push    eax
+        push    ecx                                        ; Pthis
+        call	_SFMTIRandomX                              ; continue in _SFMTIRandomX
+        add     esp, 12
+        ret
+;_SFMTgenIRandomX ENDP
+
+
+;  extern "C" uint32_t SFMTgenBRandom();
+_SFMTgenBRandom:                                           ; generate 32 random bits
+        LOADOFFSET2ECX SFMTInstance                        ; Get address of SFMTInstance into ecx
+        jmp     SFMTBRandom_reg                            ; random bits
+;_SFMTgenBRandom ENDP
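+
+; Usage sketch for the single-threaded interface in C++ (hypothetical
+; seed and interval; link against asmlib):
+;
+;    extern "C" void   SFMTgenRandomInit (int seed, int IncludeMother);
+;    extern "C" int    SFMTgenIRandom (int min, int max);
+;    extern "C" double SFMTgenRandom (void);
+;
+;    SFMTgenRandomInit(1234, 0);          // seed the single global instance
+;    int    die = SFMTgenIRandom(1, 6);   // uniform integer in [1,6]
+;    double u   = SFMTgenRandom();        // uniform double in [0,1)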
+
+
+
+%IFDEF   WINDOWS
+; -----------------------------------------------------------------
+;  Single-threaded DLL versions for SFMT generator, Windows only
+; -----------------------------------------------------------------
+
+;  extern "C" void __stdcall SFMTgenRandomInitD(int seed, int IncludeMother = 0);
+_SFMTgenRandomInitD@8:
+        mov     eax, [esp+4]                               ; seed
+        mov     edx, [esp+8]                               ; IncludeMother
+        push    edx
+        push    eax
+        push    SFMTSize                                   ; ThisSize
+        push    SFMTInstance	                           ; Pthis
+        call    _SFMTRandomInit
+        add     esp, 16
+        ret     8
+;_SFMTgenRandomInitD@8  ENDP
+
+
+
+;  extern "C" void __stdcall SFMTgenRandomInitByArrayD
+; (int const seeds[], int NumSeeds, int IncludeMother = 0);
+_SFMTgenRandomInitByArrayD@12:
+        mov     eax, [esp+4]                               ; seeds
+        mov     ecx, [esp+8]                               ; NumSeeds
+        mov     edx, [esp+12]                              ; IncludeMother
+        push    edx
+        push    ecx
+        push    eax
+        push    SFMTSize                                   ; ThisSize
+        push    SFMTInstance
+        call	_SFMTRandomInitByArray
+        add     esp, 20
+        ret     12
+;_SFMTgenRandomInitByArrayD@12 ENDP
+
+
+
+;  extern "C" double __stdcall SFMTgenRandomD(); // Output random float
+_SFMTgenRandomD@0:      ; generate random float with 52 bits resolution
+        mov     ecx, SFMTInstance
+        jmp     SFMTRandom_reg                             ; random bits
+;_SFMTgenRandomD@0 ENDP
+
+
+;  extern "C" long double __stdcall SFMTgenRandomLD();
+_SFMTgenRandomLD@0:            ; generate random float with 63 bits resolution
+        mov     ecx, SFMTInstance
+        jmp     SFMTRandomL_reg                            ; random bits
+;_SFMTgenRandomLD@0 ENDP
+
+
+;  extern "C" int __stdcall SFMTgenIRandomD (int min, int max);
+_SFMTgenIRandomD@8:
+        mov     eax, [esp+4]                               ; min
+        mov     edx, [esp+8]                               ; max
+        push    edx
+        push    eax
+        push    SFMTInstance
+        call	_SFMTIRandom				               ; continue in _SFMTIRandom
+        add     esp, 12
+        ret     8
+;_SFMTgenIRandomD@8 ENDP
+
+
+;  extern "C" int __stdcall SFMTgenIRandomD (int min, int max);
+_SFMTgenIRandomXD at 8:
+        mov     eax, [esp+4]                               ; min
+        mov     edx, [esp+8]                               ; max
+        push    edx
+        push    eax
+        push    SFMTInstance
+        call	_SFMTIRandomX                              ; continue in _SFMTIRandom
+        add     esp, 12
+        ret     8
+;_SFMTgenIRandomXD at 8 ENDP
+
+
+
+;  extern "C" int __stdcall SFMTgenIRandomDX (int min, int max);
+_SFMTgenIRandomDX@8:
+        mov     eax, [esp+4]                               ; min
+        mov     edx, [esp+8]                               ; max
+        push    edx
+        push    eax
+        push    SFMTInstance
+        call    _SFMTIRandomX                              ; continue in _SFMTIRandomX
+        add     esp, 12
+        ret     8
+;_SFMTgenIRandomDX@8 ENDP
+
+
+;  extern "C" unsigned int __stdcall SFMTgenBRandomD();
+_SFMTgenBRandomD@0:                                        ; generate 32 random bits
+        mov     ecx, SFMTInstance
+        jmp     SFMTBRandom_reg                            ; random bits
+;_SFMTgenBRandomD@0 ENDP
+
+%ENDIF  ; WINDOWS
+
+%IFDEF   POSITIONINDEPENDENT
+get_thunk_ecx: ; load caller address into ecx for position-independent code
+        mov ecx, [esp]
+        ret
+
+get_thunk_edi: ; load caller address into edi for position-independent code
+        mov edi, [esp]
+        ret
+%ENDIF   ; POSITIONINDEPENDENT
+
+;END
diff --git a/asmlibSrc/sfmt64.asm b/asmlibSrc/sfmt64.asm
new file mode 100755
index 0000000..24bde65
--- /dev/null
+++ b/asmlibSrc/sfmt64.asm
@@ -0,0 +1,908 @@
+; ----------------------------- SFMT64.ASM ---------------------------
+; Author:        Agner Fog
+; Date created:  2008-11-01
+; Last modified: 2013-12-15
+; Project:       randoma library of random number generators
+; Source URL:    www.agner.org/random
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte 
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 64-bit mode version for x86-64 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+default rel
+
+global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
+global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
+global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
+global SFMTgenIRandomX, SFMTgenBRandom
+%IFDEF WINDOWS
+global SFMTgenRandomInitD, SFMTgenRandomInitByArrayD, SFMTgenIRandomD
+global SFMTgenIRandomXD, SFMTgenRandomD, SFMTgenBRandomD
+%ENDIF
+
+
+extern InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+
+section .data
+align 16
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+; Initialization Mask for SFMT:
+InitMask   DD SFMT_MASK
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16   ; code segment
+
+
+; ---------------------------------------------------------------
+;  Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; par1  = Pthis
+; par2d = ThisSize
+; par3d = seed
+; par4d = IncludeMother
+
+SFMTRandomInit:
+        cmp     par2d, SFMTSize
+        jb      Error                                      ; Error exit if buffer too small
+        push    rbx
+
+        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
+        and     par1, -16
+
+        xor     eax, eax
+        test    par4d, par4d                               ; IncludeMother
+        setnz   al                                         ; convert any nonzero value to 1
+        ; Store USEMOTHER
+        mov     [par1+CRandomSFMTA.USEMOTHER], eax
+        
+        mov     eax, par3d                                 ; seed
+        xor     ebx, ebx                                   ; loop counter i
+        jmp     L002                                       ; go into seeding loop
+
+L001:   ; seeding loop for SFMT
+        ; y = factor * (y ^ (y >> 30)) + (++i);
+        call    InitSubf0                                  ; randomization subfunction
+L002:   mov     [par1+rbx*4+CRandomSFMTA.STATE],eax        ; initialize state
+        cmp     ebx, SFMT_N*4 - 1
+        jb      L001
+
+        ; Put 5 more values into Mother-Of-All generator
+        call    InitSubf0
+        mov     [par1+CRandomSFMTA.M0], eax
+        call    InitSubf0
+        mov     [par1+CRandomSFMTA.M1], eax
+        call    InitSubf0
+        mov     [par1+CRandomSFMTA.M2], eax
+        call    InitSubf0
+        mov     [par1+CRandomSFMTA.M3], eax
+        call    InitSubf0
+        mov     [par1+CRandomSFMTA.MC], eax
+        
+        ; more initialization and period certification
+        call    InitAndPeriod
+        
+        pop     rbx
+        ret
+;SFMTRandomInit ENDP
+        
+Error:                                                     ; Error exit
+        xor     eax, eax
+        div     eax                                        ; Divide by 0
+        ret
+        
+; Subfunction used by SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; ebx = i
+; output:
+; eax = new y
+; ebx = i+1
+; edx modified
+        mov     edx, eax
+        shr     eax, 30
+        xor     eax, edx
+        imul    eax, 1812433253
+        inc     ebx
+        add     eax, ebx
+        ret
+;InitSubf0 endp 
+       
+; Subfunction used by SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+        mov     r10d, eax
+        shr     eax,  27
+        xor     eax,  r10d
+        imul    eax,  1664525
+        ret
+;InitSubf1 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+        mov     r10d, eax
+        shr     eax,  27
+        xor     eax,  r10d
+        imul    eax,  1566083941
+        ret
+;InitSubf2 endp
+
+
+; Subfunction for initialization and period certification, except seeding
+; par1 = aligned pointer to CRandomSFMTA
+InitAndPeriod: ; private
+        push    rbx
+        
+        ; initialize constants for Mother-Of-All
+        movaps  xmm0, oword [InitMother]
+        movaps  oword [par1+CRandomSFMTA.MF3], xmm0
+        movaps  xmm0, oword [InitMother+16]
+        movaps  oword [par1+CRandomSFMTA.MF1], xmm0
+        
+        ; initialize constants for SFMT
+        movaps  xmm0, oword [InitMask]
+        movaps  oword [par1+CRandomSFMTA.AMASK], xmm0
+
+        ; initialize various variables
+        xor     eax, eax
+        mov     dword [par1+CRandomSFMTA.one], eax
+        mov     dword [par1+4+CRandomSFMTA.one], 3FF00000H
+        mov     dword [par1+CRandomSFMTA.LASTINTERVAL], eax        
+        
+        ; get instruction set
+        push    par1
+        call    InstructionSet
+        pop     par1
+        mov     [par1+CRandomSFMTA.Instset], eax
+        
+        ; Period certification
+        ; Compute parity of STATE[0-4] & InitParity
+        movaps  xmm1, oword [par1+CRandomSFMTA.STATE]
+        andps   xmm1, oword [InitParity]
+        movhlps xmm2, xmm1                                 ; high qword
+        xorps   xmm1, xmm2                                 ; xor two qwords
+        pshufd  xmm2, xmm1, 1                              ; high dword
+        xorps   xmm1, xmm2                                 ; xor two dwords
+        movd    eax,  xmm1                                 ; do rest of xor in eax
+        mov     edx,  eax
+        shr     eax,  16
+        xor     eax,  edx                                  ; xor two words
+        xor     al,   ah                                   ; xor two bytes
+        jpo     L008                                       ; parity odd: period OK
+        
+        ; parity even: period not OK
+        ; Find a nonzero dword in period certification vector
+        xor     ebx, ebx                                   ; loop counter
+        lea     rdx, [InitParity]
+L005:   mov     eax, [rdx+rbx*4]                           ; InitParity[i]
+        test    eax, eax
+        jnz     L006
+        inc     ebx
+        ; assume that there is a nonzero dword in InitParity
+        jmp     L005                                       ; loop until nonzero found
+        
+L006:   ; find first nonzero bit in eax
+        bsf     edx, eax
+        ; flip the corresponding bit in STATE
+        btc     [par1+rbx*4+CRandomSFMTA.STATE], edx
+
+L008:   cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
+        je      L009
+        call    Mother_Next                                ; Make first random number ready
+
+L009:   ; Generate first random numbers and set IX = 0
+        call    SFMT_Generate
+        pop     rbx
+        ret
+;InitAndPeriod   endp
+
+
+;  extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+SFMTRandomInitByArray:
+; Parameters
+; par1  = Pthis
+; par2d = ThisSize
+; par3  = seeds
+; par4d = NumSeeds
+; par5d = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4                                     ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+   SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+   SFMT_LAG equ  7
+%ELIF SFMT_SIZE >= 39
+   SFMT_LAG equ  5
+%ELSE
+   SFMT_LAG equ  3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+        xor     eax, eax
+        cmp     par5d, eax                                 ; IncludeMother (parameter is on stack if Windows)
+        setnz   al                                         ; convert any nonzero value to 1
+
+        push    rbx
+        push    rbp
+        
+        cmp     par2d, SFMTSize                            ; ThisSize
+        jb      Error                                      ; Error exit if buffer too small
+
+        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
+        and     par1, -16        
+
+        ; Store USEMOTHER
+        mov     [par1+CRandomSFMTA.USEMOTHER], eax 
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+;    r = factor * (r ^ (r >> 30)) + i;
+;    sta[i] = r;}
+
+        mov     eax, par4d                                 ; r = NumSeeds
+        xor     ebx, ebx                                   ; i
+L100:   mov     par2d, eax
+        shr     eax, 30
+        xor     eax, par2d
+        imul    eax, 1812433253
+        add     eax, ebx
+        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
+        inc     ebx
+        cmp     ebx, SFMT_SIZE
+        jb      L100        
+
+        ; count = max(NumSeeds,size-1)
+        mov     eax,  SFMT_SIZE - 1
+        mov     r11d, par4d                                 ; NumSeeds
+        cmp     r11d, eax
+        cmovb   r11d, eax
+        
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+;    r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+;    sta[(i + mid) % size] += r;
+;    if (j < NumSeeds) r += seeds[j]
+;    r += i;
+;    sta[(i + mid + lag) % size] += r;
+;    sta[i] = r;
+;    i = (i + 1) % size;
+; }
+        ; register use:
+        ; par1  = Pthis
+        ; par2  = j
+        ; par3  = seeds
+        ; par4  = NumSeeds
+        ; eax   = r
+        ; ebx   = i
+        ; ebp   = (i + mid) % size, (i + mid + lag) % size
+        ; r10   = (i + size - 1) % size
+        ; r11   = count
+
+        xor     par2d, par2d           ; j = 0
+        lea     ebx, [par2+1]          ; i = 1
+
+L101:   ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+        mov     eax,  [par1+rbx*4+CRandomSFMTA.STATE]      ; sta[i]
+        lea     ebp,  [rbx+SFMT_MID]
+        cmp     ebp,  SFMT_SIZE
+        jb      L102
+        sub     ebp,  SFMT_SIZE
+L102:   xor     eax,  [par1+rbp*4+CRandomSFMTA.STATE]      ; sta[(i + mid) % size]
+        lea     r10d, [rbx+SFMT_SIZE-1]
+        cmp     r10d, SFMT_SIZE
+        jb      L103
+        sub     r10d, SFMT_SIZE
+L103:   xor     eax,  [par1+r10*4+CRandomSFMTA.STATE]      ; sta[(i + size - 1) % size]
+
+        ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+        call    InitSubf1
+        
+        ; sta[(i + mid) % size] += r;
+        add     [par1+rbp*4+CRandomSFMTA.STATE], eax
+        
+        ; if (j < NumSeeds) r += seeds[j]
+        cmp     par2d, par4d
+        jnb     L104
+        add     eax, [par3+par2*4]        
+L104:
+        ; r += i;
+        add     eax, ebx
+        
+        ; sta[(i + mid + lag) % size] += r;
+        lea     ebp, [rbx+SFMT_MID+SFMT_LAG]
+        cmp     ebp, SFMT_SIZE
+        jb      L105
+        sub     ebp, SFMT_SIZE
+L105:   add     [par1+rbp*4+CRandomSFMTA.STATE], eax
+        
+        ;sta[i] = r;
+        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
+        
+        ; i = (i + 1) % size;
+        inc     ebx
+        cmp     ebx, SFMT_SIZE
+        jb      L106
+        sub     ebx, SFMT_SIZE
+L106:
+        ; j++, loop while j < count
+        inc     par2d
+        cmp     par2d, r11d
+        jb      L101
+        
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+;   r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+;   sta[(i + mid) % size] ^= r;
+;   r -= i;
+;   sta[(i + mid + lag) % size] ^= r;
+;   sta[i] = r;
+;   i = (i + 1) % size;
+; }
+        ; j = 0
+        xor     par2d, par2d
+
+L110:    ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+        mov     eax,  [par1+rbx*4+CRandomSFMTA.STATE]      ; sta[i]
+        lea     ebp,  [rbx+SFMT_MID]
+        cmp     ebp,  SFMT_SIZE
+        jb      L111
+        sub     ebp,  SFMT_SIZE
+L111:   add     eax,  [par1+rbp*4+CRandomSFMTA.STATE]      ; sta[(i + mid) % size]
+        lea     r10d, [rbx+SFMT_SIZE-1]
+        cmp     r10d, SFMT_SIZE
+        jb      L112
+        sub     r10d, SFMT_SIZE
+L112:   add     eax,  [par1+r10*4+CRandomSFMTA.STATE]      ; sta[(i + size - 1) % size]
+
+        ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+        call    InitSubf2
+        
+        ; sta[(i + mid) % size] ^= r;
+        xor     [par1+rbp*4+CRandomSFMTA.STATE], eax
+        
+        ; r -= i;
+        sub     eax, ebx
+        
+        ; sta[(i + mid + lag) % size] ^= r;
+        lea     ebp, [rbx+SFMT_MID+SFMT_LAG]
+        cmp     ebp, SFMT_SIZE
+        jb      L113
+        sub     ebp, SFMT_SIZE
+L113:   xor     [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+        ; sta[i] = r;
+        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
+        
+        ; i = (i + 1) % size;
+        inc     ebx
+        cmp     ebx, SFMT_SIZE
+        jb      L114
+        sub     ebx, SFMT_SIZE
+L114:
+        ; j++, loop while j < size
+        inc     par2d
+        cmp     par2d, SFMT_SIZE
+        jb      L110
+    
+        ; if (UseMother) {
+        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
+        jz      L120
+        
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+;    r = func2(r) + j;
+;    MotherState[j] = r + sta[2*j];
+; }
+        call    InitSubf2
+        mov     par2d, [par1+CRandomSFMTA.STATE]
+        add     par2d, eax
+        mov     [par1+CRandomSFMTA.M0], par2d
+        call    InitSubf2
+        inc     eax
+        mov     par2d, [par1+8+CRandomSFMTA.STATE]
+        add     par2d, eax
+        mov     [par1+CRandomSFMTA.M1], par2d
+        call    InitSubf2
+        add     eax, 2
+        mov     par2d, [par1+16+CRandomSFMTA.STATE]
+        add     par2d, eax        
+        mov     [par1+CRandomSFMTA.M2], par2d
+        call    InitSubf2
+        add     eax, 3
+        mov     par2d, [par1+24+CRandomSFMTA.STATE]
+        add     par2d, eax        
+        mov     [par1+CRandomSFMTA.M3], par2d
+        call    InitSubf2
+        add     eax, 4
+        mov     par2d, [par1+32+CRandomSFMTA.STATE]
+        add     par2d, eax        
+        mov     [par1+CRandomSFMTA.MC], par2d
+        
+L120:    ; More initialization and period certification
+        call    InitAndPeriod
+        
+        pop     rbp
+        pop     rbx
+        ret
+;SFMTRandomInitByArray ENDP
+
+
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; par1 = aligned pointer to structure CRandomSFMTA
+; eax, par1, xmm0 unchanged
+
+        movdqa  xmm1, oword [par1+CRandomSFMTA.M3]         ; load M3,M2
+        movdqa  xmm2, oword [par1+CRandomSFMTA.M1]         ; load M1,M0
+        movhps  qword [par1+CRandomSFMTA.M3], xmm1         ; M3=M2
+        movq    qword [par1+CRandomSFMTA.M2], xmm2         ; M2=M1
+        movhps  qword [par1+CRandomSFMTA.M1], xmm2         ; M1=M0
+        pmuludq xmm1, oword [par1+CRandomSFMTA.MF3]        ; M3*MF3, M2*MF2
+        pmuludq xmm2, oword [par1+CRandomSFMTA.MF1]        ; M1*MF1, M0*MF0
+        paddq   xmm1, xmm2                                 ; P3+P1, P2+P0
+        movhlps xmm2, xmm1                                 ; Get high qword
+        movq    xmm3, qword [par1+CRandomSFMTA.MC]         ; +carry
+        paddq   xmm1, xmm3
+        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
+        movq    qword [par1+CRandomSFMTA.M0], xmm1         ; Store new M0 and carry
+        ret
+;Mother_Next endp
+
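+; A scalar C sketch of the recurrence above. The factors are loaded from
+; InitMother elsewhere in this file; the literal values below are the
+; classic Mother-Of-All constants and should be read as an assumption:
+;   uint64_t sum = 2111111111ULL*M3 + 1492ULL*M2 + 1776ULL*M1
+;                + 5115ULL*M0 + MC;
+;   M3 = M2;  M2 = M1;  M1 = M0;
+;   M0 = (uint32_t)sum;           // new random value
+;   MC = (uint32_t)(sum >> 32);   // new carry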
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+
+        push    rbx
+        
+        ; register use
+        ; par1 = Pthis (rcx or rdi)
+        ; edx  = i*16 + offset state
+        ; eax, ebx = loop end
+        ; xmm1 = r1
+        ; xmm2 = r2 = r
+        ; xmm0, xmm3 = scratch
+        
+        ; r1 = state[SFMT_N*16 - 2];
+        ; r2 = state[SFMT_N*16 - 1];
+        movdqa  xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+        movdqa  xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+        mov     edx, CRandomSFMTA.STATE
+        
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b, 
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+;    __m128i a1, b1, c1, d1, z1, z2;
+;    b1 = _mm_srli_epi32(b, SFMT_SR1);
+;    a1 = _mm_slli_si128(a, SFMT_SL2);
+;    c1 = _mm_srli_si128(c, SFMT_SR2);
+;    d1 = _mm_slli_epi32(d, SFMT_SL1);
+;    b1 = _mm_and_si128(b1, mask);
+;    z1 = _mm_xor_si128(a, a1);
+;    z2 = _mm_xor_si128(b1, d1);
+;    z1 = _mm_xor_si128(z1, c1);
+;    z2 = _mm_xor_si128(z1, z2);
+;    return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+;    r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+;    state[i] = r;
+;    r1 = r2;
+;    r2 = r;
+; }
+
+        mov      eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+        mov      ebx, SFMT_N*16 + CRandomSFMTA.STATE          ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L201:   movdqa   xmm0, oword [par1+rdx+SFMT_M*16]          ; b
+        psrld    xmm0, SFMT_SR1                            ; b1
+        pand     xmm0, oword [par1+CRandomSFMTA.AMASK]     ; b1
+        movdqa   xmm3, oword [par1+rdx]                    ; a
+        pxor     xmm0, xmm3
+        pslldq   xmm3, SFMT_SL2                            ; a1
+        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
+        pxor     xmm0, xmm3
+        pxor     xmm0, xmm1
+        movdqa   xmm1, xmm2                                ; r1 = r2
+        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
+        pxor     xmm2, xmm0                                ; r2 = r
+        ; state[i] = r;
+        movdqa   oword [par1+rdx], xmm2
+        
+        ; i++ while i < SFMT_N - SFMT_M
+        add      edx, 16
+        cmp      edx, eax
+        jb       L201
+        
+;align 16
+L202:   ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
+        movdqa   xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
+        psrld    xmm0, SFMT_SR1                            ; b1
+        pand     xmm0, oword [par1+CRandomSFMTA.AMASK]     ; b1
+        movdqa   xmm3, oword [par1+rdx]                    ; a
+        pxor     xmm0, xmm3
+        pslldq   xmm3, SFMT_SL2                            ; a1
+        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
+        pxor     xmm0, xmm3
+        pxor     xmm0, xmm1
+        movdqa   xmm1, xmm2                                ; r1 = r2
+        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
+        pxor     xmm2, xmm0                                ; r2 = r
+        ; state[i] = r;
+        movdqa   oword [par1+rdx], xmm2
+        
+        ; i++ while i < SFMT_N
+        add      edx, 16
+        cmp      edx, ebx
+        jb       L202
+        
+        ; Check if initialized
+L208:   cmp     dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
+        jne     Error                                      ; error exit if not initialized
+
+        ; ix = 0;
+        mov      dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+        pop      rbx
+        ret
+;SFMT_Generate endp
+
+
+;  extern "C" unsigned int SFMTBRandom(void * Pthis);                     // Output random bits
+
+SFMTBRandom:                                               ; generate random bits
+        ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     par1, -16        
+
+SFMTBRandom_reg:                                           ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+        mov     edx, [par1+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16
+        jnb     NeedGenerate
+        
+; y = ((uint32_t*)state)[ix++];
+        mov     eax, dword [par1+rdx+CRandomSFMTA.STATE]
+        add     edx, 4
+        mov     [par1+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
+        jz      NoMother
+        
+        ; add mother bits
+        add     eax,  [par1+CRandomSFMTA.M0]               ; Add Mother random number        
+        call    Mother_Next                                ; Make next Mother random number ready
+        
+NoMother: ; return y;
+        ret
+        
+NeedGenerate: 
+        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
+        mov     eax, [par1+CRandomSFMTA.STATE]
+        mov     dword [par1+CRandomSFMTA.IX], 4
+        jmp     AfterGenerate
+        
+;SFMTBRandom ENDP
+
+
+;  extern "C" double SFMTRandom  (void * Pthis); // Output random float
+SFMTRandom:                                                ; generate random float with 52 bits resolution
+        ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned        
+        and     par1, -16
+        
+SFMTRandom_reg:                                            ; internal entry point        
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+        mov     edx, [par1+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16-4
+        jnb     L303
+
+L301:   ; read 64 random bits
+        movq    xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [par1+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
+        jz      L302 ; ConvertToFloat
+        
+        ; add mother bits
+        movq    xmm1, qword [par1+CRandomSFMTA.M0]         ; Mother random number MC and M0
+        pshuflw xmm1, xmm1, 01001011B                      ; Put M0 before MC, and swap the words in MC
+        paddq   xmm0, xmm1                                 ; Add SFMT and Mother outputs
+        call    Mother_Next                                ; Make next Mother random number ready
+        
+L302:   ; ConvertToFloat
+        psrlq	xmm0, 12			                       ; align with mantissa field of double precision float
+        movsd   xmm1, [par1+CRandomSFMTA.one]              ; 1.0 double precision
+        por     xmm0, xmm1                                 ; insert exponent to get 1.0 <= x < 2.0
+        subsd   xmm0, xmm1                                 ; subtract 1.0 to get 0.0 <= x < 1.0
+        ret                                                ; return value        
+        
+L303:   ; NeedGenerateR
+        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
+        xor     edx, edx
+        jmp     L301
+
+;SFMTRandom ENDP
+
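+; A portable C sketch of the conversion at L302 above (same trick: put 52
+; random bits in the mantissa under exponent 0x3FF, then subtract 1.0;
+; needs <stdint.h> and <string.h>):
+;   uint64_t r = /* 64 random bits */;
+;   r = (r >> 12) | 0x3FF0000000000000ULL;   // 1.0 <= x < 2.0
+;   double x;  memcpy(&x, &r, 8);
+;   return x - 1.0;                          // 0.0 <= x < 1.0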
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+SFMTRandomL:                                               ; generate random float with 63 bits resolution
+        ; Align Pthis by 16.
+        and     par1, -16
+        
+SFMTRandomL_reg:                                           ; internal entry point        
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+        mov     edx, [par1+CRandomSFMTA.IX]
+        cmp     edx, SFMT_N*16-4
+        jnb     L403
+
+L401:   ; read 64 random bits
+        movq    xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+        add     edx, 8
+        mov     [par1+CRandomSFMTA.IX], edx
+
+        ; combine with Mother-Of-All generator?
+        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
+        jz      L402
+                
+        ; add mother bits
+        movq    xmm1, qword [par1+CRandomSFMTA.M0]        ; Mother random number MC and M0
+        pshuflw xmm1, xmm1, 01001011B                     ; Put M0 before MC, and swap the words in MC
+        paddq   xmm0, xmm1                                ; Add SFMT and Mother outputs
+        call    Mother_Next                               ; Make next Mother random number ready
+        
+L402:   ;ConvertToFloat
+        sub     rsp, 16                                   ; make space for long double
+        psrlq	xmm0, 1                                   ; align with mantissa field of long double
+        pcmpeqw xmm1, xmm1                                ; all 1's
+        psllq   xmm1, 63                                  ; create a 1 in bit 63
+        por     xmm0, xmm1                                ; bit 63 is always 1 in long double
+        movq    qword [rsp], xmm0                         ; store mantissa
+        mov     dword [rsp+8], 3FFFH                      ; exponent
+        fld     tword [rsp]                               ; load long double
+        fsub    qword [par1+CRandomSFMTA.one]             ; subtract 1.0 to get 0.0 <= x < 1.0
+        pcmpeqw xmm0, xmm0                                ; make a NAN for compilers that don't support long double
+        add     rsp, 16
+        ret                                               ; return value in st(0)
+        
+L403:   ;NeedGenerateR
+        call    SFMT_Generate                             ; generate SFMT_N*4 random dwords
+        xor     edx, edx
+        jmp     L401        
+;SFMTRandomL ENDP
+
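+; A C sketch of the conversion at L402 above, assuming the x87 80-bit
+; extended format (explicit integer bit 63, 15-bit exponent, bias 0x3FFF):
+;   uint64_t mant = (r >> 1) | 0x8000000000000000ULL;  // bit 63 always 1
+;   unsigned char buf[16] = {0};
+;   uint16_t expo = 0x3FFF;                            // exponent for [1,2)
+;   memcpy(buf, &mant, 8);  memcpy(buf + 8, &expo, 2);
+;   long double x;  memcpy(&x, buf, sizeof x);
+;   return x - 1.0L;                                   // 0.0 <= x < 1.0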
+
+;  extern "C" int SFMTIRandom (void * Pthis, int min, int max);  // Output random integer
+
+SFMTIRandom:
+; par1  = Pthis
+; par2d = min
+; par3d = max
+
+        ; Align Pthis by 16.
+        and     par1, -16        
+        push    par2                                       ; save min, max
+        push    par3
+        call    SFMTBRandom_reg                            ; random bits
+        pop     rdx                                        ; max
+        pop     rcx                                        ; min        
+        sub     edx, ecx
+        jl      short WrongInterval                        ; max < min
+        inc     edx                                        ; max - min + 1
+        mul     edx                                        ; multiply random number by interval and truncate
+        lea     eax, [rdx+rcx]                             ; add min to high dword of product
+        ret
+WrongInterval:
+        mov     eax, 80000000H                             ; error exit
+        ret
+;SFMTIRandom ENDP
+
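+; The multiply-and-truncate mapping above as C (sketch; BRandom stands for
+; the 32 random bits from SFMTBRandom):
+;   uint32_t interval = (uint32_t)(max - min) + 1;
+;   uint64_t prod = (uint64_t)BRandom() * interval;
+;   return (int)(prod >> 32) + min;    // high dword of the 64-bit product
+; The result has a slight bias when interval does not divide 2^32;
+; SFMTIRandomX below removes that bias by rejection sampling.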
+
+;  extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+
+SFMTIRandomX:
+; par1  = Pthis
+; par2d = min
+; par3d = max
+
+        push    rbx
+        ; Align Pthis by 16.
+        and     par1, -16        
+
+        mov     ebx, par3d 
+        sub     ebx, par2d                                 ; max - min
+        jle     short M30                                  ; max <= min (signed)
+        inc     ebx                                        ; interval = max - min + 1
+        
+        ; if (interval != LastInterval) {
+        cmp     ebx, [par1+CRandomSFMTA.LASTINTERVAL]
+        je      M10
+        ; need to calculate new rejection limit
+        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+        xor     eax, eax                                   ; 0
+        lea     edx, [eax+1]                               ; 1
+        div     ebx                                        ; (would give overflow if interval = 1)
+        mul     ebx
+        dec     eax
+        mov     [par1+CRandomSFMTA.RLIMIT], eax       
+        mov     [par1+CRandomSFMTA.LASTINTERVAL], ebx
+M10:    mov     ebx, par2d                                 ; save min
+
+M20:    ; do { // Rejection loop
+        call    SFMTBRandom_reg                            ; random bits (par1 is preserved)
+        ; longran  = (uint64)BRandom() * interval;
+        mul     dword [par1+CRandomSFMTA.LASTINTERVAL]
+        ; } while (remainder > RLimit);
+        cmp     eax, [par1+CRandomSFMTA.RLIMIT]
+        ja      M20
+        
+        ; return (int32)iran + min
+        lea     eax, [rbx+rdx]
+        pop     rbx
+        ret
+        
+M30:    jl      M40
+        ; max = min. Return min
+        mov     eax, par2d
+        pop     rbx
+        ret                                                ; max = min exit
+        
+M40:    ; max < min: error
+        mov     eax, 80000000H                             ; error exit
+        pop     rbx
+        ret
+;SFMTIRandomX ENDP
+
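+; A C sketch of the rejection method above (illustrative):
+;   uint32_t interval = (uint32_t)(max - min) + 1;
+;   // largest low-dword value that keeps the high dword unbiased:
+;   uint32_t rlimit = (uint32_t)(0x100000000ULL / interval) * interval - 1;
+;   uint64_t prod;
+;   do { prod = (uint64_t)BRandom() * interval; }
+;   while ((uint32_t)prod > rlimit);   // reject the biased region
+;   return (int)(prod >> 32) + min;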
+
+
+; -------------------------------------------------------------------------
+;  Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+;  extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0); 
+SFMTgenRandomInit:
+%IFDEF WINDOWS
+SFMTgenRandomInitD:
+%ENDIF
+; par1d = seed
+; par2d = IncludeMother
+
+        ; set up parameters for call SFMTRandomInit
+        mov     par4d, par2d                               ; IncludeMother
+        mov     par3d, par1d                               ; seed
+        mov     par2d, SFMTSize                            ; ThisSize
+        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
+        jmp     SFMTRandomInit
+;SFMTgenRandomInit ENDP
+
+
+;  extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+SFMTgenRandomInitByArray:
+; par1  = seeds
+; par2d = NumSeeds
+; par3d = IncludeMother
+
+        ; set up parameters for call SFMTRandomInitByArray
+%IFDEF   WINDOWS
+SFMTgenRandomInitByArrayD:
+        push    par3                                       ; IncludeMother on stack
+        sub     rsp, 32                                    ; empty shadow space
+        mov     par4d, par2d                               ; NumSeeds
+        mov     par3,  par1                                ; seeds
+        mov     par2d, SFMTSize                            ; ThisSize
+        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
+        call	SFMTRandomInitByArray
+        add     rsp, 40
+        ret
+%ELSE    ; UNIX
+        mov     par5d, par3d                               ; IncludeMother in register
+        mov     par4d, par2d                               ; NumSeeds
+        mov     par3,  par1                                ; seeds
+        mov     par2d, SFMTSize                            ; ThisSize
+        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
+        jmp     SFMTRandomInitByArray
+%ENDIF        
+;SFMTgenRandomInitByArray ENDP  
+
+
+;  extern "C" double SFMTgenRandom();
+SFMTgenRandom:                                             ; generate random float with 52 bits resolution
+%IFDEF WINDOWS
+SFMTgenRandomD:
+%ENDIF
+        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
+        jmp     SFMTRandom_reg                             ; generate random double
+;SFMTgenRandom ENDP
+
+
+;  extern "C" double SFMTgenRandom();
+SFMTgenRandomL:                                            ; generate random float with 63 bits resolution
+        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
+        jmp     SFMTRandomL_reg                            ; generate random long double
+;SFMTgenRandomL ENDP
+
+
+;  extern "C" int SFMTgenIRandom (int min, int max);
+SFMTgenIRandom:
+%IFDEF WINDOWS
+SFMTgenIRandomD:   
+%ENDIF
+        mov     par3d, par2d
+        mov     par2d, par1d
+        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
+        jmp     SFMTIRandom                                ; continue in SFMTIRandom
+;SFMTgenIRandom ENDP
+
+
+;  extern "C" int SFMTgenIRandomX (int min, int max);
+SFMTgenIRandomX:
+%IFDEF WINDOWS
+SFMTgenIRandomXD:
+%ENDIF
+        mov     par3d, par2d
+        mov     par2d, par1d
+        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
+        jmp     SFMTIRandomX                               ; continue in SFMTIRandomX
+;SFMTgenIRandomX ENDP
+
+
+;  extern "C" uint32_t SFMTgenBRandom();
+SFMTgenBRandom:                                            ; generate 32 random bits
+%IFDEF WINDOWS
+SFMTgenBRandomD:
+%ENDIF
+        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
+        jmp     SFMTBRandom_reg                            ; random bits
+;SFMTgenBRandom ENDP
+
+;END
diff --git a/asmlibSrc/strcat32.asm b/asmlibSrc/strcat32.asm
new file mode 100755
index 0000000..523808d
--- /dev/null
+++ b/asmlibSrc/strcat32.asm
@@ -0,0 +1,60 @@
+;*************************  strcat32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char * dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_strcat: function                  ; Function _A_strcat
+global ?OVR_strcat: function                ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen32.asm
+extern _A_strlen
+
+; Imported from memcpy32.asm
+extern _A_memcpy
+
+
+SECTION .text  align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+;    memcpy(dest+strlen(dest), src, strlen(src)+1);
+;    return dest
+; }
+
+; Function entry:
+_A_strcat:
+?OVR_strcat:
+
+        mov     eax, [esp+8]           ; src
+        push    eax
+        call    _A_strlen              ; length of src
+        inc     eax                    ; include terminating zero in length
+        push    eax                    ; strlen(src)+1        
+        mov     edx, [esp+4+8]         ; dest
+        push    edx
+        call    _A_strlen              ; length of dest
+        pop     edx                    ; dest. Assume unchanged by _A_strlen
+        add     edx, eax               ; dest+strlen(dest)
+        mov     ecx, [esp+8+8]         ; src
+                                       ; strlen(src)+1 is on stack
+        push    ecx                    ; src
+        push    edx                    ; dest+strlen(dest)
+        call    _A_memcpy              ; copy
+        add     esp, 16                ; clean up stack
+        mov     eax, [esp+4]           ; return dest
+        ret
+
+;_A_strcat ENDP
diff --git a/asmlibSrc/strcat64.asm b/asmlibSrc/strcat64.asm
new file mode 100755
index 0000000..bf9de71
--- /dev/null
+++ b/asmlibSrc/strcat64.asm
@@ -0,0 +1,68 @@
+;*************************  strcat64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char *dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcat: function                  ; Function A_strcat
+global ?OVR_strcat: function               ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text  align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+;    memcpy(dest+strlen(dest), src, strlen(src)+1);
+;    return dest
+; }
+
+; Function entry:
+A_strcat:
+?OVR_strcat:
+
+%IFDEF  WINDOWS
+%define Rpar1   rcx                    ; function parameter 1
+%define Rpar2   rdx                    ; function parameter 2
+%define Rpar3   r8                     ; function parameter 3
+%ENDIF
+%IFDEF  UNIX
+%define Rpar1   rdi                    ; function parameter 1
+%define Rpar2   rsi                    ; function parameter 2
+%define Rpar3   rdx                    ; function parameter 3
+%ENDIF
+
+        push    Rpar1                  ; dest
+        push    Rpar2                  ; src
+        call    A_strlen               ; length of dest
+        push    rax                    ; strlen(dest)
+        mov     Rpar1, [rsp+8]         ; src
+        call    A_strlen               ; length of src
+        pop     Rpar1                  ; strlen(dest)
+        pop     Rpar2                  ; src
+        add     Rpar1, [rsp]           ; dest + strlen(dest)
+        lea     Rpar3, [rax+1]         ; strlen(src)+1
+        call    A_memcpy               ; copy
+        pop     rax                    ; return dest
+        ret
+
+;A_strcat ENDP
diff --git a/asmlibSrc/strcmp32.asm b/asmlibSrc/strcmp32.asm
new file mode 100755
index 0000000..008d1fb
--- /dev/null
+++ b/asmlibSrc/strcmp32.asm
@@ -0,0 +1,177 @@
+;*************************  strcmp32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-14
+; Last modified:    2012-07-07
+
+; Description:
+; Faster version of the standard strcmp function:
+; int A_strcmp(const char * s1, const char * s2);
+; Tests if two strings are equal. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard function strcmp:
+; The alias ?OVR_strcmp is changed to _strcmp in the object file if
+; it is desired to override the standard library function strcmp.
+; Overriding is disabled because the function may read beyond the end of a 
+; string, while the standard strcmp function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386, and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global _A_strcmp: function             ; Function A_strcmp
+
+; Direct entries to CPU-specific versions
+global _strcmpGeneric: function            ; Generic version for processors without SSE4.2
+global _strcmpSSE42: function          ; Version for processors with SSE4.2
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+; strcmp function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcmp: function
+?OVR_strcmp:
+%endif
+
+_A_strcmp: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     dword [strcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP:                                    ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+        jmp     near [edx+strcmpDispatch-RP]
+
+%ENDIF
+
+align 16
+_strcmpSSE42:
+		mov     eax, [esp+4]           ; string 1
+		mov     edx, [esp+8]           ; string 2
+		push    ebx
+		mov     ebx, -16               ; offset counter
+
+compareloop:
+        add     ebx, 16                ; increment offset
+        movdqu  xmm1, [eax+ebx]        ; read 16 bytes of string 1
+        pcmpistri xmm1, [edx+ebx], 00011000B ; unsigned bytes, equal each, invert. returns index in ecx
+        jnbe    compareloop            ; loop while CF=0 (no difference) and ZF=0 (no terminator)
+        
+        jnc     equal
+notequal:                              ; strings are not equal
+        add     ecx, ebx               ; offset to first differing byte
+        movzx   eax, byte [eax+ecx]    ; compare bytes
+        movzx   edx, byte [edx+ecx]
+		sub     eax, edx
+		pop     ebx
+		ret
+        
+equal:
+        xor     eax, eax               ; strings are equal
+		pop     ebx
+		ret
+
+;_strcmpSSE42: endp
+
+
+align 16
+_strcmpGeneric:  ; generic version
+; This is a very simple solution. There is not much gained by using SSE2 or anything complicated
+		mov     ecx, [esp+4]          ; string 1
+		mov     edx, [esp+8]          ; string 2
+		
+_compareloop:
+        mov     al, [ecx]
+        cmp     al, [edx]
+        jne     _notequal
+        test    al, al
+        jz      _equal
+        inc     ecx
+        inc     edx
+        jmp     _compareloop        
+        
+_equal: xor     eax, eax               ; strings are equal
+		ret
+
+_notequal:                             ; strings are not equal
+        movzx   eax, byte [ecx]        ; compare first differing byte
+        movzx   edx, byte [edx]
+		sub     eax, edx
+		ret
+		
+;_strcmpGeneric end
+
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; CPU dispatching for strcmp. This is executed only once
+strcmpCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strcmp
+        mov     ecx, _strcmpGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcmp
+        mov     ecx, _strcmpSSE42
+Q100:   mov     [strcmpDispatch], ecx
+        ; Continue in appropriate version of strcmp
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP2:    ; reference point edx
+        ; Point to generic version of strcmp
+        lea     ecx, [edx+_strcmpGeneric-RP2]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcmp
+        lea     ecx, [edx+_strcmpSSE42-RP2]
+Q100:   mov     [edx+strcmpDispatch-RP2], ecx
+        ; Continue in appropriate version of strcmp
+        jmp     ecx
+%ENDIF
+
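+; The one-time dispatch above expressed in C (a sketch; the C-level names
+; are illustrative, only the assembly labels are real):
+;   static int dispatch(const char * a, const char * b);
+;   int (*strcmpDispatch)(const char *, const char *) = dispatch;
+;   static int dispatch(const char * a, const char * b) {  // runs once
+;       strcmpDispatch = InstructionSet() >= 10 ? strcmpSSE42
+;                                               : strcmpGeneric;
+;       return strcmpDispatch(a, b);
+;   }
+;   int A_strcmp(const char * a, const char * b) { return strcmpDispatch(a, b); }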
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strcmpDispatch DD strcmpCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
+
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names the wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+        dq      0, 0
diff --git a/asmlibSrc/strcmp64.asm b/asmlibSrc/strcmp64.asm
new file mode 100755
index 0000000..2dc7738
--- /dev/null
+++ b/asmlibSrc/strcmp64.asm
@@ -0,0 +1,162 @@
+;*************************  strcmp64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-14
+; Last modified:    2012-07-07
+
+; Description:
+; Faster version of the standard strcmp function:
+; int A_strcmp(const char * s1, const char * s2);
+; Tests if two strings are equal. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard function strcmp:
+; The alias ?OVR_strcmp is changed to _strcmp in the object file if
+; it is desired to override the standard library function strcmp.
+; Overriding is disabled because the function may read beyond the end of a 
+; string, while the standard strcmp function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386, and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global A_strcmp: function             ; Function A_strcmp
+
+; Direct entries to CPU-specific versions
+global strcmpGeneric: function            ; Generic version for processors without SSE4.2
+global strcmpSSE42: function          ; Version for processors with SSE4.2
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+; strcmp function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcmp: function
+?OVR_strcmp:
+%endif
+
+A_strcmp: ; function dispatching
+
+        jmp     near [strcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+strcmpSSE42:
+%ifdef  WINDOWS
+        push    rdi
+        mov     rdi, rcx
+%define rs1     rdi                    ; pointer to string 1
+%define rs2     rdx                    ; pointer to string 2
+%define par1    rcx
+%define par2    rdx
+%else   ; UNIX
+%define rs1     rdi
+%define rs2     rsi
+%define par1    rdi
+%define par2    rsi
+%endif
+
+        mov     rax, -16               ; offset counter
+compareloop:
+        add     rax, 16                ; increment offset
+        movdqu  xmm1, [rs1+rax]        ; read 16 bytes of string 1
+        pcmpistri xmm1, [rs2+rax], 00011000B ; unsigned bytes, equal each, invert. returns index in ecx
+        jnbe    compareloop            ; loop while CF=0 (no difference) and ZF=0 (no terminator)
+        
+        jnc     equal
+notequal:
+        ; strings are not equal
+        add     rcx, rax               ; offset to first differing byte
+        movzx   eax, byte [rs1+rcx]    ; compare first differing byte
+        movzx   edx, byte [rs2+rcx]
+		sub     rax, rdx
+%ifdef  WINDOWS
+        pop     rdi
+%endif
+		ret
+
+equal:
+        xor     eax, eax               ; strings are equal
+%ifdef  WINDOWS
+        pop     rdi
+%endif
+		ret
+
+;strcmpSSE42: endp
+
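+; A rough intrinsics equivalent of the PCMPISTRI loop above (sketch;
+; imm8 00011000B = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH |
+; _SIDD_NEGATIVE_POLARITY; needs <nmmintrin.h>):
+;   enum { mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
+;               | _SIDD_NEGATIVE_POLARITY };
+;   for (size_t i = 0; ; i += 16) {
+;       __m128i a = _mm_loadu_si128((const __m128i*)(s1 + i));
+;       __m128i b = _mm_loadu_si128((const __m128i*)(s2 + i));
+;       if (_mm_cmpistrc(a, b, mode)) {           // CF: difference found
+;           int k = _mm_cmpistri(a, b, mode);     // index of first difference
+;           return (unsigned char)s1[i+k] - (unsigned char)s2[i+k];
+;       }
+;       if (_mm_cmpistrz(a, b, mode)) return 0;   // ZF: terminator, equal
+;   }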
+
+align 16
+strcmpGeneric:  ; generic version
+; This is a very simple solution. There is not much gained by using SSE2 or anything complicated
+%ifdef  WINDOWS
+%define ss1     rcx                    ; pointer to string 1
+%define ss2     rdx                    ; pointer to string 2
+%else   ; UNIX
+%define ss1     rdi
+%define ss2     rsi
+%endif
+
+		
+_compareloop:
+        mov     al, [ss1]
+        cmp     al, [ss2]
+        jne     _notequal
+        test    al, al
+        jz      _equal
+        inc     ss1
+        inc     ss2
+        jmp     _compareloop        
+        
+_equal: xor     eax, eax               ; strings are equal
+		ret
+
+_notequal:                             ; strings are not equal
+        movzx   eax, byte [ss1]        ; compare first differing byte
+        movzx   edx, byte [ss2]
+		sub     eax, edx
+		ret
+		
+;strcmpGeneric end
+
+
+; CPU dispatching for strcmp. This is executed only once
+strcmpCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version of strcmp
+        lea     r9, [strcmpGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcmp
+        lea     r9, [strcmpSSE42]
+Q100:   mov     [strcmpDispatch], r9
+        ; Continue in appropriate version of strcmp
+        jmp     r9
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strcmpDispatch DQ strcmpCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names the wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+        dq      0, 0
diff --git a/asmlibSrc/strcountset32.asm b/asmlibSrc/strcountset32.asm
new file mode 100755
index 0000000..f9770ba
--- /dev/null
+++ b/asmlibSrc/strcountset32.asm
@@ -0,0 +1,194 @@
+;*************************  strcountinset32.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2011-08-21
+
+; Description:
+; size_t strCountInSet(const char * str, const char * set);
+;
+; Counts how many characters in str that belong to the set defined by set.
+; Both strings are zero-terminated ASCII strings.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _strCountInSet: function
+
+; Direct entries to CPU-specific versions
+global _strCountInSetGeneric: function
+global _strCountInSetSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               strCountInSet function
+;******************************************************************************
+
+_strCountInSet: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [strCountInSetDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+        jmp     near [edx+strCountInSetDispatch-RP1]
+
+%ENDIF
+
+
+align 16
+_strCountInSetSSE42: ; SSE4.2 version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str
+        mov     edi, [esp+16]          ; set
+        xor     eax, eax               ; match counter
+str_next:
+        movdqu  xmm2, [esi]            ; str
+        movdqu  xmm1, [edi]            ; set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    ecx, xmm0
+        jns     set_extends            ; the set extends beyond 16 bytes
+        jz      str_finished
+set_finished:
+        popcnt  ecx, ecx
+        add     eax, ecx        
+        ; first 16 characters checked, continue with next 16 characters (a terminating zero would never match)
+        add     esi, 16                ; next 16 bytes of str
+        jmp     str_next
+
+set_and_str_finished:        
+        or      ecx, edx               ; accumulate matches
+str_finished:
+        popcnt  ecx, ecx
+        add     eax, ecx        
+        pop     edi
+        pop     esi
+        ret
+
+set_loop:
+        or      ecx, edx               ; accumulate matches
+set_extends:
+        add     edi, 16
+        movdqu  xmm1, [edi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop
+        jz      set_and_str_finished
+        mov     edi, [esp+16]          ; restore set pointer
+        or      ecx, edx               ; accumulate matches
+        jmp     set_finished
+        
+;_strCountInSetSSE42 end
+
+;******************************************************************************
+;                               strCountInSet function generic
+;******************************************************************************
+
+align 8
+_strCountInSetGeneric: ; Generic version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str pointer
+        mov     edi, [esp+16]          ; set pointer
+        xor     eax, eax               ; match counter
+str_next10:
+        mov     cl, [esi]              ; read one byte from str
+        test    cl, cl
+        jz      str_finished10         ; str finished
+set_next10:
+        mov     dl, [edi]
+        test    dl, dl
+        jz      set_finished10
+        inc     edi                    ; next in set
+        cmp     cl, dl
+        jne     set_next10
+        ; character match found, goto next character
+        inc     eax                    ; count match
+        inc     esi
+        jmp     str_next10
+
+set_finished10: ; end of set, no match found
+        mov     edi, [esp+16]          ; restore set pointer
+        inc     esi
+        jmp     str_next10             ; next in string
+
+str_finished10: ; end of str, count is in eax
+        pop     edi
+        pop     esi
+        ret
+;_strCountInSetGeneric end
+
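+; The generic algorithm above as plain C (sketch; needs <stddef.h>):
+;   size_t strCountInSet(const char * str, const char * set) {
+;       size_t n = 0;
+;       for (; *str; str++) {
+;           for (const char * p = set; *p; p++) {
+;               if (*str == *p) { n++; break; }
+;           }
+;       }
+;       return n;
+;   }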
+
+; ********************************************************************************
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strCountInSet. This is executed only once
+; ********************************************************************************
+
+strCountInSetCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strCountInSet
+        mov     ecx, _strCountInSetGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strCountInSet
+        mov     ecx, _strCountInSetSSE42
+Q100:   mov     [strCountInSetDispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP11:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_strCountInSetGeneric-RP11]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strCountInSet
+        lea     ecx, [edx+_strCountInSetSSE42-RP11]
+Q100:   mov     [edx+strCountInSetDispatch-RP11], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strCountInSetDispatch  DD strCountInSetCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcountset64.asm b/asmlibSrc/strcountset64.asm
new file mode 100755
index 0000000..909987c
--- /dev/null
+++ b/asmlibSrc/strcountset64.asm
@@ -0,0 +1,175 @@
+;*************************  strcountinset64.asm  *********************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2011-07-20
+
+; Description:
+; size_t strCountInSet(const char * str, const char * set);
+;
+; Counts how many characters in str that belong to the set defined by set.
+; Both strings are zero-terminated ASCII strings.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global strCountInSet: function
+
+; Direct entries to CPU-specific versions
+global strCountInSetGeneric: function
+global strCountInSetSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               strCountInSet function
+;******************************************************************************
+%ifdef  WINDOWS
+%define par1    rcx
+%define par2    rdx
+%else
+%define par1    rdi
+%define par2    rsi
+%endif
+
+
+strCountInSet: ; function dispatching
+        jmp     near [strCountInSetDispatch] ; Go to appropriate version, depending on instruction set
+
+
+align 16
+strCountInSetSSE42: ; SSE4.2 version
+%ifdef  WINDOWS
+        push    rsi
+        push    rdi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        xor     eax, eax               ; match counter
+str_next:
+        movdqu  xmm2, [rdi]            ; str
+        movdqu  xmm1, [rsi]            ; set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    ecx, xmm0
+        jns     set_extends            ; the set extends beyond 16 bytes
+        jz      str_finished
+set_finished:
+        popcnt  ecx, ecx
+        add     rax, rcx        
+        ; first 16 characters checked, continue with next 16 characters (a terminating zero would never match)
+        add     rdi, 16                ; next 16 bytes of str
+        jmp     str_next
+
+set_and_str_finished:        
+        or      ecx, edx               ; accumulate matches
+str_finished:
+        popcnt  ecx, ecx
+        add     rax, rcx
+%ifdef  WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        ret
+
+set_loop:
+        or      ecx, edx               ; accumulate matches
+set_extends:
+        add     rsi, 16
+        movdqu  xmm1, [rsi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop
+        jz      set_and_str_finished
+        mov     rsi, r8                ; restore set pointer
+        or      ecx, edx               ; accumulate matches
+        jmp     set_finished
+        
+;strCountInSetSSE42 end
+
+;******************************************************************************
+;                               strCountInSet function generic
+;******************************************************************************
+
+align 8
+strCountInSetGeneric: ; Generic version
+%ifdef  WINDOWS
+        push    rsi
+        push    rdi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        xor     eax, eax               ; match counter
+str_next10:
+        mov     cl, [rdi]              ; read one byte from str
+        test    cl, cl
+        jz      str_finished10         ; str finished
+set_next10:
+        mov     dl, [rsi]
+        test    dl, dl
+        jz      set_finished10
+        inc     rsi                    ; next in set
+        cmp     cl, dl
+        jne     set_next10
+        ; character match found, goto next character
+        inc     rax                    ; count match
+        inc     rdi
+        jmp     str_next10
+
+set_finished10: ; end of set, no match found
+        mov     rsi, r8                ; restore set pointer
+        inc     rdi
+        jmp     str_next10             ; next in string
+
+str_finished10: ; end of str, count is in eax
+%ifdef  WINDOWS
+        pop     rdi
+        pop     rsi
+%endif
+        ret
+;strCountInSetGeneric end
+
+
+
+; ********************************************************************************
+; CPU dispatching for strCountInSet. This is executed only once
+; ********************************************************************************
+
+strCountInSetCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version of strCountInSet
+        lea     r8, [strCountInSetGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strCountInSet
+        lea     r8, [strCountInSetSSE42]
+Q100:   mov     [strCountInSetDispatch], r8
+        ; Continue in appropriate version 
+        jmp     r8
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strCountInSetDispatch  DQ strCountInSetCPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcountutf832.asm b/asmlibSrc/strcountutf832.asm
new file mode 100755
index 0000000..46910b8
--- /dev/null
+++ b/asmlibSrc/strcountutf832.asm
@@ -0,0 +1,162 @@
+;*************************  strcountutf832.asm  ***********************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2013-09-11
+
+; Description:
+; size_t strcount_UTF8(const char * str);
+; Counts the number of characters in a UTF-8 encoded string.
+;
+; This function does not check if the string contains valid UTF-8 code; it
+; simply counts all bytes except continuation bytes 10xxxxxxB.
+;
+; Note that this function may read up to 15 bytes beyond the end of the string.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _strcount_UTF8: function
+
+; Direct entries to CPU-specific versions
+global _strcount_UTF8Generic: function
+global _strcount_UTF8SSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .data
+align  16
+byterange: times 8  DB 10000000b, 10111111b ; range for UTF-8 continuation bytes
+
+section .text
+
+;******************************************************************************
+;                               strcount_UTF8 function
+;******************************************************************************
+
+
+_strcount_UTF8: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [strcount_UTF8Dispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+        jmp     near [edx+strcount_UTF8Dispatch-RP1]
+
+%ENDIF
+
+;******************************************************************************
+;                        strcount_UTF8 function SSE4.2 version
+;******************************************************************************
+align 16
+_strcount_UTF8SSE42: ; SSE4.2 version
+        mov     edx,  [esp+4]          ; str
+        movdqa  xmm1, [byterange]      ; define range of continuation bytes to ignore
+        xor     ecx, ecx               ; character counter
+str_next:
+        pcmpistrm xmm1, [edx], 00110100b; check range, invert valid bits, return bit mask in xmm0
+        movd    eax, xmm0
+        jz      str_finished           ; terminating zero found
+        popcnt  eax, eax               ; count
+        add     ecx, eax
+        add     edx, 16
+        jmp     str_next
+
+str_finished:
+        popcnt  eax, eax
+        add     eax, ecx
+        ret
+
+
+;******************************************************************************
+;                        strcount_UTF8 function generic
+;******************************************************************************
+
+align 8
+_strcount_UTF8Generic:
+        mov     edx,  [esp+4]          ; str
+        xor     eax, eax               ; character counter
+        xor     ecx, ecx               ; zero extend cl
+str_next1:
+        mov     cl, [edx]              ; one byte from the string
+        test    cl, cl
+        jz      str_finished1          ; terminating zero
+        sub     cl, 10000000b          ; lower limit of continuation bytes
+        cmp     cl, 00111111b          ; upper limit - lower limit
+        seta    cl                     ; 1 if outside limit (unsigned compare includes negative values as above)
+        add     eax, ecx
+        inc     edx
+        jmp     str_next1
+        
+str_finished1:
+        ret
+;_strcount_UTF8Generic end
+
+
+; ********************************************************************************
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strcount_UTF8. This is executed only once
+; ********************************************************************************
+
+strcount_UTF8CPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strcount_UTF8
+        mov     ecx, _strcount_UTF8Generic
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcount_UTF8
+        mov     ecx, _strcount_UTF8SSE42
+Q100:   mov     [strcount_UTF8Dispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP11:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_strcount_UTF8Generic-RP11]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcount_UTF8
+        lea     ecx, [edx+_strcount_UTF8SSE42-RP11]
+Q100:   mov     [edx+strcount_UTF8Dispatch-RP11], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strcount_UTF8Dispatch  DD strcount_UTF8CPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcountutf864.asm b/asmlibSrc/strcountutf864.asm
new file mode 100755
index 0000000..b155e57
--- /dev/null
+++ b/asmlibSrc/strcountutf864.asm
@@ -0,0 +1,127 @@
+;*************************  strcountutf864.asm  ***********************************
+; Author:           Agner Fog
+; Date created:     2011-07-20
+; Last modified:    2013-09-11
+
+; Description:
+; size_t strcount_UTF8(const char * str);
+; Counts the number of characters in a UTF-8 encoded string.
+;
+; This function does not check whether the string contains valid UTF-8 code; it
+; simply counts all bytes except continuation bytes 10xxxxxxB.
+;
+; Note that this function may read up to 15 bytes beyond the end of the string.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for SSE2 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global strcount_UTF8: function
+
+; Direct entries to CPU-specific versions
+global strcount_UTF8Generic: function
+global strcount_UTF8SSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .data
+align  16
+byterange: times 8  DB 10000000b, 10111111b ; range for UTF-8 continuation bytes
+
+section .text
+
+;******************************************************************************
+;                               strcount_UTF8 function
+;******************************************************************************
+
+
+strcount_UTF8: ; function dispatching
+
+        jmp     near [strcount_UTF8Dispatch] ; Go to appropriate version, depending on instruction set
+
+
+;******************************************************************************
+;                        strcount_UTF8 function SSE4.2 version
+;******************************************************************************
+
+%ifdef  WINDOWS
+%define  par1  rcx
+%else
+%define  par1  rdi
+%endif
+
+align 16
+strcount_UTF8SSE42: ; SSE4.2 version
+        movdqa  xmm1, [byterange]      ; define range of continuation bytes to ignore
+        xor     edx, edx               ; character counter
+str_next:
+        pcmpistrm xmm1, [par1], 00110100b; check range, invert valid bits, return bit mask in xmm0
+        movd    eax, xmm0
+        jz      str_finished           ; terminating zero found
+        popcnt  eax, eax               ; count
+        add     rdx, rax
+        add     par1, 16
+        jmp     str_next
+
+str_finished:
+        popcnt  eax, eax
+        add     rax, rdx
+        ret
+
+
+;******************************************************************************
+;                        strcount_UTF8 function generic
+;******************************************************************************
+
+align 8
+strcount_UTF8Generic:
+        xor     eax, eax               ; character counter
+        xor     edx, edx               ; zero extend dl
+str_next1:
+        mov     dl, [par1]             ; one byte from the string
+        test    dl, dl
+        jz      str_finished1          ; terminating zero
+        sub     dl, 10000000b          ; lower limit of continuation bytes
+        cmp     dl, 00111111b          ; upper limit - lower limit
+        seta    dl                     ; 1 if outside limit (unsigned compare includes negative values as above)
+        add     rax, rdx
+        inc     par1
+        jmp     str_next1
+        
+str_finished1:
+        ret
+;strcount_UTF8Generic end
+
+
+; ********************************************************************************
+; CPU dispatching for strcount_UTF8. This is executed only once
+; ********************************************************************************
+
+strcount_UTF8CPUDispatch:
+        ; get supported instruction set
+        push    par1
+        call    InstructionSet
+        pop     par1
+        ; Point to generic version of strcount_UTF8
+        lea     rdx, [strcount_UTF8Generic]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcount_UTF8
+        lea     rdx, [strcount_UTF8SSE42]
+Q100:   mov     [strcount_UTF8Dispatch], rdx
+        ; Continue in appropriate version 
+        jmp     rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strcount_UTF8Dispatch  DQ strcount_UTF8CPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcpy32.asm b/asmlibSrc/strcpy32.asm
new file mode 100755
index 0000000..0062114
--- /dev/null
+++ b/asmlibSrc/strcpy32.asm
@@ -0,0 +1,53 @@
+;*************************  strcpy32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_strcpy: function                 ; Function _A_strcpy
+global ?OVR_strcpy: function               ; ?OVR removed if standard function strcpy overridden
+
+; Imported from strlen32.asm
+extern _A_strlen
+
+; Imported from memcpy32.asm
+extern _A_memcpy
+
+
+SECTION .text  align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+;    return memcpy(dest, src, strlen(src)+1);
+; }
+
+; Function entry:
+_A_strcpy:
+?OVR_strcpy:
+
+        mov     eax, [esp+8]           ; src
+        push    eax
+        call    _A_strlen              ; length of src
+        pop     ecx                    ; ecx = src. Assume unchanged by _A_strlen
+        inc     eax                    ; include terminating zero in length
+        mov     edx, [esp+4]           ; dest
+        push    eax                    ; length+1
+        push    ecx                    ; src
+        push    edx                    ; dest
+        call    _A_memcpy              ; copy
+        add     esp, 12                ; clean up stack
+        ret
+
+;_A_strcpy ENDP
diff --git a/asmlibSrc/strcpy64.asm b/asmlibSrc/strcpy64.asm
new file mode 100755
index 0000000..f7a6836
--- /dev/null
+++ b/asmlibSrc/strcpy64.asm
@@ -0,0 +1,64 @@
+;*************************  strcpy64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy. These functions allow
+; calling without proper stack alignment.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcpy: function                 ; Function A_strcpy
+global ?OVR_strcpy: function              ; ?OVR removed if standard function strcpy overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text  align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+;    return memcpy(dest, src, strlen(src)+1);
+; }
+
+; Function entry:
+A_strcpy:
+?OVR_strcpy:
+
+%IFDEF  WINDOWS
+%define Rpar1   rcx                    ; function parameter 1
+%define Rpar2   rdx                    ; function parameter 2
+%define Rpar3   r8                     ; function parameter 3
+%ENDIF
+%IFDEF  UNIX
+%define Rpar1   rdi                    ; function parameter 1
+%define Rpar2   rsi                    ; function parameter 2
+%define Rpar3   rdx                    ; function parameter 3
+%ENDIF
+
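+        ; dest and src are saved across the A_strlen call, and the final
+        ; jmp is a tail call: A_memcpy returns dest (the standard memcpy
+        ; contract), which is also A_strcpy's return value.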
+        push    Rpar1                  ; dest
+        push    Rpar2                  ; src
+        mov     Rpar1, Rpar2
+        ; (A_strlen does not require stack alignment)
+        call    A_strlen               ; length of src
+        lea     Rpar3,[rax+1]          ; include terminating zero in length
+        pop     Rpar2                  ; src
+        pop     Rpar1                  ; dest
+        jmp     A_memcpy               ; copy and return
+
+;A_strcpy ENDP
diff --git a/asmlibSrc/stricmp32.asm b/asmlibSrc/stricmp32.asm
new file mode 100755
index 0000000..2050b24
--- /dev/null
+++ b/asmlibSrc/stricmp32.asm
@@ -0,0 +1,70 @@
+;*************************  stricmp32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2008-12-05
+; Last modified:    2011-07-01
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. A-Z are changed
+; to a-z before comparing, while other upper-case letters are not
+; converted but considered unique.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
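+
+; A C sketch of the comparison rule (illustrative only; fold is a made-up
+; helper name): only A-Z are folded to lower case, all other bytes compare
+; verbatim:
+;
+;   static int fold(unsigned char c) {
+;       return (c >= 'A' && c <= 'Z') ? c + 0x20 : c;
+;   }
+;   int A_stricmp(const char * s1, const char * s2) {
+;       for (;; s1++, s2++) {
+;           int d = fold(*s1) - fold(*s2);
+;           if (d != 0 || *s1 == 0) return d;   // difference or end of both
+;       }
+;   }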
+
+global _A_stricmp: function                     ; Function _A_stricmp
+
+SECTION .text  align=16
+
+; extern "C" int stricmp_az(const char *string1, const char *string2);
+
+_A_stricmp:
+        mov     ecx, [esp+4]           ; string1
+        mov     edx, [esp+8]           ; string2
+        sub     edx, ecx
+        
+L10:    mov     al,  [ecx]
+        cmp     al,  [ecx+edx]
+        jne     L20
+        inc     ecx
+        test    al, al
+        jnz     L10                    ; continue with next byte
+        
+        ; terminating zero found. Strings are equal
+        xor     eax, eax
+        ret        
+        
+L20:    ; bytes are different. check case
+        xor     al, 20H                ; toggle case
+        cmp     al, [ecx+edx]
+        jne     L30
+        ; possibly differing only by case. Check if a-z
+        or      al, 20H                ; upper case
+        sub     al, 'a'
+        cmp     al, 'z'-'a'
+        ja      L30                    ; not a-z
+        ; a-z and differing only by case
+        inc     ecx
+        jmp     L10                    ; continue with next byte
+
+L30:    ; bytes are different, even after changing case
+        movzx   eax, byte [ecx]        ; get original value again
+        sub     eax, 'A'
+        cmp     eax, 'Z' - 'A'
+        ja      L40
+        add     eax, 20H
+L40:    movzx   edx, byte [ecx+edx]
+        sub     edx, 'A'
+        cmp     edx, 'Z' - 'A'
+        ja      L50
+        add     edx, 20H
+L50:    sub     eax, edx                 ; subtract to get result
+        ret
+
+;_A_stricmp END
diff --git a/asmlibSrc/stricmp64.asm b/asmlibSrc/stricmp64.asm
new file mode 100755
index 0000000..ef8d152
--- /dev/null
+++ b/asmlibSrc/stricmp64.asm
@@ -0,0 +1,84 @@
+;*************************  stricmp64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2008-12-05
+; Last modified:    2011-07-01
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. A-Z are changed
+; to a-z before comparing, while other upper-case letters are not
+; converted and are treated as distinct characters.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+default rel
+
+global A_stricmp: function                     ; Function A_stricmp
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+ 
+%IFDEF WINDOWS
+  %define par1   rcx                   ; first parameter
+  %define par2   rdx                   ; second parameter
+%ENDIF
+  
+%IFDEF UNIX
+  %define par1   rdi                   ; first parameter
+  %define par2   rsi                   ; second parameter
+%ENDIF
+
+SECTION .text  align=16
+
+; extern "C" int A_stricmp(const char *string1, const char *string2);
+
+A_stricmp:
+        sub     par2, par1
+        
+L10:    mov     al,  [par1]            ; string1
+        cmp     al,  [par1+par2]       ; string2
+        jne     L20
+        inc     par1
+        test    al, al
+        jnz     L10                    ; continue with next byte
+        
+        ; terminating zero found. Strings are equal
+        xor     eax, eax
+        ret        
+        
+L20:    ; bytes are different. check case
+        xor     al, 20H                ; toggle case
+        cmp     al, [par1+par2]
+        jne     L30
+        ; possibly differing only by case. Check if a-z
+        or      al, 20H                ; upper case
+        sub     al, 'a'
+        cmp     al, 'z'-'a'
+        ja      L30                    ; not a-z
+        ; a-z and differing only by case
+        inc     par1
+        jmp     L10                    ; continue with next byte
+
+L30:    ; bytes are different, even after changing case
+        movzx   eax, byte [par1]       ; get original value again
+        sub     eax, 'A'
+        cmp     eax, 'Z' - 'A'
+        ja      L40
+        add     eax, 20H               ; A-Z, make lower case
+L40:    movzx   edx, byte [par1+par2]
+        sub     edx, 'A'
+        cmp     edx, 'Z' - 'A'
+        ja      L50
+        add     edx, 20H                ; A-Z, make lower case
+L50:    sub     eax, edx                ; subtract to get result
+        ret
+
+;A_stricmp END
diff --git a/asmlibSrc/strlen32.asm b/asmlibSrc/strlen32.asm
new file mode 100755
index 0000000..7083590
--- /dev/null
+++ b/asmlibSrc/strlen32.asm
@@ -0,0 +1,182 @@
+;**************************  strlen32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; Internal calls: The parameter on the stack is left unchanged for the sake
+; of calls from strcpy and strcat.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest 16-byte boundary
+; and the irrelevant part masked out. It may read both before the beginning
+; of the string and after the end, but it will never load any unnecessary
+; cache line and never trigger a page fault for reading from non-existent
+; memory pages, because it never reads past the nearest following 16-byte
+; boundary. It may, though, trigger a debug watch within the same 16-byte
+; boundary.
+; CPU dispatching included for 386 and SSE2 instruction sets.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
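+
+; For reference, a C sketch of the same technique using SSE2 intrinsics
+; (illustrative only; strlen_sse2 is a made-up name, and __builtin_ctz
+; stands in for the bsf instruction, assuming GCC or Clang):
+;
+;   #include <emmintrin.h>
+;   #include <stdint.h>
+;   size_t strlen_sse2(const char * s) {
+;       const char * p = (const char *)((uintptr_t)s & ~(uintptr_t)15);
+;       unsigned int misalign = (unsigned int)((uintptr_t)s & 15);
+;       unsigned int mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(
+;           _mm_load_si128((const __m128i *)p), _mm_setzero_si128()));
+;       mask = (mask >> misalign) << misalign;  // discard bits before s
+;       while (mask == 0) {                     // no zero byte found yet
+;           p += 16;
+;           mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(
+;               _mm_load_si128((const __m128i *)p), _mm_setzero_si128()));
+;       }
+;       return (size_t)((p - s) + __builtin_ctz(mask));
+;   }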
+
+global _A_strlen: function             ; Function _A_strlen
+global ?OVR_strlen: function           ; ?OVR removed if standard function strlen overridden
+
+; Imported from instrset32.asm
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+
+SECTION .text  align=16
+
+; extern "C" int strlen (const char * s);
+_A_strlen:
+?OVR_strlen:
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     [strlenDispatch]       ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_eax          ; get reference point for position-independent code
+RP:                                    ; reference point eax = offset RP
+A020:                                  ; Go here after CPU dispatching
+
+        ; Make the following instruction with address relative to RP:
+        cmp     dword [eax-RP+strlenCPUVersion], 1
+        jb      strlenCPUDispatch      ; First time: strlenCPUVersion = 0, go to dispatcher
+        je      strlen386              ; strlenCPUVersion = 1, go to 80386 version
+%ENDIF
+
+; SSE2 version
+strlenSSE2:
+        mov      eax,  [esp+4]         ; get pointer to string
+        mov      ecx,  eax             ; copy pointer
+        pxor     xmm0, xmm0            ; set to zero
+        and      ecx,  0FH             ; lower 4 bits indicate misalignment
+        and      eax,  -10H            ; align pointer by 16
+        movdqa   xmm1, [eax]           ; read from nearest preceding boundary
+        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
+        pmovmskb edx,  xmm1            ; get one bit for each byte result
+        shr      edx,  cl              ; shift out false bits
+        shl      edx,  cl              ; shift back again
+        bsf      edx,  edx             ; find first 1-bit
+        jnz      A200                  ; found
+        
+        ; Main loop, search 16 bytes at a time
+A100:   add      eax,  10H             ; increment pointer by 16
+        movdqa   xmm1, [eax]           ; read 16 bytes aligned
+        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
+        pmovmskb edx,  xmm1            ; get one bit for each byte result
+        bsf      edx,  edx             ; find first 1-bit
+        ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
+        ;  but we are assuming that most strings are short, and newer processors are given higher priority)
+        jz       A100                  ; loop if not found
+        
+A200:   ; Zero-byte found. Compute string length        
+        sub      eax,  [esp+4]         ; subtract start address
+        add      eax,  edx             ; add byte index
+        ret
+
+strlen386: ; 80386 version
+        push    ebx
+        mov     ecx, [esp+8]           ; get pointer to string
+        mov     eax, ecx               ; copy pointer
+        and     ecx, 3                 ; lower 2 bits of address, check alignment
+        jz      L2                     ; string is aligned by 4. Go to loop
+        and     eax, -4                ; align pointer by 4
+        mov     ebx, [eax]             ; read from nearest preceding boundary
+        shl     ecx, 3                 ; mul by 8 = displacement in bits
+        mov     edx, -1
+        shl     edx, cl                ; make byte mask
+        not     edx                    ; mask = 0FFH for false bytes
+        or      ebx, edx               ; mask out false bytes
+
+        ; check first four bytes for zero
+        lea     ecx, [ebx-01010101H]   ; subtract 1 from each byte
+        not     ebx                    ; invert all bytes
+        and     ecx, ebx               ; and these two
+        and     ecx, 80808080H         ; test all sign bits
+        jnz     L3                     ; zero-byte found
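+        ; (Zero-byte trick: for each byte b, (b-1) & ~b has its top bit set
+        ;  only when b = 0. A borrow out of a zero byte can set spurious top
+        ;  bits in higher bytes only, so the bsf at L3 still finds the first
+        ;  true zero byte.)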
+        
+        ; Main loop, read 4 bytes aligned
+L1:     add     eax, 4                 ; increment pointer by 4
+L2:     mov     ebx, [eax]             ; read 4 bytes of string
+        lea     ecx, [ebx-01010101H]   ; subtract 1 from each byte
+        not     ebx                    ; invert all bytes
+        and     ecx, ebx               ; and these two
+        and     ecx, 80808080H         ; test all sign bits
+        jz      L1                     ; no zero bytes, continue loop
+        
+L3:     bsf     ecx, ecx               ; find right-most 1-bit
+        shr     ecx, 3                 ; divide by 8 = byte index
+        sub     eax, [esp+8]           ; subtract start address
+        add     eax, ecx               ; add index to byte
+        pop     ebx
+        ret
+        
+        
+; CPU dispatching for strlen. This is executed only once
+strlenCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        pushad
+        call    _InstructionSet
+        ; Point to generic version of strlen
+        mov     dword [strlenDispatch], strlen386
+        cmp     eax, 4                 ; check SSE2
+        jb      M100
+        ; SSE2 supported
+        ; Point to SSE2 version of strlen
+        mov     dword [strlenDispatch], strlenSSE2
+M100:   popad
+        ; Continue in appropriate version of strlen
+        jmp     dword [strlenDispatch]
+
+%ELSE   ; Position-independent version
+        pushad
+        
+        ; Make the following instruction with address relative to RP:
+        lea     ebx, [eax-RP+strlenCPUVersion]
+        ; Now ebx points to strlenCPUVersion.
+
+        call    _InstructionSet        
+
+        mov     byte [ebx], 1          ; Indicate generic version
+        cmp     eax, 4                 ; check SSE2
+        jb      M100
+        ; SSE2 supported
+        mov     byte [ebx], 2          ; Indicate SSE2 or later version
+M100:   popad
+        jmp     A020                   ; Go back and dispatch
+        
+get_thunk_eax: ; load caller address into eax for position-independent code
+        mov eax, [esp]
+        ret       
+        
+%ENDIF        
+        
+SECTION .data
+align 16
+%IFNDEF POSITIONINDEPENDENT
+; Pointer to appropriate version.
+; This initially points to strlenCPUDispatch. strlenCPUDispatch will
+; change this to the appropriate version of strlen, so that
+; strlenCPUDispatch is only executed once:
+strlenDispatch: DD strlenCPUDispatch
+%ELSE    ; position-independent
+; CPU version: 0=unknown, 1=80386, 2=SSE2
+strlenCPUVersion: DD 0
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
diff --git a/asmlibSrc/strlen64.asm b/asmlibSrc/strlen64.asm
new file mode 100755
index 0000000..005fafd
--- /dev/null
+++ b/asmlibSrc/strlen64.asm
@@ -0,0 +1,84 @@
+;**************************  strlen64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2008-07-19
+; Last modified:    2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Calling conventions: 
+; Stack alignment is not required. No shadow space or red zone used.
+; Called internally from strcpy and strcat without stack aligned.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest 16-byte boundary
+; and the irrelevant part masked out. It may read both before the beginning
+; of the string and after the end, but it will never load any unnecessary
+; cache line and never trigger a page fault for reading from non-existent
+; memory pages, because it never reads past the nearest following 16-byte
+; boundary. It may, though, trigger a debug watch within the same 16-byte
+; boundary.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strlen: function              ; Function A_strlen
+global ?OVR_strlen: function           ; ?OVR removed if standard function strlen overridden
+
+
+SECTION .text  align=16
+
+; extern "C" int strlen (const char * s);
+
+; 64-bit Windows version:
+A_strlen:
+?OVR_strlen:
+
+%IFDEF  WINDOWS
+        mov      rax,  rcx             ; get pointer to string from rcx
+        mov      r8,   rcx             ; copy pointer
+%define Rscopy   r8                    ; Copy of s
+
+%ELSE   ; Unix
+        mov      rax,  rdi             ; get pointer to string from rdi
+        mov      ecx,  edi             ; copy pointer (lower 32 bits)
+%define Rscopy   rdi                   ; Copy of s
+%ENDIF
+        
+        ; rax = s, ecx = 32 bits of s
+        pxor     xmm0, xmm0            ; set to zero
+        and      ecx,  0FH             ; lower 4 bits indicate misalignment
+        and      rax,  -10H            ; align pointer by 16
+        movdqa   xmm1, [rax]           ; read from nearest preceding boundary
+        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
+        pmovmskb edx,  xmm1            ; get one bit for each byte result
+        shr      edx,  cl              ; shift out false bits
+        shl      edx,  cl              ; shift back again
+        bsf      edx,  edx             ; find first 1-bit
+        jnz      L2                    ; found
+        
+        ; Main loop, search 16 bytes at a time
+L1:     add      rax,  10H             ; increment pointer by 16
+        movdqa   xmm1, [rax]           ; read 16 bytes aligned
+        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
+        pmovmskb edx,  xmm1            ; get one bit for each byte result
+        bsf      edx,  edx             ; find first 1-bit
+        ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
+        ;  but we are assuming that most strings are short, and newer processors are given higher priority)
+        jz       L1                    ; loop if not found
+        
+L2:     ; Zero-byte found. Compute string length        
+        sub      rax,  Rscopy          ; subtract start address
+        add      rax,  rdx             ; add byte index
+        ret
+        
+;A_strlen ENDP
diff --git a/asmlibSrc/strspn32.asm b/asmlibSrc/strspn32.asm
new file mode 100755
index 0000000..eeb4b89
--- /dev/null
+++ b/asmlibSrc/strspn32.asm
@@ -0,0 +1,338 @@
+;*************************  strspn32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-19
+; Last modified:    2011-08-21
+
+; Description:
+; Faster version of the standard strspn and strcspn functions:
+; size_t A_strspn (const char * str, const char * set);
+; size_t A_strcspn(const char * str, const char * set);
+;
+; A_strspn finds the length of the initial portion of str which consists only of
+; characters that are part of set. 
+; A_strcspn finds the length of the initial portion of str which consists only of
+; characters that are not part of set. 
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard functions strspn and strcspn:
+; Overriding is disabled because the functions may read beyond the end of a string, 
+; while the standard strspn and strcspn functions are guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
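+
+; A C sketch of the two functions (illustrative only):
+;
+;   #include <string.h>
+;   size_t A_strspn(const char * str, const char * set) {
+;       size_t n = 0;
+;       while (str[n] != 0 && strchr(set, str[n]) != 0) n++;
+;       return n;   // length of initial span of characters in set
+;   }
+;   size_t A_strcspn(const char * str, const char * set) {
+;       size_t n = 0;
+;       while (str[n] != 0 && strchr(set, str[n]) == 0) n++;
+;       return n;   // length of initial span of characters not in set
+;   }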
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global _A_strspn: function
+global _A_strcspn: function
+
+; Direct entries to CPU-specific versions
+global _strspnGeneric: function
+global _strcspnGeneric: function
+global _strspnSSE42: function
+global _strcspnSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               strspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strspn: function
+?OVR_strspn:
+%endif
+
+_A_strspn: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [strspnDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+        jmp     near [edx+strspnDispatch-RP1]
+
+%ENDIF
+
+align 16
+_strspnSSE42: ; SSE4.2 version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str
+        mov     edi, [esp+16]          ; set
+        xor     ecx, ecx               ; span counter
+str_next:
+        movdqu  xmm2, [esi]            ; str
+        movdqu  xmm1, [edi]            ; set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    eax, xmm0
+        jns     set_extends
+set_finished:
+        cmp     ax, -1
+        jne     str_finished
+        ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+        add     esi, 16                ; next 16 bytes of str
+        add     ecx, 16                ; count span
+        jmp     str_next
+
+str_finished:
+        not     eax
+        bsf     eax, eax
+        add     eax, ecx
+        pop     edi
+        pop     esi
+        ret
+
+set_loop:
+        or      eax, edx               ; accumulate matches
+set_extends: ; the set is more than 16 bytes
+        add     edi, 16
+        movdqu  xmm1, [edi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop
+        mov     edi, [esp+16]          ; restore set pointer
+        or      eax, edx               ; accumulate matches
+        jmp     set_finished
+
+
+;******************************************************************************
+;                       strcspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcspn: function
+?OVR_strcspn:
+%endif
+
+_A_strcspn: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [strcspnDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP2:                                   ; reference point edx = offset RP2
+
+; Make the following instruction with address relative to RP2:
+        jmp     near [edx+strcspnDispatch-RP2]
+
+%ENDIF
+
+align 16
+_strcspnSSE42: ; SSE4.2 version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str
+        mov     edi, [esp+16]          ; set
+        xor     ecx, ecx               ; span counter
+str_next2:
+        movdqu  xmm2, [esi]            ; str
+        movdqu  xmm1, [edi]            ; set
+        pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+        movd    eax, xmm0
+        jns     set_extends2
+set_finished2:
+        cmp     ax, -1
+        jne     str_finished2
+        ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+        add     esi, 16                ; next 16 bytes of str
+        add     ecx, 16                ; count span
+        jmp     str_next2
+
+str_finished2:
+        not     eax
+        bsf     eax, eax
+        add     eax, ecx
+        pop     edi
+        pop     esi
+        ret
+
+set_loop2:
+        and     eax, edx               ; accumulate matches
+set_extends2: ; the set is more than 16 bytes
+        add     edi, 16
+        movdqu  xmm1, [edi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop2
+        mov     edi, [esp+16]          ; restore set pointer
+        and     eax, edx               ; accumulate matches
+        jmp     set_finished2
+
+
+;******************************************************************************
+;                               strspn function generic
+;******************************************************************************
+
+align 8
+_strspnGeneric: ; Generic version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str pointer
+str_next10:
+        mov     edi, [esp+16]          ; set pointer
+        mov     al, [esi]              ; read one byte from str
+        test    al, al
+        jz      str_finished10         ; str finished
+set_next10:
+        mov     dl, [edi]
+        test    dl, dl
+        jz      set_finished10
+        inc     edi
+        cmp     al, dl
+        jne     set_next10
+        ; character match found, goto next character
+        inc     esi
+        jmp     str_next10
+
+str_finished10: ; end of str, all match
+set_finished10: ; end of set, mismatch found
+        sub     esi, [esp+12]          ; calculate position
+        mov     eax, esi
+        pop     edi
+        pop     esi
+        ret
+;_strspnGeneric end
+
+align 8
+_strcspnGeneric: ; Generic version
+        push    esi
+        push    edi
+        mov     esi, [esp+12]          ; str pointer
+str_next20:
+        mov     edi, [esp+16]          ; set pointer
+        mov     al, [esi]              ; read one byte from str
+        test    al, al
+        jz      str_finished20         ; str finished
+set_next20:
+        mov     dl, [edi]
+        test    dl, dl
+        jz      set_finished20
+        inc     edi
+        cmp     al, dl
+        jne     set_next20
+        ; character match found, stop search
+        jmp     str_finished20
+
+set_finished20: ; end of set, mismatch found
+        inc     esi
+        jmp     str_next20
+
+str_finished20: ; end of str, all match
+        sub     esi, [esp+12]          ; calculate position
+        mov     eax, esi
+        pop     edi
+        pop     esi
+        ret
+;_strcspnGeneric end
+
+; ********************************************************************************
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strspn and strcspn. This is executed only once
+; ********************************************************************************
+
+strspnCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strspn
+        mov     ecx, _strspnGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strspn
+        mov     ecx, _strspnSSE42
+Q100:   mov     [strspnDispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP11:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_strspnGeneric-RP11]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strspn
+        lea     ecx, [edx+_strspnSSE42-RP11]
+Q100:   mov     [edx+strspnDispatch-RP11], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+strcspnCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strcspn
+        mov     ecx, _strcspnGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcspn
+        mov     ecx, _strcspnSSE42
+Q200:   mov     [strcspnDispatch], ecx
+        ; Continue in appropriate version 
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP12:   ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_strcspnGeneric-RP12]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcspn
+        lea     ecx, [edx+_strcspnSSE42-RP12]
+Q200:   mov     [edx+strcspnDispatch-RP12], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strspnDispatch  DD strspnCPUDispatch
+strcspnDispatch DD strcspnCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix problem in Mac linker
+        DD      0,0,0,0
+%ENDIF
+
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+        dq      0, 0
diff --git a/asmlibSrc/strspn64.asm b/asmlibSrc/strspn64.asm
new file mode 100755
index 0000000..60c0a4d
--- /dev/null
+++ b/asmlibSrc/strspn64.asm
@@ -0,0 +1,304 @@
+;*************************  strspn64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-19
+; Last modified:    2011-07-19
+
+; Description:
+; Faster version of the standard strspn and strcspn functions:
+; size_t A_strspn (const char * str, const char * set);
+; size_t A_strcspn(const char * str, const char * set);
+;
+; A_strspn finds the length of the initial portion of str which consists only of
+; characters that are part of set. 
+; A_strcspn finds the length of the initial portion of str which consists only of
+; characters that are not part of set. 
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard functions strspn and strcspn:
+; Overriding is disabled because the functions may read beyond the end of a string, 
+; while the standard strspn and strcspn functions are guaranteed to work in all cases.
+;
+; 64-bit code is position-independent by default (RIP-relative addressing).
+;
+; CPU dispatching included for generic and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global A_strspn: function
+global A_strcspn: function
+
+; Direct entries to CPU-specific versions
+global strspnGeneric: function
+global strcspnGeneric: function
+global strspnSSE42: function
+global strcspnSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+;                               strspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strspn: function
+?OVR_strspn:
+%endif
+
+align 16
+A_strspn: ; function dispatching
+        jmp     near [strspnDispatch] ; Go to appropriate version, depending on instruction set
+
+strspnSSE42: ; SSE4.2 version
+%ifdef  WINDOWS
+        push    rdi
+        push    rsi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        xor     ecx, ecx               ; span counter
+str_next:
+        movdqu  xmm2, [rdi]            ; str
+        movdqu  xmm1, [rsi]            ; set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    eax, xmm0
+        jns     set_extends
+set_finished:
+        cmp     ax, -1
+        jne     str_finished
+        ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+        add     rdi, 16                ; next 16 bytes of str
+        add     rcx, 16                ; count span
+        jmp     str_next
+
+str_finished:
+        not     eax
+        bsf     eax, eax
+        add     rax, rcx
+%ifdef  WINDOWS
+        pop     rsi
+        pop     rdi
+%endif
+        ret
+
+set_loop:
+        or      eax, edx               ; accumulate matches
+set_extends: ; the set is more than 16 bytes
+        add     rsi, 16
+        movdqu  xmm1, [rsi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop
+        mov     rsi, r8                ; restore set pointer
+        or      eax, edx               ; accumulate matches
+        jmp     set_finished
+
+
+;******************************************************************************
+;                       strcspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcspn: function
+?OVR_strcspn:
+%endif
+
+align 16
+A_strcspn: ; function dispatching
+        jmp     near [strcspnDispatch] ; Go to appropriate version, depending on instruction set
+
+strcspnSSE42: ; SSE4.2 version
+%ifdef  WINDOWS
+        push    rdi
+        push    rsi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        xor     ecx, ecx               ; span counter
+str_next2:
+        movdqu  xmm2, [rdi]            ; str
+        movdqu  xmm1, [rsi]            ; set
+        pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+        movd    eax, xmm0
+        jns     set_extends2
+set_finished2:
+        cmp     ax, -1
+        jne     str_finished2
+        ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+        add     rdi, 16                ; next 16 bytes of str
+        add     rcx, 16                ; count span
+        jmp     str_next2
+
+str_finished2:
+        not     eax
+        bsf     eax, eax
+        add     rax, rcx
+%ifdef  WINDOWS
+        pop     rsi
+        pop     rdi
+%endif
+        ret
+
+set_loop2:
+        and     eax, edx               ; accumulate matches
+set_extends2: ; the set is more than 16 bytes
+        add     rsi, 16
+        movdqu  xmm1, [rsi]            ; next part of set
+        pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+        movd    edx, xmm0
+        jns     set_loop2
+        mov     rsi, r8                ; restore set pointer
+        and     eax, edx               ; accumulate matches
+        jmp     set_finished2
+
+
+;******************************************************************************
+;                               strspn function generic
+;******************************************************************************
+
+align 8
+strspnGeneric: ; Generic version
+%ifdef  WINDOWS
+        push    rdi
+        push    rsi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        mov     r9,  rdi
+
+str_next10:
+        mov     al, [rdi]              ; read one byte from str
+        test    al, al
+        jz      str_finished10         ; str finished
+set_next10:
+        mov     dl, [rsi]
+        test    dl, dl
+        jz      set_finished10
+        inc     rsi
+        cmp     al, dl
+        jne     set_next10
+        ; character match found, goto next character
+        inc     rdi
+        mov     rsi, r8                ; set pointer
+        jmp     str_next10
+
+str_finished10: ; end of str, all match
+set_finished10: ; end of set, mismatch found
+        sub     rdi, r9                ; calculate position
+        mov     rax, rdi
+%ifdef  WINDOWS
+        pop     rsi
+        pop     rdi
+%endif
+        ret
+;strspnGeneric end
+
+align 8
+strcspnGeneric: ; Generic version
+%ifdef  WINDOWS
+        push    rdi
+        push    rsi
+        mov     rdi, rcx               ; str
+        mov     rsi, rdx               ; set
+%endif
+        mov     r8,  rsi
+        mov     r9,  rdi
+str_next20:
+        mov     al, [rdi]              ; read one byte from str
+        test    al, al
+        jz      str_finished20         ; str finished
+set_next20:
+        mov     dl, [rsi]
+        test    dl, dl
+        jz      set_finished20
+        inc     rsi
+        cmp     al, dl
+        jne     set_next20
+        ; character match found, stop search
+        jmp     str_finished20
+
+set_finished20: ; end of set, mismatch found
+        inc     rdi
+        mov     rsi, r8                ; set pointer
+        jmp     str_next20
+
+str_finished20: ; end of str, all match
+        sub     rdi, r9                ; calculate position
+        mov     rax, rdi
+%ifdef  WINDOWS
+        pop     rsi
+        pop     rdi
+%endif
+        ret
+;strcspnGeneric end
+
+
+; ********************************************************************************
+; CPU dispatching for strspn and strcspn. This is executed only once
+; ********************************************************************************
+
+%ifdef  WINDOWS
+%define par1    rcx
+%define par2    rdx
+%else   ; UNIX
+%define par1    rdi
+%define par2    rsi
+%endif
+
+strspnCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version of strspn
+        lea     r8, [strspnGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strspn
+        lea     r8, [strspnSSE42]
+Q100:   mov     [strspnDispatch], r8
+        ; Continue in appropriate version 
+        jmp     r8
+
+
+strcspnCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version of strcspn
+        lea     r8, [strcspnGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strcspn
+        lea     r8, [strcspnSSE42]
+Q200:   mov     [strcspnDispatch], r8
+        ; Continue in appropriate version 
+        jmp     r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strspnDispatch  DQ strspnCPUDispatch
+strcspnDispatch DQ strcspnCPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strstr32.asm b/asmlibSrc/strstr32.asm
new file mode 100755
index 0000000..8ee6450
--- /dev/null
+++ b/asmlibSrc/strstr32.asm
@@ -0,0 +1,251 @@
+;*************************  strstr32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-14
+; Last modified:    2011-08-21
+
+; Description:
+; Faster version of the standard strstr function:
+; char * A_strstr(char * haystack, const char * needle);
+; Searches for substring needle in string haystack. Return value is pointer to 
+; first occurrence of needle, or NULL if not found. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment. Avoiding this would be complicated
+; and make the function much slower: For every unaligned 16-bytes read we would have to
+; check if it crosses a page boundary (4 kbytes), and if so check if the string ends
+; before the page boundary. Only if the string does not end before the page boundary
+; can we read into the next memory page.
+;
+; Overriding standard function strstr:
+; The alias ?OVR_strstr is changed to _strstr in the object file if
+; it is desired to override the standard library function strstr.
+; Overriding is disabled because the function may read beyond the end of a 
+; string, while the standard strstr function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
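+
+; A C sketch of the generic algorithm (illustrative only):
+;
+;   char * A_strstr(char * haystack, const char * needle) {
+;       if (*needle == 0) return haystack;      // empty needle always matches
+;       for (; *haystack != 0; haystack++) {
+;           const char * h = haystack, * n = needle;
+;           while (*n != 0 && *h == *n) { h++; n++; }
+;           if (*n == 0) return haystack;       // whole needle matched
+;       }
+;       return 0;                               // not found
+;   }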
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global _A_strstr: function             ; Function A_strstr
+
+; Direct entries to CPU-specific versions
+global _strstrGeneric: function        ; Generic version for processors without SSE4.2
+global _strstrSSE42: function          ; Version for processors with SSE4.2
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+; strstr function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strstr: function
+?OVR_strstr:
+%endif
+
+_A_strstr: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+        jmp     near [strstrDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP:                                    ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+        jmp     dword [edx+strstrDispatch-RP]
+
+%ENDIF
+
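+; The SSE4.2 version below uses pcmpistrm in "equal ordered" mode (immediate
+; 00001100b: unsigned bytes, equal-ordered aggregation, bit mask result):
+; each set bit in xmm0 marks a haystack position where a prefix of the needle
+; begins, so only those positions need the full comparison loop.
+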
+align 16
+_strstrSSE42: ; SSE4.2 version
+        push    ebx
+        push    esi
+        mov     esi, [esp+12]          ; haystack
+        mov     eax, [esp+16]          ; needle
+        movdqu  xmm1, [eax]            ; needle
+
+align 8
+haystacknext:
+        ; [esi] = haystack
+        pcmpistrm xmm1, [esi], 00001100b ; unsigned byte search, equal ordered, return mask in xmm0
+        jc      matchbegin             ; found beginning of a match
+        jz      nomatch                ; end of haystack found, no match
+        add     esi, 16
+        jmp     haystacknext
+
+matchbegin:
+        jz      foundshort             ; haystack ends here, a short match is found
+        movd    eax, xmm0              ; bit mask of possible matches
+nextindexbit:
+        bsf     ecx, eax               ; index of first bit in mask of possible matches
+
+        ; compare strings for full match
+        lea     ebx, [esi+ecx]         ; haystack + index
+        mov     edx, [esp+16]          ; needle
+
+compareloop: ; compare loop for long match
+        movdqu  xmm2, [edx]            ; paragraph of needle
+        pcmpistrm xmm2, [ebx], 00001100B ; unsigned bytes, equal ordered, modifies xmm0
+        ; (can't use "equal each, masked" because it inverts when past end of needle, but not when past end of both)
+
+        jno     longmatchfail          ; difference found after extending partial match
+        js      longmatchsuccess       ; end of needle found, and no difference
+        add     edx, 16
+        add     ebx, 16
+        jmp     compareloop            ; loop to next 16 bytes
+
+longmatchfail:
+        ; remove index bit of first partial match
+        btr     eax, ecx
+        test    eax, eax
+        jnz     nextindexbit           ; mask contains more index bits, loop to next bit in eax mask
+        ; mask exhausted for possible matches, continue to next haystack paragraph
+        add     esi, 16
+        jmp     haystacknext           ; loop to next paragraph of haystack
+
+longmatchsuccess: ; match found over more than one paragraph
+        lea     eax, [esi+ecx]         ; haystack + index to beginning of long match
+        pop     esi
+        pop     ebx
+        ret
+
+foundshort: ; match found within single paragraph
+        movd    eax, xmm0              ; bit mask of matches
+        bsf     eax, eax               ; index of first match
+        add     eax, esi               ; pointer to first match
+        pop     esi
+        pop     ebx
+        ret
+
+nomatch: ; needle not found, return 0
+        xor     eax, eax
+        pop     esi
+        pop     ebx
+        ret
+
+;_strstrSSE42 endp
+
+
+align 16
+_strstrGeneric: ; generic version
+		push    esi
+		push    edi
+		mov     esi, [esp+12]          ; haystack
+		mov     edi, [esp+16]          ; needle
+		
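+		; load first two bytes of needle: al = first char, ah = second (0 if needle has length 1)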
+		mov     ax, [edi]
+		test    al, al
+		jz      _Found                 ; a zero-length needle is always found
+		test    ah, ah
+		jz      _SingleCharNeedle		
+		
+_SearchLoop: ; search for first character match
+        mov     cl, [esi]
+        test    cl, cl
+        jz      _NotFound              ; end of haystack reached without finding
+        cmp     al, cl
+        je      _FirstCharMatch        ; first character match
+_IncompleteMatch:
+        inc     esi
+        jmp     _SearchLoop            ; loop through haystack
+		
+_FirstCharMatch:
+        mov     ecx, esi               ; begin of match position
+_MatchLoop:
+        inc     ecx
+        inc     edi
+        mov     al, [edi]
+        test    al, al
+        jz      _Found                 ; end of needle. match ok
+        cmp     al, [ecx] 
+        je      _MatchLoop
+        ; match failed, recover and continue
+		mov     edi, [esp+16]          ; needle
+		mov     al, [edi]
+		jmp     _IncompleteMatch
+		
+_NotFound: ; needle not found. return 0
+        xor     eax, eax
+        pop     edi
+        pop     esi
+        ret
+		
+_Found: ; needle found. return pointer to position in haystack
+        mov     eax, esi
+        pop     edi
+        pop     esi
+        ret		
+		
+_SingleCharNeedle: ; Needle is a single character
+        movzx   ecx, byte [esi]
+        test    cl, cl
+        jz      _NotFound              ; end of haystack reached without finding
+        cmp     al, cl
+        je      _Found
+        inc     esi
+        jmp     _SingleCharNeedle  ; loop through haystack
+
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; CPU dispatching for strstr. This is executed only once
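+; (strstrDispatch below initially points to this dispatcher; the first call
+; picks the best version for the CPU, stores its address in strstrDispatch,
+; and jumps to it, so all later calls bypass the dispatcher)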
+strstrCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version of strstr
+        mov     ecx, _strstrGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strstr
+        mov     ecx, _strstrSSE42
+Q100:   mov     [strstrDispatch], ecx
+        ; Continue in appropriate version of strstr
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP2:    ; reference point edx
+        ; Point to generic version of strstr
+        lea     ecx, [edx+_strstrGeneric-RP2]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strstr
+        lea     ecx, [edx+_strstrSSE42-RP2]
+Q100:   mov     [edx+strstrDispatch-RP2], ecx
+        ; Continue in appropriate version of strstr
+        jmp     ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strstrDispatch DD strstrCPUDispatch
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+        DD      0, 0
+%ENDIF
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+        dq      0, 0
diff --git a/asmlibSrc/strstr64.asm b/asmlibSrc/strstr64.asm
new file mode 100755
index 0000000..d4cc1e1
--- /dev/null
+++ b/asmlibSrc/strstr64.asm
@@ -0,0 +1,218 @@
+;*************************  strstr64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-14
+; Last modified:    2011-07-14
+
+; Description:
+; Faster version of the standard strstr function:
+; char * A_strstr(char * haystack, const char * needle);
+; Searches for substring needle in string haystack. Return value is pointer to 
+; first occurrence of needle, or NULL if not found. The strings must be zero-terminated.
+;
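+; Illustrative use from C, given the prototype above:
+;   char * p = A_strstr(text, "needle");   // p = first occurrence, or NULL
+;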
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment. Avoiding this would be complicated
+; and make the function much slower: for every unaligned 16-byte read we would have to
+; check if it crosses a page boundary (4 kbytes), and if so check if the string ends
+; before the page boundary. Only if the string does not end before the page boundary
+; can we read into the next memory page.
+;
+; Overriding standard function strstr:
+; The alias ?OVR_strstr is changed to _strstr in the object file if
+; it is desired to override the standard library function strstr.
+; Overriding is disabled because the function may read beyond the end of a 
+; string, while the standard strstr function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%define ALLOW_OVERRIDE 0               ; Set to one if override of standard function desired
+
+global A_strstr: function             ; Function A_strstr
+
+; Direct entries to CPU-specific versions
+global strstrGeneric: function            ; Generic version for processors without SSE4.2
+global strstrSSE42: function          ; Version for processors with SSE4.2
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+section .text
+
+; strstr function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strstr: function
+?OVR_strstr:
+%endif
+
+A_strstr: ; function dispatching
+        jmp     near [strstrDispatch] ; Go to appropriate version, depending on instruction set
+
+; define register use
+%ifdef  WINDOWS
+%define par1      rcx                  ; parameter 1, pointer to haystack
+%define par2      rdx                  ; parameter 2, pointer to needle
+%define bitindex  r8d                  ; bit index in eax mask 
+%define bitindexr r8                   ; bit index in eax mask 
+%define phay      r9                   ; pointer to match in haystack
+%define pnee      r10                  ; pointer to match in needle
+%define tempb     r8b                  ; temporary byte
+%else
+%define par1      rdi                  ; parameter 1, pointer to haystack
+%define par2      rsi                  ; parameter 2, pointer to needle
+%define bitindex  ecx                  ; bit index in eax mask 
+%define bitindexr rcx                  ; bit index in eax mask 
+%define phay      r9                   ; pointer to match in haystack
+%define pnee      rdx                  ; pointer to match in needle
+%define tempb     cl                   ; temporary byte
+%endif
+
+align 16
+strstrSSE42: ; SSE4.2 version
+        movdqu  xmm1, [par2]           ; needle
+
+;align 8
+haystacknext:   
+        ; [par1] = haystack
+        pcmpistrm xmm1, [par1], 00001100b ; unsigned byte search, equal ordered, return mask in xmm0
+		jc      matchbegin             ; found beginning of a match
+		jz      nomatch                ; end of haystack found, no match
+		add     par1, 16
+		jmp     haystacknext
+
+matchbegin:
+		jz      foundshort             ; haystack ends here, a short match is found
+		movd    eax, xmm0              ; bit mask of possible matches
+nextindexbit:
+        bsf     bitindex, eax          ; index of first bit in mask of possible matches
+
+		; compare strings for full match
+		lea     phay, [par1+bitindexr] ; haystack + index
+		mov     pnee, par2             ; needle
+
+compareloop: ; compare loop for long match
+        movdqu  xmm2, [pnee]           ; paragraph of needle
+        pcmpistrm xmm2, [phay], 00001100B ; unsigned bytes, equal ordered, modifies xmm0
+        ; (can't use "equal each, masked" because it inverts when past end of needle, but not when past end of both)
+
+        jno     longmatchfail          ; difference found after extending partial match
+		js      longmatchsuccess       ; end of needle found, and no difference
+		add     pnee, 16
+		add     phay, 16
+		jmp     compareloop            ; loop to next 16 bytes
+
+longmatchfail:
+        ; remove index bit of first partial match
+		btr     eax, bitindex
+		test    eax, eax
+		jnz     nextindexbit           ; mask contains more index bits, loop to next bit in eax mask
+		; mask exhausted for possible matches, continue to next haystack paragraph
+		add     par1, 16
+		jmp     haystacknext           ; loop to next paragraph of haystack
+
+longmatchsuccess: ; match found over more than one paragraph
+		lea     rax, [par1+bitindexr]  ; haystack + index to begin of long match
+		ret
+
+foundshort: ; match found within single paragraph 
+        movd    eax, xmm0              ; bit mask of matches
+        bsf     eax, eax               ; index of first match
+		add     rax, par1              ; pointer to first match
+		ret
+
+nomatch: ; needle not found, return 0
+        xor     eax, eax
+		ret
+
+;strstrSSE42: endp
+
+
+align 16
+strstrGeneric: ; generic version
+		
+		mov     ax, [par2]
+		test    al, al
+		jz      _Found                 ; a zero-length needle is always found
+		test    ah, ah
+		jz      _SingleCharNeedle		
+		
+_SearchLoop: ; search for first character match
+        mov     tempb, [par1]
+        test    tempb, tempb
+        jz      _NotFound              ; end of haystack reached without finding
+        cmp     al, tempb
+        je      _FirstCharMatch        ; first character match
+_IncompleteMatch:
+        inc     par1
+        jmp     _SearchLoop            ; loop through haystack
+		
+_FirstCharMatch:
+        mov     phay, par1             ; begin of match position
+        mov     pnee, par2
+_MatchLoop:
+        inc     phay
+        inc     pnee
+        mov     al, [pnee]
+        test    al, al
+        jz      _Found                 ; end of needle. match ok
+        cmp     al, [phay] 
+        je      _MatchLoop
+        ; match failed, recover and continue
+		mov     al, [par2]
+		jmp     _IncompleteMatch
+		
+_NotFound: ; needle not found. return 0
+        xor     eax, eax
+        ret
+		
+_Found: ; needle found. return pointer to position in haystack
+        mov     rax, par1
+        ret		
+		
+_SingleCharNeedle: ; Needle is a single character
+        mov     tempb, byte [par1]
+        test    tempb, tempb
+        jz      _NotFound              ; end of haystack reached without finding
+        cmp     al, tempb
+        je      _Found
+        inc     par1
+        jmp     _SingleCharNeedle      ; loop through haystack
+
+
+; CPU dispatching for strstr. This is executed only once
+strstrCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        push    par2
+        call    InstructionSet
+        pop     par2
+        pop     par1
+        ; Point to generic version of strstr
+        lea     r9, [strstrGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version of strstr
+        lea     r9, [strstrSSE42]
+Q100:   mov     [strstrDispatch], r9
+        ; Continue in appropriate version of strstr
+        jmp     r9
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strstrDispatch DQ strstrCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+        dq      0, 0
diff --git a/asmlibSrc/strtouplow32.asm b/asmlibSrc/strtouplow32.asm
new file mode 100755
index 0000000..c0aacf2
--- /dev/null
+++ b/asmlibSrc/strtouplow32.asm
@@ -0,0 +1,285 @@
+;*************************  strtouplow32.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-17
+; Last modified:    2013-09-11
+
+; Description:
+; A_strtolower converts a string to lower case
+; A_strtoupper converts a string to upper case
+; Only characters a-z or A-Z are converted; other characters are ignored.
+; The functions save time by ignoring locale-specific and UTF-8 characters,
+; so they do not have to look up each character in a table.
+;
+; Function prototypes:
+; extern "C" void A_strtolower(char * string);
+; extern "C" void A_strtoupper(char * string);
+;
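+; Illustrative use from C: both functions convert in place, e.g.
+;   A_strtoupper(buf);                 // "Mixed Case 123" -> "MIXED CASE 123"
+;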
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+section .data
+align  16
+
+azlow:   db 'azazazazazazazaz'         ; define range for lower case
+azhigh:  db 'AZAZAZAZAZAZAZAZ'         ; define range for upper case
+casebit: times 16 db 20h               ; bit to change when changing case
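+; (ASCII upper and lower case letters differ only in bit 5 = 20h,
+; e.g. 'A' = 41h and 'a' = 61h, so flipping this bit toggles the case)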
+
+section .text
+
+global _A_strtolower: function
+global _A_strtoupper: function
+global _strtolowerGeneric: function
+global _strtoupperGeneric: function
+global _strtolowerSSE42: function
+global _strtoupperSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet                 ; Instruction set for CPU dispatcher
+
+; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+_A_strtolower: 
+        jmp     near [strtolowerDispatch] ; Go to appropriate version, depending on instruction set
+
+_A_strtoupper: 
+        jmp     near [strtoupperDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE   ; Position-independent code
+
+_A_strtolower:
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP1:                                   ; reference point edx = offset RP1
+; Make the following instruction with address relative to RP1:
+        jmp     dword [edx+strtolowerDispatch-RP1]
+
+_A_strtoupper:
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP2:                                   ; reference point edx = offset RP2
+; Make the following instruction with address relative to RP2:
+        jmp     dword [edx+strtoupperDispatch-RP2]
+
+%ENDIF
+
+
+_strtoupperSSE42:
+%IFNDEF POSITIONINDEPENDENT
+        movdqa  xmm1, [azlow]          ; define range a-z
+        movdqa  xmm3, [casebit]        ; bit to change
+%ELSE
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP11:
+        movdqa  xmm1, [edx+azlow-RP11] ; same, with relative address
+        movdqa  xmm3, [edx+casebit-RP11]
+%ENDIF
+		jmp     strupperlower
+		
+_strtolowerSSE42:
+%IFNDEF POSITIONINDEPENDENT
+        movdqa  xmm1, [azhigh]         ; define range A-Z
+        movdqa  xmm3, [casebit]        ; bit to change
+%ELSE
+        call    get_thunk_edx          ; get reference point for position-independent code
+RP12:
+        movdqa  xmm1, [edx+azhigh-RP12]; same, with relative address
+        movdqa  xmm3, [edx+casebit-RP12]
+%ENDIF
+
+        
+strupperlower:
+        ; common code for strtoupper and strtolower
+        mov     edx,  [esp+4]          ; string
+next:   ; loop
+        movdqu  xmm2, [edx]            ; read 16 bytes from string
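+        ; (imm8 = 01000100b decodes as: bits 1-0 = 00, unsigned bytes;
+        ;  bits 3-2 = 01, range comparison against the pairs in xmm1;
+        ;  bit 6 = 1, result expanded to a byte mask in xmm0, ready for pand)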
+        pcmpistrm xmm1, xmm2, 01000100b; find bytes in range A-Z or a-z, return mask in xmm0
+        jz      last                   ; string ends in this paragraph
+        pand    xmm0, xmm3             ; mask AND case bit
+        pxor    xmm2, xmm0             ; change case bit in masked bytes of string
+        movdqu  [edx], xmm2            ; write changed value
+		add     edx, 16
+		jmp     next                   ; next 16 bytes
+
+last:   ; Write last 0-15 bytes
+        ; While we may read past the end of the string (the padding appended in the
+		; .bss section below makes that safe), we must not write past the end of the
+		; string, even with an unchanged value, because another thread may have
+		; modified that memory in the meantime
+        jnc     finish                 ; nothing changed, no need to write
+        pand    xmm3, xmm0             ; mask and case bit
+        pxor    xmm2, xmm3             ; change case bit
+
+%if 0   ; Method with maskmovdqu is elegant, but slow because maskmovdqu uses nontemporal (uncached) write
+        push    edi
+		mov     edi, edx
+		maskmovdqu xmm2, xmm0
+		pop     edi
+finish: ret
+
+%else   ; less elegant alternative, but probably faster if data needed again soon
+        ; write 8-4-2-1 bytes, if necessary
+		pmovmskb eax, xmm0             ; create bit mask
+		cmp     eax, 10000000b
+		jb      L10
+		; there are at least 8 bytes to write
+		movq    [edx], xmm2
+		psrldq  xmm2, 8
+		add     edx, 8
+		shr     eax, 8
+L10:    cmp     eax, 1000b
+        jb      L20
+		; there are at least 4 bytes to write
+		movd    [edx], xmm2
+		psrldq  xmm2, 4
+		add     edx, 4
+		shr     eax, 4
+L20:    movd    ecx, xmm2              ; use ecx for last 3 bytes
+        cmp     eax, 10b
+		jb      L30
+		; there are at least 2 bytes to write
+		mov     [edx], cx
+		shr     ecx, 16
+		add     edx, 2
+		shr     eax, 2
+L30:    cmp     eax, 1
+        jb      finish
+		; there is one more byte to write
+		mov     [edx], cl
+finish: ret
+%endif
+
+; 386 version
+_strtolowerGeneric:
+        mov     edx,  [esp+4]          ; string
+A100:   ; loop
+        mov     al, [edx]
+		test    al, al
+		jz      A900                   ; end of string
+		sub     al, 'A'
+		cmp     al, 'Z' - 'A'
+		jbe     A200                   ; is upper case
+		inc     edx
+		jmp     A100                   ; loop to next character
+A200:   ; convert to lower case
+        add     al, 'a'
+		mov     [edx], al
+		inc     edx
+		jmp     A100
+A900:   ret
+;_strtolowerGeneric end
+
+_strtoupperGeneric:
+        mov     edx,  [esp+4]          ; string
+B100:   ; loop
+        mov     al, [edx]
+		test    al, al
+		jz      B900                   ; end of string
+		sub     al, 'a'
+		cmp     al, 'z' - 'a'
+		jbe     B200                   ; is lower case
+		inc     edx
+		jmp     B100                   ; loop to next character
+B200:   ; convert to upper case
+        add     al, 'A'
+		mov     [edx], al
+		inc     edx
+		jmp     B100
+B900:   ret
+;_strtoupperGeneric end
+
+%IFDEF  POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+        mov edx, [esp]
+        ret
+%ENDIF
+
+; CPU dispatching for strtolower. This is executed only once
+strtolowerCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version
+        mov     ecx, _strtolowerGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version
+        mov     ecx, _strtolowerSSE42
+Q100:   mov     [strtolowerDispatch], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP21:    ; reference point edx
+        ; Point to generic version
+        lea     ecx, [edx+_strtolowerGeneric-RP21]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version
+        lea     ecx, [edx+_strtolowerSSE42-RP21]
+Q100:   mov     [edx+strtolowerDispatch-RP21], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+; CPU dispatching for strtoupper. This is executed only once
+strtoupperCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+        ; get supported instruction set
+        call    _InstructionSet
+        ; Point to generic version
+        mov     ecx, _strtoupperGeneric
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version 
+        mov     ecx, _strtoupperSSE42
+Q200:   mov     [strtoupperDispatch], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+
+%ELSE   ; Position-independent version
+        ; get supported instruction set
+        call    _InstructionSet
+        call    get_thunk_edx
+RP22:    ; reference point edx
+        ; Point to generic version 
+        lea     ecx, [edx+_strtoupperGeneric-RP22]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version
+        lea     ecx, [edx+_strtoupperSSE42-RP22]
+Q200:   mov     [edx+strtoupperDispatch-RP22], ecx
+        ; Continue in appropriate version
+        jmp     ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strtolowerDispatch DD strtolowerCPUDispatch
+strtoupperDispatch DD strtoupperCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix problem in Mac linker
+        DD      0,0,0,0
+%ENDIF
+
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+        dq      0, 0
diff --git a/asmlibSrc/strtouplow64.asm b/asmlibSrc/strtouplow64.asm
new file mode 100755
index 0000000..9ead7db
--- /dev/null
+++ b/asmlibSrc/strtouplow64.asm
@@ -0,0 +1,213 @@
+;*************************  strtouplow64.asm  ************************************
+; Author:           Agner Fog
+; Date created:     2011-07-17
+; Last modified:    2013-09-11
+
+; Description:
+; A_strtolower converts a string to lower case
+; A_strtoupper converts a string to upper case
+; Only characters a-z or A-Z are converted; other characters are ignored.
+; The functions save time by ignoring locale-specific and UTF-8 characters,
+; so they do not have to look up each character in a table.
+;
+; Function prototypes:
+; extern "C" void A_strtolower(char * string);
+; extern "C" void A_strtoupper(char * string);
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for SSE2 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+section .data
+align 16
+
+azlow:   db 'azazazazazazazaz'         ; define range for lower case
+azhigh:  db 'AZAZAZAZAZAZAZAZ'         ; define range for upper case
+casebit: times 16 db 20h               ; bit to change when changing case
+
+%ifdef WINDOWS
+%define par1  rcx                      ; register for parameter 1
+%else ; UNIX
+%define par1  rdi
+%endif
+
+section .text
+
+global A_strtolower: function
+global A_strtoupper: function
+global strtolowerGeneric: function
+global strtoupperGeneric: function
+global strtolowerSSE42: function
+global strtoupperSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet                 ; Instruction set for CPU dispatcher
+
+; function dispatching
+
+A_strtolower: 
+        jmp     near [strtolowerDispatch] ; Go to appropriate version, depending on instruction set
+
+A_strtoupper: 
+        jmp     near [strtoupperDispatch] ; Go to appropriate version, depending on instruction set
+
+
+; SSE4.2 version
+strtoupperSSE42:
+        movdqa  xmm1, [azlow]          ; define range a-z
+		jmp     strupperlower
+strtolowerSSE42:
+        movdqa  xmm1, [azhigh]         ; define range A-Z
+strupperlower:
+        ; common code for strtoupper and strtolower
+        movdqa  xmm3, [casebit]        ; bit to change
+next:   ; loop
+        movdqu  xmm2, [par1]           ; read 16 bytes from string
+        pcmpistrm xmm1, xmm2, 01000100b; find bytes in range A-Z or a-z, return mask in xmm0
+        jz      last                   ; string ends in this paragraph
+        pand    xmm0, xmm3             ; mask AND case bit
+        pxor    xmm2, xmm0             ; change case bit in masked bytes of string
+        movdqu  [par1], xmm2           ; write changed value
+		add     par1, 16
+		jmp     next                   ; next 16 bytes
+
+last:   ; Write last 0-15 bytes
+        ; While we may read past the end of the string (the padding appended in the
+		; .bss section below makes that safe), we must not write past the end of the
+		; string, even with an unchanged value, because another thread may have
+		; modified that memory in the meantime
+        jnc     finish                 ; nothing changed, no need to write
+        pand    xmm3, xmm0             ; mask and case bit
+        pxor    xmm2, xmm3             ; change case bit
+
+%if 0   ; Method with maskmovdqu is elegant, but slow because maskmovdqu uses nontemporal (uncached) write
+        push    rdi
+		mov     rdi, par1
+		maskmovdqu xmm2, xmm0
+		pop     rdi
+finish: ret
+
+%else   ; less elegant alternative, but probably faster if data needed again soon
+        ; write 8-4-2-1 bytes, if necessary
+		pmovmskb eax, xmm0             ; create bit mask
+		cmp     eax, 10000000b
+		jb      L10
+		; there are at least 8 bytes to write
+		movq    [par1], xmm2
+		psrldq  xmm2, 8
+		add     par1, 8
+		shr     eax, 8
+L10:    cmp     eax, 1000b
+        jb      L20
+		; there are at least 4 bytes to write
+		movd    [par1], xmm2
+		psrldq  xmm2, 4
+		add     par1, 4
+		shr     eax, 4
+L20:    movd    edx, xmm2              ; use edx for last 3 bytes
+        cmp     eax, 10b
+		jb      L30
+		; there are at least 2 bytes to write
+		mov     [par1], dx
+		shr     edx, 16
+		add     par1, 2
+		shr     eax, 2
+L30:    cmp     eax, 1
+        jb      finish
+		; there is one more byte to write
+		mov     [par1], dl
+finish: ret
+%endif
+
+; Generic version (SSE2 baseline; used when SSE4.2 is not available)
+strtolowerGeneric:
+A100:   ; loop
+        mov     al, [par1]
+		test    al, al
+		jz      A900                   ; end of string
+		sub     al, 'A'
+		cmp     al, 'Z' - 'A'
+		jbe     A200                   ; is upper case
+		inc     par1
+		jmp     A100                   ; loop to next character
+A200:   ; convert to lower case
+        add     al, 'a'
+		mov     [par1], al
+		inc     par1
+		jmp     A100
+A900:   ret
+;strtolowerGeneric end
+
+strtoupperGeneric:
+B100:   ; loop
+        mov     al, [par1]
+		test    al, al
+		jz      B900                   ; end of string
+		sub     al, 'a'
+		cmp     al, 'z' - 'a'
+		jbe     B200                   ; is lower case
+		inc     par1
+		jmp     B100                   ; loop to next character
+B200:   ; convert to upper case
+        add     al, 'A'
+		mov     [par1], al
+		inc     par1
+		jmp     B100
+B900:   ret
+;strtoupperGeneric end
+
+
+; CPU dispatching for strtolower. This is executed only once
+strtolowerCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        call    InstructionSet
+        pop     par1
+        ; Point to generic version
+        lea     rdx, [strtolowerGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q100
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version
+        lea     rdx, [strtolowerSSE42]
+Q100:   mov     [strtolowerDispatch], rdx
+        ; Continue in appropriate version
+        jmp     rdx
+
+; CPU dispatching for strtoupper. This is executed only once
+strtoupperCPUDispatch:
+        ; get supported instruction set
+        push    par1
+        call    InstructionSet
+        pop     par1
+        ; Point to generic version
+        lea     rdx, [strtoupperGeneric]
+        cmp     eax, 10                ; check SSE4.2
+        jb      Q200
+        ; SSE4.2 supported
+        ; Point to SSE4.2 version
+        lea     rdx, [strtoupperSSE42]
+Q200:   mov     [strtoupperDispatch], rdx
+        ; Continue in appropriate version
+        jmp     rdx
+
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strtolowerDispatch DQ strtolowerCPUDispatch
+strtoupperDispatch DQ strtoupperCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+        dq      0, 0
diff --git a/asmlibSrc/substring32.asm b/asmlibSrc/substring32.asm
new file mode 100755
index 0000000..de450d2
--- /dev/null
+++ b/asmlibSrc/substring32.asm
@@ -0,0 +1,61 @@
+;*************************  substring32.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-18
+; Last modified:    2011-07-18
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than 
+; len if the length of source is less than pos + len.
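+;
+; Example: A_substring(dest, "abcdef", 2, 3) stores "cde" and returns 3;
+; A_substring(dest, "abcdef", 4, 9) stores "ef" and returns 2.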
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+global _A_substring: function                     ; Function _A_substring
+
+extern _A_strlen
+extern _A_memcpy
+
+SECTION .text
+
+; extern "C"                 4                   8            12          16
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+_A_substring:
+        mov     ecx, [esp+8]           ; source
+        push    ecx
+        call    _A_strlen              ; eax = strlen(source)
+        pop     ecx
+        mov     edx, [esp+12]          ; pos
+        sub     eax, edx               ; max length = strlen(source) - pos
+        jbe     empty                  ; strlen(source) <= pos. Return empty string
+        mov     ecx, [esp+16]          ; len
+        cmp     eax, ecx
+        cmova   eax, ecx               ; min(len, maxlen)
+        add     edx, [esp+8]           ; source + pos
+        mov     ecx, [esp+4]           ; dest
+        push    eax                    ; length for memcpy
+        push    edx                    ; source for memcpy
+        push    ecx                    ; dest for memcpy
+        call    _A_memcpy
+        pop     ecx
+        pop     edx
+        pop     eax                    ; return final length
+        mov     byte [ecx+eax], 0      ; terminating zero        
+        ret
+        
+empty:  ; return empty string
+        mov     ecx, [esp+4]           ; dest
+        xor     eax, eax               ; return 0
+        mov     byte [ecx], al
+        ret
+        
+;_A_substring END
diff --git a/asmlibSrc/substring64.asm b/asmlibSrc/substring64.asm
new file mode 100755
index 0000000..f911a98
--- /dev/null
+++ b/asmlibSrc/substring64.asm
@@ -0,0 +1,73 @@
+;*************************  substring64.asm  **********************************
+; Author:           Agner Fog
+; Date created:     2011-07-18
+; Last modified:    2011-07-18
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than 
+; len if the length of source is less than pos + len.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+global A_substring: function                      ; Function A_substring
+
+extern A_strlen
+extern A_memcpy
+
+SECTION .text
+
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+%ifdef WINDOWS
+%define par1    rcx                    ; dest
+%define par2    rdx                    ; source
+%define par3    r8                     ; pos
+%define par4    r9                     ; len
+%else   ; UNIX
+%define par1    rdi
+%define par2    rsi
+%define par3    rdx
+%define par4    rcx
+%endif
+
+A_substring:
+        push    par1
+        push    par2
+        push    par3
+        push    par4
+        mov     par1, par2
+        call    A_strlen               ; rax = strlen(source)
+        pop     par4
+        pop     par3
+        pop     par2
+        pop     par1        
+        sub     rax, par3              ; max length = strlen(source) - pos
+        jbe     empty                  ; strlen(source) <= pos. Return empty string
+        cmp     rax, par4
+        cmova   rax, par4              ; min(len, maxlen)
+        add     par2, par3             ; source + pos = source for memcpy
+        mov     par3, rax              ; length for memcpy
+        push    rax                    ; new length
+        call    A_memcpy
+        pop     rcx                    ; new length = return value, rax = dest
+        mov     byte [rcx+rax], 0      ; terminating zero
+        mov     rax, rcx               ; return new length
+        ret
+        
+empty:  ; return empty string
+        xor     eax, eax               ; return 0
+        mov     byte [par1], al
+        ret
+        
+;A_substring END
diff --git a/asmlibSrc/testalib.cpp b/asmlibSrc/testalib.cpp
new file mode 100755
index 0000000..fa185c8
--- /dev/null
+++ b/asmlibSrc/testalib.cpp
@@ -0,0 +1,151 @@
+/*************************** testalib.cpp **********************************
+* Author:        Agner Fog
+* Date created:  2007-06-14
+* Last modified: 2011-07-17
+* Project:       asmlib.zip
+* Source URL:    www.agner.org/optimize
+*
+* Description:
+* Simple test of asmlib library
+*
+* Instructions:
+* Compile for console mode and link with the appropriate version of asmlib
+*
+* Further documentation:
+* The file asmlib-instructions.pdf contains further documentation and 
+* instructions.
+*
+* Copyright 2007-2011 by Agner Fog. 
+* GNU General Public License http://www.gnu.org/licenses/gpl.html
+*****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "asmlib.h"
+
+
+void Failure(const char * text) {
+   // Report a test failure and abort
+   printf("\nTest failed: %s\n", text);
+   exit(1);
+}
+
+int main () {
+
+   // test InstructionSet()
+   printf("\nInstructionSet = %i",   InstructionSet());
+
+   // test cpuid_abcd
+   int abcd[4]; char s[16];
+   cpuid_abcd(abcd, 0);
+   *(int*)(s+0) = abcd[1];             // ebx
+   *(int*)(s+4) = abcd[3];             // edx
+   *(int*)(s+8) = abcd[2];             // ecx
+   s[12] = 0;                          // terminate string
+   printf("\nVendor string  = %s", s);
+
+   // test ProcessorName()
+   printf("\nProcessorName  = %s",   ProcessorName());
+
+   // test CpuType
+   int vendor, family, model;
+   CpuType(&vendor, &family, &model);
+   printf("\nCpuType: vendor %i, family 0x%X, model 0x%X", vendor, family, model);
+
+   // test DataCacheSize
+   printf("\nData cache size: L1 %ikb, L2 %ikb, L3 %ikb", 
+      (int)DataCacheSize(1)/1024, (int)DataCacheSize(2)/1024, (int)DataCacheSize(3)/1024);
+   
+   // test ReadTSC()
+   ReadTSC();
+   int tsc = (int)ReadTSC();
+   tsc = (int)ReadTSC() - tsc;
+   printf("\nReadTSC takes %i clocks\n\n", tsc);  
+   
+   // test Round();
+   double d;
+   for (d = -1; d <= 1; d += 0.5) {
+      printf("Round %f = %i = %i\n", d, Round(d), Round(float(d)));
+   }
+
+   // Test memory and string functions
+   int i, n;
+   const int strsize = 256;
+   char string1[strsize], string2[strsize];
+   const char * teststring = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ 1234567890 @`'{}[]()<>";
+
+   // Initialize strings
+   A_memset(string1, 0, strsize);
+   A_memset(string2, 0, strsize);
+
+   // Test A_strcpy, A_strcat, A_strlen
+   A_strcpy(string1, teststring);
+   n = strsize/(int)A_strlen(teststring);
+   for (i = 0; i < n-1; i++) {
+      A_strcat(string1, teststring);
+   }
+   if (A_strlen(string1) != n * A_strlen(teststring)) Failure("A_strcpy, A_strcat, A_strlen");
+
+   // Test A_stricmp
+   A_memcpy(string2, string1, strsize);
+   string2[4] ^= 0x20;  string1[30] ^= 0x20; // Change case
+   if (A_stricmp(string1, string2) != 0)  Failure("A_stricmp");
+   string2[8] += 2;  // Make strings different
+   if (A_stricmp(string1, string2) >= 0)  Failure("A_stricmp");
+   string2[7] -= 2;  // Make strings different
+   if (A_stricmp(string1, string2) <= 0)  Failure("A_stricmp");
+
+   // test A_strtolower and A_strtoupper
+   A_strcpy(string1, teststring);
+   A_strcpy(string2, teststring);
+   A_strtolower(string1);
+   A_strtoupper(string2);
+   printf("\nstring converted to lower and upper case:\n%s\n%s\n%s", 
+      teststring, string1, string2);
+
+   // test strspn and strcspn
+   int n1, n2;
+   const int nset = 4;
+   const char * tset[] = {"abc", "", "01234567890123456789", "abcdefghijklmnopqrstuvwxyz"};
+   for (i = 0; i < nset; i++) {
+      n1 = A_strspn(teststring, tset[i]);
+      n2 = strspn(teststring, tset[i]);
+      if (n1 != n2) Failure("A_strspn");
+      n1 = A_strcspn(teststring, tset[i]);
+      n2 = strcspn(teststring, tset[i]);
+      if (n1 != n2) Failure("A_strcspn");
+   }
+
+   // Test A_memmove with overlapping source and destination
+   A_memcpy(string2, string1, strsize);
+
+   A_memcpy(string1+5, string1+12, 12);
+   memcpy  (string2+5, string2+12, 12);
+   if (A_stricmp(string1, string2) != 0)  Failure("memcpy");
+
+   A_memcpy(string1+5, string1+12, 130);
+   memcpy  (string2+5, string2+12, 130);
+   if (A_stricmp(string1, string2) != 0)  Failure("memcpy");
+
+   A_memmove(string1+5, string1+2, 12);
+   memmove  (string2+5, string2+2, 12);
+   if (A_stricmp(string1, string2) != 0)  Failure("A_memmove");
+
+   A_memmove(string1+3, string1+8, 12);
+   memmove  (string2+3, string2+8, 12);
+   if (A_stricmp(string1, string2) != 0)  Failure("A_memmove");
+ 
+   A_memmove(string1+41, string1+30, 100);
+   memmove  (string2+41, string2+30, 100);
+   if (A_stricmp(string1, string2) != 0)  Failure("A_memmove");
+
+   A_memmove(string1+32, string1+48, 177);
+   memmove  (string2+32, string2+48, 177);
+   if (A_stricmp(string1, string2) != 0)  Failure("A_memmove");
+
+   printf("\n\nTests passed OK\n");
+
+   return 0;
+}
diff --git a/asmlibSrc/testmem.cpp b/asmlibSrc/testmem.cpp
new file mode 100755
index 0000000..34fe160
--- /dev/null
+++ b/asmlibSrc/testmem.cpp
@@ -0,0 +1,396 @@
+//          TESTMEM.CPP                                   Agner Fog 2011-07-04
+
+// Test file for asmlib memcpy and memmove functions
+// Instructions: Compile on any platform and link with the appropriate
+// version of the asmlib library.
+
+#include <stdio.h>
+//#include <process.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <string.h>
+#include "asmlib.h"
+
+// define function type
+typedef void * memcpyF (void * dest, const void * src, size_t count); 
+typedef void * memsetF (void * dest, int c, size_t count);
+
+
+extern "C" {
+    extern int IInstrSet;
+    // function prototypes for CPU specific function versions
+    memcpyF memcpy386, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256;
+    memcpyF memmove386, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256;
+    memsetF memset386, memsetSSE2, memsetAVX;
+}
+
+// Tables of function pointers
+#if defined(_WIN64) || defined(_M_X64) || defined(__amd64)
+const int NUMFUNC = 5;
+memcpyF * memcpyTab[NUMFUNC] = {A_memcpy, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256};
+memcpyF * memmoveTab[NUMFUNC] = {A_memmove, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256};
+const char * DispatchNames[NUMFUNC] = {"Dispatched", "SSE2", "SSSE3", "Unalign", "U256"};
+int isetreq [NUMFUNC] = {0, 4, 6, 4, 11};  // instruction set required
+const int MEMSETFUNCS = 3;
+memsetF * memsetTab[MEMSETFUNCS] = {A_memset, memsetSSE2, memsetAVX};
+const char * memsetNames[MEMSETFUNCS] = {"Dispatched", "SSE2", "AVX"};
+int memsetreq [MEMSETFUNCS] = {0, 4, 11};  // instruction set required
+#else
+const int NUMFUNC = 6;
+memcpyF * memcpyTab[NUMFUNC] = {A_memcpy, memcpy386, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256};
+memcpyF * memmoveTab[NUMFUNC] = {A_memmove, memmove386, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256};
+const char * DispatchNames[NUMFUNC] = {"Dispatched", "386", "SSE2", "SSSE3", "Unalign", "U256"};
+int isetreq [NUMFUNC] = {0, 0, 4, 6, 4, 11};  // instruction set required
+const int MEMSETFUNCS = 4;
+memsetF * memsetTab[MEMSETFUNCS] = {A_memset, memset386, memsetSSE2, memsetAVX};
+const char * memsetNames[MEMSETFUNCS] = {"Dispatched", "386", "SSE2", "AVX"};
+int memsetreq [MEMSETFUNCS] = {0, 0, 4, 11};  // instruction set required
+#endif
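+// (InstructionSet() codes assumed by the tables above: 0 = 80386, 4 = SSE2,
+// 6 = SSSE3, 11 = AVX; the dispatched entries require no minimum)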
+
+
+
+void error(const char * s, int a, int b, int c) {
+    printf("\nError %s: %i %i %i\n", s, a, b, c);
+    exit (1);
+}
+
+void error(const char * s, int i, int a, int b, int c) {
+    printf("\nError %s: %i %i %i %i\n", s, i, a, b, c);
+    exit (1);
+}
+
+int main () {
+
+    int ao, bo, os, len;
+    int version;
+    const int pagesize = 0x1000;  // 4 kbytes
+    const int n = 16*pagesize;
+    char a[n], b[n], c[n];
+    int instrset = InstructionSet();
+
+    // CacheBypassLimit = 5;
+    printf("\nmemcpy cache limit = 0x%X, memset cache limit 0x%X\n", 
+        (int)GetMemcpyCacheLimit(), (int)GetMemsetCacheLimit());
+
+    printf("\nTest memcpy");
+
+    int i, x = 91;
+    for (i=0; i<n; i++) {
+        x += 23;
+        a[i] = (char)x;
+    }
+
+    A_memset(b, -1, n);
+
+    SetMemcpyCacheLimit(0);  // default
+
+#if 1 
+    // Test memcpy for correctness
+    // Loop through versions
+    for (version = 0; version < NUMFUNC; version++) {
+
+        printf("\n%s", DispatchNames[version]);
+        if (instrset < isetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+
+        for (len=0; len<514; len++) {
+            for (ao = 0; ao <=20; ao++) {
+                for (bo = 0; bo <=32; bo++) {
+                    A_memset(b, -1, len+96);
+                    (*memcpyTab[version])(b+bo, a+ao, len);
+                    if (bo && b[bo-1] != -1) error("A", ao, bo, len);
+                    if (b[bo+len] != -1) error("B", ao, bo, len);
+                    if (len==0) continue;
+                    if (b[bo] != a[ao]) error("C", ao, bo, len);
+                    if (b[bo+len-1] != a[ao+len-1]) error("D", ao, bo, len);
+                    if (memcmp(b+bo, a+ao, len)) error("E", ao, bo, len);
+                }
+            }
+        }
+        // check false memory dependence branches
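+        // ("false dependence": source and destination addresses that alias in
+        // their low bits may look overlapping to the copy routine; the offsets
+        // below straddle a 4 kbyte page boundary to exercise those branches)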
+        len = 300;
+        A_memcpy(b, a, 3*pagesize);
+        for (ao = pagesize-300; ao < pagesize+200; ao++) {
+            for (bo = 3*pagesize; bo <=3*pagesize+33; bo++) {
+                A_memset(b+bo-64, -1, len+128);
+                (*memcpyTab[version])(b+bo, b+ao, len);
+                if (b[bo-1] != -1) error("A1", ao, bo, len);
+                if (b[bo+len] != -1) error("B1", ao, bo, len);
+                if (memcmp(b+bo, b+ao, len)) error("E1", ao, bo, len);
+            }
+        }
+        // check false memory dependence branches with overlap
+        // src > dest and overlap: must copy forwards
+        len = pagesize+1000;
+        for (ao = 2*pagesize; ao <=2*pagesize+33; ao++) {
+            for (bo = pagesize-200; bo < pagesize+300; bo++) {
+                A_memcpy(b, a, 4*pagesize);
+                A_memcpy(c, a, 4*pagesize);
+                (*memcpyTab[version])(b+bo, b+ao, len);
+                //memcpy(c+bo, c+ao, len);  // Most library versions of memcpy are actually memmove
+                memcpySSE2(c+bo, c+ao, len);            
+                if (memcmp(b, c, 4*pagesize)) {
+                    error("E2", ao-pagesize, bo-2*pagesize, len);
+                }
+            }
+        }
+        // check false memory dependence branches with overlap
+        // dest > src and overlap: undefined behavior
+#if 1
+        len = pagesize+1000;
+        for (ao = pagesize-200; ao < pagesize+200; ao++) {
+            for (bo = 2*pagesize; bo <=2*pagesize+33; bo++) {
+                A_memcpy(b, a, 4*pagesize);
+                A_memcpy(c, a, 4*pagesize);
+                (*memcpyTab[version])(b+bo, b+ao, len);
+                //memcpy(c+bo, c+ao, len);  // Most library versions of memcpy are actually memmove
+                memcpySSE2(c+bo, c+ao, len);
+                if (memcmp(b, c, 4*pagesize)) {
+                    error("E3", ao-pagesize, bo-2*pagesize, len);
+                }
+            }
+        }
+#endif
+    }
+    printf("\n\nTest memmove");
+
+    // Test memmove for correctness
+    for (i=0; i<n; i++) {
+        x += 23;
+        a[i] = char(x);
+    }
+
+    // Loop through versions
+    for (version = 0; version < NUMFUNC; version++) {
+        printf("\n%s", DispatchNames[version]);
+        if (instrset < isetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+
+        // move forward
+        for (len=0; len<400; len++) {
+            for (bo = 0; bo <=33; bo++) {
+                for (os = 0; os <= 33; os++) {
+                    A_memcpy(b, a, len+100);
+                    (*memmoveTab[version])(b+bo+os, b+bo, len);
+                    for (i=0; i<bo+os; i++) if (b[i]!=a[i]) error("E", i, bo, os, len);
+                    for (i=bo+os; i<bo+os+len; i++) if (b[i] != a[i-os]) error("F", i, bo, os, len);
+                    for (;i < bo+os+len+20; i++) if (b[i]!=a[i]) error("G", i, bo, os, len);
+                }
+            }
+        }
+        // move backwards
+        for (len=0; len<400; len++) {
+            for (bo = 0; bo <=33; bo++) {
+                for (os = 0; os < 33; os++) {
+                    A_memcpy(b, a, len+96);
+                    (*memmoveTab[version])(b+bo, b+bo+os, len);
+                    for (i=0; i<bo; i++) if (b[i]!=a[i]) error("H", i, bo, os, len);
+                    for (i=bo; i<bo+len; i++) if (b[i] != a[i+os]) error("I", i, bo, os, len);
+                    for (;i < bo+len+20; i++) if (b[i]!=a[i]) error("J", i, bo, os, len);
+                }
+            }
+        }
+    }
+
+    printf("\n\nSame, with non-temporal moves");
+    SetMemcpyCacheLimit(1); // bypass cache
+
+    // Loop through versions
+    for (version = 0; version < NUMFUNC; version++) {
+
+        printf("\n%s", DispatchNames[version]);
+        if (instrset < isetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+
+        for (len=0; len<514; len++) {
+            for (ao = 0; ao <=20; ao++) {
+                for (bo = 0; bo <=32; bo++) {
+                    A_memset(b, -1, len+96);
+                    (*memcpyTab[version])(b+bo, a+ao, len);
+                    if (bo && b[bo-1] != -1) error("A", ao, bo, len);
+                    if (b[bo+len] != -1) error("B", ao, bo, len);
+                    if (len==0) continue;
+                    if (b[bo] != a[ao]) error("C", ao, bo, len);
+                    if (b[bo+len-1] != a[ao+len-1]) error("D", ao, bo, len);
+                    if (memcmp(b+bo, a+ao, len)) error("E", ao, bo, len);
+                }
+            }
+        }
+        // check false memory dependence branches
+        len = 300;
+        A_memcpy(b, a, 3*pagesize);
+        for (ao = pagesize-200; ao < pagesize+200; ao++) {
+            for (bo = 3*pagesize; bo <=3*pagesize+33; bo++) {
+                A_memset(b+bo-64, -1, len+128);
+                (*memcpyTab[version])(b+bo, b+ao, len);
+                if (b[bo-1] != -1) error("A1", ao, bo, len);
+                if (b[bo+len] != -1) error("B1", ao, bo, len);
+                if (memcmp(b+bo, b+ao, len)) error("E1", ao, bo, len);
+            }
+        }
+        // check false memory dependence branches with overlap
+        // src > dest and overlap: must copy forwards
+        len = pagesize+1000;
+        for (ao = 2*pagesize; ao <=2*pagesize+33; ao++) {
+            for (bo = pagesize-200; bo < pagesize+200; bo++) {
+                A_memcpy(b, a, 4*pagesize);
+                A_memcpy(c, a, 4*pagesize);
+                (*memcpyTab[version])(b+bo, b+ao, len);
+                //memcpy(c+bo, c+ao, len);  // Most library versions of memcpy are actually memmove
+                memcpySSE2(c+bo, c+ao, len);            
+                if (memcmp(b, c, 4*pagesize)) {
+                    error("E2", ao-pagesize, bo-2*pagesize, len);
+                }
+            }
+        }
+        // (check false memory dependence branches with overlap. skipped)
+    }
+    printf("\n\nTest memmove");
+
+    // Test memmove for correctness
+    for (i=0; i<n; i++) {
+        x += 23;
+        a[i] = char(x);
+    }
+
+    // Loop through versions
+    for (version = 0; version < NUMFUNC; version++) {
+        printf("\n%s", DispatchNames[version]);
+        if (instrset < isetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+
+        // move forward
+        for (len=0; len<400; len++) {
+            for (bo = 0; bo <=33; bo++) {
+                for (os = 0; os <= 33; os++) {
+                    A_memcpy(b, a, len+100);
+                    (*memmoveTab[version])(b+bo+os, b+bo, len);
+                    for (i=0; i<bo+os; i++) if (b[i]!=a[i]) error("E", i, bo, os, len);
+                    for (i=bo+os; i<bo+os+len; i++) if (b[i] != a[i-os]) error("F", i, bo, os, len);
+                    for (;i < bo+os+len+20; i++) if (b[i]!=a[i]) error("G", i, bo, os, len);
+                }
+            }
+        }
+        // move backwards
+        for (len=0; len<400; len++) {
+            for (bo = 0; bo <=33; bo++) {
+                for (os = 0; os < 33; os++) {
+                    A_memcpy(b, a, len+96);
+                    (*memmoveTab[version])(b+bo, b+bo+os, len);
+                    for (i=0; i<bo; i++) if (b[i]!=a[i]) error("H", i, bo, os, len);
+                    for (i=bo; i<bo+len; i++) if (b[i] != a[i+os]) error("I", i, bo, os, len);
+                    for (;i < bo+len+20; i++) if (b[i]!=a[i]) error("J", i, bo, os, len);
+                }
+            }
+        }
+    }
+#endif
+    SetMemcpyCacheLimit(0);  // back to default
+    SetMemsetCacheLimit(0);
+
+    printf("\n\nTest memset");
+
+    // test memset
+    const int val1 = 0x4C, val2 = 0xA2, len2 = 1024;
+    for (version = 0; version < MEMSETFUNCS; version++) {
+        memsetF * func = memsetTab[version];
+        printf("\n%s", memsetNames[version]);
+        if (instrset < memsetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+        for (os = 0; os < 34; os++) {
+            for (len = 0; len < 500; len++) {
+                memset(a, val1, len2);
+                memset(a+os, val2, len);
+                (*func)(b, val1, len2);
+                (*func)(b+os, val2, len);
+                if (memcmp(a, b, len2)) {
+                    error("MS", version, os, len);
+                }
+            }
+        }
+        for (len=0; len<200; len++) {
+            for (os = 0; os <= 33; os++) {
+                A_memcpy(b, a, len+64);
+                A_memset(b+os, 55, len);
+                for (i=0; i<os; i++) if (b[i] != a[i]) error("K", i, os, len);
+                for (; i<os+len; i++) if (b[i] != 55) error("L", i, os, len);
+                for (; i<os+len+17; i++) if (b[i] != a[i]) error("M", i, os, len);
+            }
+        }
+    }
+
+    printf("\n\nSame, with non-temporal moves");
+    SetMemsetCacheLimit(1);   // bypass cache
+
+    for (version = 0; version < MEMSETFUNCS; version++) {
+        memsetF * func = memsetTab[version];
+        printf("\n%s", memsetNames[version]);
+        if (instrset < memsetreq[version]) {
+            // instruction set not supported
+            printf(" skipped"); continue;
+        }
+        for (os = 0; os < 34; os++) {
+            for (len = 0; len < 500; len++) {
+                memset(a, val1, len2);
+                memset(a+os, val2, len);
+                (*func)(b, val1, len2);
+                (*func)(b+os, val2, len);
+                if (memcmp(a, b, len2)) {
+                    error("MS", version, os, len);
+                }
+            }
+        }
+    }
+    SetMemsetCacheLimit(0);   // back to default
+
+    printf("\n\nTest strlen");
+
+    // test strlen
+    for (len=0; len<400; len++) {
+        for (os = 0; os <= 32; os++) {
+            A_memset(b, 0, len+64);
+            A_memset(b+os, 'a', len);
+            x = A_strlen(b+os);
+            if (x != len) error("N", 0, os, len);
+            A_memset(b, 1, len+64);
+            b[os+len] = 0;
+            x = A_strlen(b+os);
+            if (x != len) error("O", 0, os, len);
+        }
+    }
+
+    printf("\n\nTest strcpy and strcat");
+
+    // test strcpy and strcat
+    for (i=0; i<n; i++) {
+        x += 23;
+        a[i] = char(x) | 1;
+    }
+    for (len=0; len<400; len++) {
+        for (os = 0; os <= 16; os++) {
+            for (i=0; i<33; i++) {
+                A_memmove(b, a, len+64);
+                b[os+len] = 0;
+                A_strcpy(c+5, b+os);
+                if (A_strlen(c+5) != len) error("P", 0, os, len);
+                A_memmove(b+55, a, i+4);
+                b[55+i] = 0;
+                A_strcat(c+5, b+55);
+                if (A_strlen(c+5) != len+i) error("R", 0, os, len);
+            }
+        }
+    }
+    printf("\n\nSuccess\n");
+
+    return 0;
+}
diff --git a/asmlibSrc/testrandom.cpp b/asmlibSrc/testrandom.cpp
new file mode 100755
index 0000000..4cfc59b
--- /dev/null
+++ b/asmlibSrc/testrandom.cpp
@@ -0,0 +1,130 @@
+/*************************** testrandom.cpp **********************************
+* Author:        Agner Fog
+* Date created:  2013-09-09
+* Last modified: 2013-09-09
+* Project:       asmlib.zip
+* Source URL:    www.agner.org/optimize
+*
+* Description:
+* Test random number generators
+*
+* Instructions:
+* Compile for console mode and link with the appropriate version of asmlib
+*
+* Further documentation:
+* The file asmlib-instructions.pdf contains further documentation and 
+* instructions.
+*
+* Copyright 2007-2011 by Agner Fog. 
+* GNU General Public License http://www.gnu.org/licenses/gpl.html
+*****************************************************************************/
+
+#include <stdio.h>
+#include "asmlibran.h"
+#include "randomc.h"
+#include "sfmt.h"
+
+#include "mersenne.cpp"
+#include "mother.cpp"
+#include "sfmt.cpp"
+
+
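+// includeMother = 1: combine the SFMT generator with the Mother-of-All
+// generator; useInitByArray = 1: seed via RandomInitByArray rather than
+// a single integer seed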
+const int includeMother = 1;
+const int useInitByArray = 1;  
+
+
+
+int main () {
+    int i;
+    uint32_t a, b, c;
+    const int numseeds = 5;
+    int seeds[numseeds] = {1,2,3,4,5};
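+    // fetch one seed from a physical entropy source where available
+    // (PhysicalSeed presumably falls back to the time stamp counter)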
+    PhysicalSeed(seeds, 1);
+    printf("\nSeed: %08X\n", seeds[0]);
+
+    CRandomMersenneA mersa(0);
+    CRandomMersenne  mersc(0);
+    mersa.RandomInit(seeds[0]);
+    mersc.RandomInit(seeds[0]);
+    MersenneRandomInit(seeds[0]);
+
+    if (useInitByArray) {
+        mersa.RandomInitByArray(seeds, numseeds);
+        mersc.RandomInitByArray(seeds, numseeds);
+        MersenneRandomInitByArray(seeds, numseeds);
+    }
+
+    printf("\nMersenne:");
+    for (i=0; i<1000; i++) {
+        a = mersa.BRandom();
+        b = MersenneBRandom();
+        c = mersc.BRandom();
+        if (a != b || a != c) {
+            printf("\nerror: %08X %08X %08X", a, b, c);
+            break;
+        }
+        else if (i == 0 || i == 99) {
+            printf("\n %08X %08X %08X", a, b, c);
+        }
+    }        
+    printf("\n %8i %8i %8i", mersa.IRandom(0,9999), mersc.IRandom(0,9999), MersenneIRandom(0,9999));
+    printf("\n %8i %8i %8i", mersa.IRandomX(0,9999), mersc.IRandomX(0,9999), MersenneIRandomX(0,9999));
+    printf("\n %12.8f %12.8f %12.8f", mersa.Random(), mersc.Random(), MersenneRandom());
+
+
+    CRandomMotherA motha(0);
+    CRandomMother mothc(0);
+    motha.RandomInit(seeds[0]);
+    mothc.RandomInit(seeds[0]);
+    MotherRandomInit(seeds[0]);
+
+    printf("\n\nMother:");
+    for (i=0; i<1000; i++) {
+        a = motha.BRandom();
+        b = MotherBRandom();
+        c = mothc.BRandom();
+        if (a != b || a != c) {
+            printf("\nerror: %08X %08X %08X", a, b, c);
+            break;
+        }
+        else if (i == 0 || i == 99) {
+            printf("\n %08X %08X %08X", a, b, c);
+        }
+    }
+    printf("\n %8i %8i %8i", motha.IRandom(0,9999), mothc.IRandom(0,9999), MotherIRandom(0,9999));
+    printf("\n %12.8f %12.8f %12.8f", motha.Random(), mothc.Random(), MotherRandom());
+
+    CRandomSFMTA sfmta(0, includeMother);
+    CRandomSFMT sfmtc(0, includeMother);
+    sfmta.RandomInit(1, includeMother);
+    sfmtc.RandomInit(1);
+    SFMTgenRandomInit(1,includeMother);
+
+    if (useInitByArray) {
+        sfmta.RandomInitByArray(seeds, numseeds, includeMother);
+        sfmtc.RandomInitByArray(seeds, numseeds);
+        SFMTgenRandomInitByArray(seeds, numseeds, includeMother);
+    }
+
+    printf("\n\nSFMT:");
+    for (i=0; i<1000; i++) {
+        a = sfmta.BRandom();
+        b = SFMTgenBRandom();
+        c = sfmtc.BRandom();
+        if (a != b || a != c) {
+            printf("\nerror @%i: %08X %08X %08X", i, a, b, c);
+            break;
+        }
+        else if (i == 0 || i == 99) {
+            printf("\n %08X %08X %08X", a, b, c);
+        }
+    }        
+    printf("\n %8i %8i %8i", sfmta.IRandom(0,9999), sfmtc.IRandom(0,9999), SFMTgenIRandom(0,9999));
+    printf("\n %8i %8i %8i", sfmta.IRandomX(0,9999), sfmtc.IRandomX(0,9999), SFMTgenIRandomX(0,9999));
+    printf("\n %12.8f %12.8f %12.8f", sfmta.Random(), sfmtc.Random(), SFMTgenRandom());
+
+
+    printf("\n");
+
+    return 0;
+}
diff --git a/asmlibSrc/unalignedisfaster32.asm b/asmlibSrc/unalignedisfaster32.asm
new file mode 100755
index 0000000..aac78e8
--- /dev/null
+++ b/asmlibSrc/unalignedisfaster32.asm
@@ -0,0 +1,178 @@
+;*************************  unalignedisfaster32.asm  ******************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2013-08-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out whether an unaligned 16-byte memory read is
+; faster than an aligned read followed by an alignment shift (PALIGNR) on
+; the current CPU.
+;
+; Return value:
+; 0:   Unaligned read is probably slower than alignment shift
+; 1:   Unknown
+; 2:   Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out whether a 32-byte memory write is
+; faster than two 16-byte writes on the current CPU.
+;
+; Return value:
+; 0:   32-byte memory write is slower, or AVX not supported
+; 1:   Unknown
+; 2:   32-byte memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
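+; A minimal C++ usage sketch (illustrative only; the copy_* and store_*
+; names are hypothetical, not part of asmlib):
+;
+;     copyFn  = (UnalignedIsFaster() == 0) ? copy_palignr : copy_unaligned;
+;     storeFn = (Store256BitIsFaster() == 2) ? store_avx256 : store_sse2;
+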
+global _UnalignedIsFaster: function
+global _Store256BitIsFaster: function
+extern _CpuType
+extern _InstructionSet
+
+
+SECTION .text
+
+_UnalignedIsFaster:
+        push    ebx
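+        ; reserve three stack dwords for vendor, family and model, then
+        ; push their addresses right to left for the cdecl call
+        ; _CpuType(&vendor, &family, &model)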
+        push    0                      ; vendor
+        push    0                      ; family 
+        push    0                      ; model
+        mov     eax, esp
+        push    eax                    ; &model
+        add     eax, 4
+        push    eax                    ; &family
+        add     eax, 4
+        push    eax                    ; &vendor
+        call    _CpuType               ; get vendor, family, model
+        add     esp, 12
+        pop     edx                    ; model
+        pop     ecx                    ; family
+        pop     ebx                    ; vendor
+        xor     eax, eax               ; return value
+        dec     ebx
+        jz      Intel
+        dec     ebx
+        jz      AMD
+        dec     ebx
+        jz      VIA
+        ; unknown vendor
+        inc     eax
+        jmp     Uend
+        
+Intel:  ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+        ; Nehalem  = family 6, model 1AH
+        ; Atom     = family 6, model 1CH
+        ; Netburst = family 0FH
+        ; Future models are likely to be family 6, maybe > 6, model > 1CH
+        cmp     ecx, 6
+        jb      Uend                   ; old Pentium 1, etc
+        cmp     ecx, 0FH
+        je      Uend                   ; old Netburst architecture
+        cmp     edx, 1AH
+        jb      Uend                   ; earlier than Nehalem
+        cmp     edx, 1CH
+        je      Uend                   ; Intel Atom
+        or      eax, 2                 ; Intel Nehalem and later, except Atom
+        jmp     Uend
+        
+AMD:    ; AMD processors:
+        ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+        ; K10/Opteron = family 10H     ; Use unaligned
+        ; Bobcat = family 14H          ; PALIGNR is very slow. Use unaligned
+        ; Piledriver = family 15H      ; Use unaligned
+        ; Jaguar = family 16H          ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+        cmp     ecx, 10H               ; AMD K8 or earlier: use aligned
+        jb      Uend    
+        cmp     ecx, 16H               ; Jaguar: use aligned
+        je      Uend
+        or      eax, 2                 ; AMD K10 or later: use unaligned
+        jmp     Uend
+        
+        
+VIA:    ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000                
+        cmp     ecx, 0FH
+        jna     Uend                   ; VIA Nano
+        inc     eax                    ; Future versions: unknown
+       ;jmp     Uend
+        
+Uend:   pop     ebx
+        ret
+
+;_UnalignedIsFaster ENDP
+
+
+_Store256BitIsFaster:
+        call    _InstructionSet
+        cmp     eax, 11                ; AVX supported
+        jb      S90        
+        push    0                      ; vendor
+        push    0                      ; family 
+        push    0                      ; model
+        mov     eax, esp
+        push    eax                    ; &model
+        add     eax, 4
+        push    eax                    ; &family
+        add     eax, 4
+        push    eax                    ; &vendor
+        call    _CpuType               ; get vendor, family, model
+        add     esp, 12
+        pop     edx                    ; model
+        pop     ecx                    ; family
+        pop     eax                    ; vendor
+        
+        cmp     eax, 1                 ; Intel
+        je      S_Intel
+        cmp     eax, 2                 ; AMD
+        je      S_AMD
+        cmp     eax, 3                 ; VIA
+        je      S_VIA        
+        jmp     S91                    ; other vendor, not known
+
+S_Intel:cmp     ecx, 6
+        jne     S92                    ; unknown family, possibly a future model
+        ; model 2AH Sandy Bridge
+        ; model 3AH Ivy Bridge
+        ; model 3CH Haswell
+        ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+        ; Haswell is much faster with 256 bit moves
+        cmp     edx, 3AH
+        jbe     S90
+        jmp     S92        
+
+S_AMD:  ; AMD
+        cmp     ecx, 15H               ; family 15h = Bulldozer, Piledriver
+        ja      S92                    ; assume future AMD families are faster
+        ; model 1 = Bulldozer is a little slower on 256 bit write
+        ; model 2 = Piledriver is terribly slow on 256 bit write
+        ; assume future models 3-4 are like Bulldozer
+        cmp     edx, 4
+        jbe     S90
+        jmp     S91                    ; later models: don't know
+        
+S_VIA:  jmp     S91                    ; don't know
+        
+S90:    xor     eax, eax               ; return 0
+        ret
+        
+S91:    mov     eax, 1                 ; return 1
+        ret        
+        
+S92:    mov     eax, 2                 ; return 2
+        ret        
+        
+; _Store256BitIsFaster ENDP
+
diff --git a/asmlibSrc/unalignedisfaster64.asm b/asmlibSrc/unalignedisfaster64.asm
new file mode 100755
index 0000000..c6a5ac9
--- /dev/null
+++ b/asmlibSrc/unalignedisfaster64.asm
@@ -0,0 +1,186 @@
+;*************************  unalignedisfaster64.asm  ******************************
+; Author:           Agner Fog
+; Date created:     2011-07-09
+; Last modified:    2013-08-30
+; Source URL:       www.agner.org/optimize
+; Project:          asmlib.zip
+; Language:         assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out whether an unaligned 16-byte memory read is
+; faster than an aligned read followed by an alignment shift (PALIGNR) on
+; the current CPU.
+;
+; Return value:
+; 0:   Unaligned read is probably slower than alignment shift
+; 1:   Unknown or equal
+; 2:   Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out whether a 32-byte memory write is
+; faster than two 16-byte writes on the current CPU.
+;
+; Return value:
+; 0:   32-byte memory write is slower, or AVX not supported
+; 1:   Unknown
+; 2:   32-byte memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
+global UnalignedIsFaster: function
+global Store256BitIsFaster: function
+extern CpuType
+extern InstructionSet
+
+
+SECTION .text
+
+UnalignedIsFaster:
+
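+; The pushes below reserve stack slots for vendor, family and model; their
+; addresses go in the first three argument registers, following the
+; System V (rdi, rsi, rdx) or Microsoft x64 (rcx, rdx, r8) convention,
+; for the call CpuType(&vendor, &family, &model)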
+%ifdef  UNIX
+        push    0                      ; vendor
+        mov     rdi, rsp
+        push    0                      ; family 
+        mov     rsi, rsp
+        push    0                      ; model
+        mov     rdx, rsp 
+%else   ; WINDOWS
+        push    0                      ; vendor
+        mov     rcx, rsp
+        push    0                      ; family 
+        mov     rdx, rsp
+        push    0                      ; model
+        mov     r8,  rsp 
+%endif
+        call    CpuType                ; get vendor, family, model
+        pop     rdx                    ; model
+        pop     rcx                    ; family
+        pop     r8                     ; vendor
+        xor     eax, eax               ; return value
+        dec     r8d
+        jz      Intel
+        dec     r8d
+        jz      AMD
+        dec     r8d
+        jz      VIA
+        ; unknown vendor
+        inc     eax
+        jmp     Uend
+        
+Intel:  ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+        ; Nehalem  = family 6, model 1AH
+        ; Atom     = family 6, model 1CH
+        ; Netburst = family 0FH
+        ; Future models are likely to be family 6, maybe > 6, model > 1CH
+        cmp     ecx, 6
+        jb      Uend                   ; old Pentium 1, etc
+        cmp     ecx, 0FH
+        je      Uend                   ; old Netburst architecture
+        cmp     edx, 1AH
+        jb      Uend                   ; earlier than Nehalem
+        cmp     edx, 1CH
+        je      Uend                   ; Intel Atom
+        or      eax, 2                 ; Intel Nehalem and later, except Atom
+        jmp     Uend
+        
+AMD:    ; AMD processors:
+        ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+        ; K10/Opteron = family 10H     ; Use unaligned
+        ; Bobcat = family 14H          ; PALIGNR is very slow. Use unaligned
+        ; Piledriver = family 15H      ; Use unaligned
+        ; Jaguar = family 16H          ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+        cmp     ecx, 10H               ; AMD K8 or earlier: use aligned
+        jb      Uend    
+        cmp     ecx, 16H               ; Jaguar: use aligned
+        je      Uend
+        or      eax, 2                 ; AMD K10 or later: use unaligned
+        jmp     Uend
+        
+VIA:    ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000                
+        cmp     ecx, 0FH
+        jna     Uend                   ; VIA Nano
+        inc     eax                    ; Future versions: unknown
+       ;jmp     Uend
+        
+Uend:   ret
+
+;UnalignedIsFaster ENDP
+
+
+Store256BitIsFaster:
+        call    InstructionSet
+        cmp     eax, 11                ; AVX supported
+        jb      S90
+%ifdef  UNIX
+        push    0                      ; vendor
+        mov     rdi, rsp
+        push    0                      ; family 
+        mov     rsi, rsp
+        push    0                      ; model
+        mov     rdx, rsp 
+%else   ; WINDOWS
+        push    0                      ; vendor
+        mov     rcx, rsp
+        push    0                      ; family 
+        mov     rdx, rsp
+        push    0                      ; model
+        mov     r8,  rsp 
+%endif
+        call    CpuType                ; get vendor, family, model
+        pop     rdx                    ; model
+        pop     rcx                    ; family
+        pop     rax                    ; vendor
+
+        cmp     eax, 1                 ; Intel
+        je      S_Intel
+        cmp     eax, 2                 ; AMD
+        je      S_AMD
+        cmp     eax, 3
+        je      S_VIA        
+        jmp     S91                    ; other vendor, not known
+        
+S_Intel:cmp     ecx, 6
+        jne     S92                    ; unknown family, possibly a future model
+        ; model 2AH Sandy Bridge
+        ; model 3AH Ivy Bridge
+        ; model 3CH Haswell
+        ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+        ; Haswell is much faster with 256 bit moves
+        cmp     edx, 3AH
+        jbe     S90
+        jmp     S92        
+
+S_AMD:  ; AMD
+        cmp     ecx, 15H               ; family 15h = Bulldozer, Piledriver
+        ja      S92                    ; assume future AMD families are faster
+                                       ; family 16H = Jaguar. 256 bit write is slightly faster
+        ; model 1 = Bulldozer is a little slower on 256 bit write
+        ; model 2 = Piledriver is terribly slow on 256 bit write
+        ; model 30h = Steamroller is reasonable on 256 bit write
+        cmp     edx, 30h
+        jb      S90
+        jmp     S91                    ; Steamroller: moderate
+        
+S_VIA:  jmp     S91                    ; don't know
+        
+S90:    xor     eax, eax               ; return 0
+        ret
+        
+S91:    mov     eax, 1                 ; return 1
+        ret        
+        
+S92:    mov     eax, 2                 ; return 2
+        ret        
+        
+; Store256BitIsFaster ENDP

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/asmlib.git


