[Pkg-ofed-commits] [libpsm2] 01/04: New upstream version 10.3-8
Brian Smith
bsmith-guest at moszumanska.debian.org
Tue Nov 21 22:12:10 UTC 2017
This is an automated email from the git hooks/post-receive script.
bsmith-guest pushed a commit to branch master
in repository libpsm2.
commit 490810c2edfbe67631d8c51588716b4a50ddeee8
Author: Brian T. Smith <bsmith at systemfabricworks.com>
Date: Tue Nov 21 13:10:57 2017 -0600
New upstream version 10.3-8
---
40-psm.rules | 52 +
COMMIT | 1 +
COPYING | 376 +++++
Makefile | 511 ++++++
README | 300 ++++
buildflags.mak | 210 +++
compat/40-psm-compat.rules | 52 +
compat/Makefile | 90 ++
compat/buildflags.mak | 103 ++
compat/libpsm2-compat.cmds | 70 +
compat/libpsm2-compat.conf | 52 +
compat/psm-compat.c | 335 ++++
compat/psm2_compat_linker_script.map | 66 +
include/common_defines.h | 176 ++
include/hfi1_deprecated.h | 181 +++
include/linux-i386/bit_ops.h | 98 ++
include/linux-i386/sysdep.h | 171 ++
include/opa_byteorder.h | 264 +++
include/opa_common.h | 62 +
include/opa_debug.h | 108 ++
include/opa_intf.h | 90 ++
include/opa_queue.h | 512 ++++++
include/opa_revision.h | 64 +
include/opa_service.h | 268 +++
include/opa_udebug.h | 194 +++
include/opa_user.h | 973 +++++++++++
include/psm2_mock_testing.h | 176 ++
include/rbtree.c | 692 ++++++++
include/rbtree.h | 90 ++
libpsm2.spec.in | 177 ++
libuuid/Makefile | 92 ++
libuuid/compare.c | 53 +
libuuid/pack.c | 69 +
libuuid/parse.c | 78 +
libuuid/psm_uuid.c | 114 ++
libuuid/psm_uuid.h | 78 +
libuuid/unpack.c | 63 +
libuuid/unparse.c | 75 +
makesdeb.sh | 105 ++
makesrpm.sh | 145 ++
mpspawn/mpspawn_stats.h | 132 ++
opa/Makefile | 113 ++
opa/opa_debug.c | 364 +++++
opa/opa_dwordcpy-generic.c | 298 ++++
opa/opa_dwordcpy-i386.S | 84 +
opa/opa_dwordcpy-x86_64-fast.S | 77 +
opa/opa_dwordcpy-x86_64.c | 298 ++++
opa/opa_i2cflash.c | 87 +
opa/opa_proto.c | 578 +++++++
opa/opa_service.c | 909 +++++++++++
opa/opa_sysfs.c | 854 ++++++++++
opa/opa_syslog.c | 113 ++
opa/opa_time.c | 284 ++++
opa/opa_utils.c | 425 +++++
opa/opa_write_pio-i386.c | 305 ++++
opa/opa_write_pio-x86_64.c | 296 ++++
psm.c | 732 +++++++++
psm2.h | 1517 +++++++++++++++++
psm2_am.h | 411 +++++
psm2_linker_script.map | 93 ++
psm2_linker_script_map.in | 95 ++
psm2_mq.h | 1403 ++++++++++++++++
psm_am.c | 269 ++++
psm_am_internal.h | 93 ++
psm_context.c | 817 ++++++++++
psm_context.h | 102 ++
psm_diags.c | 362 +++++
psm_ep.c | 1527 ++++++++++++++++++
psm_ep.h | 245 +++
psm_ep_connect.c | 620 +++++++
psm_error.c | 348 ++++
psm_error.h | 78 +
psm_help.h | 190 +++
psm_lock.h | 142 ++
psm_log.h | 224 +++
psm_memcpy.c | 67 +
psm_mock.c | 90 ++
psm_mpool.c | 588 +++++++
psm_mpool.h | 107 ++
psm_mq.c | 1433 ++++++++++++++++
psm_mq_internal.h | 639 ++++++++
psm_mq_recv.c | 593 +++++++
psm_mq_utils.c | 273 ++++
psm_perf.c | 246 +++
psm_perf.h | 142 ++
psm_stats.c | 664 ++++++++
psm_stats.h | 120 ++
psm_sysbuf.c | 234 +++
psm_sysbuf.h | 81 +
psm_timer.c | 198 +++
psm_timer.h | 164 ++
psm_user.h | 500 ++++++
psm_utils.c | 2553 +++++++++++++++++++++++++++++
psm_utils.h | 379 +++++
psmi_wrappers.c | 94 ++
psmi_wrappers.h | 98 ++
ptl.h | 211 +++
ptl_am/Makefile | 91 ++
ptl_am/am_cuda_memhandle_cache.c | 316 ++++
ptl_am/am_cuda_memhandle_cache.h | 124 ++
ptl_am/am_reqrep.c | 118 ++
ptl_am/am_reqrep_shmem.c | 2590 +++++++++++++++++++++++++++++
ptl_am/cmarw.h | 73 +
ptl_am/cmarwu.c | 207 +++
ptl_am/psm_am_internal.h | 466 ++++++
ptl_am/ptl.c | 364 +++++
ptl_am/ptl_fwd.h | 64 +
ptl_ips/Makefile | 96 ++
ptl_ips/ips_crc32.c | 91 ++
ptl_ips/ips_epstate.c | 154 ++
ptl_ips/ips_epstate.h | 100 ++
ptl_ips/ips_expected_proto.h | 397 +++++
ptl_ips/ips_opp_path_rec.c | 602 +++++++
ptl_ips/ips_path_rec.c | 791 +++++++++
ptl_ips/ips_path_rec.h | 185 +++
ptl_ips/ips_proto.c | 2348 +++++++++++++++++++++++++++
ptl_ips/ips_proto.h | 687 ++++++++
ptl_ips/ips_proto_am.c | 595 +++++++
ptl_ips/ips_proto_am.h | 93 ++
ptl_ips/ips_proto_connect.c | 1551 ++++++++++++++++++
ptl_ips/ips_proto_dump.c | 255 +++
ptl_ips/ips_proto_expected.c | 2957 ++++++++++++++++++++++++++++++++++
ptl_ips/ips_proto_header.h | 181 +++
ptl_ips/ips_proto_help.h | 705 ++++++++
ptl_ips/ips_proto_internal.h | 96 ++
ptl_ips/ips_proto_mq.c | 1733 ++++++++++++++++++++
ptl_ips/ips_proto_params.h | 264 +++
ptl_ips/ips_proto_recv.c | 1447 +++++++++++++++++
ptl_ips/ips_recvhdrq.c | 869 ++++++++++
ptl_ips/ips_recvhdrq.h | 240 +++
ptl_ips/ips_recvq.c | 91 ++
ptl_ips/ips_recvq.h | 124 ++
ptl_ips/ips_scb.c | 364 +++++
ptl_ips/ips_scb.h | 226 +++
ptl_ips/ips_spio.c | 951 +++++++++++
ptl_ips/ips_spio.h | 189 +++
ptl_ips/ips_stats.h | 83 +
ptl_ips/ips_subcontext.c | 97 ++
ptl_ips/ips_subcontext.h | 81 +
ptl_ips/ips_tid.c | 278 ++++
ptl_ips/ips_tid.h | 169 ++
ptl_ips/ips_tidcache.c | 653 ++++++++
ptl_ips/ips_tidcache.h | 158 ++
ptl_ips/ips_tidflow.c | 267 +++
ptl_ips/ips_tidflow.h | 133 ++
ptl_ips/ips_writehdrq.c | 110 ++
ptl_ips/ips_writehdrq.h | 269 ++++
ptl_ips/ipserror.c | 200 +++
ptl_ips/ipserror.h | 122 ++
ptl_ips/ptl.c | 950 +++++++++++
ptl_ips/ptl_fwd.h | 65 +
ptl_ips/ptl_ips.h | 194 +++
ptl_ips/ptl_rcvthread.c | 506 ++++++
ptl_self/Makefile | 90 ++
ptl_self/ptl.c | 394 +++++
ptl_self/ptl_fwd.h | 62 +
rpm_release_extension | 1 +
157 files changed, 59022 insertions(+)
diff --git a/40-psm.rules b/40-psm.rules
new file mode 100644
index 0000000..ba8d494
--- /dev/null
+++ b/40-psm.rules
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+KERNEL=="hfi1", MODE="0666"
+KERNEL=="hfi1_[0-9]", MODE="0666"
diff --git a/COMMIT b/COMMIT
new file mode 100644
index 0000000..b6b4b33
--- /dev/null
+++ b/COMMIT
@@ -0,0 +1 @@
+6ca1de91a1ee2604096449942bbed93e0ad9311e
\ No newline at end of file
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..ea3d558
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,376 @@
+This software is available to you under a choice of one of two
+licenses. You may choose to be licensed under the terms of the
+BSD license or the GNU General Public License (GPL) Version
+2, both included below.
+
+Copyright(c) 2016 Intel Corporation. All rights reserved.
+
+==================================================================
+ BSD Simplified License
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+==================================================================
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f0a539d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,511 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+
+OPTIONS =
+HISTORY = .outdirs
+HISTORIC_TARGETS = $(patsubst %, %_clean, $(shell cat $(HISTORY) 2> /dev/null))
+
+RPM_NAME := libpsm2
+
+SUBDIRS:= ptl_self ptl_ips ptl_am libuuid opa
+top_srcdir := $(shell readlink -m .)
+
+# Default locations
+OUTDIR := $(top_srcdir)/build_release
+MOCK_OUTDIR := $(top_srcdir)/build_mock
+DEBUG_OUTDIR := $(top_srcdir)/build_debug
+
+# We need a temporary test variable, as the OUTDIR macro
+# can be overriden by the shell and thus not run.
+TESTOUTDIR= $(shell readlink -m $(OUTDIR))
+ifeq ($(top_srcdir), $(TESTOUTDIR))
+$(error OUTDIR cannot be the same as your source folder ${top_srcdir})
+endif
+
+ifeq (/,$(TESTOUTDIR))
+$(error OUTDIR cannot be the / folder)
+endif
+
+# Forces any value to be full path.
+# We don't need to override MOCK_OUTDIR or DEBUG_OUTDIR
+# as they are recursive make invocations and use OUTDIR
+ifneq ($(MAKECMDGOALS), mock)
+ifneq ($(MAKECMDGOALS), debug)
+override OUTDIR := $(shell readlink -m $(OUTDIR))
+endif
+endif
+
+LINKER_SCRIPT_FILE := ${OUTDIR}/psm2_linker_script.map
+
+PSM2_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_LIB_MAJOR := $(shell printf "%d" ${PSM2_VERNO_MAJOR})
+PSM2_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM2_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h`)
+SOURCES_CHKSUM_FILES = Makefile buildflags.mak $(LINKER_SCRIPT_FILE) \
+ `find . -regex '\(.*\.h\|.*\.c\)' -not -path "./test/*" -not -path "./tools/*" -not -path "_revision.c" | sort`
+SOURCES_CHKSUM_VALUE = $(shell cat ${SOURCES_CHKSUM_FILES} | sha1sum | cut -d' ' -f 1)
+
+OPA_LIB_MAJOR := 4
+OPA_LIB_MINOR := 0
+
+export PSM2_VERNO_MAJOR
+export PSM2_LIB_MAJOR
+export PSM2_VERNO_MINOR
+export PSM2_LIB_MINOR
+export OPA_LIB_MAJOR
+export OPA_LIB_MINOR
+export CCARCH ?= gcc
+export FCARCH ?= gfortran
+
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+ifneq (x86_64,$(arch))
+ ifneq (i386,$(arch))
+ $(error Unsupported architecture $(arch))
+ endif
+endif
+
+ifndef LIBDIR
+ ifeq (${arch},x86_64)
+ INSTALL_LIB_TARG=/usr/lib64
+ else
+ INSTALL_LIB_TARG=/usr/lib
+ endif
+else
+ INSTALL_LIB_TARG=${LIBDIR}
+endif
+export DESTDIR
+export INSTALL_LIB_TARG
+
+TARGLIB := libpsm2
+COMPATMAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' \
+ $(top_srcdir)/psm2.h)
+COMPATLIB := libpsm_infinipath
+
+MAJOR := $(PSM2_LIB_MAJOR)
+MINOR := $(PSM2_LIB_MINOR)
+
+nthreads := $(shell echo $$(( `nproc` * 2 )) )
+
+# The following line sets the DISTRO variable to:
+# 'rhel' if the host is running RHEL.
+# 'suse' if the host is running SUSE.
+# 'fedora' if the host is running Fedora.
+# 'ubuntu' if the host is running Ubuntu.
+#
+# The DISTRO variable is used subsequently for variable
+# behaviors of the 3 distros.
+
+DISTRO := $(shell . /etc/os-release; echo $$ID)
+
+# By default the following two variables have the following values:
+LIBPSM2_COMPAT_CONF_DIR := /etc
+LIBPSM2_COMPAT_SYM_CONF_DIR := /etc
+# We can't set SPEC_FILE_RELEASE_DIST to an empty value, a space will result.
+# It then messes up sed operations for PSM_CUDA=1.
+# So leaving the commented out line here as documentation to NOT set it.
+# SPEC_FILE_RELEASE_DIST :=
+UDEV_40_PSM_RULES := %{_udevrulesdir}/40-psm.rules
+
+ifeq (fedora,$(DISTRO))
+ # On Fedora, we change these two variables to these values:
+ LIBPSM2_COMPAT_CONF_DIR := /usr/lib
+ LIBPSM2_COMPAT_SYM_CONF_DIR := %{_prefix}/lib
+ SPEC_FILE_RELEASE_DIST := %{?dist}
+ UDEV_40_PSM_RULES :=#
+else ifeq (rhel,${DISTRO})
+ # Insert code specific to RHEL here.
+else ifeq (sles,${DISTRO})
+ # Insert code specific to SLES here.
+endif
+
+ifdef PSM_CUDA
+#Value needs to be something without spaces or dashes '-'
+SPEC_FILE_RELEASE_DIST += cuda
+endif
+
+export LIBPSM2_COMPAT_CONF_DIR
+
+# The desired version number comes from the most recent tag starting with "v"
+ifeq (true, $(shell git rev-parse --is-inside-work-tree))
+ISGIT := 1 # Cache the result for later
+# Note, we don't define ISGIT if we are not in a git folder
+VERSION := $(shell git describe --tags --abbrev=0 --match='psm-v*' | sed -e 's/^psm-v//' -e 's/-/_/')
+else
+VERSION := version
+endif
+
+# If we have a file called 'rpm_release_extension' (as on github),
+# we take the release extension number from this file
+RELEASE_EXT := $(shell if [ -e rpm_release_extension ] ; then cat rpm_release_extension; fi)
+CURRENTSHA := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --pretty=format:'%h' -n 1; fi)
+RPMEXTHASH := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --pretty=format:'%h' -n 1 rpm_release_extension; fi)
+
+# On github, the last commit for each release should be the one to bump up
+# the release extension number in 'rpm_release_extension'. Further commits
+# are counted here and appended to the final rpm name to distinguish commits
+# present only on github
+NCOMMITS := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --children $(RPMEXTHASH)..$(CURRENTSHA) . --pretty=oneline | wc -l; fi)
+
+# This logic should kick-in only on github
+ifdef RELEASE_EXT
+ifneq ($(CURRENTSHA), $(RPMEXTHASH))
+RELEASE := $(RELEASE_EXT)_$(NCOMMITS)
+endif
+endif
+
+# The desired release number comes the git describe following the version which
+# is the number of commits since the version tag was planted suffixed by the g<commitid>
+ifndef RELEASE
+RELTAG := "psm-v$(VERSION)"
+RELEASE := $(shell if [ -f rpm_release_extension ]; then cat rpm_release_extension;\
+                  elif [ $(ISGIT) ] ; then git rev-list $(RELTAG)..HEAD -- . | wc -l; \
+ else echo "release" ; fi)
+endif
+
+DIST_SHA := ${shell if [ $(ISGIT) ] ; then git log -n1 --pretty=format:%H .; \
+ else echo DIST_SHA ; fi}
+
+# Concatenated version and release
+ifndef VERSION_RELEASE_OVERRIDE
+VERSION_RELEASE := $(VERSION).$(RELEASE)
+else
+VERSION_RELEASE := ${VERSION_RELEASE_OVERRIDE}
+endif
+
+LDLIBS := -lrt -lpthread -ldl -lnuma ${EXTRA_LIBS}
+
+PKG_CONFIG ?= pkg-config
+
+UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
+ifndef UDEVDIR
+ UDEVDIR = /lib/udev
+endif
+
+export UDEVDIR
+
+# The DIST variable is a name kernel corresponding to:
+# 1. The name of the directory containing the source code distribution
+# (see dist: target below).
+# 2. The basename of the filename of the tar file created in the dist:
+# target.
+DIST := ${RPM_NAME}-${VERSION_RELEASE}
+
+# If user has empty RPM NAME BASEEXT (defined or not), then attempt to
+# see if we are running on SLES 12.3 or newer.
+# If we are, then change the base package name, but not the supporting
+# packages to libpsm2-2. Do note this requires support both in the Makefile
+# specfile target rule as well as changes in the libpsm2.spec.in
+# file as well.
+ifeq ($(RPM_NAME_BASEEXT),)
+# Detect current version of the OS
+OS := $(shell grep -m1 NAME /etc/os-release | cut -f 2 -d\")
+OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 -d.)
+OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.)
+
+override RPM_NAME_BASEEXT := $(shell \
+ if [ "$(OS)" = "SLES" ]; then \
+ if [ "$(OSVERSION)" \> "11" ]; then \
+ if [ "$(OSSUBVERSION)" \> "2" ]; then \
+ echo "-2"; \
+ fi \
+ fi \
+ fi)
+endif
+
+all: outdir symlinks
+ @if [ ! -e $(HISTORY) ] || [ -z "`grep -E '^$(OUTDIR)$$' $(HISTORY)`" ]; then \
+ echo $(OUTDIR) >> $(HISTORY); \
+ fi
+ @for subdir in $(SUBDIRS); do \
+ mkdir -p $(OUTDIR)/$$subdir; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir $(OPTIONS); \
+ done
+ $(MAKE) -j $(nthreads) OUTDIR=$(OUTDIR) $(OPTIONS) $(OUTDIR)/${TARGLIB}.so
+ @mkdir -p $(OUTDIR)/compat
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat $(OPTIONS)
+
+%_clean:
+	$(MAKE) OUTDIR=$* clean
+
+clean: linker_script_file_clean cleanlinks
+ rm -rf ${OUTDIR}
+ @if [ -e $(HISTORY) ]; then \
+ grep -v -E "^$(OUTDIR)$$" $(HISTORY) > $(HISTORY)_tmp; \
+ mv $(HISTORY)_tmp $(HISTORY); \
+ if [ "`wc -c $(HISTORY) | cut -d ' ' -f 1`" -eq 0 ]; then \
+ rm -f $(HISTORY); \
+ fi; \
+ fi
+
+mock: OUTDIR := $(MOCK_OUTDIR)
+mock: OPTIONS = PSM2_MOCK_TESTING=1
+mock:
+ $(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
+
+debug: OUTDIR := $(DEBUG_OUTDIR)
+debug: OPTIONS = PSM_DEBUG=1
+debug:
+ $(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
+
+test_clean:
+ if [ -d ./test ]; then \
+ $(MAKE) -C test clean; \
+ fi
+
+specfile_clean:
+ rm -f ${OUTDIR}/${RPM_NAME}.spec
+
+distclean: specfile_clean cleanlinks $(HISTORIC_TARGETS) test_clean
+ rm -rf ${OUTDIR}/${DIST}
+ rm -f ${OUTDIR}/${DIST}.tar.gz
+ rm -fr temp.*
+
+outdir:
+ mkdir -p ${OUTDIR}
+
+symlinks:
+ @test -L $(top_srcdir)/include/linux-x86_64 || \
+ ln -sf linux-i386 $(top_srcdir)/include/linux-x86_64
+
+cleanlinks:
+ rm -rf $(top_srcdir)/include/linux-x86_64
+
+install: all
+ for subdir in $(SUBDIRS) ; do \
+ mkdir -p $(OUTDIR)/$$subdir ; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir install ; \
+ done
+ $(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so OUTDIR=$(OUTDIR)
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat install
+ install -D $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR} \
+ ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ (cd ${DESTDIR}${INSTALL_LIB_TARG} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so)
+ install -m 0644 -D psm2.h ${DESTDIR}/usr/include/psm2.h
+ install -m 0644 -D psm2_mq.h ${DESTDIR}/usr/include/psm2_mq.h
+ install -m 0644 -D psm2_am.h ${DESTDIR}/usr/include/psm2_am.h
+ifneq (fedora,${DISTRO})
+ install -m 0644 -D 40-psm.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm.rules
+endif
+ # The following files and dirs were part of the noship rpm:
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag/linux-x86_64
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag/ptl_ips
+ install -m 0644 -D ptl_ips/ipserror.h ${DESTDIR}/usr/include/hfi1diag/ptl_ips/ipserror.h
+ install -m 0644 -D include/linux-x86_64/bit_ops.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/bit_ops.h
+ install -m 0644 -D include/linux-x86_64/sysdep.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/sysdep.h
+ install -m 0644 -D include/opa_udebug.h ${DESTDIR}/usr/include/hfi1diag/opa_udebug.h
+ install -m 0644 -D include/opa_debug.h ${DESTDIR}/usr/include/hfi1diag/opa_debug.h
+ install -m 0644 -D include/opa_intf.h ${DESTDIR}/usr/include/hfi1diag/opa_intf.h
+ install -m 0644 -D include/opa_user.h ${DESTDIR}/usr/include/hfi1diag/opa_user.h
+ install -m 0644 -D include/opa_service.h ${DESTDIR}/usr/include/hfi1diag/opa_service.h
+ install -m 0644 -D include/opa_common.h ${DESTDIR}/usr/include/hfi1diag/opa_common.h
+ install -m 0644 -D include/opa_byteorder.h ${DESTDIR}/usr/include/hfi1diag/opa_byteorder.h
+ install -m 0644 -D include/psm2_mock_testing.h ${DESTDIR}/usr/include/hfi1diag/psm2_mock_testing.h
+ install -m 0644 -D include/hfi1_deprecated.h ${DESTDIR}/usr/include/hfi1diag/hfi1_deprecated.h
+ install -m 0644 -D include/opa_revision.h ${DESTDIR}/usr/include/hfi1diag/opa_revision.h
+ install -m 0644 -D psmi_wrappers.h ${DESTDIR}/usr/include/hfi1diag/psmi_wrappers.h
+
+specfile: outdir specfile_clean
+ sed -e 's/@VERSION@/'${VERSION_RELEASE}'/g' libpsm2.spec.in | \
+ sed -e 's/@TARGLIB@/'${TARGLIB}'/g' \
+ -e 's/@RPM_NAME@/'${RPM_NAME}'/g' \
+ -e 's/@RPM_NAME_BASEEXT@/'${RPM_NAME_BASEEXT}'/g' \
+ -e 's/@COMPATLIB@/'${COMPATLIB}'/g' \
+ -e 's/@COMPATMAJOR@/'${COMPATMAJOR}'/g' \
+ -e 's;@UDEVDIR@;'${UDEVDIR}';g' \
+ -e 's/@MAJOR@/'${MAJOR}'/g' \
+ -e 's/@MINOR@/'${MINOR}'/g' \
+ -e 's:@LIBPSM2_COMPAT_CONF_DIR@:'${LIBPSM2_COMPAT_CONF_DIR}':g' \
+ -e 's:@LIBPSM2_COMPAT_SYM_CONF_DIR@:'${LIBPSM2_COMPAT_SYM_CONF_DIR}':g' \
+ -e 's;@SPEC_FILE_RELEASE_DIST@;'${SPEC_FILE_RELEASE_DIST}';g' \
+ -e 's/@DIST_SHA@/'${DIST_SHA}'/g' > \
+ ${OUTDIR}/${RPM_NAME}.spec
+ if [ -f /etc/redhat-release ] && [ `grep -o "[0-9.]*" /etc/redhat-release | cut -d"." -f1` -lt 7 ]; then \
+ sed -i 's;@40_PSM_RULES@;'${UDEVDIR}'/rules.d/40-psm.rules;g' ${OUTDIR}/${RPM_NAME}.spec; \
+ else \
+ sed -i 's;@40_PSM_RULES@;'${UDEV_40_PSM_RULES}';g' ${OUTDIR}/${RPM_NAME}.spec; \
+ fi
+
+# We can't totally prevent two make dist calls in a row from packaging
+# the previous make dist, unless we switch to using a dedicated ./src folder
+# That will come in the next major revision of the Makefile for now we can
+# prevent the easy and default cases
+dist: distclean
+ mkdir -p ${OUTDIR}/${DIST}
+ for x in $$(/usr/bin/find . \
+ -name ".git" -prune -o \
+ -name "cscope*" -prune -o \
+ -name "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" -prune -o \
+ -name "*.orig" -prune -o \
+ -name "*~" -prune -o \
+ -name "#*" -prune -o \
+ -name ".gitignore" -prune -o \
+ -name "doc" -prune -o \
+ -name "libcm" -prune -o \
+ -name "psm.supp" -prune -o \
+ -name "test" -prune -o \
+ -name "tools" -prune -o \
+ -name "artifacts" -prune -o \
+ -print); do \
+ dir=$$(dirname $$x); \
+ mkdir -p ${OUTDIR}/${DIST}/$$dir; \
+ [ ! -d $$x ] && cp $$x ${OUTDIR}/${DIST}/$$dir; \
+ done
+ if [ $(ISGIT) ] ; then git log -n1 --pretty=format:%H . > ${OUTDIR}/${DIST}/COMMIT ; fi
+ echo ${RELEASE} > ${OUTDIR}/${DIST}/rpm_release_extension
+ cd ${OUTDIR}; tar czvf ${DIST}.tar.gz ${DIST}
+ @echo "${DIST}.tar.gz is located in ${OUTDIR}/${DIST}.tar.gz"
+
+ofeddist:
+ $(MAKE) -j $(nthreads) dist
+
+# rebuild the cscope database, skipping sccs files, done once for
+# top level
+cscope:
+ find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \
+ -iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i -
+
+sources-checksum:
+ @echo ${SOURCES_CHKSUM_VALUE}
+
+${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \
+ ptl_am/am_reqrep.o \
+ ptl_am/ptl.o \
+ ptl_am/cmarwu.o \
+ ptl_am/am_cuda_memhandle_cache.o \
+ psm_context.o \
+ psm_ep.o \
+ psm_ep_connect.o \
+ psm_error.o \
+ psm_utils.o \
+ psm_sysbuf.o \
+ psm_timer.o \
+ psm_am.o \
+ psm_mq.o \
+ psm_mq_utils.o \
+ psm_mq_recv.o \
+ psm_mpool.o \
+ psm_stats.o \
+ psm_memcpy.o \
+ psm_mock.o \
+ psm.o \
+ psm_perf.o \
+ libuuid/psm_uuid.o \
+ libuuid/parse.o \
+ libuuid/pack.o \
+ libuuid/unpack.o \
+ libuuid/unparse.o \
+ ptl_ips/ptl.o \
+ ptl_ips/ptl_rcvthread.o \
+ ptl_ips/ipserror.o \
+ ptl_ips/ips_scb.o \
+ ptl_ips/ips_epstate.o \
+ ptl_ips/ips_recvq.o \
+ ptl_ips/ips_recvhdrq.o \
+ ptl_ips/ips_spio.o \
+ ptl_ips/ips_proto.o \
+ ptl_ips/ips_proto_recv.o \
+ ptl_ips/ips_proto_connect.o \
+ ptl_ips/ips_proto_expected.o \
+ ptl_ips/ips_tid.o \
+ ptl_ips/ips_tidcache.o \
+ ptl_ips/ips_tidflow.o \
+ ptl_ips/ips_crc32.o \
+ ptl_ips/ips_proto_dump.o \
+ ptl_ips/ips_proto_mq.o \
+ ptl_ips/ips_proto_am.o \
+ ptl_ips/ips_subcontext.o \
+ ptl_ips/ips_path_rec.o \
+ ptl_ips/ips_opp_path_rec.o \
+ ptl_ips/ips_writehdrq.o \
+ ptl_self/ptl.o \
+ opa/*.o \
+ psm_diags.o \
+ psmi_wrappers.o
+
+${TARGLIB}-objs := $(patsubst %.o, ${OUTDIR}/%.o, ${${TARGLIB}-objs})
+
+DEPS:= $(${TARGLIB}-objs:.o=.d)
+-include $(DEPS)
+
+${OUTDIR}/${TARGLIB}.so: ${OUTDIR}/${TARGLIB}.so.${MAJOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+${OUTDIR}/${TARGLIB}.so.${MAJOR}: ${OUTDIR}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+# when we build the shared library, generate a revision and date
+# string in it, for easier id'ing when people may have copied the
+# file around. Generate it such that the ident command can find it
+# and strings -a | grep OPA does a reasonable job as well.
+$(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} $(LINKER_SCRIPT_FILE)
+ echo "char psmi_hfi_IFS_version[]=\"`printenv RELEASE_TAG`\";" > ${OUTDIR}/_revision.c
+ date -u -d@$${SOURCE_DATE_EPOCH:-$$(date +%s)} +'char psmi_hfi_build_timestamp[] ="%F %T%:z";' >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_sources_checksum[] =\"${SOURCES_CHKSUM_VALUE}\";" >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_git_checksum[] =\"`git rev-parse HEAD`\";" >> ${OUTDIR}/_revision.c
+ $(CC) -c $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
+ $(CC) $(LINKER_SCRIPT) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \
+ ${${TARGLIB}-objs} $(OUTDIR)/_revision.o -Lopa $(LDLIBS)
+
+${OUTDIR}/%.o: ${top_srcdir}/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+
+$(LINKER_SCRIPT_FILE): psm2_linker_script_map.in
+ sed "s/_psm2_additional_globals_;/$(PSM2_ADDITIONAL_GLOBALS)/" \
+ psm2_linker_script_map.in > ${OUTDIR}/psm2_linker_script.map
+
+linker_script_file_clean:
+ rm -f $(LINKER_SCRIPT_FILE)
diff --git a/README b/README
new file mode 100644
index 0000000..e74c865
--- /dev/null
+++ b/README
@@ -0,0 +1,300 @@
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ Copyright (c) 2003-2017 Intel Corporation. All rights reserved.
+
+================================================================================
+
+ABSTRACT
+--------
+
+Discusses how to build, install and test the PSM2 library source code.
+
+Contains the following sections:
+
+- INTRODUCTION
+- DEPENDENCIES
+- BUILDING
+ * BUILDING USING MAKEFILE
+ * BUILDING USING RPMBUILD (CREATING SOURCE AND BINARY RPM'S)
+- INSTALLING
+ * INSTALLING USING MAKEFILE
+ * INSTALLING USING EITHER YUM OR DNF
+- RELATED SOFTWARE TO PSM2
+- SUPPORTING DOCUMENTATION
+
+INTRODUCTION
+============
+
+This README file discusses how to build, install and test the PSM2 library
+source code.
+
+The PSM2 library supports a number of fabric media and stacks, and all of
+them run on version 7.X of Red Hat Enterprise Linux (abbreviated: RHEL), and
+SuSE SLES.
+
+Only the x86_64 architecture is supported.
+
+Building PSM2 is possible on RHEL 7.2+ as it ships with hfi1 kernel driver.
+For older RHEL 7.x versions and SuSE SLES, OPA is not natively supported
+in the kernel and therefore, building PSM2 is not possible unless
+you have the correct kernel-devel package or use latest versions of IFS.
+
+There are two mechanisms for building and installing the PSM2 library:
+
+ 1. Use provided Makefiles to build and install or
+ 2. Generate the *.rpm files which you can then install using either
+ yum or dnf command
+
+DEPENDENCIES
+============
+
+The following packages are required to build the PSM2 library source code:
+(all packages are for the x86_64 architecture)
+
+compat-rdma-devel
+gcc-4.8.2
+glibc-devel
+glibc-headers
+kernel-headers
+
+Additional packages for GPU Direct support include:
+NVIDIA CUDA toolkit 8.0 or greater. Older versions are not supported.
+
+In addition to depending on these packages, root privileges are required to
+install the runtime libraries and development header files into standard
+system location.
+
+BUILDING
+========
+
+The instructions below use $BASENAME, $PRODUCT and $RELEASE to refer to
+the base name of the tarball, RPM that will be generated and the product
+and release identifiers of the RPM.
+
+The base name of the RPM changes depending on which version/branch
+of code you derive the tar file from.
+
+Up until v10.2 of PSM2, the base name for the RPM is hfi1-psm.
+From v10.2 onwards, the base name will be libpsm2. The internal
+library remains unchanged and is still libpsm2.so.2.
+
+BUILDING USING MAKEFILES
+------------------------
+
+1. Untar the tarball:
+ $ tar zxvf $BASENAME-$PRODUCT-$RELEASE.tar.gz
+2. Change directory into the untarred location:
+ $ cd $BASENAME-$PRODUCT-$RELEASE
+3. Run make on the command line. This will build the PSM2 library.
+ $ make
+ 3.1. Optionally to build PSM2 library with GPU Direct support, Run make
+ PSM_CUDA=1 instead of make on the command line.
+ $ make PSM_CUDA=1
+
+BUILDING USING RPMBUILD
+-----------------------
+
+1. Run this command from your $PWD to generate rpm, srpm files
+ $ ./makesrpm.sh a
+
+ This command results in the following collection of rpm's and source
+ code rpm's under your $PWD/temp.X/ directory.
+ ("X" is the pid of the bash script that created the srpm and rpm files)
+ (Result shown here for RHEL systems.)
+
+ RPMS/x86_64/libpsm2-compat-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-debuginfo-10.3.7-1x86_64.rpm
+ SRPMS/libpsm2-10.3.7-1.src.rpm
+
+ 1.1. Optionally for GPU Direct support run this command from your $PWD to
+ generate rpm, srpm files
+ $ ./makesrpm.sh a -cuda
+
+ This command results in the following collection of rpm's and source code
+ rpm's under your $PWD/temp.X/ directory. ("X" is the pid of the bash
+ script that created the srpm and rpm files):
+ RPMS/x86_64/libpsm2-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-compat-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1cuda.x86_64.rpm
+ SRPMS/x86_64/libpsm2-10.3.7-1cuda.src.rpm
+
+ On systems with SLES 12.3 or newer, the package name for the base libpsm2
+ RPM will be:
+ libpsm2-2-10.3.7-1.x86_64.rpm
+
+ Other supporting RPM package names will be as listed above.
+
+INSTALLING
+==========
+
+INSTALLING USING MAKEFILE
+-------------------------
+
+Install the libraries and header files on the system (as root):
+ $ make install
+
+The libraries will be installed in /usr/lib64, and the header files will
+be installed in /usr/include.
+
+This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on
+the "make install" command line. "DESTDIR" will add a leading path component
+to the overall install path and "LIBDIR" will change the path where libraries
+will be installed. For example, "make DESTDIR=/tmp/psm-install install" will
+install all files (libraries and headers) into "/tmp/psm-install/usr/...",
+"make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the
+libraries in "/tmp/psm-install/libraries" and the headers in
+"/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will
+install the libraries in "/tmp/libs" and the headers in "/usr/include".
+
+
+INSTALLING USING EITHER YUM OR DNF
+----------------------------------
+
+You can install the rpm's and source rpm's previously built using rpmbuild using
+either the yum or dnf command as the root user. See the appropriate man page for
+details of installing rpm's.
+
+Note: It is also possible to use rpm command to install rpm's, but it is recommended
+that one use yum/dnf as rpm tool has issues with name changes and obsoletes tags.
+yum or dnf should be better able to resolve dependency issues.
+
+RELATED SOFTWARE TO PSM2
+========================
+
+MPI Libraries supported
+-----------------------
+A large number of open source (OpenMPI, MVAPICH2) and Vendor MPI
+implementations support PSM2 for optimized communication on HCAs. Vendor MPI
+implementations (HP-MPI, Intel MPI 4.0 with PMI, Platform/Scali MPI)
+require that the PSM2 runtime libraries be installed and available on
+each node. Usually a configuration file or a command line switch to mpirun
+needs to be specified to utilize the PSM2 transport.
+
+OpenMPI support
+---------------
+It is recommended to use the v1.10.4 or newer version of OpenMPI.
+Prior versions of OpenMPI have an issue with supporting PSM2 network transports
+mixed with standard Verbs transport (BTL openib). This prevents an OpenMPI
+installation with network modules available for PSM2 and Verbs to work
+correctly on nodes with no HFI hardware. This has been fixed in the
+latest development branch allowing a single OpenMPI installation to target
+HFI hardware via PSM2 or Verbs as well as alternate transports seamlessly.
+
+If NVIDIA CUDA support is desired, you can use the OpenMPI build
+(v1.10.4-cuda-hfi) provided by Intel in the IFS installer v10.4.X or newer.
+The changes have also been accepted into v3.0.x branch of upstream OpenMPI
+repository. Therefore subsequent v3.0.x versions of OpenMPI should carry the
+required OpenMPI support for PSM2 GPUDirect feature.
+
+PSM2 header and runtime files need to be installed on a node where the OpenMPI
+build is performed. All compute nodes additionally should have the PSM2 runtime
+libraries available on them. OpenMPI provides a standard configure, make and
+make install mechanism which will detect and build the relevant PSM2 network
+modules for OpenMPI once the header and runtime files are detected.
+
+MVAPICH2 support
+----------------
+MVAPICH2 supports PSM2 transport for optimized communication on HFI hardware.
+OPA IFS supports MVAPICH2 v2.1 (or later). PSM2 header and runtime files
+need to be installed on a node where MVAPICH2 builds are performed. All
+compute nodes should also have the PSM2 runtime libraries available on them.
+
+For building and installing MVAPICH2 with OPA support, refer to MVAPICH2
+user guides here:
+http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.2rc1-userguide.html
+
+(Note: Support for PSM2 is currently on v2.2rc1 of OSU MVAPICH2 code base.
+The above link might change when a stable v2.2 is released.)
+
+OFED Support
+------------
+Intel OPA is not yet included within OFED. But the hfi1 driver is available
+publicly at kernel.org. Please do pull the driver from either kernel.org or
+the github page for opa-hfi1 driver (https://github.com/01org/opa-hfi1)
+
+SUPPORTING DOCUMENTATION
+------------------------
+PSM2 Programmer's Guide is published along with documentation for "Intel® Omni-Path
+Host Fabric Interface PCIe Adapter 100 Series"
+(http://www.intel.com/content/www/us/en/support/network-and-i-o/fabric-products/000016242.html)
+
+Refer to this document for description on APIs and environment variables that
+are available for use. For sample code on writing applications leveraging the
+PSM2 APIs, refer to Section 5.
+
+Link to latest (as of Sep 2017) PSM2 Programmer's Guide:
+https://www.intel.com/content/dam/support/us/en/documents/network-and-i-o/fabric-products/Intel_PSM2_PG_H76473_v7_0.pdf
+
+PSM Compatibility Support
+-------------------------
+
+libpsm2-compat supports applications that use the PSM API instead of
+the PSM2 API, through a compatibility library. This library is an interface
+between PSM applications and the PSM2 API.
+
+If the system has an application that is coded to use PSM and has requirements
+to use PSM2 (i.e. the host has Omni-Path hardware), the compatibility library
+must be used.
+
+Please refer to your operating system's documentation to find how to modify the
+order in which system directories are searched for dynamic libraries. The
+libpsm2-compat version of libpsm_infinipath.so.1 must be earlier on the search
+path than that of libpsm_infinipath. Doing so allows applications coded to PSM
+to transparently use the PSM2 API and devices which require it.
+
+Please note that the installation path for the libpsm2-compat version of
+libpsm_infinipath.so.1 will differ depending on your operating system
+specifics. Common locations include:
+- /usr/lib64/psm2-compat/
+- /usr/lib/psm2-compat/
+
diff --git a/buildflags.mak b/buildflags.mak
new file mode 100644
index 0000000..78efd70
--- /dev/null
+++ b/buildflags.mak
@@ -0,0 +1,210 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2016 Intel Corporation. All rights reserved.
+#
+
+# set top_srcdir and include this file
+
+ifeq (,$(top_srcdir))
+$(error top_srcdir must be set to include makefile fragment)
+endif
+
+export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]')
+export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,')
+
+ifeq (${CCARCH},gcc)
+ export CC := gcc
+else
+ ifeq (${CCARCH},gcc4)
+ export CC := gcc4
+ else
+ ifeq (${CCARCH},icc)
+ export CC := icc
+ else
+ anerr := $(error Unknown C compiler arch: ${CCARCH})
+ endif # ICC
+ endif # gcc4
+endif # gcc
+
+ifeq (${FCARCH},gfortran)
+ export FC := gfortran
+else
+ anerr := $(error Unknown Fortran compiler arch: ${FCARCH})
+endif # gfortran
+
+BASECFLAGS += $(BASE_FLAGS)
+LDFLAGS += $(BASE_FLAGS)
+ASFLAGS += $(BASE_FLAGS)
+
+ifeq ($(PSM2_MOCK_TESTING),1)
+BASECFLAGS += -DPSM2_MOCK_TESTING=1
+# we skip the linker script for testing version, we want all symbols to be
+# reachable from outside the library
+else
+LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
+endif
+
+WERROR := -Werror
+INCLUDES := -I. -I$(top_srcdir)/include -I$(top_srcdir)/mpspawn -I$(top_srcdir)/include/$(os)-$(arch)
+
+#
+# use IFS provided hfi1_user.h if installed.
+#
+IFS_HFI_HEADER_PATH := /usr/include/uapi
+INCLUDES += -I${IFS_HFI_HEADER_PATH}
+
+BASECFLAGS +=-Wall $(WERROR)
+
+#
+# test if compiler supports SSE4.2 (needed for crc32 instruction)
+#
+RET := $(shell echo "int main() {}" | ${CC} -msse4.2 -E -dM -xc - 2>&1 | grep -q SSE4_2 ; echo $$?)
+ifeq (0,${RET})
+ BASECFLAGS += -msse4.2
+else
+ $(error SSE4.2 compiler support required )
+endif
+
+#
+# test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction.
+#
+ifneq (,${PSM_AVX})
+ ifeq (${CC},icc)
+ MAVX2=-march=core-avx2
+ else
+ MAVX2=-mavx2
+ endif
+ RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
+ ifeq (0,${RET})
+ TMPVAR := $(BASECFLAGS)
+ BASECFLAGS := $(filter-out -msse4.2,$(TMPVAR))
+ BASECFLAGS += ${MAVX2}
+ endif
+
+ ifneq (icc,${CC})
+ RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?)
+ ifeq (0,${RET})
+ BASECFLAGS += -mavx512f
+ endif
+ endif
+endif
+
+#
+# feature test macros for drand48_r
+#
+BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE
+
+ifneq (,${HFI_BRAKE_DEBUG})
+ BASECFLAGS += -DHFI_BRAKE_DEBUG
+endif
+ifneq (,${PSM_DEBUG})
+ BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2
+else
+ BASECFLAGS += -O3 -g3
+endif
+ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting
+ BASECFLAGS += -O -fprofile-arcs -ftest-coverage
+ LDFLAGS += -fprofile-arcs
+endif
+ifneq (,${PSM_LOG})
+ BASECFLAGS += -DPSM_LOG
+ifneq (,${PSM_LOG_FAST_IO})
+ BASECFLAGS += -DPSM_LOG_FAST_IO
+ PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message;
+endif
+endif
+ifneq (,${PSM_PERF})
+ BASECFLAGS += -DRDPMC_PERF_FRAMEWORK
+endif
+ifneq (,${PSM_HEAP_DEBUG})
+ BASECFLAGS += -DPSM_HEAP_DEBUG
+endif
+ifneq (,${PSM_PROFILE})
+ BASECFLAGS += -DPSM_PROFILE
+endif
+ifneq (,${PSM_CUDA})
+ BASECFLAGS += -DNVIDIA_GPU_DIRECT -DPSM_CUDA
+ CUDA_HOME ?= /usr/local/cuda
+ INCLUDES += -I$(CUDA_HOME)/include
+endif
+
+BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE
+
+ifeq (${CCARCH},gcc)
+ BASECFLAGS += -funwind-tables
+endif
+
+ifneq (,${PSM_VALGRIND})
+ CFLAGS += -DPSM_VALGRIND
+else
+ CFLAGS += -DNVALGRIND
+endif
+
+ASFLAGS += -g3 -fpic
+
+BASECFLAGS += ${OPA_CFLAGS}
+
+ifeq (${CCARCH},icc)
+ BASECFLAGS += -O3 -g3 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed,
+ CFLAGS += $(BASECFLAGS)
+ LDFLAGS += -static-intel
+else
+ ifeq (${CCARCH},gcc)
+ CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing -Wformat-security
+ else
+ ifeq (${CCARCH},gcc4)
+ CFLAGS += $(BASECFLAGS)
+ else
+ $(error Unknown compiler arch "${CCARCH}")
+ endif # gcc4
+ endif # gcc
+endif # icc
+
diff --git a/compat/40-psm-compat.rules b/compat/40-psm-compat.rules
new file mode 100644
index 0000000..fc7c4b1
--- /dev/null
+++ b/compat/40-psm-compat.rules
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+KERNEL=="hfi1", SYMLINK+="ipath"
+KERNEL=="hfi1_[0-9]", MODE="0666", SYMLINK+="ipath"
diff --git a/compat/Makefile b/compat/Makefile
new file mode 100644
index 0000000..092775f
--- /dev/null
+++ b/compat/Makefile
@@ -0,0 +1,90 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+OUTDIR = .
+
+COMPATLIB := libpsm_infinipath
+COMPAT_LIB_TARG := $(INSTALL_LIB_TARG)/psm2-compat
+compat_build_dir := $(shell readlink -m .)
+
+MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' ../psm2.h)
+
+top_srcdir := $(compat_build_dir)/..
+include $(compat_build_dir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+${COMPATLIB}-objs := psm-compat.o
+${COMPATLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${COMPATLIB}-objs})
+
+DEPS:= $(${COMPATLIB}-objs:.o=.d)
+-include $(DEPS)
+
+all .DEFAULT: ${${COMPATLIB}-objs} $(OUTDIR)/${COMPATLIB}.so.${MAJOR}
+
+install: all
+ install -m 0644 -D 40-psm-compat.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm-compat.rules
+ install -m 0644 -D libpsm2-compat.conf ${DESTDIR}${LIBPSM2_COMPAT_CONF_DIR}/modprobe.d/libpsm2-compat.conf
+ install -m 0755 -D libpsm2-compat.cmds ${DESTDIR}/usr/lib/libpsm2/libpsm2-compat.cmds
+ install -D $(OUTDIR)/${COMPATLIB}.so.${MAJOR} ${DESTDIR}${COMPAT_LIB_TARG}/${COMPATLIB}.so.${MAJOR}
+
+$(OUTDIR)/%.o: $(compat_build_dir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+
+$(OUTDIR)/${COMPATLIB}.so.${MAJOR}: ${${COMPATLIB}-objs}
+ $(CC) $(BASECFLAGS) $(LINKER_SCRIPT) $(LDFLAGS) -Wl,-soname=${COMPATLIB}.so.${MAJOR} -shared \
+ -L$(OUTDIR)/.. ${${COMPATLIB}-objs} -lpsm2 -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno ${COMPATLIB}.*; \
+ cd -; \
+ fi
diff --git a/compat/buildflags.mak b/compat/buildflags.mak
new file mode 100644
index 0000000..c677989
--- /dev/null
+++ b/compat/buildflags.mak
@@ -0,0 +1,103 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+ifeq (,$(top_srcdir))
+$(error top_srcdir must be set to include makefile fragment)
+endif
+
+export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]')
+export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,')
+export CCARCH ?= gcc
+
+ifeq (${CCARCH},gcc)
+ export CC := gcc
+else
+ ifeq (${CCARCH},gcc4)
+ export CC := gcc4
+ else
+ ifeq (${CCARCH},icc)
+ export CC := icc
+ else
+ anerr := $(error Unknown C compiler arch: ${CCARCH})
+ endif # ICC
+ endif # gcc4
+endif # gcc
+
+BASECFLAGS += $(BASE_FLAGS)
+LDFLAGS += $(BASE_FLAGS)
+ASFLAGS += $(BASE_FLAGS)
+
+LINKER_SCRIPT_FILE := psm2_compat_linker_script.map
+LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
+WERROR := -Werror
+INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn
+
+BASECFLAGS +=-Wall $(WERROR)
+
+BASECFLAGS += -fpic -fPIC
+
+ASFLAGS += -g3 -fpic
+
+ifeq (${CCARCH},icc)
+ BASECFLAGS += -O3 -g3
+ CFLAGS += $(BASECFLAGS)
+ LDFLAGS += -static-intel
+else
+ ifeq (${CCARCH},gcc)
+ CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing
+ else
+ ifeq (${CCARCH},gcc4)
+ CFLAGS += $(BASECFLAGS)
+ else
+ $(error Unknown compiler arch "${CCARCH}")
+ endif
+ endif
+endif
diff --git a/compat/libpsm2-compat.cmds b/compat/libpsm2-compat.cmds
new file mode 100755
index 0000000..dcead1e
--- /dev/null
+++ b/compat/libpsm2-compat.cmds
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# This script was created to allow for both an hfi1 and qib adapter
+# to co-exist on the same machine.
+# The symlink from /dev/ipath is removed to allow ib_qib to load
+# correctly and create a proper device file.
+
+case "$1" in
+start)
+ # Remove symlink if hfi1 was loaded first
+ if [ -L "/dev/ipath" ]; then
+ rm /dev/ipath
+ fi
+ ;;
+stop)
+ # Restore symlink if hfi1 is loaded
+ if [ -f "/dev/hfi1" ] && ! [ -L "/dev/ipath" ]; then
+ ln -s /dev/hfi1 /dev/ipath
+ fi
+ ;;
+esac
diff --git a/compat/libpsm2-compat.conf b/compat/libpsm2-compat.conf
new file mode 100644
index 0000000..d71e8f2
--- /dev/null
+++ b/compat/libpsm2-compat.conf
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+install ib_qib /usr/lib/libpsm2/libpsm2-compat.cmds start; modprobe -i ib_qib $CMDLINE_OPTS
+remove ib_qib modprobe -r -i ib_qib && /usr/lib/libpsm2/libpsm2-compat.cmds stop
diff --git a/compat/psm-compat.c b/compat/psm-compat.c
new file mode 100644
index 0000000..4309e02
--- /dev/null
+++ b/compat/psm-compat.c
@@ -0,0 +1,335 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "../psm2.h"
+#include "../psm2_mq.h"
+#include "../psm2_am.h"
+
+/* Functions from TS psm.h */
+psm2_error_t
+psm_init(int *major, int *minor)
+{
+ return psm2_init(major, minor);
+}
+
+psm2_error_t
+psm_finalize(void)
+{
+ return psm2_finalize();
+}
+
+psm2_error_t
+psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames)
+{
+ return psm2_map_nid_hostname(num, nids, hostnames);
+}
+
+void
+psm_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label)
+{
+ return psm2_epaddr_setlabel(epaddr, epaddr_label);
+}
+
+void
+psm_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt)
+{
+ psm2_epaddr_setctxt(epaddr, ctxt);
+}
+
+void *
+psm_epaddr_getctxt(psm2_epaddr_t epaddr)
+{
+ return psm2_epaddr_getctxt(epaddr);
+}
+
+psm2_error_t
+psm_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen)
+{
+ return psm2_setopt(component, component_obj,
+ optname, optval, optlen);
+}
+
+psm2_error_t
+psm_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen)
+{
+ return psm2_getopt(component, component_obj,
+ optname, optval, optlen);
+}
+
+psm2_error_t
+psm_poll(psm2_ep_t ep)
+{
+ return psm2_poll(ep);
+}
+
+void
+psm_uuid_generate(psm2_uuid_t uuid_out)
+{
+ psm2_uuid_generate(uuid_out);
+}
+
+/* Functions from TS psm_am.h */
+psm2_error_t
+psm_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx)
+{
+ return psm2_am_register_handlers(ep, handlers, num_handlers, handlers_idx);
+}
+
+psm2_error_t
+psm_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ return psm2_am_request_short(epaddr, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt);
+}
+
+psm2_error_t
+psm_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ return psm2_am_reply_short(token, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt);
+}
+
+psm2_error_t
+psm_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out)
+{
+ return psm2_am_get_parameters(ep, parameters, sizeof_parameters_in, sizeof_parameters_out);
+}
+
+
+/* Functions from TS psm_error.h */
+
+psm2_error_t
+psm_error_defer(psm2_error_token_t token)
+{
+ return psm2_error_defer(token);
+}
+
+psm2_error_t
+psm_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler)
+{
+ return psm2_error_register_handler(ep, errhandler);
+}
+
+const char *
+psm_error_get_string(psm2_error_t error)
+{
+ return psm2_error_get_string(error);
+}
+
+/* Functions from TS psm_mq.h */
+psm2_error_t
+psm_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, psm2_mq_status_t *status)
+{
+ return psm2_mq_iprobe(mq, tag, tagsel, status);
+}
+
+psm2_error_t
+psm_mq_cancel(psm2_mq_req_t *ireq)
+{
+ return psm2_mq_cancel(ireq);
+}
+
+psm2_error_t
+psm_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ return psm2_mq_wait(ireq, status);
+}
+
+psm2_error_t
+psm_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ return psm2_mq_test(ireq, status);
+}
+
+psm2_error_t
+psm_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req)
+{
+ return psm2_mq_isend(mq, dest, flags, stag, buf, len, context, req);
+}
+
+psm2_error_t
+psm_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len)
+{
+ return psm2_mq_send(mq, dest, flags, stag, buf, len);
+}
+
+psm2_error_t
+psm_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo)
+{
+ return psm2_mq_irecv(mq, tag, tagsel, flags, buf, len, context, reqo);
+}
+
+psm2_error_t
+psm_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
+{
+ return psm2_mq_ipeek(mq, oreq, status);
+}
+
+psm2_error_t
+psm_mq_getopt(psm2_mq_t mq, int key, void *value)
+{
+ return psm2_mq_getopt(mq, key, value);
+}
+
+psm2_error_t
+psm_mq_setopt(psm2_mq_t mq, int key, const void *value)
+{
+ return psm2_mq_setopt(mq, key, value);
+}
+
+psm2_error_t
+psm_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts,
+ int numopts, psm2_mq_t *mqo)
+{
+ return psm2_mq_init(ep, tag_order_mask, opts, numopts, mqo);
+}
+
+psm2_error_t
+psm_mq_finalize(psm2_mq_t mq)
+{
+ return psm2_mq_finalize(mq);
+}
+
+void
+psm_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats)
+{
+ psm2_mq_get_stats(mq, stats);
+}
+
+/* Functions from TS psm_mq.h */
+psm2_error_t
+psm_ep_num_devunits(uint32_t *num_units_o)
+{
+ return psm2_ep_num_devunits(num_units_o);
+}
+
+uint64_t
+psm_epid_nid(psm2_epid_t epid)
+{
+ return psm2_epid_nid(epid);
+}
+
+uint64_t
+psm_epid_context(psm2_epid_t epid)
+{
+ return psm2_epid_context(epid);
+}
+
+uint64_t
+psm_epid_port(psm2_epid_t epid)
+{
+ return psm2_epid_port(epid);
+}
+
+psm2_error_t
+psm_ep_query (int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
+{
+ return psm2_ep_query (num_of_epinfo, array_of_epinfo);
+}
+
+psm2_error_t
+psm_ep_epid_lookup (psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+ return psm2_ep_epid_lookup (epid, epconn);
+}
+
+psm2_error_t
+psm_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+ return psm2_ep_epid_share_memory(ep, epid, result_o);
+}
+
+psm2_error_t
+psm_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+ return psm2_ep_open_opts_get_defaults(opts);
+}
+
+psm2_error_t
+psm_ep_open(psm2_uuid_t const unique_job_key, struct psm2_ep_open_opts const *opts_i,
+ psm2_ep_t *epo, psm2_epid_t *epido)
+{
+ return psm2_ep_open(unique_job_key, opts_i, epo, epido);
+}
+
+psm2_error_t
+psm_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+ return psm2_ep_close(ep, mode, timeout_in);
+}
+
+psm2_error_t
+psm_ep_connect(psm2_ep_t ep, int num_of_epid,
+ psm2_epid_t const *array_of_epid,
+ int const *array_of_epid_mask,
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr,
+ int64_t timeout)
+{
+ return psm2_ep_connect(ep, num_of_epid, array_of_epid, array_of_epid_mask,
+ array_of_errors, array_of_epaddr, timeout);
+}
diff --git a/compat/psm2_compat_linker_script.map b/compat/psm2_compat_linker_script.map
new file mode 100644
index 0000000..0933c68
--- /dev/null
+++ b/compat/psm2_compat_linker_script.map
@@ -0,0 +1,66 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm_*;
+
+ /* Make all other symbols local */
+ local:
+ *;
+};
diff --git a/include/common_defines.h b/include/common_defines.h
new file mode 100644
index 0000000..b244464
--- /dev/null
+++ b/include/common_defines.h
@@ -0,0 +1,176 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef COMMON_DEFINES_H
+#define COMMON_DEFINES_H
+
+/* TESTING being defined flips a couple of switches so that a testable version
+ * of libpsm2.so is built. It'll make properly annotated static functions be
+ * non-static, visible to the outside. Also, all mockable functions will be
+ * replaced with function pointers which will originally point to the actual
+ * implementation. However, those function pointers might be reset by the test
+ * code, thus allowing for mocking selected PSM2 functions for the purpose of
+ * the test.
+ *
+ * So far the following utilities have been introduced for enabling a
+ * conditional compilation of the testable vs. production version of the library:
+ * - ustatic: toggles function visibility
+ * - MOCKABLE(): decorates function name so that it is visible after being mocked
+ * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ * for mocking a function
+ * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ * for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ * printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
+#ifndef TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef TESTING */
+
+/* For the testable version, all _ustatic_ function will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function happens.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function happens.
+ */
+#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_;
+
+#endif /* ndef TESTING */
+
+#endif /* COMMON_DEFINES_H */
+
diff --git a/include/hfi1_deprecated.h b/include/hfi1_deprecated.h
new file mode 100644
index 0000000..36fd31f
--- /dev/null
+++ b/include/hfi1_deprecated.h
@@ -0,0 +1,181 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+
+ hfi1_deprecated.h
+
+ Contains certain features of the hfi1 module that have been deprecated.
+
+ These features may still need to be supported by the psm library for
+ reasons of backwards compatibility.
+ */
+
+#ifndef __HFI1_DEPRECATED_H__
+
+#define __HFI1_DEPRECATED_H__
+
+/* First, include the current hfi1_user.h file: */
+
+#include <rdma/hfi/hfi1_user.h>
+
+/* Determine if we need to define and declare deprecated
+ entities based on the IB_IOCTL_MAGIC macro. */
+
+#if defined( IB_IOCTL_MAGIC )
+
+/* The macro: PSM2_SUPPORT_IW_CMD_API is used to stipulate
+ adding compile-time support of either the ioctl() or write()
+ command interfaces to the driver. Note though that the
+ final decision whether to support this depends on factors
+ only known at runtime. */
+#define PSM2_SUPPORT_IW_CMD_API 1
+/* IOCTL_CMD_API_MODULE_MAJOR defines the first version of the hfi1
+ * module that supports the ioctl() command interface. Prior to this
+ * (IOCTL_CMD_API_MODULE_MAJOR - 1 and smaller), the module used
+ * write() for the command interface. */
+#define IOCTL_CMD_API_MODULE_MAJOR 6
+
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread the job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define HFI1_ALG_ACROSS_DEP 0
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define HFI1_ALG_WITHIN_DEP 1
+
+struct hfi1_cmd_deprecated {
+ __u32 type; /* command type */
+ __u32 len; /* length of struct pointed to by addr */
+ __u64 addr; /* pointer to user structure */
+};
+
+#define hfi1_cmd hfi1_cmd_deprecated
+
+#define HFI1_ALG_ACROSS HFI1_ALG_ACROSS_DEP
+#define HFI1_ALG_WITHIN HFI1_ALG_WITHIN_DEP
+
+#else
+
+#define HFI1_SWMAJOR_SHIFT 16
+
+#endif /* defined( IB_IOCTL_MAGIC )*/
+
+#define HFI1_ALG_ACROSS_ALL_DEP 2
+#define HFI1_ALG_ACROSS_ALL HFI1_ALG_ACROSS_ALL_DEP
+
+/* Note that struct hfi1_user_info_dep declaration is identical to
+ the struct hfi1_user_info declaration from MAJOR version 5 of the
+ hfi1_user.h file. */
+struct hfi1_user_info_dep {
+ /*
+ * version of user software, to detect compatibility issues.
+ * Should be set to HFI1_USER_SWVERSION.
+ */
+ __u32 userversion;
+ __u16 pad;
+ /* HFI selection algorithm, if unit has not selected */
+ __u16 hfi1_alg;
+ /*
+ * If two or more processes wish to share a context, each process
+ * must set the subcontext_cnt and subcontext_id to the same
+ * values. The only restriction on the subcontext_id is that
+ * it be unique for a given node.
+ */
+ __u16 subctxt_cnt;
+ __u16 subctxt_id;
+ /* 128bit UUID passed in by PSM. */
+ __u8 uuid[16];
+};
+
+/*
+ * We assume here that we have the hfi1_user.h file installed in the system path
+ * with the 'flags' field defined in struct sdma_req_info. (At least, when the
+ * user needs to run GPU workloads, this _should_ be the version of hfi1_user.h
+ * file installed by the IFS.)
+ */
+struct sdma_req_info_v6_3 {
+ /*
+ * bits 0-3 - version (currently unused)
+ * bits 4-7 - opcode (enum sdma_req_opcode)
+ * bits 8-15 - io vector count
+ */
+ __u16 ctrl;
+ /*
+ * Number of fragments contained in this request.
+ * User-space has already computed how many
+ * fragment-sized packet the user buffer will be
+ * split into.
+ */
+ __u16 npkts;
+ /*
+ * Size of each fragment the user buffer will be
+ * split into.
+ */
+ __u16 fragsize;
+ /*
+ * Index of the slot in the SDMA completion ring
+ * this request should be using. User-space is
+ * in charge of managing its own ring.
+ */
+ __u16 comp_idx;
+} __attribute__((packed));
+
+#endif /* #ifndef __HFI1_DEPRECATED_H__ */
diff --git a/include/linux-i386/bit_ops.h b/include/linux-i386/bit_ops.h
new file mode 100644
index 0000000..d272e75
--- /dev/null
+++ b/include/linux-i386/bit_ops.h
@@ -0,0 +1,98 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_BIT_OPS_H
+#define _HFI_i386_BIT_OPS_H
+
+static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit),
+ "=m"(*addr) : "dIr"(nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips___test_and_set_bit(int nr,
+ volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit),
+ "=m"(*addr) : "dIr"(nr) : "memory");
+ return oldbit;
+}
+
+#endif /* _HFI_i386_BIT_OPS_H */
diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
new file mode 100644
index 0000000..bfd5746
--- /dev/null
+++ b/include/linux-i386/sysdep.h
@@ -0,0 +1,171 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_SYSDEP_H
+#define _HFI_i386_SYSDEP_H
+
+typedef struct cpuid {
+ unsigned eax, ebx, ecx, edx;
+} cpuid_t;
+
+static __inline__ void
+get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id)
+{
+ unsigned a, b, c, d;
+
+ asm (" \
+ mov %4, %%eax \n\
+ mov %5, %%ecx \n\
+ cpuid \n\
+ mov %%eax, %0 \n\
+ mov %%ebx, %1 \n\
+ mov %%ecx, %2 \n\
+ mov %%edx, %3 \n\
+ " : "=g" (a), "=g" (b), "=g" (c), "=g" (d)
+ : "g" (func), "g" (subfunc)
+ : "%eax", "%ebx", "%ecx", "%edx"
+ );
+
+ id->eax = a;
+ id->ebx = b;
+ id->ecx = c;
+ id->edx = d;
+}
+
+static __inline__ uint64_t get_cycles(void)
+{
+ uint64_t v;
+ uint32_t a, d;
+
+ asm volatile ("rdtsc" : "=a" (a), "=d"(d));
+ v = ((uint64_t) a) | (((uint64_t) d) << 32);
+
+ return v;
+}
+
+#ifndef LOCK_PREFIX
+#define LOCK_PREFIX "lock "
+#endif
+
+static __inline__ void ips_barrier()
+{
+ asm volatile ("" : : : "memory");
+}
+
+static __inline__ void ips_mb()
+{
+ asm volatile ("mfence" : : : "memory");
+}
+
+/* gcc-3.4 has a bug with this function body at -O0 */
+static
+#if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4
+#else
+__inline__
+#endif
+void ips_rmb()
+{
+ asm volatile ("" : : : "memory");
+}
+
+static __inline__ void ips_wmb()
+{
+ asm volatile ("sfence" : : : "memory");
+}
+
+static __inline__ void ips_sync_writes()
+{
+ asm volatile ("sfence" : : : "memory");
+}
+
+static __inline__ void ips_sync_reads()
+{
+ asm volatile ("lfence" : : : "memory");
+}
+
+static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
+ uint32_t old_val, uint32_t new_val)
+{
+ uint32_t prev;
+ struct xchg_dummy {
+ uint32_t a[100];
+ };
+
+ asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev)
+ : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val)
+ : "memory");
+
+ return prev;
+}
+
+typedef struct {
+ volatile int32_t counter;
+} ips_atomic_t;
+
+#define ips_atomic_set(v, i) (((v)->counter) = (i))
+#define ips_atomic_cmpxchg(p, oval, nval) \
+ ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval)
+
+#if 0
+static __inline__ int32_t
+ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value)
+{
+ asm volatile ("lock cmpxchg %2, %0" :
+ "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+ return old_value;
+}
+#endif
+
+#endif /* _HFI_i386_SYSDEP_H */
diff --git a/include/opa_byteorder.h b/include/opa_byteorder.h
new file mode 100644
index 0000000..3139593
--- /dev/null
+++ b/include/opa_byteorder.h
@@ -0,0 +1,264 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_BYTEORDER_H
+#define OPA_BYTEORDER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <endian.h>
+
+#ifndef __BYTE_ORDER
+# error "BYTE_ORDER undefined"
+#endif
+
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+
+static __inline__ __u16 __hfi_fswab16(__u16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __hfi_fswab32(__u32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __hfi_fswab64(__u64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __hfi_fswab16(__u16 x) {
+ return ((x & (__u16) 0x00ffU) << 8)
+ | ((x & (__u16) 0xff00U) >> 8);
+} static __inline__ __u32 __hfi_fswab32(__u32 x) {
+ return ((x & (__u32) 0x000000ffUL) << 24)
+ | ((x & (__u32) 0x0000ff00UL) << 8)
+ | ((x & (__u32) 0x00ff0000UL) >> 8)
+ | ((x & (__u32) 0xff000000UL) >> 24);
+}
+
+static __inline__ __u64 __hfi_fswab64(__u64 x) {
+ return ((x & (__u64) 0x00000000000000ffULL) << 56)
+ | ((x & (__u64) 0x000000000000ff00ULL) << 40)
+ | ((x & (__u64) 0x0000000000ff0000ULL) << 24)
+ | ((x & (__u64) 0x00000000ff000000ULL) << 8)
+ | ((x & (__u64) 0x000000ff00000000ULL) >> 8)
+ | ((x & (__u64) 0x0000ff0000000000ULL) >> 24)
+ | ((x & (__u64) 0x00ff000000000000ULL) >> 40)
+ | ((x & (__u64) 0xff00000000000000ULL) >> 56);
+}
+
+static __inline__ __u16 __cpu_to_le16(__le16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_le32(__le32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_le64(__le64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __le16_to_cpu(__le16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __le32_to_cpu(__le32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __le64_to_cpu(__le64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __cpu_to_be16(__be16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_be32(__be32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_be64(__be64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __be16_to_cpu(__be16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __be32_to_cpu(__be32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __be64_to_cpu(__be64)
+ __attribute__ ((always_inline));
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+ return x;
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+ return x;
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+ return x;
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+ return x;
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+ return x;
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+ return x;
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+ return __hfi_fswab64(x);
+}
+
+#elif __BYTE_ORDER == __BIG_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+ return x;
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+ return x;
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+ return x;
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+ return x;
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+ return x;
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+ return x;
+}
+
+#else
+# error "unsupported BYTE_ORDER: " #BYTE_ORDER
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* OPA_BYTEORDER_H */
diff --git a/include/opa_common.h b/include/opa_common.h
new file mode 100644
index 0000000..1e89b69
--- /dev/null
+++ b/include/opa_common.h
@@ -0,0 +1,62 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* Umbrella header: pulls the kernel-exported hfi1 user-space ABI into
 * the OPA/PSM sources. */
#ifndef OPA_COMMON_H
#define OPA_COMMON_H

#include <rdma/hfi/hfi1_user.h>	/* hfi1 driver user API (kernel uapi header) */
/* NOTE(review): presumably provides fallback definitions for ABI items
 * removed from newer hfi1_user.h revisions — confirm against that header. */
#include "hfi1_deprecated.h"

#endif /* OPA_COMMON_H */
diff --git a/include/opa_debug.h b/include/opa_debug.h
new file mode 100644
index 0000000..d5d8ff2
--- /dev/null
+++ b/include/opa_debug.h
@@ -0,0 +1,108 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* Debug-tracing bitmask definitions for the hfi/OPA user-space code. */
#ifndef OPA_DEBUG_H
#define OPA_DEBUG_H

#ifndef _HFI_DEBUGGING	/* debugging enabled or not; default on */
#define _HFI_DEBUGGING 1
#endif

#if _HFI_DEBUGGING

/*
 * Mask values for debugging. The scheme allows us to compile out any
 * of the debug tracing stuff, and if compiled in, to enable or disable
 * dynamically. This can be set at modprobe time also:
 * modprobe hfi.ko hfi_debug=7
 */

#define __HFI_INFO 0x1		/* generic low verbosity stuff */
#define __HFI_DBG 0x2		/* generic debug */
#define __HFI_TRSAMPLE 0x8	/* generate trace buffer sample entries */
/* leave some low verbosity spots open */
#define __HFI_VERBDBG 0x40	/* very verbose debug */
#define __HFI_PKTDBG 0x80	/* print packet data */
/* print process startup (init)/exit messages and important env vars */
#define __HFI_PROCDBG 0x100
/* print mmap/nopage stuff, not using VDBG any more */
#define __HFI_MMDBG 0x200
/* low-level environment variables */
#define __HFI_ENVDBG 0x400
#define __HFI_EPKTDBG 0x800	/* print error packet data */
#define __HFI_CCADBG 0x1000	/* print CCA related events */
#else /* _HFI_DEBUGGING */

/*
 * define all of these even with debugging off, for the few places that do
 * if(hfi_debug & _HFI_xyzzy), but in a way that will make the
 * compiler eliminate the code
 */

#define __HFI_INFO 0x0		/* generic low verbosity stuff */
#define __HFI_DBG 0x0		/* generic debug */
#define __HFI_TRSAMPLE 0x0	/* generate trace buffer sample entries */
#define __HFI_VERBDBG 0x0	/* very verbose debug */
#define __HFI_PKTDBG 0x0	/* print packet data */
#define __HFI_PROCDBG 0x0	/* print process startup (init)/exit messages */
/* print mmap/nopage stuff, not using VDBG any more */
#define __HFI_MMDBG 0x0
/* FIX: these two were missing from the debugging-off branch, so any
 * "hfi_debug & __HFI_ENVDBG/__HFI_EPKTDBG" test failed to compile when
 * built with _HFI_DEBUGGING 0. Define them as 0 like the others. */
#define __HFI_ENVDBG 0x0	/* low-level environment variables */
#define __HFI_EPKTDBG 0x0	/* print error packet data */
#define __HFI_CCADBG 0x0	/* print CCA related events */

#endif /* _HFI_DEBUGGING */

/* historical alias */
#define __HFI_VERBOSEDBG __HFI_VERBDBG

#endif /* OPA_DEBUG_H */
diff --git a/include/opa_intf.h b/include/opa_intf.h
new file mode 100644
index 0000000..e187d7d
--- /dev/null
+++ b/include/opa_intf.h
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* User-space compiler/platform glue: forced inlining, stub "atomics",
 * and branch-prediction hints used throughout the OPA code. */
#ifndef OPA_INTF_H
#define OPA_INTF_H

#include <sys/uio.h>
#include <sys/types.h>
#include <stdint.h>

/* Redefine __inline__ to force inlining (and silence unused-function
 * warnings); done BEFORE including sysdep.h/bit_ops.h below so any
 * __inline__ functions they define pick up the attribute. */
#ifdef __inline__
#undef __inline__
#endif
#define __inline__ inline __attribute__((always_inline, unused))

#include "sysdep.h"
#include "bit_ops.h"

/* these aren't implemented for user mode, which is OK until we multi-thread */
/* NOTE: plain (non-atomic) read-modify-write — NOT thread-safe. */
typedef struct _atomic {
	uint32_t counter;
} atomic_t;	/* no atomic_t type in user-land */
#define atomic_set(a, v) ((a)->counter = (v))
#define atomic_inc_return(a) (++(a)->counter)

#if defined(__GNUC__)
/* Branch-prediction hints: likely()/unlikely() wrap __builtin_expect;
 * if_pt/if_pf spell "if, predicted taken / predicted false". */
#define likely(x) __builtin_expect(!!(x), 1L)
#define unlikely(x) __builtin_expect(!!(x), 0L)
#define if_pt(cond) if (likely(cond))
#define if_pf(cond) if (unlikely(cond))
#define _Pragma_unlikely
#define _Pragma_likely
#else
#error "Unsupported compiler"
#endif

/* NOTE(review): relies on <sched.h> being in scope at the call site —
 * it is not included here; confirm callers provide it. */
#define yield() sched_yield()
#endif /* OPA_INTF_H */
diff --git a/include/opa_queue.h b/include/opa_queue.h
new file mode 100644
index 0000000..f3d9595
--- /dev/null
+++ b/include/opa_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $
+ */
+
+#ifndef OPA_QUEUE_H_
+#define OPA_QUEUE_H_
+
+/*
+ * This file defines five types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists, tail queues, and circular queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * A circle queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or after
+ * an existing element, at the head of the list, or at the end of the list.
+ * A circle queue may be traversed in either direction, but has a more
+ * complex end of list detection.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ CIRCLEQ
+ * _HEAD + + + + +
+ * _HEAD_INITIALIZER + + + + +
+ * _ENTRY + + + + +
+ * _INIT + + + + +
+ * _EMPTY + + + + +
+ * _FIRST + + + + +
+ * _NEXT + + + + +
+ * _PREV - - - + +
+ * _LAST - - + + +
+ * _FOREACH + + + + +
+ * _FOREACH_REVERSE - - - + +
+ * _INSERT_HEAD + + + + +
+ * _INSERT_BEFORE - + - + +
+ * _INSERT_AFTER + + + + +
+ * _INSERT_TAIL - - + + +
+ * _REMOVE_HEAD + - + - -
+ * _REMOVE + + + + +
+ *
+ */
+
/*
 * Singly-linked List declarations.
 * One forward pointer per element; O(1) insert at head/after, O(n)
 * arbitrary removal; forward traversal only.
 */
#define SLIST_HEAD(name, type)						\
struct name {								\
	struct type *slh_first;	/* first element */			\
}

#define SLIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#define SLIST_ENTRY(type)						\
struct {								\
	struct type *sle_next;	/* next element */			\
}

/*
 * Singly-linked List functions.
 */
#define SLIST_EMPTY(head)	((head)->slh_first == NULL)

#define SLIST_FIRST(head)	((head)->slh_first)

/* NOTE: var's next pointer is read AFTER the loop body runs, so the
 * current element must not be freed/unlinked inside the loop. */
#define SLIST_FOREACH(var, head, field)					\
	for ((var) = SLIST_FIRST((head));				\
	    (var);							\
	    (var) = SLIST_NEXT((var), field))

#define SLIST_INIT(head) do {						\
	SLIST_FIRST((head)) = NULL;					\
} while (0)

#define SLIST_INSERT_AFTER(slistelm, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field);	\
	SLIST_NEXT((slistelm), field) = (elm);				\
} while (0)

#define SLIST_INSERT_HEAD(head, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_FIRST((head));			\
	SLIST_FIRST((head)) = (elm);					\
} while (0)

#define SLIST_NEXT(elm, field)	((elm)->field.sle_next)

/* O(n): walks from the head to find the predecessor of elm. */
#define SLIST_REMOVE(head, elm, type, field) do {			\
	if (SLIST_FIRST((head)) == (elm)) {				\
		SLIST_REMOVE_HEAD((head), field);			\
	}								\
	else {								\
		struct type *curelm = SLIST_FIRST((head));		\
		while (SLIST_NEXT(curelm, field) != (elm))		\
			curelm = SLIST_NEXT(curelm, field);		\
		SLIST_NEXT(curelm, field) =				\
		    SLIST_NEXT(SLIST_NEXT(curelm, field), field);	\
	}								\
} while (0)

#define SLIST_REMOVE_HEAD(head, field) do {				\
	SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field);	\
} while (0)
+
/*
 * Singly-linked Tail queue declarations.
 * stqh_last always holds the ADDRESS of the last element's stqe_next
 * pointer (or of stqh_first when the queue is empty), which makes
 * O(1) insertion at the tail possible with singly-linked elements.
 */
#define STAILQ_HEAD(name, type)						\
struct name {								\
	struct type *stqh_first;/* first element */			\
	struct type **stqh_last;/* addr of last next element */		\
}

#define STAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).stqh_first }

#define STAILQ_ENTRY(type)						\
struct {								\
	struct type *stqe_next;	/* next element */			\
}

/*
 * Singly-linked Tail queue functions.
 */
#define STAILQ_EMPTY(head)	((head)->stqh_first == NULL)

#define STAILQ_FIRST(head)	((head)->stqh_first)

#define STAILQ_FOREACH(var, head, field)				\
	for ((var) = STAILQ_FIRST((head));				\
	    (var);							\
	    (var) = STAILQ_NEXT((var), field))

#define STAILQ_INIT(head) do {						\
	STAILQ_FIRST((head)) = NULL;					\
	(head)->stqh_last = &STAILQ_FIRST((head));			\
} while (0)

/* Every insert/remove must keep stqh_last pointing at the tail's
 * next-pointer, hence the == NULL checks below. */
#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do {		\
	if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_NEXT((tqelm), field) = (elm);				\
} while (0)

#define STAILQ_INSERT_HEAD(head, elm, field) do {			\
	if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL)	\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_FIRST((head)) = (elm);					\
} while (0)

#define STAILQ_INSERT_TAIL(head, elm, field) do {			\
	STAILQ_NEXT((elm), field) = NULL;				\
	*(head)->stqh_last = (elm);					\
	(head)->stqh_last = &STAILQ_NEXT((elm), field);			\
} while (0)

/* Recovers the last element from stqh_last by subtracting the entry
 * field's offset; NULL when the queue is empty. */
#define STAILQ_LAST(head, type, field)					\
	(STAILQ_EMPTY(head) ?						\
		NULL :							\
		((struct type *)					\
		((char *)((head)->stqh_last) - offsetof(struct type, field))))

#define STAILQ_NEXT(elm, field)	((elm)->field.stqe_next)

/* O(n): walks from the head to find the predecessor of elm. */
#define STAILQ_REMOVE(head, elm, type, field) do {			\
	if (STAILQ_FIRST((head)) == (elm)) {				\
		STAILQ_REMOVE_HEAD(head, field);			\
	}								\
	else {								\
		struct type *curelm = STAILQ_FIRST((head));		\
		while (STAILQ_NEXT(curelm, field) != (elm))		\
			curelm = STAILQ_NEXT(curelm, field);		\
		if ((STAILQ_NEXT(curelm, field) =			\
		     STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
			(head)->stqh_last = &STAILQ_NEXT((curelm), field);\
	}								\
} while (0)

#define STAILQ_REMOVE_HEAD(head, field) do {				\
	if ((STAILQ_FIRST((head)) =					\
	     STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL)		\
		(head)->stqh_last = &STAILQ_FIRST((head));		\
} while (0)

/* Drops elements from the head up to AND INCLUDING elm. */
#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do {			\
	if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL)	\
		(head)->stqh_last = &STAILQ_FIRST((head));		\
} while (0)
+
/*
 * List declarations.
 * Doubly linked via le_next plus le_prev, where le_prev stores the
 * ADDRESS of the previous element's next pointer — this allows O(1)
 * removal of an arbitrary element without a back pointer to the node.
 */
#define LIST_HEAD(name, type)						\
struct name {								\
	struct type *lh_first;	/* first element */			\
}

#define LIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#define LIST_ENTRY(type)						\
struct {								\
	struct type *le_next;	/* next element */			\
	struct type **le_prev;	/* address of previous next element */	\
}

/*
 * List functions.
 */

#define LIST_EMPTY(head)	((head)->lh_first == NULL)

#define LIST_FIRST(head)	((head)->lh_first)

#define LIST_FOREACH(var, head, field)					\
	for ((var) = LIST_FIRST((head));				\
	    (var);							\
	    (var) = LIST_NEXT((var), field))

#define LIST_INIT(head) do {						\
	LIST_FIRST((head)) = NULL;					\
} while (0)

#define LIST_INSERT_AFTER(listelm, elm, field) do {			\
	if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
		LIST_NEXT((listelm), field)->field.le_prev =		\
		    &LIST_NEXT((elm), field);				\
	LIST_NEXT((listelm), field) = (elm);				\
	(elm)->field.le_prev = &LIST_NEXT((listelm), field);		\
} while (0)

#define LIST_INSERT_BEFORE(listelm, elm, field) do {			\
	(elm)->field.le_prev = (listelm)->field.le_prev;		\
	LIST_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.le_prev = (elm);				\
	(listelm)->field.le_prev = &LIST_NEXT((elm), field);		\
} while (0)

#define LIST_INSERT_HEAD(head, elm, field) do {				\
	if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)	\
		LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
	LIST_FIRST((head)) = (elm);					\
	(elm)->field.le_prev = &LIST_FIRST((head));			\
} while (0)

#define LIST_NEXT(elm, field)	((elm)->field.le_next)

/* O(1) unlink; no head argument needed thanks to le_prev. */
#define LIST_REMOVE(elm, field) do {					\
	if (LIST_NEXT((elm), field) != NULL)				\
		LIST_NEXT((elm), field)->field.le_prev =		\
		    (elm)->field.le_prev;				\
	*(elm)->field.le_prev = LIST_NEXT((elm), field);		\
} while (0)
+
/*
 * Tail queue declarations.
 * Doubly linked with an indirect prev pointer (tqe_prev holds the
 * ADDRESS of the previous next pointer); tqh_last tracks the tail so
 * insertion at either end and arbitrary removal are all O(1).
 */
#define TAILQ_HEAD(name, type)						\
struct name {								\
	struct type *tqh_first;	/* first element */			\
	struct type **tqh_last;	/* addr of last next element */		\
}

#define TAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).tqh_first }

#define TAILQ_ENTRY(type)						\
struct {								\
	struct type *tqe_next;	/* next element */			\
	struct type **tqe_prev;	/* address of previous next element */	\
}

/*
 * Tail queue functions.
 */
#define TAILQ_EMPTY(head)	((head)->tqh_first == NULL)

#define TAILQ_FIRST(head)	((head)->tqh_first)

#define TAILQ_FOREACH(var, head, field)					\
	for ((var) = TAILQ_FIRST((head));				\
	    (var);							\
	    (var) = TAILQ_NEXT((var), field))

#define TAILQ_FOREACH_REVERSE(var, head, headname, field)		\
	for ((var) = TAILQ_LAST((head), headname);			\
	    (var);							\
	    (var) = TAILQ_PREV((var), headname, field))

#define TAILQ_INIT(head) do {						\
	TAILQ_FIRST((head)) = NULL;					\
	(head)->tqh_last = &TAILQ_FIRST((head));			\
} while (0)

#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
		    &TAILQ_NEXT((elm), field);				\
	else								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
	TAILQ_NEXT((listelm), field) = (elm);				\
	(elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);		\
} while (0)

#define TAILQ_INSERT_BEFORE(listelm, elm, field) do {			\
	(elm)->field.tqe_prev = (listelm)->field.tqe_prev;		\
	TAILQ_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.tqe_prev = (elm);				\
	(listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);		\
} while (0)

#define TAILQ_INSERT_HEAD(head, elm, field) do {			\
	if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)	\
		TAILQ_FIRST((head))->field.tqe_prev =			\
		    &TAILQ_NEXT((elm), field);				\
	else								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
	TAILQ_FIRST((head)) = (elm);					\
	(elm)->field.tqe_prev = &TAILQ_FIRST((head));			\
} while (0)

#define TAILQ_INSERT_TAIL(head, elm, field) do {			\
	TAILQ_NEXT((elm), field) = NULL;				\
	(elm)->field.tqe_prev = (head)->tqh_last;			\
	*(head)->tqh_last = (elm);					\
	(head)->tqh_last = &TAILQ_NEXT((elm), field);			\
} while (0)

/* tqh_last points at the last element's tqe_next; reinterpreting the
 * containing struct as a head lets tqh_last double as a back pointer.
 * (The same trick drives TAILQ_PREV below.) */
#define TAILQ_LAST(head, headname)					\
	(*(((struct headname *)((head)->tqh_last))->tqh_last))

#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)

#define TAILQ_PREV(elm, headname, field)				\
	(*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))

/* O(1) unlink; keeps tqh_last valid when the tail is removed. */
#define TAILQ_REMOVE(head, elm, field) do {				\
	if ((TAILQ_NEXT((elm), field)) != NULL)				\
		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
		    (elm)->field.tqe_prev;				\
	else								\
		(head)->tqh_last = (elm)->field.tqe_prev;		\
	*(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);		\
} while (0)
+
/*
 * Circular queue declarations.
 * Doubly linked ring: the head structure itself acts as the sentinel,
 * so "end of list" is detected by comparing against (void *)(head).
 */
#define CIRCLEQ_HEAD(name, type)					\
struct name {								\
	struct type *cqh_first;		/* first element */		\
	struct type *cqh_last;		/* last element */		\
}

#define CIRCLEQ_HEAD_INITIALIZER(head)					\
	{ (void *)&(head), (void *)&(head) }

#define CIRCLEQ_ENTRY(type)						\
struct {								\
	struct type *cqe_next;		/* next element */		\
	struct type *cqe_prev;		/* previous element */		\
}

/*
 * Circular queue functions.
 */
#define CIRCLEQ_EMPTY(head)	((head)->cqh_first == (void *)(head))

#define CIRCLEQ_FIRST(head)	((head)->cqh_first)

/* The "|| ((var) = NULL)" clause fires when the walk wraps back to the
 * head sentinel: it stops the loop AND leaves var == NULL afterwards. */
#define CIRCLEQ_FOREACH(var, head, field)				\
	for ((var) = CIRCLEQ_FIRST((head));				\
	    (var) != (void *)(head) || ((var) = NULL);			\
	    (var) = CIRCLEQ_NEXT((var), field))

#define CIRCLEQ_FOREACH_REVERSE(var, head, field)			\
	for ((var) = CIRCLEQ_LAST((head));				\
	    (var) != (void *)(head) || ((var) = NULL);			\
	    (var) = CIRCLEQ_PREV((var), field))

#define CIRCLEQ_INIT(head) do {						\
	CIRCLEQ_FIRST((head)) = (void *)(head);				\
	CIRCLEQ_LAST((head)) = (void *)(head);				\
} while (0)

#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field);	\
	CIRCLEQ_PREV((elm), field) = (listelm);				\
	if (CIRCLEQ_NEXT((listelm), field) == (void *)(head))		\
		CIRCLEQ_LAST((head)) = (elm);				\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\
	CIRCLEQ_NEXT((listelm), field) = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {		\
	CIRCLEQ_NEXT((elm), field) = (listelm);				\
	CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field);	\
	if (CIRCLEQ_PREV((listelm), field) == (void *)(head))		\
		CIRCLEQ_FIRST((head)) = (elm);				\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\
	CIRCLEQ_PREV((listelm), field) = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_HEAD(head, elm, field) do {			\
	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head));		\
	CIRCLEQ_PREV((elm), field) = (void *)(head);			\
	if (CIRCLEQ_LAST((head)) == (void *)(head))			\
		CIRCLEQ_LAST((head)) = (elm);				\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm);	\
	CIRCLEQ_FIRST((head)) = (elm);					\
} while (0)

#define CIRCLEQ_INSERT_TAIL(head, elm, field) do {			\
	CIRCLEQ_NEXT((elm), field) = (void *)(head);			\
	CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head));		\
	if (CIRCLEQ_FIRST((head)) == (void *)(head))			\
		CIRCLEQ_FIRST((head)) = (elm);				\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm);	\
	CIRCLEQ_LAST((head)) = (elm);					\
} while (0)

#define CIRCLEQ_LAST(head)	((head)->cqh_last)

#define CIRCLEQ_NEXT(elm, field)	((elm)->field.cqe_next)

#define CIRCLEQ_PREV(elm, field)	((elm)->field.cqe_prev)

/* O(1) unlink; each branch handles the neighbor being the sentinel. */
#define CIRCLEQ_REMOVE(head, elm, field) do {				\
	if (CIRCLEQ_NEXT((elm), field) == (void *)(head))		\
		CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field);	\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) =	\
		    CIRCLEQ_PREV((elm), field);				\
	if (CIRCLEQ_PREV((elm), field) == (void *)(head))		\
		CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field);	\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) =	\
		    CIRCLEQ_NEXT((elm), field);				\
} while (0)
+
+#endif /* !OPA_QUEUE_H_ */
diff --git a/include/opa_revision.h b/include/opa_revision.h
new file mode 100644
index 0000000..4a28821
--- /dev/null
+++ b/include/opa_revision.h
@@ -0,0 +1,64 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
#ifndef OPA_REVISION_H
#define OPA_REVISION_H

/* Build/version identification strings.
 * Those variables are defined in the _revision.c file, which is
 * dynamically generated during building of the library; only the
 * declarations live here. All are NUL-terminated C strings. */
extern char psmi_hfi_IFS_version[];
extern char psmi_hfi_build_timestamp[];
extern char psmi_hfi_sources_checksum[];
extern char psmi_hfi_git_checksum[];

#endif /* OPA_REVISION_H */
diff --git a/include/opa_service.h b/include/opa_service.h
new file mode 100644
index 0000000..16cf0fd
--- /dev/null
+++ b/include/opa_service.h
@@ -0,0 +1,268 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_SERVICE_H
+#define OPA_SERVICE_H
+
+/* This file contains all the lowest level routines calling into sysfs */
+/* and qib driver. All other calls are based on these routines. */
+
+#include <libgen.h>
+
+#include "opa_intf.h"
+#include "opa_common.h"
+#include "opa_udebug.h"
+#include "opa_byteorder.h"
+
+/* upper and lower bounds for HFI port numbers */
+#define HFI_MIN_PORT 1
+#define HFI_MAX_PORT 1
+#define HFI_NUM_PORTS (HFI_MAX_PORT - HFI_MIN_PORT + 1)
+/* any unit id to match. */
+#define HFI_UNIT_ID_ANY ((long)-1)
+/* any port num to match. */
+#define HFI_PORT_NUM_ANY ((long)0)
+
+/* base name of path (without unit #) for qib driver */
+#define HFI_DEVICE_PATH "/dev/hfi1"
+#define HFI_CLASS_PATH "/sys/class/infiniband/hfi1"
+
+/* Commands used to communicate with driver. */
+enum PSMI_HFI_CMD {
+ PSMI_HFI_CMD_ASSIGN_CTXT = 0, /* allocate HFI and context */
+ PSMI_HFI_CMD_CTXT_INFO, /* find out what resources we got */
+ PSMI_HFI_CMD_USER_INFO, /* set up userspace */
+ PSMI_HFI_CMD_TID_UPDATE, /* update expected TID entries */
+ PSMI_HFI_CMD_TID_FREE, /* free expected TID entries */
+ PSMI_HFI_CMD_CREDIT_UPD, /* force an update of PIO credit */
+ PSMI_HFI_CMD_RECV_CTRL, /* control receipt of packets */
+ PSMI_HFI_CMD_POLL_TYPE, /* set the kind of polling we want */
+ PSMI_HFI_CMD_ACK_EVENT, /* ack & clear user status bits */
+ PSMI_HFI_CMD_SET_PKEY, /* set context's pkey */
+ PSMI_HFI_CMD_CTXT_RESET, /* reset context's HW send context */
+ PSMI_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */
+ PSMI_HFI_CMD_GET_VERS, /* get the version of the user cdev */
+
+#ifdef PSM_CUDA
+ PSMI_HFI_CMD_TID_UPDATE_V2 = 28,
+#endif
+ PSMI_HFI_CMD_LAST,
+};
+
+/* Legacy commands used to communicate with driver using 'write' */
+enum LEGACY_HFI1_CMD {
+ LEGACY_HFI1_CMD_ASSIGN_CTXT = 1, /* allocate HFI and context */
+ LEGACY_HFI1_CMD_CTXT_INFO = 2, /* find out what resources we got */
+ LEGACY_HFI1_CMD_USER_INFO = 3, /* set up userspace */
+ LEGACY_HFI1_CMD_TID_UPDATE = 4, /* update expected TID entries */
+ LEGACY_HFI1_CMD_TID_FREE = 5, /* free expected TID entries */
+ LEGACY_HFI1_CMD_CREDIT_UPD = 6, /* force an update of PIO credit */
+
+ LEGACY_HFI1_CMD_RECV_CTRL = 8, /* control receipt of packets */
+ LEGACY_HFI1_CMD_POLL_TYPE = 9, /* set the kind of polling we want */
+ LEGACY_HFI1_CMD_ACK_EVENT = 10, /* ack & clear user status bits */
+ LEGACY_HFI1_CMD_SET_PKEY = 11, /* set context's pkey */
+ LEGACY_HFI1_CMD_CTXT_RESET = 12, /* reset context's HW send context */
+ LEGACY_HFI1_CMD_TID_INVAL_READ = 13, /* read TID cache invalidations */
+ LEGACY_HFI1_CMD_GET_VERS = 14 /* get the version of the user cdev */
+};
+
+/* Given a unit number and port number, returns 1 if the unit and port are active,
+ 0 if the unit and port are not active, and -1 if an error occurs. */
+int hfi_get_port_active(int, int);
+
+/* Given the unit number and port, return an error, or the corresponding LID */
+/* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port
+ are not active. 0 indicates that the unit is valid, but no LID has been assigned. */
+int hfi_get_port_lid(int, int);
+
+/* Given the unit number and port, return an error, or the corresponding GID */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_gid(int, int, uint64_t *hi, uint64_t *lo);
+
+/* Given the unit number, return an error, or the corresponding LMC value
+ for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_lmc(int unit, int port);
+
+/* Given the unit number, return an error, or the corresponding link rate
+ for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port);
+
+/* Given a unit, port and SL, return an error, or the corresponding SC for the
+ SL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sl2sc(int unit, int port, int sl);
+
+/* Given a unit, port and SC, return an error, or the corresponding VL for the
+ SC as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sc2vl(int unit, int port, int sc);
+
+/* Given a unit, port and VL, return an error, or the corresponding MTU for the
+ VL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_vl2mtu(int unit, int port, int vl);
+
+/* Given a unit, port and index, return an error, or the corresponding pkey for
+ the index as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_index2pkey(int unit, int port, int index);
+
+/* Get the number of units supported by the driver. Does not guarantee
+ that a working chip has been found for each possible unit #. */
+/* Returns -1 with errno set, or number of units >=0 (0 means none found). */
+int hfi_get_num_units(void);
+
+/* Given a unit number, returns 1 if any port on the unit is active,
+ 0 if no port on the unit is active,
+ and -1 if an error occurs. */
+int hfi_get_unit_active(int unit);
+
+/* get the number of contexts from the unit id. */
+/* Returns 0 if no unit or no match. */
+int hfi_get_num_contexts(int unit);
+
+/* Open hfi device file, return -1 on error. */
+int hfi_context_open(int unit, int port, uint64_t open_timeout);
+int hfi_context_open_ex(int unit, int port, uint64_t open_timeout,
+ char *dev_name,size_t dev_name_len);
+void hfi_context_close(int fd);
+
+/* hfi_get_user_major_version() returns the major version of the driver
+ that should be used for this session of psm. Valid only after
+ hfi_context_open has been called. */
+uint16_t hfi_get_user_major_version(void);
+
+/* hfi_get_user_minor_version() returns the minor version of the driver */
+uint16_t hfi_get_user_minor_version(void);
+
+void hfi_set_user_version(uint32_t version);
+void hfi_set_user_major_version(uint16_t major_version);
+
+int hfi_cmd_write(int fd, struct hfi1_cmd *, size_t count);
+int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt);
+
+int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf);
+int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp);
+
+/* We use mmap64() because we compile in both 32 and 64 bit mode,
+ and we have to map physical addresses that are > 32 bits long.
+ While linux implements mmap64, it doesn't have a man page,
+ and isn't declared in any header file, so we declare it here ourselves. */
+
+/* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+ redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+ it doesn't work when the address being mapped is > 32 bits. It chops
+ off bits 32 and above. So we stay with mmap64. */
+extern void *mmap64(void *, size_t, int, int, int, __off64_t);
+void *hfi_mmap64(void *, size_t, int, int, int, __off64_t);
+
+/* Statistics maintained by the driver */
+int hfi_get_stats(uint64_t *, int);
+int hfi_get_stats_names(char **namep);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit(int unitno, uint64_t *, int);
+int hfi_get_ctrs_unit_names(int unitno, char **namep);
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int hfi_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* Calls stat() for the given attribute; the return value is passed
+ through unchanged from stat(), and sbuf is populated by stat() too. */
+int hfi_sysfs_stat(const char *attr,struct stat *sbuf);
+
+/* read a signed 64-bit quantity, in some arbitrary base */
+int hfi_sysfs_read_s64(const char *attr, int64_t *valp, int base);
+
+/* read a string value */
+int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+ char **datap);
+
+/* open attribute in unit's sysfs directory via open(2) */
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags);
+/* print to attribute in {unit,port} sysfs directory */
+int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
+ const char *fmt, ...)
+ __attribute__((format(printf, 4, 5)));
+int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+
+int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data,
+ size_t len);
+/* read up to one page of malloc'ed data (caller must free), returning
+ number of bytes read or -1 */
+int hfi_hfifs_read(const char *attr, char **datap);
+int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data);
+/* read a signed 64-bit quantity, in some arbitrary base */
+int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+ int64_t *valp, int base);
+int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+ int64_t *valp, int base);
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit);
+/* these read directly into supplied buffer and take a count */
+int hfi_hfifs_rd(const char *, void *, int);
+int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int);
+
+int hfi_hfifs_open(const char *relname, int flags);
+
+/* wait for device special file to show up. timeout is in
+ * milliseconds, 0 is "callee knows best", < 0 is infinite. */
+int hfi_wait_for_device(const char *path, long timeout);
+
+int hfi_cmd_wait_for_packet(int fd);
+
+#endif /* OPA_SERVICE_H */
diff --git a/include/opa_udebug.h b/include/opa_udebug.h
new file mode 100644
index 0000000..9fd59cb
--- /dev/null
+++ b/include/opa_udebug.h
@@ -0,0 +1,194 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_UDEBUG_H
+#define OPA_UDEBUG_H
+
+#include <stdio.h>
+#include "opa_debug.h"
+
+extern unsigned hfi_debug;
+const char *hfi_get_unit_name(int unit);
+extern char *__progname;
+
+static const char hfi_ident_tag[] = "PSM2_IDENTIFY";
+char *hfi_get_mylabel();
+
+#if _HFI_DEBUGGING
+
+extern char *__hfi_mylabel;
+void hfi_set_mylabel(char *);
+extern FILE *__hfi_dbgout;
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_INFO(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&__HFI_INFO)) \
+ printf("%s%s: " fmt, __hfi_mylabel, __func__, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG)
+
+#define __HFI_DBG_WHICH(which, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&(which))) \
+ fprintf(__hfi_dbgout, "%s%s: " fmt, __hfi_mylabel, __func__, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&(which))) \
+ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__)
+#define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_EPDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__)
+#define _HFI_ENVDBG(lev, fmt, ...) \
+ __HFI_DBG_WHICH_NOFUNC( \
+ (lev == 0) ? __HFI_INFO : \
+ (lev > 1 ? __HFI_ENVDBG : (__HFI_PROCDBG|__HFI_ENVDBG)),\
+ "env " fmt, ##__VA_ARGS__)
+#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__)
+#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__)
+
+/*
+ * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together
+ * for a scope of code preparing debug info for printing; e.g.
+ * if (_HFI_DBG_ON) {
+ * // put your code here
+ * _HFI_DBG_ALWAYS(print your results here);
+ * }
+ */
+#define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG)
+#define _HFI_DBG_ALWAYS(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG)
+#define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG)
+#define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG)
+#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO)
+#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#else /* ! _HFI_DEBUGGING */
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+ do { \
+ printf("%s" fmt, "", ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+ do { \
+ printf("%s" fmt, "", ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_INFO(fmt, ...)
+
+#define __HFI_PKTDBG_ON 0
+
+#define _HFI_DBG(fmt, ...)
+#define _HFI_PDBG(fmt, ...)
+#define _HFI_EPDBG(fmt, ...)
+#define _HFI_PRDBG(fmt, ...)
+#define _HFI_ENVDBG(lev, fmt, ...)
+#define _HFI_VDBG(fmt, ...)
+#define _HFI_MMDBG(fmt, ...)
+#define _HFI_CCADBG(fmt, ...)
+
+#define _HFI_DBG_ON 0
+#define _HFI_DBG_ALWAYS(fmt, ...)
+#define _HFI_VDBG_ON 0
+#define _HFI_VDBG_ALWAYS(fmt, ...)
+#define _HFI_PRDBG_ON 0
+#define _HFI_PRDBG_ALWAYS(fmt, ...)
+#define _HFI_CCADBG_ON 0
+#define _HFI_CCADBG_ALWAYS(fmt, ...)
+#define _HFI_INFO_ON 0
+#define _HFI_INFO_ALWAYS(fmt, ...)
+
+#endif /* _HFI_DEBUGGING */
+
+#endif /* OPA_UDEBUG_H */
diff --git a/include/opa_user.h b/include/opa_user.h
new file mode 100644
index 0000000..274d674
--- /dev/null
+++ b/include/opa_user.h
@@ -0,0 +1,973 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_USER_H
+#define OPA_USER_H
+
+/* This file contains all of the data structures and routines that are
+ publicly visible and usable (to low level infrastructure code; it is
+ not expected that any application, or even normal application-level library,
+ will ever need to use any of this).
+
+ Additional entry points and data structures that are used by these routines
+ may be referenced in this file, but they should not be generally available;
+ they are visible here only to allow use in inlined functions. Any variable,
+ data structure, or function that starts with a leading "_" is in this
+ category.
+*/
+
+/* Include header files we need that are unlikely to otherwise be needed by */
+/* programs. */
+#include <stddef.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <syslog.h>
+#include "opa_intf.h"
+#include "opa_common.h"
+#include "opa_byteorder.h"
+#include "opa_udebug.h"
+#include "opa_service.h"
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * hfi_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define HFI_RHF_LENGTH_MASK 0xFFF
+#define HFI_RHF_LENGTH_SHIFT 0
+#define HFI_RHF_RCVTYPE_MASK 0x7
+#define HFI_RHF_RCVTYPE_SHIFT 12
+#define HFI_RHF_USE_EGRBFR_MASK 0x1
+#define HFI_RHF_USE_EGRBFR_SHIFT 15
+#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF
+#define HFI_RHF_EGRBFR_INDEX_SHIFT 16
+#define HFI_RHF_SEQ_MASK 0xF
+#define HFI_RHF_SEQ_SHIFT 28
+
+#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF
+#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0
+#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF
+#define HFI_RHF_HDRQ_OFFSET_SHIFT 12
+
+#define HFI_RHF_ICRCERR 0x80000000
+#define HFI_RHF_ECCERR 0x20000000
+#define HFI_RHF_LENERR 0x10000000
+#define HFI_RHF_TIDERR 0x08000000
+
+#define HFI_RHF_TFGENERR 0x04000000
+#define HFI_RHF_TFSEQERR 0x02000000
+#define HFI_RHF_RCVTYPEERR 0x07000000
+
+#define HFI_RHF_DCERR 0x00800000
+#define HFI_RHF_DCUNCERR 0x00400000
+#define HFI_RHF_KHDRLENERR 0x00200000
+/* Change from 0xFFE00000 to 0xFDE00000, so that we don't commit to the
+ * error path on a SeqErr too soon - with RSM, the HFI may report a
+ * false SeqErr condition */
+#define HFI_RHF_ERR_MASK 0xFDE00000
+
+/* TidFlow related bits */
+#define HFI_TF_SEQNUM_SHIFT 0
+#define HFI_TF_SEQNUM_MASK 0x7ff
+#define HFI_TF_GENVAL_SHIFT 11
+#define HFI_TF_GENVAL_MASK 0xfffff
+
+#define HFI_TF_FLOWVALID_SHIFT 32
+#define HFI_TF_FLOWVALID_MASK 0x1
+#define HFI_TF_HDRSUPP_ENABLED_SHIFT 33
+#define HFI_TF_HDRSUPP_ENABLED_MASK 0x1
+
+#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34
+#define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1
+#define HFI_TF_KEEP_ON_GENERR_SHIFT 35
+#define HFI_TF_KEEP_ON_GENERR_MASK 0x1
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1
+#define HFI_TF_STATUS_SHIFT 37
+#define HFI_TF_STATUS_MASK 0x3
+#define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37
+#define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1
+#define HFI_TF_STATUS_GENMISMATCH_SHIFT 38
+#define HFI_TF_STATUS_GENMISMATCH_MASK 0x1
+
+#define HFI_TF_INVALID (~0U)
+#define HFI_TF_INVALID_GENERATION (~0U)
+#define HFI_TF_NFLOWS 32
+
+/* PBC bits */
+#define HFI_PBC_STATICRCC_SHIFT 0
+#define HFI_PBC_STATICRCC_MASK 0xffff
+
+#define HFI_PBC_SC4_SHIFT 4
+#define HFI_PBC_SC4_MASK 0x1
+
+#define HFI_PBC_INTR_SHIFT 31
+#define HFI_PBC_DCINFO_SHIFT 30
+#define HFI_PBC_TESTEBP_SHIFT 29
+#define HFI_PBC_PACKETBYPASS_SHIFT 28
+#define HFI_PBC_INSERTHCRC_SHIFT 26
+#define HFI_PBC_INSERTHCRC_MASK 0x3
+#define HFI_PBC_CREDITRETURN_SHIFT 25
+#define HFI_PBC_INSERTBYPASSICRC_SHIFT 24
+#define HFI_PBC_TESTBADICRC_SHIFT 23
+#define HFI_PBC_FECN_SHIFT 22
+#define HFI_PBC_VL_SHIFT 12
+#define HFI_PBC_VL_MASK 0xf
+#define HFI_PBC_LENGTHDWS_SHIFT 0
+#define HFI_PBC_LENGTHDWS_MASK 0xfff
+
+/* IB - LRH header consts */
+#define HFI_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define HFI_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+#define HFI_LRH_SC_SHIFT 12
+#define HFI_LRH_SC_MASK 0xf
+#define HFI_LRH_LVER_SHIFT 8
+#define HFI_LRH_LVER_MASK 0xf
+#define HFI_LRH_SL_SHIFT 4
+#define HFI_LRH_SL_MASK 0xf
+#define HFI_LRH_PKTLEN_MASK 0xfff
+
+/* IB - BTH header consts */
+#define HFI_BTH_OPCODE_SHIFT 24
+#define HFI_BTH_OPCODE_MASK 0xff
+#define HFI_BTH_SE_SHIFT 23
+#define HFI_BTH_MIGREQ_SHIFT 22
+#define HFI_BTH_EXTRA_BYTE_SHIFT 20
+#define HFI_BTH_EXTRA_BYTE_MASK 3
+#define HFI_BTH_TVER_SHIFT 16
+#define HFI_BTH_TVER_MASK 0xF
+
+#define HFI_BTH_BECN_SHIFT 30
+#define HFI_BTH_FECN_SHIFT 31
+#define HFI_BTH_QP_SHIFT 16
+#define HFI_BTH_QP_MASK 0xff
+#define HFI_BTH_FLOWID_SHIFT 11
+#define HFI_BTH_FLOWID_MASK 0x1f
+#define HFI_BTH_SUBCTXT_SHIFT 8
+#define HFI_BTH_SUBCTXT_MASK 0x7
+
+#define HFI_BTH_SEQ_SHIFT 0
+#define HFI_BTH_SEQ_MASK 0x7ff
+#define HFI_BTH_GEN_SHIFT 11
+#define HFI_BTH_GEN_MASK 0xfffff
+#define HFI_BTH_ACK_SHIFT 31
+
+/* KDETH header consts */
+#define HFI_KHDR_OFFSET_MASK 0x7fff
+#define HFI_KHDR_OM_SHIFT 15
+#define HFI_KHDR_TID_SHIFT 16
+#define HFI_KHDR_TID_MASK 0x3ff
+#define HFI_KHDR_TIDCTRL_SHIFT 26
+#define HFI_KHDR_TIDCTRL_MASK 0x3
+#define HFI_KHDR_INTR_SHIFT 28
+#define HFI_KHDR_SH_SHIFT 29
+#define HFI_KHDR_KVER_SHIFT 30
+#define HFI_KHDR_KVER_MASK 0x3
+
+#define HFI_KHDR_MSGSEQ_MASK 0xffff
+#define HFI_KHDR_TINYLEN_MASK 0xf
+#define HFI_KHDR_TINYLEN_SHIFT 16
+#define HFI_KHDR_EGRFLAGS_SHIFT 20
+#define HFI_KHDR_EGRFLAGS_MASK 0x3f
+
+#define GET_HFI_KHDR_TIDCTRL(val) \
+ (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \
+ HFI_KHDR_TIDCTRL_MASK)
+
+#ifdef PSM_CUDA
+extern int is_driver_gpudirect_enabled;
+
+static __inline__ int _psmi_is_driver_gpudirect_enabled() __attribute__((always_inline));
+
+static __inline__ int
+_psmi_is_driver_gpudirect_enabled()
+{
+ return is_driver_gpudirect_enabled;
+}
+#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED _psmi_is_driver_gpudirect_enabled()
+#endif
+
+/* this portion only defines what we currently use */
+struct hfi_pbc {
+ __u32 pbc0;
+ __u16 PbcStaticRateControlCnt;
+ __u16 fill1;
+};
+
+/* hfi kdeth header format */
+struct hfi_kdeth {
+ __u32 kdeth0;
+
+ union {
+ struct {
+ __u16 job_key;
+ __u16 hcrc;
+ };
+ __u32 kdeth1;
+ };
+};
+
+/* misc. */
+#define HFI_CRC_SIZE_IN_BYTES 4
+#define HFI_PCB_SIZE_IN_BYTES 8
+
+#define HFI_EAGER_TIDCTRL 0x0
+
+#define HFI_DEFAULT_SERVICE_ID 0x1000117500000000ULL
+#define HFI_DEFAULT_P_KEY 0x8001 /* fabric default pkey for app traffic */
+
+#if 0
+#define HFI_PERMISSIVE_LID 0xFFFF
+#define HFI_AETH_CREDIT_SHIFT 24
+#define HFI_AETH_CREDIT_MASK 0x1F
+#define HFI_AETH_CREDIT_INVAL 0x1F
+#define HFI_PSN_MASK 0xFFFFFF
+#define HFI_MSN_MASK 0xFFFFFF
+#define HFI_QPN_MASK 0xFFFFFF
+#define HFI_MULTICAST_LID_BASE 0xC000
+#define HFI_MULTICAST_QPN 0xFFFFFF
+#endif
+
+/* Receive Header Queue: receive type (from hfi) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+/* OPA PSM assumes that the message header is always 56 bytes. */
+#define HFI_MESSAGE_HDR_SIZE 56
+/* Usable bytes in header (hdrsize - lrh - bth) */
+#define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20)
+/* SPIO includes 8B PBC and message header */
+#define HFI_SPIO_HDR_SIZE (8+56)
+/*
+ * SDMA includes 8B sdma hdr, 8B PBC, and message header.
+ * If we are using GPU workloads, we need to set a new
+ * "flags" member which takes another 2 bytes in the
+ * sdma hdr. We let the driver know of this 2 extra bytes
+ * at runtime when we set the length for the iovecs.
+ */
+#define HFI_SDMA_HDR_SIZE (8+8+56)
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 hfi_hdrget_err_flags(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[1]) & HFI_RHF_ERR_MASK;
+}
+
+static inline __u32 hfi_hdrget_rcv_type(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_RCVTYPE_SHIFT)
+ & HFI_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 hfi_hdrget_length_in_bytes(const __le32 *rbuf)
+{
+ return ((__le32_to_cpu(rbuf[0]) >> HFI_RHF_LENGTH_SHIFT)
+ & HFI_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 hfi_hdrget_egrbfr_index(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_EGRBFR_INDEX_SHIFT)
+ & HFI_RHF_EGRBFR_INDEX_MASK;
+}
+
+static inline __u32 hfi_hdrget_seq(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT)
+ & HFI_RHF_SEQ_MASK;
+}
+
+static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT)
+ & HFI_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 hfi_hdrget_egrbfr_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_EGRBFR_OFFSET_SHIFT)
+ & HFI_RHF_EGRBFR_OFFSET_MASK;
+}
+
+static inline __u32 hfi_hdrget_use_egrbfr(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_USE_EGRBFR_SHIFT)
+ & HFI_RHF_USE_EGRBFR_MASK;
+}
+
+/* interval timing routines */
+/* Convert a count of cycles to elapsed nanoseconds */
+/* this is only accurate for reasonably large numbers of cycles (at least tens)
+*/
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+ __attribute__ ((always_inline));
+/* convert elapsed nanoseconds to elapsed cycles */
+/* this is only accurate for reasonably large numbers of nsecs (at least tens)
+*/
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+ __attribute__ ((always_inline));
+/* get current count of nanoseconds from unspecified base value (only useful
+ for intervals) */
+static __inline__ uint64_t get_nanoseconds() __attribute__ ((always_inline));
+
+struct _hfi_ctrl {
+ int32_t fd; /* device file descriptor */
+ /* tidflow valid */
+ uint32_t __hfi_tfvalid;
+ /* unit id */
+ uint32_t __hfi_unit;
+ /* port id */
+ uint32_t __hfi_port;
+
+ /* number of eager tid entries */
+ uint32_t __hfi_tidegrcnt;
+ /* number of expected tid entries */
+ uint32_t __hfi_tidexpcnt;
+
+ /* effective mtu size, should be <= base_info.mtu */
+ uint32_t __hfi_mtusize;
+ /* max PIO size, should be <= effective mtu size */
+ uint32_t __hfi_piosize;
+
+ /* two struct output from driver. */
+ struct hfi1_ctxt_info ctxt_info;
+ struct hfi1_base_info base_info;
+
+ /* some local storages in some condition: */
+ /* as storage of __hfi_rcvtidflow in hfi_userinit(). */
+ __le64 regs[HFI_TF_NFLOWS];
+
+ /* location to which OPA writes the rcvhdrtail register whenever
+ it changes, so that no chip registers are read in the performance
+ path. */
+ volatile __le64 *__hfi_rcvtail;
+
+ /* address where ur_rcvhdrtail is written */
+ volatile __le64 *__hfi_rcvhdrtail;
+ /* address where ur_rcvhdrhead is written */
+ volatile __le64 *__hfi_rcvhdrhead;
+ /* address where ur_rcvegrindextail is read */
+ volatile __le64 *__hfi_rcvegrtail;
+ /* address where ur_rcvegrindexhead is written */
+ volatile __le64 *__hfi_rcvegrhead;
+ /* address where ur_rcvegroffsettail is read */
+ volatile __le64 *__hfi_rcvofftail;
+ /* address where ur_rcvtidflow is written */
+ volatile __le64 *__hfi_rcvtidflow;
+};
+
+/* After the device is opened, hfi_userinit() is called to give the driver the
+ parameters the user code wants to use, and to get the implementation values,
+ etc. back. 0 is returned on success, a positive value is a standard errno,
+ and a negative value is reserved for future use. The first argument is
+ the filedescriptor returned by the device open.
+
+ It is allowed to have multiple devices (and of different types)
+ simultaneously opened and initialized, although this won't be fully
+ implemented initially. This routine is used by the low level
+ hfi protocol code (and any other code that has similar low level
+ functionality).
+ This is the only routine that takes a file descriptor, rather than a
+ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything
+ else is returned by this routine.
+*/
+
+struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *);
+
+/* don't inline these; it's all init code, and not inlining makes the */
+/* overall code shorter and easier to debug */
+void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline));
+
+/* set the BTH pkey to check for this process. */
+/* This is for receive checks, not for sends. It isn't necessary
+ to set the default key, that's always allowed by the hardware.
+ If too many pkeys are in use for the hardware to support, this
+ will return EAGAIN, and the caller should then fail and exit
+ or use the default key and check the pkey in the received packet
+ checking. */
+int32_t hfi_set_pkey(struct _hfi_ctrl *, uint16_t);
+
+/* flush the eager buffers, by setting the
+ eager index head register == eager index tail, if queue is full */
+void hfi_flush_egr_bufs(struct _hfi_ctrl *ctrl);
+
+int hfi_wait_for_packet(struct _hfi_ctrl *);
+
+/* stop_start == 0 disables receive on the context, for use in queue overflow
+ conditions. stop_start==1 re-enables, and returns value of tail register,
+ to be used to re-init the software copy of the head register */
+int hfi_manage_rcvq(struct _hfi_ctrl *ctrl, uint32_t stop_start);
+
+/* ctxt_bp == 0 disables fabric back pressure on the context. */
+/* ctxt_bp == 1 enables fabric back pressure on the context. */
+int hfi_manage_bp(struct _hfi_ctrl *ctrl, uint8_t ctxt_bp);
+
+/* enable == 1 enables armlaunch (normal), 0 disables (only used */
+/* hfi_pkt_test -B at the moment, needed for linda). */
+int hfi_armlaunch_ctrl(struct _hfi_ctrl *ctrl, uint32_t enable);
+
+/* force an update of the PIOAvail register to memory */
+int hfi_force_pio_avail_update(struct _hfi_ctrl *ctrl);
+
+/* Disarm any send buffers which need disarming. */
+int hfi_disarm_bufs(struct _hfi_ctrl *ctrl);
+
+/* New user event mechanism, using spi_sendbuf_status HFI_EVENT_* bits
+ obsoletes hfi_disarm_bufs(), and extends it, although old mechanism
+ remains for binary compatibility. */
+int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits);
+
+/* Return send dma's current "in flight counter " */
+int hfi_sdma_inflight(struct _hfi_ctrl *ctrl, uint32_t *counter);
+
+/* Return send dma's current "completion counter" */
+int hfi_sdma_complete(struct _hfi_ctrl *ctrl, uint32_t *counter);
+
+/* set whether we want an interrupt on all packets, or just urgent ones */
+int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type);
+
+/* set send context pkey to verify, error if driver is not configured with */
+/* this pkey in its pkey table. */
+int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey);
+
+/* reset halted send context, error if context is not halted. */
+int hfi_reset_context(struct _hfi_ctrl *ctrl);
+
+/* Statistics maintained by the driver */
+const char *hfi_get_next_name(char **names);
+uint64_t hfi_get_single_stat(const char *attr, uint64_t *s);
+int hfi_get_stats_names_count(void);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit_names_count(int unitno);
+int hfi_get_ctrs_port_names_count(int unitno);
+
+uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c);
+void hfi_release_names(char *namep);
+
+/* Syslog wrapper
+
+ level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+ LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+
+ prefix should be a short string to describe which part of the software stack
+ is using syslog, i.e. "PSM", "mpi", "mpirun".
+*/
+void hfi_syslog(const char *prefix, int to_console, int level,
+ const char *format, ...)
+ __attribute__((format(printf, 4, 5)));
+
+void hfi_vsyslog(const char *prefix, int to_console, int level,
+ const char *format, va_list ap);
+
+/* parameters for PBC for pio write routines, to avoid passing lots
+ * of args; we instead pass the structure pointer.
+ * NOTE(review): field semantics below inferred from names and the pio write
+ * routine signatures above -- confirm against the implementations. */
+struct hfi_pio_params {
+ uint16_t length;
+ uint8_t vl;
+ uint8_t port;
+ uint32_t cksum_is_valid; /* presumably nonzero => cksum is used */
+ uint32_t cksum;
+ uint32_t rate;
+};
+
+/* write pio buffers. The hfi_write_pio_force_order() version assumes
+ that the processor does not write store buffers to i/o devices in the
+ order in which they are written, and that when flushing partially
+ filled store buffers, the words are not ordered either. The hfi_write_pio()
+ form is used when the processor writes store buffers to i/o in the order
+ in which they are filled, and writes partially filled buffers in increasing
+ address order (assuming they are filled that way).
+ The arguments are pio buffer address, payload length, header, and payload
+*/
+void hfi_write_pio(volatile uint32_t *, const struct hfi_pio_params *,
+ void *, void *);
+void hfi_write_pio_force_order(volatile uint32_t *,
+ const struct hfi_pio_params *, void *, void *);
+
+#define HFI_SPECIAL_TRIGGER_MAGIC 0xaebecede
+/* IBA7220 can use a "Special" trigger. We write to the last dword
+ in the mapped SendBuf to trigger the launch. */
+void hfi_write_pio_special_trigger2k(volatile uint32_t *,
+ const struct hfi_pio_params *, void *,
+ void *);
+void hfi_write_pio_special_trigger4k(volatile uint32_t *,
+ const struct hfi_pio_params *, void *,
+ void *);
+
+/*
+ * Copy routine that may copy a byte multiple times but optimized for throughput
+ * This is not safe to use for PIO routines where we want a guarantee that a
+ * byte is only copied/moved across the bus once.
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src,
+ uint32_t ndwords);
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src,
+ uint32_t nqwords);
+
+/*
+* Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once.
+*/
+#if defined(__x86_64__)
+void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src,
+ uint32_t ndwords);
+void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src,
+ uint32_t nqwords);
+#else
+#define hfi_dwordcpy_safe hfi_dwordcpy
+#define hfi_qwordcpy_safe hfi_qwordcpy
+#endif
+
+/* From here to the end of the file are implementation details that should not
+ be used outside this file (other than to call the function), except in the
+ one infrastructure file in which they are defined.
+
+ NOTE: doing paired 32 bit writes to the chip to store 64 bit values (as
+ from 32 bit programs) will not work correctly, because there is no sub-qword
+ address decode. Therefore 32 bit programs use only a single 32 bit store;
+ the head register values are all less than 32 bits, anyway. Given that, we
+ use only 32 bits even for 64 bit programs, for simplicity. These functions
+ must not be called until after hfi_userinit() is called. The ctrl argument
+ is currently unused, but remains useful for adding debug code.
+*/
+
+/* Advance the receive eager index head: store val, as a little-endian
+   64-bit value, to the mapped ur_rcvegrindexhead location.  See the note
+   above about single 32-bit-safe stores; ctrl is currently unused beyond
+   carrying the mapped address. */
+static __inline__ void hfi_put_rcvegrindexhead(struct _hfi_ctrl *ctrl,
+ uint64_t val)
+{
+ *ctrl->__hfi_rcvegrhead = __cpu_to_le64(val);
+}
+
+/* Advance the receive header queue head: store val, as a little-endian
+   64-bit value, to the mapped ur_rcvhdrhead location. */
+static __inline__ void hfi_put_rcvhdrhead(struct _hfi_ctrl *ctrl, uint64_t val)
+{
+ *ctrl->__hfi_rcvhdrhead = __cpu_to_le64(val);
+}
+
+/* Read the receive header tail from the memory location the device updates
+   (__hfi_rcvtail), converting from little-endian.  The read barrier orders
+   this load before any subsequent reads of newly arrived header entries. */
+static __inline__ uint64_t hfi_get_rcvhdrtail(struct _hfi_ctrl *ctrl)
+{
+ uint64_t res = __le64_to_cpu(*ctrl->__hfi_rcvtail);
+ ips_rmb();
+ return res;
+}
+
+/* Program tidflow table entry 'flowid' with the given generation value and
+   sequence number, marking the flow valid (per __hfi_tfvalid) and enabling
+   header suppression. */
+static __inline__ void hfi_tidflow_set_entry(struct _hfi_ctrl *ctrl,
+ uint32_t flowid, uint32_t genval,
+ uint32_t seqnum)
+{
+/* For proper behavior with RSM interception of FECN packets for CCA,
+ * the tidflow entry needs the KeepAfterSequenceError bit set.
+ * A packet that is converted from expected to eager by RSM will not
+ * trigger an update in the tidflow state. This will cause the tidflow
+ * to incorrectly report a sequence error on any non-FECN packets that
+ * arrive after the RSM intercepted packets. If the KeepAfterSequenceError
+ * bit is set, PSM can properly detect this "false SeqErr" condition,
+ * and recover without dropping packets.
+ * Note that if CCA/RSM are not important, this change will slightly
+ * increase the CPU load when packets are dropped. If this is significant,
+ * consider hiding this change behind a CCA/RSM environment variable.
+ */
+
+ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64(
+ ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) |
+ ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) |
+ ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) |
+ (1ULL << HFI_TF_HDRSUPP_ENABLED_SHIFT) |
+ /* KeepAfterSequenceError = 1 -- previously was 0 */
+ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) |
+ (1ULL << HFI_TF_KEEP_ON_GENERR_SHIFT) |
+ /* KeepPayloadOnGenErr = 0 */
+ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) |
+ (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT));
+}
+
+/* Reset tidflow table entry 'flowid' to the "invalid" state described below,
+   using the caller-supplied special genval/seqnum values. */
+static __inline__ void hfi_tidflow_reset(struct _hfi_ctrl *ctrl,
+ uint32_t flowid, uint32_t genval,
+ uint32_t seqnum)
+{
+/*
+ * If a tidflow table entry is set to "Invalid", we want to drop the
+ * header if the payload is dropped, and we want to get a header if the
+ * payload is delivered.
+ *
+ * We set a tidflow table entry "Invalid" by setting FlowValid=1 and
+ * GenVal=0x1FFF/0xFFFFF; this is a special generation number and no
+ * packet will use this value. We don't care about SeqNum, but we set it
+ * to 0x7FF. So if GenVal does not match, the payload is dropped because
+ * KeepPayloadOnGenErr=0; for the packet header, KeepOnGenErr=0 makes sure
+ * the header is not generated. But if a packet happens to have the special
+ * generation number, the payload is delivered; HdrSuppEnabled=0 makes
+ * sure a header is generated if SeqNum matches, and if SeqNum does not
+ * match, KeepAfterSeqErr=1 makes sure the header is generated.
+ */
+ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64(
+ /* genval = 0x1FFF or 0xFFFFF */
+ ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) |
+ /* seqnum = 0x7FF */
+ ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) |
+ ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) |
+ /* HdrSuppEnabled = 0 */
+ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) |
+ /* KeepOnGenErr = 0 */
+ /* KeepPayloadOnGenErr = 0 */
+ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) |
+ (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT));
+}
+
+/*
+ * Read back the raw 64-bit tidflow entry for 'flowid'.
+ * This should only be used for debugging.
+ * Normally, we shouldn't read the chip.
+ */
+static __inline__ uint64_t hfi_tidflow_get(struct _hfi_ctrl *ctrl,
+ uint32_t flowid)
+{
+ return __le64_to_cpu(ctrl->__hfi_rcvtidflow[flowid]);
+}
+
+/* Field extractors for a raw tidflow register value as returned by
+   hfi_tidflow_get(): each shifts and masks out one bitfield. */
+static __inline__ uint32_t hfi_tidflow_get_seqnum(uint64_t val)
+{
+ return (val >> HFI_TF_SEQNUM_SHIFT) & HFI_TF_SEQNUM_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_genval(uint64_t val)
+{
+ return (val >> HFI_TF_GENVAL_SHIFT) & HFI_TF_GENVAL_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_flowvalid(uint64_t val)
+{
+ return (val >> HFI_TF_FLOWVALID_SHIFT) & HFI_TF_FLOWVALID_MASK;
+}
+
+/* header suppression enabled bit */
+static __inline__ uint32_t hfi_tidflow_get_enabled(uint64_t val)
+{
+ return (val >> HFI_TF_HDRSUPP_ENABLED_SHIFT) &
+ HFI_TF_HDRSUPP_ENABLED_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_after_seqerr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_AFTER_SEQERR_SHIFT) &
+ HFI_TF_KEEP_AFTER_SEQERR_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_on_generr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_ON_GENERR_SHIFT) &
+ HFI_TF_KEEP_ON_GENERR_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_payload_on_generr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT) &
+ HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK;
+}
+
+/* status bits: sequence / generation mismatch indicators */
+static __inline__ uint32_t hfi_tidflow_get_seqmismatch(uint64_t val)
+{
+ return (val >> HFI_TF_STATUS_SEQMISMATCH_SHIFT) &
+ HFI_TF_STATUS_SEQMISMATCH_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_genmismatch(uint64_t val)
+{
+ return (val >> HFI_TF_STATUS_GENMISMATCH_SHIFT) &
+ HFI_TF_STATUS_GENMISMATCH_MASK;
+}
+
+/*
+ * This should only be used by a process to write the eager index into
+ * a subcontext's eager header entry.  Each helper below clears one RHF
+ * bitfield in the receive-header buffer word and ORs in the new value
+ * (stored little-endian; val is masked to the field width first).
+ */
+static __inline__ void hfi_hdrset_use_egrbfr(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_USE_EGRBFR_MASK <<
+ HFI_RHF_USE_EGRBFR_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_USE_EGRBFR_MASK) <<
+ HFI_RHF_USE_EGRBFR_SHIFT);
+}
+
+static __inline__ void hfi_hdrset_egrbfr_index(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_EGRBFR_INDEX_MASK <<
+ HFI_RHF_EGRBFR_INDEX_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_EGRBFR_INDEX_MASK) <<
+ HFI_RHF_EGRBFR_INDEX_SHIFT);
+}
+
+/* offset field lives in the second RHF dword (rbuf[1]) */
+static __inline__ void hfi_hdrset_egrbfr_offset(__le32 *rbuf, uint32_t val)
+{
+ rbuf[1] =
+ (rbuf[1] &
+ __cpu_to_le32(~(HFI_RHF_EGRBFR_OFFSET_MASK <<
+ HFI_RHF_EGRBFR_OFFSET_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_EGRBFR_OFFSET_MASK) <<
+ HFI_RHF_EGRBFR_OFFSET_SHIFT);
+}
+
+/*
+ * This should only be used by a process to update the receive header
+ * error flags.  Unlike the field setters above, this ORs val into
+ * rbuf[1] without clearing any existing bits.
+ */
+static __inline__ void hfi_hdrset_err_flags(__le32 *rbuf, uint32_t val)
+{
+ rbuf[1] |= __cpu_to_le32(val);
+}
+
+/*
+ * This should only be used by a process to write the rhf seq number into
+ * a subcontext's eager header entry.  Clears the SEQ field of rbuf[0]
+ * and ORs in the masked new value (little-endian storage).
+ */
+static __inline__ void hfi_hdrset_seq(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_SEQ_MASK <<
+ HFI_RHF_SEQ_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_SEQ_MASK) << HFI_RHF_SEQ_SHIFT);
+}
+
+/* Manage TID entries. It is possible that not all entries
+ requested may be allocated. A matching hfi_free_tid() must be
+ done for each hfi_update_tid(), because currently no caching or
+ reuse of expected tid entries is allowed, to work around malloc/free
+ and mmap/munmap issues. The driver decides which TID entries to allocate.
+ If hfi_free_tid is called to free entries in use by a different
+ send by the same process, data corruption will probably occur,
+ but only within that process, not for other processes.
+*/
+
+/* update tidcnt expected TID entries from the array pointed to by tidinfo. */
+/* Returns the hfi_cmd_write() result: -1 on failure (errno set), else the */
+/* driver's (non-negative) result; on success *length and *tidcnt are */
+/* updated with the values the driver actually mapped. */
+static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl,
+ uint64_t vaddr, uint32_t *length,
+ uint64_t tidlist, uint32_t *tidcnt, uint16_t flags)
+{
+ struct hfi1_cmd cmd;
+#ifdef PSM_CUDA
+ struct hfi1_tid_info_v2 tidinfo;
+#else
+ struct hfi1_tid_info tidinfo;
+#endif
+ int err;
+
+ tidinfo.vaddr = vaddr; /* base address for this send to map */
+ tidinfo.length = *length; /* length of vaddr */
+
+ tidinfo.tidlist = tidlist; /* driver copies tids back directly */
+ tidinfo.tidcnt = 0; /* clear to zero */
+
+ /* default command; overridden below for the CUDA (v2) build */
+ cmd.type = PSMI_HFI_CMD_TID_UPDATE;
+#ifdef PSM_CUDA
+ cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2;
+
+ /* 'flags' is only honored for GPUDirect-enabled drivers; otherwise it
+  * is ignored (and unused entirely in non-CUDA builds). */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ tidinfo.flags = flags;
+ else
+ tidinfo.flags = 0;
+#endif
+
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ if (err != -1) {
+ *length = tidinfo.length;
+ *tidcnt = tidinfo.tidcnt;
+ }
+
+ return err;
+}
+
+/* Free tidcnt expected TID entries previously allocated via
+   hfi_update_tid().  Returns the hfi_cmd_write() result: -1 on failure
+   (errno set), else non-negative. */
+static __inline__ int32_t hfi_free_tid(struct _hfi_ctrl *ctrl,
+ uint64_t tidlist, uint32_t tidcnt)
+{
+ struct hfi1_cmd cmd;
+ struct hfi1_tid_info tidinfo;
+ int err;
+
+ /* NOTE(review): tidinfo.vaddr/length are left uninitialized here;
+  * presumably TID_FREE only reads tidlist/tidcnt -- confirm against
+  * the hfi1 driver. */
+ tidinfo.tidlist = tidlist; /* input to driver */
+ tidinfo.tidcnt = tidcnt;
+
+ cmd.type = PSMI_HFI_CMD_TID_FREE;
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ return err;
+}
+
+/* Read the list of invalidated TIDs from the driver into the caller's
+   buffer at 'tidlist'; on success *tidcnt is set to the number returned.
+   Returns the hfi_cmd_write() result: -1 on failure (errno set). */
+static __inline__ int32_t hfi_get_invalidation(struct _hfi_ctrl *ctrl,
+ uint64_t tidlist, uint32_t *tidcnt)
+{
+ struct hfi1_cmd cmd;
+ struct hfi1_tid_info tidinfo;
+ int err;
+
+ /* NOTE(review): tidinfo.vaddr/length are left uninitialized here;
+  * presumably TID_INVAL_READ ignores them -- confirm against driver. */
+ tidinfo.tidlist = tidlist; /* driver copies tids back directly */
+ tidinfo.tidcnt = 0; /* clear to zero */
+
+ cmd.type = PSMI_HFI_CMD_TID_INVAL_READ;
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ if (err != -1)
+ *tidcnt = tidinfo.tidcnt;
+
+ return err;
+}
+
+extern uint32_t __hfi_pico_per_cycle; /* only for use in these functions */
+
+/* Convert CPU cycles to nanoseconds via the calibrated picoseconds-per-cycle
+   value.  This is only accurate for reasonably large numbers of cycles
+   (at least tens), since the division truncates. */
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs)
+{
+ return (__hfi_pico_per_cycle * cycs) / 1000ULL;
+}
+
+/* Inverse of cycles_to_nanosecs().  This is only accurate for reasonably
+   large numbers of nsecs (at least tens), since the division truncates. */
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns)
+{
+ return (ns * 1000ULL) / __hfi_pico_per_cycle;
+}
+
+/* Current time in nanoseconds from an unspecified base; only meaningful
+   for computing intervals (see declaration above).  (void) gives a proper
+   prototype -- an empty list declares unchecked arguments in C. */
+static __inline__ uint64_t get_nanoseconds(void)
+{
+ return cycles_to_nanosecs(get_cycles());
+}
+
+/* open the diags device, if supported by driver. Returns 0 on */
+/* success, errno on failure. Also tells driver that diags */
+/* is active, which changes some driver behavior */
+int hfi_diag_open(unsigned); /* unit */
+int hfi_diag_close(void);
+
+/* diags chip read and write routines */
+
+int hfid_read32(uint64_t reg_offset, uint32_t *read_valp);
+int hfid_write32(uint64_t reg_offset, uint32_t write_val);
+
+int hfid_readmult(uint64_t, unsigned, uint64_t *); /* chip: offset, cnt, ptr */
+int hfid_write(uint64_t, uint64_t); /* chip: offset, value */
+
+#define HFI_READ_EEPROM 31337
+#define HFI_WRITE_EEPROM 101
+
+/* Request block for the diags EEPROM/i2c access routines
+   (see hfid_read_i2c() below). */
+struct hfi_eeprom_req {
+ void *addr; /* caller's data buffer */
+ uint16_t len; /* number of bytes to transfer */
+ uint16_t offset; /* starting offset within the flash */
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define HFI_FLASH_VERSION 2
+struct hfi_flash {
+ /* flash layout version (HFI_FLASH_VERSION) */
+ __u8 if_fversion;
+ /* checksum protecting if_length bytes */
+ __u8 if_csum;
+ /*
+ * valid length (in use, protected by if_csum), including
+ * if_fversion and if_csum themselves
+ */
+ __u8 if_length;
+ /* the GUID, in network order */
+ __u8 if_guid[8];
+ /* number of GUIDs to use, starting from if_guid */
+ __u8 if_numguid;
+ /* the (last 10 characters of) board serial number, in ASCII */
+ char if_serial[12];
+ /* board mfg date (YYYYMMDD ASCII) */
+ char if_mfgdate[8];
+ /* last board rework/test date (YYYYMMDD ASCII) */
+ char if_testdate[8];
+ /* logging of error counts, TBD */
+ __u8 if_errcntp[4];
+ /* powered on hours, updated at driver unload */
+ __u8 if_powerhour[2];
+ /* ASCII free-form comment field */
+ char if_comment[32];
+ /* Backwards compatible prefix for longer QLogic Serial Numbers */
+ char if_sprefix[4];
+ /* 82 bytes used, min flash size is 128 bytes */
+ __u8 if_future[46];
+};
+
+int hfid_send_pkt(const void *, unsigned); /* send a packet for diags */
+int hfid_read_i2c(struct hfi_eeprom_req *); /* diags read i2c flash */
+
+__u8 hfi_flash_csum(struct hfi_flash *, int);
+
+int hfid_reset_hardware(uint32_t);
+
+#endif /* OPA_USER_H */
diff --git a/include/psm2_mock_testing.h b/include/psm2_mock_testing.h
new file mode 100644
index 0000000..d1e9bff
--- /dev/null
+++ b/include/psm2_mock_testing.h
@@ -0,0 +1,176 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MOCK_TESTING_H
+#define PSM2_MOCK_TESTING_H
+
+/* PSM2_MOCK_TESTING being defined flips a couple of switches so that a
+ * testable version of libpsm2.so is built. It'll make properly annotated
+ * static functions be non-static, visible to the outside. Also, all mockable
+ * functions will be replaced with function pointers which will originally
+ * point to the actual implementation. However, those function pointers might
+ * be reset by the test code, thus allowing for mocking selected PSM2 functions
+ * for the purpose of the test.
+ *
+ * So far the following utilities have been introduced for enabling a
+ * conditional compilation of the testable vs. production version of the library:
+ * - ustatic: toggles function visibility
+ * - MOCKABLE(): decorates function name so that it is visible after being mocked
+ * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ * for mocking a function
+ * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ * for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ * printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
+#ifndef PSM2_MOCK_TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef PSM2_MOCK_TESTING */
+
+/* For the testable version, all _ustatic_ functions will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function happens.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function happens.
+ */
+#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_;
+
+#endif /* ndef PSM2_MOCK_TESTING */
+
+#endif /* PSM2_MOCK_TESTING_H */
+
diff --git a/include/rbtree.c b/include/rbtree.c
new file mode 100644
index 0000000..9d6930d
--- /dev/null
+++ b/include/rbtree.c
@@ -0,0 +1,692 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+/*
+ * Abstract:
+ * Implementation of quick map, a binary tree where the caller always provides
+ * all necessary storage.
+ *
+ * Environment:
+ * All
+ *
+ * $Revision$
+ */
+
+
+/*****************************************************************************
+*
+* Map
+*
+* Map is an associative array. By providing a key, the caller can retrieve
+* an object from the map. All objects in the map have an associated key,
+* as specified by the caller when the object was inserted into the map.
+* In addition to random access, the caller can traverse the map much like
+* a linked list, either forwards from the first object or backwards from
+* the last object. The objects in the map are always traversed in
+* order since the nodes are stored sorted.
+*
+* This implementation of Map uses a red black tree verified against
+* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth
+* printing, 1994.
+*
+*****************************************************************************/
+
+#include <string.h> /* for memset declaration */
+
+#if !defined ( RBTREE_GET_LEFTMOST ) || \
+ ! defined ( RBTREE_GET_RIGHTMOST ) || \
+ ! defined ( RBTREE_MAP_COUNT ) || \
+ ! defined ( RBTREE_ASSERT )
+#error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \
+ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c"
+#endif
+
+#define IN /* nothing */
+
+/******************************************************************************
+*******************************************************************************
+************** ************
+************** IMPLEMENTATION OF QUICK MAP ************
+************** ************
+*******************************************************************************
+******************************************************************************/
+
+/* Forward declarations: */
+static void ips_cl_qmap_init(
+ IN cl_qmap_t *p_map,
+ IN cl_map_item_t* const root,
+ IN cl_map_item_t* const nil);
+static void ips_cl_qmap_insert_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item);
+static void ips_cl_qmap_remove_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item);
+static cl_map_item_t* ips_cl_qmap_successor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item);
+static cl_map_item_t* ips_cl_qmap_predecessor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item);
+static cl_map_item_t* ips_cl_qmap_search(
+ IN cl_qmap_t* const p_map,
+ IN unsigned long start,
+ IN unsigned long end);
+
+/*
+ * Get the topmost item of the tree.
+ *
+ * p_map->root is a sentinel anchor node, not a tree item: the actual
+ * root of the red-black tree is stored in the sentinel's p_left pointer
+ * (see ips_cl_qmap_init). Returns p_map->nil_item for an empty map.
+ */
+static inline cl_map_item_t*
+__cl_map_root(
+ IN const cl_qmap_t* const p_map )
+{
+ RBTREE_ASSERT( p_map );
+ return( p_map->root->p_left );
+}
+
+
+/*
+ * Returns whether a given item is on the left of its parent.
+ *
+ * Returns non-zero if p_item is its parent's left child, 0 otherwise.
+ * The item must be linked into a tree (asserted: p_up valid and not self).
+ */
+static int
+__cl_map_is_left_child(
+ IN const cl_map_item_t* const p_item )
+{
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_up );
+ RBTREE_ASSERT( p_item->p_up != p_item );
+
+ return( p_item->p_up->p_left == p_item );
+}
+
+
+/*
+ * Retrieve the pointer to the parent's pointer to an item.
+ *
+ * Lets the rotation and removal code relink an item's position without
+ * first determining which side of the parent the item hangs from.
+ */
+static cl_map_item_t**
+__cl_map_get_parent_ptr_to_item(
+ IN cl_map_item_t* const p_item )
+{
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_up );
+ RBTREE_ASSERT( p_item->p_up != p_item );
+
+ if( __cl_map_is_left_child( p_item ) )
+ return( &p_item->p_up->p_left );
+
+ RBTREE_ASSERT( p_item->p_up->p_right == p_item );
+ return( &p_item->p_up->p_right );
+}
+
+
+/*
+ * Rotate a node to the left. This rotation affects the least number of links
+ * between nodes and brings the level of C up by one while increasing the depth
+ * of A one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ * Precondition (asserted): p_item's right child is not the nil sentinel.
+ *
+ * R R
+ * | |
+ * A C
+ * / \ / \
+ * W C A Z
+ * / \ / \
+ * B Z W B
+ * / \ / \
+ * X Y X Y
+ */
+static void
+__cl_map_rot_left(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t **pp_root;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_right != p_map->nil_item );
+
+ /* Must be fetched before any links are rewritten below. */
+ pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+
+ /* Point R to C instead of A. */
+ *pp_root = p_item->p_right;
+ /* Set C's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set A's right to B */
+ p_item->p_right = (*pp_root)->p_left;
+ /*
+ * Set B's parent to A. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if( (*pp_root)->p_left != p_map->nil_item )
+ (*pp_root)->p_left->p_up = p_item;
+
+ /* Set C's left to A. */
+ (*pp_root)->p_left = p_item;
+ /* Set A's parent to C. */
+ p_item->p_up = *pp_root;
+}
+
+
+/*
+ * Rotate a node to the right. This rotation affects the least number of links
+ * between nodes and brings the level of A up by one while increasing the depth
+ * of C one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ * Mirror image of __cl_map_rot_left.
+ * Precondition (asserted): p_item's left child is not the nil sentinel.
+ *
+ * R R
+ * | |
+ * C A
+ * / \ / \
+ * A Z W C
+ * / \ / \
+ * W B B Z
+ * / \ / \
+ * X Y X Y
+ */
+static void
+__cl_map_rot_right(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t **pp_root;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_left != p_map->nil_item );
+
+ /* Point R to A instead of C. */
+ pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+ (*pp_root) = p_item->p_left;
+ /* Set A's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set C's left to B */
+ p_item->p_left = (*pp_root)->p_right;
+ /*
+ * Set B's parent to C. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if( (*pp_root)->p_right != p_map->nil_item )
+ (*pp_root)->p_right->p_up = p_item;
+
+ /* Set A's right to C. */
+ (*pp_root)->p_right = p_item;
+ /* Set C's parent to A. */
+ p_item->p_up = *pp_root;
+}
+
+/*
+ * Balance a tree starting at a given item back to the root.
+ *
+ * Standard red-black insert fixup (per the Cormen-Leiserson-Rivest text
+ * cited in the file header): p_item enters colored red; while its parent
+ * is also red, recolor and/or rotate to restore the red-black invariants.
+ * The two branches below are mirror images of each other (parent is a
+ * left child vs. a right child).
+ */
+static void
+__cl_map_ins_bal(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* p_item )
+{
+ cl_map_item_t* p_grand_uncle;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item != p_map->root );
+
+ while( p_item->p_up->color == CL_MAP_RED )
+ {
+ if( __cl_map_is_left_child( p_item->p_up ) )
+ {
+ p_grand_uncle = p_item->p_up->p_up->p_right;
+ RBTREE_ASSERT( p_grand_uncle );
+ if( p_grand_uncle->color == CL_MAP_RED )
+ {
+ /* Case 1: red uncle - recolor and continue from grandparent. */
+ p_grand_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ p_item = p_item->p_up->p_up;
+ continue;
+ }
+
+ if( !__cl_map_is_left_child( p_item ) )
+ {
+ /* Case 2: black uncle, item is a right child - rotate to case 3. */
+ p_item = p_item->p_up;
+ __cl_map_rot_left( p_map, p_item );
+ }
+ /* Case 3: black uncle, item is a left child - recolor and rotate. */
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_item->p_up->p_up );
+ }
+ else
+ {
+ /* Mirror image of the branch above. */
+ p_grand_uncle = p_item->p_up->p_up->p_left;
+ RBTREE_ASSERT( p_grand_uncle );
+ if( p_grand_uncle->color == CL_MAP_RED )
+ {
+ p_grand_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ p_item = p_item->p_up->p_up;
+ continue;
+ }
+
+ if( __cl_map_is_left_child( p_item ) )
+ {
+ p_item = p_item->p_up;
+ __cl_map_rot_right( p_map, p_item );
+ }
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_item->p_up->p_up );
+ }
+ }
+}
+
+/*
+ * Initialize an empty map.
+ *
+ * The caller supplies all storage: 'root' becomes the sentinel anchor
+ * node (its p_left holds the actual tree root) and 'nil_item' becomes
+ * the shared NIL leaf sentinel. Both sentinels are colored black and
+ * made self-referencing so traversal code never dereferences NULL.
+ */
+static void ips_cl_qmap_init(
+ IN cl_qmap_t *p_map,
+ IN cl_map_item_t* const root,
+ IN cl_map_item_t* const nil_item)
+{
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( root );
+ RBTREE_ASSERT( nil_item );
+
+ memset(p_map,0,sizeof(cl_qmap_t));
+
+ p_map->root = root;
+
+ /* setup the RB tree map */
+ p_map->nil_item = nil_item;
+
+ p_map->root->p_up = p_map->root;
+ p_map->root->p_left = p_map->nil_item;
+ p_map->root->p_right = p_map->nil_item;
+ p_map->root->color = CL_MAP_BLACK;
+
+ p_map->nil_item->p_up = p_map->nil_item;
+ p_map->nil_item->p_left = p_map->nil_item;
+ p_map->nil_item->p_right = p_map->nil_item;
+ p_map->nil_item->color = CL_MAP_BLACK;
+}
+
+/*
+ * Insert a caller-allocated item into the map.
+ *
+ * Ordering key is taken from the item's payload via the user-supplied
+ * RBTREE_GET_LEFTMOST macro. Items whose key compares equal to an
+ * existing key fall into the 'else' branch below and are placed in the
+ * right subtree, so duplicates are permitted.
+ */
+static void
+ips_cl_qmap_insert_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t *p_insert_at, *p_comp_item;
+ int compare_res = 0;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_map->root->p_up == p_map->root );
+ RBTREE_ASSERT( p_map->root->color != CL_MAP_RED );
+ RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+
+ /* Find the insertion location. */
+ p_insert_at = p_map->root;
+ p_comp_item = __cl_map_root( p_map );
+
+ while( p_comp_item != p_map->nil_item )
+ {
+ p_insert_at = p_comp_item;
+
+ /* Traverse the tree until the correct insertion point is found. */
+ if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) )
+ {
+ p_comp_item = p_insert_at->p_left;
+ compare_res = 1;
+ } else {
+ p_comp_item = p_insert_at->p_right;
+ compare_res = -1;
+ }
+ }
+
+ RBTREE_ASSERT( p_insert_at != p_map->nil_item );
+ RBTREE_ASSERT( p_comp_item == p_map->nil_item );
+
+ /* Insert the item. New items always enter as red leaves. */
+ p_item->p_left = p_map->nil_item;
+ p_item->p_right = p_map->nil_item;
+ p_item->color = CL_MAP_RED;
+ if( p_insert_at == p_map->root )
+ {
+ /* Tree was empty: new item becomes the root (sentinel's left child). */
+ p_insert_at->p_left = p_item;
+ }
+ else if( compare_res > 0 ) /* key < p_insert_at->key */
+ {
+ p_insert_at->p_left = p_item;
+ }
+ else
+ {
+ p_insert_at->p_right = p_item;
+ }
+ /* Increase the count. */
+ RBTREE_MAP_COUNT(&p_map->payload)++;
+
+ p_item->p_up = p_insert_at;
+
+ /*
+ * We have added depth to this section of the tree.
+ * Rebalance as necessary as we retrace our path through the tree
+ * and update colors.
+ */
+ __cl_map_ins_bal( p_map, p_item );
+
+ __cl_map_root( p_map )->color = CL_MAP_BLACK;
+
+ /*
+ * Note that it is not necessary to re-color the nil node black because all
+ * red color assignments are made via the p_up pointer, and nil is never
+ * set as the value of a p_up pointer.
+ */
+}
+
+/*
+ * Restore the red-black invariants after removing a black node.
+ *
+ * Standard red-black delete fixup (per the Cormen-Leiserson-Rivest text
+ * cited in the file header): p_item is the child that replaced the
+ * removed node and carries an extra "black". The loop pushes that
+ * extra black up the tree via recoloring and rotations; the two outer
+ * branches are mirror images (p_item is a left vs. right child).
+ * "p_uncle" here is the sibling of p_item.
+ */
+static void
+__cl_map_del_bal(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_uncle;
+
+ while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) )
+ {
+ if( __cl_map_is_left_child( p_item ) )
+ {
+ p_uncle = p_item->p_up->p_right;
+
+ if( p_uncle->color == CL_MAP_RED )
+ {
+ /* Red sibling: rotate so the sibling becomes black. */
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_item->p_up );
+ p_uncle = p_item->p_up->p_right;
+ }
+
+ if( p_uncle->p_right->color != CL_MAP_RED )
+ {
+ if( p_uncle->p_left->color != CL_MAP_RED )
+ {
+ /* Sibling and both its children black: recolor, move up. */
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ /* Sibling black, its left child red: rotate to final case. */
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_uncle );
+ p_uncle = p_item->p_up->p_right;
+ }
+ /* Sibling black with red right child: final rotation, done. */
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ __cl_map_rot_left( p_map, p_item->p_up );
+ break;
+ }
+ else
+ {
+ /* Mirror image of the branch above. */
+ p_uncle = p_item->p_up->p_left;
+
+ if( p_uncle->color == CL_MAP_RED )
+ {
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_item->p_up );
+ p_uncle = p_item->p_up->p_left;
+ }
+
+ if( p_uncle->p_left->color != CL_MAP_RED )
+ {
+ if( p_uncle->p_right->color != CL_MAP_RED )
+ {
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_uncle );
+ p_uncle = p_item->p_up->p_left;
+ }
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ __cl_map_rot_right( p_map, p_item->p_up );
+ break;
+ }
+ }
+ p_item->color = CL_MAP_BLACK;
+}
+
+/*
+ * Remove an item from the map.
+ *
+ * Removing the nil sentinel is a no-op. For a node with two children,
+ * the in-order successor is spliced out in its place and then exchanged
+ * with the removed node, so callers' pointers to other items stay valid.
+ */
+static void
+ips_cl_qmap_remove_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t *p_child, *p_del_item;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+
+ if( p_item == p_map->nil_item )
+ return;
+
+ if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) )
+ {
+ /* The item being removed has children on at most one side. */
+ p_del_item = p_item;
+ }
+ else
+ {
+ /*
+ * The item being removed has children on both sides.
+ * We select the item that will replace it. After removing
+ * the substitute item and rebalancing, the tree will have the
+ * correct topology. Exchanging the substitute for the item
+ * will finalize the removal.
+ */
+ p_del_item = ips_cl_qmap_successor(p_map, p_item);
+ RBTREE_ASSERT( p_del_item != p_map->nil_item );
+ }
+
+ RBTREE_MAP_COUNT(&p_map->payload)--;
+
+ /* Get the pointer to the new root's child, if any. */
+ if( p_del_item->p_left != p_map->nil_item )
+ p_child = p_del_item->p_left;
+ else
+ p_child = p_del_item->p_right;
+
+ /*
+ * This assignment may modify the parent pointer of the nil node.
+ * This is inconsequential.
+ */
+ p_child->p_up = p_del_item->p_up;
+ (*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child;
+
+ /* Removing a black node may violate the black-height invariant. */
+ if( p_del_item->color != CL_MAP_RED )
+ __cl_map_del_bal( p_map, p_child );
+
+ /*
+ * Note that the splicing done below does not need to occur before
+ * the tree is balanced, since the actual topology changes are made by the
+ * preceding code. The topology is preserved by the color assignment made
+ * below (reader should be reminded that p_del_item == p_item in some cases).
+ */
+ if( p_del_item != p_item )
+ {
+ /*
+ * Finalize the removal of the specified item by exchanging it with
+ * the substitute which we removed above.
+ */
+ p_del_item->p_up = p_item->p_up;
+ p_del_item->p_left = p_item->p_left;
+ p_del_item->p_right = p_item->p_right;
+ (*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item;
+ p_item->p_right->p_up = p_del_item;
+ p_item->p_left->p_up = p_del_item;
+ p_del_item->color = p_item->color;
+ }
+
+ RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+}
+
+/*
+ * Return the in-order successor of p_item, or p_map->nil_item when
+ * p_item is the maximum. Either descends to the leftmost node of the
+ * right subtree, or climbs until leaving a left subtree; hitting the
+ * root sentinel means no successor exists.
+ */
+static cl_map_item_t *
+ips_cl_qmap_successor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_tmp;
+
+ p_tmp = p_item->p_right;
+ if (p_tmp != p_map->nil_item) {
+ while (p_tmp->p_left != p_map->nil_item)
+ p_tmp = p_tmp->p_left;
+ return p_tmp;
+ } else {
+ p_tmp = p_item->p_up;
+ while (p_tmp->p_right == p_item) {
+ p_item = p_tmp;
+ p_tmp = p_tmp->p_up;
+ }
+ if (p_tmp == p_map->root)
+ return p_map->nil_item;
+ return p_tmp;
+ }
+}
+
+/*
+ * Return the in-order predecessor of p_item, or p_map->nil_item when
+ * p_item is the minimum. Mirror image of ips_cl_qmap_successor.
+ */
+static cl_map_item_t *
+ips_cl_qmap_predecessor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_tmp;
+
+ p_tmp = p_item->p_left;
+ if (p_tmp != p_map->nil_item) {
+ while (p_tmp->p_right != p_map->nil_item)
+ p_tmp = p_tmp->p_right;
+ return p_tmp;
+ } else {
+ p_tmp = p_item->p_up;
+ while (p_tmp->p_left == p_item) {
+ p_item = p_tmp;
+ p_tmp = p_tmp->p_up;
+ }
+ if (p_tmp == p_map->root)
+ return p_map->nil_item;
+ return p_tmp;
+ }
+}
+
+/*
+ * Return the first (lowest-keyed) node whose range
+ * [RBTREE_GET_LEFTMOST, RBTREE_GET_RIGHTMOST) overlaps [start, end),
+ * or p_map->nil_item if no node overlaps.
+ * NOTE(review): ranges appear half-open given the >=/<= comparisons
+ * below - confirm against the RBTREE_GET_RIGHTMOST definition.
+ */
+static cl_map_item_t *
+ips_cl_qmap_search(cl_qmap_t * const p_map,
+ unsigned long start, unsigned long end)
+{
+ cl_map_item_t *p_item, *p_tmp;
+
+ RBTREE_ASSERT( p_map );
+ p_item = __cl_map_root(p_map);
+
+ while (p_item != p_map->nil_item) {
+ if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) {
+ p_tmp = p_item->p_right;
+ if (p_tmp != p_map->nil_item) {
+ p_item = p_tmp;
+ continue;
+ }
+
+ /*
+ * p_item is on immediate left side of 'start'.
+ */
+ if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) {
+ /*
+ * p_item is on immediate right
+ * side of 'start'.
+ */
+ p_item = ips_cl_qmap_successor(p_map, p_item);
+ if (p_item != p_map->nil_item &&
+ end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+ p_item = p_map->nil_item;
+ }
+ } else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) {
+ p_tmp = p_item->p_left;
+ if (p_tmp != p_map->nil_item) {
+ p_item = p_tmp;
+ continue;
+ }
+
+ /*
+ * p_item is the closest node on the right of 'start';
+ * its predecessor (computed next) is the closest node
+ * on the left of 'start'.
+ */
+ p_tmp = ips_cl_qmap_predecessor(p_map, p_item);
+ if (p_tmp == p_map->nil_item ||
+ (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) {
+ /*
+ * p_item is on immediate right
+ * side of 'start'.
+ */
+ if (end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+ p_item = p_map->nil_item;
+ } else
+ p_item = p_tmp;
+ }
+
+ /* start == leftmost(p_item): p_item itself overlaps; fall out. */
+ break;
+ }
+
+
+ return p_item;
+}
diff --git a/include/rbtree.h b/include/rbtree.h
new file mode 100644
index 0000000..13245b0
--- /dev/null
+++ b/include/rbtree.h
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __RBTREE_H__
+
+#define __RBTREE_H__
+
+#include <stdint.h>
+
+#ifndef RBTREE_MAP_PL
+#error "You must define RBTREE_MAP_PL before including rbtree.h"
+#endif
+
+#ifndef RBTREE_MI_PL
+#error "You must define RBTREE_MI_PL before including rbtree.h"
+#endif
+
+/*
+ * Red-Black tid cache definition.
+ *
+ * RBTREE_MI_PL and RBTREE_MAP_PL are macros the including file must
+ * define to the per-item and per-map payload types (enforced by the
+ * #error checks above). rbtree.c compares and counts items only
+ * through the RBTREE_* accessor macros applied to these payloads.
+ */
+typedef struct _cl_map_item {
+ struct _cl_map_item *p_left; /* left pointer */
+ struct _cl_map_item *p_right; /* right pointer */
+ struct _cl_map_item *p_up; /* up pointer */
+ uint16_t color; /* red-black color */
+
+ RBTREE_MI_PL payload;
+} cl_map_item_t;
+
+typedef struct _cl_qmap {
+ cl_map_item_t *root; /* root node pointer */
+ cl_map_item_t *nil_item; /* terminator node pointer */
+
+ RBTREE_MAP_PL payload;
+} cl_qmap_t;
+
+/* Values stored in cl_map_item_t.color. */
+#define CL_MAP_RED 0
+#define CL_MAP_BLACK 1
+
+#endif
diff --git a/libpsm2.spec.in b/libpsm2.spec.in
new file mode 100644
index 0000000..c5ddf62
--- /dev/null
+++ b/libpsm2.spec.in
@@ -0,0 +1,177 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+Summary: Intel PSM2 Libraries
+Name: @RPM_NAME@
+Version: @VERSION@
+Release: 1 at SPEC_FILE_RELEASE_DIST@
+License: BSD or GPLv2
+URL: https://github.com/01org/opa-psm2/
+
+# The tarball can be created by:
+# git clone https://github.com/01org/opa-psm2
+# cd opa-psm2
+# git checkout @DIST_SHA@
+# make dist
+Source0: @RPM_NAME at -%{version}.tar.gz
+
+# The OPA product is supported on x86_64 only:
+ExclusiveArch: x86_64
+
+BuildRequires: gcc
+Provides: hfi1-psm
+Obsoletes: hfi1-psm < 1.0.0
+
+%if "@RPM_NAME_BASEEXT@"
+%package -n @RPM_NAME@@RPM_NAME_BASEEXT@
+%endif
+Summary: Intel PSM2 Libraries
+Provides: @RPM_NAME@
+%if 0%{?suse_version}
+BuildRequires: libnuma-devel
+Requires: libnuma1
+%else
+%if 0%{?rhel}==0 || 0%{?rhel} > 6
+BuildRequires: systemd
+BuildRequires: numactl-devel
+Requires: numactl-libs
+%endif
+%endif
+
+%package -n @RPM_NAME at -devel
+Summary: Development files for Intel PSM2
+Requires: %{name}%{?_isa} = %{version}-%{release}
+Provides: hfi1-psm-devel
+Obsoletes: hfi1-psm-devel < 1.0.0
+
+%package -n @RPM_NAME at -compat
+Summary: Compat library for Intel PSM2
+Requires: %{name}%{?_isa} = %{version}-%{release}
+%if 0%{?fedora}
+Requires: systemd-udev
+%endif
+Provides: hfi1-psm-compat
+Obsoletes: hfi1-psm-compat < 1.0.0
+
+# If an alternate basename is defined, like in SLES >=12.3
+# Then we generate a different base src.rpm, so use this
+# description instead.
+%if "@RPM_NAME_BASEEXT@"
+%description
+The source code for the PSM2 messaging API, libpsm2.
+A low-level user-level communications interface for the Intel(R) OPA
+family of products. PSM2 users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+%endif
+
+# In distro's other than SLES >=12.3 we use a single description
+# for both the .src.rpm and the base binary rpm. As the
+# RPM_NAME_BASEEXT defaults to empty contents.
+%description -n @RPM_NAME@@RPM_NAME_BASEEXT@
+PSM2 Messaging API, or PSM2 API, is the low-level
+user-level communications interface for the Intel(R) OPA
+family of products. PSM2 users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+
+%description -n @RPM_NAME at -devel
+Intel(R) PSM2, psm2*.h, headers and libpsm2.so files necessary
+for developing software using libpsm2.
+
+%description -n @RPM_NAME at -compat
+Support for MPIs linked with PSM versions < 2. This will allow
+software compiled to use Intel(R) Truescale PSM, libinfinipath, to run
+with Intel(R) OPA PSM2, libpsm2.
+
+%prep
+%setup -q -n @RPM_NAME at -%{version}
+
+%build
+export CFLAGS="%{optflags}"
+make %{?_smp_mflags}
+
+%install
+%make_install
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%files -n @RPM_NAME@@RPM_NAME_BASEEXT@
+%if 0%{?rhel} && 0%{?rhel} < 7
+%{!?_licensedir:%global license %doc}
+%endif
+%license COPYING
+%{_libdir}/@TARGLIB at .so.@MAJOR at .@MINOR@
+%{_libdir}/@TARGLIB at .so.@MAJOR@
+ at 40_PSM_RULES@
+
+%files -n @RPM_NAME at -devel
+%{_libdir}/@TARGLIB at .so
+%{_includedir}/psm2.h
+%{_includedir}/psm2_mq.h
+%{_includedir}/psm2_am.h
+%{_includedir}/hfi1diag
+
+%files -n @RPM_NAME at -compat
+%{_libdir}/psm2-compat
+%if 0%{?rhel} && 0%{?rhel} < 7
+ at UDEVDIR@/rules.d/40-psm-compat.rules
+%else
+%{_udevrulesdir}/40-psm-compat.rules
+%endif
+ at LIBPSM2_COMPAT_SYM_CONF_DIR@/modprobe.d/libpsm2-compat.conf
+%{_prefix}/lib/libpsm2
+
+%changelog
+* Wed Aug 30 2017 Rusell McGuire <russell.w.mcguire at intel.com>
+- Adjust RPM names to match SLES 12.3 distro names
+* Tue Apr 05 2016 Paul Reger <paul.j.reger at intel.com>
+- Upstream PSM2 source code for Fedora.
diff --git a/libuuid/Makefile b/libuuid/Makefile
new file mode 100644
index 0000000..aa3f5ac
--- /dev/null
+++ b/libuuid/Makefile
@@ -0,0 +1,92 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+CFLAGS += -DPSM_UUID=1 -Wno-unused-function
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := psm_uuid.o parse.o pack.o unpack.o unparse.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno; \
+ cd -; \
+ fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+ @echo "Nothing to do for install."
diff --git a/libuuid/compare.c b/libuuid/compare.c
new file mode 100644
index 0000000..44f275b
--- /dev/null
+++ b/libuuid/compare.c
@@ -0,0 +1,53 @@
+/*
+ * compare.c --- compare whether or not two UUID's are the same
+ *
+ * Returns memcmp(3)-style ordering: <0, 0, or >0 as uu1 is less than,
+ * equal to, or greater than uu2 (0 means the two UUIDs are the same).
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "psm_uuid.h"
+#include <string.h>
+
+#define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? -1 : 1);
+
+int uuid_compare(const uuid_t uu1, const uuid_t uu2)
+{
+ struct uuid uuid1, uuid2;
+
+ uuid_unpack(uu1, &uuid1);
+ uuid_unpack(uu2, &uuid2);
+
+ UUCMP(uuid1.time_low, uuid2.time_low);
+ UUCMP(uuid1.time_mid, uuid2.time_mid);
+ UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version);
+ UUCMP(uuid1.clock_seq, uuid2.clock_seq);
+ return memcmp(uuid1.node, uuid2.node, 6);
+}
+
diff --git a/libuuid/pack.c b/libuuid/pack.c
new file mode 100644
index 0000000..801b891
--- /dev/null
+++ b/libuuid/pack.c
@@ -0,0 +1,69 @@
+/*
+ * Internal routine for packing UUID's
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_pack(const struct uuid *uu, uuid_t ptr)
+{
+ uint32_t tmp;
+ unsigned char *out = ptr;
+
+ tmp = uu->time_low;
+ out[3] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[2] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[1] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[0] = (unsigned char) tmp;
+
+ tmp = uu->time_mid;
+ out[5] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[4] = (unsigned char) tmp;
+
+ tmp = uu->time_hi_and_version;
+ out[7] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[6] = (unsigned char) tmp;
+
+ tmp = uu->clock_seq;
+ out[9] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[8] = (unsigned char) tmp;
+
+ memcpy(out+10, uu->node, 6);
+}
+
diff --git a/libuuid/parse.c b/libuuid/parse.c
new file mode 100644
index 0000000..dd8c258
--- /dev/null
+++ b/libuuid/parse.c
@@ -0,0 +1,78 @@
+/*
+ * parse.c --- UUID parsing
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+int uuid_parse(const char *in, uuid_t uu)
+{
+ struct uuid uuid;
+ int i;
+ const char *cp;
+ char buf[3];
+
+ if (strlen(in) != 36)
+ return -1;
+ for (i=0, cp = in; i <= 36; i++,cp++) {
+ if ((i == 8) || (i == 13) || (i == 18) ||
+ (i == 23)) {
+ if (*cp == '-')
+ continue;
+ else
+ return -1;
+ }
+ if (i== 36)
+ if (*cp == 0)
+ continue;
+ if (!isxdigit(*cp))
+ return -1;
+ }
+ uuid.time_low = strtoul(in, NULL, 16);
+ uuid.time_mid = strtoul(in+9, NULL, 16);
+ uuid.time_hi_and_version = strtoul(in+14, NULL, 16);
+ uuid.clock_seq = strtoul(in+19, NULL, 16);
+ cp = in+24;
+ buf[2] = 0;
+ for (i=0; i < 6; i++) {
+ buf[0] = *cp++;
+ buf[1] = *cp++;
+ uuid.node[i] = strtoul(buf, NULL, 16);
+ }
+
+ uuid_pack(&uuid, uu);
+ return 0;
+}
diff --git a/libuuid/psm_uuid.c b/libuuid/psm_uuid.c
new file mode 100644
index 0000000..4db29a6
--- /dev/null
+++ b/libuuid/psm_uuid.c
@@ -0,0 +1,114 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/stat.h>
+#include <limits.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
/*
 * Fallback UUID source: fill uuid_out's 16 bytes from a private (reentrant)
 * drand48 state seeded from the cycle counter and pid.  Used only when
 * /dev/urandom is unavailable or the read from it fails.
 */
static void psmi_make_drand_uuid(psm2_uuid_t uuid_out)
{
	struct drand48_data drand48_data;
	int i;
	long int rnum;
	srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data);
	for(i=0; i < 16; i++) {
		lrand48_r(&drand48_data, &rnum);
		/* NOTE(review): % UCHAR_MAX is mod 255, which slightly biases
		 * the distribution and can never produce 0xFF; '& 0xFF' was
		 * presumably intended -- confirm before changing. */
		uuid_out[i] = rnum % UCHAR_MAX;
	}
}
+
+/* Since libuuid can call srand, we will generate our own uuids */
+void
+__psm2_uuid_generate(psm2_uuid_t uuid_out)
+{
+ PSM2_LOG_MSG("entering");
+ /* Prefer using urandom, fallback to drand48_r */
+ struct stat urandom_stat;
+ size_t nbytes;
+ int fd;
+ if(stat("/dev/urandom", &urandom_stat) != 0) {
+ psmi_make_drand_uuid(uuid_out);
+ return;
+ }
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if(fd == -1) {
+ psmi_make_drand_uuid(uuid_out);
+ } else {
+ nbytes = read(fd, (char *) uuid_out, 16);
+ if(nbytes != 16) {
+ psmi_make_drand_uuid(uuid_out);
+ }
+ close(fd);
+ }
+ PSM2_LOG_MSG("leaving");
+ return;
+}
+PSMI_API_DECL(psm2_uuid_generate)
+
/* Render uu as its canonical lowercase text form into out (>= 37 bytes). */
void
psmi_uuid_unparse(const uuid_t uu, char *out)
{
	uuid_unparse_lower(uu, out);
}
+
/* Parse canonical UUID text into uu; returns 0 on success, -1 on error. */
int
psmi_uuid_parse(const char *in, uuid_t uu)
{
	return uuid_parse(in, uu);
}
+
diff --git a/libuuid/psm_uuid.h b/libuuid/psm_uuid.h
new file mode 100644
index 0000000..09df044
--- /dev/null
+++ b/libuuid/psm_uuid.h
@@ -0,0 +1,78 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
#ifndef _PSM_UUID_H
#define _PSM_UUID_H
/*
 * Broken-out field view of a 16-byte UUID; uuid_pack()/uuid_unpack()
 * convert between this and the big-endian packed byte-array form.
 * NOTE(review): this header uses uint32_t/uint8_t and psm2_uuid_t without
 * including their definitions -- the .c files include psm_user.h first;
 * confirm all includers do the same.
 */
struct uuid {
	uint32_t time_low;
	uint16_t time_mid;
	uint16_t time_hi_and_version;
	uint16_t clock_seq;
	uint8_t node[6];
};

/* Packed (wire) representation: 16 raw bytes. */
typedef unsigned char uuid_t[16];

/* PSM-facing wrappers around the local libuuid-style helpers below. */
int psmi_uuid_parse(const char *in, psm2_uuid_t uu);
void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out);
int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB);
/* memcmp-style: <0, 0, >0 as uu1 sorts before, equal to, after uu2 */
int uuid_compare(const uuid_t uu1, const uuid_t uu2);
void uuid_pack(const struct uuid *uu, uuid_t ptr);
/* the unparse variants write 36 chars + NUL: out must hold >= 37 bytes */
void uuid_unparse(const uuid_t uu, char *out);
void uuid_unparse_upper(const uuid_t uu, char *out);
void uuid_unparse_lower(const uuid_t uu, char *out);
void uuid_unpack(const uuid_t in, struct uuid *uu);
/* returns 0 on success, -1 on malformed input */
int uuid_parse(const char *in, uuid_t uu);
#endif
diff --git a/libuuid/unpack.c b/libuuid/unpack.c
new file mode 100644
index 0000000..26e4394
--- /dev/null
+++ b/libuuid/unpack.c
@@ -0,0 +1,63 @@
+/*
+ * Internal routine for unpacking UUID
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_unpack(const uuid_t in, struct uuid *uu)
+{
+ const uint8_t *ptr = in;
+ uint32_t tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_low = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_mid = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_hi_and_version = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->clock_seq = tmp;
+
+ memcpy(uu->node, ptr, 6);
+}
+
diff --git a/libuuid/unparse.c b/libuuid/unparse.c
new file mode 100644
index 0000000..d859379
--- /dev/null
+++ b/libuuid/unparse.c
@@ -0,0 +1,75 @@
+/*
+ * unparse.c -- convert a UUID to string
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdio.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
/* printf formats producing the canonical 8-4-4-4-12 hex text form */
static const char *fmt_lower =
	"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x";

static const char *fmt_upper =
	"%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X";

#ifdef UUID_UNPARSE_DEFAULT_UPPER
#define FMT_DEFAULT fmt_upper
#else
#define FMT_DEFAULT fmt_lower
#endif

/*
 * Format uu into out using fmt (one of the two formats above).
 * sprintf() is unbounded, but the format yields a fixed 36 characters;
 * out must therefore have room for at least 37 bytes (36 + NUL).
 */
static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt)
{
	struct uuid uuid;

	uuid_unpack(uu, &uuid);
	sprintf(out, fmt,
		uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
		uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
		uuid.node[0], uuid.node[1], uuid.node[2],
		uuid.node[3], uuid.node[4], uuid.node[5]);
}
+
/* Write uu to out as 36 lowercase hex chars + NUL (out >= 37 bytes). */
void uuid_unparse_lower(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, fmt_lower);
}
+
/* Write uu to out as 36 uppercase hex chars + NUL (out >= 37 bytes). */
void uuid_unparse_upper(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, fmt_upper);
}
+
/* Write uu in the compile-time default case (lower unless
 * UUID_UNPARSE_DEFAULT_UPPER is defined); out >= 37 bytes. */
void uuid_unparse(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, FMT_DEFAULT);
}
diff --git a/makesdeb.sh b/makesdeb.sh
new file mode 100755
index 0000000..072f741
--- /dev/null
+++ b/makesdeb.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
# Stop on error
set -e

# Accepted single-letter debuild build modes; default is -F.
BUILD_OPTS="gFGbBAS"
BUILD_OPT=F

# literate STRING SEP -- insert SEP between adjacent characters of STRING
# (sed \B matches between word characters), e.g. literate gFG '|' -> g|F|G.
# Used only to render the usage text.
function literate()
{
	echo $(sed "s/\B/&$2/g" <<< "$1")
}

# usage [STATUS] -- print the usage line and exit with STATUS
# (with no argument, bare 'exit' propagates the echo's status: 0).
function usage()
{
	echo "Usage: ${0##*/} [-h] [debuild -($(literate $BUILD_OPTS '|'))]"
	exit $1
}

# -h prints help; any letter from BUILD_OPTS selects the debuild mode;
# anything else is an error (exit 1).
while getopts "h$BUILD_OPTS" OPT; do
	case $OPT in
		h)
			usage
			;;
		\?)
			usage 1
			;;
		*)
			BUILD_OPT=$OPT
			;;
	esac
done

# Remove parsed options
shift $((OPTIND-1))

# Check if we have any non-option parameters
test ! $# -eq 0 && usage

# Annotate changelog
cat debian/changelog.in > debian/changelog

# Derive a dotted package version from the most recent v* tag reported by
# 'git describe --long', then record it in the changelog via debchange.
GIT_TAG_PREFIX=v
GIT_TAG_RELEASE=$(git describe --tags --long --match="$GIT_TAG_PREFIX*")
VERSION=$(sed -e "s/^$GIT_TAG_PREFIX\(.\+\)-\(.\+\)-.\+/\1_\2/" -e 's/_/./g' <<< "$GIT_TAG_RELEASE")

debchange --newversion=$VERSION "Bump up version to $VERSION"

debchange --release ""

# Build package
debuild -$BUILD_OPT -tc

echo "The deb package(s) is (are) in parent directory"
diff --git a/makesrpm.sh b/makesrpm.sh
new file mode 100755
index 0000000..e673b35
--- /dev/null
+++ b/makesrpm.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#It makes no sense to have both CUDA and non-CUDA in the same invocation
+#as they require different versions of the hfi1_user.h at this point in time.
+#Limiting this script to only build CUDA if requested
+
#default BUILDARG to build source RPM only
BUILDARG=s
RPM_NAME=libpsm2

# Print help text and exit with status 1.
function usage()
{
	echo "Usage: $0 [OPTION] [OPTION] [OPTION]"
	echo " "
	echo "Creates tar ball of source and source rpms by default."
	echo "Optionally generates binary rpm(s) "
	echo " "
	echo " s,a,b,p,c,i,l"
	echo "   Optional, default is s (sourcerpm)"
	echo "   Set single extension letter for rpmbuild -b argument"
	echo " -r <name>, -rpmname <name>"
	echo "   Optional, set the output rpm name"
	echo " -e <basename ext>, -baseext <basename ext>"
	echo "   Optional, set a base name extension"
	echo "   This only appends an extra string onto the base RPM name"
	echo "   Does not affect supporting RPMs"
	echo " -c, -cuda"
	echo "   Optional, default is unset"
	echo "   Sets PSM_CUDA=1, creating -cuda based spec and rpms"
	echo " -d <path>, -dir <path>"
	echo "   Optionally sets output folder for rpmbuild to use"
	echo " Examples:"
	echo "  $0 b"
	echo "  $0 s -cuda"
	echo "  $0 -cuda"
	echo "  $0 -d ./temp"
	echo "  $0 b -cuda -dir output"
	exit 1
}

err=0

# OUTDIR is where the Makefile places its meta-data
OUTDIR=build_release

# Set TEMPDIR first, so user control can override the value
# This is where rpmbuild places rpm(s) and uses its build meta-data.
# It can be set the same as OUTDIR, and work just fine if desired.
TEMPDIR=temp.$$

while [ "$1" != "" ]; do
	case $1 in
	-d | -dir) shift
		if [ -z "$1" ]; then
			usage
		fi
		TEMPDIR=$1
		;;
	-c | -cuda) export PSM_CUDA=1
		RPM_EXT="-cuda"
		;;
	-e | -baseext) shift
		if [ -z "$1" ]; then
			usage
		fi
		# BUGFIX: was '$RPM_NAME_BASEEXT="$1"' -- the leading '$'
		# expands the (empty) variable and makes the shell try to
		# execute the result as a command instead of assigning.
		# The export below performs the assignment.
		export RPM_NAME_BASEEXT="$1"
		;;
	-r | -rpmname) shift
		if [ -z "$1" ]; then
			usage
		fi
		# BUGFIX: was '$RPM_NAME="$1"', same '$'-prefix error as above.
		export RPM_NAME="$1"
		;;
	s|a|b|p|c|i|l) BUILDARG=$1
		;;
	* ) err=1
		usage
		;;
	esac
	shift
done

# Generic cleanup, build, and tmp folder creation
make distclean OUTDIR=$OUTDIR
make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT dist OUTDIR=$OUTDIR
mkdir -p ./$TEMPDIR/{BUILD,RPMS,SOURCES,SPECS,SRPMS,BUILDROOT}
# Different paths based on RPM_EXT
cp ${OUTDIR}/$RPM_NAME-*.tar.gz $TEMPDIR/SOURCES
make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT specfile OUTDIR=$OUTDIR
cp ${OUTDIR}/$RPM_NAME.spec $TEMPDIR/SPECS
rpmbuild -b$BUILDARG --define "_topdir $PWD/$TEMPDIR" --nodeps $TEMPDIR/SPECS/$RPM_NAME.spec

echo "The SRPM(s) are in $TEMPDIR/SRPMS/`ls $TEMPDIR/SRPMS`"
diff --git a/mpspawn/mpspawn_stats.h b/mpspawn/mpspawn_stats.h
new file mode 100644
index 0000000..4382587
--- /dev/null
+++ b/mpspawn/mpspawn_stats.h
@@ -0,0 +1,132 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
#ifndef _MPSPAWN_STATS_H
#define _MPSPAWN_STATS_H

#include <math.h>

/* Version number of this stats-registration interface. */
#define MPSPAWN_STATS_VERSION 1

/*
 * Per-statistic flag bits: a type in the low bits, reduction selectors in
 * the 0x1000-0x4000 range, plus a skip-if-zero modifier.  Each enumerator
 * is mirrored by an identical #define -- presumably so the values are also
 * visible to the preprocessor (#ifdef); confirm before removing either set.
 */
typedef enum {
	MPSPAWN_STATS_TYPE_DOUBLE = 0x1,
#define MPSPAWN_STATS_TYPE_DOUBLE 0x1
	MPSPAWN_STATS_TYPE_HEADER = 0x2,
#define MPSPAWN_STATS_TYPE_HEADER 0x2
	MPSPAWN_STATS_REDUCTION_MAX = 0x1000,
#define MPSPAWN_STATS_REDUCTION_MAX 0x1000
	MPSPAWN_STATS_REDUCTION_MIN = 0x2000,
#define MPSPAWN_STATS_REDUCTION_MIN 0x2000
	MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000,
#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000
	MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000
#define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000
} mpspawn_stats_flags;

/* Convenience mask selecting all three reductions. */
#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \
	MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN)

/* Reinterpret a double's bits as uint64_t via a pointer cast.
 * NOTE(review): this is a strict-aliasing type pun -- memcpy would be the
 * UB-free form; confirm compiler flags before relying on it. */
#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg)))
/* All-ones sentinel marking "no value" in u64-carried stats. */
#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL)
#define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64)

#define MPSPAWN_NAN ((uint64_t) ~0ULL) /* NAN) */
#define MPSPAWN_ISNAN(x) (isnan(x))

struct mpspawn_stats_add_args; /* client->mpspawn stats registration */
struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */
struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */

/* Clients implement this function to fill in mpspawn request for stats */
typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *);
/* mpspawn implements this function to allow clients to register new stats */
typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *);
/* mpspawn implements this function to map rank indexes into epaddr structs */
struct psm2_epaddr;
typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank);

/* Arguments the client's req_fn fills when mpspawn requests stat values. */
typedef struct mpspawn_stats_req_args {
	int version;		/* interface version */
	int num;		/* entries in stats[] and flags[] */
	uint64_t *stats;
	uint16_t *flags;	/* mpspawn_stats_flags bits per entry */
	void *context;		/* opaque client context from registration */
} mpspawn_stats_req_args_t;

/* Arguments a client passes to mpspawn when registering new statistics. */
typedef
struct mpspawn_stats_add_args {
	int version;
	int num;
	char *header;		/* group heading for these stats */
	char **desc;		/* one description string per stat */
	uint16_t *flags;
	mpspawn_stats_req_fn req_fn;	/* callback to fetch the values */
	void *context;		/* handed back verbatim in req args */
} mpspawn_stats_add_args_t;

/* Arguments mpspawn passes down to the client at stats initialization. */
typedef
struct mpspawn_stats_init_args {
	int version;
	psm2_mq_t mq;		/* initialized mq endpoint */
	int num_epaddr;		/* number of endpoints in job */
	mpspawn_stats_add_fn add_fn;	/* function for client to add stats */
	mpspawn_map_epaddr_fn epaddr_map_fn;
	const char *stats_types;	/* stats type string mpirun -M */
} mpspawn_stats_init_args_t;

/* Function in psm exposed to register stats */
void *psmi_stats_register(struct mpspawn_stats_init_args *args);

#endif
diff --git a/opa/Makefile b/opa/Makefile
new file mode 100644
index 0000000..d065429
--- /dev/null
+++ b/opa/Makefile
@@ -0,0 +1,113 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+# Builds the libopa object files (debug, time, proto, service, sysfs,
+# syslog, i2cflash and the arch-specific dword-copy routines); there is
+# no link step here, the objects are consumed by the top-level Makefile.
+OUTDIR = .
+
+TARGLIB := libopa
+MAJOR := $(OPA_LIB_MAJOR)
+MINOR := $(OPA_LIB_MINOR)
+
+# Resolve an absolute source dir so the pattern rules below work when
+# OUTDIR points outside the source tree.
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+BASECFLAGS += -D_GNU_SOURCE
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
+
+# x86_64 additionally builds the hand-written fast copy routine.
+ifeq (${arch},x86_64)
+ PLATFORM_OBJ=opa_dwordcpy-x86_64-fast.o
+else
+ PLATFORM_OBJ=
+endif
+
+${TARGLIB}-objs := opa_debug.o opa_time.o opa_proto.o \
+ opa_service.o opa_utils.o \
+ opa_dwordcpy-$(arch).o opa_i2cflash.o opa_sysfs.o opa_syslog.o \
+ $(PLATFORM_OBJ)
+
+# Prefix every object with OUTDIR; derive the auto-dependency files.
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+install: all
+ @echo "Nothing to do for install."
+
+# Generate .d dependency files with the compiler (-MM skips system hdrs,
+# -MQ names the corresponding object as the rule target).
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.S
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+# Objects order-only depend on the .d files so deps exist before -include.
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.S | ${DEPS}
+ $(CC) $(ASFLAGS) -c $< -o $@
+
+clean:
+ @rm -f $(OUTDIR)/_revision.c
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno ${TARGLIB}.*; \
+ cd -; \
+ fi
+
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
diff --git a/opa/opa_debug.c b/opa/opa_debug.c
new file mode 100644
index 0000000..71b0003
--- /dev/null
+++ b/opa/opa_debug.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <execinfo.h>
+#include <fcntl.h>
+#include <ucontext.h>
+#include "opa_user.h"
+#include "../psm_log.h"
+
+unsigned hfi_debug = 1;
+char *__hfi_mylabel = NULL;
+FILE *__hfi_dbgout;
+static void init_hfi_mylabel(void) __attribute__ ((constructor));
+static void init_hfi_backtrace(void) __attribute__ ((constructor));
+static void init_hfi_dbgfile(void) __attribute__ ((constructor));
+static void fini_hfi_backtrace(void) __attribute__ ((destructor));
+static struct sigaction SIGSEGV_old_act;
+static struct sigaction SIGBUS_old_act;
+static struct sigaction SIGILL_old_act;
+static struct sigaction SIGABRT_old_act;
+static struct sigaction SIGINT_old_act;
+static struct sigaction SIGTERM_old_act;
+#ifdef HFI_BRAKE_DEBUG
+static void hfi_brake_debug(void) __attribute__ ((constructor));
+
+/*
+ How to use hfi_brake_debug code:
+
+ 1. Build psm with HFI_BRAKE_DEBUG set in the environment.
+ 2. Create a script for your test case (e.g. mpistress?). In the script
+ make sure to choose a HFI brake file that corresponds to a network
+ file system that is common to all hosts where you will run your code.
+ Also, in the script, make sure to propagate the "HFI_BRAKE_FILE_NAME"
+ env var to all hosts.
+ 3. Bring up 3 putty sessions to one of the hosts that your script uses.
+ 4. In putty session number 1, touch the HFI_BRAKE_FILE and sync.
+ 5. In putty session number 1, start the script. You should see messages
+ of the form:
+-bash-4.2$ ./mpistress.0304.sc
+<hostname>:5716 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:5717 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3456 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3457 remove the file: "/nfs/user/HFI_BRAKE" to continue
+
+ Note that the hostname and process id are shown for all of the processes that are started
+ by your script.
+ 6. In putty session 2, bring up gdb, and debug the program that is referenced in your script.
+ For example: /usr/mpi/gcc/openmpi-1.10.2-hfi/tests/intel/mpi_stress
+ 7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1.
+ 8. Note, at this point, you have only one gdb session. I leave it as an exercise to the reader to
+ determine how to bring up multiple gdb sessions.
+ 9. In putty session 3, rm the HFI_BRAKE_FILE.
+ 10. You are now debugging a live session of psm.
+ */
+
+/* Constructor-time debug barrier: blocks this process until the "brake"
+ * file (HFI_BRAKE_FILE_NAME, default /tmp/HFI_BRAKE_FILE) is removed,
+ * giving the user time to attach gdb (see the recipe above).
+ * NOTE(review): uses stat()/struct stat; assumes <sys/stat.h> comes in
+ * indirectly (e.g. via opa_user.h) -- confirm. */
+static void hfi_brake_debug(void)
+{
+ struct stat buff;
+ char hostname[80];
+ const char *hfi_brake_file_name = getenv("HFI_BRAKE_FILE_NAME");
+ gethostname(hostname, 80);
+ hostname[sizeof(hostname) - 1] = '\0'; /* gethostname need not terminate */
+
+ if (!hfi_brake_file_name)
+ hfi_brake_file_name = "/tmp/HFI_BRAKE_FILE";
+ printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+ while (0 == stat(hfi_brake_file_name, &buff))
+ {
+ printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+ sleep(10); /* re-poll every 10s until the file disappears */
+ }
+ printf("%s:%d continuing.\n",hostname,getpid());
+}
+#endif
+
+/* Constructor: build the default debug label "<hostname>.<rank>" using
+ * the first non-empty MPI rank env var found, or "<hostname>.<pid>" as a
+ * fallback.  The result is strdup()'d into __hfi_mylabel and may later
+ * be replaced via hfi_set_mylabel(). */
+static void init_hfi_mylabel(void)
+{
+ char lbl[1024];
+ char hostname[80];
+ char *e;
+ /* By default, try to come up with a decent default label, it will be
+ * overridden later. Try getting rank, if that's not available revert to
+ * pid. */
+ gethostname(hostname, 80);
+ lbl[0] = '\0';
+ hostname[sizeof(hostname) - 1] = '\0'; /* gethostname need not terminate */
+ if ((((e = getenv("PSC_MPI_RANK")) && *e)) ||
+ (((e = getenv("MPI_RANKID")) && *e)) ||
+ (((e = getenv("MPIRUN_RANK")) && *e))) {
+ char *ep;
+ unsigned long val;
+ val = strtoul(e, &ep, 10);
+ if (ep != e) /* valid conversion */
+ snprintf(lbl, 1024, "%s.%lu", hostname, val);
+ }
+ /* no rank env var (or it was not numeric): fall back to the pid */
+ if (lbl[0] == '\0')
+ snprintf(lbl, 1024, "%s.%u", hostname, getpid());
+ __hfi_mylabel = strdup(lbl);
+}
+
+/* FIXME: This signal handler does not conform to the posix standards described
+ in 'man 7 signal' due to it calling unsafe functions.
+
+ See 'CALLS UNSAFE FUNCTION' notes below for examples.
+ */
+/* Fatal-signal handler: prints "<prog>:<pid> terminated with signal N"
+ * (plus PC/SP when a ucontext is available) and a backtrace to stderr,
+ * mirrors the same output to a "<prog>-<pid>,<host>.btr" file, then
+ * chains to the previously saved handler for the signal and exits.
+ * See the FIXME above: this deliberately calls async-signal-unsafe
+ * functions, tolerated only because the process is already dying. */
+static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv)
+{
+ /* we make these static to try and avoid issues caused
+ by stack overflow that might have gotten us here. */
+ static void *backaddr[128]; /* avoid stack usage */
+ static char buf[150], hname[64], fname[128];
+ static int i, j, fd, id;
+ extern char *__progname;
+ PSM_LOG_DECLARE_BT_BUFFER();
+
+ /* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */
+ PSM_LOG_BT(100,__FUNCTION__);
+ /* If this is a SIGINT do not display backtrace. Just invoke exit
+ handlers */
+ if ((sig == SIGINT) || (sig == SIGTERM))
+ /* CALLS UNSAFE FUNCTION (exit) */
+ exit(1);
+
+ /* CALLS UNSAFE FUNCTION (snprintf) */
+ id = snprintf(buf, sizeof(buf),
+ "\n%.60s:%u terminated with signal %d", __progname,
+ getpid(), sig);
+ if (ucv) {
+ static ucontext_t *uc;
+ uc = (ucontext_t *) ucv;
+ id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx",
+#if defined(__x86_64__)
+ (unsigned long)uc->uc_mcontext.gregs[REG_RIP],
+ (unsigned long)uc->uc_mcontext.gregs[REG_RSP]);
+#elif defined(__i386__)
+ (unsigned long)uc->uc_mcontext.gregs[REG_EIP],
+ (unsigned long)uc->uc_mcontext.gregs[REG_ESP]);
+#else
+ 0ul, 0ul);
+#warning No stack pointer or instruction pointer for this arch
+#endif
+ }
+ id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n");
+ /* CALLS UNSAFE FUNCTION (fprintf) */
+ fprintf(stderr, "%.*s", id, buf);
+
+ i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0]));
+ if (i > 2) /* skip ourselves and backtrace */
+ j = 2, i -= j;
+ else
+ j = 0;
+
+ backtrace_symbols_fd(backaddr + j, i, 2);
+ (void)fsync(2);
+
+ /* Try to write it to a file as well, in case the rest doesn't make it
+ out. Do it second, in case we get a second failure (more likely).
+ We might eventually want to print some more of the registers to the
+ btr file, to aid debugging, but not for now. Truncate the program
+ name if overly long, so we always get pid and (at least part of)
+ hostname. */
+ /* CALLS UNSAFE FUNCTION (gethostname) */
+ (void)gethostname(hname, sizeof(hname));
+ hname[sizeof(hname) - 1] = '\0';
+ /* BUGFIX: was "%s.80s-%u,..." which printed ".80s" literally; "%.80s"
+ actually truncates __progname to 80 chars as the comment intends. */
+ snprintf(fname, sizeof(fname), "%.80s-%u,%.32s.btr", __progname,
+ getpid(), hname);
+ if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) {
+ /* CALLS UNSAFE FUNCTION (fdopen) */
+ FILE *fp = fdopen(fd, "w");
+ if (fp)
+ fprintf(fp, "%.*s", id, buf);
+ backtrace_symbols_fd(backaddr + j, i, fd);
+ if (fp)
+ /* CALLS UNSAFE FUNCTION (fclose) */
+ fclose(fp);
+ }
+ /* Chain to the handler saved in init_hfi_backtrace().
+ NOTE(review): this assumes the old disposition used sa_sigaction;
+ if it was installed without SA_SIGINFO, sa_handler shares storage
+ with sa_sigaction in a union -- confirm this is safe here. */
+ switch (sig){
+ case SIGSEGV:
+ (*SIGSEGV_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGBUS:
+ (*SIGBUS_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGILL:
+ (*SIGILL_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGABRT:
+ (*SIGABRT_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ default:
+ break;
+ }
+ exit(1); /* not _exit(), want atexit handlers to get run */
+}
+
+/* We do this as a constructor so any user program that sets signal handlers
+ for these will override our settings, but we still get backtraces if they
+ don't.
+*/
+static void init_hfi_backtrace(void)
+{
+ /* we need to track memory corruption */
+ static struct sigaction act; /* easier than memset */
+ act.sa_sigaction = hfi_sighdlr;
+ act.sa_flags = SA_SIGINFO;
+
+ /* Backtrace handlers are opt-in: they are installed only when
+ HFI_BACKTRACE is set in the environment.  The previous
+ dispositions are saved so hfi_sighdlr() can chain to them and
+ fini_hfi_backtrace() can restore them. */
+ if (getenv("HFI_BACKTRACE")) {
+ (void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act);
+ (void)sigaction(SIGBUS, &act, &SIGBUS_old_act);
+ (void)sigaction(SIGILL, &act, &SIGILL_old_act);
+ (void)sigaction(SIGABRT, &act, &SIGABRT_old_act);
+ (void)sigaction(SIGINT, &act, &SIGINT_old_act);
+ (void)sigaction(SIGTERM, &act, &SIGTERM_old_act);
+ }
+}
+
+/* if HFI_DEBUG_FILENAME is set in the environment, then all the
+ debug prints (not info and error) will go to that file.
+ %h is expanded to the hostname, and %p to the pid, if present. */
+/* Constructor: open the debug output stream.  If HFI_DEBUG_FILENAME is
+ * unset, debug output goes to stdout.  Otherwise the value is used as a
+ * filename after expanding "%h" to the hostname and "%p" to the pid;
+ * the file is opened in append mode and made line-buffered.  On open
+ * failure we warn and fall back to stdout. */
+static void init_hfi_dbgfile(void)
+{
+ char *fname = getenv("HFI_DEBUG_FILENAME");
+ char *exph, *expp, tbuf[1024];
+ FILE *newf;
+
+ if (!fname) {
+ __hfi_dbgout = stdout;
+ return;
+ }
+ exph = strstr(fname, "%h"); /* hostname */
+ expp = strstr(fname, "%p"); /* pid */
+ if (exph || expp) {
+ int baselen;
+ char hname[256], pid[12];
+ if (exph) {
+ *hname = hname[sizeof(hname) - 1] = 0;
+ gethostname(hname, sizeof(hname) - 1);
+ if (!*hname)
+ strcpy(hname, "[unknown]");
+ }
+ if (expp)
+ snprintf(pid, sizeof(pid), "%d", getpid());
+ /* Splice the expansions into tbuf.  The four branches cover:
+ both tokens with %h first, both with %p first, only %h,
+ only %p.  Each token consumes 2 chars ("%h"/"%p"). */
+ if (exph && expp) {
+ if (exph < expp) {
+ baselen = exph - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+ baselen, fname, hname,
+ (int)(expp - (exph + 2)), exph + 2,
+ pid, expp + 2);
+ } else {
+ baselen = expp - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+ baselen, fname, pid,
+ (int)(exph - (expp + 2)), expp + 2,
+ hname, exph + 2);
+ }
+ } else if (exph) {
+ baselen = exph - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+ baselen, fname, hname, exph + 2);
+ } else {
+ baselen = expp - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+ baselen, fname, pid, expp + 2);
+ }
+ fname = tbuf;
+ }
+ newf = fopen(fname, "a");
+ if (!newf) {
+ _HFI_ERROR
+ ("Unable to open \"%s\" for debug output, using stdout: %s\n",
+ fname, strerror(errno));
+ __hfi_dbgout = stdout;
+ } else {
+ __hfi_dbgout = newf;
+ setlinebuf(__hfi_dbgout); /* line-buffered so messages appear promptly */
+ }
+}
+
+/* Replace the debug-output label set up by init_hfi_mylabel().
+ * NOTE(review): the previous strdup()'d label is not freed, and the
+ * caller's 'label' pointer is stored directly (caller must keep it
+ * alive) -- confirm intended ownership. */
+void hfi_set_mylabel(char *label)
+{
+ __hfi_mylabel = label;
+}
+
+/* Return the current debug-output label ("<host>.<rank|pid>" by default,
+ * or whatever hfi_set_mylabel() installed). */
+char *hfi_get_mylabel()
+{
+ return __hfi_mylabel;
+}
+
+/* Destructor: restore the signal dispositions saved by
+ * init_hfi_backtrace(), guarded by the same HFI_BACKTRACE check so the
+ * *_old_act structs are only used when they were actually filled in. */
+static void fini_hfi_backtrace(void)
+{
+ if (getenv("HFI_BACKTRACE")) {
+ (void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL);
+ (void)sigaction(SIGBUS, &SIGBUS_old_act, NULL);
+ (void)sigaction(SIGILL, &SIGILL_old_act, NULL);
+ (void)sigaction(SIGABRT, &SIGABRT_old_act, NULL);
+ (void)sigaction(SIGINT, &SIGINT_old_act, NULL);
+ (void)sigaction(SIGTERM, &SIGTERM_old_act, NULL);
+ }
+}
diff --git a/opa/opa_dwordcpy-generic.c b/opa/opa_dwordcpy-generic.c
new file mode 100644
index 0000000..929202d
--- /dev/null
+++ b/opa/opa_dwordcpy-generic.c
@@ -0,0 +1,298 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+/* Copy 'ndwords' 32-bit words from src to dest using 64-bit accesses,
+ * four qwords (8 dwords) per iteration, then a Duff-style switch for
+ * the 0-7 dword tail.  The [4] pointer arrays coax the compiler into
+ * issuing independent loads/stores.
+ * NOTE(review): the uint32_t* -> uint64_t* casts assume src/dest are
+ * 8-byte aligned when ndwords >= 8 -- confirm callers guarantee this. */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+ uint_fast32_t ndw = ndwords;
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = (const uint64_t *) src;
+ dst64[0] = (volatile uint64_t *) dest;
+
+ while (ndw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ ndw -= 8;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ }
+ if (ndw) {
+ src = (const uint32_t *) src64[0];
+ dest = (volatile uint32_t *) dst64[0];
+
+ /* tail: deliberate fallthrough copies exactly ndw dwords */
+ switch (ndw) {
+ case 7:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 6:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 5:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 3:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 2:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 1:
+ *dest++ = *src++;
+ }
+
+ }
+}
+
+/* Copy 'nqwords' 64-bit words from src to dest, unrolled to two groups
+ * of four qwords (8 per loop pass), with a Duff-style switch for the
+ * 0-7 qword tail.  Same independent-pointer trick as hfi_dwordcpy. */
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+ uint_fast32_t nqw = nqwords;
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ while (nqw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second group of four qwords in the same pass */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ nqw -= 8;
+ }
+ if (nqw) {
+ /* tail: deliberate fallthrough copies exactly nqw qwords */
+ switch (nqw) {
+ case 7:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 6:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 5:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 4:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 3:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 2:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 1:
+ *(dst64[0])++ = *(src64[0])++;
+ }
+ }
+}
+
+#ifdef __AVX512F__
+/* Copy 'nblock' 64-byte blocks with AVX-512 (one 512-bit load/store per
+ * block).  dest must be 64-byte aligned (asserted); src may be
+ * unaligned, selecting the loadu path.  nblock must be >= 1 (do/while).
+ * The (++dp) && (++sp) in the condition just advances the pointers;
+ * the incremented values are never NULL so the &&s are always true. */
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m512i *dp = (volatile __m512i *) dest;
+ const __m512i *sp = (const __m512i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+ /* source and destination are both 64 byte aligned */
+ do {
+ __m512i tmp0 = _mm512_load_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ } else {
+ /* only destination is 64 byte aligned - use unaligned loads */
+ do {
+ __m512i tmp0 = _mm512_loadu_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ }
+}
+#endif
+
+#ifdef __AVX2__
+/* Copy 'nblock' 64-byte blocks with AVX2 (two 256-bit load/store pairs
+ * per block).  dest must be 64-byte aligned (asserted); src alignment
+ * (32-byte) selects aligned vs unaligned loads.  nblock must be >= 1. */
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m256i *dp = (volatile __m256i *) dest;
+ const __m256i *sp = (const __m256i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+ /* source and destination are both 32 byte aligned */
+ do {
+ __m256i tmp0 = _mm256_load_si256(sp);
+ __m256i tmp1 = _mm256_load_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ } else {
+ /* only destination is 32 byte aligned - use unaligned loads */
+ do {
+ __m256i tmp0 = _mm256_loadu_si256(sp);
+ __m256i tmp1 = _mm256_loadu_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ }
+}
+#endif
+
+#ifdef __SSE2__
+/* Copy 'nblock' 64-byte blocks with SSE2 (four 128-bit load/store pairs
+ * per block).  dest must be 64-byte aligned (asserted); src alignment
+ * (16-byte) selects aligned vs unaligned loads.  nblock must be >= 1. */
+void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m128i *dp = (volatile __m128i *) dest;
+ const __m128i *sp = (const __m128i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0xf) == 0x0) {
+ /* source and destination are both 16 byte aligned */
+ do {
+ __m128i tmp0 = _mm_load_si128(sp);
+ __m128i tmp1 = _mm_load_si128(sp + 1);
+ __m128i tmp2 = _mm_load_si128(sp + 2);
+ __m128i tmp3 = _mm_load_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ } else {
+ /* only destination is 16 byte aligned - use unaligned loads */
+ do {
+ __m128i tmp0 = _mm_loadu_si128(sp);
+ __m128i tmp1 = _mm_loadu_si128(sp + 1);
+ __m128i tmp2 = _mm_loadu_si128(sp + 2);
+ __m128i tmp3 = _mm_loadu_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ }
+}
+#endif
+
+/* Scalar fallback: copy 'nblock' 64-byte blocks as eight 64-bit stores
+ * (two unrolled groups of four), using the same independent-pointer
+ * trick as hfi_qwordcpy.  dest must be 64-byte aligned (asserted);
+ * nblock must be >= 1 (do/while). */
+void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ psmi_assert((dst64[0] != NULL) && (src64[0] != NULL));
+ psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0);
+
+ do {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second half of the 64-byte block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ } while (--nblock);
+}
diff --git a/opa/opa_dwordcpy-i386.S b/opa/opa_dwordcpy-i386.S
new file mode 100644
index 0000000..f3d898d
--- /dev/null
+++ b/opa/opa_dwordcpy-i386.S
@@ -0,0 +1,84 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+ // i386 hfi_dwordcpy(dest, src, ndwords): copies ndwords 32-bit words
+ // via "rep movsd".  Stack args (cdecl): 4(%esp)=dest, 8(%esp)=src,
+ // 12(%esp)=count.  NOTE(review): .file says "opa_dword32cpy.S" but the
+ // file is opa_dwordcpy-i386.S -- likely a stale name, confirm.
+ .globl hfi_dwordcpy
+ .file "opa_dword32cpy.S"
+ .text
+ .p2align 4,,15
+hfi_dwordcpy:
+ // standard C calling convention, args on stack
+ // does not return any value
+ .type hfi_dwordcpy, @function
+ // %edi/%esi are callee-saved in the i386 ABI: preserve them in the
+ // scratch (caller-saved) regs %eax/%edx for the duration of the copy
+ mov %edi,%eax
+ mov %esi,%edx
+
+ // setup regs: %ecx=count, %edi=dest, %esi=src for rep movsd
+ mov 0xc(%esp,1),%ecx
+ mov 0x4(%esp,1),%edi
+ mov 0x8(%esp,1),%esi
+ // and do it (cld: copy forward)
+ cld
+ rep
+ movsd
+
+ // restore callee-saved regs
+ mov %eax,%edi
+ mov %edx,%esi
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/opa/opa_dwordcpy-x86_64-fast.S b/opa/opa_dwordcpy-x86_64-fast.S
new file mode 100644
index 0000000..fe07ebf
--- /dev/null
+++ b/opa/opa_dwordcpy-x86_64-fast.S
@@ -0,0 +1,77 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+ // x86_64 hfi_dwordcpy(dest=%rdi, src=%rsi, ndwords=%edx): copies
+ // count/2 qwords with "rep movsq", then the odd trailing dword (if
+ // any) with "rep movsd".  %rdi/%rsi/%rcx are scratch in the SysV
+ // x86_64 ABI, so nothing needs saving.
+ .globl hfi_dwordcpy
+ .file "opa_dwordcpy-x86_64-fast.S"
+ .text
+ .p2align 4,,15
+ // standard C calling convention, rdi is dest, rsi is source, rdx is count
+ // does not return any value
+hfi_dwordcpy:
+ .type hfi_dwordcpy, @function
+ movl %edx,%ecx
+ shrl $1,%ecx // %ecx = count / 2 (qwords to copy)
+ andl $1,%edx // %edx = count & 1 (leftover dword)
+ cld // copy forward
+ rep
+ movsq
+ movl %edx,%ecx // copy the odd dword, if any
+ rep
+ movsd
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/opa/opa_dwordcpy-x86_64.c b/opa/opa_dwordcpy-x86_64.c
new file mode 100644
index 0000000..929202d
--- /dev/null
+++ b/opa/opa_dwordcpy-x86_64.c
@@ -0,0 +1,298 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+/*
+ * Copy 'ndwords' 32-bit words from src to dest.
+ *
+ * The bulk is moved 8 dwords per iteration as four 64-bit stores; the
+ * remaining 0-7 dwords are finished by a fall-through switch.  dest is
+ * volatile -- presumably device (PIO) memory, so each store must be
+ * issued individually and in order (NOTE(review): confirm intent).
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+ uint_fast32_t ndw = ndwords;
+ /* [0] holds the running cursor; [1]..[3] are per-iteration addresses
+ for the second through fourth qword of each 32-byte group. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = (const uint64_t *) src;
+ dst64[0] = (volatile uint64_t *) dest;
+
+ while (ndw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ ndw -= 8;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ }
+ if (ndw) {
+ /* 1-7 dwords left: fall back to 32-bit copies from the
+ current cursor position. */
+ src = (const uint32_t *) src64[0];
+ dest = (volatile uint32_t *) dst64[0];
+
+ switch (ndw) {
+ case 7:
+ *dest++ = *src++;
+ /* fall through */
+ case 6:
+ *dest++ = *src++;
+ /* fall through */
+ case 5:
+ *dest++ = *src++;
+ /* fall through */
+ case 4:
+ *dest++ = *src++;
+ /* fall through */
+ case 3:
+ *dest++ = *src++;
+ /* fall through */
+ case 2:
+ *dest++ = *src++;
+ /* fall through */
+ case 1:
+ *dest++ = *src++;
+ }
+
+ }
+}
+
+/*
+ * Copy 'nqwords' 64-bit words from src to dest.
+ *
+ * The bulk is moved 8 qwords per iteration as two unrolled groups of
+ * four 64-bit stores; the remaining 0-7 qwords are finished by a
+ * fall-through switch.  dest is volatile -- presumably device (PIO)
+ * memory, so store order matters (NOTE(review): confirm intent).
+ */
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+ uint_fast32_t nqw = nqwords;
+ /* [0] holds the running cursor; [1]..[3] are per-group addresses. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ while (nqw >= 8) {
+ /* first group of 4 qwords */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second group of 4 qwords */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ nqw -= 8;
+ }
+ if (nqw) {
+ /* 1-7 qwords left */
+ switch (nqw) {
+ case 7:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 6:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 5:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 4:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 3:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 2:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 1:
+ *(dst64[0])++ = *(src64[0])++;
+ }
+ }
+}
+
+#ifdef __AVX512F__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 512-bit vector
+ * moves (one store per block).  dest must be 64-byte aligned
+ * (asserted); src may be unaligned, in which case unaligned loads are
+ * used.  nblock must be >= 1: the do/while body runs before the count
+ * is tested.
+ */
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m512i *dp = (volatile __m512i *) dest;
+ const __m512i *sp = (const __m512i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+ /* source and destination are both 64 byte aligned */
+ do {
+ __m512i tmp0 = _mm512_load_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ /* pointer bumps ride in the condition; both are non-NULL,
+ so the && chain only stops when nblock reaches 0 */
+ } while ((--nblock) && (++dp) && (++sp));
+ } else {
+ /* only destination is 64 byte aligned - use unaligned loads */
+ do {
+ __m512i tmp0 = _mm512_loadu_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ }
+}
+#endif
+
+#ifdef __AVX2__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 256-bit vector
+ * moves (two stores per block).  dest is asserted 64-byte aligned --
+ * stricter than the 32 bytes the stores themselves require.  src may
+ * be unaligned (unaligned loads are used then).  nblock must be >= 1:
+ * the do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m256i *dp = (volatile __m256i *) dest;
+ const __m256i *sp = (const __m256i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+ /* source and destination are both 32 byte aligned */
+ do {
+ __m256i tmp0 = _mm256_load_si256(sp);
+ __m256i tmp1 = _mm256_load_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ /* pointer bumps (assignments, non-NULL results) ride in
+ the condition; only nblock can end the loop */
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ } else {
+ /* only destination is 32 byte aligned - use unaligned loads */
+ do {
+ __m256i tmp0 = _mm256_loadu_si256(sp);
+ __m256i tmp1 = _mm256_loadu_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ }
+}
+#endif
+
+#ifdef __SSE2__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 128-bit vector
+ * moves (four stores per block).  dest is asserted 64-byte aligned --
+ * stricter than the 16 bytes the stores themselves require.  src may
+ * be unaligned (unaligned loads are used then).  nblock must be >= 1:
+ * the do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m128i *dp = (volatile __m128i *) dest;
+ const __m128i *sp = (const __m128i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0xf) == 0x0) {
+ /* source and destination are both 16 byte aligned */
+ do {
+ __m128i tmp0 = _mm_load_si128(sp);
+ __m128i tmp1 = _mm_load_si128(sp + 1);
+ __m128i tmp2 = _mm_load_si128(sp + 2);
+ __m128i tmp3 = _mm_load_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ /* pointer bumps ride in the condition; only nblock can
+ end the loop */
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ } else {
+ /* only destination is 16 byte aligned - use unaligned loads */
+ do {
+ __m128i tmp0 = _mm_loadu_si128(sp);
+ __m128i tmp1 = _mm_loadu_si128(sp + 1);
+ __m128i tmp2 = _mm_loadu_si128(sp + 2);
+ __m128i tmp3 = _mm_loadu_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ }
+}
+#endif
+
+/*
+ * Scalar fallback: copy 'nblock' 64-byte blocks from src to dest as
+ * eight 64-bit stores per block (two unrolled groups of four).  dest
+ * must be 64-byte aligned (asserted).  nblock must be >= 1: the
+ * do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ /* [0] holds the running cursor; [1]..[3] are per-group addresses. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ psmi_assert((dst64[0] != NULL) && (src64[0] != NULL));
+ psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0);
+
+ do {
+ /* first 32 bytes of the block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second 32 bytes of the block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ } while (--nblock);
+}
diff --git a/opa/opa_i2cflash.c b/opa/opa_i2cflash.c
new file mode 100644
index 0000000..5b54bc2
--- /dev/null
+++ b/opa/opa_i2cflash.c
@@ -0,0 +1,87 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "opa_user.h"
+
+/*
+ * Compute the 8-bit additive checksum of a flash block.
+ *
+ * Sums ifp->if_length bytes of *ifp (capped at sizeof(struct
+ * hfi_flash)), subtracts the stored if_csum byte so the stored value
+ * does not affect the result, and returns the one's complement of the
+ * sum.  If 'adjust' is non-zero, the computed checksum is also written
+ * back into ifp->if_csum.
+ */
+uint8_t hfi_flash_csum(struct hfi_flash *ifp, int adjust)
+{
+ uint8_t *ip = (uint8_t *) ifp;
+ uint8_t csum = 0, len;
+
+ /*
+ * Limit length checksummed to max length of actual data.
+ * Checksum of erased eeprom will still be bad, but we avoid
+ * reading past the end of the buffer we were passed.
+ */
+ len = ifp->if_length;
+ if (len > sizeof(struct hfi_flash))
+ len = sizeof(struct hfi_flash);
+ while (len--)
+ csum += *ip++;
+ csum -= ifp->if_csum; /* back out the stored checksum byte */
+ csum = ~csum;
+ if (adjust)
+ ifp->if_csum = csum;
+ return csum;
+}
diff --git a/opa/opa_proto.c b/opa/opa_proto.c
new file mode 100644
index 0000000..c9eb9f4
--- /dev/null
+++ b/opa/opa_proto.c
@@ -0,0 +1,578 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "ipserror.h"
+#include "opa_user.h"
+#include "opa_udebug.h"
+
+#include <sched.h>
+
+#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+
+/* It is allowed to have multiple devices (and of different types)
+ simultaneously opened and initialized, although this is (still! Oct 07)
+ not fully implemented. This routine is used by the low level hfi
+ protocol code (and any other code that has similar low level
+ functionality).
+ This is the only routine that takes a file descriptor, rather than a
+ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything
+ else is returned as part of hfi1_base_info.
+*/
+/*
+ * Initialize an hfi1 context on an already-opened driver fd.
+ *
+ * fd    - file descriptor for the hfi1 device, opened by the caller
+ * uinfo - in/out context-assignment parameters; the driver's answers
+ *         (userversion, subctxt_cnt/id, ...) are copied back into it
+ *
+ * Sequence: (1) ASSIGN_CTXT, (2) CTXT_INFO + sanity checks,
+ * (3) optional CPU affinity, (4) USER_INFO, then mmap of all the
+ * context resources (credits, PIO buffers, rcvhdrq, eager buffers,
+ * sdma completion queue, CSRs, event/status pages, subcontext areas).
+ *
+ * Returns a freshly calloc'ed struct _hfi_ctrl on success, or NULL on
+ * failure (errno set by the failing driver command or mmap).
+ */
+struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo)
+{
+ struct _hfi_ctrl *spctrl = NULL;
+ struct hfi1_ctxt_info *cinfo;
+ struct hfi1_base_info *binfo;
+ void *tmp;
+ uint64_t *tmp64;
+ struct hfi1_cmd c;
+ uintptr_t pg_mask;
+ int __hfi_pg_sz;
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ /* for major version 6 of driver, we will use uinfo_new. See below for details. */
+ struct hfi1_user_info uinfo_new = {0};
+#endif
+
+ /* First get the page size */
+ /* NOTE(review): sysconf() failure (-1) is not checked here; pg_mask
+ would be bogus in that case -- presumably _SC_PAGESIZE never fails
+ on supported platforms; confirm. */
+ __hfi_pg_sz = sysconf(_SC_PAGESIZE);
+ pg_mask = ~(intptr_t) (__hfi_pg_sz - 1);
+
+ if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) {
+ _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n",
+ strerror(errno));
+ goto err;
+ }
+ cinfo = &spctrl->ctxt_info;
+ binfo = &spctrl->base_info;
+
+ _HFI_VDBG("uinfo: ver %x, alg %d, subc_cnt %d, subc_id %d\n",
+ uinfo->userversion, uinfo->hfi1_alg,
+ uinfo->subctxt_cnt, uinfo->subctxt_id);
+
+ /* 1. ask driver to assign context to current process */
+ memset(&c, 0, sizeof(struct hfi1_cmd));
+ c.type = PSMI_HFI_CMD_ASSIGN_CTXT;
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ /* If psm is communicating with a MAJOR version 6 driver, we need
+ to pass in an actual struct hfi1_user_info not a hfi1_user_info_dep.
+ Else if psm is communicating with a MAJOR version 5 driver, we can
+ just continue to pass a hfi1_user_info_dep as struct hfi1_user_info_dep
+ is identical to the MAJOR version 5 struct hfi1_user_info. */
+ if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR)
+ {
+ /* If psm is communicating with a MAJOR version 6 driver,
+ we copy uinfo into uinfo_new and pass uinfo_new to the driver. */
+ c.len = sizeof(uinfo_new);
+ c.addr = (__u64) (&uinfo_new);
+
+ uinfo_new.userversion = uinfo->userversion;
+ uinfo_new.pad = uinfo->pad;
+ uinfo_new.subctxt_cnt = uinfo->subctxt_cnt;
+ uinfo_new.subctxt_id = uinfo->subctxt_id;
+ memcpy(uinfo_new.uuid,uinfo->uuid,sizeof(uinfo_new.uuid));
+ }
+ else
+ {
+ /* If psm is working with an old driver, we continue to use
+ the struct hfi1_user_info_dep version of the struct: */
+ c.len = sizeof(*uinfo);
+ c.addr = (__u64) uinfo;
+ }
+#else
+ c.len = sizeof(*uinfo);
+ c.addr = (__u64) uinfo;
+#endif
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ if (errno == ENODEV) {
+ _HFI_INFO("PSM2 and driver version mismatch\n");
+ /* Overwrite errno. One would wish that the driver
+ * didn't return ENODEV for a version mismatch */
+ errno = EPROTONOSUPPORT;
+ } else {
+ _HFI_INFO("assign_context command failed: %s\n",
+ strerror(errno));
+ }
+ goto err;
+ }
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR)
+ {
+ /* for the new driver, we copy the results of the call back to uinfo from
+ uinfo_new. */
+ uinfo->userversion = uinfo_new.userversion;
+ uinfo->pad = uinfo_new.pad;
+ uinfo->subctxt_cnt = uinfo_new.subctxt_cnt;
+ uinfo->subctxt_id = uinfo_new.subctxt_id;
+ memcpy(uinfo->uuid,uinfo_new.uuid,sizeof(uinfo_new.uuid));
+ }
+#endif
+
+ /* 2. get context info from driver */
+ c.type = PSMI_HFI_CMD_CTXT_INFO;
+ c.len = sizeof(*cinfo);
+ c.addr = (__u64) cinfo;
+
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+ goto err;
+ }
+
+ /* sanity checking... */
+ if (cinfo->rcvtids%8) {
+ _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
+ goto err;
+ }
+ if (cinfo->egrtids%8) {
+ _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids);
+ goto err;
+ }
+ if (cinfo->rcvtids < cinfo->egrtids) {
+ _HFI_INFO("rcvtids(%d) < egrtids(%d)\n",
+ cinfo->rcvtids, cinfo->egrtids);
+ goto err;
+ }
+ if (cinfo->rcvhdrq_cnt%32) {
+ _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n",
+ cinfo->rcvhdrq_cnt);
+ goto err;
+ }
+ if (cinfo->rcvhdrq_entsize%64) {
+ _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n",
+ cinfo->rcvhdrq_entsize);
+ goto err;
+ }
+ if (cinfo->rcvegr_size%__hfi_pg_sz) {
+ _HFI_INFO("rcvegr_size not page multiple: %d\n",
+ cinfo->rcvegr_size);
+ goto err;
+ }
+
+ _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n",
+ cinfo->runtime_flags, cinfo->rcvegr_size);
+ _HFI_VDBG("ctxtinfo: active %d, unit %d, ctxt %d, subctxt %d\n",
+ cinfo->num_active, cinfo->unit, cinfo->ctxt, cinfo->subctxt);
+ _HFI_VDBG("ctxtinfo: rcvtids %d, credits %d\n",
+ cinfo->rcvtids, cinfo->credits);
+ _HFI_VDBG("ctxtinfo: numa %d, cpu %x, send_ctxt %d\n",
+ cinfo->numa_node, cinfo->rec_cpu, cinfo->send_ctxt);
+ _HFI_VDBG("ctxtinfo: rcvhdrq_cnt %d, rcvhdrq_entsize %d\n",
+ cinfo->rcvhdrq_cnt, cinfo->rcvhdrq_entsize);
+ _HFI_VDBG("ctxtinfo: egrtids %d, sdma_ring_size %d\n",
+ cinfo->egrtids, cinfo->sdma_ring_size);
+
+ /* 3. (unnumbered in original) optionally bind this process to the
+ driver-recommended CPU; a failure here is logged but not fatal */
+ /* if affinity has not been setup, set it */
+ if ((!getenv("HFI_NO_CPUAFFINITY") && cinfo->rec_cpu != (__u16) -1) ||
+ getenv("HFI_FORCE_CPUAFFINITY")) {
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cinfo->rec_cpu, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
+ _HFI_INFO("Couldn't set runon processor %u "
+ "(unit:context %u:%u) (%u active chips): %s\n",
+ cinfo->rec_cpu, cinfo->unit, cinfo->ctxt,
+ cinfo->num_active, strerror(errno));
+ }
+ }
+
+
+ /* 4. Get user base info from driver */
+ c.type = PSMI_HFI_CMD_USER_INFO;
+ c.len = sizeof(*binfo);
+ c.addr = (__u64) binfo;
+
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno));
+ goto err;
+ }
+
+ hfi_set_user_version(binfo->sw_version);
+
+ _HFI_VDBG("baseinfo: hwver %x, swver %x, jkey %d, qp %d\n",
+ binfo->hw_version, binfo->sw_version,
+ binfo->jkey, binfo->bthqp);
+ _HFI_VDBG("baseinfo: credit_addr %llx, sop %llx, pio %llx\n",
+ binfo->sc_credits_addr, binfo->pio_bufbase_sop,
+ binfo->pio_bufbase);
+ _HFI_VDBG("baseinfo: hdrbase %llx, egrbase %llx, sdmabase %llx\n",
+ binfo->rcvhdr_bufbase, binfo->rcvegr_bufbase,
+ binfo->sdma_comp_bufbase);
+ _HFI_VDBG("baseinfo: ureg %llx, eventbase %llx, "
+ "statusbase %llx, tailaddr %llx\n", binfo->user_regbase,
+ binfo->events_bufbase, binfo->status_bufbase,
+ binfo->rcvhdrtail_base);
+
+ if (getenv("PSM2_IDENTIFY")) {
+ printf("%s %s run-time driver interface v%d.%d\n",
+ hfi_get_mylabel(), hfi_ident_tag, hfi_get_user_major_version(), hfi_get_user_minor_version());
+ }
+
+ /*
+ * Check if driver version matches PSM version,
+ * this is different from PSM API version.
+ */
+ if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) {
+ _HFI_INFO
+ ("User major version 0x%x not same as driver major 0x%x\n",
+ hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT);
+ if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version())
+ goto err; /* else assume driver knows how to be compatible */
+ } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) {
+ /* minor mismatch is debug-logged only, never fatal */
+ _HFI_PRDBG
+ ("User minor version 0x%x not same as driver minor 0x%x\n",
+ HFI1_USER_SWMINOR, binfo->sw_version & 0xffff);
+ }
+
+ /* Map the PIO credits address */
+ /* Each mmap below maps the page-aligned base; the sub-page offset
+ (addr & ~pg_mask) is re-applied where the address is used at a
+ finer granularity than a page. */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->sc_credits_addr &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO("mmap of sc_credits_addr (%llx) failed: %s\n",
+ (unsigned long long)binfo->sc_credits_addr,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, __hfi_pg_sz);
+ binfo->sc_credits_addr = (uint64_t) (uintptr_t) tmp |
+ (binfo->sc_credits_addr & ~pg_mask);
+ _HFI_VDBG("sc_credits_addr %llx\n",
+ binfo->sc_credits_addr);
+ }
+
+ /* Map the PIO buffer SOP address */
+ if ((tmp = hfi_mmap64(0, cinfo->credits * 64,
+ PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->pio_bufbase_sop & pg_mask))
+ == MAP_FAILED) {
+ _HFI_INFO("mmap of pio buffer sop at %llx failed: %s\n",
+ (unsigned long long)binfo->pio_bufbase_sop,
+ strerror(errno));
+ goto err;
+ } else {
+ /* Do not try to read the PIO buffers; they are mapped write */
+ /* only. We'll fault them in as we write to them. */
+ binfo->pio_bufbase_sop = (uintptr_t) tmp;
+ _HFI_VDBG("pio_bufbase_sop %llx\n",
+ binfo->pio_bufbase_sop);
+ }
+
+ /* Map the PIO buffer address */
+ if ((tmp = hfi_mmap64(0, cinfo->credits * 64,
+ PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->pio_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of pio buffer at %llx failed: %s\n",
+ (unsigned long long)binfo->pio_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* Do not try to read the PIO buffers; they are mapped write */
+ /* only. We'll fault them in as we write to them. */
+ binfo->pio_bufbase = (uintptr_t) tmp;
+ _HFI_VDBG("sendpio_bufbase %llx\n", binfo->pio_bufbase);
+ }
+
+ /* Map the receive header queue */
+ if ((tmp =
+ hfi_mmap64(0, cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvhdr_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of rcvhdrq at %llx failed: %s\n",
+ (unsigned long long)binfo->rcvhdr_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* for use in protocol code */
+ hfi_touch_mmap(tmp,
+ cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize);
+ binfo->rcvhdr_bufbase = (uintptr_t) tmp; /* set to mapped address */
+ _HFI_VDBG("rcvhdr_bufbase %llx\n", binfo->rcvhdr_bufbase);
+ }
+
+ /* Map the receive eager buffer */
+ if ((tmp =
+ hfi_mmap64(0, cinfo->egrtids * cinfo->rcvegr_size,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvegr_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of rcvegrq bufs from %llx failed: %s\n",
+ (unsigned long long)binfo->rcvegr_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, cinfo->egrtids * cinfo->rcvegr_size);
+ binfo->rcvegr_bufbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("rcvegr_bufbase %llx\n", binfo->rcvegr_bufbase);
+ }
+
+ /* Map the sdma completion queue */
+ if (!(cinfo->runtime_flags & HFI1_CAP_SDMA)) {
+ /* SDMA not enabled for this context; nothing to map */
+ binfo->sdma_comp_bufbase = 0;
+ } else
+ if ((tmp =
+ hfi_mmap64(0, cinfo->sdma_ring_size *
+ sizeof(struct hfi1_sdma_comp_entry),
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->sdma_comp_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of sdma completion queue from %llx failed: %s\n",
+ (unsigned long long)binfo->sdma_comp_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->sdma_comp_bufbase = (uint64_t) (uintptr_t) tmp;
+ }
+ _HFI_VDBG("sdma_comp_bufbase %llx\n", binfo->sdma_comp_bufbase);
+
+ /* Map RXE per-context CSRs */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_WRITE | PROT_READ, MAP_SHARED | MAP_LOCKED,
+ fd,
+ (__off64_t) binfo->user_regbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of user registers at %llx failed: %s\n",
+ (unsigned long long)binfo->user_regbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* we don't try to fault these in, no need */
+ binfo->user_regbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("user_regbase %llx\n", binfo->user_regbase);
+ }
+
+ /*
+ * Set up addresses for optimized register writeback routines.
+ * This is for the real onchip registers, shared context or not
+ */
+ tmp64 = (uint64_t *) tmp; /* tmp still holds the user_regbase mapping */
+ spctrl->__hfi_rcvhdrtail = (volatile __le64 *)&tmp64[ur_rcvhdrtail];
+ spctrl->__hfi_rcvhdrhead = (volatile __le64 *)&tmp64[ur_rcvhdrhead];
+ spctrl->__hfi_rcvegrtail =
+ (volatile __le64 *)&tmp64[ur_rcvegrindextail];
+ spctrl->__hfi_rcvegrhead =
+ (volatile __le64 *)&tmp64[ur_rcvegrindexhead];
+ spctrl->__hfi_rcvofftail =
+ (volatile __le64 *)&tmp64[ur_rcvegroffsettail];
+
+ if (!(cinfo->runtime_flags & HFI1_CAP_HDRSUPP)) {
+ /* no header suppression: point tidflow at scratch regs */
+ spctrl->__hfi_rcvtidflow = spctrl->regs;
+ spctrl->__hfi_tfvalid = 0;
+ } else {
+ spctrl->__hfi_rcvtidflow =
+ (volatile __le64 *)&tmp64[ur_rcvtidflowtable];
+ spctrl->__hfi_tfvalid = 1;
+ }
+
+ /* Map the rcvhdrq tail register address */
+ if (!(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ /*
+ * We don't use receive header queue tail register to detect
+ * new packets, but here we save the address for
+ * false-eager-full recovery.
+ */
+ binfo->rcvhdrtail_base =
+ (uint64_t) (uintptr_t) spctrl->__hfi_rcvhdrtail;
+ spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base;
+ } else
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvhdrtail_base &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO("mmap of rcvhdrq tail addr %llx failed: %s\n",
+ (unsigned long long)binfo->rcvhdrtail_base,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, __hfi_pg_sz);
+ binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) tmp;
+ spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base;
+ }
+ _HFI_VDBG("rcvhdr_tail_addr %llx\n", binfo->rcvhdrtail_base);
+
+ /* Map the event page */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->events_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of status page at %llx failed: %s\n",
+ (unsigned long long)binfo->events_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->events_bufbase = (uint64_t) (uintptr_t) tmp |
+ (binfo->events_bufbase & ~pg_mask);
+ _HFI_VDBG("events_bufbase %llx\n", binfo->events_bufbase);
+ }
+
+ /* Map the status page */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->status_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of status page (%llx) failed: %s\n",
+ (unsigned long long)binfo->status_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->status_bufbase = (uintptr_t) tmp;
+ _HFI_VDBG("status_bufbase %llx\n", binfo->status_bufbase);
+ }
+
+ /* If subcontext is used, map the buffers */
+ if (uinfo->subctxt_cnt) {
+ unsigned num_subcontexts = uinfo->subctxt_cnt;
+ size_t size;
+
+ size = __hfi_pg_sz;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_uregbase &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext uregbase array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_uregbase,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_uregbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_uregbase %llx\n",
+ binfo->subctxt_uregbase);
+ }
+
+ size = ALIGN(cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize,
+ __hfi_pg_sz) * num_subcontexts;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_rcvhdrbuf &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_rcvhdrbuf,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_rcvhdrbuf = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_rcvhdrbuf %llx\n",
+ binfo->subctxt_rcvhdrbuf);
+ }
+
+ size = ALIGN(cinfo->egrtids * cinfo->rcvegr_size,
+ __hfi_pg_sz) * num_subcontexts;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_rcvegrbuf &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_rcvegrbuf,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_rcvegrbuf = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_rcvegrbuf %llx\n",
+ binfo->subctxt_rcvegrbuf);
+ }
+ }
+
+ /* Save some info. */
+ spctrl->fd = fd;
+ spctrl->__hfi_unit = cinfo->unit;
+ /*
+ * driver should provide the port where the context is opened for, But
+ * OPA driver does not have port interface to psm because there is only
+ * one port. So we hardcode the port to 1 here. When we work on the
+ * version of PSM for the successor to OPA, we should have port returned
+ * from driver and will be set accordingly.
+ */
+ /* spctrl->__hfi_port = cinfo->port; */
+ spctrl->__hfi_port = 1;
+ spctrl->__hfi_tidegrcnt = cinfo->egrtids;
+ spctrl->__hfi_tidexpcnt = cinfo->rcvtids - cinfo->egrtids;
+
+ return spctrl;
+
+err:
+ /* NOTE(review): mappings established before a failure are not
+ unmapped here -- presumably reclaimed at process teardown or by
+ the caller closing fd; confirm. Only the control struct is freed. */
+ if (spctrl)
+ free(spctrl);
+ return NULL;
+}
diff --git a/opa/opa_service.c b/opa/opa_service.c
new file mode 100644
index 0000000..38e6518
--- /dev/null
+++ b/opa/opa_service.c
@@ -0,0 +1,909 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low
+ level hfi protocol code. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <poll.h>
+#include "opa_service.h"
+#include "psmi_wrappers.h"
+
+/* Software-interface version, viewable either as separate 16-bit
+ * minor/major fields or as a single 32-bit word.
+ * NOTE(review): with this field order, 'minor' maps to the low half of
+ * 'version' only on little-endian hosts -- confirm if portability to
+ * big-endian targets matters. */
+typedef union
+{
+	struct
+	{
+		uint16_t minor;
+		uint16_t major;
+	};
+	uint32_t version;
+} sw_version_t;
+
+/* Compiled-in user software version; may be overridden at runtime via
+ * hfi_set_user_version() / hfi_set_user_major_version() below. */
+static sw_version_t sw_version =
+{
+	{
+	.major = HFI1_USER_SWMAJOR,
+	.minor = HFI1_USER_SWMINOR
+	}
+};
+
+/*
+ * This function is necessary in a udev-based world. There can be an
+ * arbitrarily long (but typically less than one second) delay between
+ * a driver getting loaded and any dynamic special files turning up.
+ *
+ * The timeout is in milliseconds. A value of zero means "callee
+ * decides timeout". Negative is infinite.
+ *
+ * Returns 0 on success, -1 on error or timeout. Check errno to see
+ * whether there was a timeout (ETIMEDOUT) or an error (any other
+ * non-zero value).
+ */
+int hfi_wait_for_device(const char *path, long timeout)
+{
+	int saved_errno;
+	struct stat st;
+	long elapsed;
+	int ret;
+
+	/* timeout == 0 means "callee decides": default to 15 seconds. */
+	if (timeout == 0)
+		timeout = 15000;
+
+	elapsed = 0;
+
+	while (1) {
+		static const long default_ms = 250;
+		struct timespec req = { 0 };
+		long ms;
+
+		ret = stat(path, &st);
+		saved_errno = errno;
+
+		/* Done if the file exists, or on any error other than
+		 * "not there yet" (ENOENT). */
+		if (ret == 0 || (ret == -1 && errno != ENOENT))
+			break;
+
+		if ((timeout > 0) && ((timeout - elapsed) <= 0)) {
+			saved_errno = ETIMEDOUT;
+			break;
+		}
+
+		if (elapsed == 0) {
+			if (timeout < 0)
+				_HFI_DBG
+				    ("Device file %s not present on first check; "
+				     "waiting indefinitely...\n", path);
+			else
+				_HFI_DBG
+				    ("Device file %s not present on first check; "
+				     "waiting up to %.1f seconds...\n", path,
+				     timeout / 1e3);
+		}
+
+		/* Sleep in short slices so the deadline is never overshot.
+		 * Bug fix: the final slice is the *remaining* time
+		 * (timeout - elapsed); the old code slept the full
+		 * 'timeout' again, oversleeping past the deadline. */
+		if (timeout < 0 || timeout - elapsed >= default_ms)
+			ms = default_ms;
+		else
+			ms = timeout - elapsed;
+
+		elapsed += ms;
+		req.tv_nsec = ms * 1000000;
+
+		ret = nanosleep(&req, NULL);
+		saved_errno = errno;
+
+		if (ret == -1)
+			break;
+	}
+
+	if (ret == 0)
+		_HFI_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3);
+	else
+		_HFI_INFO
+		    ("The %s device failed to appear after %.1f seconds: %s\n",
+		     path, elapsed / 1e3, strerror(saved_errno));
+
+	errno = saved_errno;
+	return ret;
+}
+
+/* fwd declaration */
+ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count);
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+
+/* fwd declaration */
+ustatic int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count);
+/* Function pointer.  Starts on the ioctl() command transport and is
+ * switched to _hfi_cmd_write in hfi_context_open_ex() if the driver
+ * turns out to be an older, write()-based one. */
+static int (*_hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_ioctl;
+
+#else
+/* Function pointer.  Only the legacy write() transport is compiled in,
+ * so the pointer can be const. */
+static int (*const _hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_write;
+#endif
+
+/* Report the major half of the cached user software-interface version. */
+uint16_t hfi_get_user_major_version(void)
+{
+	uint16_t maj = sw_version.major;
+	return maj;
+}
+
+/* Override the major half of the cached user software-interface version. */
+void hfi_set_user_major_version(uint16_t major_version)
+{
+	sw_version.major = major_version;
+}
+
+/* Report the minor half of the cached user software-interface version. */
+uint16_t hfi_get_user_minor_version(void)
+{
+	uint16_t min = sw_version.minor;
+	return min;
+}
+
+/* Replace the whole cached version word (both major and minor) at once. */
+void hfi_set_user_version(uint32_t version)
+{
+	sw_version.version = version;
+}
+
+/* Open a context without reporting the resolved device name back to the
+ * caller.  Thin convenience wrapper around hfi_context_open_ex(). */
+int hfi_context_open(int unit, int port, uint64_t open_timeout)
+{
+	char scratch_name[256];
+
+	return hfi_context_open_ex(unit, port, open_timeout,
+				   scratch_name, sizeof(scratch_name));
+}
+
+/* Open the unit's device special file, waiting up to open_timeout ms for
+ * it to appear.  The resolved device path is written into dev_name
+ * (dev_name_len bytes).  Returns the open fd, or -1 on failure. */
+int hfi_context_open_ex(int unit, int port, uint64_t open_timeout,
+		     char *dev_name,size_t dev_name_len)
+{
+	int fd;
+
+	/* A specific unit maps to <HFI_DEVICE_PATH>_<unit>; "any" opens
+	   unit 0. */
+	if (unit != HFI_UNIT_ID_ANY && unit >= 0)
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH,
+			 unit);
+	else
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH,
+			 0);
+
+	if (hfi_wait_for_device(dev_name, (long)open_timeout) == -1) {
+		_HFI_DBG("Could not find an HFI Unit on device "
+			 "%s (%lds elapsed)", dev_name,
+			 (long)open_timeout / 1000);
+		return -1;
+	}
+
+	if ((fd = open(dev_name, O_RDWR)) == -1) {
+		_HFI_DBG("(host:Can't open %s for reading and writing",
+			 dev_name);
+		return -1;
+	}
+
+	/* Best effort only; failing to set CLOEXEC is logged, not fatal. */
+	if (fcntl(fd, F_SETFD, FD_CLOEXEC))
+		_HFI_INFO("Failed to set close on exec for device: %s\n",
+			  strerror(errno));
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+	{
+		/* if hfi1DriverMajor == -1, then we are potentially talking to a new driver.
+		   Let's confirm by issuing an ioctl version request: */
+		struct hfi1_cmd c;
+
+		memset(&c, 0, sizeof(struct hfi1_cmd));
+		c.type = PSMI_HFI_CMD_GET_VERS;
+		c.len = 0;
+		c.addr = 0;
+
+		if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+			/* Let's assume that the driver is the old driver */
+			hfi_set_user_major_version(IOCTL_CMD_API_MODULE_MAJOR - 1);
+			/* the old driver uses write() for its command interface: */
+			_hfi_cmd_send = _hfi_cmd_write;
+		}
+		else
+		{
+			/* The ioctl succeeded and returned the driver's
+			   version in c.addr. */
+			int major = c.addr >> HFI1_SWMAJOR_SHIFT;
+			if (major != hfi_get_user_major_version()) {
+				/* If there is a skew between the major version of the driver
+				   that is executing and the major version which was used during
+				   compilation of PSM, we treat that is a fatal error. */
+				_HFI_INFO("PSM2 and driver version mismatch: (%d != %d)\n",
+					  major, hfi_get_user_major_version());
+				close(fd);
+				return -1;
+			}
+		}
+	}
+
+#endif
+	return fd;
+}
+
+void hfi_context_close(int fd)
+{
+ (void)close(fd);
+}
+
+/* Forward a vectored command straight to the driver fd via writev(2). */
+int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt)
+{
+	int nwritten = writev(fd, iov, iovcnt);
+	return nwritten;
+}
+
+/* Dispatch a driver command through whichever transport is currently
+ * selected by the _hfi_cmd_send pointer (ioctl or legacy write). */
+int hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	return _hfi_cmd_send(fd, cmd, count);
+}
+
+/* Legacy transport: translate the PSM command enum into the old
+ * driver's write()-interface command numbers and send via psmi_write. */
+ustatic
+int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	/* Enum -> legacy command-number translation table; any enum value
+	 * not listed maps to 0. */
+	const static unsigned int cmdTypeToWriteNum[PSMI_HFI_CMD_LAST] = {
+        [PSMI_HFI_CMD_ASSIGN_CTXT]      = LEGACY_HFI1_CMD_ASSIGN_CTXT,
+        [PSMI_HFI_CMD_CTXT_INFO]        = LEGACY_HFI1_CMD_CTXT_INFO,
+        [PSMI_HFI_CMD_USER_INFO]        = LEGACY_HFI1_CMD_USER_INFO,
+        [PSMI_HFI_CMD_TID_UPDATE]       = LEGACY_HFI1_CMD_TID_UPDATE,
+        [PSMI_HFI_CMD_TID_FREE]         = LEGACY_HFI1_CMD_TID_FREE,
+        [PSMI_HFI_CMD_CREDIT_UPD]       = LEGACY_HFI1_CMD_CREDIT_UPD,
+        [PSMI_HFI_CMD_RECV_CTRL]        = LEGACY_HFI1_CMD_RECV_CTRL,
+        [PSMI_HFI_CMD_POLL_TYPE]        = LEGACY_HFI1_CMD_POLL_TYPE,
+        [PSMI_HFI_CMD_ACK_EVENT]        = LEGACY_HFI1_CMD_ACK_EVENT,
+        [PSMI_HFI_CMD_SET_PKEY]         = LEGACY_HFI1_CMD_SET_PKEY,
+        [PSMI_HFI_CMD_CTXT_RESET]       = LEGACY_HFI1_CMD_CTXT_RESET,
+        [PSMI_HFI_CMD_TID_INVAL_READ]   = LEGACY_HFI1_CMD_TID_INVAL_READ,
+        [PSMI_HFI_CMD_GET_VERS]         = LEGACY_HFI1_CMD_GET_VERS,
+	};
+
+	if (cmd->type < PSMI_HFI_CMD_LAST) {
+		/* NOTE: cmd->type is rewritten in place before sending. */
+		cmd->type = cmdTypeToWriteNum[cmd->type];
+
+		return psmi_write(fd, cmd, count);
+	} else {
+		errno = EINVAL;
+		return -1;
+	}
+}
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+/* New transport: map the PSM command enum onto the driver's ioctl
+ * numbers and issue the ioctl. */
+ustatic
+int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	/* Index 0: pass cmd->addr itself (it already refers to the command's
+	 * argument).  Index 1: pass the address of cmd->addr so the driver
+	 * can write a result back into it (e.g. GET_VERS returns the
+	 * version there; see hfi_context_open_ex). */
+	uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr };
+	/* Per-command ioctl number plus which addrOrLiteral slot to use. */
+	const static struct
+	{
+		unsigned int ioctlCmd;
+		unsigned int addrOrLiteralIdx;
+	} cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = {
+        [PSMI_HFI_CMD_ASSIGN_CTXT]      = {HFI1_IOCTL_ASSIGN_CTXT   , 0},
+        [PSMI_HFI_CMD_CTXT_INFO]        = {HFI1_IOCTL_CTXT_INFO     , 0},
+        [PSMI_HFI_CMD_USER_INFO]        = {HFI1_IOCTL_USER_INFO     , 0},
+        [PSMI_HFI_CMD_TID_UPDATE]       = {HFI1_IOCTL_TID_UPDATE    , 0},
+        [PSMI_HFI_CMD_TID_FREE]         = {HFI1_IOCTL_TID_FREE      , 0},
+        [PSMI_HFI_CMD_CREDIT_UPD]       = {HFI1_IOCTL_CREDIT_UPD    , 1},
+        [PSMI_HFI_CMD_RECV_CTRL]        = {HFI1_IOCTL_RECV_CTRL     , 1},
+        [PSMI_HFI_CMD_POLL_TYPE]        = {HFI1_IOCTL_POLL_TYPE     , 1},
+        [PSMI_HFI_CMD_ACK_EVENT]        = {HFI1_IOCTL_ACK_EVENT     , 1},
+        [PSMI_HFI_CMD_SET_PKEY]         = {HFI1_IOCTL_SET_PKEY      , 1},
+        [PSMI_HFI_CMD_CTXT_RESET]       = {HFI1_IOCTL_CTXT_RESET    , 1},
+        [PSMI_HFI_CMD_TID_INVAL_READ]   = {HFI1_IOCTL_TID_INVAL_READ, 0},
+        [PSMI_HFI_CMD_GET_VERS]         = {HFI1_IOCTL_GET_VERS      , 1},
+#ifdef PSM_CUDA
+	[PSMI_HFI_CMD_TID_UPDATE_V2]	= {HFI1_IOCTL_TID_UPDATE_V2 , 0},
+#endif
+	};
+
+	if (cmd->type < PSMI_HFI_CMD_LAST)
+		return psmi_ioctl(fd,
+				  cmdTypeToIoctlNum[cmd->type].ioctlCmd,
+				  addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]);
+	else
+	{
+		errno = EINVAL;
+		return -1;
+	}
+}
+#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */
+
+/* we use mmap64() because we compile in both 32 and 64 bit mode,
+ and we have to map physical addresses that are > 32 bits long.
+ While linux implements mmap64, it doesn't have a man page,
+ and isn't declared in any header file, so we declare it here ourselves.
+
+ We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+ redirects mmap to mmap64 for us, but at least through suse10 and fc4,
+ it doesn't work when the address being mapped is > 32 bits. It chips
+ off bits 32 and above. So we stay with mmap64. */
+/* Thin wrapper so callers get a declared mmap64 entry point (see the
+ * rationale in the comment block above). */
+void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd,
+		 __off64_t offset)
+{
+	return mmap64(addr, length, prot, flags, fd, offset);
+}
+
+/* Count consecutive device special files <HFI_DEVICE_PATH>_0, _1, ...
+ * until one is missing; returns how many were found (>= 0, 0 = none).
+ * Finding a device file does not guarantee a working chip behind it.
+ * Only the first unit is waited for (udev delay); later units are
+ * probed with a plain stat() to save time. */
+int hfi_get_num_units(void)
+{
+	int unit;
+
+	for (unit = 0;; unit++) {
+		char devpath[PATH_MAX];
+		struct stat sbuf;
+		int missing;
+
+		snprintf(devpath, sizeof(devpath), HFI_DEVICE_PATH "_%d",
+			 unit);
+		if (unit == 0)
+			missing = hfi_wait_for_device(devpath, 0);
+		else
+			missing = stat(devpath, &sbuf);
+		if (missing)
+			break;
+	}
+
+	return unit;
+}
+
+/* Given a unit number, returns 1 if any port on the unit is active.
+   returns 0 if no port on the unit is active.
+   returns -1 when an error occurred. */
+int hfi_get_unit_active(int unit)
+{
+	/* Bug fix: rv was uninitialized; if the port loop ran zero times
+	 * (HFI_MIN_PORT > HFI_MAX_PORT) the return value was garbage. */
+	int p, rv = -1;
+
+	for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++)
+		if ((rv = hfi_get_port_lid(unit, p)) > 0)
+			break;
+
+	/* Loop left early => some port had a positive LID => active. */
+	if (p <= HFI_MAX_PORT)
+	{
+		return 1;
+	}
+
+	/* Otherwise propagate the last hfi_get_port_lid() result. */
+	return rv;
+}
+
+/* get the number of contexts from the unit id. */
+/* Returns 0 if no unit or no match. */
+int hfi_get_num_contexts(int unit_id)
+{
+	int n = 0;
+	int units;
+	int64_t val;
+	uint32_t p = HFI_MIN_PORT;
+
+	units = hfi_get_num_units();
+
+	if_pf(units <= 0)
+		return 0;
+
+	if (unit_id == HFI_UNIT_ID_ANY) {
+		uint32_t u;
+
+		/* Sum "nctxts" over every unit that has at least one port
+		 * with a positive LID. */
+		for (u = 0; u < units; u++) {
+			for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++)
+				if (hfi_get_port_lid(u, p) > 0)
+					break;
+
+			/* p <= HFI_MAX_PORT means the loop broke early,
+			 * i.e. an active port was found. */
+			if (p <= HFI_MAX_PORT &&
+			    !hfi_sysfs_unit_read_s64(u, "nctxts", &val, 0))
+				n += (uint32_t) val;
+		}
+	} else {
+		/* Single unit: same check, just for unit_id. */
+		for (; p <= HFI_MAX_PORT; p++)
+			if (hfi_get_port_lid(unit_id, p) > 0)
+				break;
+
+		if (p <= HFI_MAX_PORT &&
+		    !hfi_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0))
+			n += (uint32_t) val;
+	}
+
+	return n;
+}
+
+/* Given a unit number and port number, returns 1 if the unit and port are active.
+   returns 0 if the unit and port are not active.
+   returns -1 when an error occurred. */
+int hfi_get_port_active(int unit, int port)
+{
+	int ret;
+	char *state;
+
+	ret = hfi_sysfs_port_read(unit, port, "phys_state", &state);
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		else
+			_HFI_DBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		return -1;
+	} else {
+		/* sysfs reports the physical state as a string; only the
+		 * exact prefix "5: LinkUp" counts as active. */
+		if (strncmp(state, "5: LinkUp", 9)) {
+			_HFI_DBG("Link is not Up for unit %u:%u\n", unit, port);
+			free(state);
+			return 0;
+		}
+		free(state);
+		return 1;
+	}
+}
+
+/* Given the unit number, return an error, or the corresponding LID
+   For now, it's used only so the MPI code can determine it's own
+   LID, and which other LIDs (if any) are also assigned to this node
+   Returns an int, so -1 indicates an error.  0 may indicate that
+   the unit is valid, but no LID has been assigned.
+   Returns -2 when the port is not active.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+int hfi_get_port_lid(int unit, int port)
+{
+	int ret;
+	int64_t val;
+
+	if (hfi_get_port_active(unit,port) != 1)
+		return -2;
+	ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0);
+	_HFI_VDBG("hfi_get_port_lid: ret %d, unit %d port %d\n", ret, unit,
+		  port);
+
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG("Failed to get LID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get LID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		ret = val;
+
+/* disable this feature since we don't have a way to provide
+   file descriptor in multiple context case. */
+#if 0
+		if (getenv("HFI_DIAG_LID_LOOP")) {
+			/* provides diagnostic ability to run MPI, etc. even */
+			/* on loopback, by claiming a different LID for each context */
+			struct hfi1_ctxt_info info;
+			struct hfi1_cmd cmd;
+			cmd.type = PSMI_HFI_CMD_CTXT_INFO;
+			cmd.cmd.ctxt_info = (uintptr_t) &info;
+			if (__hfi_lastfd == -1)
+				_HFI_INFO
+				    ("Can't run CONTEXT_INFO for lid_loop, fd not set\n");
+			else if (write(__hfi_lastfd, &cmd, sizeof(cmd)) == -1)
+				_HFI_INFO("CONTEXT_INFO command failed: %s\n",
+					  strerror(errno));
+			else if (!info.context)
+				_HFI_INFO("CONTEXT_INFO returned context 0!\n");
+			else {
+				_HFI_PRDBG
+				    ("Using lid 0x%x, base %x, context %x\n",
+				     ret + info.context, ret, info.context);
+				ret += info.context;
+			}
+		}
+#endif
+	}
+
+	return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding GID
+   For now, it's used only so the MPI code can determine its fabric ID.
+   Returns an int, so -1 indicates an error.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+int hfi_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo)
+{
+	int ret;
+	char *gid_str = NULL;
+
+	ret = hfi_sysfs_port_read(unit, port, "gids/0", &gid_str);
+
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single
+			 * port chips */
+			_HFI_VDBG("Failed to get GID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get GID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		/* The GID is formatted as eight colon-separated 16-bit hex
+		 * groups; pack groups 0-3 into *hi and 4-7 into *lo. */
+		int gid[8];
+		if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x",
+			   &gid[0], &gid[1], &gid[2], &gid[3],
+			   &gid[4], &gid[5], &gid[6], &gid[7]) != 8) {
+			_HFI_DBG("Failed to parse GID for unit %u:%u: %s\n",
+				 unit, port, gid_str);
+			ret = -1;
+		} else {
+			*hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1])
+							     << 32) |
+			    (((uint64_t)
+			      gid[2]) << 16) | (((uint64_t) gid[3]) << 0);
+			*lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5])
+							     << 32) |
+			    (((uint64_t)
+			      gid[6]) << 16) | (((uint64_t) gid[7]) << 0);
+		}
+		free(gid_str);
+	}
+
+	return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding LMC value
+ for the port */
+/* Returns an int, so -1 indicates an error. 0 */
+int hfi_get_port_lmc(int unit, int port)
+{
+ int ret;
+ int64_t val;
+
+ ret = hfi_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0);
+
+ if (ret == -1) {
+ _HFI_INFO("Failed to get LMC for unit %u:%u: %s\n",
+ unit, port, strerror(errno));
+ } else
+ ret = val;
+
+ return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding link rate
+   for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port)
+{
+	int ret;
+	double rate;
+	char *data_rate = NULL, *newptr;
+
+	ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate);
+	if (ret == -1)
+		goto get_port_rate_error;
+
+	rate = strtod(data_rate, &newptr);
+	if ((rate == 0) && (data_rate == newptr)) {
+		/* Bug fix: on a parse failure the old code leaked
+		 * 'data_rate' and returned the (non-negative) read result
+		 * instead of an error. */
+		ret = -1;
+		goto get_port_rate_error;
+	}
+
+	free(data_rate);
+	/* Truncate to a whole number of Gb/s (drops any .5 fraction). */
+	return ((int)(rate * 2) >> 1);
+
+get_port_rate_error:
+	free(data_rate);	/* free(NULL) is a harmless no-op */
+	_HFI_INFO("Failed to get link rate for unit %u:%u: %s\n",
+		  unit, port, strerror(errno));
+
+	return ret;
+}
+
+/* Look up the SM-programmed SC for the given SL on unit:port.
+ * Returns the SC value, or -1 on error. */
+int hfi_get_port_sl2sc(int unit, int port, int sl)
+{
+	char attr[16];
+	int64_t sc_val;
+	int rc;
+
+	snprintf(attr, sizeof(attr), "sl2sc/%d", sl);
+	rc = hfi_sysfs_port_read_s64(unit, port, attr, &sc_val, 0);
+	if (rc != -1)
+		return (int)sc_val;
+
+	_HFI_DBG
+	    ("Failed to get SL2SC mapping for SL %d unit %u:%u: %s\n",
+	     sl, unit, port, strerror(errno));
+	return rc;
+}
+
+/* Given a unit, port and SC, return an error, or the corresponding VL for the
+   SC as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sc2vl(int unit, int port, int sc)
+{
+	int ret;
+	int64_t val;
+	/* NOTE(review): 16 bytes holds "sc2vl/" plus up to 9 digits;
+	 * larger sc values would be truncated -- confirm table bounds. */
+	char sc2vlpath[16];
+
+	snprintf(sc2vlpath, sizeof(sc2vlpath), "sc2vl/%d", sc);
+	ret = hfi_sysfs_port_read_s64(unit, port, sc2vlpath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get SC2VL mapping for SC %d unit %u:%u: %s\n",
+		     sc, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* Given a unit, port and VL, return an error, or the corresponding MTU for the
+   VL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_vl2mtu(int unit, int port, int vl)
+{
+	int ret;
+	int64_t val;
+	/* Attribute name buffer, e.g. "vl2mtu/3". */
+	char vl2mtupath[16];
+
+	snprintf(vl2mtupath, sizeof(vl2mtupath), "vl2mtu/%d", vl);
+	ret = hfi_sysfs_port_read_s64(unit, port, vl2mtupath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get VL2MTU mapping for VL %d unit %u:%u: %s\n",
+		     vl, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* Given a unit, port and index, return an error, or the corresponding pkey
+   value for the index as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_index2pkey(int unit, int port, int index)
+{
+	int ret;
+	int64_t val;
+	/* Attribute name buffer, e.g. "pkeys/0". */
+	char index2pkeypath[16];
+
+	snprintf(index2pkeypath, sizeof(index2pkeypath), "pkeys/%d", index);
+	ret = hfi_sysfs_port_read_s64(unit, port, index2pkeypath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get index2pkey mapping for index %d unit %u:%u: %s\n",
+		     index, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* These have been fixed to read the values, but they are not
+ * compatible with the hfi driver, they return new info with
+ * the qib driver
+ */
+/* Count newline-terminated names in the NUL-terminated buffer; a
+ * trailing fragment without '\n' is not counted. */
+static int hfi_count_names(const char *namep)
+{
+	int count = 0;
+	const char *p;
+
+	for (p = namep; *p != '\0'; p++)
+		if (*p == '\n')
+			count++;
+	return count;
+}
+
+/* Fetch the driver statistics name list into *namep (caller frees) and
+ * return the number of names, or -1 on read failure. */
+int hfi_get_stats_names(char **namep)
+{
+	int rc = hfi_hfifs_read("driver_stats_names", namep);
+
+	if (rc < 0)
+		return -1;
+	return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' 64-bit driver statistics into 's'; returns how
+ * many counters were actually read, or -1 on failure. */
+int hfi_get_stats(uint64_t *s, int nelem)
+{
+	int nbytes = hfi_hfifs_rd("driver_stats", s, nelem * sizeof(*s));
+
+	if (nbytes < 0)
+		return -1;
+	return nbytes / sizeof(*s);
+}
+
+/* Fetch the per-unit counter name list into *namep (caller frees);
+ * returns the number of names, or -1 on read failure. */
+int hfi_get_ctrs_unit_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "counter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' per-unit 64-bit counters into 'c'; returns how
+ * many were actually read, or -1 on failure. */
+int hfi_get_ctrs_unit(int unitno, uint64_t *c, int nelem)
+{
+	int i;
+	i = hfi_hfifs_unit_rd(unitno, "counters", c, nelem * sizeof(*c));
+	if (i < 0)
+		return -1;
+	else
+		return i / sizeof(*c);
+}
+
+/* Fetch the per-port counter name list into *namep (caller frees);
+ * returns the number of names, or -1 on read failure. */
+int hfi_get_ctrs_port_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "portcounter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' 64-bit counters for one port of a unit into 'c';
+ * returns how many counters were actually read, or -1 on failure. */
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem)
+{
+	char attr[32];
+	int nbytes;
+
+	/* The per-port counter attribute is named "port<N>counters". */
+	snprintf(attr, sizeof(attr), "port%dcounters", port);
+	nbytes = hfi_hfifs_unit_rd(unitno, attr, c, nelem * sizeof(*c));
+	if (nbytes < 0)
+		return -1;
+	return nbytes / sizeof(*c);
+}
+
+/* Read the binary congestion-control settings for unit:port into ccabuf.
+ * Returns 1 on success, 0 when the attribute is absent or short (caller
+ * falls back to static CCA settings). */
+int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf)
+{
+	int fd;
+	size_t count;
+/*
+ * Check qib driver CCA setting, and try to use it if available.
+ * Fall to self CCA setting if errors.
+ */
+	/* NOTE(review): the attribute path is sprintf'ed into the caller's
+	 * ccabuf with no bound, and 198 payload bytes are then read over
+	 * it -- callers must size ccabuf for both; confirm at call sites. */
+	sprintf(ccabuf, HFI_CLASS_PATH "_%d/ports/%d/CCMgtA/cc_settings_bin",
+		unit, port);
+	fd = open(ccabuf, O_RDONLY);
+	if (fd < 0) {
+		return 0;
+	}
+	/*
+	 * 4 bytes for 'control map'
+	 * 2 bytes 'port control'
+	 * 32 (#SLs) * 6 bytes 'congestion setting' (per-SL)
+	 */
+	count = 4 + 2 + (32 * 6);
+	if (read(fd, ccabuf, count) != count) {
+		_HFI_CCADBG("Read cc_settings_bin failed. using static CCA\n");
+		close(fd);
+		return 0;
+	}
+
+	close(fd);
+
+	return 1;
+}
+
+/* Read the binary congestion-control table for unit:port into a
+ * malloc'ed array returned via *cctp (caller frees on success).
+ * Returns ccti_limit (the table's top index, >= 63) on success,
+ * 0 to fall back to static CCA, or -1 on allocation failure. */
+int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp)
+{
+	int i;
+	unsigned short ccti_limit;
+	uint16_t *cct;
+	int fd;
+	char pathname[256];
+
+	*cctp = NULL;
+	sprintf(pathname, HFI_CLASS_PATH "_%d/ports/%d/CCMgtA/cc_table_bin",
+		unit, port);
+	fd = open(pathname, O_RDONLY);
+	if (fd < 0) {
+		_HFI_CCADBG("Open cc_table_bin failed. using static CCA\n");
+		return 0;
+	}
+	/* The file starts with the 16-bit top index of the table. */
+	if (read(fd, &ccti_limit, sizeof(ccti_limit)) != sizeof(ccti_limit)) {
+		_HFI_CCADBG("Read ccti_limit failed. using static CCA\n");
+		close(fd);
+		return 0;
+	}
+
+	_HFI_CCADBG("ccti_limit = %d\n", ccti_limit);
+
+	if (ccti_limit < 63) {
+		_HFI_CCADBG("Read ccti_limit %d not in range [63, 65535], "
+			    "using static CCA.\n", ccti_limit);
+		close(fd);
+		return 0;
+	}
+
+	/* ccti_limit is the last valid index, so the table holds
+	 * ccti_limit + 1 16-bit entries. */
+	i = (ccti_limit + 1) * sizeof(uint16_t);
+	cct = malloc(i);
+	if (!cct) {
+		close(fd);
+		return -1;
+	}
+	if (read(fd, cct, i) != i) {
+		_HFI_CCADBG("Read ccti_entry_list, using static CCA\n");
+		free(cct);
+		close(fd);
+		return 0;
+	}
+
+	close(fd);
+
+	_HFI_CCADBG("cct[0] = 0x%04x\n", cct[0]);
+
+	*cctp = cct;
+	return ccti_limit;
+}
+
+/*
+ * This is for diag function hfi_wait_for_packet() only
+ */
+int hfi_cmd_wait_for_packet(int fd)
+{
+	/* Block up to half a second waiting for inbound data on the fd. */
+	struct pollfd pollset = { .fd = fd, .events = POLLIN };
+
+	return poll(&pollset, 1, 500 /* ms */);
+}
diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c
new file mode 100644
index 0000000..f0cec91
--- /dev/null
+++ b/opa/opa_sysfs.c
@@ -0,0 +1,854 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains a simple sysfs interface used by the low level
+ hfi protocol code. It also implements the interface to hfifs. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "opa_service.h"
+
+/* Cached sysfs/hfifs locations, resolved once in sysfs_init(). */
+static char *sysfs_path;
+static size_t sysfs_path_len;
+static char *hfifs_path;
+/* System page size; sysfs attributes are read/written a page at a time. */
+static long sysfs_page_size;
+
+/* Runs before main(): resolve the sysfs and hfifs base paths
+ * (overridable via HFI_SYSFS_PATH / HFI_HFIFS_PATH environment
+ * variables) and cache the system page size. */
+static void __attribute__ ((constructor)) sysfs_init(void)
+{
+	struct stat s;
+	if (sysfs_path == NULL)
+		sysfs_path = getenv("HFI_SYSFS_PATH");
+	if (sysfs_path == NULL) {
+		/* Default: the class directory of unit 0. */
+		static char syspath[64];
+		snprintf(syspath, sizeof(syspath), "%s_%d", HFI_CLASS_PATH, 0);
+		sysfs_path = syspath;
+	}
+	/* A missing directory is only logged; later opens will fail. */
+	if (stat(sysfs_path, &s) || !S_ISDIR(s.st_mode))
+		_HFI_DBG("Did not find sysfs directory %s, using anyway\n",
+			 sysfs_path);
+	sysfs_path_len = strlen(sysfs_path);
+
+	if (hfifs_path == NULL)
+		hfifs_path = getenv("HFI_HFIFS_PATH");
+	if (hfifs_path == NULL)
+		hfifs_path = "/hfifs";
+
+	if (!sysfs_page_size)
+		sysfs_page_size = sysconf(_SC_PAGESIZE);
+}
+
+/* Accessor for the sysfs base directory chosen by sysfs_init(). */
+const char *hfi_sysfs_path(void)
+{
+	return sysfs_path;
+}
+
+/* Length of the cached sysfs base path (computed once in sysfs_init()). */
+size_t hfi_sysfs_path_len(void)
+{
+	return sysfs_path_len;
+}
+
+/* Accessor for the hfifs base directory chosen by sysfs_init(). */
+const char *hfi_hfifs_path(void)
+{
+	return hfifs_path;
+}
+
+/* stat() the named attribute under the sysfs base path; the return
+ * value and *sbuf come straight from stat(). */
+int hfi_sysfs_stat(const char *attr,struct stat *sbuf)
+{
+	char path[1024];
+
+	snprintf(path, sizeof(path), "%s/%s", hfi_sysfs_path(), attr);
+	return stat(path,sbuf);
+}
+
+/* Open the named attribute under the sysfs base path with 'flags'.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_open(const char *attr, int flags)
+{
+	char buf[1024];
+	int saved_errno;
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%s", hfi_sysfs_path(), attr);
+	fd = open(buf, flags);
+	/* Save errno across the debug logging below. */
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open the named attribute under the hfifs base path with 'flags'.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_hfifs_open(const char *attr, int flags)
+{
+	char buf[1024];
+	int saved_errno;
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%s", hfi_hfifs_path(), attr);
+	fd = open(buf, flags);
+	/* Save errno across the debug logging below. */
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Format 'fmt' into a page-sized stack buffer and write it to 'fd' in a
+ * single write(2).  Returns the byte count written, or -1 with errno set
+ * (EINVAL on truncation/encoding error, EAGAIN on a short write). */
+static int sysfs_vprintf(int fd, const char *fmt, va_list ap)
+{
+	char *buf;
+	int len, ret;
+	int saved_errno;
+
+	buf = alloca(sysfs_page_size);
+	len = vsnprintf(buf, sysfs_page_size, fmt, ap);
+
+	/* Bug fix: vsnprintf returns the length the output *would* have
+	 * had (excluding the NUL), so len == sysfs_page_size also means
+	 * truncation -- the old '>' test missed that case and then wrote
+	 * the truncated buffer.  A negative len signals an encoding
+	 * error and must not be passed to write(). */
+	if (len < 0 || len >= sysfs_page_size) {
+		_HFI_DBG("Attempt to write more (%d) than %ld bytes\n", len,
+			 sysfs_page_size);
+		saved_errno = EINVAL;
+		ret = -1;
+		goto bail;
+	}
+
+	ret = write(fd, buf, len);
+	saved_errno = errno;
+
+	if (ret != -1 && ret < len) {
+		_HFI_DBG("Write ran short (%d < %d)\n", ret, len);
+		saved_errno = EAGAIN;
+		ret = -1;
+	}
+
+bail:
+	errno = saved_errno;
+	return ret;
+}
+
+/* printf-style write to a sysfs driver attribute.  Returns the byte
+ * count written, or -1 with errno preserved from the failing call. */
+int hfi_sysfs_printf(const char *attr, const char *fmt, ...)
+{
+	int fd = -1;
+	va_list ap;
+	int ret = -1;
+	int saved_errno;
+
+	fd = hfi_sysfs_open(attr, O_WRONLY);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		goto bail;
+	}
+
+	va_start(ap, fmt);
+	ret = sysfs_vprintf(fd, fmt, ap);
+	saved_errno = errno;
+	va_end(ap);
+
+	if (ret == -1) {
+		_HFI_DBG("Failed to write to driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+	}
+
+bail:
+	if (fd != -1)
+		close(fd);
+
+	errno = saved_errno;
+	return ret;
+}
+
+/* Open attribute 'attr' of a specific unit.  The cached sysfs path ends
+ * in a unit number (e.g. ".../hfi1_0"); the trailing digits are stripped
+ * and the requested unit number spliced in instead.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	int len, l;
+
+	snprintf(buf, sizeof(buf), "%s", hfi_sysfs_path());
+	/* Walk back over the trailing unit digits of the base path. */
+	len = l = strlen(buf) - 1;
+	while (l > 0 && isdigit(buf[l]))
+		l--;
+	if (l)
+		buf[++l] = 0;
+	else
+		l = len;	/* assume they know what they are doing */
+	snprintf(buf + l, sizeof(buf) - l, "%u/%s", unit, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open the numa_node attribute of 'unit', located at
+ * <parent-of-sysfs-path>/hfi1_<unit>/device/numa_node.
+ * Returns the fd, or -1 with errno preserved. */
+static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	/* dirname() may modify its argument, so work on a private copy. */
+	char *path_copy = strdup(hfi_sysfs_path());
+
+	/* Bug fix: guard against strdup() failure -- dirname(NULL) would
+	 * crash. */
+	if (path_copy == NULL) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, strerror(errno));
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "%s/hfi1_%u/device/numa_node",
+		 dirname(path_copy), unit);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	/* Bug fix: the strdup'ed copy was previously leaked on every call. */
+	free(path_copy);
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open attribute 'attr' of a specific unit and port: the trailing unit
+ * digits of the cached sysfs path are replaced, then "ports/<port>/" is
+ * appended.  Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr,
+			int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	int len, l;
+
+	snprintf(buf, sizeof(buf), "%s", hfi_sysfs_path());
+	/* Walk back over the trailing unit digits of the base path. */
+	len = l = strlen(buf) - 1;
+	while (l > 0 && isdigit(buf[l]))
+		l--;
+	if (l)
+		buf[++l] = 0;
+	else
+		l = len;	/* assume they know what they are doing */
+	snprintf(buf + l, sizeof(buf) - l, "%u/ports/%u/%s", unit, port, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n",
+			 attr, unit, port, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open attribute 'attr' of 'unit' under the hfifs base path
+ * (<hfifs>/<unit>/<attr>).  Returns the fd, or -1 with errno preserved. */
+int hfi_hfifs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%u/%s", hfi_hfifs_path(), unit, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
/*
 * printf-style write to sysfs port attribute 'attr' of unit:port.
 * Returns the sysfs_vprintf() result, or -1 if the attribute could not
 * be opened or written; errno reflects the failing call.
 */
int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
			  const char *fmt, ...)
{
	int result = -1;
	int err;
	int fd;
	va_list args;

	fd = hfi_sysfs_port_open(unit, port, attr, O_WRONLY);
	err = errno;
	if (fd != -1) {
		va_start(args, fmt);
		result = sysfs_vprintf(fd, fmt, args);
		err = errno;
		va_end(args);

		if (result == -1)
			_HFI_DBG("Failed to write to attribute '%s' of unit %d: %s\n",
				 attr, unit, strerror(errno));

		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * printf-style write to sysfs attribute 'attr' of unit 'unit'.
 * Returns the sysfs_vprintf() result, or -1 if the attribute could not
 * be opened or written; errno reflects the failing call.
 */
int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...)
{
	int result = -1;
	int err;
	int fd;
	va_list args;

	fd = hfi_sysfs_unit_open(unit, attr, O_WRONLY);
	err = errno;
	if (fd != -1) {
		va_start(args, fmt);
		result = sysfs_vprintf(fd, fmt, args);
		err = errno;
		va_end(args);

		if (result == -1)
			_HFI_DBG("Failed to write to attribute '%s' of unit %d: %s\n",
				 attr, unit, strerror(errno));

		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Read up to one sysfs page from 'fd' into a freshly malloc'ed buffer.
 * On success (return >= 0) *datap owns the buffer and the caller must
 * free it; on failure the buffer is released and *datap is left
 * untouched.  Returns the read() byte count, or -1 with errno set.
 *
 * NOTE(review): the buffer is NOT NUL-terminated here.  Callers such as
 * the *_read_s64 helpers run strtoll() over it, which relies on every
 * attribute being shorter than sysfs_page_size — TODO confirm that all
 * attributes read through this path guarantee that.
 */
static int read_page(int fd, char **datap)
{
	char *data = NULL;
	int saved_errno;
	int ret = -1;

	data = malloc(sysfs_page_size);
	saved_errno = errno;	/* only meaningful on the failure path */

	if (!data) {
		_HFI_DBG("Could not allocate memory: %s\n", strerror(errno));
		goto bail;
	}

	ret = read(fd, data, sysfs_page_size);
	saved_errno = errno;

	if (ret == -1) {
		_HFI_DBG("Read of attribute failed: %s\n", strerror(errno));
		goto bail;
	}

bail:
	if (ret == -1) {
		free(data);	/* free(NULL) is a no-op on the malloc-failure path */
	} else {
		*datap = data;
	}

	errno = saved_errno;
	return ret;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' into a freshly allocated page-sized buffer.
 * On success the caller owns and must free *datap; on failure *datap is
 * NULL and -1 is returned, with errno from the failing call.
 */
int hfi_sysfs_read(const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' of unit 'unit' into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' of unit:port into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
			char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+int hfi_sysfs_unit_write(uint32_t unit, const char *attr, const void *data,
+ size_t len)
+{
+ int fd = -1, ret = -1;
+ int saved_errno;
+
+ if (len > sysfs_page_size) {
+ _HFI_DBG("Attempt to write more (%ld) than %ld bytes\n",
+ (long)len, sysfs_page_size);
+ saved_errno = EINVAL;
+ goto bail;
+ }
+
+ fd = hfi_sysfs_unit_open(unit, attr, O_WRONLY);
+ saved_errno = errno;
+
+ if (fd == -1)
+ goto bail;
+
+ ret = write(fd, data, len);
+ saved_errno = errno;
+
+ if (ret == -1) {
+ _HFI_DBG("Attempt to write %ld bytes failed: %s\n",
+ (long)len, strerror(errno));
+ goto bail;
+ }
+
+ if (ret < len) {
+ /* sysfs routines can routine count including null byte
+ so don't return an error if it's > len */
+ _HFI_DBG
+ ("Attempt to write %ld bytes came up short (%ld bytes)\n",
+ (long)len, (long)ret);
+ saved_errno = EAGAIN;
+ ret = -1;
+ }
+
+bail:
+ if (fd != -1) {
+ close(fd);
+ }
+
+ errno = saved_errno;
+ return ret;
+}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read hfifs attribute 'attr' into a freshly allocated page-sized buffer.
 * On success the caller owns and must free *datap; on failure *datap is
 * NULL and -1 is returned with errno set by the failing call.
 */
int hfi_hfifs_read(const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read hfifs attribute 'attr' of unit 'unit' into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
/*
 * The _rd routines read directly into a supplied buffer,
 * unlike the _read routines.
 */
/*
 * Read up to n bytes of hfifs attribute 'attr' directly into the caller's
 * buffer (no allocation).  Returns the read() byte count or -1; errno
 * reflects the failing call.
 */
int hfi_hfifs_rd(const char *attr, void *buf, int n)
{
	int result = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		result = read(fd, buf, n);
		err = errno;
		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Read up to n bytes of hfifs attribute 'attr' of unit 'unit' directly
 * into the caller's buffer.  Returns the read() byte count or -1; errno
 * reflects the failing call.
 */
int hfi_hfifs_unit_rd(uint32_t unit, const char *attr, void *buf, int n)
{
	int result = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		result = read(fd, buf, n);
		err = errno;
		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Write 'len' bytes of 'data' to hfifs attribute 'attr' of unit 'unit'.
 * Returns the byte count written, or -1 with errno set (EAGAIN for a
 * short or over-long write report).
 */
int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data,
			 size_t len)
{
	int nwritten = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_WRONLY);
	err = errno;
	if (fd == -1)
		goto done;

	nwritten = write(fd, data, len);
	err = errno;

	if (nwritten == -1) {
		_HFI_DBG("Attempt to write %ld bytes failed: %s\n",
			 (long)len, strerror(errno));
	} else if (nwritten != len) {
		_HFI_DBG
		    ("Attempt to write %ld bytes came up short (%ld bytes)\n",
		     (long)len, (long)nwritten);
		err = EAGAIN;
		nwritten = -1;
	}

done:
	if (fd != -1)
		close(fd);

	errno = err;
	return nwritten;
}
+
/*
 * Read sysfs attribute 'attr' and parse it as a signed 64-bit integer in
 * the given 'base' (0 = auto-detect).  Stores the value in *valp and
 * returns 0 on success, -1 on read or parse failure.
 */
int hfi_sysfs_read_s64(const char *attr, int64_t *valp, int base)
{
	char *data, *end;
	int ret;
	int saved_errno;
	long long val;

	ret = hfi_sysfs_read(attr, &data);
	saved_errno = errno;

	if (ret == -1) {
		goto bail;
	}

	/* strtoll only sets errno on failure; clear it first so a stale
	   value cannot be mistaken for ERANGE */
	errno = 0;
	val = strtoll(data, &end, base);
	saved_errno = errno;

	/* reject empty input, trailing garbage, and out-of-range values
	   (the original never checked ERANGE, silently accepting clamped
	   LLONG_MIN/LLONG_MAX results) */
	if (!*data || !(*end == '\0' || isspace(*end)) || saved_errno == ERANGE) {
		ret = -1;
		goto bail;
	}

	*valp = val;
	ret = 0;

bail:
	free(data);
	errno = saved_errno;
	return ret;
}
+
/*
 * Read sysfs attribute 'attr' of unit 'unit' and parse it as a signed
 * 64-bit integer in 'base'.  Stores the value in *valp and returns 0 on
 * success, -1 on read or parse failure.
 */
int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
			    int64_t *valp, int base)
{
	char *text = NULL;
	char *stop;
	long long parsed;
	int err;
	int rc;

	rc = hfi_sysfs_unit_read(unit, attr, &text);
	err = errno;
	if (rc == -1)
		goto out;

	parsed = strtoll(text, &stop, base);
	err = errno;

	/* accept only a non-empty number followed by NUL or whitespace */
	if (*text && (*stop == '\0' || isspace(*stop))) {
		*valp = parsed;
		rc = 0;
	} else {
		rc = -1;
	}

out:
	free(text);
	errno = err;
	return rc;
}
+
/*
 * Read the numa_node attribute of unit 'unit' into a freshly allocated
 * buffer.  On success the caller owns and must free *datap; on failure
 * *datap is NULL and -1 is returned with errno set.
 */
static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap)
{
	int result = -1;
	int pending_errno;
	int fd;

	fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY);
	pending_errno = errno;

	if (fd != -1) {
		result = read_page(fd, datap);
		if (result == -1)
			*datap = NULL;
		pending_errno = errno;
		close(fd);
	}

	errno = pending_errno;
	return result;
}
+
/*
 * Return the NUMA node of unit 'unit' as parsed from sysfs, or -1 on
 * read/parse failure.  On a read failure the caller sees the pre-call
 * errno, matching the historical behavior.
 */
int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit)
{
	int64_t node_id = -1;
	char *text = NULL;
	char *stop;
	long long parsed;
	int restore_errno = errno;

	if (hfi_sysfs_unit_read_node(unit, &text) != -1) {
		parsed = strtoll(text, &stop, 0);
		restore_errno = errno;
		/* accept only a non-empty number ending in NUL/whitespace */
		if (*text && (*stop == '\0' || isspace(*stop)))
			node_id = (int64_t) parsed;
	}

	free(text);
	errno = restore_errno;
	return node_id;
}
+
/*
 * Read sysfs attribute 'attr' of unit:port and parse it as a signed
 * 64-bit integer in 'base'.  Stores the value in *valp and returns 0 on
 * success, -1 on read or parse failure.
 */
int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
			    int64_t *valp, int base)
{
	char *data, *end;
	int saved_errno;
	long long val;
	int ret;

	ret = hfi_sysfs_port_read(unit, port, attr, &data);
	saved_errno = errno;

	if (ret == -1) {
		goto bail;
	}

	/* strtoll only sets errno on failure; clear it first so a stale
	   value cannot be mistaken for ERANGE */
	errno = 0;
	val = strtoll(data, &end, base);
	saved_errno = errno;

	/* reject empty input, trailing garbage, and out-of-range values
	   (the original never checked ERANGE) */
	if (!*data || !(*end == '\0' || isspace(*end)) || saved_errno == ERANGE) {
		ret = -1;
		goto bail;
	}

	*valp = val;
	ret = 0;

bail:
	free(data);
	errno = saved_errno;
	return ret;
}
diff --git a/opa/opa_syslog.c b/opa/opa_syslog.c
new file mode 100644
index 0000000..ccd39c5
--- /dev/null
+++ b/opa/opa_syslog.c
@@ -0,0 +1,113 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "opa_user.h"
+
+#define SYSLOG_MAXLEN 512
+
+extern char *__hfi_mylabel;
+
+void
+hfi_vsyslog(const char *prefix, int to_console, int level,
+ const char *format, va_list ap)
+{
+ char logprefix[SYSLOG_MAXLEN];
+ size_t len;
+
+ if (to_console) {
+ char hostname[80];
+ va_list ap_cons;
+ va_copy(ap_cons, ap);
+ len = strlen(format);
+ gethostname(hostname, sizeof(hostname));
+ hostname[sizeof(hostname) - 1] = '\0';
+
+ if (__hfi_mylabel)
+ fprintf(stderr, "%s", __hfi_mylabel);
+ else
+ fprintf(stderr, "%s: ", hostname);
+
+ vfprintf(stderr, format, ap_cons);
+ if (format[len] != '\n')
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ va_end(ap_cons);
+ }
+
+ len = snprintf(logprefix, sizeof(logprefix),
+ "(hfi/%s)[%d]: %s", prefix ? prefix : "hfi",
+ (int)getpid(), format);
+
+ vsyslog(level | LOG_USER, logprefix, ap);
+
+ return;
+}
+
/*
 * printf-style front end for hfi_vsyslog(); see that function for the
 * prefix/console/level semantics.
 */
void
hfi_syslog(const char *prefix, int to_console, int level,
	   const char *format, ...)
{
	va_list args;

	va_start(args, format);
	hfi_vsyslog(prefix, to_console, level, format, args);
	va_end(args);
}
diff --git a/opa/opa_time.c b/opa/opa_time.c
new file mode 100644
index 0000000..1b636ed
--- /dev/null
+++ b/opa/opa_time.c
@@ -0,0 +1,284 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "opa_user.h"
+
+/* init the cycle counter to picosecs/cycle conversion automatically */
+/* at program startup, if it's using timing functions. */
+static void init_picos_per_cycle(void) __attribute__ ((constructor));
+static int hfi_timebase_isvalid(uint32_t pico_per_cycle);
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle);
+
+/* in case two of our mechanisms fail */
+#define SAFEDEFAULT_PICOS_PER_CYCLE 500
+
+uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+
+/* This isn't perfect, but it's close enough for rough timing. We want this
+ to work on systems where the cycle counter isn't the same as the clock
+ frequency.
+ __hfi_pico_per_cycle isn't going to lead to completely accurate
+ conversions from timestamps to nanoseconds, but it's close enough for
+ our purposes, which is mainly to allow people to show events with nsecs
+ or usecs if desired, rather than cycles. We use it in some performance
+ analysis, but it has to be done with care, since cpuspeed can change,
+ different cpu's can have different speeds, etc.
+
+ Some architectures don't have their TSC-equivalent running at anything
+ related to the processor speed (e.g. G5 Power systems use a fixed
+ 33 MHz frequency).
+*/
+
+#define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */
+
+static int timebase_debug; /* off by default */
+
+#define timebase_warn_always(fmt, ...) \
+ hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__)
+#define timebase_warn(fmt, ...) if (timebase_debug) \
+ timebase_warn_always(fmt, ##__VA_ARGS__)
+
/*
 * Sanity-check a measured picoseconds-per-cycle value.  On x86 we accept
 * roughly 1 GHz .. 5 GHz clocks (198..1005 ps/cycle, with slop for HPET
 * jitter); on other architectures no value is considered valid.
 */
static int hfi_timebase_isvalid(uint32_t pico_per_cycle)
{
#if defined(__x86_64__) || defined(__i386__)
	/* If pico-per-cycle is less than 200, the clock speed would be greater
	 * than 5 GHz.  Similarly, we minimally support a 1GHz clock.
	 * Allow some slop, because newer kernels with HPET can be a few
	 * units off, and we don't want to spend the startup time needlessly */
	if (pico_per_cycle >= 198 && pico_per_cycle <= 1005)
		return 1;
#endif
	/* The original placed an 'else' after the #endif, leaving a dangling
	   'else' that fails to compile on non-x86 targets. */
	return 0;
}
+
+/*
+ * Method #1:
+ *
+ * Derive the pico-per-cycle by trying to correlate the difference between two
+ * reads of the tsc counter to gettimeofday.
+ */
static void init_picos_per_cycle()
{
	struct timeval tvs, tve;
	int64_t usec = 0;	/* despite the name, accumulates picoseconds */
	uint64_t ts, te;
	int64_t delta;
	uint32_t picos = 0;
	int trials = 0;
	int retry = 0;
	cpu_set_t cpuset, cpuset_saved;
	int have_cpuset = 1;

	/*
	 * Make sure we try to calculate the cycle time without being migrated.
	 */
	CPU_ZERO(&cpuset_saved);
	if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved))
		have_cpuset = 0;
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset))
		have_cpuset = 0;

	/*
	 * If we set affinity correctly, give the scheduler another chance to put
	 * us on processor 0
	 */
	if (have_cpuset)
		sched_yield();

retry_pico_test:
	/* after 10 failed calibration attempts, fall back to /proc/cpuinfo */
	if (++retry == 10) {
		__hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos);
		goto reset_cpu_mask;	/* Reset CPU mask before exiting */
	}

	usec = 0;
	gettimeofday(&tvs, NULL);
	ts = get_cycles();
	while (usec < MIN_TEST_TIME_IN_PICOS) {	/* wait for at least 100 millisecs */
		trials++;
		usleep(125);
		gettimeofday(&tve, NULL);
		/* scale both components to picoseconds to compare against
		   MIN_TEST_TIME_IN_PICOS */
		usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) +
		    1000000000000LL * (tve.tv_sec - tvs.tv_sec);
		if (usec < 0) {
			/* wall clock stepped backwards; restart the trial */
			timebase_warn
			    ("RTC timebase, gettimeofday is negative (!) %lld\n",
			     (long long)usec);
			goto retry_pico_test;
		}
	}
	te = get_cycles();
	delta = te - ts;	/* elapsed cycles over the same interval */
	picos = (uint32_t) (usec / delta);

	if (!hfi_timebase_isvalid(picos)) {
		/* diagnose whether we were migrated off CPU 0 mid-test */
		cpu_set_t cpuget;
		int affinity_valid =
		    !sched_getaffinity(0, sizeof(cpuget), &cpuget);
		if (affinity_valid && !CPU_ISSET(0, &cpuget))
			affinity_valid = 0;
		timebase_warn
		    ("Failed to get valid RTC timebase, gettimeofday delta=%lld, "
		     "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n",
		     (long long)usec, (long long)delta, picos,
		     affinity_valid ? "YES" : "NO", retry);
		goto retry_pico_test;
	}

	/* If we've had to retry even once, let that be known */
	if (retry > 1)
		timebase_warn("Clock is %d picos/cycle found in %d trials and "
			      "%.3f seconds (retry=%d)\n", picos, trials,
			      (double)usec / 1.0e12, retry);

	__hfi_pico_per_cycle = picos;

reset_cpu_mask:
	/* Restore affinity */
	if (have_cpuset) {
		sched_setaffinity(0, sizeof(cpuset), &cpuset_saved);
		/*
		 * Give a chance to other processes that also set affinity to 0 for
		 * doing this test.
		 */
		sched_yield();
	}
}
+
+/*
+ * Method #2:
+ *
+ * Derive the pico-per-cycle from /proc instead of using sleep trick
+ * that relies on scheduler.
+ */
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle)
+{
+ /* we only validate once */
+ uint32_t new_pico_per_cycle = old_pico_per_cycle;
+
+ char hostname[80];
+ gethostname(hostname, 80);
+ hostname[sizeof(hostname) - 1] = '\0';
+
+ if (getenv("HFI_DEBUG_TIMEBASE"))
+ timebase_debug = 1;
+
+ /* If the old one is valid, don't bother with this mechanism */
+ if (hfi_timebase_isvalid(old_pico_per_cycle))
+ return old_pico_per_cycle;
+
+#if defined(__x86_64__) || defined(__i386__)
+ {
+ FILE *fp = fopen("/proc/cpuinfo", "r");
+ char input[255];
+ char *p = NULL;
+
+ if (!fp)
+ goto fail;
+
+ while (!feof(fp) && fgets(input, 255, fp)) {
+ if (strstr(input, "cpu MHz")) {
+ p = strchr(input, ':');
+ double MHz = 0.0;
+ if (p)
+ MHz = atof(p + 1);
+ new_pico_per_cycle =
+ (uint32_t) (1000000. / MHz);
+ break;
+ }
+ }
+ fclose(fp);
+ if (!p)
+ goto fail;
+ }
+#endif
+
+ /* If there's no change (within a small range), just return the old one */
+ if (abs(new_pico_per_cycle - old_pico_per_cycle) < 5)
+ return old_pico_per_cycle;
+
+ if (hfi_timebase_isvalid(new_pico_per_cycle)) {
+ timebase_warn_always
+ ("RTC timebase, using %d picos/cycle from /proc "
+ "instead of the detected %d picos/cycle\n",
+ new_pico_per_cycle, old_pico_per_cycle);
+ return new_pico_per_cycle;
+ }
+
+fail:
+ new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+ timebase_warn_always
+ ("Problem obtaining CPU time base, detected to be %d "
+ "pico/cycle, adjusted to safe default %d picos/cycle",
+ old_pico_per_cycle, new_pico_per_cycle);
+ return new_pico_per_cycle;
+}
diff --git a/opa/opa_utils.c b/opa/opa_utils.c
new file mode 100644
index 0000000..2b66b77
--- /dev/null
+++ b/opa/opa_utils.c
@@ -0,0 +1,425 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low */
+/* level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <time.h>
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "ipserror.h"
+#include "opa_user.h"
+
+/* keep track whether we disabled mmap in malloc */
+int __hfi_malloc_no_mmap = 0;
+
+/* touch the pages, with a 32 bit read */
/* touch the pages, with a 32 bit read, to fault them into memory */
void hfi_touch_mmap(void *m, size_t bytes)
{
	volatile uint32_t *b = (volatile uint32_t *)m, c;
	size_t i;		/* m is always page aligned, so pgcnt exact */
	long __hfi_pg_sz;

	/* First get the page size; fall back to 4 KiB if sysconf() fails so
	   the divisions below never see zero or a negative value */
	__hfi_pg_sz = sysconf(_SC_PAGESIZE);
	if (__hfi_pg_sz <= 0)
		__hfi_pg_sz = 4096;

	_HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n",
		  (unsigned long)bytes / __hfi_pg_sz, m);
	bytes /= sizeof(c);
	/* one 32-bit read per page suffices to fault the page in */
	for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c))
		c = b[i];
	(void)c;		/* value intentionally unused; the read is the point */
}
+
+/* flush the eager buffers, by setting the eager index head to eager index tail
+ if eager buffer queue is full.
+
+ Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR
+ was set in RHF errors), and no good eager packets were received, so
+ that eager head wasn't advanced. */
+
+void hfi_flush_egr_bufs(struct _hfi_ctrl *ctrl)
+{
+ uint64_t head = __le64_to_cpu(*ctrl->__hfi_rcvegrhead);
+ uint64_t tail = __le64_to_cpu(*ctrl->__hfi_rcvegrtail);
+
+ if ((head % ctrl->__hfi_tidegrcnt) ==
+ ((tail + 1) % ctrl->__hfi_tidegrcnt)) {
+ _HFI_DBG
+ ("eager array full after overflow, flushing (head %llx, tail %llx\n",
+ (long long)head, (long long)tail);
+ *ctrl->__hfi_rcvegrhead = __cpu_to_le64(tail);
+ }
+}
+
+/* stop_start == 0 disables receive on the context, for use in queue
+ overflow conditions. stop_start==1 re-enables, to be used to
+ re-init the software copy of the head register */
+int hfi_manage_rcvq(struct _hfi_ctrl *ctrl, uint32_t stop_start)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_RECV_CTRL;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) stop_start;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver */
+ _HFI_INFO("manage rcvq failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* ack event bits, and clear them. Usage is check *spi_sendbuf_status,
+ pass bits you are prepared to handle to hfi_event_ack(), perform the
+ appropriate actions for bits that were set, and then (if appropriate)
+ check the bits again. */
+int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_ACK_EVENT;
+ cmd.len = 0;
+ cmd.addr = ackbits;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver. */
+ _HFI_DBG("event ack failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* Tell the driver to change the way packets can generate interrupts.
+
+ HFI1_POLL_TYPE_URGENT: Generate interrupt only when packet sets
+ HFI_KPF_INTR
+ HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on).
+
+ PSM: Uses TYPE_URGENT in ips protocol
+*/
+int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_POLL_TYPE;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) poll_type;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver */
+ _HFI_INFO("poll type failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* set the send context pkey to check BTH pkey in each packet.
+ driver should check its pkey table to see if it can find
+ this pkey, if not, driver should return error. */
+int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_SET_PKEY;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) pkey;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL)
+ _HFI_INFO("set pkey failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
/* Tell the driver to reset the send context.  If the send context
   is halted, reset it; if not, return an error to the caller.
   After a context reset, the credit return should be reset to
   zero by a hardware credit-return DMA.
   The driver will return ENOLCK if the reset times out; in that
   case PSM needs to call again. */
int hfi_reset_context(struct _hfi_ctrl *ctrl)
{
	struct hfi1_cmd cmd;

	cmd.type = PSMI_HFI_CMD_CTXT_RESET;
	cmd.len = 0;
	cmd.addr = 0;

retry:
	if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
		/* ENOLCK means the reset timed out in the driver; retry.
		   NOTE(review): this retry is unbounded — a driver that keeps
		   returning ENOLCK would spin here forever.  Confirm whether
		   a retry cap or backoff is wanted. */
		if (errno == ENOLCK)
			goto retry;

		if (errno != EINVAL)
			_HFI_INFO("reset ctxt failed: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}
+
/* wait for a received packet for our context
   This allows us to not busy wait, if nothing has happened for a
   while, which allows better measurements of cpu utilization, and
   in some cases, slightly better performance.  Called where we would
   otherwise call sched_yield().  It is not guaranteed that a packet
   has arrived, so the normal checking loop(s) should be done.

   PSM: not used as is, PSM has its own use of polling for interrupt-only
   packets (sets hfi_poll_type to TYPE_URGENT) */
+int hfi_wait_for_packet(struct _hfi_ctrl *ctrl)
+{
+ return hfi_cmd_wait_for_packet(ctrl->fd);
+}
+
+/* These have been fixed to read the values, but they are not
+ * compatible with the hfi driver, they return new info with
+ * the qib driver
+ */
/* Count entries in a newline-separated name list (one '\n' per name). */
static int hfi_count_names(const char *namep)
{
	const char *cursor = namep;
	int count = 0;

	while ((cursor = strchr(cursor, '\n')) != NULL) {
		count++;
		cursor++;	/* resume scanning after this newline */
	}
	return count;
}
+
/*
 * Pop the next name off a newline-separated list.  Replaces the '\n'
 * with a NUL, advances *names past it, and returns the name; returns
 * NULL when no newline-terminated entry remains (mutates the list).
 */
const char *hfi_get_next_name(char **names)
{
	char *entry = *names;
	char *nl = strchr(entry, '\n');

	if (nl == NULL)
		return NULL;

	*nl = '\0';		/* terminate this entry in place */
	*names = nl + 1;	/* caller resumes at the following entry */
	return entry;
}
+
/*
 * Placeholder for releasing a name list from the stats/counter queries.
 * Names are allocated once at init time and kept for the life of the
 * process, so this intentionally does nothing; an explicit
 * "stats_type_unregister"-style hook would be the place to really free
 * them and release resources.
 */
void hfi_release_names(char *namep)
{
	(void)namep;	/* deliberately retained; see note above */
}
+
/*
 * Return the number of driver statistics names (or the callee's error
 * return); the name buffer is freed before returning.
 */
int hfi_get_stats_names_count()
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_stats_names(&namep);
	free(namep);
	return c;
}
+
/*
 * Return the number of unit counter names for 'unitno' (or the callee's
 * error return); the name buffer is freed before returning.
 */
int hfi_get_ctrs_unit_names_count(int unitno)
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_ctrs_unit_names(unitno, &namep);
	free(namep);
	return c;
}
+
/*
 * Return the number of port counter names for 'unitno' (or the callee's
 * error return); the name buffer is freed before returning.
 */
int hfi_get_ctrs_port_names_count(int unitno)
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_ctrs_port_names(unitno, &namep);
	free(namep);
	return c;
}
+
/*
 * Case-insensitive exact-match lookup of 'attr' in the newline-separated
 * name list 'namep'.  Stores the matching value from 'stats' into *s and
 * returns its index (last match wins), or -1 when not found.  Consumes
 * the name list via hfi_get_next_name(), which mutates it in place.
 */
int hfi_lookup_stat(const char *attr, char *namep, uint64_t *stats,
		    uint64_t *s)
{
	const char *name;
	int found = -1;
	int attr_len = strlen(attr);
	int total = hfi_count_names(namep);
	int idx;

	for (idx = 0; idx < total; idx++) {
		name = hfi_get_next_name(&namep);
		if (name == NULL)
			break;
		/* attr_len + 1 covers the NUL, forcing a full-string match */
		if (strncasecmp(name, attr, attr_len + 1) == 0) {
			found = idx;
			*s = stats[idx];
		}
	}
	return found;
}
+
/*
 * Fetch the single driver statistic named 'attr' into *s.  Returns the
 * stat's index on success, (uint64_t)-1 on any failure.
 */
uint64_t hfi_get_single_stat(const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_stats_names(&names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_stats(values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
/*
 * Fetch the single unit counter named 'attr' for 'unit' into *s.
 * Returns the counter's index on success, (uint64_t)-1 on any failure.
 */
uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_ctrs_unit_names(unit, &names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_ctrs_unit(unit, values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
/*
 * Fetch the single port counter named 'attr' for unit:port into *s.
 * Returns the counter's index on success, -1 on any failure.
 */
int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_ctrs_port_names(unit, &names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_ctrs_port(unit, port, values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
+/*
+ * Add a constructor function to disable mmap if asked to do so by the user
+ */
+static void init_mallopt_disable_mmap(void) __attribute__ ((constructor));
+
+static void init_mallopt_disable_mmap(void)
+{
+ char *env = getenv("HFI_DISABLE_MMAP_MALLOC");
+
+ if (env && *env) {
+ if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) {
+ __hfi_malloc_no_mmap = 1;
+ }
+ }
+
+ return;
+}
diff --git a/opa/opa_write_pio-i386.c b/opa/opa_write_pio-i386.c
new file mode 100644
index 0000000..359fdbc
--- /dev/null
+++ b/opa/opa_write_pio-i386.c
@@ -0,0 +1,305 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#include "ipserror.h"
+#include "hfi_user.h"
+
+/*
+ * These pio copy routines are here so they can be used by test code, as well
+ * as by MPI, and can change independently of MPI
+*/
+
+/*
+ * for processors that may not write store buffers in the order filled,
+ * and when the store buffer is not completely filled (partial at end, or
+ * interrupted and flushed) may write the partial buffer in
+ * "random" order. requires additional serialization
+*/
+void hfi_write_pio_force_order(volatile uint32_t *piob, /* copy PBC + header + payload into a PIO buffer with full write fencing */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata)
+{
+ union hfi_pbc buf = {.qword = 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs require fence after first 32 bits of pbc write */
+ /* Can't do as uint64_t store, or compiler could reorder */
+ ips_wmb();
+ *piob++ = buf.pbcflags;
+
+ if (!pioparm->length) { /* header-only packet: fence, then write last header word to trigger send */
+ uint32_t *dhdr, dcpywords;
+ dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1;
+ hfi_dwordcpy_safe(piob, hdr, dcpywords);
+ ips_wmb();
+ dhdr = hdr;
+ piob += dcpywords;
+ dhdr += dcpywords;
+ *piob++ = *dhdr;
+ } else {
+ uint32_t *pay2 = bdata, j;
+ uint32_t len = pioparm->length;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ len >>= 2; /* byte length -> dword count */
+ if (len > 16) { /* bulk-copy all but the last (up to 16-dword) chunk */
+ uint32_t pay_words = 16 * ((len - 1) / 16);
+ hfi_dwordcpy_safe(piob, pay2, pay_words);
+ piob += pay_words;
+ pay2 += pay_words;
+ len -= pay_words;
+ }
+ /* now write the final chunk a word at a time, fence before trigger */
+ for (j = 0; j < (len - 1); j++)
+ *piob++ = *pay2++;
+ ips_wmb(); /* flush the buffer out now, so the final word is written last */
+ *piob++ = *pay2;
+ }
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ ips_wmb(); /* fence before the last CRC word */
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+void hfi_write_pio(volatile uint32_t *piob, /* PIO copy for in-order write-combining CPUs: compiler barriers only, one final fence */
+ const struct hfi_pio_params *pioparm, void *hdr, void *bdata)
+{
+ union hfi_pbc buf = { 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs needs compiler fence to prevent compiler reordering
+ the two 32 bit stores in a uint64_t, but on inorder wc systems, does
+ not need a memory fence. */
+ asm volatile ("" : : : "memory");
+ *piob++ = buf.pbcflags;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory"); /* compiler barrier only; hardware keeps store order */
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2; /* advance past the payload just copied */
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+static void hfi_write_pio_special_trigger(volatile uint32_t *piob, /* like hfi_write_pio, then pokes a magic word at 'offset' to launch the packet */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata,
+ unsigned offset)
+ __attribute__ ((always_inline));
+
+static void hfi_write_pio_special_trigger(volatile uint32_t *piob,
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata,
+ unsigned offset)
+{
+ union hfi_pbc buf = { 0 };
+ volatile uint32_t *piobs = piob; /* remember buffer start for the trigger write */
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs needs compiler fence to prevent compiler reordering
+ the two 32 bit stores in a uint64_t, but on inorder wc systems, does
+ not need a memory fence. */
+ asm volatile ("" : : : "memory");
+ *piob++ = buf.pbcflags;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory");
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb(); /* everything visible before the trigger word */
+ *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC;
+ ips_wmb(); /* push the trigger itself */
+}
+
+void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, /* 2K-buffer variant: hardware-defined trigger offset 1023 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023);
+}
+
+void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, /* 4K-buffer variant: hardware-defined trigger offset 2047 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047);
+}
diff --git a/opa/opa_write_pio-x86_64.c b/opa/opa_write_pio-x86_64.c
new file mode 100644
index 0000000..1140705
--- /dev/null
+++ b/opa/opa_write_pio-x86_64.c
@@ -0,0 +1,296 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#include "ipserror.h"
+#include "opa_user.h"
+
+/*
+ * These pio copy routines are here so they can be used by test code, as well
+ * as by MPI, and can change independently of MPI
+*/
+
+/*
+ * for processors that may not write store buffers in the order filled,
+ * and when the store buffer is not completely filled (partial at end, or
+ * interrupted and flushed) may write the partial buffer in
+ * "random" order. requires additional serialization
+*/
+void hfi_write_pio_force_order(volatile uint32_t *piob, /* x86_64 variant: PBC goes out as one 64-bit store, then fence */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata)
+{
+ union hfi_pbc buf = {.qword = 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword;
+ ips_wmb(); /* pbc must be forced to be first write to chip buffer */
+ piob += 2; /* past the two dwords of PBC */
+
+ if (!pioparm->length) { /* header-only packet: fence, then write last header word to trigger send */
+ uint32_t *dhdr, dcpywords;
+ dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1;
+ hfi_dwordcpy_safe(piob, hdr, dcpywords);
+ ips_wmb();
+ dhdr = hdr;
+ piob += dcpywords;
+ dhdr += dcpywords;
+ *piob++ = *dhdr;
+ } else {
+ uint32_t *pay2 = bdata, j;
+ uint32_t len = pioparm->length;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ len >>= 2; /* byte length -> dword count */
+ if (len > 16) { /* bulk-copy all but the last (up to 16-dword) chunk */
+ uint32_t pay_words = 16 * ((len - 1) / 16);
+ hfi_dwordcpy_safe(piob, pay2, pay_words);
+ piob += pay_words;
+ pay2 += pay_words;
+ len -= pay_words;
+ }
+ /* now write the final chunk a word at a time, fence before trigger */
+ for (j = 0; j < (len - 1); j++)
+ *piob++ = *pay2++;
+ ips_wmb(); /* flush the buffer out now, so the final word is written last */
+ *piob++ = *pay2;
+ }
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ ips_wmb(); /* fence before the last CRC word */
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+void hfi_write_pio(volatile uint32_t *piob, /* x86_64 in-order WC variant: compiler barriers only, one final fence */
+ const struct hfi_pio_params *pioparm, void *hdr, void *bdata)
+{
+ union hfi_pbc buf = { 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword; /* whole PBC in one 64-bit store */
+ piob += 2;
+ asm volatile ("" : : : "memory"); /* compiler barrier only; hardware keeps store order */
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+
+ asm volatile ("" : : : "memory");
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2; /* advance past the payload just copied */
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * here we trigger on a "special" address, so just bang it out
+ * as fast as possible...
+ */
+static void
+hfi_write_pio_special_trigger(volatile uint32_t *piob, /* like hfi_write_pio, then pokes a magic word at 'offset' to launch the packet */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata, unsigned offset)
+ __attribute__ ((always_inline));
+
+static void
+hfi_write_pio_special_trigger(volatile uint32_t *piob,
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata, unsigned offset)
+{
+ union hfi_pbc buf = { 0 };
+ volatile uint32_t *piobs = piob; /* remember buffer start for the trigger write */
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword; /* whole PBC in one 64-bit store */
+ piob += 2;
+ asm volatile ("" : : : "memory");
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory");
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /*
+ * flush then write "special" then flush...
+ */
+ ips_wmb(); /* everything visible before the trigger word */
+ *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC;
+ ips_wmb(); /* push the trigger itself */
+}
+
+void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, /* 2K-buffer variant: hardware-defined trigger offset 1023 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023);
+}
+
+void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, /* 4K-buffer variant: hardware-defined trigger offset 2047 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047);
+}
diff --git a/psm.c b/psm.c
new file mode 100644
index 0000000..16a2ceb
--- /dev/null
+++ b/psm.c
@@ -0,0 +1,732 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <dlfcn.h>
+#include "psm_user.h"
+#include "opa_revision.h"
+#include "opa_udebug.h"
+#include "psm_mq_internal.h"
+
+static int psmi_verno_major = PSM2_VERNO_MAJOR;
+static int psmi_verno_minor = PSM2_VERNO_MINOR;
+static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+static int psmi_verno_client_val;
+int psmi_epid_ver;
+
+#define PSMI_NOT_INITIALIZED 0
+#define PSMI_INITIALIZED 1
+#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init
+ * once psm_finalize has been called. */
+static int psmi_isinit = PSMI_NOT_INITIALIZED;
+
+/* Global lock used for endpoint creation and destroy
+ * (in functions psm2_ep_open and psm2_ep_close) and also
+ * for synchronization with recv_thread (so that recv_thread
+ * will not work on an endpoint which is in a middle of closing). */
+psmi_lock_t psmi_creation_lock;
+
+#ifdef PSM_CUDA
+int is_cuda_enabled;
+int device_support_gpudirect;
+int cuda_runtime_version;
+int is_driver_gpudirect_enabled;
+#endif
+
+/*
+ * Bit field that contains capability set.
+ * Each bit represents different capability.
+ * It is supposed to be filled with logical OR
+ * on conditional compilation basis
+ * along with future features/capabilities.
+ * At the very beginning we start with Multi EPs.
+ */
+uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP;
+
+int psmi_verno_client() /* version number the client claimed at psm2_init, clamped in __psm2_init */
+{
+ return psmi_verno_client_val;
+}
+
+/* This function is used to determine whether the current library build can
+ * successfully communicate with another library that claims to be version
+ * 'verno'.
+ *
+ * PSM 2.x is always ABI compatible, but this checks to see if two different
+ * versions of the library can coexist.
+ */
+int psmi_verno_isinteroperable(uint16_t verno) /* 1 iff a peer library at 'verno' can interoperate with this build */
+{
+ if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR) /* only a major-version mismatch is incompatible */
+ return 0;
+
+ return 1;
+}
+
+int MOCKABLE(psmi_isinitialized)() /* 1 only in the INITIALIZED state (not NOT_INITIALIZED or FINALIZED) */
+{
+ return (psmi_isinit == PSMI_INITIALIZED);
+}
+MOCK_DEF_EPILOGUE(psmi_isinitialized);
+
+#ifdef PSM_CUDA
+int psmi_cuda_initialize() /* dlopen CUDA libraries, resolve symbols, verify UVA on all devices; PSM2_OK or fatal error */
+{
+ psm2_error_t err = PSM2_OK;
+ int num_devices, dev;
+ struct cudaDeviceProp dev_prop;
+ char *dlerr;
+
+ PSM2_LOG_MSG("entering");
+ _HFI_VDBG("Enabling CUDA support.\n");
+
+ psmi_cuda_lib = dlopen("libcuda.so", RTLD_LAZY); /* driver API */
+ psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); /* runtime API */
+ if (!psmi_cuda_lib || !psmi_cudart_lib) {
+ dlerr = dlerror();
+ _HFI_ERROR("Unable to open libcuda.so and libcudart.so. Error %s\n",
+ dlerr ? dlerr : "no dlerror()");
+ goto fail;
+ }
+
+ psmi_cudaRuntimeGetVersion = dlsym(psmi_cudart_lib, "cudaRuntimeGetVersion"); /* resolved first so the version gate below can run */
+
+ if (!psmi_cudaRuntimeGetVersion) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+
+ PSMI_CUDA_CALL(cudaRuntimeGetVersion, &cuda_runtime_version);
+ if (cuda_runtime_version < 4010) { /* version encoded as major*1000 + minor*10, so 4010 == 4.1 */
+ _HFI_ERROR("Please update CUDA runtime, required minimum version is 4.1 \n");
+ goto fail;
+ }
+
+
+ psmi_cuCtxGetCurrent = dlsym(psmi_cuda_lib, "cuCtxGetCurrent");
+ psmi_cuCtxSetCurrent = dlsym(psmi_cuda_lib, "cuCtxSetCurrent");
+ psmi_cuPointerGetAttribute = dlsym(psmi_cuda_lib, "cuPointerGetAttribute");
+ psmi_cuPointerSetAttribute = dlsym(psmi_cuda_lib, "cuPointerSetAttribute");
+
+ psmi_cudaGetDeviceCount = dlsym(psmi_cudart_lib, "cudaGetDeviceCount");
+ psmi_cudaGetDeviceProperties = dlsym(psmi_cudart_lib, "cudaGetDeviceProperties");
+ psmi_cudaGetDevice = dlsym(psmi_cudart_lib, "cudaGetDevice");
+ psmi_cudaSetDevice = dlsym(psmi_cudart_lib, "cudaSetDevice");
+ psmi_cudaStreamCreate = dlsym(psmi_cudart_lib, "cudaStreamCreate");
+ psmi_cudaDeviceSynchronize = dlsym(psmi_cudart_lib, "cudaDeviceSynchronize");
+ psmi_cudaStreamSynchronize = dlsym(psmi_cudart_lib, "cudaStreamSynchronize");
+ psmi_cudaEventCreate = dlsym(psmi_cudart_lib, "cudaEventCreate");
+ psmi_cudaEventDestroy = dlsym(psmi_cudart_lib, "cudaEventDestroy");
+ psmi_cudaEventQuery = dlsym(psmi_cudart_lib, "cudaEventQuery");
+ psmi_cudaEventRecord = dlsym(psmi_cudart_lib, "cudaEventRecord");
+ psmi_cudaEventSynchronize = dlsym(psmi_cudart_lib, "cudaEventSynchronize");
+ psmi_cudaMalloc = dlsym(psmi_cudart_lib, "cudaMalloc");
+ psmi_cudaHostAlloc = dlsym(psmi_cudart_lib, "cudaHostAlloc");
+ psmi_cudaFreeHost = dlsym(psmi_cudart_lib, "cudaFreeHost");
+ psmi_cudaMemcpy = dlsym(psmi_cudart_lib, "cudaMemcpy");
+ psmi_cudaMemcpyAsync = dlsym(psmi_cudart_lib, "cudaMemcpyAsync");
+
+ psmi_cudaIpcGetMemHandle = dlsym(psmi_cudart_lib, "cudaIpcGetMemHandle");
+ psmi_cudaIpcOpenMemHandle = dlsym(psmi_cudart_lib, "cudaIpcOpenMemHandle");
+ psmi_cudaIpcCloseMemHandle = dlsym(psmi_cudart_lib, "cudaIpcCloseMemHandle");
+
+ if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent || /* all symbols above are mandatory */
+ !psmi_cuPointerGetAttribute || !psmi_cuPointerSetAttribute ||
+ !psmi_cudaGetDeviceCount || !psmi_cudaGetDeviceProperties ||
+ !psmi_cudaGetDevice || !psmi_cudaSetDevice ||
+ !psmi_cudaStreamCreate ||
+ !psmi_cudaDeviceSynchronize || !psmi_cudaStreamSynchronize ||
+ !psmi_cudaEventCreate || !psmi_cudaEventDestroy ||
+ !psmi_cudaEventQuery || !psmi_cudaEventRecord ||
+ !psmi_cudaEventSynchronize ||
+ !psmi_cudaMalloc || !psmi_cudaHostAlloc || !psmi_cudaFreeHost ||
+ !psmi_cudaMemcpy || !psmi_cudaMemcpyAsync || !psmi_cudaIpcGetMemHandle ||
+ !psmi_cudaIpcOpenMemHandle || !psmi_cudaIpcCloseMemHandle) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+
+ if (cuda_runtime_version > 7000) { /* cudaStreamCreateWithFlags only required on newer runtimes */
+ psmi_cudaStreamCreateWithFlags = dlsym(psmi_cudart_lib,
+ "cudaStreamCreateWithFlags");
+ if (!psmi_cudaStreamCreateWithFlags) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+ }
+
+ /* Check if all devices support Unified Virtual Addressing. */
+ PSMI_CUDA_CALL(cudaGetDeviceCount, &num_devices);
+ for (dev = 0; dev < num_devices; dev++) {
+ PSMI_CUDA_CALL(cudaGetDeviceProperties, &dev_prop, dev);
+ if (dev_prop.unifiedAddressing != 1) { /* UVA is mandatory; any non-UVA device is fatal */
+ _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
+ goto fail;
+ }
+ /* Only devices based on Kepler and
+ * above can support GPU Direct.
+ */
+ if (dev_prop.major >= 3 && cuda_runtime_version >= 5000)
+ device_support_gpudirect = 1;
+ else {
+ device_support_gpudirect = 0; /* NOTE(review): last device wins; a mixed system disables gpudirect if the last device lacks it */
+ _HFI_INFO("Device %d does not GPUDirect RDMA (Non-fatal error) \n", dev);
+ }
+ }
+ PSM2_LOG_MSG("leaving");
+ return err;
+fail:
+ err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM2 CUDA support.\n");
+ return err;
+}
+#endif
+
+psm2_error_t __psm2_init(int *major, int *minor) /* library-wide init: version handshake, env processing, CPU detect, subsystem init */
+{
+ psm2_error_t err = PSM2_OK;
+ union psmi_envvar_val env_tmask;
+
+ psmi_log_initialize();
+
+ PSM2_LOG_MSG("entering");
+#ifdef RDPMC_PERF_FRAMEWORK
+ psmi_rdpmc_perf_framework_init();
+#endif /* RDPMC_PERF_FRAMEWORK */
+
+ GENERIC_PERF_INIT();
+
+ if (psmi_isinit == PSMI_INITIALIZED) /* idempotent: second call just reports version */
+ goto update;
+
+ if (psmi_isinit == PSMI_FINALIZED) { /* re-init after psm2_finalize is not allowed */
+ err = PSM2_IS_FINALIZED;
+ goto fail;
+ }
+
+ if (major == NULL || minor == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ psmi_init_lock(&psmi_creation_lock); /* guards ep open/close vs. recv_thread */
+
+#ifdef PSM_DEBUG
+ if (!getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n");
+#endif
+
+#ifdef PSM_PROFILE
+ if (!getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n");
+#endif
+
+ /* Make sure we complain if fault injection is enabled */
+ if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running with fault injection enabled!\n");
+
+ /* Make sure, as an internal check, that this version knows how to detect
+ * compatibility with other library versions it may communicate with */
+ if (psmi_verno_isinteroperable(psmi_verno) != 1) {
+ err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "psmi_verno_isinteroperable() not updated for current version!");
+ goto fail;
+ }
+
+ /* The only way to not support a client is if the major number doesn't
+ * match */
+ if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) {
+ err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION,
+ "This library does not implement version %d.%d",
+ *major, *minor);
+ goto fail;
+ }
+
+ /* Make sure we don't keep track of a client that claims a higher version
+ * number than we are */
+ psmi_verno_client_val =
+ min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno);
+
+ /* Check to see if we need to set Architecture flags to something
+ * besides big core Xeons */
+ cpuid_t id;
+ psmi_cpu_model = CPUID_MODEL_UNDEFINED;
+
+ /* First check to ensure Genuine Intel */
+ get_cpuid(0x0, 0, &id); /* CPUID leaf 0: vendor string in ebx/ecx/edx */
+ if(id.ebx == CPUID_GENUINE_INTEL_EBX
+ && id.ecx == CPUID_GENUINE_INTEL_ECX
+ && id.edx == CPUID_GENUINE_INTEL_EDX)
+ {
+ /* Use cpuid with EAX=1 to get processor info */
+ get_cpuid(0x1, 0, &id);
+ psmi_cpu_model = CPUID_GENUINE_INTEL;
+ }
+
+ if( (psmi_cpu_model == CPUID_GENUINE_INTEL) && /* combine model + extended-model fields per CPUID convention */
+ (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON)
+ {
+ psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) |
+ ((id.eax & CPUID_EXMODEL_MASK) >> 12);
+ }
+
+ psmi_isinit = PSMI_INITIALIZED;
+ /* hfi_debug lives in libhfi.so */
+ psmi_getenv("PSM2_TRACEMASK",
+ "Mask flags for tracing",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG_FLAGS,
+ (union psmi_envvar_val)hfi_debug, &env_tmask);
+ hfi_debug = (long)env_tmask.e_ulong;
+
+ /* The "real thing" is done in hfi_proto.c as a constructor function, but
+ * we getenv it here to report what we're doing with the setting */
+ {
+ extern int __hfi_malloc_no_mmap;
+ union psmi_envvar_val env_mmap;
+ char *env = getenv("HFI_DISABLE_MMAP_MALLOC");
+ int broken = (env && *env && !__hfi_malloc_no_mmap); /* user asked but the constructor's mallopt() failed */
+ psmi_getenv("HFI_DISABLE_MMAP_MALLOC",
+ broken ? "Skipping mmap disable for malloc()" :
+ "Disable mmap for malloc()",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_YESNO,
+ (union psmi_envvar_val)0, &env_mmap);
+ if (broken)
+ _HFI_ERROR
+ ("Couldn't successfully disable mmap in mallocs "
+ "with mallopt()\n");
+ }
+
+ {
+ union psmi_envvar_val env_epid_ver; /* EPID version must lie in [MIN, MAX] supported range */
+ psmi_getenv("PSM2_ADDR_FMT",
+ "Used to force PSM2 to use a particular version of EPID",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver);
+ psmi_epid_ver = env_epid_ver.e_int;
+ if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " The max epid version supported in this version of PSM2 is %d \n"
+ "Please upgrade PSM2 \n",
+ PSMI_MAX_EPID_VERNO_SUPPORTED);
+ goto fail;
+ } else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Invalid value provided through PSM2_ADDR_FMT \n");
+ goto fail;
+ }
+ }
+
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_enable_cuda;
+ psmi_getenv("PSM2_CUDA",
+ "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)0, &env_enable_cuda);
+ is_cuda_enabled = env_enable_cuda.e_int;
+#endif
+
+ if (getenv("PSM2_IDENTIFY")) { /* print build identification banner on request */
+ Dl_info info_psm;
+ char ofed_delta[100] = "";
+ strcat(strcat(ofed_delta," built for OFED DELTA "),psmi_hfi_IFS_version);
+ printf("%s %s PSM2 v%d.%d%s\n"
+ "%s %s location %s\n"
+ "%s %s build date %s\n"
+ "%s %s src checksum %s\n"
+ "%s %s git checksum %s\n"
+ "%s %s built against driver interface v%d.%d\n",
+ hfi_get_mylabel(), hfi_ident_tag,
+ PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR,
+ (strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta
+#ifdef PSM_CUDA
+ : "-cuda",
+#else
+ : "",
+#endif
+ hfi_get_mylabel(), hfi_ident_tag, dladdr(psm2_init, &info_psm) ?
+ info_psm.dli_fname : "libpsm2 not available",
+ hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp,
+ hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum,
+ hfi_get_mylabel(), hfi_ident_tag,
+ (strcmp(psmi_hfi_git_checksum,"") != 0) ?
+ psmi_hfi_git_checksum : "<not available>",
+ hfi_get_mylabel(), hfi_ident_tag, HFI1_USER_SWMAJOR, HFI1_USER_SWMINOR);
+ }
+
+ if (getenv("PSM2_DIAGS")) {
+ _HFI_INFO("Running diags...\n");
+ psmi_diags();
+ }
+
+ psmi_multi_ep_init();
+
+ psmi_faultinj_init();
+
+ psmi_epid_init();
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED) {
+ err = psmi_cuda_initialize();
+ if (err != PSM2_OK)
+ goto fail;
+ }
+#endif
+
+update:
+ *major = (int)psmi_verno_major;
+ *minor = (int)psmi_verno_minor;
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_init)
+
+
+/* Report which of the requested capability bits (e.g. PSM2_MULTI_EP_CAP)
+ * this library instance provides: a pure bitwise AND of the caller's mask
+ * against the global capability bitset. No side effects. */
+uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask)
+{
+ return (psm2_capabilities_bitset & req_cap_mask);
+}
+PSMI_API_DECL(psm2_get_capability_mask)
+
+
+/* Finalize the PSM2 library: dump generic perf counters, gracefully close
+ * every endpoint the user left open (2x the minimum close timeout each),
+ * release epid and fault-injection state plus cached per-epid hostname
+ * strings, and mark the library finalized. Always returns PSM2_OK. */
+psm2_error_t __psm2_finalize(void)
+{
+ struct psmi_eptab_iterator itor;
+ char *hostname;
+ psm2_ep_t ep;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ GENERIC_PERF_DUMP(stderr);
+ /* Walk the list of still-open endpoints; psm2_ep_close unlinks each one
+ * so we re-read the list head every iteration. */
+ ep = psmi_opened_endpoint;
+ while (ep != NULL) {
+ psmi_opened_endpoint = ep->user_ep_next;
+ psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL,
+ 2 * PSMI_MIN_EP_CLOSE_TIMEOUT);
+ ep = psmi_opened_endpoint;
+ }
+
+ psmi_epid_fini();
+
+ psmi_faultinj_fini();
+
+ /* De-allocate memory for any allocated space to store hostnames */
+ psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME);
+ while ((hostname = psmi_epid_itor_next(&itor)))
+ psmi_free(hostname);
+ psmi_epid_itor_fini(&itor);
+
+ psmi_isinit = PSMI_FINALIZED;
+ PSM2_LOG_MSG("leaving");
+ psmi_log_fini();
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_finalize)
+
+/*
+ * Function exposed in >= 1.05
+ *
+ * Register user-supplied hostnames for a list of endpoint network ids
+ * (nids) so later diagnostics can name peers. Both arrays must have at
+ * least 'num' entries. Stops at, and returns, the first failure from
+ * psmi_epid_set_hostname; NULL arrays yield PSM2_PARAM_ERR. The final
+ * argument (1) presumably permits overwriting an existing entry —
+ * confirm against psmi_epid_set_hostname's definition.
+ */
+psm2_error_t
+__psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames)
+{
+ int i;
+ psm2_error_t err = PSM2_OK;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ if (nids == NULL || hostnames == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ for (i = 0; i < num; i++) {
+ if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1)))
+ break;
+ }
+
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_map_nid_hostname)
+
+/* Deliberate no-op retained for API/ABI compatibility: endpoint-address
+ * labels are not stored and both arguments are ignored. */
+void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label)
+{
+ PSM2_LOG_MSG("entering");
+ PSM2_LOG_MSG("leaving");
+ return; /* ignore this function */
+}
+PSMI_API_DECL(psm2_epaddr_setlabel)
+
+/* Attach an opaque user context pointer to an endpoint address by
+ * forwarding to psm2_setopt(PSM2_CORE_OPT_EP_CTXT). Note that any error
+ * returned by psm2_setopt is silently discarded here. */
+void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt)
+{
+
+ /* Eventually deprecate this API to use set/get opt as this is unsafe. */
+ PSM2_LOG_MSG("entering");
+ psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+ PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *));
+ PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_epaddr_setctxt)
+
+/* Retrieve the opaque user context pointer previously stored with
+ * __psm2_epaddr_setctxt, via psm2_getopt(PSM2_CORE_OPT_EP_CTXT).
+ * Returns NULL when the lookup fails (or when NULL was stored). */
+void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr)
+{
+ psm2_error_t err;
+ uint64_t optlen = sizeof(void *);
+ void *result = NULL;
+
+ PSM2_LOG_MSG("entering");
+ /* Eventually deprecate this API to use set/get opt as this is unsafe. */
+ err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+ PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen);
+
+ PSM2_LOG_MSG("leaving");
+
+ if (err == PSM2_OK)
+ return result;
+ else
+ return NULL;
+}
+PSMI_API_DECL(psm2_epaddr_getctxt)
+
+/* Dispatch a set-option request to the owning component (CORE, MQ, AM or
+ * IB) and return that component's status. An unrecognized component
+ * value is reported through psmi_handle_error as PSM2_PARAM_ERR. The MQ
+ * path uses the deprecated psm2_mq_setopt, which ignores optlen. */
+psm2_error_t
+__psm2_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen)
+{
+ psm2_error_t rv;
+ PSM2_LOG_MSG("entering");
+ switch (component) {
+ case PSM2_COMPONENT_CORE:
+ rv = psmi_core_setopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break; /* NOTE(review): unreachable after return, kept as-is */
+ case PSM2_COMPONENT_MQ:
+ /* Use the deprecated MQ set/get opt for now which does not use optlen */
+ rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_AM:
+ /* Hand off to active messages */
+ rv = psmi_am_setopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_IB:
+ /* Hand off to IPS ptl to set option */
+ rv = psmi_ptl_ips.setopt(component_obj, optname, optval,
+ optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ }
+
+ /* Unrecognized/unknown component */
+ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+ component);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_setopt);
+
+/* Mirror of __psm2_setopt for reading options: dispatch a get-option
+ * request to the owning component (CORE, MQ, AM or IB) and return its
+ * status; unknown components are reported as PSM2_PARAM_ERR. The MQ path
+ * uses the deprecated psm2_mq_getopt, which ignores optlen. */
+psm2_error_t
+__psm2_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen)
+{
+ psm2_error_t rv;
+
+ PSM2_LOG_MSG("entering");
+ switch (component) {
+ case PSM2_COMPONENT_CORE:
+ rv = psmi_core_getopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break; /* NOTE(review): unreachable after return, kept as-is */
+ case PSM2_COMPONENT_MQ:
+ /* Use the deprecated MQ set/get opt for now which does not use optlen */
+ rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_AM:
+ /* Hand off to active messages */
+ rv = psmi_am_getopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_IB:
+ /* Hand off to IPS ptl to get option */
+ rv = psmi_ptl_ips.getopt(component_obj, optname, optval,
+ optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ }
+
+ /* Unrecognized/unknown component */
+ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+ component);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_getopt);
+
+/* Placeholder poll handler for a ptl that can never make progress:
+ * always reports PSM2_OK_NO_PROGRESS; both arguments are unused. */
+psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly)
+{
+ PSM2_LOG_MSG("entering");
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+}
+PSMI_API_DECL(psmi_poll_noop)
+
+/* Make communication progress on every endpoint in ep's multi-context
+ * ring (linked via mctxt_next): poll each member's shared-memory (amsh)
+ * ptl, then its OPA (ips) ptl, under the MQ progress lock. Any status
+ * worse than PSM2_OK_NO_PROGRESS aborts the loop and is returned to the
+ * caller immediately. */
+psm2_error_t __psm2_poll(psm2_ep_t ep)
+{
+ psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK;
+ psm2_ep_t tmp;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ tmp = ep; /* remember the starting point of the circular list */
+ do {
+ err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */
+ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return err1;
+ }
+
+ err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */
+ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return err2;
+ }
+ ep = ep->mctxt_next;
+ } while (ep != tmp);
+
+ /* This is valid because..
+ * PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK
+ * PSM2_OK & PSM2_OK => PSM2_OK
+ * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK
+ * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return (err1 & err2);
+}
+PSMI_API_DECL(psm2_poll)
+
+/* Same progress loop as __psm2_poll, but for internal callers that
+ * already hold ep->mq->progress_lock (asserted below). Shared-memory
+ * (amsh) polling is performed only when poll_amsh is non-zero; the ips
+ * ptl is always polled. Any status worse than PSM2_OK_NO_PROGRESS is
+ * returned immediately. */
+psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh)
+{
+ psm2_error_t err1 = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+ psm2_ep_t tmp;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_LOCK_ASSERT(ep->mq->progress_lock);
+
+ tmp = ep; /* remember the starting point of the circular list */
+ do {
+ if (poll_amsh) {
+ err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */
+ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSM2_LOG_MSG("leaving");
+ return err1;
+ }
+ }
+
+ err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */
+ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSM2_LOG_MSG("leaving");
+ return err2;
+ }
+
+ ep = ep->mctxt_next;
+ } while (ep != tmp);
+ PSM2_LOG_MSG("leaving");
+ return (err1 & err2);
+}
+PSMI_API_DECL(psmi_poll_internal)
+#ifdef PSM_PROFILE
+/* These functions each have weak symbols */
+/* Empty interposition points: an external profiler can override these
+ * weak symbols to be notified when PSM blocks, unblocks, or re-blocks
+ * while waiting for progress. */
+void psmi_profile_block()
+{
+ ; /* empty for profiler */
+}
+
+void psmi_profile_unblock()
+{
+ ; /* empty for profiler */
+}
+
+void psmi_profile_reblock(int did_no_progress)
+{
+ ; /* empty for profiler */
+}
+#endif
diff --git a/psm2.h b/psm2.h
new file mode 100644
index 0000000..3da78e3
--- /dev/null
+++ b/psm2.h
@@ -0,0 +1,1517 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_H
+#define PSM2_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2.h
+ * @page psm2_main PSM2 API
+ *
+ * @brief PSM2 OPA Messaging Library
+ *
+ * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level
+ * user-level communications interface for the OPA family of products.
+ * PSM2 users are enabled with mechanisms necessary to implement higher level
+ * communications interfaces in parallel environments.
+ *
+ * Since PSM2 targets clusters of multicore processors, it internally implements
+ * two levels of communication: intra-node shared memory communication and
+ * inter-node OPA communication. Both of these levels are encapsulated
+ * below the interface and the user is free to assume that intra-node and
+ * inter-node communication is transparently handled within PSM.
+ *
+ * @section compat Compatibility
+ *
+ * PSM2 can coexist with other QLogic/Pathscale software distributions, such as
+ * OpenIB/OpenFabrics, which allows applications to simultaneously target
+ * PSM-based and non PSM-based applications on a single node without changing
+ * any system-level configuration. However, PSM2 does not support running
+ * PSM-based and non PSM-based communication within the same user process.
+ *
+ * Except where noted, PSM2 does not assume an SPMD (single program, multiple
+ * data) parallel model and extends to MPMD (multiple program, multiple data)
+ * environments in specific areas. However, PSM2 assumes the runtime environment
+ * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness
+ * (little or big) and will fail at startup if any of these assumptions do not
+ * hold. For homogeneous systems PSM2 can run either in 32-bit or 64-bit
+ * environments. Even though both environments should expect similar
+ * performance from the API, PSM2 has chosen to favor 64-bit environments in
+ * some minor areas.
+ *
+ * @section ep_model Endpoint Communication Model
+ *
+ * PSM2 follows an endpoint communication model where an endpoint is defined as
+ * an object (or handle) instantiated to support sending and receiving messages
+ * to other endpoints. In order to prevent PSM2 from being tied to a particular
+ * parallel model (such as SPMD), control over the parallel layout of endpoints
+ * is retained by the user. Opening endpoints (@ref psm2_ep_open) and
+ * connecting endpoints to enable communication (@ref psm2_ep_connect) are two
+ * decoupled mechanisms. Users that do not dynamically change the number of
+ * endpoints beyond parallel startup will probably lump both mechanisms
+ * together at startup. Users that wish to manipulate the location and number
+ * of endpoints at runtime can do so by explicitly connecting sets or subsets
+ * of endpoints.
+ *
+ * As a side effect, this greater flexibility forces the user to cope with a
+ * two-stage initialization process. In the first stage of opening an endpoint
+ * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a
+ * globally distributable endpoint identifier (@ref psm2_epid_t). Prior to the
+ * second stage of connecting endpoints (@ref psm2_ep_connect), a user must
+ * distribute all relevant endpoint identifiers through an out-of-band
+ * mechanism. Once the endpoint identifiers are successfully distributed to
+ * all processes that wish to communicate, the user
+ * connects all endpoint identifiers to the locally opened endpoint
+ * (@ref psm2_ep_connect). In connecting the endpoints, the user obtains an
+ * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM
+ * communication primitives.
+ *
+ *
+ * @section components PSM2 Components
+ *
+ * PSM2 exposes a single endpoint initialization model, but enables various
+ * levels of communication functionality and semantics through @e components.
+ * The first major component available in PSM2 is PSM2 Matched Queues
+ * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am).
+ *
+ * Matched Queues (MQ) present a queue-based communication model with the
+ * distinction that queue consumers use a 3-tuple of metadata to match incoming
+ * messages against a list of preposted receive buffers. The MQ semantics are
+ * sufficiently akin to MPI to cover the entire MPI-1.2 standard.
+ *
+ * The Active Message (AM) component presents a request/reply model where
+ * the arrival of a message triggers the execution of consumer-provided
+ * handler code. This can be used to implement many one-sided and two-sided
+ * communications paradigms.
+ *
+ * With future releases of the PSM2 interface, more components will
+ * be exposed to accommodate users that implement parallel communication
+ * models that deviate from the Matched Queue semantics. For example, PSM
+ * plans to expose a connection management component to make it easier to
+ * handle endpoint management for clients without their own connection
+ * managers.
+ *
+ *
+ * @section progress PSM2 Communication Progress Guarantees
+ *
+ * PSM2 internally ensures progress of both intra-node and inter-node messages,
+ * but not autonomously. This means that while performance does not depend
+ * greatly on how the user decides to schedule communication progress,
+ * explicit progress calls are required for correctness. The @ref psm2_poll
+ * function is available to make progress over all PSM2 components in a generic
+ * manner. For more information on making progress over many communication
+ * operations in the MQ component, see the @ref mq_progress documentation.
+ *
+ *
+ * @section completion PSM2 Completion semantics
+ *
+ * PSM2 implements the MQ component, which documents its own
+ * message completion semantics (@ref mq_completion).
+ *
+ *
+ * @section error_handling PSM2 Error handling
+ *
+ * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error.
+ * While most errors are fatal in that the user is not expected to be able to
+ * recover from them, PSM2 still allows some level of control. By
+ * default, PSM2 returns all errors to the user but as a convenience, allows
+ * users to either defer errors internally to PSM2 or to have PSM2 return all
+ * errors to the user (callers to PSM2 functions). PSM2 attempts to deallocate
+ * its resources as a best effort, but exits are always non-collective with
+ * respect to endpoints opened in other processes. The user is expected to be
+ * able to handle non-collective exits from any endpoint and in turn cleanly
+ * and independently terminate the parallel environment. Local error handling
+ * can be handled in three modes:
+ *
+ * Errors and error handling can be individually registered either globally or
+ * per-endpoint:
+ * @li @b Per-endpoint error handling captures errors for functions where the
+ * error scoping is determined to be over an endpoint. This includes all
+ * communication functions that include an EP or MQ handle as the first
+ * parameter.
+ *
+ * @li @b Global error handling captures errors for functions where a
+ * particular endpoint cannot be identified or for @ref psm2_ep_open, where
+ * errors (if any) occur before the endpoint is opened.
+ *
+ * Error handling is controlled by registering error handlers (@ref
+ * psm2_error_register_handler). The global error handler can
+ * be set at any time (even before @ref psm2_init), whereas a per-endpoint error
+ * handler can be set as soon as a new endpoint is successfully created. If a
+ * per-endpoint handle is not registered, the per-endpoint handler inherits
+ * from the global error handler at time of open.
+ *
+ * PSM2 predefines two different mechanisms for handling errors:
+ *
+ * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER)
+ * @li No-op PSM2 error handler where errors are returned
+ * (@ref PSM2_ERRHANDLER_NO_HANDLER)
+ *
+ * The default PSM-internal error handler effectively frees the user from
+ * explicitly handling the return values of every PSM2 function but may not
+ * return to the user in a function determined to have caused a fatal error.
+ *
+ * The No-op PSM2 error handler bypasses all error handling functionality and
+ * always returns the error to the user. The user can then use @ref
+ * psm2_error_get_string to obtain a generic string from an error code (compared
+ * to a more detailed error message available through registering of error
+ * handlers).
+ *
+ * For even more control, users can register their own error handlers to have
+ * access to more precise error strings and selectively control when and when
+ * not to return to callers of PSM2 functions. All error handlers shown defer
+ * error handling to PSM2 for errors that are not recognized using @ref
+ * psm2_error_defer. Deferring an error from a custom error handler is
+ * equivalent to relying on the default error handler.
+ *
+ * @section env_var Environment variables
+ *
+ * Some PSM2 behaviour can be controlled via environment variables.
+ *
+ * @li @b PSM2_DEVICES. PSM2 implements three devices for communication which
+ * are, in order, @c self, @c shm and @c hfi. For PSM2 jobs that do not
+ * require shared-memory communications, @b PSM2_DEVICES can be specified as @c
+ * self, @c hfi. Similarly, for shared-memory only jobs, the @c hfi device
+ * can be disabled. It is up to the user to ensure that the endpoint ids
+ * passed in @ref psm2_ep_connect do not require a device that has been
+ * explicitly disabled by the user. In some instances, enabling only the
+ * devices that are required may improve performance.
+ *
+ * @li @b PSM2_TRACEMASK. Depending on the value of the tracemask, various parts
+ * of PSM2 will output debugging information. With a default value of @c 0x1,
+ * informative messages will be printed (this value should be considered a
+ * minimum). At @c 0x101, startup and finalization messages are added to the
+ * output. At @c 0x1c3, every communication event is logged and should hence
+ * be used for extreme debugging only.
+ *
+ * @li @b PSM2_MULTI_EP. By default, only one PSM2 endpoint may be opened in
+ * a process. With the correct setting of this environment variable, a process
+ * may open more than one PSM2 endpoint. In order to enable multiple endpoint
+ * per process support, the value of this environment variable should be set
+ * to "1" or "yes".
+ *
+ * @section thr_sfty Thread safety and reentrancy
+ * Unless specifically noted otherwise, all PSM2 functions should not be considered
+ * to be thread safe or reentrant.
+ */
+
+/** @brief Local endpoint handle (opaque)
+ * @ingroup ep
+ *
+ * Handle returned to the user when a new local endpoint is created. The
+ * handle is a local handle to be used in all communication functions and is
+ * not intended to globally identify the opened endpoint in any way.
+ *
+ * All open endpoint handles can be globally identified using the endpoint id
+ * integral type (@ref psm2_epid_t) and all communication must use an endpoint
+ * address (@ref psm2_epaddr_t) that can be obtained by connecting a local
+ * endpoint to one or more endpoint identifiers.
+ *
+ * @remark The local endpoint handle is opaque to the user. */
+typedef struct psm2_ep *psm2_ep_t;
+
+/** @brief MQ handle (opaque)
+ * @ingroup mq
+ *
+ * Handle returned to the user when a new Matched queue is created (@ref
+ * psm2_mq_init). */
+typedef struct psm2_mq *psm2_mq_t;
+
+/*! @defgroup init PSM2 Initialization and Maintenance
+ * @{
+ */
+#define PSM2_VERNO 0x0201 /*!< Header-defined Version number */
+#define PSM2_VERNO_MAJOR 0x02 /*!< Header-defined Major Version Number */
+#define PSM2_VERNO_MINOR 0x01 /*!< Header-defined Minor Version Number */
+#define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!<Minimum PSM1 Major Version Number for Compatibility */
+
+/*! @brief PSM2 Error type
+ *
+ * Numeric gaps between groups are deliberate, leaving room for new
+ * errors within each category (init, endpoint, epid, MQ, AM).
+ */
+enum psm2_error {
+ /*! Interface-wide "ok", guaranteed to be 0. */
+ PSM2_OK = 0,
+ /*! No events progressed on @ref psm2_poll (not fatal) */
+ PSM2_OK_NO_PROGRESS = 1,
+ /*! Error in a function parameter */
+ PSM2_PARAM_ERR = 3,
+ /*! PSM2 ran out of memory */
+ PSM2_NO_MEMORY = 4,
+ /*! PSM2 has not been initialized by @ref psm2_init */
+ PSM2_INIT_NOT_INIT = 5,
+ /*! API version passed in @ref psm2_init is incompatible */
+ PSM2_INIT_BAD_API_VERSION = 6,
+ /*! PSM2 Could not set affinity */
+ PSM2_NO_AFFINITY = 7,
+ /*! PSM2 Unresolved internal error */
+ PSM2_INTERNAL_ERR = 8,
+ /*! PSM2 could not set up shared memory segment */
+ PSM2_SHMEM_SEGMENT_ERR = 9,
+ /*! PSM2 option is a read-only option */
+ PSM2_OPT_READONLY = 10,
+ /*! PSM2 operation timed out */
+ PSM2_TIMEOUT = 11,
+ /*! Too many endpoints */
+ PSM2_TOO_MANY_ENDPOINTS = 12,
+
+ /*! PSM2 is finalized */
+ PSM2_IS_FINALIZED = 13,
+
+ /*! Endpoint was closed */
+ PSM2_EP_WAS_CLOSED = 20,
+ /*! PSM2 Could not find an OPA Unit */
+ PSM2_EP_NO_DEVICE = 21,
+ /*! User passed a bad unit or port number */
+ PSM2_EP_UNIT_NOT_FOUND = 22,
+ /*! Failure in initializing endpoint */
+ PSM2_EP_DEVICE_FAILURE = 23,
+ /*! Error closing the endpoint */
+ PSM2_EP_CLOSE_TIMEOUT = 24,
+ /*! No free ports could be obtained */
+ PSM2_EP_NO_PORTS_AVAIL = 25,
+ /*! Could not detect network connectivity */
+ PSM2_EP_NO_NETWORK = 26,
+ /*! Invalid Unique job-wide UUID Key */
+ PSM2_EP_INVALID_UUID_KEY = 27,
+ /*! Internal out of resources */
+ PSM2_EP_NO_RESOURCES = 28,
+
+ /*! Endpoint connect status unknown (because of other failures or if
+ * connect attempt timed out) */
+ PSM2_EPID_UNKNOWN = 40,
+ /*! Endpoint could not be reached by any PSM2 component */
+ PSM2_EPID_UNREACHABLE = 41,
+ /*! At least one of the connecting nodes was incompatible in endianness */
+ PSM2_EPID_INVALID_NODE = 43,
+ /*! At least one of the connecting nodes provided an invalid MTU */
+ PSM2_EPID_INVALID_MTU = 44,
+ /*! At least one of the connecting nodes provided a bad key */
+ PSM2_EPID_INVALID_UUID_KEY = 45,
+ /*! At least one of the connecting nodes is running an incompatible
+ * PSM2 protocol version */
+ PSM2_EPID_INVALID_VERSION = 46,
+ /*! At least one node provided garbled information */
+ PSM2_EPID_INVALID_CONNECT = 47,
+ /*! EPID was already connected */
+ PSM2_EPID_ALREADY_CONNECTED = 48,
+ /*! EPID is duplicated, network connectivity problem */
+ PSM2_EPID_NETWORK_ERROR = 49,
+ /*! EPID incompatible partition keys */
+ PSM2_EPID_INVALID_PKEY = 50,
+ /*! Unable to resolve path for endpoint */
+ PSM2_EPID_PATH_RESOLUTION = 51,
+
+ /*! MQ Non-blocking request is incomplete */
+ PSM2_MQ_NO_COMPLETIONS = 60,
+ /*! MQ Message has been truncated at the receiver */
+ PSM2_MQ_TRUNCATION = 61,
+
+ /*! AM reply error */
+ PSM2_AM_INVALID_REPLY = 70,
+
+ /*! Reserved Value to indicate highest ENUM value */
+ PSM2_ERROR_LAST = 80
+};
+
+/*! Backwards header compatibility for a confusing error return name */
+#define PSM2_MQ_INCOMPLETE PSM2_MQ_NO_COMPLETIONS
+
+/*! @see psm2_error */
+typedef enum psm2_error psm2_error_t;
+
+/*! @brief PSM2 Component type
+ */
+/* Component identifiers used to route @ref psm2_setopt / @ref psm2_getopt
+ * requests to the owning subsystem. */
+enum psm2_component {
+ /*! PSM2 core library */
+ PSM2_COMPONENT_CORE = 0,
+ /*! MQ component */
+ PSM2_COMPONENT_MQ = 1,
+ /*! AM component */
+ PSM2_COMPONENT_AM = 2,
+ /*! IB component */
+ PSM2_COMPONENT_IB = 3
+};
+
+/*! @see psm2_component */
+typedef enum psm2_component psm2_component_t;
+
+/*! @brief PSM2 Path resolution mechanism
+ */
+enum psm2_path_res {
+ /*! PSM2 no path resolution */
+ PSM2_PATH_RES_NONE = 0,
+ /*! Use OFED Plus for path resolution */
+ PSM2_PATH_RES_OPP = 1,
+ /*! Use OFED UMAD for path resolution */
+ PSM2_PATH_RES_UMAD = 2
+};
+
+/*! @see psm2_path_res */
+typedef enum psm2_path_res psm2_path_res_t;
+
+/** @brief Initialize PSM2 interface
+ *
+ * Call to initialize the PSM2 library for a desired API revision number.
+ *
+ * @param[in,out] api_verno_major As input a pointer to an integer that holds
+ * @ref PSM2_VERNO_MAJOR. As output, the pointer
+ * is updated with the major revision number of
+ * the loaded library.
+ * @param[in,out] api_verno_minor As input, a pointer to an integer that holds
+ * @ref PSM2_VERNO_MINOR. As output, the pointer
+ * is updated with the minor revision number of
+ * the loaded library.
+ *
+ * @pre The user has not called any other PSM2 library call except @ref
+ * psm2_error_register_handler to register a global error handler.
+ *
+ * @post Depending on the environment variable @ref PSM2_MULTI_EP being set and
+ * its contents, support for opening multiple endpoints is either enabled
+ * or disabled.
+ *
+ * @warning PSM2 initialization is a precondition for all functions used in the
+ * PSM2 library.
+ *
+ * @returns PSM2_OK The PSM2 interface could be opened and the desired API
+ * revision can be provided.
+ * @returns PSM2_INIT_BAD_API_VERSION The PSM2 library cannot provide compatibility for
+ * the desired API version.
+ *
+ * @code{.c}
+ // In this example, we want to handle our own errors before doing init,
+ // since we don't want a fatal error if OPA is not found.
+ // Note that @ref psm2_error_register_handler
+ // (and @ref psm2_uuid_generate and @ref psm2_get_capability_mask)
+ // are the only function that can be called before @ref psm2_init
+
+ int try_to_initialize_psm() {
+ int verno_major = PSM2_VERNO_MAJOR;
+ int verno_minor = PSM2_VERNO_MINOR;
+
+ int err = psm2_error_register_handler(NULL, // Global handler
+ PSM2_ERRHANDLER_NO_HANDLER); // return errors
+ if (err) {
+ fprintf(stderr, "Couldn't register global handler: %s\n",
+ psm2_error_get_string(err));
+ return -1;
+ }
+
+ err = psm2_init(&verno_major, &verno_minor);
+ if (err || verno_major > PSM2_VERNO_MAJOR) {
+ if (err)
+ fprintf(stderr, "PSM2 initialization failure: %s\n",
+ psm2_error_get_string(err));
+ else
+ fprintf(stderr, "PSM2 loaded an unexpected/unsupported "
+ "version (%d.%d)\n", verno_major, verno_minor);
+ return -1;
+ }
+
+ // We were able to initialize PSM2 but will defer all further error
+ // handling since most of the errors beyond this point will be fatal.
+ int err = psm2_error_register_handler(NULL, // Global handler
+ PSM2_ERRHANDLER_PSM_HANDLER);
+ if (err) {
+ fprintf(stderr, "Couldn't register global errhandler: %s\n",
+ psm2_error_get_string(err));
+ return -1;
+ }
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor);
+
+/*! @brief PSM2 capabilities definitions
+ *
+ * Each capability is defined as a separate bit,
+ * i.e. next capabilities must be defined as
+ * consecutive bits : 0x2, 0x4 ... and so on.
+ */
+#define PSM2_MULTI_EP_CAP 0x1 /* Multiple Endpoints capability */
+
+/** @brief PSM2 capabilities provider
+ *
+ * @param[in] req_cap_mask Requested capabilities are given as bit field.
+ *
+ * @returns internal capabilities bit field ANDed with a requested bit mask */
+uint64_t psm2_get_capability_mask(uint64_t req_cap_mask);
+
+/** @brief Finalize PSM2 interface
+ *
+ * Single call to finalize PSM2 and close all unclosed endpoints
+ *
+ * @post The user guarantees not to make any further PSM2 calls, including @ref
+ * psm2_init.
+ *
+ * @returns PSM2_OK Always returns @c PSM2_OK */
+psm2_error_t psm2_finalize(void);
+
+/** @brief Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM. */
+typedef struct psm2_error_token *psm2_error_token_t;
+
+/** @brief Error handling function
+ *
+ * Users can handle errors explicitly instead of relying on PSM's own error
+ * handler. There is one global error handler and error handlers that can be
+ * individually set for each opened endpoint. By default, endpoints will
+ * inherit the global handler registered at the time of open.
+ *
+ * @param[in] ep Handle associated to the endpoint over which the error occurred
+ * or @c NULL if the error is being handled by the global error
+ * handler.
+ * @param[in] error PSM2 error identifier
+ * @param[in] error_string A descriptive error string of maximum length @ref
+ * PSM2_ERRSTRING_MAXLEN.
+ * @param[in] token Opaque PSM2 token associated with the particular event that
+ * generated the error. The token can be used to extract the
+ * error string and can be passed to @ref psm2_error_defer to
+ * defer any remaining or unhandled error handling to PSM.
+ *
+ * @post If the error handler returns, the error returned is propagated to the
+ * caller. */
+typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep,
+ const psm2_error_t error,
+ const char *error_string,
+ psm2_error_token_t token);
+
+#define PSM2_ERRHANDLER_DEFAULT ((psm2_ep_errhandler_t)-1)
+/**< Obsolete names, only here for backwards compatibility */
+#define PSM2_ERRHANDLER_NOP ((psm2_ep_errhandler_t)-2)
+/**< Obsolete names, only here for backwards compatibility */
+
+#define PSM2_ERRHANDLER_PSM_HANDLER ((psm2_ep_errhandler_t)-1)
+/**< PSM2 error handler as explained in @ref error_handling */
+
+#define PSM2_ERRHANDLER_NO_HANDLER ((psm2_ep_errhandler_t)-2)
+/**< Bypasses the default PSM2 error handler and returns all errors to the user
+ * (this is the default) */
+
+#define PSM2_ERRSTRING_MAXLEN 512 /**< Maximum error string length. */
+
+/** @brief PSM2 error handler registration
+ *
+ * Function to register error handlers on a global basis and on a per-endpoint
+ * basis. PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special
+ * pre-defined handlers to respectively enable use of the default PSM-internal
+ * handler or the no-handler that disables registered error handling and
+ * returns all errors to the caller (both are documented in @ref
+ * error_handling).
+ *
+ * @param[in] ep Handle of the endpoint over which the error handler should be
+ * registered. With ep set to @c NULL, the behavior of the
+ * global error handler can be controlled.
+ * @param[in] errhandler Handler to register. Can be a user-specific error
+ * handling function or PSM2_ERRHANDLER_PSM_HANDLER or
+ * PSM2_ERRHANDLER_NO_HANDLER.
+ *
+ * @remark When ep is set to @c NULL, this is the only function that can be
+ * called before @ref psm2_init
+ */
+psm2_error_t
+psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler);
+
+/** @brief PSM2 deferred error handler
+ *
+ * Function to handle fatal PSM2 errors if no error handler is installed or if
+ * the user wishes to defer further error handling to PSM. Depending on the
+ * type of error, PSM2 may or may not return from the function call.
+ *
+ * @param[in] err_token Error token initially passed to error handler
+ *
+ * @pre The user is calling into the function because it has decided that PSM
+ * should handle an error case.
+ *
+ * @post The function may or may not return depending on the error
+ */
+psm2_error_t psm2_error_defer(psm2_error_token_t err_token);
+
+/** @brief Get generic error string from error
+ *
+ * Function to return the default error string associated to a PSM2 error.
+ *
+ * While a more detailed and precise error string is usually available within
+ * error handlers, this function is available to obtain an error string out of
+ * an error handler context or when a no-op error handler is registered.
+ *
+ * @param[in] error PSM2 error
+ */
+const char *psm2_error_get_string(psm2_error_t error);
+
+/** @brief Option key/pair structure
+ *
+ * Currently only used in MQ.
+ */
+struct psm2_optkey {
+	uint32_t key;	/**< Option key (e.g. a PSM2_MQ_OPT_* identifier) */
+	void *value;	/**< Pointer to storage holding the option's value */
+};
+
+/*! @} */
+
+/*! @defgroup ep PSM2 Device Endpoint Management
+ * @{
+ */
+
+/** @brief Endpoint ID
+ *
+ * Integral type of size 8 bytes that can be used by the user to globally
+ * identify a successfully opened endpoint. Although the contents of the
+ * endpoint id integral type remains opaque to the user, unique network id and
+ * OPA context number can be extracted using @ref psm2_epid_nid and @ref
+ * psm2_epid_context.
+ */
+typedef uint64_t psm2_epid_t;
+
+/** @brief Endpoint Address (opaque)
+ *
+ * Remote endpoint addresses are created when the user binds an endpoint ID
+ * to a particular endpoint handle using @ref psm2_ep_connect. A given endpoint
+ * address is only guaranteed to be valid over a single endpoint.
+ */
+typedef struct psm2_epaddr *psm2_epaddr_t;
+
+/** @brief PSM2 Unique UID
+ *
+ * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an
+ * endpoint within a particular job. Since PSM2 does not participate in job
+ * allocation and management, users are expected to generate a unique ID to
+ * associate endpoints to a particular parallel or collective job.
+ * @see psm2_uuid_generate
+ */
+typedef uint8_t psm2_uuid_t[16];
+
+/** @brief Get Endpoint identifier's Unique Network ID */
+uint64_t psm2_epid_nid(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA context number */
+uint64_t psm2_epid_context(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA port (deprecated, use
+ * @ref psm2_epid_context instead) */
+uint64_t psm2_epid_port(psm2_epid_t epid);
+
+/** @brief List the number of available OPA units
+ *
+ * Function used to determine the number of locally available OPA units.
+ * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1.
+ *
+ * @returns PSM2_OK unless the user has not called @ref psm2_init
+ */
+psm2_error_t psm2_ep_num_devunits(uint32_t *num_units);
+
+/** @brief Utility to generate UUIDs for @ref psm2_ep_open
+ *
+ * This function is available as a utility for generating unique job-wide ids.
+ * See discussion in @ref psm2_ep_open for further information.
+ *
+ * @remark This function does not require PSM2 to be initialized.
+ */
+void psm2_uuid_generate(psm2_uuid_t uuid_out);
+
+/* Affinity modes for the affinity member of struct psm2_ep_open_opts */
+#define PSM2_EP_OPEN_AFFINITY_SKIP 0 /**< Disable setting affinity */
+#define PSM2_EP_OPEN_AFFINITY_SET 1 /**< Enable setting affinity unless
+ already set */
+#define PSM2_EP_OPEN_AFFINITY_FORCE 2 /**< Enable setting affinity regardless
+ of current affinity setting */
+
+/* Default values for some constants */
+#define PSM2_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL
+ /**< Default protection key */
+
+/** @brief Endpoint Open Options
+ *
+ * These options are available for opening a PSM2 endpoint. Each is
+ * individually documented and setting each option to -1 or passing NULL as the
+ * options parameter in @ref psm2_ep_open instructs PSM2 to use
+ * implementation-defined defaults.
+ *
+ * Each option is documented in @ref psm2_ep_open
+ */
+struct psm2_ep_open_opts {
+	int64_t timeout;	/**< timeout in nanoseconds to open device */
+	int unit;		/**< OPA Unit ID to open on */
+	int affinity;		/**< How PSM2 should set affinity */
+	int shm_mbytes;	/**< Megabytes used for intra-node, deprecated */
+	int sendbufs_num;	/**< Preallocated send buffers */
+	uint64_t network_pkey;	/**< Network Protection Key (v1.01) */
+	int port;		/**< IB port to use (1 to N) */
+	int outsl;		/**< IB SL to use when sending pkts */
+	uint64_t service_id;	/**< IB Service ID to use for endpoint */
+	psm2_path_res_t path_res_type;	/**< Path resolution type */
+	int senddesc_num;	/**< Preallocated send descriptors */
+	int imm_size;		/**< Immediate data size for endpoint */
+};
+
+/** @brief OPA endpoint creation
+ *
+ * Function used to create a new local communication endpoint on an OPA
+ * adapter. The returned endpoint handle is required in all PSM2 communication
+ * operations, as PSM2 can manage communication over multiple endpoints. An
+ * opened endpoint has no global context until the user connects the endpoint
+ * to other global endpoints by way of @ref psm2_ep_connect. All local endpoint
+ * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are
+ * also returned when an endpoint is opened. It is assumed that the user can
+ * provide an out-of-band mechanism to distribute the endpoint IDs in order to
+ * establish connections between endpoints (@ref psm2_ep_connect for more
+ * information).
+ *
+ * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in
+ * a parallel job. It is up to the user to ensure
+ * that the key is globally unique over a period long
+ * enough to prevent duplicate keys over the same set
+ * of endpoints (see comments below).
+ *
+ * @param[in] opts Open options of type @ref psm2_ep_open_opts
+ * (see @ref psm2_ep_open_opts_get_defaults).
+ *
+ * @param[out] ep User-supplied storage to return a pointer to the newly
+ * created endpoint. The returned pointer of type @ref psm2_ep_t
+ * is a local handle and cannot be used to globally identify the
+ * endpoint.
+ * @param[out] epid User-supplied storage to return the endpoint ID associated
+ * to the newly created local endpoint returned in the @c ep
+ * handle. The endpoint ID is an integral type suitable for
+ * uniquely identifying the local endpoint.
+ *
+ * PSM2 does not internally verify the consistency of the uuid, it is up to the
+ * user to ensure that the uuid is unique enough not to collide with other
+ * currently-running jobs. Users can employ three mechanisms to obtain a uuid.
+ *
+ * 1. Use the supplied @ref psm2_uuid_generate utility
+ *
+ * 2. Use an OS or library-specific uuid generation utility, that complies with
+ * OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on
+ * FreeBSD.
+ * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 3. Manually pack a 16-byte string using a utility such as /dev/random or
+ * other source with enough entropy and proper seeding to prevent two nodes
+ * from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ * @li @c timeout establishes the number of nanoseconds to wait before
+ * failing to open a port (with -1, defaults to 15 secs).
+ * @li @c unit sets the OPA unit number to use to open a port (with
+ * -1, PSM2 determines the best unit to open the port). If @c
+ * HFI_UNIT is set in the environment, this setting is ignored.
+ * @li @c affinity enables or disables PSM2 setting processor affinity. The
+ * option can be controlled to either disable (@ref
+ * PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ * only if it is already unset (@ref
+ * PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ * set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE).
+ * If @c HFI_NO_CPUAFFINITY is set in the environment, this
+ * setting is ignored.
+ * @li @c shm_mbytes sets a maximum number of megabytes that can be allocated
+ * to each local endpoint ID connected through this
+ * endpoint (with -1, defaults to 10 MB).
+ * @li @c sendbufs_num sets the number of send buffers that can be
+ * pre-allocated for communication (with -1, defaults to
+ * 512 buffers of MTU size).
+ * @li @c network_pkey sets the protection key to employ for point-to-point
+ * PSM2 communication. Unless a specific value is used,
+ * this parameter should be set to
+ * PSM2_EP_OPEN_PKEY_DEFAULT.
+ *
+ * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only
+ * once per process and subsequent calls will fail. In order to enable creation
+ * of multiple endpoints per process, one must properly set the environment variable
+ * @ref PSM2_MULTI_EP before calling @ref psm2_init.
+ *
+ * @code{.c}
+ // In order to open an endpoint and participate in a job, each endpoint has
+ // to be distributed a unique 16-byte UUID key from an out-of-band source.
+ // Presumably this can come from the parallel spawning utility either
+ // indirectly through an implementors own spawning interface or as in this
+ // example, the UUID is set as a string in an environment variable
+ // propagated to all endpoints in the job.
+
+ int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle
+ psm2_epid_t *epid, // output endpoint identifier
+ int unit) // unit of our choice
+ {
+ psm2_ep_open_opts epopts;
+ psm2_uuid_t job_uuid;
+ char *c;
+
+ // Let PSM2 assign its default values to the endpoint options.
+ psm2_ep_open_opts_get_defaults(&epopts);
+
+ // We want a stricter timeout and a specific unit
+ epopts.timeout = 15*1e9; // 15 second timeout
+ epopts.unit = unit; // We want a specific unit, -1 would let PSM
+ // choose the unit for us.
+ epopts.port = port; // We want a specific port, <= 0 would let PSM
+ // choose the port for us.
+ // We've already set affinity, don't let PSM2 do so if it wants to.
+ if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
+ epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;
+
+ // ENDPOINT_UUID is set to the same value in the environment of all the
+ // processes that wish to communicate over PSM2 and was generated by
+ // the process spawning utility
+ c = getenv("ENDPOINT_UUID");
+ if (c && *c)
+ implementor_string_to_16byte_packing(c, job_uuid);
+ else {
+ fprintf(stderr, "Can't find UUID for endpoint\n");
+ return -1;
+ }
+
+ // Assume we don't want to handle errors here.
+ psm2_ep_open(job_uuid, &epopts, ep, epid);
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_ep_open(const psm2_uuid_t unique_job_key,
+ const struct psm2_ep_open_opts *opts, psm2_ep_t *ep,
+ psm2_epid_t *epid);
+
+/** @brief Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in @ref psm2_ep_open.
+ *
+ * @param[out] opts Endpoint Open options.
+ *
+ * @warning For portable operation, users should always call this function
+ * prior to calling @ref psm2_ep_open.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm2_error_t
+psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts);
+
+/** @brief Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * @param[in] ep Endpoint handle
+ * @param[in] epid Endpoint ID
+ *
+ * @param[out] result Result is non-zero if the remote endpoint shares memory with the local
+ * endpoint @c ep, or zero otherwise.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_EPID_UNKNOWN If the epid is not recognized
+ */
+psm2_error_t
+psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result);
+
+/** @brief Close endpoint
+ * @param[in] ep PSM2 endpoint handle
+ * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE
+ * @param[in] timeout How long to wait in nanoseconds if mode is
+ * PSM2_EP_CLOSE_GRACEFUL, 0 waits forever. If @c mode is
+ * @ref PSM2_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * @return PSM2_OK Endpoint was successfully closed without force or
+ * successfully closed with force within the supplied timeout.
+ * @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ * within timeout.
+ */
+psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout);
+
+#define PSM2_EP_CLOSE_GRACEFUL 0 /**< Graceful mode in @ref psm2_ep_close */
+#define PSM2_EP_CLOSE_FORCE 1 /**< Forceful mode in @ref psm2_ep_close */
+
+/** @brief Provide mappings for network id to hostname
+ *
+ * Since PSM2 does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings. The @ref psm2_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM2 to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * @param[in] num Number of elements in @c nids and @c hostnames arrays
+ * @param[in] nids User-provided array of network ids (i.e. OPA LIDs),
+ * should be obtained by calling @ref psm2_epid_nid on each
+ * epid.
+ * @param[in] hostnames User-provided array of hostnames (array of
+ * NUL-terminated strings) where each hostname index
+ * maps to the provided nid hostname.
+ *
+ * @warning Duplicate nids may be provided in the input @c nids array, only
+ * the first corresponding hostname will be remembered.
+ *
+ * @pre The user may or may not have already provided hostname mappings.
+ * @post The user may free any dynamically allocated memory passed to the
+ * function.
+ *
+ */
+psm2_error_t
+psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
+
+/** @brief Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses. Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * @param[in] ep PSM2 endpoint handle
+ *
+ * @param[in] num_of_epid The number of endpoints to connect to, which
+ * also establishes the number of elements contained in
+ * all of the function's array-based parameters.
+ *
+ * @param[in] array_of_epid User-allocated array that contains @c num_of_epid
+ * valid endpoint identifiers. Each endpoint id (or
+ * epid) has been obtained through an out-of-band
+ * mechanism and each endpoint must have been opened
+ * with the same uuid key.
+ *
+ * @param[in] array_of_epid_mask User-allocated array that contains
+ * @c num_of_epid integers. This array of masks
+ * allows users to select which of the epids in @c
+ * array_of_epid should be connected. If the integer
+ * at index i is zero, psm does not attempt to connect
+ * to the epid at index i in @c array_of_epid. If
+ * this parameter is NULL, psm will try to connect to
+ * each epid.
+ *
+ * @param[out] array_of_errors User-allocated array of at least @c num_of_epid
+ * elements. If the function does not return
+ * PSM2_OK, this array can be consulted for each
+ * endpoint not masked off by @c array_of_epid_mask
+ * to know why the endpoint could not be connected.
+ * Endpoints that could not be connected because of
+ * an unrelated failure will be marked as @ref
+ * PSM2_EPID_UNKNOWN. If the function returns
+ * PSM2_OK, the errors for all endpoints will also
+ * contain PSM2_OK.
+ *
+ * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid
+ * elements of type psm2_epaddr_t. Each
+ * successfully connected endpoint is updated with
+ * an endpoint address handle that corresponds to
+ * the endpoint id at the same index in @c
+ * array_of_epid. Handles are only updated if the
+ * endpoint could be connected and if its error in
+ * array_of_errors is PSM2_OK.
+ *
+ * @param[in] timeout Timeout in nanoseconds after which connection attempts
+ * will be abandoned. Setting this value to 0 disables
+ * timeout and waits until all endpoints have been
+ * successfully connected or until an error is detected.
+ *
+ * @pre The user has opened a local endpoint and obtained a list of endpoint
+ * IDs to connect to a given endpoint handle using an out-of-band
+ * mechanism not provided by PSM.
+ *
+ * @post If the connect is successful, @c array_of_epaddr is updated with valid
+ * endpoint addresses.
+ *
+ * @post If unsuccessful, the user can query the return status of each
+ * individual remote endpoint in @c array_of_errors.
+ *
+ * @post The user can call into @ref psm2_ep_connect many times with the same
+ * endpoint ID and the function is guaranteed to return the same output
+ * parameters.
+ *
+ * @post PSM2 does not keep any reference to the arrays passed into the
+ * function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed. Users should
+ * always refer to individual errors in @c array_of_errors whenever the
+ * function cannot return PSM2_OK.
+ *
+ * @returns PSM2_OK The entire set of endpoint IDs were successfully connected
+ * and endpoint addresses are available for all endpoint IDs.
+ *
+ * @code{.c}
+ int connect_endpoints(psm2_ep_t ep, int numep,
+ const psm2_epid_t *array_of_epid,
+ psm2_epaddr_t **array_of_epaddr_out)
+ {
+ psm2_error_t *errors = (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+
+ psm2_epaddr_t *all_epaddrs =
+ (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
+
+ if (all_epaddrs == NULL)
+ return -1;
+
+ psm2_ep_connect(ep, numep, array_of_epid,
+ NULL, // We want to connect all epids, no mask needed
+ errors,
+ all_epaddrs,
+ 30e9); // 30 second timeout, 0 waits forever
+ *array_of_epaddr_out = all_epaddrs;
+ free(errors);
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* This function is equivalent to calling psm2_ep_disconnect2() with mode ==
+* PSM2_EP_DISCONNECT_GRACEFUL.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+* which also indicates the number of elements contained
+* in all of the function’s array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+* valid endpoint addresses. Each endpoint address (or
+* epaddr) has been obtained through a previous
+* psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+* num_of_epaddr integers. This array of masks
+* allows users to select which of the
+* epaddresses in array_of_epaddr should be
+* disconnected. If the integer at index i is
+* zero, PSM2 does not attempt to disconnect to
+* the epaddr at index i in array_of_epaddr. If
+* this parameter is NULL, PSM2 tries to
+* disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+* elements. If the function does not return PSM2_OK,
+* this array can be consulted for each endpoint
+* address not masked off by array_of_epaddr_mask to
+* know why the endpoint could not be disconnected.
+* Any endpoint address that could not be
+* disconnected because of an unrelated failure is
+* marked as PSM2_EPID_UNKNOWN. If the function
+* returns PSM2_OK, the errors for all endpoint
+* addresses also contain PSM2_OK.
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+* are abandoned. Setting this value to 0 disables timeout and
+* waits until all endpoints have been successfully
+* disconnected or until an error is detected.
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+* array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+* remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+* and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+* if some portion of the communication failed. Refer to individual errors
+* in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+* and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+ const psm2_epaddr_t *array_of_epaddr)
+{
+ psm2_error_t *errors =
+ (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+ psm2_ep_disconnect(
+ ep, num_epaddr, array_of_epaddr,
+ NULL, // We want to disconnect all epaddrs, no mask needed,
+ errors,
+ 30e9); // 30 second timeout, 0 waits forever
+ free(errors);
+ return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+* which also indicates the number of elements contained
+* in all of the function’s array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+* valid endpoint addresses. Each endpoint address (or
+* epaddr) has been obtained through a previous
+* psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+* num_of_epaddr integers. This array of masks
+* allows users to select which of the
+* epaddresses in array_of_epaddr should be
+* disconnected. If the integer at index i is
+* zero, PSM2 does not attempt to disconnect to
+* the epaddr at index i in array_of_epaddr. If
+* this parameter is NULL, PSM2 tries to
+* disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+* elements. If the function does not return PSM2_OK,
+* this array can be consulted for each endpoint
+* address not masked off by array_of_epaddr_mask to
+* know why the endpoint could not be disconnected.
+* Any endpoint address that could not be
+* disconnected because of an unrelated failure is
+* marked as PSM2_EPID_UNKNOWN. If the function
+* returns PSM2_OK, the errors for all endpoint
+* addresses also contain PSM2_OK.
+*
+* @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+* are abandoned. Setting this value to 0 disables timeout and
+* waits until all endpoints have been successfully
+* disconnected or until an error is detected. Supplying a
+* negative value here sets the disconnection mode to "force".
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+* array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+* remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+* and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+* if some portion of the communication failed. Refer to individual errors
+* in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+* and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+ const psm2_epaddr_t *array_of_epaddr)
+{
+ psm2_error_t *errors =
+ (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+ psm2_ep_disconnect2(
+ ep, num_epaddr, array_of_epaddr,
+ NULL, // We want to disconnect all epaddrs, no mask needed,
+ errors,
+ PSM2_EP_DISCONNECT_GRACEFUL,
+ 30e9); // 30 second timeout, 0 ns is forever
+ free(errors);
+ return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int mode, int64_t timeout);
+
+#define PSM2_EP_DISCONNECT_GRACEFUL PSM2_EP_CLOSE_GRACEFUL /**< Graceful mode in @ref psm2_ep_disconnect2 */
+#define PSM2_EP_DISCONNECT_FORCE PSM2_EP_CLOSE_FORCE /**< Forceful mode in @ref psm2_ep_disconnect2 */
+
+/** @brief Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM2 components instantiated on an
+ * endpoint (currently, this only includes the MQ component). The function
+ * never blocks and is typically required in two cases:
+ *
+ * @li Allowing all PSM2 components instantiated over a given endpoint to make
+ * communication progress. Refer to @ref mq_progress for a detailed
+ * discussion on MQ-level progress issues.
+ *
+ * @li Cases where users write their own synchronization primitives that
+ * depend on remote communication (such as spinning on a memory location
+ * whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
+ *
+ * @returns PSM2_OK Some communication events were progressed
+ * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress
+ *
+ */
+psm2_error_t psm2_poll(psm2_ep_t ep);
+
+/** @brief Set a user-determined ep address label.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] epaddr_label_string User-allocated string to print when
+ * identifying endpoint in error handling or other verbose
+ * printing. The NULL-terminated string must be allocated by
+ * the user since PSM2 only keeps a pointer to the label. If
+ * users do not explicitly set a label for each endpoint,
+ * endpoints will identify themselves as hostname:port.
+ */
+void psm2_epaddr_setlabel(psm2_epaddr_t epaddr,
+ const char *epaddr_label_string);
+
+/** @brief Set a user-determined ep address context.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] ctxt Opaque user defined state to associate with an endpoint
+ * address. This state can be retrieved via
+ * @ref psm2_epaddr_getctxt.
+ */
+void
+psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt);
+
+/** @brief Get the user-determined ep address context. Users can associate an
+ * opaque context with each endpoint via @ref psm2_epaddr_setctxt.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect.
+ */
+void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr);
+
+/* Below are all component specific options. The component object for each of
+ * the options is also specified.
+ */
+
+/* PSM2_COMPONENT_CORE options */
+/* PSM2 debug level */
+#define PSM2_CORE_OPT_DEBUG 0x101
+ /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set
+ * before initializing the PSM2 library.
+ *
+ * component object: (null)
+ * option value: PSM2 Debug mask to set or currently active debug level.
+ */
+
+/* PSM2 endpoint address context */
+#define PSM2_CORE_OPT_EP_CTXT 0x102
+ /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint
+ * address (psm2_epaddr_t).
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+ * option value: Context associated with PSM2 endpoint address.
+ */
+
+/* PSM2_COMPONENT_IB options */
+/* Default service level to use to communicate with remote endpoints */
+#define PSM2_IB_OPT_DF_SL 0x201
+ /**< [@b uint32_t ] Default OPA SL to use for all remote communication.
+ * If unset defaults to Service Level 0.
+ *
+ * component object: Opened PSM2 endpoint id (@ref psm2_ep_t).
+ * option value: Default IB SL to use for endpoint. (0 <= SL < 15)
+ */
+
+/* Set IB service level to use for communication to an endpoint */
+#define PSM2_IB_OPT_EP_SL 0x202
+ /**< [@b uint32_t ] OPA SL to use for communication to specified
+ * remote endpoint.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+ * option value: SL used to communicate with remote endpoint. (0 <= SL < 15)
+ */
+
+/* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */
+/* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */
+#define PSM2_MQ_OPT_RNDV_IB_SZ 0x301
+ /**< [@b uint32_t ] Size at which to start enabling rendezvous
+ * messaging for OPA messages (if unset, defaults to values
+ * between 56000 and 72000 depending on the system configuration)
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+#define PSM2_MQ_RNDV_HFI_SZ PSM2_MQ_OPT_RNDV_IB_SZ
+#define PSM2_MQ_RNDV_IPATH_SZ PSM2_MQ_OPT_RNDV_IB_SZ
+
+#define PSM2_MQ_OPT_RNDV_SHM_SZ 0x302
+#define PSM2_MQ_RNDV_SHM_SZ PSM2_MQ_OPT_RNDV_SHM_SZ
+ /**< [@b uint32_t ] Size at which to start enabling
+ * rendezvous messaging for shared memory (intra-node) messages (If
+ * unset, defaults to 64000 bytes).
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+
+#define PSM2_MQ_OPT_SYSBUF_MYBYTES 0x303
+#define PSM2_MQ_MAX_SYSBUF_MBYTES PSM2_MQ_OPT_SYSBUF_MYBYTES
+ /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected
+ * messages.
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Deprecated; this option has no effect.
+ */
+
+/* PSM2_COMPONENT_AM options */
+#define PSM2_AM_OPT_FRAG_SZ 0x401
+#define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ
+/*!< [@b uint32_t ] Maximum active message fragment size that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest fragment size across all
+ * active endpoints.
+ * option value: Maximum active message fragment size in bytes.
+ */
+
+#define PSM2_AM_OPT_NARGS 0x402
+#define PSM2_AM_MAX_NARGS PSM2_AM_OPT_NARGS
+
+/*!< [@b uint32_t ] Maximum number of message arguments that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest number of arguments across all
+ * active endpoints.
+ * option value: Maximum number of active message arguments.
+ */
+
+#define PSM2_AM_OPT_HANDLERS 0x403
+#define PSM2_AM_MAX_HANDLERS PSM2_AM_OPT_HANDLERS
+/*!< [@b uint32_t ] Maximum number of message handlers that can be registered
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest number of handlers across all
+ * active endpoints.
+ * option value: Maximum number of active message handlers.
+ */
+
+/** @brief Set an option for a PSM2 component
+ *
+ * Function to set the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to set the option
+ * @param[in] component_obj Opaque component-specific object to apply the set
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * @param[in] optname Name of component option to set. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * @param[in] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size and format.
+ * @param[in] optlen Size of the memory region pointed to by optval.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm2_error_t
+psm2_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen);
+
+/** @brief Get an option for a PSM2 component
+ *
+ * Function to get the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to get the option
+ * @param[in] component_obj Opaque component-specific object to apply the get
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * @param[in] optname Name of component option to get. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * @param[out] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a valid memory region.
+ * @param[in,out] optlen This is a value result parameter initially containing
+ * the size of the memory region pointed to by optval and
+ * modified to return the actual size of optval.
+ *
+ * @returns PSM2_OK if option value could be retrieved successfully.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size.
+ * optlen contains the required memory region size for
+ * optname value.
+ *
+ */
+psm2_error_t
+psm2_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen);
+
+/** @brief Datatype for end-point information */
+typedef struct psm2_epinfo {
+ psm2_ep_t ep; /**< The ep for this end-point*/
+ psm2_epid_t epid; /**< The epid for this end-point */
+ psm2_uuid_t uuid; /**< The UUID for this end-point */
+ uint16_t jkey; /**< The job key for this end-point */
+ char uuid_str[64]; /**< String representation of the UUID for this end-point */
+} psm2_epinfo_t;
+
+/** @brief Datatype for end-point connection */
+typedef struct psm2_epconn {
+ psm2_epaddr_t addr; /**< The epaddr for this connection */
+ psm2_ep_t ep; /**< The ep for this connection */
+ psm2_mq_t mq; /**< The mq for this connection */
+} psm2_epconn_t;
+
+/** @brief Query PSM2 for end-point information.
+ *
+ * Function to query PSM2 for end-point information. This allows retrieval of
+ * end-point information in cases where the caller does not have access to the
+ * results of psm2_ep_open(). In the default single-rail mode PSM2 will use
+ * a single endpoint. If either multi-rail mode or multi-endpoint mode is
+ * enabled, PSM2 will use multiple endpoints.
+ *
+ * @param[in,out] num_of_epinfo On input, sizes the available number of entries
+ * in array_of_epinfo. On output, specifies the
+ * returned number of entries in array_of_epinfo.
+ * @param[out] array_of_epinfo Returns end-point information structures.
+ *
+ * @pre PSM2 is initialized and the end-point has been opened.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ */
+psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo);
+
+/** @brief Query PSM2 for end-point connections.
+ *
+ * Function to query PSM2 for end-point connections. This allows retrieval of
+ * end-point connections in cases where the caller does not have access to the
+ * results of psm2_ep_connect(). The epid values can be found using
+ * psm2_ep_query() so that each PSM2 process can determine its own epid. These
+ * values can then be distributed across the PSM2 process so that each PSM
+ * process knows the epid for all other PSM2 processes.
+ *
+ * @param[in] epid The epid of a PSM2 process.
+ * @param[out] epconn The connection information for that PSM2 process.
+ *
+ * @pre PSM2 is initialized and the end-point has been connected to this epid.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM.
+ */
+psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Query given PSM2 end-point for its connections.
+ *
+ * The need for this function comes with 'multi-ep' feature.
+ * Function is similar to (@ref psm2_ep_epid_lookup).
+ * It differs in that an extra parameter which identifies
+ * the end-point [ep] must be provided which limits the lookup to that single ep.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM.
+ * @returns PSM2_PARAM_ERR if output [epconn] is NULL.
+ */
+psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Get PSM2 epid for given epaddr.
+ *
+ * @param[in] epaddr The endpoint address.
+ * @param[out] epid The epid of a PSM2 process.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL.
+ */
+psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid);
+
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm2_am.h b/psm2_am.h
new file mode 100644
index 0000000..1383fbb
--- /dev/null
+++ b/psm2_am.h
@@ -0,0 +1,411 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef PSM2_AM_H
+#define PSM2_AM_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_am.h
+ * @brief PSM2 Active Message.
+ *
+ * @page psm2_am Active Message Interface
+ *
+ * PSM2 implements an Active Message (AM) component that lives alongside the
+ * Matched Queues (MQ) component. The active message interface essentially
+ * provides a remote procedure call mechanism. A PSM2 process can generate a
+ * request to run an active message handler on a remote PSM2 process
+ * identified by its end-point address (epaddr). End-point address values
+ * are returned by PSM2 when connecting end-points using the psm2_ep_connect()
+ * function.
+ *
+ * An AM handler may make local state updates, and may generate at most
+ * one reply to be returned to the original requestor. This reply will cause
+ * a handler to be run on that requestor. The requestor handler may make
+ * local state updates but is not allowed to reply nor request in that handler
+ * context. A request or reply can convey a small number of in-line arguments
+ * plus a short amount of data. A tight bound is placed on the number of
+ * in-line arguments to allow them to be packed into a header. A bound is
+ * placed on the size of the data payload so that the request or reply can
+ * be sent as a single packet within the MTU of the underlying communication
+ * transport. Longer payloads must be synthesized on top of the provided
+ * short request/reply mechanism by fragmentation and reassembly, or
+ * transported by some other means.
+ *
+ * Handlers are run in the process context of the targeted PSM2 process,
+ * either in its main thread of execution or in a progress thread. A handler
+ * may therefore be executed concurrently with the main thread of execution
+ * of the PSM2 process. PSM2 ensures that its own state is protected against this
+ * concurrent execution. However, a handler must make its own arrangements to
+ * protect its own state. Alternatively, the PSM2 progress thread can be
+ * disabled using the PSM2_RCVTHREAD environment variable if this is too
+ * onerous for the handler.
+ *
+ * PSM2 has an active progress model and requires that the PSM2 library is
+ * called in order to make progress. This can be achieved using the psm2_poll()
+ * function. A PSM2 implementation may provide passive progress through some
+ * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume
+ * this and must arrange to make active progress through calls into the PSM
+ * library. Note that the PSM2 AM interface is not MTsafe, same as the other PSM
+ * interfaces, and that MTsafety must be provided by the consumer if required.
+ *
+ * The order in which AM requests are issued by an initiator to a particular
+ * target defines the order in which those AM requests will be executed on
+ * that target. Therefore the AM implementation will maintain the order
+ * of handler executions on a flow, and this also applies when progress
+ * threads are used. For multiple initiators issuing requests to a particular
+ * target, the handler executions will be interleaved in some sequentially
+ * consistent ordering.
+ */
+
+/*! @defgroup am PSM2 Active Message
+ *
+ * @{
+ */
+
+/** @brief Datatype for an index representing an active message handler */
+typedef uint32_t psm2_handler_t;
+
+/** @brief Datatype for a token for an active message handler.*/
+typedef void *psm2_am_token_t;
+
+/* PSM2 AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM2_AM_FLAG_NONE 0 /**< No other PSM2 AM flags are needed. */
+#define PSM2_AM_FLAG_ASYNC 1 /**< No need to copy source data. */
+#define PSM2_AM_FLAG_NOREPLY 2 /**< The handler for this AM request is
+ guaranteed not to generate a reply. */
+
+/** @brief The psm2_amarg type represents the type of an AM argument. This is
+ * a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ * fields or one 64-bit field for the convenience of code using the PSM2 AM
+ * interface.
+ */
+typedef
+struct psm2_amarg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ };
+} psm2_amarg_t;
+
+/** @brief The AM handler function type
+ *
+ * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ * A request handler may send at most one reply back to the
+ * original requestor, and must pass this value as the token
+ * parameter to the psm2_am_reply_short() function. A reply
+ * handler is also passed a token value, but must not attempt
+ * to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_fn_t) (psm2_am_token_t token,
+ psm2_amarg_t *args, int nargs,
+ void *src, uint32_t len);
+
+/** @brief Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The
+ * call-back has a context pointer which is provided along with the call-back
+ * function pointer when the initiator generates the request or reply. This
+ * approach will typically give higher performance than using an AM request or
+ * reply to achieve the same effect, though note that no additional information
+ * can be passed from the target side back to the initiator side with the
+ * completion handler approach.
+ *
+ * @param[in] context A context pointer.
+ * @returns void This handler has no return result.
+ */
+typedef
+void (*psm2_am_completion_fn_t) (void *context);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in (0, num_handlers]. These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ * handlers_idx arrays)
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *
+ handlers, int num_handlers,
+ int *handlers_idx);
+
+/** @brief Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_request_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ * PSM2_AM_FLAG_NONE - No flags
+ * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ * PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to
+ * generate a reply
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the request cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC
+ * is provided then a copy will not be taken and the PSM2 AM implementation
+ * retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler
+ * call-back, or through an AM handler associated with an AM reply.
+ *
+ * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that
+ * a reply will not be generated. Use of this flag is optional, but it may
+ * enable a performance optimization in this case by indicating that reply
+ * state is not required.
+ *
+ * @param[in] epaddr End-point address to run handler on
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ * bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ * remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ * completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/** @brief Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM
+ * request. If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM
+ * handler must not call this function. Otherwise, the AM request handler may
+ * call psm2_am_reply_short() at most once, and must pass in the token value
+ * that it received in its own handler call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_reply_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ * PSM2_AM_FLAG_NONE - No flags
+ * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the reply cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC is
+ * provided then a copy will not be taken and the PSM2 AM implementation retains
+ * ownership of the payload src memory until the reply is locally complete.
+ * Local completion can be determined using the completion handler call-back.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ * the reply.
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ * bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ * remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ * completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/** @brief Return the source end-point address for a token.
+ *
+ * This function is used to obtain the epaddr object representing the message
+ * initiator from a token passed by PSM2 to a message handler.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ * the reply.
+ * @param[out] epaddr_out Pointer to the where the epaddr should be returned.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL.
+ */
+psm2_error_t psm2_am_get_source(psm2_am_token_t token,
+ psm2_epaddr_t *epaddr_out);
+
+/** @brief AM parameters
+ *
+ * This structure is used to return PSM2 AM implementation-specific parameter
+ * values back to the caller of the psm2_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ * max_handlers >= 64,
+ * max_nargs >= 2,
+ * max_request_short >= 256 and
+ * max_reply_short >= 256.
+ */
+struct psm2_am_parameters {
+ /** Maximum number of handlers that can be registered. */
+ uint32_t max_handlers;
+ /** Maximum number of arguments to an AM handler. */
+ uint32_t max_nargs;
+ /** Maximum number of bytes in a request payload. */
+ uint32_t max_request_short;
+ /** Maximum number of bytes in a reply payload. */
+ uint32_t max_reply_short;
+};
+
+/** @brief Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * @param[in] ep The end-point value returned by psm2_ep_open().
+ * @param[out] parameters Pointer to the struct where the parameters will be
+ * returned.
+ * @param[in] sizeof_parameters_in The size in bytes of the struct provided by
+ * the caller.
+ * @param[out] sizeof_parameters_out The size in bytes of the struct returned
+ * by PSM.
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_get_parameters(psm2_ep_t ep,
+ struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out);
+
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm2_linker_script.map b/psm2_linker_script.map
new file mode 100644
index 0000000..f1db50e
--- /dev/null
+++ b/psm2_linker_script.map
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM2_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm2_*;
+
+ /* Below symbols are used for hfidiags hfi1_pkt_test */
+ /* opa_udebug.h - global */
+ hfi_debug;
+ hfi_get_unit_name;
+ __progname;
+
+ /* opa_udebug.h - _HFI_DEBUGGING */
+ __hfi_mylabel;
+ hfi_set_mylabel;
+ hfi_get_mylabel;
+ __hfi_dbgout;
+
+ /* opa_service.h */
+ hfi_context_open;
+ hfi_get_port_vl2mtu;
+ hfi_get_port_lid;
+ hfi_context_close;
+ hfi_cmd_write;
+ hfi_mmap64;
+
+ /* opa_user.h */
+ hfi_userinit;
+ hfi_poll_type;
+ hfi_wait_for_packet;
+ __hfi_pico_per_cycle;
+
+ /* Make all other symbols local */
+ local:
+ *;
+};
+
diff --git a/psm2_linker_script_map.in b/psm2_linker_script_map.in
new file mode 100644
index 0000000..efa87c5
--- /dev/null
+++ b/psm2_linker_script_map.in
@@ -0,0 +1,95 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM2_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm2_*;
+
+ /* Below symbols are used for hfidiags hfi1_pkt_test */
+ /* opa_udebug.h - global */
+ hfi_debug;
+ hfi_get_unit_name;
+ __progname;
+
+ /* opa_udebug.h - _HFI_DEBUGGING */
+ __hfi_mylabel;
+ hfi_set_mylabel;
+ hfi_get_mylabel;
+ __hfi_dbgout;
+
+ /* opa_service.h */
+ hfi_context_open;
+ hfi_get_port_vl2mtu;
+ hfi_get_port_lid;
+ hfi_context_close;
+ hfi_cmd_write;
+ hfi_mmap64;
+
+ /* opa_user.h */
+ hfi_userinit;
+ hfi_poll_type;
+ hfi_wait_for_packet;
+ __hfi_pico_per_cycle;
+
+ /* Additional globals */
+ _psm2_additional_globals_;
+ /* Make all other symbols local */
+ local:
+ *;
+};
+
diff --git a/psm2_mq.h b/psm2_mq.h
new file mode 100644
index 0000000..6c23b10
--- /dev/null
+++ b/psm2_mq.h
@@ -0,0 +1,1403 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MQ_H
+#define PSM2_MQ_H
+
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_mq.h
+ * @brief PSM2 Matched Queues
+ *
+ * @page psm2_mq Matched Queues interface
+ *
+ * The Matched Queues (MQ) interface implements a queue-based communication
+ * model with the distinction that queue message consumers use a 3-tuple of
+ * metadata to match incoming messages against a list of preposted receive
+ * buffers. These semantics are consistent with those presented by MPI-1.2
+ * and all the features and side-effects of Message-Passing find their way into
+ * Matched Queues. There is currently a single MQ context;
+ * if need be, MQs may expose a function to allocate more than
+ * one MQ context in the future. Since an MQ is implicitly bound to a locally
+ * opened endpoint, all MQ functions use an MQ handle instead of an EP
+ * handle as a communication context.
+ *
+ * @section tagmatch MQ Tag Matching
+ *
+ * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t)
+ * and a 3-tuple of tag objects. Two of the tag objects are provided by the
+ * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is
+ * provided by the sender as part of every message sent (@ref psm2_mq_send and
+ * @ref psm2_mq_isend). Since MQ is a receiver-directed communication model,
+ * the tag matching done at the receiver involves matching the sent message's
+ * origin and send tag (@c stag) with the source endpoint address, tag (@c
+ * rtag), and tag selector (@c rtagsel) attached to every preposted receive
+ * buffer. The incoming @c stag is compared to the posted @c rtag but only for
+ * significant bits set to @c 1 in the @c rtagsel. The @c rtagsel can be used
+ * to mask off parts (or even all) of the bitwise comparison between sender and
+ * receiver tags. A successful match causes the message to be received into
+ * the buffer with which the tag is matched. If the incoming message is too
+ * large, it is truncated to the size of the posted receive buffer. The
+ * bitwise operation corresponding to a successful match and receipt of an
+ * expected message amounts to the following expression evaluating as true:
+ *
+ * @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim
+ *
+ * It is up to the user to encode (pack) tags into the 64-bit unsigned
+ * integers, including employing the @c rtagsel tag selector as a method to
+ * wildcard part or all of the bits significant in the tag matching operation.
+ * For example, MPI uses a triple based on context (MPI communicator), source
+ * rank, send tag. The following code example shows how the triple can be
+ * packed into 64 bits:
+ *
+ * @code{.c}
+ //
+ // 64-bit send tag formed by packing the triple:
+ //
+ // ( context_id_16bits | source_rank_16bits | send_tag_32bits )
+ //
+ stag = ( (((context_id)&0xffffULL)<<48)| \
+ (((source_rank)&0xffffULL)<<32)| \
+ (((send_tag)&0xffffffffULL)) );
+ @endcode
+ *
+ * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel
+ * masking bits against a list of send tags and returns the first successful
+ * match. Zero bits in the @c tagsel can be used to indicate wildcarded bits
+ * in the 64-bit tag which can be useful for implementing MPI's
+ * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG. Following the example bit splicing in
+ * the above @c stag example:
+ *
+ * @code{.c}
+ // Example MPI implementation where MPI_COMM_WORLD implemented as 0x3333
+
+ // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD
+ rtag = 0x3333000000000007;
+ rtagsel = 0xffff0000ffffffff;
+
+ // MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+ rtag = 0x3333000300000000;
+ rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag
+
+ // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+ rtag = 0x3333000300000000;
+ rtagsel = 0xffff000080000000; // can't ignore sign bit in tag
+ @endcode
+ *
+ *
+ * Applications that do not follow tag matching semantics can simply always
+ * pass a value of @c 0 for @c rtagsel, which will always yield a successful
+ * match to the first preposted buffer. If a message cannot be matched to any
+ * of the preposted buffers, the message is delivered as an unexpected
+ * message.
+ *
+ * @section mq_receive MQ Message Reception
+ *
+ * MQ messages are either received as @e expected or @e unexpected: @li The
+ * received message is @e expected if the incoming message tag matches the
+ * combination of tag and tag selector of at least one of the user-provided
+ * receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * @li The received message is @e unexpected if the incoming message tag @b
+ * doesn't match any combination of tag and tag selector from all the
+ * user-provided receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * Unexpected messages are messages that the MQ library buffers until the
+ * user provides a receive buffer that can match the unexpected message.
+ * With Matched Queues and MPI alike, unexpected messages can occur as a
+ * side-effect of the programming model, whereby the arrival of messages can be
+ * slightly out of step with the ordering in which the user
+ * provides receive buffers. Unexpected messages can also be triggered by the
+ * difference between the rate at which a sender produces messages and the rate
+ * at which a paired receiver can post buffers and hence consume the messages.
+ *
+ * In all cases, too many @e unexpected messages will negatively affect
+ * performance. Users can employ some of the following mechanisms to reduce
+ * the effect of added memory allocations and copies that result from
+ * unexpected messages:
+ * @li If and when possible, receive buffers should be posted as early as
+ * possible and ideally before calling into the progress engine.
+ * @li Use of rendezvous messaging that can be controlled with
+ * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These
+ * options default to values determined to make effective use of
+ * bandwidth and are hence not advisable for all communication message
+ * sizes, but rendezvous messages inherently prevent unexpected
+ * messages by synchronizing the sender with the receiver beforehand.
+ * @li The amount of memory that is allocated to handle unexpected messages
+ * can be bounded by adjusting the Global @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ * option.
+ * @li MQ statistics, such as the amount of received unexpected messages and
+ * the aggregate amount of unexpected bytes are available in the @ref
+ * psm2_mq_stats structure.
+ *
+ * Whenever a match occurs, whether the message is expected or unexpected, it
+ * is generally up to the user to ensure that the message is not truncated.
+ * Message truncation occurs when the size of the preposted buffer is less than
+ * the size of the incoming matched message. MQ will correctly handle
+ * message truncation by always copying the appropriate amount of bytes as to
+ * not overwrite any data. While it is valid to send less data than the amount
+ * of data that has been preposted, messages that are truncated will be marked
+ * @ref PSM2_MQ_TRUNCATION as part of the error code in the message status
+ * structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t).
+ *
+ * @section mq_completion MQ Completion Semantics
+ *
+ * Message completion in Matched Queues follows local completion semantics.
+ * When sending an MQ message, it is deemed complete when MQ guarantees that
+ * the source data has been sent and that the entire input source data memory
+ * location can be safely overwritten. As with standard Message-Passing,
+ * MQ does not make any remote completion guarantees for sends. MQ does
+ * however, allow a sender to synchronize with a receiver to send a synchronous
+ * message which sends a message only after a matching receive buffer has been
+ * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
+ *
+ * A receive is deemed complete after it has matched its associated receive
+ * buffer with an incoming send and that the data from the send has been
+ * completely delivered to the receive buffer.
+ *
+ * @section mq_progress MQ Progress Requirements
+ *
+ * Progress on MQs must be @e explicitly ensured by the user for correctness.
+ * The progress requirement holds even if certain areas of the MQ
+ * implementation require less network attention than others, or if progress
+ * may internally be guaranteed through interrupts. The main polling function,
+ * @ref psm2_poll, is the most general form of ensuring progress on a given
+ * endpoint. Calling @ref psm2_poll ensures that progress is made over all the
+ * MQs and other components instantiated over the endpoint passed to @ref
+ * psm2_poll.
+ *
+ * While @ref psm2_poll is the only way to directly ensure progress, other MQ
+ * functions will conditionally ensure progress depending on how they are used:
+ *
+ * @li @ref psm2_mq_wait employs polling and waits until the request is
+ * completed. For blocking communication operations where the caller is
+ * waiting on a single send or receive to complete, psm2_mq_wait usually
+ * provides the best responsiveness in terms of latency.
+ *
+ * @li @ref psm2_mq_test can test a particular request for completion, but @b
+ * never directly or indirectly ensures progress as it only tests the
+ * completion status of a request, nothing more. See functional documentation
+ * in @ref psm2_mq_test for a detailed discussion.
+ *
+ * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion
+ * queue is empty and will not ensure progress as long as the completion queue
+ * is non-empty. Users that always aggressively process all elements of the MQ
+ * completion queue as part of their own progress engine will indirectly always
+ * ensure MQ progress. The ipeek mechanism is the preferred way for
+ * ensuring progress when many non-blocking requests are in flight since ipeek
+ * returns requests in the order in which they complete. Depending on how the
+ * user initiates and completes communication, this may be preferable to
+ * calling other progress functions on individual requests.
+ */
+
+/*! @defgroup mq PSM Matched Queues
+ *
+ * @{
+ */
+
+/** @brief Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * @param[in] ep Endpoint over which to initialize Matched Queue
+ * @param[in] tag_order_mask Order mask hint to let MQ know what bits of the
+ * send tag are required to maintain MQ message
+ * order. In MPI parlance, this mask sets the bits
+ * that store the context (or communicator ID). The
+ * user can choose to pass PSM2_MQ_ORDERMASK_NONE or
+ * PSM2_MQ_ORDERMASK_ALL to tell MQ to respectively
+ * provide no ordering guarantees or to provide
+ * ordering over all messages by ignoring the
+ * contexts of the send tags.
+ * @param[in] opts Set of options for Matched Queue
+ * @param[in] numopts Number of options passed
+ * @param[out] mq User-supplied storage to return the Matched Queue handle
+ * associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ * associated to an endpoint, but options are only considered the first
+ * time the function is called.
+ *
+ * @post The user obtains a handle to an instantiated Match Queue.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A new Matched Queue has been instantiated across all the
+ * members of the group.
+ *
+ * @code{.c}
+ int try_open_endpoint_and_initialize_mq(
+ psm2_ep_t *ep, // endpoint handle
+ psm2_epid_t *epid, // unique endpoint ID
+ psm2_uuid_t job_uuid, // unique job uuid, for ep_open
+ psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+ uint64_t communicator_bits) // Where we store our communicator or
+ // context bits in the 64-bit tag.
+ {
+ // Simplified open, see psm2_ep_open documentation for more info
+ psm2_ep_open(job_uuid,
+ NULL, // no options
+ ep, epid);
+
+ // We initialize a matched queue by telling PSM the bits that are
+ // order-significant in the tag. Point-to-point ordering will not be
+ // maintained between senders where the communicator bits are not the
+ // same.
+ psm2_mq_init(ep,
+ communicator_bits,
+ NULL, // no other MQ options
+ 0, // 0 options passed
+ mq); // newly initialized matched Queue
+
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq);
+
+#define PSM2_MQ_ORDERMASK_NONE 0ULL
+ /**< Used to initialize MQ and disable all MQ message ordering
+ * guarantees (this mask may prevent the use of MQ to maintain matched
+ * message envelope delivery required in MPI). */
+
+#define PSM2_MQ_ORDERMASK_ALL 0xffffffffffffffffULL
+ /**< Used to initialize MQ with no message ordering hints, which forces
+ * MQ to maintain order over all messages */
+
+/** @brief Finalize (close) an MQ handle
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A given Matched Queue has been freed and any future
+ * use of the handle produces undefined results.
+ */
+psm2_error_t
+psm2_mq_finalize(psm2_mq_t mq);
+
+#define PSM2_MQ_TAG_ELEMENTS 3
+ /**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t
+ * type. */
+
+/** @struct psm2_mq_tag
+ ** @brief MQ Message tag
+ *
+ * Extended message tag type introduced in PSM 2.0. The previous 64 bit tag
+ * values are replaced by a struct containing three 32 bit tag values for a
+ * total of 96 bits. Matching semantics are unchanged from the previous 64-bit
+ * matching scheme; the only difference is that 96 bits are matched instead of
+ * 64. For interoperability with existing PSM routines, 64 bit tags are
+ * extended to a 96 bit tag by setting the upper 32 bits (tag[2] or tag2) to
+ * zero. Other than this caveat, all of the existing routines using 64-bit
+ * tags are interchangeable with PSM2 routines using this psm2_mq_tag_t type.
+ * For example, a message sent using @ref psm2_mq_send can be received using
+ * @ref psm2_mq_irecv2, provided the tags match as described above.
+ */
+typedef
+//struct psm2_mq_tag {
+union psm2_mq_tag {
+// union {
+ uint32_t tag[PSM2_MQ_TAG_ELEMENTS] __attribute__ ((aligned(16)));
+ /**< 3 x 32bit array representation of @ref psm2_mq_tag */
+ struct {
+ uint32_t tag0; /**< 1 of 3 uint32_t tag values */
+ uint32_t tag1; /**< 2 of 3 uint32_t tag values */
+ uint32_t tag2; /**< 3 of 3 uint32_t tag values */
+ };
+// };
+} psm2_mq_tag_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm2_mq_status {
+ /** Sender's original message tag (receive reqs only) */
+ uint64_t msg_tag;
+ /** Sender's original message length (receive reqs only) */
+ uint32_t msg_length;
+ /** Actual number of bytes transferred (receive reqs only) */
+ uint32_t nbytes;
+ /** MQ error code for communication operation */
+ psm2_error_t error_code;
+ /** User-associated context for send or receive */
+ void *context;
+} psm2_mq_status_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations. For
+ * wait and test functions, MQ fills in the structure upon completion. Upon
+ * completion, requests fill in every field of the status structure with the
+ * exception of the nbytes field, which is only valid for receives. Version 2
+ * of the status type contains an @ref psm2_mq_tag_t type to represent the tag
+ * instead of a 64-bit integer value and is for use with PSM v2 routines.
+ */
+
+typedef
+struct psm2_mq_status2 {
+ /** Remote peer's epaddr */
+ psm2_epaddr_t msg_peer;
+ /** Sender's original message tag */
+ psm2_mq_tag_t msg_tag;
+ /** Sender's original message length */
+ uint32_t msg_length;
+ /** Actual number of bytes transferred (receiver only) */
+ uint32_t nbytes;
+ /** MQ error code for communication operation */
+ psm2_error_t error_code;
+ /** User-associated context for send or receive */
+ void *context;
+} psm2_mq_status2_t;
+
+/** @brief PSM2 Communication handle (opaque) */
+typedef struct psm2_mq_req *psm2_mq_req_t;
+
+/*! @} */
+/*! @ingroup mq
+ * @defgroup mq_options PSM Matched Queue Options
+ * @{
+ *
+ * MQ options can be modified at any point at runtime, unless otherwise noted.
+ * The following example shows how to retrieve the current message size at
+ * which messages are sent as synchronous.
+ *
+ * @code{.c}
+ uint32_t get_hfirv_size(psm2_mq_t mq)
+ {
+ uint32_t rvsize;
+ psm2_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize);
+ return rvsize;
+ }
+ @endcode
+ */
+
+/** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve. Possible values are:
+ * @li @ref PSM2_MQ_RNDV_HFI_SZ
+ * @li @ref PSM2_MQ_RNDV_SHM_SZ
+ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that can be used to store the value of
+ * the option to be set. It is up to the user to ensure that the
+ * pointer points to a memory location large enough to accommodate
+ * the value associated to the type. Each option documents the size
+ * associated to its value.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ */
+psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value);
+
+/** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve. Possible values are:
+ * @li @ref PSM2_MQ_RNDV_HFI_SZ
+ * @li @ref PSM2_MQ_RNDV_SHM_SZ
+ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that contains the value to be updated
+ * for the supplied option number. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option
+ * (currently no MQ options are read-only).
+ */
+psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
+
+/*! @} */
+/*! @ingroup mq
+ * @{
+ */
+
+#define PSM2_MQ_FLAG_SENDSYNC 0x01
+ /**< MQ Send Force synchronous send */
+
+#define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL))
+ /**< MQ request completion value */
+
+#define PSM2_MQ_ANY_ADDR ((psm2_epaddr_t)NULL)
+ /**< MQ receive from any source epaddr */
+
+/** @brief Post a receive to a Matched Queue with tag selection criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c tag and @c
+ * tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ * upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ * be used for explicitly controlling message receive
+ * completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ * messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ * match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with source and tag selection
+ * criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c src, @c tag
+ * and @c tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status2_t
+ * upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ * be used for explicitly controlling message receive
+ * completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ * messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ * match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with matched request
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. The provided request should already be matched using the @ref
+ * psm2_mq_improbe or @ref psm2_mq_improbe2 routines. It is an error to pass a
+ * request that has not already been matched by one of those routines.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ * upon completion
+ * @param[inout] reqo PSM MQ Request handle matched previously by a matched
+ * probe routine (@ref psm2_mq_improbe or @ref
+ * psm2_mq_improbe2), also to be used for explicitly
+ * controlling message receive completion.
+ *
+ * @post The supplied receive buffer is given to MQ to deliver the matched
+ * message.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *reqo);
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len);
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ * psm2_mq_status_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+ psm2_mq_req_t
+ non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+ const void *buf, uint32_t len,
+ int context_id, int send_tag, const my_request_t *req)
+ {
+ psm2_mq_req_t req_mq;
+ // Set up our send tag, assume that "my_rank" is global and represents
+ // the rank of this process in the job
+ uint64_t tag = ( ((context_id & 0xffff) << 48) |
+ ((my_rank & 0xffff) << 32) |
+ ((send_tag & 0xffffffff)) );
+
+ psm2_mq_isend(mq, dest_ep,
+ 0, // no flags
+ tag,
+ buf,
+ len,
+ req, // this req is available in psm2_mq_status_t when one
+ // of the synchronization functions is called.
+ &req_mq);
+ return req_mq;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag, array of three 32-bit values.
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ * psm2_mq_status2_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+ psm2_mq_req_t
+ non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+ const void *buf, uint32_t len,
+ int context_id, int send_tag, const my_request_t *req)
+ {
+ psm2_mq_req_t req_mq;
+ // Set up our send tag, assume that "my_rank" is global and represents
+ // the rank of this process in the job
+ psm2_mq_tag_t tag;
+ tag.tag[0] = send_tag;
+ tag.tag[1] = my_rank;
+ tag.tag[2] = context_id;
+
+ psm2_mq_isend(mq, dest_ep,
+ 0, // no flags
+ &tag,
+ buf,
+ len,
+ req, // this req is available in psm2_mq_status2_t when one
+ // of the synchronization functions is called.
+ &req_mq);
+ return req_mq;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context,
+ psm2_mq_req_t *req);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received. The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ * unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+ psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received. The message is not fully matched until the
+ * user provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv. Probing for messages may be useful if the size
+ * of the message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ * unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag
+ * selectors has been received. If a match is successful, the message is
+ * removed from the matching queue and returned as a request object. The
+ * message can be received using @ref psm2_mq_imrecv. It is erroneous to use
+ * the request object returned by @ref psm2_mq_improbe for any purpose other
+ * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if
+ * the size of the message to be received is unknown, in which case its size
+ * will be available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ * message.
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req,
+ psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag selectors
+ * has been received. If a match is successful, the message is removed from
+ * the matching queue and returned as a request object. The message can be
+ * received using @ref psm2_mq_imrecv. It is erroneous to use the request
+ * object returned by @ref psm2_mq_improbe for any purpose other than passing to
+ * @ref psm2_mq_imrecv. Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be available
+ * in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched
+ * message.
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo,
+ psm2_mq_status2_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ * PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ * ready for completion. If @c status is non-NULL, it is also
+ * updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ * are no further requests ready for completion.
+ * The contents of @c req and @c status remain
+ * unchanged.
+ * @code{.c}
+ // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll
+ // We return the amount of non-blocking requests that we've completed
+ int main_progress_loop(psm2_mq_t mq)
+ {
+ int num_completed = 0;
+ psm2_mq_req_t req;
+ psm2_mq_status_t status;
+ psm2_error_t err;
+ my_request_t *myreq;
+
+ do {
+ err = psm2_mq_ipeek(mq, &req,
+ NULL); // No need for status in ipeek here
+ if (err == PSM2_MQ_NO_COMPLETIONS)
+ return num_completed;
+ else if (err != PSM2_OK)
+ goto errh;
+ num_completed++;
+
+ // We obtained 'req' at the head of the completion queue. We can
+ // now free the request with PSM and obtain our original request
+ // from the status' context
+ err = psm2_mq_test(&req, // will be marked as invalid
+ &status); // we need the status
+ myreq = (my_request_t *) status.context;
+
+ // handle the completion for myreq whether myreq is a posted receive
+ // or a non-blocking send.
+ }
+ while (1);
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ * PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ * ready for completion. If @c status is non-NULL, it is also
+ * updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ * are no further requests ready for completion.
+ * The contents of @c req and @c status remain
+ * unchanged.
+ * @code{.c}
+ // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll
+ // We return the amount of non-blocking requests that we've completed
+ int main_progress_loop(psm2_mq_t mq)
+ {
+ int num_completed = 0;
+ psm2_mq_req_t req;
+ psm2_mq_status2_t status;
+ psm2_error_t err;
+ my_request_t *myreq;
+
+ do {
+ err = psm2_mq_ipeek2(mq, &req,
+ NULL); // No need for status in ipeek here
+ if (err == PSM2_MQ_NO_COMPLETIONS)
+ return num_completed;
+ else if (err != PSM2_OK)
+ goto errh;
+ num_completed++;
+
+ // We obtained 'req' at the head of the completion queue. We can
+ // now free the request with PSM and obtain our original request
+ // from the status' context
+ err = psm2_mq_test2(&req, // will be marked as invalid
+ &status); // we need the status
+ myreq = (my_request_t *) status.context;
+
+ // handle the completion for myreq whether myreq is a posted receive
+ // or a non-blocking send.
+ }
+ while (1);
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status);
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ * completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ * @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ * completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ * @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensure progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ * PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ * unchanged.
+ *
+ * @code{.c}
+ // Function that returns the first completed request in an array
+ // of requests.
+ void *
+ user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+ {
+ int i;
+ void *context = NULL;
+
+ // Ensure progress only once
+ psm2_poll(ep);
+
+ // Test for at least one completion and return its context
+ psm2_mq_status_t stat;
+ for (i = 0; i < nreqs; i++) {
+ if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) {
+ context = stat.context;
+ break;
+ }
+ }
+ return context;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensure progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ * PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ * unchanged.
+ *
+ * @code{.c}
+ // Function that returns the first completed request in an array
+ // of requests.
+ void *
+ user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+ {
+ int i;
+ void *context = NULL;
+
+ // Ensure progress only once
+ psm2_poll(ep);
+
+ // Test for at least one completion and return its context
+ psm2_mq_status2_t stat;
+ for (i = 0; i < nreqs; i++) {
+ if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) {
+ context = stat.context;
+ break;
+ }
+ }
+ return context;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm2_mq_irecv. It is currently illegal to cancel a send request initiated
+ * with @ref psm2_mq_isend.
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_irecv.
+ *
+ * @post Whether the cancel is successful or not, the user returns the
+ * request to the library by way of @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (@ref psm2_error_register_handler):
+ *
+ * @retval PSM2_OK The request could be successfully cancelled such that the
+ * preposted receive buffer could be removed from the preposted
+ * receive queue before a match occurred. The associated @c
+ * request remains unchanged and the user must still return
+ * the storage to the MQ library.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully cancelled
+ * since the preposted receive buffer has already
+ * matched an incoming message. The @c request
+ * remains unchanged.
+ *
+ */
+psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req);
+
+/*! @brief MQ statistics structure */
+struct psm2_mq_stats {
+ /** Bytes received into a matched user buffer */
+ uint64_t rx_user_bytes;
+ /** Messages received into a matched user buffer */
+ uint64_t rx_user_num;
+ /** Bytes received into an unmatched system buffer */
+ uint64_t rx_sys_bytes;
+ /** Messages received into an unmatched system buffer */
+ uint64_t rx_sys_num;
+
+ /** Total Messages transmitted (shm and hfi) */
+ uint64_t tx_num;
+ /** Messages transmitted eagerly */
+ uint64_t tx_eager_num;
+ /** Bytes transmitted eagerly */
+ uint64_t tx_eager_bytes;
+ /** Messages transmitted using expected TID mechanism */
+ uint64_t tx_rndv_num;
+ /** Bytes transmitted using expected TID mechanism */
+ uint64_t tx_rndv_bytes;
+ /** Messages transmitted (shm only) */
+ uint64_t tx_shm_num;
+ /** Messages received through shm */
+ uint64_t rx_shm_num;
+
+ /** Number of system buffers allocated */
+ uint64_t rx_sysbuf_num;
+ /** Bytes allocated for system buffers */
+ uint64_t rx_sysbuf_bytes;
+
+ /** Internally reserved for future use */
+ uint64_t _reserved[16];
+};
+
+#define PSM2_MQ_NUM_STATS 13 /**< How many stats are currently used in @ref psm2_mq_stats */
+
+/*! @see psm2_mq_stats */
+ typedef struct psm2_mq_stats psm2_mq_stats_t;
+
+/** @brief Retrieve statistics from an instantiated MQ */
+ void
+ psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats);
+
+/*! @} */
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm_am.c b/psm_am.c
new file mode 100644
index 0000000..df193da
--- /dev/null
+++ b/psm_am.c
@@ -0,0 +1,269 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+ and copied out in __psm2_am_get_parameters. When debugging is enabled,
+ various assertions reference these parameters for sanity checking. */
+struct psm2_am_parameters psmi_am_parameters = { 0 };
+
+static int _ignore_handler(PSMI_AM_ARGS_DEFAULT)
+{
+ return 0;
+}
+
+int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT)
+{
+ abort();
+ return 0;
+}
+
+static void psmi_am_min_parameters(struct psm2_am_parameters *dest,
+ struct psm2_am_parameters *src)
+{
+ dest->max_handlers = min(dest->max_handlers, src->max_handlers);
+ dest->max_nargs = min(dest->max_nargs, src->max_nargs);
+ dest->max_request_short =
+ min(dest->max_request_short, src->max_request_short);
+ dest->max_reply_short =
+ min(dest->max_reply_short, src->max_reply_short);
+}
+
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep)
+{
+ int i;
+ psm2_am_handler_fn_t *am_htable;
+ struct psm2_am_parameters params;
+
+ psmi_am_parameters.max_handlers = INT_MAX;
+ psmi_am_parameters.max_nargs = INT_MAX;
+ psmi_am_parameters.max_request_short = INT_MAX;
+ psmi_am_parameters.max_reply_short = INT_MAX;
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ ep->ptl_self.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ ep->ptl_ips.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ ep->ptl_amsh.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ ep->am_htable =
+ psmi_malloc(ep, UNDEFINED,
+ sizeof(psm2_am_handler_fn_t) * PSMI_AM_NUM_HANDLERS);
+ if (ep->am_htable == NULL)
+ return PSM2_NO_MEMORY;
+
+ am_htable = (psm2_am_handler_fn_t *) ep->am_htable;
+ for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++)
+ am_htable[i] = _ignore_handler;
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+__psm2_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx)
+{
+ int i, j;
+
+ PSM2_LOG_MSG("entering");
+ /* For now just assign any free one */
+ for (i = 0, j = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
+ if (ep->am_htable[i] == _ignore_handler) {
+ ep->am_htable[i] = handlers[j];
+ handlers_idx[j] = i;
+ if (++j == num_handlers) /* all registered */
+ break;
+ }
+ }
+
+ if (j < num_handlers) {
+ /* Not enough free handlers, restore unused handlers */
+ for (i = 0; i < j; i++)
+ ep->am_htable[handlers_idx[i]] = _ignore_handler;
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
+ "Insufficient "
+ "available AM handlers: registered %d of %d requested handlers",
+ j, num_handlers);
+ }
+ else {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+}
+PSMI_API_DECL(psm2_am_register_handlers)
+
+psm2_error_t
+__psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ psm2_error_t err;
+ ptl_ctl_t *ptlc = epaddr->ptlctl;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(epaddr != NULL);
+ psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+ psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+ psmi_assert(nargs > 0 ? args != NULL : 1);
+ psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short);
+ psmi_assert(len > 0 ? src != NULL : 1);
+
+ PSMI_LOCK(ptlc->ep->mq->progress_lock);
+
+ err = ptlc->am_short_request(epaddr, handler, args,
+ nargs, src, len, flags, completion_fn,
+ completion_ctxt);
+ PSMI_UNLOCK(ptlc->ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_am_request_short)
+
+psm2_error_t
+__psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ psm2_error_t err;
+ struct psmi_am_token *tok;
+ psm2_epaddr_t epaddr;
+ ptl_ctl_t *ptlc;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_always(token != NULL);
+ psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+ psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+ psmi_assert(nargs > 0 ? args != NULL : 1);
+ psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short);
+ psmi_assert(len > 0 ? src != NULL : 1);
+
+ tok = (struct psmi_am_token *)token;
+ epaddr = tok->epaddr_incoming;
+ ptlc = epaddr->ptlctl;
+
+ /* No locking here since we are already within handler context and already
+ * locked */
+
+ err = ptlc->am_short_reply(token, handler, args,
+ nargs, src, len, flags, completion_fn,
+ completion_ctxt);
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_am_reply_short)
+
+psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out)
+{
+ struct psmi_am_token *tok;
+
+ PSM2_LOG_MSG("entering");
+ if (token == NULL || epaddr_out == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid %s parameters", __FUNCTION__);
+ }
+
+ tok = (struct psmi_am_token *)token;
+ *epaddr_out = tok->epaddr_incoming;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_source)
+
+psm2_error_t
+__psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out)
+{
+ size_t s;
+
+ PSM2_LOG_MSG("entering");
+ if (parameters == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid %s parameters", __FUNCTION__);
+ }
+
+ memset(parameters, 0, sizeof_parameters_in);
+ s = min(sizeof(psmi_am_parameters), sizeof_parameters_in);
+ memcpy(parameters, &psmi_am_parameters, s);
+ *sizeof_parameters_out = s;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_parameters)
diff --git a/psm_am_internal.h b/psm_am_internal.h
new file mode 100644
index 0000000..29edfb8
--- /dev/null
+++ b/psm_am_internal.h
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSM2_AM_INTERNAL_H
+#define _PSM2_AM_INTERNAL_H
+
+#define PSMI_AM_MAX_ARGS 10
+#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */
+
+#define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token, \
+ psm2_amarg_t *args, int nargs, \
+ void *src, uint32_t len
+
/* Per-message token handed to active-message handlers.  PTLs embed this
 * at the start of their own larger token structures (see trailing note),
 * so a psm2_am_token_t can always be cast to struct psmi_am_token *. */
struct psmi_am_token {
	psm2_epaddr_t epaddr_incoming;	/* endpoint address of the sender */
	uint32_t flags;
	/* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */
	uint32_t can_reply;

	/* PTLs may add other stuff here */
};
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+ and copied out in __psm2_am_get_parameters. When debugging is enabled,
+ various assertions reference these parameters for sanity checking. */
+extern struct psm2_am_parameters psmi_am_parameters;
+
/* Look up the handler function registered at handler_idx on endpoint ep.
 * The index is masked to the table size (PSMI_AM_NUM_HANDLERS is a power
 * of two), so out-of-range indices wrap instead of overrunning the table. */
PSMI_ALWAYS_INLINE(psm2_am_handler_fn_t
		   psm_am_get_handler_function(psm2_ep_t ep,
					       psm2_handler_t handler_idx))
{
	int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1);
	psm2_am_handler_fn_t fn = (psm2_am_handler_fn_t) ep->am_htable[hidx];
	psmi_assert_always(fn != NULL);	/* slots are expected to be populated */
	return fn;
}
+
+/* PSM internal initialization */
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep);
+
+#endif
diff --git a/psm_context.c b/psm_context.c
new file mode 100644
index 0000000..21bc893
--- /dev/null
+++ b/psm_context.c
@@ -0,0 +1,817 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "psm_user.h"
+
+#define HFI_USERINIT_RETRY_MAX 3
+#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1
+ustatic int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid);
+MOCK_DCL_EPILOGUE(psmi_sharedcontext_params);
+static int psmi_get_hfi_selection_algorithm(void);
+ustatic psm2_error_t psmi_init_userinfo_params(psm2_ep_t ep,
+ int unit_id,
+ psm2_uuid_t const unique_job_key,
+ struct hfi1_user_info_dep *user_info);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable)
+{
+ int poll_type;
+ int ret;
+
+ if ((enable && (context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)) ||
+ (!enable && !(context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)))
+ return PSM2_OK;
+
+ if (enable)
+ poll_type = HFI1_POLL_TYPE_URGENT;
+ else
+ poll_type = 0;
+
+ ret = hfi_poll_type(context->ctrl, poll_type);
+
+ if (ret != 0)
+ return PSM2_EP_NO_RESOURCES;
+ else {
+ if (enable)
+ context->runtime_flags |= PSMI_RUNTIME_INTR_ENABLED;
+ else
+ context->runtime_flags &= ~PSMI_RUNTIME_INTR_ENABLED;
+
+ return PSM2_OK;
+ }
+}
+
+int psmi_context_interrupt_isenabled(psmi_context_t *context)
+{
+ return context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED;
+}
+
+/* Returns 1 when all of the active units have their free contexts
+ * equal the number of contexts. This is an indication that no
+ * jobs are currently running.
+ *
 * Note that this code is clearly racy: two or more processes may run it
 * concurrently, and this observation happens earlier in time than the
 * decision about which context to assign, which in turn happens earlier
 * than the moment the context is actually assigned. Once a context is
 * finally assigned, the "nfreectxts" value observed below changes.)
+ */
static int psmi_all_active_units_have_max_freecontexts(int nunits)
{
	int unit;

	for (unit = 0; unit < nunits; unit++) {
		int64_t free_ctxts = 0, total_ctxts = 0;

		/* Only active units are considered. */
		if (hfi_get_unit_active(unit) <= 0)
			continue;

		/* When both sysfs counters read successfully and disagree,
		 * at least one context on this unit is already in use. */
		if (!hfi_sysfs_unit_read_s64(unit, "nctxts", &total_ctxts, 0) &&
		    !hfi_sysfs_unit_read_s64(unit, "nfreectxts", &free_ctxts, 0) &&
		    free_ctxts != total_ctxts)
			return 0;
	}
	return 1;
}
+
/* Returns the integer value of an environment variable, or 0 if the
 * variable is unset, empty, non-numeric, negative, or out of int range. */
static int psmi_get_envvar(const char *env)
{
	const char *env_val = getenv(env);
	char *end;
	long r;

	if (env_val == NULL || *env_val == '\0')
		return 0;

	/* Use strtol instead of atoi so overflow and garbage input are
	 * detected instead of yielding undefined/implementation-defined
	 * results.  Like atoi, trailing non-digits are tolerated. */
	errno = 0;
	r = strtol(env_val, &end, 10);
	if (end == env_val || errno == ERANGE || r < 0 || r > INT_MAX)
		return 0;
	return (int)r;
}
+
+/* returns the 8-bit hash value of an uuid. */
+static inline
+uint8_t
+psmi_get_uuid_hash(psm2_uuid_t const uuid)
+{
+ int i;
+ uint8_t hashed_uuid = 0;
+
+ for (i=0; i < sizeof(psm2_uuid_t); ++i)
+ hashed_uuid ^= *((uint8_t const *)uuid + i);
+
+ return hashed_uuid;
+}
+
/* Returns the NUMA node id of the CPU the calling thread is currently
 * running on, or -EINVAL if either the CPU or its node cannot be
 * determined. */
int psmi_get_current_proc_location()
{
	int core_id, node_id;

	/* CPU this thread last ran on (may change after the call returns). */
	core_id = sched_getcpu();
	if (core_id < 0)
		return -EINVAL;

	/* Map the CPU to its NUMA node via libnuma. */
	node_id = numa_node_of_cpu(core_id);
	if (node_id < 0)
		return -EINVAL;

	return node_id;
}
+
/* Compute the inclusive [*unit_start, *unit_end] search window used when
 * spreading HFI selection across units.  With a single local rank and all
 * active units fully free, the whole unit range is searched in order;
 * otherwise the start unit is derived from the local rank id plus a hash
 * of the job key (mod nunits), and unit_end wraps to the unit just before
 * the start so the caller's wrap-around loop covers every unit. */
static void
psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start,
			  long *unit_end, int nunits)
{
	/* if the number of ranks on the host is 1 and ... */
	if ((psmi_get_envvar("MPI_LOCALNRANKS") == 1) &&
	    /*
	     * All of the active units have free contexts equal the
	     * number of contexts.
	     */
	    psmi_all_active_units_have_max_freecontexts(nunits)) {
		/* we start looking at unit 0, and end at nunits-1: */
		*unit_start = 0;
		*unit_end = nunits - 1;
	} else {
		/* else, we are going to look at:
		   (a hash of the job key plus the local rank id) mod nunits. */

		*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
			       psmi_get_uuid_hash(job_key)) % nunits;
		if (*unit_start > 0)
			*unit_end = *unit_start - 1;
		else
			*unit_end = nunits-1;
	}
}
+
/* Determine the range of unit ids [*unit_start, *unit_end] that
 * psmi_context_open() will probe (with wrap-around) when opening a
 * context, and record the selection algorithm in context->user_info.
 *
 * If the user pinned a unit via HFI_UNIT (unit_param >= 0) the range is
 * that single unit.  Otherwise, for round-robin with multiple active
 * units we first try to restrict selection to active units on the
 * caller's own NUMA node, spreading ranks across them; failing that (or
 * for round-robin-all) psmi_spread_hfi_selection() picks the window.
 *
 * Returns PSM2_OK, or PSM2_EP_DEVICE_FAILURE for a negative, non-ANY
 * unit_param. */
static
psm2_error_t
psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
				int nunitsactive,int nunits,psm2_uuid_t const job_key,
				long *unit_start,long *unit_end)
{
	int node_id, unit_id, found = 0;
	int saved_hfis[nunits];	/* VLA: active units on our NUMA node */
	context->user_info.hfi1_alg = HFI1_ALG_ACROSS;
	/* if the user did not set HFI_UNIT then ... */
	if (unit_param == HFI_UNIT_ID_ANY)
	{
		/* Get the actual selection algorithm from the environment: */
		context->user_info.hfi1_alg = psmi_get_hfi_selection_algorithm();
		/* If round-robin is selection algorithm and ... */
		if ((context->user_info.hfi1_alg == HFI1_ALG_ACROSS) &&
		    /* there are more than 1 active units then ... */
		    (nunitsactive > 1))
		{
			/*
			 * Pick first HFI we find on same root complex
			 * as current task. If none found, fall back to
			 * load-balancing algorithm.
			 */
			node_id = psmi_get_current_proc_location();
			if (node_id >= 0) {
				for (unit_id = 0; unit_id < nunits; unit_id++) {
					if (hfi_get_unit_active(unit_id) <= 0)
						continue;

					if (hfi_sysfs_unit_read_node_s64(unit_id) == node_id) {
						saved_hfis[found] = unit_id;
						found++;
						_HFI_VDBG("Picking unit: %d for current task"
							  " which is on node:%d\n", unit_id, node_id);
					}
				}

				/*
				 * Spread HFI selection between units if
				 * we find more than one within a socket.
				 */
				if (found > 1) {
					/* index into saved_hfis by rank/job hash... */
					*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
						       psmi_get_uuid_hash(job_key)) % found;

					/* ...then collapse the window to that one unit. */
					*unit_start = *unit_end = saved_hfis[*unit_start];
				} else if (found == 1) {
					*unit_start = *unit_end = saved_hfis[0];
				}
			}

			/* No NUMA info or no local unit: spread over all units. */
			if (node_id < 0 || !found) {
				psmi_spread_hfi_selection(job_key, unit_start,
							  unit_end, nunits);
			}
		} else if ((context->user_info.hfi1_alg == HFI1_ALG_ACROSS_ALL) &&
			   (nunitsactive > 1)) {
			psmi_spread_hfi_selection(job_key, unit_start,
						  unit_end, nunits);
		}
		else {
			*unit_start = 0;
			*unit_end = nunits - 1;
		}
	} else if (unit_param >= 0) {
		/* the user specified HFI_UNIT, we use it. */
		*unit_start = *unit_end = unit_param;
	} else {
		psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				  "PSM2 can't open unit: %ld for reading and writing",
				  unit_param);
		return PSM2_EP_DEVICE_FAILURE;
	}

	return PSM2_OK;
}
+
/*
 * Open a driver context for endpoint 'ep'.
 *
 * Chooses a unit search window via psmi_compute_start_and_end_unit(),
 * then walks that window (wrapping modulo nunits) trying to open each
 * active unit and assign a context with hfi_userinit(), retrying the
 * userinit up to HFI_USERINIT_RETRY_MAX times per unit.  On success it
 * fills in 'context', records the endpoint's LID/GID/MTU identity, and
 * constructs the endpoint epid for the configured epid version.
 *
 * unit_param: HFI_UNIT_ID_ANY or an explicit unit id (from HFI_UNIT).
 * timeout_ns: open timeout, converted to milliseconds for the driver.
 *
 * Returns PSM2_OK or a PSM2_EP_* error code.
 */
psm2_error_t
psmi_context_open(const psm2_ep_t ep, long unit_param, long port,
		  psm2_uuid_t const job_key, int64_t timeout_ns,
		  psmi_context_t *context)
{
	long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev;
	int lid, sc, vl;
	uint64_t gid_hi, gid_lo;
	char dev_name[MAXPATHLEN];
	psm2_error_t err = PSM2_OK;
	uint32_t hfi_type;
	int nunits = hfi_get_num_units(), nunitsactive=0;

	/*
	 * If shared contexts are enabled, try our best to schedule processes
	 * across one or many devices
	 */

	/* if no units, then no joy. */
	if (nunits <= 0)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 no hfi units are available");
		goto ret;
	}

	/* Calculate the number of active units: */
	for (unit_id=0;unit_id < nunits;unit_id++)
	{
		if (hfi_get_unit_active(unit_id) > 0)
			nunitsactive++;
	}
	/* if no active units, then no joy. */
	if (nunitsactive == 0)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 no hfi units are active");
		goto ret;
	}
	if (timeout_ns > 0)
		open_timeout = (long)(timeout_ns / MSEC_ULL);


	unit_start = 0; unit_end = nunits - 1;
	err = psmi_compute_start_and_end_unit(context, unit_param,
					      nunitsactive, nunits,
					      job_key,
					      &unit_start, &unit_end);
	if (err != PSM2_OK)
		return err;

	/* this is the start of a loop that starts at unit_start and goes to unit_end.
	   but note that the way the loop computes the loop control variable is by
	   an expression involving the mod operator. */
	context->fd = -1;
	context->ctrl = NULL;
	unit_id_prev = unit_id = unit_start;
	do
	{
		/* close previous opened unit fd before attempting open of current unit. */
		if (context->fd > 0)
		{
			hfi_context_close(context->fd);
			context->fd = -1;
		}

		/* if the unit_id is not active, go to next one. */
		if (hfi_get_unit_active(unit_id) <= 0) {
			unit_id_prev = unit_id;
			unit_id = (unit_id + 1) % nunits;
			continue;
		}

		/* open this unit. */
		context->fd = hfi_context_open_ex(unit_id, port, open_timeout,
						  dev_name, sizeof(dev_name));

		/* go to next unit if failed to open. */
		if (context->fd == -1) {
			unit_id_prev = unit_id;
			unit_id = (unit_id + 1) % nunits;
			continue;
		}

		/* collect the userinfo params. */
		if ((err = psmi_init_userinfo_params(ep,
						     (int)unit_id, job_key,
						     &context->user_info)))
			goto bail;

		/* attempt to assign the context via hfi_userinit() */
		int retry = 0;
		do {
			if (retry > 0)
				_HFI_INFO("hfi_userinit: failed, trying again (%d/%d)\n",
					  retry, HFI_USERINIT_RETRY_MAX);
			context->ctrl = hfi_userinit(context->fd, &context->user_info);
		} while (context->ctrl == NULL && ++retry <= HFI_USERINIT_RETRY_MAX);
		unit_id_prev = unit_id;
		unit_id = (unit_id + 1) % nunits;
	} while (unit_id_prev != unit_end && context->ctrl == NULL);

	/* no unit in the window yielded a context: give up. */
	if (context->ctrl == NULL)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 can't open hfi unit: %ld",unit_param);
		goto ret;
	}
	_HFI_VDBG("hfi_userinit() passed.\n");

	/* Record the fabric identity (LID + GID) of the chosen port. */
	if ((lid = hfi_get_port_lid(context->ctrl->__hfi_unit,
				    context->ctrl->__hfi_port)) <= 0) {
		err = psmi_handle_error(NULL,
					PSM2_EP_DEVICE_FAILURE,
					"Can't get HFI LID in psm2_ep_open: is SMA running?");
		goto bail;
	}
	if (hfi_get_port_gid(context->ctrl->__hfi_unit,
			     context->ctrl->__hfi_port, &gid_hi,
			     &gid_lo) == -1) {
		err =
		    psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				      "Can't get HFI GID in psm2_ep_open: is SMA running?");
		goto bail;
	}
	ep->unit_id = context->ctrl->__hfi_unit;
	ep->portnum = context->ctrl->__hfi_port;
	ep->gid_hi = gid_hi;
	ep->gid_lo = gid_lo;

	context->ep = (psm2_ep_t) ep;
	context->runtime_flags = context->ctrl->ctxt_info.runtime_flags;

#ifdef PSM_CUDA
	/* Check backward compatibility bits here and save the info */
	if (context->ctrl->ctxt_info.runtime_flags & HFI1_CAP_GPUDIRECT_OT)
		is_driver_gpudirect_enabled = 1;
#endif

	/* Get type of hfi assigned to context */
	hfi_type = psmi_get_hfi_type(context);

	/* Endpoint out_sl contains the default SL to use for this endpoint. */
	/* Get the MTU for this SL. */
	if ((sc = hfi_get_port_sl2sc(ep->unit_id,
				     context->ctrl->__hfi_port,
				     ep->out_sl)) < 0) {
		sc = PSMI_SC_DEFAULT;
	}
	if ((vl = hfi_get_port_sc2vl(ep->unit_id,
				     context->ctrl->__hfi_port, sc)) < 0) {
		vl = PSMI_VL_DEFAULT;
	}
	if (sc == PSMI_SC_ADMIN || vl == PSMI_VL_ADMIN) {
		err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
					"Invalid sl: %d, please specify correct sl via HFI_SL",
					ep->out_sl);
		goto bail;
	}

	if ((ep->mtu = hfi_get_port_vl2mtu(ep->unit_id,
					   context->ctrl->__hfi_port,
					   vl)) < 0) {
		err =
		    psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				      "Can't get MTU for VL %d", vl);
		goto bail;
	}

	/* Construct epid for this Endpoint */
	switch (PSMI_EPID_VERSION) {
	case PSMI_EPID_V1:
		context->epid = PSMI_EPID_PACK_V1(lid, context->ctrl->ctxt_info.ctxt,
						  context->ctrl->ctxt_info.subctxt,
						  context->ctrl->__hfi_unit,
						  PSMI_EPID_VERSION, 0x3ffffff);
		break;
	case PSMI_EPID_V2:
		context->epid = PSMI_EPID_PACK_V2(lid, context->ctrl->ctxt_info.ctxt,
						  context->ctrl->ctxt_info.subctxt,
						  PSMI_EPID_IPS_SHM, /* Not an only-shm epid */
						  PSMI_EPID_VERSION, ep->gid_hi);
		break;
	default:
		/* Epid version is greater than max supported version. */
		psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
		break;
	}


	_HFI_VDBG
	    ("construct epid: lid %d ctxt %d subctxt %d hcatype %d mtu %d\n",
	     lid, context->ctrl->ctxt_info.ctxt,
	     context->ctrl->ctxt_info.subctxt, hfi_type, ep->mtu);

	goto ret;

bail:
	_HFI_PRDBG("%s open failed: %d (%s)\n", dev_name, err, strerror(errno));
	if (context->fd != -1) {
		hfi_context_close(context->fd);
		context->fd = -1;
	}
ret:

	_HFI_VDBG("psmi_context_open() return %d\n", err);
	return err;
}
+
+psm2_error_t psmi_context_close(psmi_context_t *context)
+{
+ if (context->fd >= 0) {
+ hfi_context_close(context->fd);
+ context->fd = -1;
+ }
+ return PSM2_OK;
+}
+
+/*
+ * This function works whether a context is initialized or not in a psm2_ep.
+ *
+ * Returns one of
+ *
+ * PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM2_OK_NO_PROGRESS: Cable pulled
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status
+ * As of 7322-ready driver, need to check port-specific qword for IB
+ * as well as older unit-only. For now, we don't have the port interface
+ * defined, so just check port 0 qword for spi_status
+ */
psm2_error_t psmi_context_check_status(const psmi_context_t *contexti)
{
	psm2_error_t err = PSM2_OK;
	/* cast away const: we update the error-latch fields below */
	psmi_context_t *context = (psmi_context_t *) contexti;
	struct hfi1_status *status =
	    (struct hfi1_status *)context->ctrl->base_info.status_bufbase;
	char *errmsg = NULL;

	/* Fatal chip-related errors */
	if (!(status->dev & HFI1_STATUS_CHIP_PRESENT) ||
	    !(status->dev & HFI1_STATUS_INITTED) ||
	    (status->dev & HFI1_STATUS_HWERROR)) {

		err = PSM2_EP_DEVICE_FAILURE;
		if (err != context->status_lasterr) {	/* report once */
			/* driver-provided freeze message, preferred when set */
			volatile char *errmsg_sp =
			    (volatile char *)status->freezemsg;
			if (*errmsg_sp)
				psmi_handle_error(context->ep, err,
						  "Hardware problem: %s",
						  errmsg_sp);
			else {
				if (status->dev & HFI1_STATUS_HWERROR)
					errmsg = "Hardware error";
				else
					errmsg = "Hardware not found";

				psmi_handle_error(context->ep, err,
						  "%s", errmsg);
			}
		}
	}
	/* Fatal network-related errors with timeout: */
	else if (!(status->port & HFI1_STATUS_IB_CONF) ||
		 !(status->port & HFI1_STATUS_IB_READY)) {
		err = PSM2_EP_NO_NETWORK;
		if (err != context->status_lasterr) {	/* report once */
			/* first detection: only timestamp the outage */
			context->networkLostTime = time(NULL);
		}
		else
		{
			/* outage already seen: report only after it has
			 * persisted longer than a full link-bringup window */
			time_t now = time(NULL);
			static const double seventySeconds = 70.0;

			/* The linkup time duration for a system should allow the time needed
			   to complete 3 LNI passes which is:
			   50 seconds for a passive copper channel
			   65 seconds for optical channel.
			   (we add 5 seconds of margin.) */
			if (difftime(now,context->networkLostTime) > seventySeconds)
			{
				volatile char *errmsg_sp =
				    (volatile char *)status->freezemsg;

				psmi_handle_error(context->ep, err, "%s",
						  *errmsg_sp ? errmsg_sp :
						  "Network down");
			}
		}
	}

	if (err == PSM2_OK && context->status_lasterr != PSM2_OK)
		context->status_lasterr = PSM2_OK;	/* clear error */
	else if (err != PSM2_OK)
		context->status_lasterr = err;	/* record error */

	return err;
}
+
/*
 * Prepare user_info params for driver open; used only in psmi_context_open.
 *
 * Fills in userversion and the job uuid unconditionally.  When context
 * sharing is enabled, also computes subctxt_id / subctxt_cnt from the
 * local rank hints, the available context count, and the PSM2_* env
 * overrides.  Rank/enable state is cached in function-statics so all
 * rails of a process agree; subcontext_id_start advances per rail so a
 * new rail never reuses the previous rail's subcontext id.
 */
ustatic
psm2_error_t
psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
			  psm2_uuid_t const unique_job_key,
			  struct hfi1_user_info_dep *user_info)
{
	/* static variables, shared among rails */
	static int shcontexts_enabled = -1, rankid, nranks;

	int avail_contexts = 0, max_contexts, ask_contexts;
	int ranks_per_context = 0;
	psm2_error_t err = PSM2_OK;
	union psmi_envvar_val env_maxctxt, env_ranks_per_context;
	static int subcontext_id_start;

	memset(user_info, 0, sizeof(*user_info));
	user_info->userversion = HFI1_USER_SWMINOR|(hfi_get_user_major_version()<<HFI1_SWMAJOR_SHIFT);

	user_info->subctxt_id = 0;
	user_info->subctxt_cnt = 0;
	memcpy(user_info->uuid, unique_job_key, sizeof(user_info->uuid));

	/* probe the launcher environment exactly once per process */
	if (shcontexts_enabled == -1) {
		shcontexts_enabled =
		    psmi_sharedcontext_params(&nranks, &rankid);
	}
	if (!shcontexts_enabled)
		return err;

	avail_contexts = hfi_get_num_contexts(unit_id);

	if (avail_contexts == 0) {
		err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
					"PSM2 found 0 available contexts on opa device(s).");
		goto fail;
	}

	/* See if the user wants finer control over context assignments */
	if (!psmi_getenv("PSM2_MAX_CONTEXTS_PER_JOB",
			 "Maximum number of contexts for this PSM2 job",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
	} else if (!psmi_getenv("PSM2_SHAREDCONTEXTS_MAX",
				"Maximum number of contexts for this PSM2 job",
				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
				(union psmi_envvar_val)avail_contexts, &env_maxctxt)) {

		_HFI_INFO
		    ("This env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in future.\n");

		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
	} else
		ask_contexts = max_contexts = avail_contexts;

	if (!psmi_getenv("PSM2_RANKS_PER_CONTEXT",
			 "Number of ranks per context",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)1, &env_ranks_per_context)) {
		ranks_per_context = max(env_ranks_per_context.e_int, 1);
		ranks_per_context = min(ranks_per_context, HFI1_MAX_SHARED_CTXTS);
	}

	/*
	 * See if we could get a valid ppn. If not, approximate it to be the
	 * number of cores.
	 */
	if (nranks == -1) {
		long nproc = sysconf(_SC_NPROCESSORS_ONLN);
		if (nproc < 1)
			nranks = 1;
		else
			nranks = nproc;
	}

	/*
	 * Make sure that our guesses are good educated guesses
	 */
	if (rankid >= nranks) {
		_HFI_PRDBG
		    ("PSM2_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n",
		     rankid, nranks);
		/* NOTE: err is still PSM2_OK here, so sharing is silently
		 * disabled rather than treated as a failure. */
		goto fail;
	}

	if (ranks_per_context) {
		int contexts =
		    (nranks + ranks_per_context - 1) / ranks_per_context;
		if (contexts > ask_contexts) {
			err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
						"Incompatible settings for "
						"(PSM2_SHAREDCONTEXTS_MAX / PSM2_MAX_CONTEXTS_PER_JOB) and PSM2_RANKS_PER_CONTEXT");
			goto fail;
		}
		ask_contexts = contexts;
	}

	/* group id based on total groups and local rank id */
	user_info->subctxt_id = subcontext_id_start + rankid % ask_contexts;
	/* this is for multi-rail, when we setup a new rail,
	 * we can not use the same subcontext ID as the previous
	 * rail, otherwise, the driver will match previous rail
	 * and fail.
	 */
	subcontext_id_start += ask_contexts;

	/* Need to compute with how many *other* peers we will be sharing the
	 * context */
	if (nranks > ask_contexts) {
		user_info->subctxt_cnt = nranks / ask_contexts;
		/* If ppn != multiple of contexts, some contexts get an uneven
		 * number of subcontexts */
		if (nranks % ask_contexts > rankid % ask_contexts)
			user_info->subctxt_cnt++;
		/* The case of 1 process "sharing" a context (giving 1 subcontext)
		 * is supported by the driver and PSM. However, there is no
		 * need to share in this case so disable context sharing. */
		if (user_info->subctxt_cnt == 1)
			user_info->subctxt_cnt = 0;
		if (user_info->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) {
			err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
						"Calculation of subcontext count exceeded maximum supported");
			goto fail;
		}
	}
	/* else subcontext_cnt remains 0 and context sharing is disabled. */

	_HFI_PRDBG("PSM2_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d,"
		   "max_contexts=%d,ask_contexts=%d,"
		   "ranks_per_context=%d,id=%u,cnt=%u\n",
		   rankid, nranks, avail_contexts, max_contexts,
		   ask_contexts, ranks_per_context,
		   user_info->subctxt_id, user_info->subctxt_cnt);
fail:
	return err;
}
+
/* Determine whether context sharing is enabled and, if so, pick up the
 * local rank id and local rank count hinted by the launcher environment
 * (MPI_LOCALRANKID/MPI_LOCALNRANKS preferred, legacy PSC_MPI_* accepted).
 *
 * Returns 1 when sharing is enabled, with *nranks/*rankid filled in
 * (left at -1 when no hints are present); returns 0 when sharing is
 * disabled (multi-EP mode, PSM2_SHAREDCONTEXTS=no, or no usable hints). */
ustatic
int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid)
{
	union psmi_envvar_val enable_shcontexts;
	char *ppn_env = NULL, *lrank_env = NULL, *c;

	*rankid = -1;
	*nranks = -1;

#if 0
	/* DEBUG: Used to selectively test possible shared context and shm-only
	 * settings */
	unsetenv("PSC_MPI_NODE_RANK");
	unsetenv("PSC_MPI_PPN");
	unsetenv("MPI_LOCALRANKID");
	unsetenv("MPI_LOCALRANKS");
#endif

	/* We do not support context sharing for multiple endpoints */
	if (psmi_multi_ep_enabled) {
		return 0;
	}

	/* New name in 2.0.1, keep observing old name */
	psmi_getenv("PSM2_SHAREDCONTEXTS", "Enable shared contexts",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
		    (union psmi_envvar_val)
		    PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
		    &enable_shcontexts);
	if (!enable_shcontexts.e_int)
		return 0;

	/* We support two types of syntaxes to let users give us a hint what
	 * our local rankid is. Moving towards MPI_, but still support PSC_ */
	if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') {
		lrank_env = "MPI_LOCALRANKID";
		ppn_env = "MPI_LOCALNRANKS";
	} else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') {
		ppn_env = "PSC_MPI_PPN";
		lrank_env = "PSC_MPI_NODE_RANK";
	}

	if (ppn_env != NULL && lrank_env != NULL) {
		union psmi_envvar_val env_rankid, env_nranks;

		psmi_getenv(lrank_env, "Shared context rankid",
			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
			    (union psmi_envvar_val)-1, &env_rankid);

		psmi_getenv(ppn_env, "Shared context numranks",
			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
			    (union psmi_envvar_val)-1, &env_nranks);

		*rankid = env_rankid.e_int;
		*nranks = env_nranks.e_int;

		return 1;
	} else
		return 0;
}
MOCK_DEF_EPILOGUE(psmi_sharedcontext_params);
+
+static
+int psmi_get_hfi_selection_algorithm(void)
+{
+ union psmi_envvar_val env_hfi1_alg;
+ int hfi1_alg = HFI1_ALG_ACROSS;
+
+ /* If a specific unit is set in the environment, use that one. */
+ psmi_getenv("HFI_SELECTION_ALG",
+ "HFI Device Selection Algorithm to use. Round Robin (Default) "
+ ", Packed or Round Robin All.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"Round Robin", &env_hfi1_alg);
+
+ if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin"))
+ hfi1_alg = HFI1_ALG_ACROSS;
+ else if (!strcasecmp(env_hfi1_alg.e_str, "Packed"))
+ hfi1_alg = HFI1_ALG_WITHIN;
+ else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All"))
+ hfi1_alg = HFI1_ALG_ACROSS_ALL;
+ else {
+ _HFI_ERROR
+ ("Unknown HFI selection algorithm %s. Defaulting to Round Robin "
+ "allocation of HFIs.\n", env_hfi1_alg.e_str);
+ hfi1_alg = HFI1_ALG_ACROSS;
+ }
+
+ return hfi1_alg;
+}
diff --git a/psm_context.h b/psm_context.h
new file mode 100644
index 0000000..fe2aec7
--- /dev/null
+++ b/psm_context.h
@@ -0,0 +1,102 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_context.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_CONTEXT_H
+#define _PSM_CONTEXT_H
+
+typedef
+struct psmi_context {
+ struct _hfi_ctrl *ctrl; /* driver opaque hfi_proto */
+ void *spio_ctrl;
+ void *tid_ctrl;
+ void *tf_ctrl;
+
+ int fd; /* driver fd */
+ psm2_ep_t ep; /* psm ep handle */
+ psm2_epid_t epid; /* psm integral ep id */
+ struct hfi1_user_info_dep user_info;
+ uint32_t runtime_flags;
+ uint32_t rcvthread_flags;
+ psm2_error_t status_lasterr;
+ time_t networkLostTime;
+} psmi_context_t;
+
+psm2_error_t
+psmi_context_open(const psm2_ep_t ep, long unit_id, long port,
+ psm2_uuid_t const job_key,
+ int64_t timeout_ns, psmi_context_t *context);
+
+psm2_error_t psmi_context_close(psmi_context_t *context);
+
+/* Check status of context */
+psm2_error_t psmi_context_check_status(const psmi_context_t *context);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable);
+int psmi_context_interrupt_isenabled(psmi_context_t *context);
+
+/* Runtime flags describe what features are enabled in hw/sw and which
+ * corresponding PSM features are being used.
+ *
+ * Hi 16 bits are PSM options
+ * Lo 16 bits are HFI_RUNTIME options copied from (hfi_common.h)
+ */
+#define PSMI_RUNTIME_RCVTHREAD 0x80000000
+#define PSMI_RUNTIME_INTR_ENABLED 0x40000000
+
+#endif /* PSM_CONTEXT_H */
diff --git a/psm_diags.c b/psm_diags.c
new file mode 100644
index 0000000..2a43c22
--- /dev/null
+++ b/psm_diags.c
@@ -0,0 +1,362 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/* Signature shared by every memcpy-style routine under test. */
+typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n);
+static int psmi_test_memcpy(memcpy_fn_t, const char *name);
+static int psmi_test_epid_table(int numelems);
+
+int psmi_diags(void);
+
+/* Log a failed check and jump to the enclosing function's local
+ * "fail" label (every caller must provide one). */
+#define diags_assert(x)	do {					\
+	    if (!(x))  {					\
+		_HFI_ERROR("Diags assertion failure: %s\n",	\
+		    #x);					\
+		goto fail;					\
+	    }							\
+	} while (0)
+
+/* Report a verdict and return 0 (pass) / 1 (fail) from the caller. */
+#define DIAGS_RETURN_PASS(str)						\
+	do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; }	\
+	while (0)
+#define DIAGS_RETURN_FAIL(str)						\
+	do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; }	\
+	while (0)
+
+/*
+ * Top-level diagnostics driver: run every self-test and report a
+ * single verdict.  Returns 0 when all tests pass, 1 otherwise.
+ */
+int psmi_diags(void)
+{
+	int failures = 0;
+
+	failures |= psmi_test_epid_table(2048);
+	failures |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo");
+	/* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */
+
+	if (!failures)
+		DIAGS_RETURN_PASS("");
+	DIAGS_RETURN_FAIL("");
+}
+
+/*
+ * Hash table test
+ *
+ * Builds numelems fake endpoint addresses, inserts them into the
+ * global psmi_epid_table, then exercises lookup/remove in randomized
+ * order and checks the table ends up empty and within its load factor.
+ * Returns 0 on pass, 1 on fail.
+ */
+/* NOTE(review): NALLOC appears unused in this function -- confirm. */
+#define NALLOC 1024
+static int psmi_test_epid_table(int numelems)
+{
+	ptl_ctl_t ctl;
+	psm2_epaddr_t *ep_array, epaddr, ep_alloc;
+	psm2_epid_t *epid_array, epid_tmp;
+	/* Sentinel ep value; never dereferenced as a real endpoint. */
+	psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00;
+	struct psmi_epid_table *tab;
+	int i, j;
+	struct drand48_data drand48_data;
+
+	ep_alloc =
+	    (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					sizeof(struct psm2_epaddr));
+	ep_array =
+	    (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					  sizeof(struct psm2_epaddr *));
+	epid_array =
+	    (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					sizeof(psm2_epid_t));
+	diags_assert(ep_alloc != NULL);
+	diags_assert(ep_array != NULL);
+	diags_assert(epid_array != NULL);
+
+	/* Fixed seed keeps the shuffle deterministic across runs. */
+	srand48_r(12345678, &drand48_data);
+
+	psmi_epid_init();
+	tab = &psmi_epid_table;
+	ctl.ep = ep;
+
+	for (i = 0; i < numelems; i++) {
+		epid_array[i] = i;
+		ep_alloc[i].ptlctl = &ctl;
+		ep_alloc[i].epid = epid_array[i];
+		ep_array[i] = &ep_alloc[i];
+	}
+	for (i = 0; i < numelems; i++) {
+		psmi_epid_add(ep, epid_array[i], ep_array[i]);
+	}
+
+	/* Randomize epid_array */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Lookup. */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+
+	/* Randomize epid_array again */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Delete half */
+	for (i = 0; i < numelems / 2; i++) {
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+	/* Lookup other half -- expect non-NULL, then delete */
+	for (i = numelems / 2; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	/* Lookup whole thing, expect done */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	/* Every slot must be empty or tombstoned after full removal. */
+	for (i = 0; i < tab->tabsize; i++) {
+		diags_assert(tab->table[i].entry == NULL ||
+			     tab->table[i].entry == EPADDR_DELETED);
+	}
+
+	/* Make sure we're not leaking memory somewhere... */
+	diags_assert(tab->tabsize > tab->tabsize_used &&
+		     tab->tabsize * PSMI_EPID_TABLOAD_FACTOR >
+		     tab->tabsize_used);
+
+	/* Only free on success */
+	psmi_epid_fini();
+	psmi_free(epid_array);
+	psmi_free(ep_array);
+	psmi_free(ep_alloc);
+	DIAGS_RETURN_PASS("");
+
+fail:
+	/* Klocwork scan report memory leak. */
+	psmi_epid_fini();
+	if (epid_array)
+		psmi_free(epid_array);
+	if (ep_array)
+		psmi_free(ep_array);
+	if (ep_alloc)
+		psmi_free(ep_alloc);
+	DIAGS_RETURN_FAIL("");
+}
+
+/*
+ * Memcpy correctness test
+ */
+static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n);
+static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n);
+
+/*
+ * Sweep copy sizes (powers of two from lo to hi, plus optional corner
+ * sizes around each) through memcpy_check_size and report a PASS/FAIL
+ * verdict for the given memcpy implementation.
+ * Returns 0 on pass, 1 on fail.
+ */
+static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name)
+{
+	const int CORNERS = 0;	/* 0: test exact powers of two only */
+	const long long lo = 1;
+	const long long hi = 16 * 1024 * 1024;
+	const long long below = 32;
+	const long long above = 32;
+	long long n, m;
+	char buf[128];
+	int ret = 0;
+	int memcpy_passed;
+	int memcpy_failed;
+
+	memcpy_passed = 0;
+	memcpy_failed = 0;
+
+	/* Zero-length copy must also be handled. */
+	ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0);
+	if (ret < 0)
+		DIAGS_RETURN_FAIL("no heap space");
+
+	for (n = lo; n <= hi; n <<= 1) {
+		_HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n);
+		for (m = n - below; m <= n + above; m++) {
+			if (m == n) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed, n);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			} else if (CORNERS && m >= lo && m <= hi && m > (n >> 1)
+				   && m < max(n, ((n << 1) - below))) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed,
+						      (size_t) m);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			}
+		}
+	}
+
+	int total = memcpy_passed + memcpy_failed;
+	if (total > 0) {
+		_HFI_INFO("%d memcpy tests with %d passed (%.2f%%) "
+			  "and %d failed (%.2f%%)\n",
+			  total, memcpy_passed, (100.0 * memcpy_passed) / total,
+			  memcpy_failed, (100.0 * memcpy_failed) / total);
+	}
+	if (memcpy_failed) {
+		snprintf(buf, sizeof(buf), "%s %.2f%% of tests memcpy_failed",
+			 memcpy_name, (100.0 * memcpy_failed) / total);
+		DIAGS_RETURN_FAIL(buf);
+	} else {
+		DIAGS_RETURN_PASS(memcpy_name);
+	}
+}
+
+/*
+ * Run one memcpy check: fill src with a deterministic pseudo-random
+ * byte pattern, copy it to dst with fn, clear src, then regenerate the
+ * same pattern and compare it byte-for-byte against dst.
+ *
+ * Returns dst when the copy verified cleanly, NULL on any mismatch.
+ */
+void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n)
+{
+	int ok = 1;
+	/* Seed derived from the pointers and length, so each call gets a
+	 * distinct but reproducible sequence. */
+	unsigned int seed = (unsigned int)
+	    ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n);
+	size_t i;
+	struct drand48_data drand48_data;
+
+	if (!n)
+		return dst;
+
+	memset(src, 0x55, n);
+	memset(dst, 0xaa, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+	}
+
+	fn(dst, src, n);
+	memset(src, 0, n);
+	/* Re-seed and replay the identical sequence to verify dst. */
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		/* BUGFIX: the expected byte must be rebuilt with the exact
+		 * expression used to fill src above.  The previous code used
+		 * "rand_result % INT_MAX", which disagrees with
+		 * "rand_result & INT_MAX" when lrand48 returns INT_MAX
+		 * itself, producing rare spurious verification failures. */
+		int value = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+		int v = (int)((uint8_t *) dst)[i];
+		if (v != value) {
+			_HFI_ERROR
+			    ("Error on index %llu : got %d instead of %d\n",
+			     (unsigned long long)i, v, value);
+			ok = 0;
+		}
+	}
+	return ok ? dst : NULL;
+}
+
+/*
+ * Exercise fn at logical size n over all 16x16 combinations of source
+ * and destination alignment offsets.  Pass/fail counts are accumulated
+ * through *p and *f.  Returns 0 on completion, -1 when the scratch
+ * buffers cannot be allocated.
+ */
+int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n)
+{
+#define num_aligns 16
+#define USE_MALLOC 0
+#define DEBUG 0
+	uint8_t *src;
+	uint8_t *dst;
+	/* Room for the largest offset copy: n bytes at offset <16. */
+	size_t size = n * 2 + num_aligns;
+	if (USE_MALLOC) {
+		src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		/* BUGFIX: this error path was missing braces, so the
+		 * "free dst / return -1" statements ran unconditionally and
+		 * the function always bailed out in this mode. */
+		if (src == NULL || dst == NULL) {
+			if (src)
+				psmi_free(src);
+			if (dst)
+				psmi_free(dst);
+			return -1;
+		}
+	} else {
+		void *src_p = NULL, *dst_p = NULL;
+		if (posix_memalign(&src_p, 64, size) != 0)
+			return -1;
+		if (posix_memalign(&dst_p, 64, size) != 0) {
+			/* BUGFIX: don't leak src on partial failure. */
+			free(src_p);
+			return -1;
+		}
+		src = (uint8_t *) src_p;
+		dst = (uint8_t *) dst_p;
+	}
+	int src_align, dst_align;
+	for (src_align = 0; src_align < num_aligns; src_align++) {
+		for (dst_align = 0; dst_align < num_aligns; dst_align++) {
+			uint8_t *d = ((uint8_t *) dst) + dst_align;
+			uint8_t *s = ((uint8_t *) src) + src_align;
+			int ok = (memcpy_check_one(fn, d, s, n) != NULL);
+			if (DEBUG || !ok) {
+				_HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s,
+					  (unsigned long long)n,
+					  ok ? "passed" : "failed");
+			}
+			if (ok) {
+				(*p)++;
+			} else {
+				(*f)++;
+			}
+		}
+	}
+	/* BUGFIX: match release to allocator -- posix_memalign memory must
+	 * be returned with free(), not psmi_free(). */
+	if (USE_MALLOC) {
+		psmi_free(src);
+		psmi_free(dst);
+	} else {
+		free(src);
+		free(dst);
+	}
+	return 0;
+}
diff --git a/psm_ep.c b/psm_ep.c
new file mode 100644
index 0000000..d01c9aa
--- /dev/null
+++ b/psm_ep.c
@@ -0,0 +1,1527 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h> /* cpu_set */
+#include <ctype.h> /* isalpha */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Endpoint management
+ */
+/* Head and length of the process-wide list of opened endpoints,
+ * linked through ep->user_ep_next. */
+psm2_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+
+static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
+					const struct psm2_ep_open_opts *opts,
+					const psm2_uuid_t unique_job_key,
+					struct psmi_context *context,
+					psm2_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * hfi.
+ *
+ * By default, PSMI_DEVICES_DEFAULT establishes the bind order a component is
+ * tested for reachability to each peer. First self, then shm and finally
+ * hfi. The order should really only affect endpoints that happen to be on
+ * the same node. PSM will correctly detect that two endpoints are on the same
+ * node even though they may be using different host interfaces.
+ */
+
+#define PSMI_DEVICES_DEFAULT "self,shm,hfi"
+static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+				       const char *devstr);
+static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+/*
+ * Report the number of HFI units in the system through *num_units_o
+ * (0 when none are detectable).  Always returns PSM2_OK once PSM is
+ * initialized.
+ *
+ * NOTE(review): the count is cached in a function-local static on
+ * first call and never refreshed; first-call races are unguarded --
+ * confirm callers serialize initialization.
+ */
+psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o)
+{
+	static int num_units = -1;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (num_units == -1) {
+		num_units = hfi_get_num_units();
+		if (num_units == -1)
+			num_units = 0;
+	}
+
+	*num_units_o = (uint32_t) num_units;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_num_devunits)
+
+/* qsort comparator for gidh rows: ascending order on the leading
+ * uint64_t (the gid_hi key) of each 3-element row. */
+static int cmpfunc(const void *p1, const void *p2)
+{
+	uint64_t a = ((uint64_t *) p1)[0];
+	uint64_t b = ((uint64_t *) p2)[0];
+	if (a < b)
+		return -1;
+	if (a == b)
+		return 0;
+	return 1;
+}
+
+/*
+ * Select the set of (unit, port) rails to use when PSM2_MULTIRAIL is
+ * enabled.  On success *num_rails holds the rail count (0 when
+ * multirail is disabled) and unit[]/port[] -- caller arrays written up
+ * to HFI_MAX_RAILS entries -- hold the selection.  In the
+ * auto-discovery path the rails are sorted by gid_hi ascending so all
+ * processes pick the same master fabric.
+ */
+static psm2_error_t
+psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
+{
+	uint32_t num_units;
+	uint64_t gid_hi, gid_lo;
+	int i, j, ret, count = 0;
+	char *env;
+	psm2_error_t err = PSM2_OK;
+	uint64_t gidh[HFI_MAX_RAILS][3];
+	union psmi_envvar_val env_multirail;
+	int multirail_within_socket_used = 0;
+	int node_id = -1, found = 0;
+
+	psmi_getenv("PSM2_MULTIRAIL",
+		    "Use all available HFIs in the system for communication.\n"
+		    "0: Disabled (default),\n"
+		    "1: Enable multirail across all available HFIs,\n"
+		    "2: Enable multirail within socket.\n"
+		    "\t For multirail within a socket, we try to find at\n"
+		    "\t least one HFI on the same socket as current task.\n"
+		    "\t If none found, we continue to use other HFIs within\n"
+		    "\t the system.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)0,
+		    &env_multirail);
+	if (!env_multirail.e_int) {
+		/* Multirail disabled: report zero rails, not an error. */
+		*num_rails = 0;
+		return err;
+	}
+
+	if (env_multirail.e_int == 2)
+		multirail_within_socket_used = 1;
+
+/*
+ * map is in format: unit:port,unit:port,...
+ */
+	if ((env = getenv("PSM2_MULTIRAIL_MAP"))) {
+		if (sscanf(env, "%d:%d", &i, &j) == 2) {
+			char *comma = strchr(env, ',');
+			unit[count] = i;
+			port[count] = j;
+			count++;
+			while (comma) {
+				if (sscanf(comma, ",%d:%d", &i, &j) != 2) {
+					break;
+				}
+				unit[count] = i;
+				port[count] = j;
+				count++;
+				if (count == HFI_MAX_RAILS)
+					break;
+				comma = strchr(comma + 1, ',');
+			}
+		}
+		*num_rails = count;
+
+/*
+ * Check if any of the port is not usable.
+ */
+		for (i = 0; i < count; i++) {
+			ret = hfi_get_port_active(unit[i], port[i]);
+			if (ret <= 0) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Unit/port: %d:%d is not active.",
+						      unit[i], port[i]);
+				return err;
+			}
+			ret = hfi_get_port_lid(unit[i], port[i]);
+			if (ret <= 0) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Couldn't get lid for unit %d:%d",
+						      unit[i], port[i]);
+				return err;
+			}
+			ret =
+			    hfi_get_port_gid(unit[i], port[i], &gid_hi,
+					     &gid_lo);
+			if (ret == -1) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Couldn't get gid for unit %d:%d",
+						      unit[i], port[i]);
+				return err;
+			}
+		}
+
+		/* Explicit map validated: use it as-is, unsorted. */
+		return err;
+	}
+
+	if ((err = psm2_ep_num_devunits(&num_units))) {
+		return err;
+	}
+	if (num_units > HFI_MAX_RAILS) {
+		_HFI_INFO
+		    ("Found %d units, max %d units are supported, use %d\n",
+		     num_units, HFI_MAX_RAILS, HFI_MAX_RAILS);
+		num_units = HFI_MAX_RAILS;
+	}
+
+	/*
+	 * PSM2_MULTIRAIL=2 functionality-
+	 *   - Try to find at least find one HFI in the same root
+	 *     complex. If none found, continue to run and
+	 *     use remaining HFIs in the system.
+	 *   - If we do find at least one HFI in same root complex, we
+	 *     go ahead and add to list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psmi_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (hfi_get_unit_active(i) <= 0)
+				continue;
+
+			if (hfi_sysfs_unit_read_node_s64(i) == node_id) {
+				found = 1;
+				break;
+			}
+		}
+	}
+/*
+ * Get all the ports with a valid lid and gid, one per unit.
+ */
+	for (i = 0; i < num_units; i++) {
+		/* Socket mode with a local HFI found: skip remote units. */
+		if (multirail_within_socket_used &&
+		    found && (hfi_sysfs_unit_read_node_s64(i) != node_id))
+			continue;
+
+		for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+			ret = hfi_get_port_lid(i, j);
+			if (ret <= 0)
+				continue;
+			ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
+			if (ret == -1)
+				continue;
+
+			/* Row layout: { gid_hi sort key, unit, port }. */
+			gidh[count][0] = gid_hi;
+			gidh[count][1] = i;
+			gidh[count][2] = j;
+			count++;
+			break;
+		}
+	}
+
+/*
+ * Sort all the ports with gidh from small to big.
+ * This is for multiple fabrics, and we use fabric with the
+ * smallest gid to make the master connection.
+ */
+	qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc);
+
+	for (i = 0; i < count; i++) {
+		unit[i] = (uint32_t) gidh[i][1];
+		port[i] = (uint16_t) (uint32_t) gidh[i][2];
+	}
+	*num_rails = count;
+	return err;
+}
+
+/*
+ * Collect the LIDs of all local ports whose gid_hi matches the
+ * caller's (my_gid_hi), i.e. ports on the same fabric.  Results are
+ * returned through *lids / *num_lids_o.
+ *
+ * NOTE(review): the LID array and count live in function-local statics
+ * and are computed once per process; subsequent calls return the
+ * cached result regardless of the gid arguments -- confirm callers
+ * always pass the same endpoint gid.
+ */
+static psm2_error_t
+psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o,
+		uint64_t my_gid_hi, uint64_t my_gid_lo)
+{
+	static uint16_t *hfi_lids;
+	static uint32_t nlids;
+	uint32_t num_units;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (hfi_lids == NULL) {
+		if ((err = psm2_ep_num_devunits(&num_units)))
+			goto fail;
+		hfi_lids = (uint16_t *)
+		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
+				num_units * HFI_NUM_PORTS, sizeof(uint16_t));
+		if (hfi_lids == NULL) {
+			err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+						"Couldn't allocate memory for dev_lids structure");
+			goto fail;
+		}
+
+		for (i = 0; i < num_units; i++) {
+			int j;
+			for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+				int lid = hfi_get_port_lid(i, j);
+				int ret;
+				uint64_t gid_hi = 0, gid_lo = 0;
+
+				/* Skip ports without a valid LID or gid. */
+				if (lid <= 0)
+					continue;
+				ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
+				if (ret == -1)
+					continue;
+				else if (my_gid_hi != gid_hi) {
+					_HFI_VDBG("LID %d, unit %d, port %d, "
+						  "mismatched GID %llx:%llx and "
+						  "%llx:%llx\n",
+						  lid, i, j,
+						  (unsigned long long)gid_hi,
+						  (unsigned long long)gid_lo,
+						  (unsigned long long)my_gid_hi,
+						  (unsigned long long)
+						  my_gid_lo);
+					continue;
+				}
+				_HFI_VDBG("LID %d, unit %d, port %d, "
+					  "matching GID %llx:%llx and "
+					  "%llx:%llx\n", lid, i, j,
+					  (unsigned long long)gid_hi,
+					  (unsigned long long)gid_lo,
+					  (unsigned long long)my_gid_hi,
+					  (unsigned long long)my_gid_lo);
+
+				hfi_lids[nlids++] = (uint16_t) lid;
+			}
+		}
+		if (nlids == 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Couldn't get lid&gid from any unit/port");
+			goto fail;
+		}
+	}
+	*lids = hfi_lids;
+	*num_lids_o = nlids;
+
+fail:
+	/* Success also exits through this label with err == PSM2_OK. */
+	return err;
+}
+
+/*
+ * Verify that pkey appears in the port's pkey table (16 entries are
+ * probed).  On success stores the accepted pkey through *opkey and
+ * returns PSM2_OK; otherwise returns the error from psmi_handle_error.
+ */
+static psm2_error_t
+psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey)
+{
+	int i, ret;
+	psm2_error_t err;
+
+	for (i = 0; i < 16; i++) {
+		ret = hfi_get_port_index2pkey(ep->unit_id, ep->portnum, i);
+		if (ret < 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Can't get a valid pkey value from pkey table\n");
+			return err;
+		} else if (ret == 0x7fff || ret == 0xffff) {
+			continue;	/* management pkey, not for app traffic. */
+		}
+
+		if (pkey == (uint16_t) ret)
+			break;
+	}
+
+	/* if pkey does not match */
+	if (i == 16) {
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"Wrong pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n",
+					pkey);
+		return err;
+	}
+
+	/* return the final pkey */
+	*opkey = pkey;
+
+	return PSM2_OK;
+}
+
+/* Public accessor: extract the node id (LID bits) from an epid. */
+uint64_t __psm2_epid_nid(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_LID(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_nid)
+
+/* Currently not exposed to users, we don't acknowledge the existence of
+ * subcontexts */
+uint64_t psmi_epid_subcontext(psm2_epid_t epid)
+{
+	return (uint64_t) PSMI_EPID_GET_SUBCONTEXT(epid);
+}
+
+/* Currently not exposed to users, we don't acknowledge the existence of
+ * service levels encoding within epids. This may require
+ * changing to expose SLs
+ */
+uint64_t psmi_epid_version(psm2_epid_t epid)
+{
+	return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid);
+}
+
+/* Public accessor: extract the context bits from an epid. */
+uint64_t __psm2_epid_context(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_context)
+
+/* Public accessor: the "port" of an epid.  Deliberately an alias for
+ * the context bits (see __psm2_epid_context). */
+uint64_t __psm2_epid_port(psm2_epid_t epid)
+{
+	uint64_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = __psm2_epid_context(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_port)
+
+/*
+ * Fill array_of_epinfo with details of up to *num_of_epinfo opened
+ * endpoints, walking the process-wide endpoint list.  On return
+ * *num_of_epinfo is the number of entries actually filled.
+ */
+psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
+{
+	psm2_error_t err = PSM2_OK;
+	int i;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (*num_of_epinfo <= 0) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid psm2_ep_query parameters");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	for (i = 0; i < *num_of_epinfo; i++) {
+		if (ep == NULL)
+			break;
+		array_of_epinfo[i].ep = ep;
+		array_of_epinfo[i].epid = ep->epid;
+		array_of_epinfo[i].jkey = ep->jkey;
+		memcpy(array_of_epinfo[i].uuid,
+		       (void *)ep->uuid, sizeof(psm2_uuid_t));
+		psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str);
+		ep = ep->user_ep_next;
+	}
+	*num_of_epinfo = i;
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_query)
+
+/*
+ * Resolve epid to a connection by searching every opened endpoint.
+ * On success fills *epconn (addr, ep, mq) and returns PSM2_OK;
+ * returns PSM2_EPID_UNKNOWN when no endpoint knows the epid.
+ */
+psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_epaddr_t epaddr;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		epaddr = psmi_epid_lookup(ep, epid);
+		if (!epaddr) {
+			ep = ep->user_ep_next;
+			continue;
+		}
+
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup);
+
+/*
+ * Like __psm2_ep_epid_lookup, but restricted to one given endpoint.
+ * On success fills *epconn (addr, ep, mq) and returns PSM2_OK;
+ * returns PSM2_EPID_UNKNOWN when ep has no connection for epid.
+ */
+psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (ep == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (epconn == NULL) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid output parameter");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid);
+	if (epaddr) {
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup2);
+
+/*
+ * Translate an endpoint address handle into its integral endpoint id.
+ * Stores the id through *epid and returns PSM2_OK, or returns
+ * PSM2_PARAM_ERR when either pointer is NULL.
+ */
+psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
+{
+	psm2_error_t result;
+
+	PSM2_LOG_MSG("entering");
+	if (epaddr == NULL || epid == NULL) {
+		result = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					   "Invalid input epaddr or output epid parameter");
+	} else {
+		*epid = epaddr->epid;
+		result = PSM2_OK;
+	}
+	PSM2_LOG_MSG("leaving");
+	return result;
+}
+PSMI_API_DECL(psm2_epaddr_to_epid);
+
+/*
+ * Decide whether the peer identified by epid can be reached through
+ * shared memory.  Sets *result_o to 1 when the peer's LID matches one
+ * of our local fabric LIDs (or when LID comparison is impossible and
+ * we optimistically assume same-node), 0 otherwise.
+ */
+psm2_error_t
+__psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+	uint32_t num_lids = 0;
+	uint16_t *lids = NULL;
+	int i;
+	uint16_t epid_lid;
+	int result = 0;
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert_always(ep != NULL);
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ||
+	    (psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) {
+		/* If we are in the no hfi-mode, or the other process is,
+		 * the epid doesn't help us - so assume both we're on the same
+		 * machine and try to connect.
+		 */
+		result = 1;
+	} else {
+		epid_lid = (uint16_t) psm2_epid_nid(epid);
+		err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo);
+		if (err) {
+			PSM2_LOG_MSG("leaving");
+			return err;
+		}
+		for (i = 0; i < num_lids; i++) {
+			if (epid_lid == lids[i]) {
+				/* we share memory if the lid is the same. */
+				result = 1;
+				break;
+			}
+		}
+	}
+	*result_o = result;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_epid_share_memory)
+
+/*
+ * Initialize *opts with the library defaults used by psm2_ep_open.
+ * Returns PSM2_PARAM_ERR when opts is NULL, PSM2_OK otherwise.
+ */
+psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (!opts)
+		return PSM2_PARAM_ERR;
+
+	/* Set in order in the structure. */
+	opts->timeout = 30000000000LL;	/* 30 sec */
+	opts->unit = HFI_UNIT_ID_ANY;
+	opts->affinity = PSM2_EP_OPEN_AFFINITY_SET;
+	opts->shm_mbytes = 0;	/* deprecated in psm2.h */
+	opts->sendbufs_num = 1024;
+	opts->network_pkey = HFI_DEFAULT_P_KEY;
+	opts->port = HFI_PORT_NUM_ANY;
+	opts->outsl = PSMI_SL_DEFAULT;
+	opts->service_id = HFI_DEFAULT_SERVICE_ID;
+	opts->path_res_type = PSM2_PATH_RES_NONE;
+	opts->senddesc_num = 4096;
+	opts->imm_size = 128;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_open_opts_get_defaults)
+
+psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly);
+
+psm2_error_t
+__psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled,
+ struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq,
+ psm2_ep_t *epo, psm2_epid_t *epido)
+{
+ psm2_ep_t ep = NULL;
+ uint32_t num_units;
+ size_t len;
+ psm2_error_t err;
+ psm2_epaddr_t epaddr = NULL;
+ char buf[128], *p, *e;
+ union psmi_envvar_val envvar_val;
+ size_t ptl_sizes;
+ struct psm2_ep_open_opts opts;
+ ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
+ int i;
+
+ /* First get the set of default options, we overwrite with the user's
+ * desired values afterwards */
+ if ((err = psm2_ep_open_opts_get_defaults(&opts)))
+ goto fail;
+
+ if (opts_i != NULL) {
+ if (opts_i->timeout != -1)
+ opts.timeout = opts_i->timeout;
+ if (opts_i->unit != -1)
+ opts.unit = opts_i->unit;
+ if (opts_i->affinity != -1)
+ opts.affinity = opts_i->affinity;
+
+ if (opts_i->sendbufs_num != -1)
+ opts.sendbufs_num = opts_i->sendbufs_num;
+
+ if (opts_i->network_pkey != HFI_DEFAULT_P_KEY)
+ opts.network_pkey = opts_i->network_pkey;
+
+ if (opts_i->port != 0)
+ opts.port = opts_i->port;
+
+ if (opts_i->outsl != -1)
+ opts.outsl = opts_i->outsl;
+
+ if (opts_i->service_id)
+ opts.service_id = (uint64_t) opts_i->service_id;
+ if (opts_i->path_res_type != PSM2_PATH_RES_NONE)
+ opts.path_res_type = opts_i->path_res_type;
+
+ if (opts_i->senddesc_num)
+ opts.senddesc_num = opts_i->senddesc_num;
+ if (opts_i->imm_size)
+ opts.imm_size = opts_i->imm_size;
+ }
+
+ /* Get Service ID from environment */
+ if (!psmi_getenv("PSM2_IB_SERVICE_ID",
+ "HFI Service ID for path resolution",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG_ULONG,
+ (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID,
+ &envvar_val)) {
+ opts.service_id = (uint64_t) envvar_val.e_ulonglong;
+ }
+
+ /* Get Path resolution type from environment Possible choices are:
+ *
+ * NONE : Default same as previous instances. Utilizes static data.
+ * OPP : Use OFED Plus Plus library to do path record queries.
+ * UMAD : Use raw libibumad interface to form and process path records.
+ */
+ if (!psmi_getenv("PSM2_PATH_REC",
+ "Mechanism to query HFI path record (default is no path query)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"none", &envvar_val)) {
+ if (!strcasecmp(envvar_val.e_str, "none"))
+ opts.path_res_type = PSM2_PATH_RES_NONE;
+ else if (!strcasecmp(envvar_val.e_str, "opp"))
+ opts.path_res_type = PSM2_PATH_RES_OPP;
+ else if (!strcasecmp(envvar_val.e_str, "umad"))
+ opts.path_res_type = PSM2_PATH_RES_UMAD;
+ else {
+ _HFI_ERROR("Unknown path resolution type %s. "
+ "Disabling use of path record query.\n",
+ envvar_val.e_str);
+ opts.path_res_type = PSM2_PATH_RES_NONE;
+ }
+ }
+
+ /* If a specific unit is set in the environment, use that one. */
+ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) {
+ opts.unit = envvar_val.e_long;
+ }
+
+ /* Get user specified port number to use. */
+ if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_PORT_NUM_ANY,
+ &envvar_val)) {
+ opts.port = envvar_val.e_long;
+ }
+
+ /* Get service level from environment, path-query overrides it */
+ if (!psmi_getenv
+ ("HFI_SL", "HFI outging ServiceLevel number (default 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
+ opts.outsl = envvar_val.e_long;
+ }
+
+ /* Get network key from environment. MVAPICH and other vendor MPIs do not
+ * specify it on ep open and we may require it for vFabrics.
+ * path-query will override it.
+ */
+ if (!psmi_getenv("PSM2_PKEY",
+ "HFI PKey to use for endpoint",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG,
+ (union psmi_envvar_val)HFI_DEFAULT_P_KEY,
+ &envvar_val)) {
+ opts.network_pkey = (uint64_t) envvar_val.e_ulong;
+ }
+
+ /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of
+ 0x7FFF. That's no longer a valid default, so override it if the
+ client was compiled against PSM v1 */
+ if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
+ opts.network_pkey == 0x7FFF) {
+ opts.network_pkey = HFI_DEFAULT_P_KEY;
+ }
+
+ /* Get number of default send buffers from environment */
+ if (!psmi_getenv("PSM2_NUM_SEND_BUFFERS",
+ "Number of send buffers to allocate [1024]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)1024, &envvar_val)) {
+ opts.sendbufs_num = envvar_val.e_uint;
+ }
+
+ /* Get immediate data size - transfers less than immediate data size do
+ * not consume a send buffer and require just a send descriptor.
+ */
+ if (!psmi_getenv("PSM2_SEND_IMMEDIATE_SIZE",
+ "Immediate data send size not requiring a buffer [128]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)128, &envvar_val)) {
+ opts.imm_size = envvar_val.e_uint;
+ }
+
+ /* Get number of send descriptors - by default this is 4 times the number
+ * of send buffers - mainly used for short/inlined messages.
+ */
+ if (!psmi_getenv("PSM2_NUM_SEND_DESCRIPTORS",
+ "Number of send descriptors to allocate [4096]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)4096, &envvar_val)) {
+ opts.senddesc_num = envvar_val.e_uint;
+ }
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK)
+ goto fail;
+ } else
+ num_units = 0;
+
+ /* do some error checking */
+ if (opts.timeout < -1) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid timeout value %lld",
+ (long long)opts.timeout);
+ goto fail;
+ } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Device Unit ID %d (%d units found)",
+ opts.unit, num_units);
+ goto fail;
+ } else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) &&
+ opts.port != HFI_PORT_NUM_ANY) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Device port number %d",
+ opts.port);
+ goto fail;
+ } else if (opts.affinity < 0
+ || opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) {
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Affinity option: %d",
+ opts.affinity);
+ goto fail;
+ } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid SL number: %lld",
+ (unsigned long long)opts.outsl);
+ goto fail;
+ }
+
+ /* Set environment variable if PSM is not allowed to set affinity */
+ if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP)
+ setenv("HFI_NO_CPUAFFINITY", "1", 1);
+
+ /* Allocate end point structure storage */
+ ptl_sizes =
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
+ psmi_ptl_self.sizeof_ptl() : 0) +
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
+ psmi_ptl_ips.sizeof_ptl() : 0) +
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
+ psmi_ptl_amsh.sizeof_ptl() : 0);
+ if (ptl_sizes == 0)
+ return PSM2_EP_NO_DEVICE;
+
+ ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64,
+ sizeof(struct psm2_ep) + ptl_sizes);
+ epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ 1, sizeof(struct psm2_epaddr));
+ if (ep == NULL || epaddr == NULL) {
+ err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for %s structure",
+ ep == NULL ? "psm2_ep" : "psm2_epaddr");
+ goto fail;
+ }
+ memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes);
+
+ /* Copy PTL enabled status */
+ for (i = 0; i < PTL_MAX_INIT; i++)
+ ep->devid_enabled[i] = devid_enabled[i];
+
+ /* Matched Queue initialization. We do this early because we have to
+ * make sure ep->mq exists and is valid before calling ips_do_work.
+ */
+ ep->mq = mq;
+
+ /* Get ready for PTL initialization */
+ memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t));
+ ep->epaddr = epaddr;
+ ep->memmode = mq->memmode;
+ ep->hfi_num_sendbufs = opts.sendbufs_num;
+ ep->service_id = opts.service_id;
+ ep->path_res_type = opts.path_res_type;
+ ep->hfi_num_descriptors = opts.senddesc_num;
+ ep->hfi_imm_size = opts.imm_size;
+ ep->errh = psmi_errhandler_global; /* by default use the global one */
+ ep->ptl_amsh.ep_poll = psmi_poll_noop;
+ ep->ptl_ips.ep_poll = psmi_poll_noop;
+ ep->connections = 0;
+
+ /* See how many iterations we want to spin before yielding */
+ psmi_getenv("PSM2_YIELD_SPIN_COUNT",
+ "Spin poll iterations before yield",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD,
+ &envvar_val);
+ ep->yield_spin_cnt = envvar_val.e_uint;
+
+ ptl_sizes = 0;
+ amsh_ptl = ips_ptl = self_ptl = NULL;
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_amsh.sizeof_ptl();
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_ips.sizeof_ptl();
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_self.sizeof_ptl();
+ }
+
+ if ((err = psmi_ep_open_device(ep, &opts, unique_job_key,
+ &(ep->context), &ep->epid)))
+ goto fail;
+
+ psmi_assert_always(ep->epid != 0);
+ ep->epaddr->epid = ep->epid;
+
+ _HFI_VDBG("psmi_ep_open_device() passed\n");
+
+ /* Set our new label as soon as we know what it is */
+ strncpy(buf, psmi_gethostname(), sizeof(buf) - 1);
+ buf[sizeof(buf) - 1] = '\0';
+
+ p = buf + strlen(buf);
+
+ /* If our rank is set, use it. If not, use context.subcontext notation */
+ if (((e = getenv("MPI_RANKID")) != NULL && *e) ||
+ ((e = getenv("PSC_MPI_RANK")) != NULL && *e))
+ len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e));
+ else
+ len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.",
+ (uint32_t) psm2_epid_context(ep->epid),
+ (uint32_t) psmi_epid_subcontext(ep->epid));
+ *(p + len) = '\0';
+ ep->context_mylabel = psmi_strdup(ep, buf);
+ if (ep->context_mylabel == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /* hfi_set_mylabel(ep->context_mylabel); */
+
+ if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0)))
+ goto fail;
+
+ _HFI_VDBG("start ptl device init...\n");
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self)))
+ goto fail;
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips)))
+ goto fail;
+ }
+ /* If we're shm-only, this device is enabled above */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh)))
+ goto fail;
+ } else {
+ /* We may have pre-attached as part of getting our rank for enabling
+ * shared contexts. */
+ }
+
+ _HFI_VDBG("finish ptl device init...\n");
+
+ /*
+ * Keep only IPS since only IPS support multi-rail, other devices
+ * are only setup once. IPS device can come to this function again.
+ */
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (devid_enabled[i] != PTL_DEVID_IPS) {
+ devid_enabled[i] = -1;
+ }
+ }
+
+ *epido = ep->epid;
+ *epo = ep;
+
+ return PSM2_OK;
+
+fail:
+ if (ep != NULL) {
+ if (ep->context.fd != -1)
+ close(ep->context.fd);
+ psmi_free(ep);
+ }
+ if (epaddr != NULL)
+ psmi_free(epaddr);
+ return err;
+}
+
+/*
+ * __psm2_ep_open - public entry point (psm2_ep_open) for opening an
+ * endpoint.
+ *
+ * unique_job_key: job-wide UUID shared by all participating processes.
+ * opts_i:         caller-supplied open options, forwarded unchanged to
+ *                 __psm2_ep_open_internal.
+ * epo, epido:     out parameters receiving the endpoint handle and its
+ *                 endpoint id; both must be non-NULL.
+ *
+ * Allocates the matched queue, parses PSM2_DEVICES, opens the master
+ * endpoint and - when the ips device is enabled and psmi_ep_multirail
+ * reports extra rails - opens one slave endpoint per extra rail and
+ * links it behind the master.
+ */
+psm2_error_t
+__psm2_ep_open(psm2_uuid_t const unique_job_key,
+ struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo,
+ psm2_epid_t *epido)
+{
+ psm2_error_t err;
+ psm2_mq_t mq;
+ psm2_epid_t epid;
+ psm2_ep_t ep, tmp;
+ uint32_t units[HFI_MAX_RAILS];
+ uint16_t ports[HFI_MAX_RAILS];
+ int i, num_rails = 0;
+ char *uname = "HFI_UNIT";
+ char *pname = "HFI_PORT";
+ char uvalue[6], pvalue[6];
+ int devid_enabled[PTL_MAX_INIT];
+ union psmi_envvar_val devs;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ if (!epo || !epido)
+ return PSM2_PARAM_ERR;
+
+ /* Allowing only one EP (unless explicitly enabled). */
+ if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_TOO_MANY_ENDPOINTS;
+ }
+
+ /* Matched Queue initialization. We do this early because we have to
+ * make sure ep->mq exists and is valid before calling ips_do_work.
+ */
+ err = psmi_mq_malloc(&mq);
+ /* The creation lock is held from here until the fail: label below,
+ * on both the success and the error paths. */
+ PSMI_LOCK(psmi_creation_lock);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* Set some of the MQ thresholds from the environment.
+ Do this before ptl initialization - the ptl may have other
+ constraints that will limit the MQ's settings. */
+ err = psmi_mq_initialize_defaults(mq);
+ if (err != PSM2_OK)
+ goto fail;
+
+ psmi_init_lock(&(mq->progress_lock));
+
+ /* See which ptl devices we want to use for this ep to be opened */
+ psmi_getenv("PSM2_DEVICES",
+ "Ordered list of PSM-level devices",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);
+
+ if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
+ goto fail;
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ err = psmi_ep_multirail(&num_rails, units, ports);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* If multi-rail is used, set the first ep unit/port */
+ if (num_rails > 0) {
+ snprintf(uvalue, 6, "%1d", units[0]);
+ snprintf(pvalue, 6, "%1d", ports[0]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+ }
+ }
+
+ err = __psm2_ep_open_internal(unique_job_key,
+ devid_enabled, opts_i, mq, &ep, &epid);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* Append the new master endpoint to the global user-visible list. */
+ if (psmi_opened_endpoint == NULL) {
+ psmi_opened_endpoint = ep;
+ } else {
+ tmp = psmi_opened_endpoint;
+ while (tmp->user_ep_next)
+ tmp = tmp->user_ep_next;
+ tmp->user_ep_next = ep;
+ }
+ psmi_opened_endpoint_count++;
+ /* Master starts out as a singleton in its own multi-rail ring. */
+ ep->mctxt_prev = ep->mctxt_next = ep;
+ ep->mctxt_master = ep;
+ mq->ep = ep;
+
+ /* Active Message initialization */
+ err = psmi_am_init_internal(ep);
+ if (err != PSM2_OK)
+ goto fail;
+
+ *epo = ep;
+ *epido = epid;
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ /* One slave endpoint per additional rail; each shares the
+ * master's AM handler table and joins the master's ring. */
+ for (i = 1; i < num_rails; i++) {
+ snprintf(uvalue, 6, "%1d", units[i]);
+ snprintf(pvalue, 6, "%1d", ports[i]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+
+ /* Create slave EP */
+ err = __psm2_ep_open_internal(unique_job_key,
+ devid_enabled, opts_i, mq,
+ &tmp, &epid);
+ if (err)
+ goto fail;
+
+ /* Point back to shared resources on the master EP */
+ tmp->am_htable = ep->am_htable;
+
+ /* Link slave EP after master EP. */
+ PSM_MCTXT_APPEND(ep, tmp);
+ }
+ }
+
+ _HFI_VDBG("psm2_ep_open() OK....\n");
+
+ /* Success also flows through fail:, which only drops the lock.
+ * NOTE(review): on the error paths the mq allocated above is not
+ * freed here - confirm whether it is reclaimed elsewhere. */
+fail:
+ PSMI_UNLOCK(psmi_creation_lock);
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_open)
+
+/*
+ * __psm2_ep_close - public entry point (psm2_ep_close) for closing an
+ * endpoint and, with multi-rail, all of its slave endpoints.
+ *
+ * ep:         master endpoint to close (must be its own mctxt_master).
+ * mode:       PSM2_EP_CLOSE_FORCE or graceful close.
+ * timeout_in: minimum close timeout (units of SEC_ULL); <= 0 selects
+ *             the maximum allowed timeout.
+ *
+ * Returns PSM2_OK, PSM2_TIMEOUT, PSM2_EP_WAS_CLOSED when ep is not on
+ * the opened-endpoint list, or an error from a ptl finalizer.
+ */
+psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+ psm2_error_t err = PSM2_OK;
+#if _HFI_DEBUGGING
+ uint64_t t_start = 0;
+ if (_HFI_PRDBG_ON) {
+ t_start = get_cycles();
+ }
+#endif
+ union psmi_envvar_val timeout_intval;
+ psm2_ep_t tmp;
+ psm2_mq_t mmq;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+ psmi_assert_always(ep->mctxt_master == ep);
+
+ PSMI_LOCK(psmi_creation_lock);
+
+ if (psmi_opened_endpoint == NULL) {
+ err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+ "PSM Endpoint is closed or does not exist");
+ PSM2_LOG_MSG("leaving");
+ PSMI_UNLOCK(psmi_creation_lock);
+ return err;
+ }
+
+ /* Make sure ep is actually on the opened-endpoint list. */
+ tmp = psmi_opened_endpoint;
+ while (tmp && tmp != ep) {
+ tmp = tmp->user_ep_next;
+ }
+ if (!tmp) {
+ err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+ "PSM Endpoint is closed or does not exist");
+ PSM2_LOG_MSG("leaving");
+ PSMI_UNLOCK(psmi_creation_lock);
+ return err;
+ }
+
+ psmi_getenv("PSM2_CLOSE_TIMEOUT",
+ "End-point close timeout over-ride.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_CLOSE_TIMEOUT")) {
+ timeout_in = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout_in > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
+ }
+
+ if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
+ timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+ /* Infinite and excessive close time-out are limited here to a max.
+ * The "rationale" is that there is no point waiting around forever for
+ * graceful termination. Normal (or forced) process termination should clean
+ * up the context state correctly even if termination is not graceful. */
+ if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+ timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+ _HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+ "%d connections\n",
+ ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO",
+ (double)timeout_in / 1e9, (int)ep->connections);
+
+ /* XXX We currently cheat in the sense that we leave each PTL the allowed
+ * timeout. There's no good way to do this until we change the PTL
+ * interface to allow asynchronous finalization
+ */
+
+ /*
+ * Before freeing the master ep itself,
+ * remove it from the global linklist.
+ * We do it here to let atexit handler in ptl_am directory
+ * to search the global linklist and free the shared memory file.
+ */
+ if (psmi_opened_endpoint == ep) {
+ /* Removing ep from global endpoint list. */
+ psmi_opened_endpoint = ep->user_ep_next;
+ } else {
+ tmp = psmi_opened_endpoint;
+ while (tmp->user_ep_next != ep) {
+ tmp = tmp->user_ep_next;
+ }
+ /* Removing ep from global endpoint list. */
+ tmp->user_ep_next = ep->user_ep_next;
+ }
+ psmi_opened_endpoint_count--;
+
+ /*
+ * This do/while loop is used to close and free memory of endpoints.
+ *
+ * If MULTIRAIL feature is disable this loop will be passed only once
+ * and only endpoint passed in psm2_ep_close will be closed/removed.
+ *
+ * If MULTIRAIL feature is enabled then this loop will be passed
+ * multiple times (depending on number of rails). The order in which
+ * endpoints will be closed is shown below:
+ *
+ * |--this is master endpoint in case of multirail
+ * | this endpoint is passed to psm2_ep_close and
+ * V this is only endpoint known to user.
+ * +<-Ep0<-Ep1<-Ep2<-Ep3
+ * |__________________| Ep3->mctxt_prev points to Ep2
+ * (3) (2) (1) (4) Ep2->mctxt_prev points to Ep1
+ * ^ Ep1->mctxt_prev points to Ep0
+ * | Ep0->mctxt_prev points to Ep3 (master ep)
+ * |
+ * |---- order in which endpoints will be closed.
+ *
+ * Closing MULTIRAILs starts by closing slaves (Ep2, Ep1, Ep0)
+ * If MULTIRAIL is enabled then Ep3->mctxt_prev will point to Ep2, if
+ * feature is disabled then Ep3->mctxt_prev will point to Ep3 and
+ * do/while loop will have one pass.
+ *
+ * In case of MULTIRAIL enabled Ep3 which is master endpoint will be
+ * closed as the last one.
+ */
+ mmq = ep->mq;
+ tmp = ep->mctxt_prev;
+ do {
+ ep = tmp;
+ tmp = ep->mctxt_prev;
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ PSM_MCTXT_REMOVE(ep);
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+ err =
+ psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
+ timeout_in);
+
+ if ((err == PSM2_OK || err == PSM2_TIMEOUT) &&
+ psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ err =
+ psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
+ timeout_in);
+
+ /* If there's timeouts in the disconnect requests,
+ * still make sure that we still get to close the
+ *endpoint and mark it closed */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ psmi_context_close(&ep->context);
+
+ psmi_free(ep->epaddr);
+ psmi_free(ep->context_mylabel);
+
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+ ep->mq = NULL;
+ psmi_free(ep);
+
+ } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep);
+
+ if (mmq)
+ err = psmi_mq_free(mmq);
+
+ PSMI_UNLOCK(psmi_creation_lock);
+
+#if _HFI_DEBUGGING
+ /* BUGFIX: t_start is only declared under _HFI_DEBUGGING (top of
+ * function), so this timing report must be guarded the same way or
+ * non-debug builds reference an undeclared identifier. */
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
+ (double)cycles_to_nanosecs(get_cycles() -
+ t_start) / SEC_ULL);
+ }
+#endif
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_close)
+
+/*
+ * psmi_ep_open_device - open the device backing this endpoint and
+ * produce its epid.
+ *
+ * Three mutually exclusive paths, keyed on which ptl devices the ep
+ * has enabled: ips (opens a real HFI context and takes its epid),
+ * amsh (shm-only; epid derived from the local rank or pid), or
+ * self-only (fixed epid).  Returns PSM2_OK or the error from context
+ * open / pkey verification.
+ */
+static
+psm2_error_t
+psmi_ep_open_device(const psm2_ep_t ep,
+ const struct psm2_ep_open_opts *opts,
+ const psm2_uuid_t unique_job_key,
+ struct psmi_context *context, psm2_epid_t *epid)
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* Skip affinity. No affinity if:
+ * 1. User explicitly sets no-affinity=YES in environment.
+ * 2. User doesn't set affinity in environment and PSM is opened with
+ * option affinity skip.
+ */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ uint32_t rcvthread_flags;
+ union psmi_envvar_val env_rcvthread;
+ static int norcvthread; /* only for first rail */
+
+ ep->out_sl = opts->outsl;
+
+ if ((err =
+ psmi_context_open(ep, opts->unit, opts->port,
+ unique_job_key, opts->timeout,
+ context)) != PSM2_OK)
+ goto fail;
+
+ /* NOTE(review): the port is logged as the literal 1 rather
+ * than opts->port - looks like a stale debug message;
+ * confirm intended. */
+ _HFI_DBG("[%d]use unit %d port %d\n", getpid(),
+ context->ctrl->__hfi_unit, 1);
+
+ /* At this point, we have the unit id and port number, so
+ * check if pkey is not 0x0/0x7fff/0xffff, and match one
+ * of the pkey in table.
+ */
+ if ((err =
+ psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
+ &ep->network_pkey)) != PSM2_OK)
+ goto fail;
+
+ /* See if we want to activate support for receive thread */
+ psmi_getenv("PSM2_RCVTHREAD",
+ "Recv thread flags (0 disables thread)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)(norcvthread++ ? 0 :
+ PSMI_RCVTHREAD_FLAGS),
+ &env_rcvthread);
+ rcvthread_flags = env_rcvthread.e_uint;
+
+ /* If enabled, use the pollurg capability to implement a receive
+ * interrupt thread that can handle urg packets */
+ if (rcvthread_flags) {
+ context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
+#ifdef PSMI_PLOCK_IS_NOLOCK
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+ "with RCVTHREAD on");
+#endif
+ }
+ context->rcvthread_flags = rcvthread_flags;
+
+ *epid = context->epid;
+ } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ int rank;
+
+ /* In shm-only mode, we need to derive a valid epid
+ * based on our rank. We try to get it from the
+ * environment if its available, or resort to using
+ * our PID as the rank.
+ */
+ union psmi_envvar_val env_rankid;
+
+ /* Fallback chain: MPI_LOCALRANKID, then PSC_MPI_NODE_RANK,
+ * then getpid().  A non-zero psmi_getenv return appears to
+ * mean "variable not set" (cf. the !psmi_getenv uses earlier
+ * in this file). */
+ if (psmi_getenv
+ ("MPI_LOCALRANKID", "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ if (psmi_getenv
+ ("PSC_MPI_NODE_RANK",
+ "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ rank = getpid();
+ } else
+ rank = env_rankid.e_int;
+ } else
+ rank = env_rankid.e_int;
+
+ /*
+ * We use a LID of 0 for non-HFI communication.
+ * Since a jobkey is not available from IPS, pull the
+ * first 16 bits from the UUID.
+ */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(((uint16_t *) unique_job_key)[0],
+ (rank >> 3), rank, 0,
+ PSMI_EPID_VERSION_SHM, rank);
+ break;
+ case PSMI_EPID_V2:
+ /* Construct epid for this Endpoint */
+ *epid = PSMI_EPID_PACK_V2_SHM(getpid(),
+ PSMI_EPID_SHM_ONLY, /*is a only-shm epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supportd version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ } else {
+ /* Self-only, meaning only 1 proc max */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(
+ 0, 0, 0, 0, PSMI_EPID_VERSION_SHM, 0x3ffffff);
+ break;
+ case PSMI_EPID_V2:
+ *epid = PSMI_EPID_PACK_V2_SHM(0,
+ PSMI_EPID_SHM_ONLY, /*is a only-shm epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supportd version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ }
+
+fail:
+ return err;
+}
+
+/* Get a list of PTLs we want to use. The order is important, it affects
+ * whether node-local processes use shm or ips.
+ *
+ * devices[] receives up to PTL_MAX_INIT device ids in the order they
+ * appear in devstring; unused slots are set to -1.  Recognized
+ * aliases: "self"; "shm"/"shmem"/"amsh"; "hfi"/"ipath"/"ips".
+ *
+ * Returns PSM2_OK on success, PSM2_PARAM_ERR for an unrecognized
+ * device name, PSM2_NO_MEMORY if the scratch buffer cannot be
+ * allocated. */
+static
+psm2_error_t
+psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
+{
+ char *devstr = NULL;
+ char *b_new, *e, *ee, *b;
+ psm2_error_t err = PSM2_OK;
+ int len;
+ int i = 0;
+
+ psmi_assert_always(devstring != NULL);
+ len = strlen(devstring) + 1;
+
+ for (i = 0; i < PTL_MAX_INIT; i++)
+ devices[i] = -1;
+
+ /* One zeroed buffer, two halves: the second half holds a writable
+ * copy of devstring for tokenizing, the first half accumulates the
+ * canonical device list for the debug message below. */
+ devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len);
+ if (devstr == NULL) {
+ /* BUGFIX: previously fell through with err == PSM2_OK, so an
+ * allocation failure was reported as success with every
+ * device silently disabled. */
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ b_new = (char *)devstr;
+ e = b_new + len;
+ strncpy(e, devstring, len - 1);
+ e[len - 1] = '\0';
+ ee = e + len;
+ i = 0;
+ while (e < ee && *e && i < PTL_MAX_INIT) {
+ /* Skip separators, then collect one alphabetic token.  The
+ * unsigned char casts keep isalpha() defined for high-bit
+ * chars on platforms where plain char is signed. */
+ while (*e && !isalpha((unsigned char)*e))
+ e++;
+ b = e;
+ while (*e && isalpha((unsigned char)*e))
+ e++;
+ *e = '\0';
+ if (*b) {
+ if (!strcasecmp(b, "self")) {
+ devices[i++] = PTL_DEVID_SELF;
+ b_new = strcpy(b_new, "self,");
+ b_new += 5;
+ } else if (!strcasecmp(b, "shm") ||
+ !strcasecmp(b, "shmem") ||
+ !strcasecmp(b, "amsh")) {
+ devices[i++] = PTL_DEVID_AMSH;
+ strcpy(b_new, "amsh,");
+ b_new += 5;
+ } else if (!strcasecmp(b, "hfi") ||
+ !strcasecmp(b, "ipath") ||
+ !strcasecmp(b, "ips")) {
+ devices[i++] = PTL_DEVID_IPS;
+ strcpy(b_new, "ips,");
+ b_new += 4;
+ } else {
+ /* BUGFIX: the message named PSM_PTL_DEVICES, but
+ * the variable actually parsed is PSM2_DEVICES
+ * (see the psmi_getenv call in __psm2_ep_open). */
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "%s set in environment variable PSM2_DEVICES=\"%s\" "
+ "is not one of the recognized PTL devices (%s)",
+ b, devstring,
+ PSMI_DEVICES_DEFAULT);
+ goto fail;
+ }
+ e++;
+ }
+ }
+ if (b_new != devstr) /* we parsed something, remove trailing comma */
+ *(b_new - 1) = '\0';
+
+ _HFI_PRDBG("PSM Device allocation order: %s\n", devstr);
+fail:
+ if (devstr != NULL)
+ psmi_free(devstr);
+ return err;
+}
+
+/* Return 1 if devid appears anywhere in the fixed-size enabled-device
+ * table, 0 otherwise. */
+static
+int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
+{
+ int slot = 0;
+
+ while (slot < PTL_MAX_INIT) {
+ if (devid_enabled[slot] == devid)
+ return 1;
+ slot++;
+ }
+ return 0;
+}
+
+/* Return 1 if devid is enabled for this endpoint; an endpoint's device
+ * set is just its devid_enabled table. */
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid)
+{
+ const int *enabled = ep->devid_enabled;
+
+ return psmi_device_is_enabled(enabled, devid);
+}
diff --git a/psm_ep.h b/psm_ep.h
new file mode 100644
index 0000000..78b12f1
--- /dev/null
+++ b/psm_ep.h
@@ -0,0 +1,245 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm2_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_EP_H
+#define _PSMI_EP_H
+
+/*
+ * EPIDs encode the following information:
+ *
+ * LID:16 bits - LID for endpoint
+ * CONTEXT:8 bits - Context used for bits (upto 256 contexts)
+ * SUBCONTEXT:3 bits - Subcontext used for endpoint
+ * HFIUNIT: 2 bits - HFI unit number
+ * HFITYPE: 3 bits - OPA1, OPA2, ...
+ * RANK: 26 bits - process rank
+ * reserved: 6 bit - for future usage
+ */
+
+/* HFI hardware generations encoded in the HFITYPE epid field. */
+#define PSMI_HFI_TYPE_UNKNOWN 0
+#define PSMI_HFI_TYPE_OPA1 1
+#define PSMI_HFI_TYPE_OPA2 2
+
+/* Default and bound values for service level (SL), service channel
+ * (SC) and virtual lane (VL). */
+#define PSMI_SL_DEFAULT 0
+#define PSMI_SC_DEFAULT 0
+#define PSMI_VL_DEFAULT 0
+#define PSMI_SL_MIN 0
+#define PSMI_SL_MAX 31
+#define PSMI_SC_ADMIN 15
+#define PSMI_VL_ADMIN 15
+
+/* Pack an epid using the V1 layout documented above. */
+#define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \
+ (((((uint64_t)lid)&0xffff)<<16) | \
+ ((((uint64_t)context)&0xff)<<8) | \
+ ((((uint64_t)subcontext)&0x7)<<5) | \
+ ((((uint64_t)hfiunit)&0x3)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0) | \
+ ((((uint64_t)rank)&0x3ffffff)<<32))
+
+/* V2 layout: 24-bit LID, shm flag at bit 3, 16-bit subnet id in the
+ * top bits. */
+#define PSMI_EPID_PACK_V2(lid, context, subcontext, shmbool, epid_version, subnet_id) \
+ (((((uint64_t)lid)&0xffffff)<<16) | \
+ ((((uint64_t)context)&0xff)<<8) | \
+ ((((uint64_t)subcontext)&0x7)<<5) | \
+ ((((uint64_t)shmbool)&0x1)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0) | \
+ ((((uint64_t)subnet_id)&0xffff)<<48))
+
+/* V2 shm-only epid: the process id stands in for the network fields. */
+#define PSMI_EPID_PACK_V2_SHM(process_id, shmbool, epid_version) \
+ (((((uint64_t)process_id)&0xffffffff)<<32) | \
+ ((((uint64_t)shmbool)&0x1)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0))
+
+/* Field extractors.  V1/V2-specific extractors carry a suffix; the
+ * unsuffixed ones read fields whose position is common to both
+ * layouts. */
+#define PSMI_EPID_GET_LID_V1(epid) (((epid)>>16)&0xffff)
+#define PSMI_EPID_GET_LID_V2(epid) (((epid)>>16)&0xffffff)
+#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0xff)
+#define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>5)&0x7)
+#define PSMI_EPID_GET_HFIUNIT(epid) (((epid)>>3)&0x3)
+#define PSMI_EPID_GET_EPID_VERSION(epid) (((epid)>>0)&0x7)
+#define PSMI_EPID_GET_RANK(epid) (((epid)>>32)&0x3ffffff)
+#define PSMI_EPID_GET_SHMBOOL(epid) (((epid)>>3)&0x1)
+#define PSMI_EPID_GET_SUBNET_ID(epid) (((epid)>>48)&0xffff)
+#define PSMI_EPID_GET_PROCESS_ID(epid) (((epid)>>32)&0xffffffff)
+
+/* Endpoint connect/close time-out bounds, in units of SEC_ULL. */
+#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL)
+
+#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL)
+
+#define PSM_MCTXT_APPEND(head, node) \
+ node->mctxt_prev = head->mctxt_prev; \
+ node->mctxt_next = head; \
+ head->mctxt_prev->mctxt_next = node; \
+ head->mctxt_prev = node; \
+ node->mctxt_master = head
+#define PSM_MCTXT_REMOVE(node) \
+ node->mctxt_prev->mctxt_next = node->mctxt_next; \
+ node->mctxt_next->mctxt_prev = node->mctxt_prev; \
+ node->mctxt_next = node->mctxt_prev = node; \
+ node->mctxt_master = NULL
+
+#define HFI_MAX_RAILS 4
+
+/*
+ * One opened endpoint (one rail).  Slave endpoints for extra rails are
+ * linked to the master through the circular mctxt_prev/mctxt_next
+ * ring; all user-visible endpoints are chained through user_ep_next.
+ */
+struct psm2_ep {
+ psm2_epid_t epid; /**> This endpoint's Endpoint ID */
+ psm2_epaddr_t epaddr; /**> This ep's ep address */
+ psm2_mq_t mq; /**> only 1 MQ */
+ int unit_id;
+ uint16_t portnum;
+ uint16_t out_sl;
+ uint16_t mtu; /* out_sl-->vl-->mtu in sysfs */
+ uint16_t network_pkey; /**> OPA Pkey */
+ int did_syslog;
+ psm2_uuid_t uuid;
+ uint16_t jkey;
+ uint64_t service_id; /* OPA service ID */
+ psm2_path_res_t path_res_type; /* Path resolution for endpoint */
+ psm2_ep_errhandler_t errh;
+ int devid_enabled[PTL_MAX_INIT];
+ int memmode; /**> min, normal, large memory mode */
+
+ uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */
+ uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/
+ uint32_t hfi_imm_size; /** Immediate data size */
+ uint32_t connections; /**> Number of connections */
+
+ psmi_context_t context;
+ char *context_mylabel;
+ /* Spin-poll iterations before yielding (see PSMI_BLOCKUNTIL). */
+ uint32_t yield_spin_cnt;
+
+ /* EP link-lists */
+ struct psm2_ep *user_ep_next;
+
+ /* EP link-lists for multi-context. */
+ struct psm2_ep *mctxt_prev;
+ struct psm2_ep *mctxt_next;
+ struct psm2_ep *mctxt_master;
+
+ /* Active Message handler table */
+ void **am_htable;
+
+ uint64_t gid_hi;
+ uint64_t gid_lo;
+
+ ptl_ctl_t ptl_amsh;
+ ptl_ctl_t ptl_ips;
+ ptl_ctl_t ptl_self;
+
+ /* All ptl data is allocated inline below */
+ uint8_t ptl_base_data[0] __attribute__ ((aligned(64)));
+};
+
+/* Head/tail of an MQ request list. */
+struct mqq {
+ psm2_mq_req_t first;
+ psm2_mq_req_t last;
+};
+
+/*
+ * Sequence number, viewable three ways: an 11-bit sequence with a
+ * 20-bit generation (11 + 20 = 31 bits), a single 31-bit number, or
+ * the raw 32-bit value.
+ */
+typedef
+union psmi_seqnum {
+ struct {
+ uint32_t psn_seq:11;
+ uint32_t psn_gen:20;
+ };
+ struct {
+ uint32_t psn_num:31;
+ };
+ uint32_t psn_val;
+} psmi_seqnum_t;
+
+/*
+ * PSM end point address. One per connection and per rail.
+ */
+struct psm2_epaddr {
+ psm2_epid_t epid; /* peer's epid */
+ ptl_ctl_t *ptlctl; /* The control structure for the ptl */
+ struct ips_proto *proto; /* only for ips protocol */
+ void *usr_ep_ctxt; /* User context associated with endpoint */
+};
+
+/* Default spin-poll iterations before yielding (overridable via the
+ * PSM2_YIELD_SPIN_COUNT environment variable, see psm_ep.c). */
+#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD
+# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250
+#endif
+
+/*
+ * Spin-poll until cond becomes true, yielding the progress lock after
+ * every yield_spin_cnt consecutive no-progress polls; the loop exits
+ * early on any error other than PSM2_OK_NO_PROGRESS.
+ * Users of BLOCKUNTIL should check the value of err upon return.
+ * NOTE(review): the yield test is ++spin_cnt == yield_spin_cnt, so a
+ * yield_spin_cnt of 0 would in effect never yield until the counter
+ * wraps - confirm callers never configure 0.
+ */
+#define PSMI_BLOCKUNTIL(ep, err, cond) do { \
+ int spin_cnt = 0; \
+ PSMI_PROFILE_BLOCK(); \
+ while (!(cond)) { \
+ err = psmi_poll_internal(ep, 1); \
+ if (err == PSM2_OK_NO_PROGRESS) { \
+ PSMI_PROFILE_REBLOCK(1); \
+ if (++spin_cnt == (ep)->yield_spin_cnt) { \
+ spin_cnt = 0; \
+ PSMI_YIELD((ep)->mq->progress_lock); \
+ } \
+ } \
+ else if (err == PSM2_OK) { \
+ PSMI_PROFILE_REBLOCK(0); \
+ spin_cnt = 0; \
+ } \
+ else \
+ break; \
+ } \
+ PSMI_PROFILE_UNBLOCK(); \
+} while (0)
+
+#endif /* _PSMI_EP_H */
diff --git a/psm_ep_connect.c b/psm_ep_connect.c
new file mode 100644
index 0000000..9657209
--- /dev/null
+++ b/psm_ep_connect.c
@@ -0,0 +1,620 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+#if _HFI_DEBUGGING
+PSMI_ALWAYS_INLINE(
+char *psmi_getdevice(int type))
+{
+ switch (type) {
+ case PTL_DEVID_IPS:
+ return "ips";
+ case PTL_DEVID_AMSH:
+ return "amsh";
+ case PTL_DEVID_SELF:
+ return "self";
+ default:
+ return "ips";
+ }
+}
+#endif
+
+psm2_error_t
+__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid,
+ int const *array_of_epid_mask, /* can be NULL */
+ psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr,
+ int64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ptl_ctl_t *ptlctl;
+ ptl_t *ptl;
+ int i, j, dup_idx;
+ int num_toconnect = 0;
+ int *epid_mask = NULL;
+ int *epid_mask_isdupof = NULL;
+ uint64_t t_start = get_cycles();
+ uint64_t t_left;
+ union psmi_envvar_val timeout_intval;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+ /*
+ * Normally we would lock here, but instead each implemented ptl component
+ * does its own locking. This is mostly because the ptl components are
+ * ahead of the PSM2 interface in that they can disconnect their peers.
+ */
+ if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
+ num_of_epid < 1) {
+ err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+ "Invalid psm2_ep_connect parameters");
+ goto fail_nolock;
+ }
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ /* We need two of these masks to detect duplicates */
+ err = PSM2_NO_MEMORY;
+ epid_mask =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+ if (epid_mask == NULL)
+ goto fail;
+ epid_mask_isdupof =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+ if (epid_mask_isdupof == NULL)
+ goto fail;
+ err = PSM2_OK;
+
+ /* Eventually handle timeouts across all connects. */
+ for (j = 0; j < num_of_epid; j++) {
+ if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
+ epid_mask[j] = 0;
+ else {
+ epid_mask[j] = 1;
+ array_of_errors[j] = PSM2_EPID_UNKNOWN;
+ array_of_epaddr[j] = NULL;
+ if (psmi_epid_version(array_of_epid[j]) >
+ PSMI_EPID_VERSION) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Unkown version of EPID - %"PRIu64" \n"
+ "Please upgrade PSM2 or set PSM2_ADDR_FMT=1 in the environment to force EPID version 1 \n",
+ psmi_epid_version(array_of_epid[j]));
+ }
+ num_toconnect++;
+ }
+ epid_mask_isdupof[j] = -1;
+ }
+
+ psmi_getenv("PSM2_CONNECT_TIMEOUT",
+ "End-point connection timeout over-ride. 0 for no time-out.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_CONNECT_TIMEOUT")) {
+ timeout = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
+ }
+
+ if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+ timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+ _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
+ num_toconnect, (double)timeout / 1e9);
+
+ /* Look for duplicates in input array */
+ for (i = 0; i < num_of_epid; i++) {
+ for (j = i + 1; j < num_of_epid; j++) {
+ if (array_of_epid[i] == array_of_epid[j] &&
+ epid_mask[i] && epid_mask[j]) {
+ epid_mask[j] = 0; /* don't connect more than once */
+ epid_mask_isdupof[j] = i;
+ }
+ }
+ }
+
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (ep->devid_enabled[i] == -1)
+ continue;
+ /* Set up the right connect ptrs */
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ ptlctl = &ep->ptl_ips;
+ ptl = ep->ptl_ips.ptl;
+ break;
+ case PTL_DEVID_AMSH:
+ ptlctl = &ep->ptl_amsh;
+ ptl = ep->ptl_amsh.ptl;
+ break;
+ case PTL_DEVID_SELF:
+ ptlctl = &ep->ptl_self;
+ ptl = ep->ptl_self.ptl;
+ break;
+ default:
+ ptlctl = &ep->ptl_ips; /*no-unused */
+ ptl = ep->ptl_ips.ptl; /*no-unused */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled PTL id %d\n",
+ ep->devid_enabled[i]);
+ break;
+ }
+ t_left = psmi_cycles_left(t_start, timeout);
+
+ if (_HFI_VDBG_ON) {
+ _HFI_VDBG_ALWAYS
+ ("Trying to connect with device %s\n",
+ psmi_getdevice(ep->devid_enabled[i]));
+ }
+ if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
+ epid_mask, array_of_errors,
+ array_of_epaddr,
+ cycles_to_nanosecs(t_left)))) {
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS
+ ("Connect failure in device %s err=%d\n",
+ psmi_getdevice(ep->devid_enabled[i]), err);
+ }
+ goto connect_fail;
+ }
+
+ /* Now process what's been connected */
+ for (j = 0; j < num_of_epid; j++) {
+ dup_idx = epid_mask_isdupof[j];
+ if (!epid_mask[j] && dup_idx == -1)
+ continue;
+
+ if (dup_idx != -1) { /* dup */
+ array_of_epaddr[j] = array_of_epaddr[dup_idx];
+ array_of_errors[j] = array_of_errors[dup_idx];
+ epid_mask_isdupof[j] = -1;
+ }
+
+ if (array_of_errors[j] == PSM2_OK) {
+ epid_mask[j] = 0; /* don't try on next ptl */
+ ep->connections++;
+ }
+ }
+ }
+
+ for (i = 0; i < num_of_epid; i++) {
+ ptl_ctl_t *c = NULL;
+ if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
+ continue;
+ /* If we see unreachable here, that means some PTLs were not enabled */
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+ err = PSM2_EPID_UNREACHABLE;
+ break;
+ }
+
+ psmi_assert_always(array_of_epaddr[i] != NULL);
+ c = array_of_epaddr[i]->ptlctl;
+ psmi_assert_always(c != NULL);
+ _HFI_VDBG("%-20s DEVICE %s (%p)\n",
+ psmi_epaddr_get_name(array_of_epid[i]),
+ c == &ep->ptl_ips ? "hfi" :
+ (c == &ep->ptl_amsh ? "amsh" : "self"),
+ (void *)array_of_epaddr[i]->ptlctl->ptl);
+ }
+
+ if (err == PSM2_OK)
+ for (i=0; i<num_of_epid; i++)
+ array_of_errors[i] = PSM2_OK;
+
+connect_fail:
+ /* If the error is a timeout (at worse) and the client is OPA MPI,
+ * just return timeout to let OPA MPI handle the hostnames that
+ * timed out */
+ if (err != PSM2_OK) {
+ char errbuf[PSM2_ERRSTRING_MAXLEN];
+ size_t len;
+ int j = 0;
+
+ if (err == PSM2_EPID_UNREACHABLE) {
+ char *deverr = "of an incorrect setting";
+ char *eperr = "";
+ char *devname = NULL;
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ deverr =
+ "there is no shared memory PSM2 device (shm)";
+ eperr = " shared memory";
+ } else
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ deverr =
+ "there is no OPA PSM2 device (hfi)";
+ eperr = " OPA";
+ }
+
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "Some%s endpoints could not be connected because %s "
+ "in the currently enabled PSM2_DEVICES (",
+ eperr, deverr);
+ for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1;
+ i++) {
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ devname = "hfi";
+ break;
+ case PTL_DEVID_AMSH:
+ devname = "shm";
+ break;
+ case PTL_DEVID_SELF:
+ default:
+ devname = "self";
+ break;
+ }
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%s,",
+ devname);
+ }
+ if (len < sizeof(errbuf) - 1 && devname != NULL)
+ /* parsed something, remove trailing comma */
+ errbuf[len - 1] = ')';
+ } else
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "%s", err == PSM2_TIMEOUT ?
+ "Detected connection timeout" :
+ psm2_error_get_string(err));
+
+ /* first pass, look for all nodes with the error */
+ for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
+ if (array_of_epid_mask != NULL
+ && !array_of_epid_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_OK)
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+ err != PSM2_EPID_UNREACHABLE)
+ continue;
+ if (err == array_of_errors[i]) {
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%c %s",
+ j == 0 ? ':' : ',',
+ psmi_epaddr_get_hostname
+ (array_of_epid[i]));
+ j++;
+ }
+ }
+ errbuf[sizeof(errbuf) - 1] = '\0';
+ err = psmi_handle_error(ep, err, "%s", errbuf);
+ }
+
+fail:
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+ if (epid_mask != NULL)
+ psmi_free(epid_mask);
+ if (epid_mask_isdupof != NULL)
+ psmi_free(epid_mask_isdupof);
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_connect)
+
+psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int64_t timeout)
+{
+ return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr,
+ array_of_epaddr_mask, array_of_errors,
+ PSM2_EP_DISCONNECT_GRACEFUL, timeout);
+}
+PSMI_API_DECL(psm2_ep_disconnect)
+
+psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int mode, int64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ptl_ctl_t *ptlctl;
+ ptl_t *ptl;
+ int i, j, dup_idx;
+ int num_todisconnect = 0;
+ int *epaddr_mask = NULL;
+ int *epaddr_mask_isdupof = NULL;
+ uint64_t t_start = get_cycles();
+ uint64_t t_left;
+ union psmi_envvar_val timeout_intval;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+
+ /*
+ * Normally we would lock here, but instead each implemented ptl component
+ * does its own locking. This is mostly because the ptl components are
+ * ahead of the PSM2 interface in that they can disconnect their peers.
+ */
+ if (ep == NULL || array_of_epaddr == NULL ||
+ num_of_epaddr < 1) {
+ err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+ "Invalid psm2_ep_disconnect parameters");
+ goto fail_nolock;
+ }
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ /* We need two of these masks to detect duplicates */
+ err = PSM2_NO_MEMORY;
+ epaddr_mask =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+ if (epaddr_mask == NULL)
+ goto fail;
+ epaddr_mask_isdupof =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+ if (epaddr_mask_isdupof == NULL)
+ goto fail;
+ err = PSM2_OK;
+
+ /* Eventually handle timeouts across all connects. */
+ for (j = 0; j < num_of_epaddr; j++) {
+ if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j])
+ epaddr_mask[j] = 0;
+ else {
+ epaddr_mask[j] = 1;
+ array_of_errors[j] = PSM2_EPID_UNKNOWN;
+ num_todisconnect++;
+ }
+ epaddr_mask_isdupof[j] = -1;
+ }
+
+ psmi_getenv("PSM2_DISCONNECT_TIMEOUT",
+ "End-point disconnection timeout over-ride. 0 for no time-out.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_DISCONNECT_TIMEOUT")) {
+ timeout = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100);
+ }
+
+ if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+ timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+ _HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n",
+ num_todisconnect, (double)timeout / 1e9);
+
+ /* Look for duplicates in input array */
+ for (i = 0; i < num_of_epaddr; i++) {
+ for (j = i + 1; j < num_of_epaddr; j++) {
+ if (array_of_epaddr[i] == array_of_epaddr[j] &&
+ epaddr_mask[i] && epaddr_mask[j]) {
+ epaddr_mask[j] = 0; /* don't disconnect more than once */
+ epaddr_mask_isdupof[j] = i;
+ }
+ }
+ }
+
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (ep->devid_enabled[i] == -1)
+ continue;
+ /* Set up the right connect ptrs */
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ ptlctl = &ep->ptl_ips;
+ ptl = ep->ptl_ips.ptl;
+ break;
+ case PTL_DEVID_AMSH:
+ ptlctl = &ep->ptl_amsh;
+ ptl = ep->ptl_amsh.ptl;
+ break;
+ case PTL_DEVID_SELF:
+ ptlctl = &ep->ptl_self;
+ ptl = ep->ptl_self.ptl;
+ break;
+ default:
+ ptlctl = &ep->ptl_ips; /*no-unused */
+ ptl = ep->ptl_ips.ptl; /*no-unused */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled PTL id %d\n",
+ ep->devid_enabled[i]);
+ break;
+ }
+ t_left = psmi_cycles_left(t_start, timeout);
+
+ if (_HFI_VDBG_ON) {
+ _HFI_VDBG_ALWAYS
+ ("Trying to disconnect with device %s\n",
+ psmi_getdevice(ep->devid_enabled[i]));
+ }
+ if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE),
+ num_of_epaddr, array_of_epaddr,
+ epaddr_mask, array_of_errors,
+ cycles_to_nanosecs(t_left)))) {
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS
+ ("Disconnect failure in device %s err=%d\n",
+ psmi_getdevice(ep->devid_enabled[i]), err);
+ }
+ goto disconnect_fail;
+ }
+
+ /* Now process what's been disconnected */
+ for (j = 0; j < num_of_epaddr; j++) {
+ dup_idx = epaddr_mask_isdupof[j];
+ if (!epaddr_mask[j] && dup_idx == -1)
+ continue;
+
+ if (dup_idx != -1) { /* dup */
+ array_of_errors[j] = array_of_errors[dup_idx];
+ epaddr_mask_isdupof[j] = -1;
+ }
+
+ if (array_of_errors[j] == PSM2_OK) {
+ epaddr_mask[j] = 0; /* don't try on next ptl */
+ array_of_epaddr[j] = NULL;
+ ep->connections--;
+ }
+ }
+ }
+
+ for (i = 0; i < num_of_epaddr; i++) {
+ if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i])
+ continue;
+ /* If we see unreachable here, that means some PTLs were not enabled */
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+ err = PSM2_EPID_UNREACHABLE;
+ break;
+ }
+ }
+
+disconnect_fail:
+ /* If the error is a timeout (at worse) and the client is OPA MPI,
+ * just return timeout to let OPA MPI handle the hostnames that
+ * timed out */
+ if (err != PSM2_OK) {
+ char errbuf[PSM2_ERRSTRING_MAXLEN];
+ size_t len;
+ int j = 0;
+
+ if (err == PSM2_EPID_UNREACHABLE) {
+ char *deverr = "of an incorrect setting";
+ char *eperr = "";
+ char *devname = NULL;
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ deverr =
+ "there is no shared memory PSM2 device (shm)";
+ eperr = " shared memory";
+ } else
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ deverr =
+ "there is no OPA PSM2 device (hfi)";
+ eperr = " OPA";
+ }
+
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "Some%s endpoints could not be disconnected because %s "
+ "in the currently enabled PSM2_DEVICES (",
+ eperr, deverr);
+ for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) {
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ devname = "hfi";
+ break;
+ case PTL_DEVID_AMSH:
+ devname = "shm";
+ break;
+ case PTL_DEVID_SELF:
+ default:
+ devname = "self";
+ break;
+ }
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%s,",
+ devname);
+ }
+ if (len < sizeof(errbuf) - 1 && devname != NULL)
+ /* parsed something, remove trailing comma */
+ errbuf[len - 1] = ')';
+ } else
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "%s", err == PSM2_TIMEOUT ?
+ "Detected disconnect timeout" :
+ psm2_error_get_string(err));
+
+ /* first pass, look for all nodes with the error */
+ for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) {
+ if (array_of_epaddr_mask != NULL
+ && !array_of_epaddr_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_OK)
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+ err != PSM2_EPID_UNREACHABLE)
+ continue;
+ if (err == array_of_errors[i]) {
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%c %s",
+ j == 0 ? ':' : ',',
+ psmi_epaddr_get_hostname
+ (array_of_epaddr[i]->epid));
+ j++;
+ }
+ }
+ errbuf[sizeof(errbuf) - 1] = '\0';
+ err = psmi_handle_error(ep, err, "%s", errbuf);
+ }
+
+fail:
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+ if (epaddr_mask != NULL)
+ psmi_free(epaddr_mask);
+ if (epaddr_mask_isdupof != NULL)
+ psmi_free(epaddr_mask_isdupof);
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_disconnect2)
diff --git a/psm_error.c b/psm_error.c
new file mode 100644
index 0000000..99bb94f
--- /dev/null
+++ b/psm_error.c
@@ -0,0 +1,348 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG -1
+
+struct psm2_error_token {
+ psm2_ep_t ep;
+ psm2_error_t error;
+ char err_string[PSM2_ERRSTRING_MAXLEN];
+};
+
+static
+psm2_error_t
+psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err,
+ const char *error_string, psm2_error_token_t token)
+{
+ return err;
+}
+
+static
+psm2_error_t
+psmi_errhandler_psm(psm2_ep_t ep,
+ const psm2_error_t err,
+ const char *error_string, psm2_error_token_t token)
+{
+ /* we want the error to be seen through ssh, etc., so we flush and then
+ * sleep a bit. Not perfect, but not doing so means it almost never
+ * gets seen. */
+ fprintf(stderr, "%s%s\n", hfi_get_mylabel(), token->err_string);
+ fflush(stdout);
+ fflush(stderr);
+
+ /* XXX Eventually, this will hook up to a connection manager, and we'll
+ * issue an upcall into the connection manager at shutdown time */
+ sleep(3);
+
+ /* We use this "special" ep internally to handle internal errors that are
+ * triggered from within code that is not expected to return to the user.
+ * Errors of this sort on not expected to be handled by users and always
+ * mean we have an internal PSM bug. */
+ if (err == PSM2_INTERNAL_ERR)
+ abort();
+ else
+ exit(-1);
+}
+
+psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop;
+
+psm2_error_t __psm2_error_defer(psm2_error_token_t token)
+{
+ psm2_error_t rv;
+ PSM2_LOG_MSG("entering");
+ rv = psmi_errhandler_psm(token->ep, token->error, token->err_string,
+ token);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_error_defer)
+
+psm2_error_t
+__psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler)
+{
+ psm2_ep_errhandler_t *errh;
+
+ PSM2_LOG_MSG("entering");
+
+ if (ep == NULL)
+ errh = &psmi_errhandler_global;
+ else
+ errh = &ep->errh;
+
+ if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER)
+ *errh = psmi_errhandler_psm;
+ else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER)
+ *errh = psmi_errhandler_noop;
+ else
+ *errh = errhandler;
+
+ PSM2_LOG_MSG("leaving");
+
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_error_register_handler)
+
+psm2_error_t
+MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...)
+{
+ va_list argptr;
+ int syslog_level;
+ int console_print = 0;
+ psm2_error_t newerr;
+ struct psm2_error_token token;
+ char *c, fullmsg[PSM2_ERRSTRING_MAXLEN];
+ token.error = error;
+ snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf);
+ fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+ va_start(argptr, buf);
+ vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr);
+ va_end(argptr);
+ token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+
+ /* Unless the user has set PSM2_NO_VERBOSE_ERRORS, always print errors to
+ * console */
+ c = getenv("PSM2_NO_VERBOSE_ERRORS");
+ console_print = 0;
+ if (ep == PSMI_EP_LOGEVENT)
+ console_print = 1;
+ else if (!c || *c == '\0') { /* no desire to prevent verbose errors */
+ /* Remove the console print if we're internally handling the error */
+ if (ep == PSMI_EP_NORETURN)
+ console_print = 0;
+ else if (ep == NULL
+ && psmi_errhandler_global != psmi_errhandler_psm)
+ console_print = 1;
+ else if (ep != NULL && ep->errh != psmi_errhandler_psm)
+ console_print = 1;
+ }
+
+ /* Before we let the user even handle the error, send to syslog */
+ syslog_level = psmi_error_syslog_level(error);
+ if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT)
+ psmi_syslog(ep, console_print,
+ ep == PSMI_EP_LOGEVENT ? LOG_NOTICE : syslog_level,
+ "%s (err=%d)", token.err_string, error);
+
+ if (ep == PSMI_EP_LOGEVENT) /* we're just logging */
+ newerr = PSM2_OK;
+ else if (ep == PSMI_EP_NORETURN)
+ newerr =
+ psmi_errhandler_psm(NULL, error, token.err_string, &token);
+ else if (ep == NULL)
+ newerr =
+ psmi_errhandler_global(NULL, error, token.err_string,
+ &token);
+ else
+ newerr = ep->errh(ep, error, token.err_string, &token);
+
+ return newerr;
+}
+MOCK_DEF_EPILOGUE(psmi_handle_error);
+
+/* Returns the "worst" error out of errA and errB */
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+ /* Bad runtime or before initialization */
+ _PSMI_ERR_IS(PSM2_NO_MEMORY);
+ _PSMI_ERR_IS(PSM2_INTERNAL_ERR);
+ _PSMI_ERR_IS(PSM2_INIT_NOT_INIT);
+ _PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION);
+
+ /* Before we cget an endpoint */
+ _PSMI_ERR_IS(PSM2_EP_NO_DEVICE);
+ _PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND);
+ _PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE);
+ _PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL);
+ _PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS);
+
+ /* As we open/close the endpoint */
+ _PSMI_ERR_IS(PSM2_EP_NO_NETWORK);
+ _PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR);
+ _PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT);
+ _PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY);
+ _PSMI_ERR_IS(PSM2_EP_NO_RESOURCES);
+
+ /* In connect phase */
+ _PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_NODE);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_MTU);
+
+ /* Timeout if nothing else */
+ _PSMI_ERR_IS(PSM2_TIMEOUT);
+
+ /* Last resort */
+ return max(errA, errB);
+}
+
+struct psmi_error_item {
+ int syslog_level;
+ const char *error_string;
+};
+
+static
+struct psmi_error_item psmi_error_items[] = {
+ {PSMI_NOLOG, "Success"}, /* PSM2_OK = 0, */
+ {PSMI_NOLOG, "No events were progressed in psm_poll"}, /* PSM2_OK_NO_PROGRESS = 1 */
+ {PSMI_NOLOG, "unknown 2"},
+ {PSMI_NOLOG, "Error in a function parameter"}, /* PSM2_PARAM_ERR = 3 */
+ {LOG_CRIT, "Ran out of memory"}, /* PSM2_NO_MEMORY = 4 */
+ {PSMI_NOLOG, "PSM has not been initialized by psm2_init"}, /* PSM2_INIT_NOT_INIT = 5 */
+ {LOG_INFO, "API version passed in psm2_init is incompatible"}, /* PSM2_INIT_BAD_API_VERSION = 6 */
+ {PSMI_NOLOG, "PSM Could not set affinity"}, /* PSM2_NO_AFFINITY = 7 */
+ {LOG_ALERT, "PSM Unresolved internal error"}, /* PSM2_INTERNAL_ERR = 8 */
+ {LOG_CRIT, "PSM could not set up shared memory segment"}, /* PSM2_SHMEM_SEGMENT_ERR = 9 */
+ {PSMI_NOLOG, "PSM option is a read-only option"}, /* PSM2_OPT_READONLY = 10 */
+ {PSMI_NOLOG, "Operation timed out"}, /* PSM2_TIMEOUT = 11 */
+ {LOG_INFO, "Exceeded supported amount of endpoints"},
+ /* PSM2_TOO_MANY_ENDPOINTS = 12 */
+ {PSMI_NOLOG, "PSM is in the finalized state"}, /* PSM2_IS_FINALIZED = 13 */
+ {PSMI_NOLOG, "unknown 14"},
+ {PSMI_NOLOG, "unknown 15"},
+ {PSMI_NOLOG, "unknown 16"},
+ {PSMI_NOLOG, "unknown 17"},
+ {PSMI_NOLOG, "unknown 18"},
+ {PSMI_NOLOG, "unknown 19"},
+ {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */
+ {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */
+ {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */
+ {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */
+ {PSMI_NOLOG, "Error closing the endpoing error"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */
+ {PSMI_NOLOG, "No free contexts could be obtained"}, /* PSM2_EP_NO_PORTS_AVAIL = 25 */
+ {LOG_ALERT, "Could not detect network connectivity"}, /* PSM2_EP_NO_NETWORK = 26 */
+ {LOG_INFO, "Invalid Unique job-wide UUID Key"}, /* PSM2_EP_INVALID_UUID_KEY = 27 */
+ {LOG_INFO, "Out of endpoint resources"}, /* PSM2_EP_NO_RESOURCES = 28 */
+ {PSMI_NOLOG, "unknown 29"},
+ {PSMI_NOLOG, "unknown 30"},
+ {PSMI_NOLOG, "unknown 31"},
+ {PSMI_NOLOG, "unknown 32"},
+ {PSMI_NOLOG, "unknown 33"},
+ {PSMI_NOLOG, "unknown 34"},
+ {PSMI_NOLOG, "unknown 35"},
+ {PSMI_NOLOG, "unknown 36"},
+ {PSMI_NOLOG, "unknown 37"},
+ {PSMI_NOLOG, "unknown 38"},
+ {PSMI_NOLOG, "unknown 39"},
+ {PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"}, /* PSM2_EPID_UNKNOWN = 40 */
+ {PSMI_NOLOG, "Endpoint could not be reached"}, /* PSM2_EPID_UNREACHABLE = 41 */
+ {PSMI_NOLOG, "unknown 42"},
+ {LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"}, /* PSM2_EPID_INVALID_NODE = 43 */
+ {LOG_CRIT, "Invalid MTU"}, /* PSM2_EPID_INVALID_MTU = 44 */
+ {PSMI_NOLOG, "UUID key mismatch"}, /* PSM2_EPID_INVALID_UUID_KEY = 45 */
+ {LOG_ERR, "Incompatible PSM version"}, /* PSM2_EPID_INVALID_VERSION = 46 */
+ {LOG_CRIT, "Connect received garbled connection information"}, /* PSM2_EPID_INVALID_CONNECT = 47 */
+ {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */
+ {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */
+ {LOG_CRIT, "Endpoint provided incompatible Partition Key"},
+ {LOG_CRIT, "Unable to resolve network path. Is the SM running?"},
+ {PSMI_NOLOG, "unknown 52"},
+ {PSMI_NOLOG, "unknown 53"},
+ {PSMI_NOLOG, "unknown 54"},
+ {PSMI_NOLOG, "unknown 55"},
+ {PSMI_NOLOG, "unknown 56"},
+ {PSMI_NOLOG, "unknown 57"},
+ {PSMI_NOLOG, "unknown 58"},
+ {PSMI_NOLOG, "unknown 59"},
+ {PSMI_NOLOG, "MQ Non-blocking request is incomplete"}, /* PSM2_MQ_NO_COMPLETIONS = 60 */
+ {PSMI_NOLOG, "MQ Message has been truncated at the receiver"}, /* PSM2_MQ_TRUNCATION = 61 */
+ {PSMI_NOLOG, "unknown 62"},
+ {PSMI_NOLOG, "unknown 63"},
+ {PSMI_NOLOG, "unknown 64"},
+ {PSMI_NOLOG, "unknown 65"},
+ {PSMI_NOLOG, "unknown 66"},
+ {PSMI_NOLOG, "unknown 67"},
+ {PSMI_NOLOG, "unknown 68"},
+ {PSMI_NOLOG, "unknown 69"},
+ {PSMI_NOLOG, "Invalid AM reply"},
+ {PSMI_NOLOG, "unknown 71"},
+ {PSMI_NOLOG, "unknown 72"},
+ {PSMI_NOLOG, "unknown 73"},
+ {PSMI_NOLOG, "unknown 74"},
+ {PSMI_NOLOG, "unknown 75"},
+ {PSMI_NOLOG, "unknown 76"},
+ {PSMI_NOLOG, "unknown 77"},
+ {PSMI_NOLOG, "unknown 78"},
+ {PSMI_NOLOG, "unknown 79"},
+ {PSMI_NOLOG, "unknown 80"},
+};
+
+const char *__psm2_error_get_string(psm2_error_t error)
+{
+ PSM2_LOG_MSG("entering");
+ if (error >= PSM2_ERROR_LAST) {
+ PSM2_LOG_MSG("leaving");
+ return "unknown";
+ }
+ else {
+ PSM2_LOG_MSG("leaving");
+ return psmi_error_items[error].error_string;
+ }
+}
+PSMI_API_DECL(psm2_error_get_string)
+
+int psmi_error_syslog_level(psm2_error_t error)
+{
+ if (error >= PSM2_ERROR_LAST)
+ return PSMI_NOLOG;
+ else
+ return psmi_error_items[error].syslog_level;
+}
diff --git a/psm_error.h b/psm_error.h
new file mode 100644
index 0000000..f335382
--- /dev/null
+++ b/psm_error.h
@@ -0,0 +1,78 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+#include "psm2_mock_testing.h"
+
+#ifndef _PSMI_IN_USER_H
+#error psm_error.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_ERROR_H
+#define _PSMI_ERROR_H
+
+/* Sentinel values used in place of a real psm2_ep_t when calling
+ * psmi_handle_error() (e.g. psmi_assert_always passes PSMI_EP_NORETURN). */
+#define PSMI_EP_NONE (NULL)
+#define PSMI_EP_NORETURN ((psm2_ep_t) -2)
+#define PSMI_EP_LOGEVENT ((psm2_ep_t) -3)
+
+/* NOTE(review): this is a tentative definition in a header -- every
+ * translation unit including psm_user.h emits it as a common symbol.
+ * Consider 'extern' here with a single definition in one .c file. */
+psm2_ep_errhandler_t psmi_errhandler_global;
+
+/* Central printf-style error reporting entry point; mockable so unit
+ * tests can intercept error paths. */
+psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error,
+			 const char *buf, ...)
+			 __attribute__((format(printf, 3, 4)));
+MOCK_DCL_EPILOGUE(psmi_handle_error);
+
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB);
+int psmi_error_syslog_level(psm2_error_t error);
+
+#endif /* _PSMI_ERROR_H */
diff --git a/psm_help.h b/psm_help.h
new file mode 100644
index 0000000..12ebe5b
--- /dev/null
+++ b/psm_help.h
@@ -0,0 +1,190 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_HELP_H
+#define _PSMI_HELP_H
+#include "psm_log.h"
+
+/* XXX gcc only */
+/* Inlining helpers: wrap a full function definition FN.
+ * PSMI_ALWAYS_INLINE forces inlining; PSMI_NEVER_INLINE forbids it. */
+#define PSMI_INLINE(FN) \
+	static inline FN
+
+#define PSMI_ALWAYS_INLINE(FN) \
+	static __inline__ FN __attribute__((always_inline)); \
+	static __inline__ FN
+
+#define PSMI_NEVER_INLINE(FN) \
+	static FN __attribute__((noinline)); \
+	static FN
+
+#define _PPragma(x) _Pragma(x)
+
+/* STRINGIFY expands macro arguments before stringizing (two-step idiom);
+ * PSMI_CURLOC is a "file:line" literal for diagnostics. */
+#define STRINGIFY(s) _STRINGIFY(s)
+#define _STRINGIFY(s) #s
+#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__)
+/* Unconditional assertion: reports through psmi_handle_error with
+ * PSMI_EP_NORETURN (fatal); curloc identifies the call site. */
+#define psmi_assert_always_loc(x, curloc) \
+	do { \
+	if_pf(!(x)) { \
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				  "Assertion failure at %s: %s", curloc, \
+				  STRINGIFY(x)); \
+	} } while (0)
+
+#define psmi_assert_always(x) psmi_assert_always_loc(x, PSMI_CURLOC)
+
+/* psmi_assert() is active only in PSM_DEBUG builds; otherwise it
+ * compiles to nothing. */
+#ifdef PSM_DEBUG
+#  define psmi_assert(x)	psmi_assert_always(x)
+#  define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized())
+#else
+#  define psmi_assert(x)
+#  define PSMI_ASSERT_INITIALIZED()
+#endif
+
+/* PSMI_API_DECL(FN): export public symbol FN as a weak alias of the
+ * internal __FN implementation (GCC weak/alias attributes). */
+#define _PSMI_API_NAME(FN) __ ## FN
+#define _PSMI_API_STR(FN)  _STRINGIFY(__ ## FN)
+#define PSMI_API_DECL(FN) \
+	typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN))));
+
+/* Early-return guard: callers use this to fail with PSM2_INIT_NOT_INIT
+ * before the library is initialized. */
+#define PSMI_ERR_UNLESS_INITIALIZED(ep) \
+	do { \
+		if (!psmi_isinitialized()) { \
+			PSM2_LOG_MSG("leaving"); \
+			return psmi_handle_error(ep, PSM2_INIT_NOT_INIT, \
+				"PSM2 has not been initialized"); \
+		} \
+	} while (0)
+
+/* Allocation-check helper: sets err and jumps to the caller's 'fail'
+ * label when mem is NULL (goto-cleanup pattern). */
+#define PSMI_CHECKMEM(err, mem) \
+	do { \
+		if ((mem) == NULL) { \
+			(err) = PSM2_NO_MEMORY; \
+			goto fail; \
+		} \
+	} while (0)
+
+#define PSMI_CACHEALIGN __attribute__((aligned(64)))
+
+/* Easy way to ignore the OK_NO_PROGRESS case */
+PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err))
+{
+	/* Codes at or below PSM2_OK_NO_PROGRESS are progress hints, not
+	 * real errors: collapse them to PSM2_OK. */
+	if (err > PSM2_OK_NO_PROGRESS)
+		return err;
+	else
+		return PSM2_OK;
+}
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+#define SEC_ULL 1000000000ULL
+#define MSEC_ULL 1000000ULL
+#define USEC_ULL 1000ULL
+#define NSEC_ULL 1ULL
+
+#define PSMI_TRUE 1
+#define PSMI_FALSE 0
+
+#define PSMI_CYCLES_TO_SECSF(cycles) \
+ ((double) cycles_to_nanosecs(cycles) / 1.0e9)
+
+#define PSMI_PAGESIZE psmi_getpagesize()
+#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0)
+#define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1)))
+#define PSMI_ALIGNUP(p, P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P)))
+
+#define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff))
+
+#ifdef PSM_DEBUG
+
+/* The intent of the following two macros is to emit an internal error if a size of a
+ 'member' is not as expected, violating an assumption in the code. There are some
+ problems with the implementation of this code:
+
+ The first macro creates a static const variable with ABSOLUTELY NO references
+ to them. For example there are ABSOLUTELY NO uses of the second macro in the
+ PSM code. This is not completely pure. GCC version 5, for example, emits a
+ warning for defining a static const when it is not referenced.
+
+ A better implementation of the intent of this code is to use static_assert()
+ so that at compile time the violations can be caught and corrected - not at
+ run time. */
+
+#define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz
+/* Verify at run time that 'member' has the size declared above; on a
+ * mismatch, emit the diagnostic and abort.  Fixes two defects in the
+ * previous version: the snprintf bound was a hard-coded 32 for a
+ * 64-byte buffer (use sizeof), and errmsg was built but never printed
+ * before exit(-1).  fprintf needs no new header: snprintf already
+ * requires <stdio.h> wherever this macro is expanded. */
+#define PSMI_STRICT_SIZE_VERIFY(member, sz) \
+	do { \
+		if (__psm2_ss_ ## member != (sz)) { \
+			char errmsg[64]; \
+			snprintf(errmsg, sizeof(errmsg), "Internal error: %s " \
+					"size doesn't match expected %d bytes", \
+					STRINGIFY(member), (int) __psm2_ss_ ## member); \
+			fprintf(stderr, "%s\n", errmsg); \
+			exit(-1); \
+		} \
+	} while (0)
+
+#else
+
+#define PSMI_STRICT_SIZE_DECL(member, sz) /* nothing */
+#define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */
+
+#endif /* PSM_DEBUG */
+
+#endif /* _PSMI_HELP_H */
diff --git a/psm_lock.h b/psm_lock.h
new file mode 100644
index 0000000..56e82a8
--- /dev/null
+++ b/psm_lock.h
@@ -0,0 +1,142 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_lock.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_LOCK_H
+#define _PSMI_LOCK_H
+
+#ifndef PSMI_USE_PTHREAD_SPINLOCKS
+#define PSMI_USE_PTHREAD_SPINLOCKS 0
+#endif
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+typedef pthread_spinlock_t psmi_spinlock_t;
+
+#define psmi_spin_init(lock) pthread_spin_init(lock, \
+ PTHREAD_PROCESS_PRIVATE)
+#define psmi_spin_lock(lock) pthread_spin_lock(lock)
+#define psmi_spin_trylock(lock) pthread_spin_trylock(lock)
+#define psmi_spin_unlock(lock) pthread_spin_unlock(lock)
+#else
+typedef ips_atomic_t psmi_spinlock_t;
+#define PSMI_SPIN_LOCKED 1
+#define PSMI_SPIN_UNLOCKED 0
+#endif
+
+/* psmi_lock_t structure */
+typedef struct {
+
+#ifdef PSMI_LOCK_IS_SPINLOCK
+ psmi_spinlock_t lock;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+ pthread_mutex_t lock;
+ pthread_t lock_owner;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+ pthread_mutex_t lock;
+#endif
+} psmi_lock_t;
+
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+#else
+/* Initialize the atomic-based spinlock to the unlocked state.
+ * Returns 0, matching the pthread_spin_init convention. */
+PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+
+/* Attempt to take the lock without blocking via compare-and-swap.
+ * Returns 0 on success, EBUSY if already held (pthread_spin_trylock
+ * convention). */
+PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock))
+{
+	if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED)
+			== PSMI_SPIN_UNLOCKED)
+		return 0;
+	else
+		return EBUSY;
+}
+
+/* Busy-wait (spin) until the lock is acquired; always returns 0. */
+PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock))
+{
+	while (psmi_spin_trylock(lock) == EBUSY) {
+	}
+	return 0;
+}
+
+/* Release the lock by storing the unlocked value.
+ * Fixed: the previous version called bare atomic_set(), inconsistent
+ * with psmi_spin_init() above and with the lock's ips_atomic_t type;
+ * ips_atomic_set is the store primitive used by this codebase. */
+PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+#endif /* PSMI_USE_PTHREAD_SPINLOCKS */
+
+/* Initialize a psmi_lock_t according to the compile-time lock flavor:
+ * spinlock, plain mutex, or error-checking mutex with owner tracking. */
+PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock))
+{
+#ifdef PSMI_LOCK_IS_SPINLOCK
+	psmi_spin_init(&(lock->lock));
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+	pthread_mutex_init(&(lock->lock), NULL);
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+	pthread_mutexattr_t attr;
+	pthread_mutexattr_init(&attr);
+	/* ERRORCHECK mutexes detect relock / unlock-by-non-owner at runtime */
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP);
+	pthread_mutex_init(&(lock->lock), &attr);
+	pthread_mutexattr_destroy(&attr);
+	lock->lock_owner = PSMI_LOCK_NO_OWNER;
+#endif
+}
+
+#endif /* _PSMI_LOCK_H */
diff --git a/psm_log.h b/psm_log.h
new file mode 100644
index 0000000..c808c5c
--- /dev/null
+++ b/psm_log.h
@@ -0,0 +1,224 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_LOG_H
+#define _PSMI_LOG_H
+
+/*
+
+ A note about PSM_LOG and PSM_LOG_FAST_IO:
+
+ By default, the PSM_LOG facility is safe, slow, and is complete. That is, if the
+  test case you are debugging has an abnormal termination, no problem.  The logs are
+ saved up to the point of the abnormal termination. Abnormal termination can be
+ a seg fault, the test case issues a fatal error, or exit()'s or abort()'s.
+
+  However, when debugging timing-sensitive problems, the usual SLOW PSM_LOG
+  facility is inadequate, as the timing overhead that it introduces dominates, and the
+ symptoms of your problem may change or go away.
+
+ For this case, you can use BOTH: PSM_LOG and PSM_LOG_FAST_IO. To use
+ PSM_LOG_FAST_IO though, caution: for abnormal program termination, you will get
+ no log file.
+
+ To workaround this problem, and allow you to get a log file even after an abnormal
+  program termination, we expose psmi_log_fini to the outside world (via the linker
+ script file), and so in your client test code, you can call psmi_log_fini() on a
+ fatal error (e.g. in a signal handler).
+
+ --------------------------------------------------------------------------------
+
+ This file (psm_log.h) defines macros for logging messages to assist investigations
+ into the psm library.
+
+ By default, these macros are not defined when building psm. When not defined, the
+ macros become no-ops in the PSM code.
+
+ When enabled (by defining the PSM_LOG symbol), the macros present information to
+ the psmi_log_message() facility for processing. See below for more information on the
+ psmi_log_message() facility.
+
+ To enable the macros, build PSM with the PSM_LOG environment variable exported, ala:
+
+ PSM_LOG=1 make ...
+
+ The macros are described in the following:
+
+ PSM2_LOG_MSG(FORMAT,...) Spills a printf-style message to the log.
+ PSM_LOG_DECLARE_BT_BUFFER() Declares a local back trace buffer for use with the
+ PSM_LOG_BT() macro.
+ PSM_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs from the
+ previous backtrace spilled to the log.
+
+ The psmi_log_message() facility is the backend for these messages when PSM_LOG is enabled.
+ The psmi_log_message() facility spills messages to unique log files based on the process id
+ and the thread id. So every unique process id, and thread id will spill to unique log files.
+ The psmi_log_message prefixes each message in the log files with a high resolution timer
+ message so that messages from multiple threads and log files can be reconciled to one timeline.
+ It is left as an exercise to the reader to reconcile log messages from different hosts to one
+ timeline.
+
+ The backtrace capability in the PSM_LOG functionality needs some explanation: often a bug
+ happens only when the code is tickled from a specific call-chain. The PSM_LOG_BT() macro
+ supports identifying the unique call-chain when a problem occurs. The model is as follows:
+
+ A unique declaration is made for a backtrace to spill the backtrace information to. This
+ declaration should be made in the same basic block as the use of the PSM_LOG_BT() macro.
+ To make the declaration, use PSM_LOG_DECLARE_BT_BUFFER().
+
+ When the PSM_LOG is enabled, at the statement for the macro: PSM_LOG_BT(NFRAMES,FORMAT,...),
+ the psmi_log_message() facility generates the current backtrace, and compares the first
+ NFRAMES of the current backtrace against the previous backtrace stored in the backtrace
+ buffer declared with the declaration. If the two backtraces differ, the psmi_log_message()
+ code saves the current backtrace into the declared buffer, and then spills the backtrace to the
+ log file.
+
+ At runtime, setting environment variables can squelch the log file from getting too big:
+
+ PSM2_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated FNL) (see below),
+  that will INClude the FNL's into the collection of functions to spill log data for.
+
+ PSM2_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below), that will EXClude the FNL's from the
+ collection of functions to spill log data for.
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number of a range of line numbers:
+ LINE1 :: lineNumber |
+ lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+ LINES :: LINE1 |
+ LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+ FN :: functionName |
+ functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+ FNL :: FN |
+ FN ':' FNL
+
+ # Examples:
+ foo:bar the two functions foo and bar
+ foo;1-10 lines 1 to 10 of function foo.
+ bar;1,3,5 lines 1, 3 and 5 of function bar
+
+ PSM2_LOG_SRCH_FORMAT_STRING If set, overrides the PSM2_LOG_INC_FUNCTION_NAMES
+ and PSM2_LOG_EXC_FUNCTION_NAMES settings. Causes the psmi_log_message() facility
+ to only emit the log messages that match (using fnmatch()) the message in FORMAT.
+
+ */
+
+#define PSM_LOG_EPM_TX ((int)1)
+#define PSM_LOG_EPM_RX ((int)0)
+
+
+#ifdef PSM_LOG
+
+extern void psmi_log_initialize(void);
+
+/* defined in psm_utils.c */
+extern void psmi_log_message(const char *fileName,
+ const char *functionName,
+ int lineNumber,
+ const char *format, ...);
+
+#ifdef PSM_LOG_FAST_IO
+extern void psmi_log_fini(void);
+#else
+#define psmi_log_fini() /* nothing */
+#endif
+
+#define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__)
+
+#define PSM_LOG_BT_BUFFER_SIZE 100
+
+#define PSM_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM_LOG_BT_BUFFER_SIZE]
+
+#define PSM_LOG_BT_MAGIC ((const char *)-1)
+
+#define PSM_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__)
+
+#define PSM_LOG_EPM_MAGIC ((const char *)-2)
+
+/* EPM is short for Emit Protocol Message to the log file.
+OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h
+TXRX is an int, and should be one of the above two consts (PSM_LOG_EPM_TX, or PSM_LOG_EPM_RX).
+FROMEPID and TOEPID are uint64_t's and the fromepid should be the epid (end point id) of the sender of the message
+ and the toepid should be the epid (end point id) of the receiver of the message
+ */
+#define PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+/* Just adds a condition to the PSM_LOG_EPM() macro. */
+#define PSM_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) if (COND) PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+#else
+
+#define psmi_log_initialize() /* nothing */
+
+#define PSM2_LOG_MSG(FORMAT , ...) /* nothing */
+
+#define psmi_log_fini() /* nothing */
+
+#define PSM_LOG_DECLARE_BT_BUFFER() /* nothing */
+
+#define PSM_LOG_BT(NFRAMES,FORMAT , ...) /* nothing */
+
+#define PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#define PSM_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#endif /* #ifdef PSM_LOG */
+
+#endif /* #ifndef _PSMI_LOG_H */
diff --git a/psm_memcpy.c b/psm_memcpy.c
new file mode 100644
index 0000000..d3c2b11
--- /dev/null
+++ b/psm_memcpy.c
@@ -0,0 +1,67 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "psm_mq_internal.h"
+
+/* memcpy-compatible wrapper over psmi_mq_mtucpy(); returns dst so it
+ * can substitute for memcpy in callers. */
+void *psmi_memcpyo(void *dst, const void *src, size_t n)
+{
+	psmi_mq_mtucpy(dst, src, n);
+	return dst;
+}
diff --git a/psm_mock.c b/psm_mock.c
new file mode 100644
index 0000000..bdcfd41
--- /dev/null
+++ b/psm_mock.c
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm2_mock_testing.h"
+
+#ifdef PSM2_MOCK_TESTING
+/* Mockable function wrappers around the _PSMI_LOCK* macros.  Compiled
+ * only for PSM2_MOCK_TESTING builds, where unit tests may substitute
+ * these functions to observe or stub out locking. */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_INIT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_init);
+/* Returns the result of the non-blocking lock attempt. */
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl)
+{
+	int ret = _PSMI_LOCK_TRY(*pl);
+	return ret;
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_try);
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock);
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock);
+/* Assertion helpers: check the lock is (un)held in debug builds. */
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert);
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert);
+#endif
diff --git a/psm_mpool.c b/psm_mpool.c
new file mode 100644
index 0000000..99f6748
--- /dev/null
+++ b/psm_mpool.c
@@ -0,0 +1,588 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
#define PSMI_MPOOL_ALIGNMENT 64

/*
 * Per-object header placed immediately before each user block.
 * While the object is free it sits on the pool free list (me_next);
 * while it is handed out the same union slot holds a backpointer to
 * the owning pool (set in psmi_mpool_get, read in psmi_mpool_put).
 */
struct mpool_element {
	union {
		SLIST_ENTRY(mpool_element) me_next;	/* free-list link (object free) */
		mpool_t me_mpool;	/* owning pool (object in use) */
	};

	uint32_t me_gen_count;	/* incremented on every psmi_mpool_put() */
	uint32_t me_index;	/* global object index across all chunks */
#ifdef PSM_DEBUG
	uint32_t me_isused;	/* 1 while handed out, 0 while on free list */
#endif
} __attribute__ ((aligned(8)));

#ifdef PSM_DEBUG
# define me_mark_used(me) ((me)->me_isused = 1)
# define me_mark_unused(me) ((me)->me_isused = 0)
#else
# define me_mark_used(me)
# define me_mark_unused(me)
#endif

/*
 * Pool descriptor.  Objects are carved out of chunks of
 * mp_num_obj_per_chunk elements; chunk base pointers live in
 * mp_elm_vector so psmi_mpool_find_obj_by_index() can translate a
 * global index back to an address.
 */
struct mpool {
	int mp_type;
	int mp_flags;		/* PSMI_MPOOL_* flags (see psm_mpool.h) */
	int mp_vector_shift;	/* log2(mp_num_obj_per_chunk) */

	uint32_t mp_elm_vector_size;	/* max number of chunks */
	uint32_t mp_elm_offset;	/* alignment pad before each element header */
	uint32_t mp_num_obj;	/* objects allocated so far (all chunks) */
	uint32_t mp_num_obj_inuse;	/* objects currently handed out */
	uint32_t mp_elm_size;	/* header + pad + rounded object size */
	uint32_t mp_obj_size;	/* rounded-up user object size */
	uint32_t mp_num_obj_per_chunk;
	uint32_t mp_num_obj_max_total;
	psmi_memtype_t mp_memtype;	/* memory-stats accounting category */

	SLIST_HEAD(, mpool_element) mp_head;	/* free list */
	struct mpool_element **mp_elm_vector;	/* one base pointer per chunk */
	struct mpool_element **mp_elm_vector_free;	/* next unused chunk slot */
	non_empty_callback_fn_t mp_non_empty_cb;	/* fired when pool refills */
	void *mp_non_empty_cb_context;

#ifdef PSM_CUDA
	/* Per-object hook invoked when a chunk is allocated or freed
	 * (CUDA builds only). */
	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
	void *mp_alloc_dealloc_cb_context;
#endif
};

static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Create a memory pool and allocates <num_obj_per_chunk> objects of size
+ * <obj_size>. If more memory is needed to accommodate mpool_get()
+ * requests, the memory pool will allocate another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of objects
+ * it can allocate.
+ *
+ * <obj_size> size of each individual object
+ * <num_obj_per_chunk> number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total> total number of objects that may be allocated
+ * at any given time. Must be a power of two greater than
+ * <num_obj_per_chunk>.
+ *
+ * <flags> flags to be applied on the memory pool (ie. memory
+ * alignment)
+ *
+ * <cb> callback to be called when the memory pool has some
+ * free objects available again (after running out of them).
+ * <context> context pointer for the callback
+ *
+ * Return the mpool on success, NULL on failure.
+ */
+mpool_t
+psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags,
+ psmi_memtype_t statstype,
+ non_empty_callback_fn_t cb, void *context)
+{
+ mpool_t mp;
+ int s;
+ size_t hdr_size;
+
+#ifdef PSM_VALGRIND
+ /* For Valgrind we wish to define a "redzone" before and after the
+ * allocation block, so we also allocate a blank mpool_element
+ * at the end of the user's block */
+#endif
+
+ if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+ !PSMI_POWEROFTWO(num_obj_max_total) ||
+ num_obj_max_total < num_obj_per_chunk) {
+ return NULL;
+ }
+
+ mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+ if (mp == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool: %s\n",
+ strerror(errno));
+ return NULL;
+ }
+
+ for (s = 1; s < num_obj_per_chunk; s <<= 1)
+ mp->mp_vector_shift++;
+
+ mp->mp_flags = flags;
+ mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+ mp->mp_num_obj_max_total = num_obj_max_total;
+ mp->mp_non_empty_cb = cb;
+ mp->mp_non_empty_cb_context = context;
+
+ mp->mp_memtype = statstype;
+
+ SLIST_INIT(&mp->mp_head);
+ mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+ mp->mp_elm_vector =
+ psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+ sizeof(struct mpool_element *));
+ if (mp->mp_elm_vector == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool vector: "
+ "%s\n", strerror(errno));
+ psmi_free(mp);
+ return NULL;
+ }
+
+ mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+ if (flags & PSMI_MPOOL_ALIGN) {
+ /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+ * boundary. */
+ hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+ PSMI_MPOOL_ALIGNMENT);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+
+ mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+ } else {
+ hdr_size = sizeof(struct mpool_element);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+ mp->mp_elm_offset = 0;
+ }
+
+ return mp;
+}
+
+mpool_t
+MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags,
+ psmi_memtype_t statstype, non_empty_callback_fn_t cb,
+ void *context)
+{
+ mpool_t mp;
+
+ mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+ num_obj_max_total, flags, statstype,
+ cb, context);
+
+ if (mp == NULL)
+ return NULL;
+
+ if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+ psmi_mpool_destroy(mp);
+ return NULL;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */ ,
+ PSM_VALGRIND_MEM_UNDEFINED);
+
+ return mp;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_create);
+
#ifdef PSM_CUDA
/*
 * Like psmi_mpool_create(), but additionally registers a per-object
 * alloc/dealloc callback <ad_cb> (invoked with <ad_context>) that fires
 * for every element when a chunk is allocated and again when the pool
 * is destroyed.  The callback fields must be set before the first
 * psmi_mpool_allocate_chunk() call so the chunk path can invoke it.
 */
mpool_t
psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
			   uint32_t num_obj_max_total, int flags,
			   psmi_memtype_t statstype,
			   non_empty_callback_fn_t cb, void *context,
			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
{
	mpool_t mp;

	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
				     num_obj_max_total, flags, statstype,
				     cb, context);

	if (mp == NULL)
		return NULL;

	/* Must be recorded before the chunk allocation below. */
	mp->mp_alloc_dealloc_cb = ad_cb;
	mp->mp_alloc_dealloc_cb_context = ad_context;

	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
		psmi_mpool_destroy(mp);
		return NULL;
	}

	VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */ ,
				PSM_VALGRIND_MEM_UNDEFINED);

	return mp;
}
#endif
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp> memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *psmi_mpool_get(mpool_t mp)
+{
+ struct mpool_element *me;
+ void *obj;
+
+ if (SLIST_EMPTY(&mp->mp_head)) {
+ if (psmi_mpool_allocate_chunk(mp) != PSM2_OK)
+ return NULL;
+ }
+
+ me = SLIST_FIRST(&mp->mp_head);
+ SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+ psmi_assert(!me->me_isused);
+ me_mark_used(me);
+
+ /* store a backpointer to the memory pool */
+ me->me_mpool = mp;
+ mp->mp_num_obj_inuse++;
+ psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+ obj = (void *)((uintptr_t) me + sizeof(struct mpool_element));
+ VALGRIND_MEMPOOL_ALLOC(mp, obj, mp->mp_obj_size);
+ return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj> object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem. This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
+ */
+void psmi_mpool_put(void *obj)
+{
+ struct mpool_element *me;
+ int was_empty;
+ mpool_t mp;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+ me->me_gen_count++;
+
+ mp = me->me_mpool;
+
+ psmi_assert(mp != NULL);
+ psmi_assert(mp->mp_num_obj_inuse >= 0);
+ psmi_assert(me->me_isused);
+ me_mark_unused(me);
+
+ was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+ SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+ mp->mp_num_obj_inuse--;
+
+ VALGRIND_MEMPOOL_FREE(mp, obj);
+
+ /* tell the user that memory is available */
+ if (mp->mp_non_empty_cb && was_empty)
+ mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
+
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int psmi_mpool_get_obj_index(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t psmi_mpool_get_obj_gen_count(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+ uint32_t *gen_count)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ *index = me->me_index;
+ *gen_count = me->me_gen_count;
+ return 0;
+}
+
/**
 * psmi_mpool_find_obj_by_index()
 *
 * <mp> memory pool
 * <index> index of the object
 *
 * Returns the object located at <index> in the memory pool or NULL if the
 * <index> is invalid.
 */
void *psmi_mpool_find_obj_by_index(mpool_t mp, int index)
{
	struct mpool_element *me;

	if_pf(index < 0 || index >= mp->mp_num_obj)
		return NULL;

	/* Chunk = index / num_obj_per_chunk (via mp_vector_shift, the
	 * log2 of the per-chunk count); position within the chunk is
	 * index % num_obj_per_chunk (mask works because the count is a
	 * power of two).  mp_elm_offset skips the alignment pad before
	 * the element header. */
	me = (struct mpool_element *)
	    ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
	     (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
	     mp->mp_elm_offset);

	/* If this mpool doesn't require generation counts, it's illegal to find a
	 * freed object */
#ifdef PSM_DEBUG
	if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
		psmi_assert(!me->me_isused);
#endif

	/* Return the user block that follows the element header. */
	return (void *)((uintptr_t) me + sizeof(struct mpool_element));
}
+
#ifdef PSM_CUDA
/**
 * psmi_mpool_chunk_dealloc()
 * <mp> memory pool
 * <idx> chunk index in mp_elm_vector
 *
 * Calls the registered dealloc callback on each element in the chunk,
 * passing the address of the element's user block (header skipped).
 */
void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
{
	uint32_t j;

	/* Address math via uintptr_t: arithmetic on a void * (as the old
	 * code did) is a GCC extension, not standard C. */
	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
					mp->mp_alloc_dealloc_cb_context,
					(void *)((uintptr_t)
						 mp->mp_elm_vector[idx] +
						 j * mp->mp_elm_size +
						 sizeof(struct mpool_element)));
}
#endif
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp> memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory. The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
+ */
+void psmi_mpool_destroy(mpool_t mp)
+{
+ int i = 0;
+ size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i]) {
+#ifdef PSM_CUDA
+ if (mp->mp_alloc_dealloc_cb)
+ psmi_mpool_chunk_dealloc(mp, i);
+#endif
+ psmi_free(mp->mp_elm_vector[i]);
+ }
+ }
+ psmi_free(mp->mp_elm_vector);
+ nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+ VALGRIND_DESTROY_MEMPOOL(mp);
+ psmi_free(mp);
+ nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_max_obj()
+ *
+ * <mp> memory pool
+ *
+ * Returns the num-obj-per-chunk
+ * Returns the num-obj-max-total
+ */
+void
+MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
+ uint32_t *num_obj_max_total)
+{
+ *num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+ *num_obj_max_total = mp->mp_num_obj_max_total;
+ return;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info);
+
/*
 * Grow the pool by one chunk of mp_num_obj_per_chunk elements, unless
 * that would exceed mp_num_obj_max_total.  Each new element is pushed
 * on the free list and the chunk base pointer is recorded in the next
 * free slot of mp_elm_vector.  Returns PSM2_OK or PSM2_NO_MEMORY.
 */
static int psmi_mpool_allocate_chunk(mpool_t mp)
{
	struct mpool_element *elm;
	void *chunk;
	uint32_t i = 0, num_to_allocate;

	/* Allocate a full chunk, or nothing at all once the cap is hit. */
	num_to_allocate =
	    mp->mp_num_obj + mp->mp_num_obj_per_chunk >
	    mp->mp_num_obj_max_total ? 0 : mp->mp_num_obj_per_chunk;

	psmi_assert(mp->mp_num_obj + num_to_allocate <=
		    mp->mp_num_obj_max_total);

	if (num_to_allocate == 0)
		return PSM2_NO_MEMORY;

#ifdef PSM_CUDA
	/* Zeroed allocation when a per-object hook is registered --
	 * presumably so the hook sees initialized memory; confirm
	 * against the callback implementations. */
	if (mp->mp_alloc_dealloc_cb)
		chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype,
				    num_to_allocate, mp->mp_elm_size);
	else
		chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
				    num_to_allocate * mp->mp_elm_size);
#else
	chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
			    num_to_allocate * mp->mp_elm_size);
#endif
	if (chunk == NULL) {
		fprintf(stderr,
			"Failed to allocate memory for memory pool chunk: %s\n",
			strerror(errno));
		return PSM2_NO_MEMORY;
	}

	for (i = 0; i < num_to_allocate; i++) {
#ifdef PSM_CUDA
		/* Hook gets the user block address, i.e. just past the
		 * element header. */
		if (mp->mp_alloc_dealloc_cb)
			mp->mp_alloc_dealloc_cb(1 /* is alloc */,
						mp->mp_alloc_dealloc_cb_context,
						chunk + i * mp->mp_elm_size +
						sizeof(struct mpool_element));
#endif
		/* The element header lives mp_elm_offset bytes into its
		 * slot (alignment pad), directly before the user block. */
		elm = (struct mpool_element *)((uintptr_t) chunk +
					       i * mp->mp_elm_size +
					       mp->mp_elm_offset);
		elm->me_gen_count = 0;
		elm->me_index = mp->mp_num_obj + i;
#ifdef PSM_DEBUG
		elm->me_isused = 0;
#endif
		SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next);
#if 0
		fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n",
			(long)(mp->mp_elm_vector_free - mp->mp_elm_vector),
			(int)i, elm,
			(void *)((uintptr_t) elm +
				 sizeof(struct mpool_element)), SLIST_NEXT(elm,
									   me_next));
#endif
	}

	/* Record the chunk base in the next free vector slot; the slot
	 * must still be within the vector. */
	psmi_assert((uintptr_t) mp->mp_elm_vector_free
		    < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size
		    * sizeof(struct mpool_element *));

	mp->mp_elm_vector_free[0] = chunk;
	mp->mp_elm_vector_free++;
	mp->mp_num_obj += num_to_allocate;

	return PSM2_OK;
}
+
#if 0
/* Debug-only dump of every element in every chunk; compiled out.
 * NOTE(review): it walks from each chunk base without adding
 * mp_elm_offset, so it appears to assume a pool created without
 * PSMI_MPOOL_ALIGN -- verify before re-enabling. */
void psmi_mpool_dump(mpool_t mp)
{
	int i, j;
	struct mpool_element *me;

	fprintf(stderr, "Memory pool %p has %d elements per chunk.\n",
		mp, mp->mp_num_obj_per_chunk);
	for (i = 0; i < mp->mp_elm_vector_size; i++) {
		if (mp->mp_elm_vector[i] != NULL) {
			fprintf(stderr, "===========================\n");
			fprintf(stderr, "mpool chunk #%d\n", i);

			for (j = 0, me = mp->mp_elm_vector[i];
			     j < mp->mp_num_obj_per_chunk;
			     j++, me = (struct mpool_element *)
				 ((uintptr_t) me + mp->mp_elm_size)) {
				fprintf(stderr,
					"obj=%p index=%d gen_count=%d\n",
					(void *)((uintptr_t) me +
						 sizeof(struct mpool_element)),
					me->me_index, me->me_gen_count);
			}
			fprintf(stderr, "===========================\n");
		}
	}
}
#endif
diff --git a/psm_mpool.h b/psm_mpool.h
new file mode 100644
index 0000000..8098f60
--- /dev/null
+++ b/psm_mpool.h
@@ -0,0 +1,107 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
#ifndef _PSMI_IN_USER_H
#error psm_mpool.h not meant to be included directly, include psm_user.h instead
#endif

#ifndef PSM_MPOOL_H
#define PSM_MPOOL_H

/* mpool flags */
#define PSMI_MPOOL_ALIGN_CACHE 0x1	/* align user blocks to a cache line */
#define PSMI_MPOOL_ALIGN_PAGE 0x2	/* not referenced by psm_mpool.c */
#define PSMI_MPOOL_NOGENERATION 0x4	/* debug builds assert a looked-up
					 * object is not on the free list */

/* Backwards compatibility */
#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE

typedef struct mpool *mpool_t;
/* Invoked when a previously exhausted pool has free objects again. */
typedef void (*non_empty_callback_fn_t) (void *context);
/* Per-object hook: is_alloc=1 when a chunk is created, 0 when the pool
 * is destroyed; <chunk> points at the object's user block. */
typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
					     void *chunk);

mpool_t
MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
			    uint32_t num_obj_max_total, int flags,
			    psmi_memtype_t statstype,
			    non_empty_callback_fn_t cb, void *context);
MOCK_DCL_EPILOGUE(psmi_mpool_create);

mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
				   uint32_t num_obj_max_total, int flags,
				   psmi_memtype_t statstype,
				   non_empty_callback_fn_t cb, void *context,
				   alloc_dealloc_callback_fn_t ad_cb,
				   void *ad_context);

void psmi_mpool_destroy(mpool_t mp);

void
MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
				  uint32_t *num_obj_max_total);
MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info);

void *psmi_mpool_get(mpool_t mp);
void psmi_mpool_put(void *obj);

int psmi_mpool_get_obj_index(void *obj);
uint32_t psmi_mpool_get_obj_gen_count(void *obj);
int psmi_mpool_get_obj_index_gen_count(void *obj,
				       uint32_t *index, uint32_t *gen_count);

void *psmi_mpool_find_obj_by_index(mpool_t mp, int index);

#endif
diff --git a/psm_mq.c b/psm_mq.c
new file mode 100644
index 0000000..44b602a
--- /dev/null
+++ b/psm_mq.c
@@ -0,0 +1,1433 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sched.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Functions to manipulate the expected queue in mq_ep.
+ */
+
/*
 * Once the linked lists cross the size limit, this function will enable tag
 * hashing and disable the non-hashing fastpath. We need to go back and insert
 * reqs into the hash tables where the hashing searches will look for them.
 */
void
psmi_mq_fastpath_disable(psm2_mq_t mq)
{
	psm2_mq_req_t *curp, cur;
	struct mqq *qp;
	unsigned hashvals[NUM_HASH_CONFIGS];
	/* NOTE: 't' does double duty.  The outer loop below advances via
	 * next[t], which requires t == PSM2_ANYTAG_ANYSRC at that point;
	 * the inner for-loop always terminates with t back at
	 * PSM2_ANYTAG_ANYSRC, preserving the invariant. */
	int t = PSM2_ANYTAG_ANYSRC;

	mq->nohash_fastpath = 0;
	/* Everything in the unexpected_q needs to be duplicated into
	   each of the (three) unexpected hash tables. */
	qp = &mq->unexpected_q;
	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) {
		mq->unexpected_hash_len++;
		/* tag words 0..1 hashed together for the fully-specified
		 * table, and individually for the partial-wildcard ones. */
		hashvals[PSM2_TAG_SRC] =
		    hash_64(*(uint64_t *) cur->tag.tag) % NUM_HASH_BUCKETS;
		hashvals[PSM2_TAG_ANYSRC] =
		    hash_32(cur->tag.tag[0]) % NUM_HASH_BUCKETS;
		hashvals[PSM2_ANYTAG_SRC] =
		    hash_32(cur->tag.tag[1]) % NUM_HASH_BUCKETS;
		for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++)
			mq_qq_append_which(mq->unexpected_htab,
					   t, hashvals[t], cur);
	}

	/* Everything in the expected_q needs to be moved into the
	   (single) correct expected hash table. */
	qp = &mq->expected_q;
	for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) {
		/* must read next ptr before remove */
		curp = &cur->next[PSM2_ANYTAG_ANYSRC];
		if ((cur->tagsel.tag[0] == 0xFFFFFFFF) &&
		    (cur->tagsel.tag[1] == 0xFFFFFFFF)) {
			/* hash tag0 and tag1 */
			t = PSM2_TAG_SRC;
			hashvals[t] = hash_64(*(uint64_t *) cur->tag.tag) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else if (cur->tagsel.tag[0] == 0xFFFFFFFF) {
			t = PSM2_TAG_ANYSRC;
			hashvals[t] = hash_32(cur->tag.tag[0]) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else if (cur->tagsel.tag[1] == 0xFFFFFFFF) {
			t = PSM2_ANYTAG_SRC;
			hashvals[t] = hash_32(cur->tag.tag[1]) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else
			continue;	/* else, req must stay in ANY ANY */

		mq->expected_list_len--;
		mq->expected_hash_len++;
		mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC);
	}
}
+
+/* easy threshold to re-enable: if |hash| == 0 && |list| < X
+ aggressive threshold: if |hash| + |list| < X
+ even easier: if |hash| + |list| == 0
+ might be better approach to avoid constant bouncing between modes */
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq)
+{
+ if_pf(mq->nohash_fastpath == 0 &&
+ mq->unexpected_hash_len == 0 &&
+ mq->expected_hash_len == 0 &&
+ mq->unexpected_list_len == 0 &&
+ mq->expected_list_len == 0){
+ mq->nohash_fastpath = 1;
+ }
+}
+
+/*
+ * ! @brief PSM exposed version to allow PTLs to match
+ */
+
+/*! @brief Try to match against the MQ using a tag and tagsel
+ *
+ * @param[in] mq Message Queue
+ * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR.
+ * @param[in] tag Input Tag
+ * @param[in] tagsel Input Tag Selector
+ * @param[in] remove Non-zero to remove the req from the queue
+ *
+ * @returns NULL if no match or an mq request if there is a match
+ */
static
psm2_mq_req_t
mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src,
			 psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove)
{
	psm2_mq_req_t *curp;
	psm2_mq_req_t cur;
	unsigned hashval;
	/* i selects the sublist to search; j is the first sublist a
	 * matched request must be unlinked from: 0 in hashing mode (the
	 * request was duplicated into every table), or PSM2_ANYTAG_ANYSRC
	 * in fastpath mode where only that list is populated. */
	int i, j = 0;
	struct mqq *qp;

	if_pt (mq->nohash_fastpath) {
		i = j = PSM2_ANYTAG_ANYSRC;
		qp = &mq->unexpected_q;
	} else if ((tagsel->tag[0] == 0xFFFFFFFF) &&
		   (tagsel->tag[1] == 0xFFFFFFFF)) {
		/* tag[0..1] fully specified: hash both words as one key */
		i = PSM2_TAG_SRC;
		hashval = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else if (tagsel->tag[0] == 0xFFFFFFFF) {
		i = PSM2_TAG_ANYSRC;
		hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else if (tagsel->tag[1] == 0xFFFFFFFF) {
		i = PSM2_ANYTAG_SRC;
		hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else {
		/* unhashable tag */
		i = PSM2_ANYTAG_ANYSRC;
		qp = &mq->unexpected_q;
	}

	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) {
		psmi_assert(cur->peer != PSM2_MQ_ANY_ADDR);
		/* Match when every tag word agrees wherever tagsel has
		 * bits set (XOR then mask). */
		if ((src == PSM2_MQ_ANY_ADDR || src == cur->peer) &&
		    !((tag->tag[0] ^ cur->tag.tag[0]) & tagsel->tag[0]) &&
		    !((tag->tag[1] ^ cur->tag.tag[1]) & tagsel->tag[1]) &&
		    !((tag->tag[2] ^ cur->tag.tag[2]) & tagsel->tag[2])) {
			/* match! */
			if (remove) {
				if_pt (i == PSM2_ANYTAG_ANYSRC)
					mq->unexpected_list_len--;
				else
					mq->unexpected_hash_len--;
				/* unlink from every sublist from j up */
				for (; j < NUM_MQ_SUBLISTS; j++)
					mq_qq_remove_which(cur, j);
				psmi_mq_fastpath_try_reenable(mq);
			}
			return cur;
		}
	}
	return NULL;
}
+
/* Post a receive request on the expected queue appropriate for its
 * tagsel: the linear ANY/ANY list in fastpath mode (or when the tagsel
 * is unhashable), otherwise the one hash table matching its wildcard
 * pattern. */
static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
{
	unsigned hashval;
	int i;

	/* Record MQ-wide posting order. */
	req->timestamp = mq->timestamp++;
	if_pt (mq->nohash_fastpath) {
		mq_qq_append(&mq->expected_q, req);
		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
		mq->expected_list_len++;
		/* Linear list grew too long: switch to hashed matching. */
		if_pf (mq->expected_list_len >= HASH_THRESHOLD)
			psmi_mq_fastpath_disable(mq);
	} else if ((req->tagsel.tag[0] == 0xFFFFFFFF) &&
		   (req->tagsel.tag[1] == 0xFFFFFFFF)) {
		i = PSM2_TAG_SRC;
		hashval = hash_64(*(uint64_t *) req->tag.tag) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else if (req->tagsel.tag[0] == 0xFFFFFFFF) {
		i = PSM2_TAG_ANYSRC;
		hashval = hash_32(req->tag.tag[0]) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else if (req->tagsel.tag[1] == 0xFFFFFFFF) {
		i = PSM2_ANYTAG_SRC;
		hashval = hash_32(req->tag.tag[1]) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else {
		/* unhashable tagsel: stays on the linear ANY/ANY list */
		mq_qq_append(&mq->expected_q, req);
		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
		mq->expected_list_len++;
	}
}
+
+/*! @brief Try to remove the req in the MQ
+ *
+ * @param[in] mq Message Queue
+ * @param[in] req MQ request
+ *
+ * @returns 1 if successfully removed, or 0 if req cannot be found.
+ */
+static
+int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req)
+{
+ int i;
+
+ /* item should only exist in one expected queue at a time */
+ psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1);
+
+ for (i = 0; i < NUM_MQ_SUBLISTS; i++)
+ if (req->q[i]) /* found */
+ break;
+ switch (i) {
+ case PSM2_ANYTAG_ANYSRC:
+ mq->expected_list_len--;
+ break;
+ case PSM2_TAG_SRC:
+ case PSM2_TAG_ANYSRC:
+ case PSM2_ANYTAG_SRC:
+ mq->expected_hash_len--;
+ break;
+ default:
+ return 0;
+ }
+
+ mq_qq_remove_which(req, i);
+ psmi_mq_fastpath_try_reenable(mq);
+ return 1;
+}
+
/* Copy nchars bytes from vsrc to vdest.  On CUDA builds, dispatches to
 * cudaMemcpy when either pointer is GPU memory; otherwise copies whole
 * 32-bit words first and then the 0-3 remaining bytes one at a time. */
void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *dest = (unsigned char *)vdest;
	const unsigned char *src = (const unsigned char *)vsrc;

#ifdef PSM_CUDA
	if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
		PSMI_CUDA_CALL(cudaMemcpy,
			       vdest, vsrc, nchars, cudaMemcpyDefault);
		return;
	}
#endif

	/* nchars/4 whole words via the dword copier... */
	if (nchars >> 2)
		hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
	dest += (nchars >> 2) << 2;
	src += (nchars >> 2) << 2;
	/* ...then the 0-3 byte tail; each case deliberately falls through. */
	switch (nchars & 0x03) {
	case 3:
		*dest++ = *src++;
		/* fallthrough */
	case 2:
		*dest++ = *src++;
		/* fallthrough */
	case 1:
		*dest++ = *src++;
	}
}
MOCK_DEF_EPILOGUE(psmi_mq_mtucpy);
+
/* Host-to-host copy of nchars bytes from vsrc to vdest; the buffers
 * must not overlap.  (Unlike psmi_mq_mtucpy, this variant never has to
 * consider GPU memory.) */
void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *out = (unsigned char *)vdest;
	const unsigned char *in = (const unsigned char *)vsrc;
	uint32_t i;

	for (i = 0; i < nchars; i++)
		out[i] = in[i];
}
+
#if 0 /* defined(__x86_64__) No consumers of mtucpy safe */
/* Disabled variant that would use hfi_dwordcpy_safe for the word copy;
 * kept for reference, compiled out because nothing calls it. */
void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *dest = (unsigned char *)vdest;
	const unsigned char *src = (const unsigned char *)vsrc;
	if (nchars >> 2)
		hfi_dwordcpy_safe((uint32_t *) dest, (uint32_t *) src,
				  nchars >> 2);
	dest += (nchars >> 2) << 2;
	src += (nchars >> 2) << 2;
	/* 0-3 byte tail; cases deliberately fall through */
	switch (nchars & 0x03) {
	case 3:
		*dest++ = *src++;
		/* fallthrough */
	case 2:
		*dest++ = *src++;
		/* fallthrough */
	case 1:
		*dest++ = *src++;
	}
}
#endif
+
/*
 * Common iprobe/improbe path: search the unexpected queue for a match
 * and, failing that, advance progress once and retry.  remove_req != 0
 * dequeues a matched request (improbe semantics).  Acquires and
 * releases mq->progress_lock around the search.
 */
PSMI_ALWAYS_INLINE(
psm2_mq_req_t
psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src,
		     psm2_mq_tag_t *tag,
		     psm2_mq_tag_t *tagsel, int remove_req))
{
	psm2_mq_req_t req;

	PSMI_LOCK(mq->progress_lock);
	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);

	if (req != NULL) {
		PSMI_UNLOCK(mq->progress_lock);
		return req;
	}

	/* Nothing queued yet: poll the endpoint once to pull in any
	 * pending arrivals... */
	psmi_poll_internal(mq->ep, 1);
	/* try again */
	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);

	PSMI_UNLOCK(mq->progress_lock);
	return req;
}
+
+psm2_error_t
+__psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ psm2_mq_status2_t *status)
+{
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0);
+ psmi_assert_req_not_internal(req);
+
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status2_copy(req, status);
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_iprobe2)
+
/* Legacy 64-bit-tag probe: widens tag/tagsel into the 3-word tag format
 * and probes from any source without consuming the match. */
psm2_error_t
__psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
		 psm2_mq_status_t *status)
{
	psm2_mq_tag_t rtag;
	psm2_mq_tag_t rtagsel;
	psm2_mq_req_t req;

	PSM2_LOG_MSG("entering");
	PSMI_ASSERT_INITIALIZED();

	/* The caller's 64-bit tag fills tag[0..1]. */
	*(uint64_t *) rtag.tag = tag;
#ifdef PSM_DEBUG
	/* tag[2] only needs a defined value for debug tooling; since
	 * rtagsel.tag[2] is zeroed below, its value cannot affect
	 * matching. */
	rtag.tag[2] = 0;
#endif
	*(uint64_t *) rtagsel.tag = tagsel;
	rtagsel.tag[2] = 0;

	req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0);
	psmi_assert_req_not_internal(req);

	if (req != NULL) {
		if (status != NULL) {
			mq_status_copy(req, status);
		}
		PSM2_LOG_MSG("leaving");
		return PSM2_OK;
	}

	PSM2_LOG_MSG("leaving");

	return PSM2_MQ_NO_COMPLETIONS;
}
PSMI_API_DECL(psm2_mq_iprobe)
+
+/* Matched probe, psm2_mq_tag_t flavor: like iprobe2 but on a match the
+ * request is dequeued (remove_req=1) and handed back in *reqo so the
+ * caller can receive it later with psm2_mq_imrecv.  *reqo is NULL when
+ * nothing matched. */
+psm2_error_t
+__psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status2_t *status)
+{
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+
+ req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status2_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe2)
+
+/* Matched probe, legacy 64-bit tag interface.  Widens the tag pair and
+ * probes any source with improbe (dequeue) semantics; a match is
+ * returned in *reqo for a later psm2_mq_imrecv. */
+psm2_error_t
+__psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status_t *status)
+{
+ psm2_mq_tag_t rtag;
+ psm2_mq_tag_t rtagsel;
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ /* Word 2 is masked out via rtagsel; zeroed only for debug builds. */
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+
+ req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe)
+
+/* Attempt to cancel an outstanding request.  Only receives that are
+ * still in the POSTED state (not yet matched) can be cancelled; a
+ * cancelled receive is marked complete and appended to the completed
+ * queue so the caller can still reap it with wait/test.  Cancelling a
+ * send is not supported and fails with PSM2_PARAM_ERR. */
+psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq)
+{
+ psm2_mq_req_t req = *ireq;
+ psm2_mq_t mq;
+ psm2_error_t err = PSM2_OK;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+
+ mq = req->mq;
+ PSMI_LOCK(mq->progress_lock);
+
+ if (MQE_TYPE_IS_RECV(req->type)) {
+ if (req->state == MQ_STATE_POSTED) {
+ int rc;
+
+ /* Unlink from the expected queue/hash; this must
+ * succeed for a POSTED request. */
+ rc = mq_req_remove_single(mq, req);
+ psmi_assert_always(rc);
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ err = PSM2_OK;
+ } else
+ /* Already matched or completed: too late to cancel. */
+ err = PSM2_MQ_NO_COMPLETIONS;
+ } else {
+ err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+ "Cannot cancel send requests (req=%p)",
+ req);
+ }
+
+ PSMI_UNLOCK(mq->progress_lock);
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_cancel)
+
+/* This is the only PSM function that blocks.
+ * We handle it in a special manner since we don't know what the user's
+ * execution environment is (threads, oversubscribing processes, etc).
+ *
+ * The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * do_lock non-zero means this routine owns taking/releasing the
+ * progress lock; zero means the caller already manages it.  On normal
+ * completion the request is freed and *ireq reset to PSM2_MQ_REQINVALID.
+ *
+ * Fix: the unlock epilogue previously read req->mq after
+ * psmi_mq_req_free(req) — a use-after-free.  The owning MQ is now
+ * cached before the request can be freed. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status,
+ psmi_mq_status_copy_t status_copy,
+ int do_lock))
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_t mq;
+
+ psm2_mq_req_t req = *ireq;
+ if (req == PSM2_MQ_REQINVALID) {
+ return PSM2_OK;
+ }
+
+ /* Cache the owning MQ now: req may be freed below, and the unlock
+ * epilogue must not dereference a freed request. */
+ mq = req->mq;
+
+ if (do_lock)
+ PSMI_LOCK(mq->progress_lock);
+
+ if (req->state != MQ_STATE_COMPLETE) {
+ /* We'll be waiting on this req, mark it as so */
+ req->type |= MQE_TYPE_WAITING;
+
+ _HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n",
+ req, req->buf, req->buf_len);
+
+ if (req->testwait_callback) {
+ err = req->testwait_callback(ireq);
+ if (do_lock)
+ PSMI_UNLOCK(mq->progress_lock);
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+ return err;
+ }
+
+ PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE);
+
+ if (err > PSM2_OK_NO_PROGRESS)
+ goto fail_with_lock;
+ else
+ err = PSM2_OK;
+ }
+
+ if(!psmi_is_req_internal(req))
+ mq_qq_remove(&mq->completed_q, req);
+
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+
+ _HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n",
+ req, req->buf, req->buf_len, req->error_code);
+
+ psmi_mq_req_free(req);
+ *ireq = PSM2_MQ_REQINVALID;
+
+fail_with_lock:
+ if (do_lock)
+ PSMI_UNLOCK(mq->progress_lock);
+ return err;
+}
+
+/* Block until *ireq completes, copy its completion information into
+ * *status (psm2_mq_status2_t flavor), and release the request. */
+psm2_error_t
+__psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_req_not_internal(*ireq);
+
+ ret = psmi_mq_wait_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_wait2)
+
+/* Block until *ireq completes, copy its completion information into
+ * *status (legacy psm2_mq_status_t flavor), and release the request. */
+psm2_error_t
+__psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_req_not_internal(*ireq);
+
+ ret = psmi_mq_wait_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status_copy, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_wait)
+
+/* Blocking wait on an internal (never user-visible) request: no status
+ * copy, and do_lock=0 so no progress-lock handling is done here —
+ * presumably the caller already holds the lock; verify at call sites. */
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq)
+{
+ return psmi_mq_wait_inner(ireq, NULL, NULL, 0);
+}
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * Non-blocking completion test.  Returns PSM2_MQ_NO_COMPLETIONS when
+ * the request is not complete (no progress is driven here unless the
+ * request carries a testwait_callback).  On completion the status is
+ * copied out, the request is removed from the completed queue and
+ * freed, and *ireq is reset to PSM2_MQ_REQINVALID. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status,
+ psmi_mq_status_copy_t status_copy))
+{
+ psm2_mq_req_t req = *ireq;
+ psm2_error_t err = PSM2_OK;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == PSM2_MQ_REQINVALID) {
+ return PSM2_OK;
+ }
+
+ if (req->state != MQ_STATE_COMPLETE) {
+ if (req->testwait_callback) {
+ /* Give the protocol layer a chance to finish the
+ * request; callback runs under the progress lock. */
+ PSMI_LOCK(req->mq->progress_lock);
+ err = req->testwait_callback(ireq);
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+ PSMI_UNLOCK(req->mq->progress_lock);
+ return err;
+ } else
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+
+ if (status != NULL)
+ status_copy(req, status);
+
+ _HFI_VDBG
+ ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n",
+ req, req->tag.tag[0], req->tag.tag[1], req->tag.tag[2], req->buf,
+ req->buf_len, req->error_code);
+
+ /* Complete: detach from the completed queue and release the request
+ * under the progress lock. */
+ PSMI_LOCK(req->mq->progress_lock);
+ mq_qq_remove(&req->mq->completed_q, req);
+ psmi_mq_req_free(req);
+ PSMI_UNLOCK(req->mq->progress_lock);
+
+ *ireq = PSM2_MQ_REQINVALID;
+
+ return err;
+}
+
+/* Non-blocking completion test, psm2_mq_status2_t flavor; on success
+ * the status is copied out and the request released. */
+psm2_error_t
+__psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_test_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_test2)
+
+/* Non-blocking completion test, legacy psm2_mq_status_t flavor; on
+ * success the status is copied out and the request released. */
+psm2_error_t
+__psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_test_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status_copy);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_test)
+
+/* Non-blocking send, psm2_mq_tag_t flavor.  Dispatches to the
+ * destination's protocol layer (ptlctl->mq_isend) under the progress
+ * lock; the resulting send request is returned in *req for a later
+ * wait/test. */
+psm2_error_t
+__psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *req)
+{
+ psm2_error_t err;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(stag != NULL);
+
+ PSMI_LOCK(mq->progress_lock);
+ err =
+ dest->ptlctl->mq_isend(mq, dest, flags, stag, buf, len, context,
+ req);
+ PSMI_UNLOCK(mq->progress_lock);
+
+#if 0
+#ifdef PSM_VALGRIND
+ /* If the send isn't completed yet, make sure that we mark the memory as
+ * unaccessible
+ */
+ if (*req != PSM2_MQ_REQINVALID && (*req)->state != MQ_STATE_COMPLETE)
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+#endif
+#endif
+ psmi_assert(*req != NULL);
+ psmi_assert_req_not_internal(*req);
+
+ /* Record the peer so status copies can report the source later. */
+ (*req)->peer = dest;
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_isend2)
+
+/* Non-blocking send, legacy 64-bit tag interface.  Widens the tag to
+ * the 96-bit form (word 2 zeroed) and dispatches to the destination's
+ * protocol layer under the progress lock. */
+psm2_error_t
+__psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req)
+{
+ psm2_error_t err;
+ psm2_mq_tag_t tag;
+
+ PSM2_LOG_MSG("entering");
+
+ *((uint64_t *) tag.tag) = stag;
+ tag.tag[2] = 0;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+ err =
+ dest->ptlctl->mq_isend(mq, dest, flags, &tag, buf, len, context,
+ req);
+ PSMI_UNLOCK(mq->progress_lock);
+
+#if 0
+#ifdef PSM_VALGRIND
+ /* If the send isn't completed yet, make sure that we mark the memory as
+ * unaccessible
+ */
+ if (*req != PSM2_MQ_REQINVALID && (*req)->state != MQ_STATE_COMPLETE)
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+#endif
+#endif
+ psmi_assert(*req != NULL);
+ psmi_assert_req_not_internal(*req);
+
+ /* Record the peer so status copies can report the source later. */
+ (*req)->peer = dest;
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_mq_isend)
+
+/* Blocking send, psm2_mq_tag_t flavor: hand the message to the
+ * destination's protocol layer under the progress lock and return once
+ * the protocol-level send call completes. */
+psm2_error_t
+__psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len)
+{
+ psm2_error_t rc;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(stag != NULL);
+
+ PSMI_LOCK(mq->progress_lock);
+ rc = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return rc;
+}
+PSMI_API_DECL(psm2_mq_send2)
+
+/* Blocking send, legacy 64-bit tag interface. */
+psm2_error_t
+__psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len)
+{
+ psm2_mq_tag_t ltag;
+ psm2_error_t rc;
+
+ PSM2_LOG_MSG("entering");
+
+ /* Widen the 64-bit legacy tag to the 96-bit form (word 2 zero). */
+ *((uint64_t *) ltag.tag) = stag;
+ ltag.tag[2] = 0;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+ rc = dest->ptlctl->mq_send(mq, dest, flags, &ltag, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return rc;
+}
+PSMI_API_DECL(psm2_mq_send)
+
+/*
+ * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv. This code assumes
+ * that the provided request has been matched, and begins copying message data
+ * that has already arrived to the user's buffer. Any remaining data is copied
+ * by PSM polling until the message is complete.
+ */
+static psm2_error_t
+psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
+{
+ uint32_t copysz;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(MQE_TYPE_IS_RECV(req->type));
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn_t psmi_mtucpy_fn;
+ if (req->is_buf_gpu_mem)
+ psmi_mtucpy_fn = psmi_mq_mtucpy;
+ else
+ psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+#endif
+
+ switch (req->state) {
+ case MQ_STATE_COMPLETE:
+ if (req->buf != NULL) { /* 0-byte messages don't alloc a sysbuf */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf, copysz);
+ psmi_mq_sysbuf_free(mq, req->buf);
+ }
+ req->buf = buf;
+ req->buf_len = len;
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_STATE_UNEXP: /* not done yet */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+ /* Copy What's been received so far and make sure we don't receive
+ * any more than copysz. After that, swap system with user buffer
+ */
+ req->recv_msgoff = min(req->recv_msgoff, copysz);
+ if (req->recv_msgoff) {
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf,
+ req->recv_msgoff);
+ }
+ /* What's "left" is no access */
+ VALGRIND_MAKE_MEM_NOACCESS((void *)((uintptr_t) buf +
+ req->recv_msgoff),
+ len - req->recv_msgoff);
+ psmi_mq_sysbuf_free(mq, req->buf);
+
+ req->state = MQ_STATE_MATCHED;
+ req->buf = buf;
+ req->buf_len = len;
+ break;
+
+ case MQ_STATE_UNEXP_RV: /* rendez-vous ... */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+ /* Copy What's been received so far and make sure we don't receive
+ * any more than copysz. After that, swap system with user buffer
+ */
+ req->recv_msgoff = min(req->recv_msgoff, copysz);
+ if (req->recv_msgoff) {
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf,
+ req->recv_msgoff);
+ }
+ /* What's "left" is no access */
+ VALGRIND_MAKE_MEM_NOACCESS((void *)((uintptr_t) buf +
+ req->recv_msgoff),
+ len - req->recv_msgoff);
+ if (req->send_msgoff) {
+ psmi_mq_sysbuf_free(mq, req->buf);
+ }
+
+ req->state = MQ_STATE_MATCHED;
+ req->buf = buf;
+ req->buf_len = len;
+ req->rts_callback(req, 0);
+ break;
+
+ default:
+ fprintf(stderr, "Unexpected state %d in req %p\n", req->state,
+ req);
+ fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+ req->type, req->mq, req->tag.tag[0], req->tag.tag[1],
+ req->tag.tag[2]);
+ abort();
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/* Post a receive, psm2_mq_tag_t flavor.  First checks (and on a match
+ * removes from) the unexpected queue; a match starts delivery right
+ * away via psm2_mq_irecv_inner, otherwise a new request is added to
+ * the expected queue/hashes to await arrival.  The request is returned
+ * in *reqo for wait/test. */
+psm2_error_t
+__psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ uint32_t flags, void *buf, uint32_t len, void *context,
+ psm2_mq_req_t *reqo)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_req_t req;
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees the all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)buf);
+ gpu_mem = 1;
+ } else
+ gpu_mem = 0;
+#endif
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+
+ /* First check unexpected Queue and remove req if found */
+ req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY);
+
+ if (req == NULL) {
+ /* prepost before arrival, add to expected q */
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ if_pf(req == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto ret;
+ }
+
+ req->peer = src;
+ req->tag = *tag;
+ req->tagsel = *tagsel;
+ req->state = MQ_STATE_POSTED;
+ req->buf = buf;
+ req->buf_len = len;
+ req->recv_msglen = len;
+ req->recv_msgoff = 0;
+ req->context = context;
+
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem = gpu_mem;
+#endif
+
+ /* Nobody should touch the buffer after it's posted */
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+
+ mq_add_to_expected_hashes(mq, req);
+ _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x "
+ " tagsel=%08x.%08x.%08x req=%p\n",
+ buf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+ tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+ } else {
+ _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x"
+ " tagsel=%08x.%08x.%08x req=%p\n", buf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2],
+ tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem = gpu_mem;
+#endif
+
+ req->context = context;
+
+ /* Message already (partially) here: begin delivery now. */
+ psm2_mq_irecv_inner(mq, req, buf, len);
+ }
+
+ret:
+ PSMI_UNLOCK(mq->progress_lock);
+ psmi_assert_req_not_internal(req);
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_irecv2)
+
+/* Post a receive, legacy 64-bit tag interface.  Widens tag/tagsel to
+ * the 96-bit form and forwards to __psm2_mq_irecv2 with any-source
+ * matching. */
+psm2_error_t
+__psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo)
+{
+ psm2_error_t rv;
+ psm2_mq_tag_t rtag;
+ psm2_mq_tag_t rtagsel;
+
+ *reqo = NULL;
+
+ PSM2_LOG_MSG("entering");
+
+ /* Word 2 is masked out via rtagsel; zeroed only for debug builds. */
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+ rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel,
+ flags, buf, len, context, reqo);
+
+ psmi_assert_req_not_internal(*reqo);
+ PSM2_LOG_MSG("leaving");
+
+ return rv;
+}
+PSMI_API_DECL(psm2_mq_irecv)
+
+/* Receive a message previously matched with psm2_mq_improbe/improbe2:
+ * *reqo holds the matched request; begin delivering its data into buf.
+ * Fails with PSM2_PARAM_ERR if the request handle is invalid. */
+psm2_error_t
+__psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *reqo)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_req_t req = *reqo;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == PSM2_MQ_REQINVALID) {
+ err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+ "Invalid request (req=%p)", req);
+ } else {
+ /* Message is already matched -- begin delivering message data to the
+ user's buffer. */
+ req->context = context;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees the all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)buf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ PSMI_LOCK(mq->progress_lock);
+ psm2_mq_irecv_inner(mq, req, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ }
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_imrecv)
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * Peek at the first entry of the completed queue without dequeueing
+ * it; if the queue is empty, drive progress once under the lock and
+ * re-check.  The initial check reads completed_q.first without the
+ * progress lock — NOTE(review): presumably a benign racy fast path;
+ * confirm against the MQ threading model. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq,
+ void *status,
+ psmi_mq_status_copy_t status_copy))
+{
+ psm2_mq_req_t req;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ if ((req = mq->completed_q.first) == NULL) {
+ PSMI_LOCK(mq->progress_lock);
+ psmi_poll_internal(mq->ep, 1);
+ if ((req = mq->completed_q.first) == NULL) {
+ PSMI_UNLOCK(mq->progress_lock);
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+ PSMI_UNLOCK(mq->progress_lock);
+ }
+ /* something in the queue */
+ *oreq = req;
+ if (status != NULL)
+ status_copy(req, status);
+
+ return PSM2_OK;
+}
+
+/* Peek at the first completed request without dequeueing it;
+ * psm2_mq_status2_t flavor. */
+psm2_error_t
+__psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ *oreq = NULL;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_ipeek_inner(mq, oreq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy);
+
+ psmi_assert_req_not_internal(*oreq);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_ipeek2)
+
+/* Peek at the first completed request without dequeueing it; legacy
+ * psm2_mq_status_t flavor. */
+psm2_error_t
+__psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ *oreq = NULL;
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_ipeek_inner(mq, oreq, status,
+ (psmi_mq_status_copy_t) mq_status_copy);
+
+ psmi_assert_req_not_internal(*oreq);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_ipeek)
+
+/* Get (get != 0) or set a single MQ option.  Shared by
+ * psm2_mq_getopt/setopt and by option processing in psm2_mq_init.
+ * value points at a uint32_t for the keys handled here.  Unknown keys
+ * return PSM2_PARAM_ERR. */
+static
+psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t val32;
+
+ switch (key) {
+ case PSM2_MQ_RNDV_HFI_SZ:
+ /* HFI eager-to-rendezvous switchover threshold (bytes). */
+ if (get)
+ *((uint32_t *) value) = mq->hfi_thresh_rv;
+ else {
+ val32 = *((uint32_t *) value);
+ mq->hfi_thresh_rv = val32;
+ }
+ _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n",
+ mq->hfi_thresh_rv, get ? "GET" : "SET");
+ break;
+
+ case PSM2_MQ_RNDV_SHM_SZ:
+ /* Shared-memory eager-to-rendezvous switchover threshold. */
+ if (get)
+ *((uint32_t *) value) = mq->shm_thresh_rv;
+ else {
+ val32 = *((uint32_t *) value);
+ mq->shm_thresh_rv = val32;
+ }
+ _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n",
+ mq->shm_thresh_rv, get ? "GET" : "SET");
+ break;
+ case PSM2_MQ_MAX_SYSBUF_MBYTES:
+ /* Deprecated: this option no longer does anything. */
+ break;
+
+ default:
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown option key=%u", key);
+ break;
+ }
+ return err;
+}
+
+/* Public getter for MQ options; see psmi_mqopt_ctl for supported keys. */
+psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+ ret = psmi_mqopt_ctl(mq, key, value, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_getopt)
+
+/* Public setter for MQ options; see psmi_mqopt_ctl for supported keys. */
+psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+ ret = psmi_mqopt_ctl(mq, key, (void *)value, 0);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_setopt)
+
+/*
+ * This is the API for the user. We actually allocate the MQ much earlier, but
+ * the user can set options after obtaining an endpoint
+ */
+psm2_error_t
+__psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo)
+{
+ psm2_error_t err = PSM2_OK;
+
+ if (ep == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ psm2_mq_t mq = ep->mq;
+ int i;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+ psmi_assert_always(mq != NULL);
+ psmi_assert_always(mq->ep != NULL);
+
+ /* Process options */
+ for (i = 0; err == PSM2_OK && i < numopts; i++)
+ err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0);
+ if (err != PSM2_OK) /* error already handled */
+ goto fail;
+
+ /* Initialize the unexpected system buffer allocator */
+ psmi_mq_sysbuf_init(mq);
+ char buf[128];
+ psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf);
+ _HFI_VDBG("%s", buf);
+
+ *mqo = mq;
+
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_mq_init)
+
+/* Dump the MQ's statistics counters via _HFI_INFO; invoked from
+ * psm2_mq_finalize when mq->print_stats is set (PSM2_MQ_PRINT_STATS). */
+static
+void
+psmi_mq_print_stats(psm2_mq_t mq)
+{
+ psm2_mq_stats_t stats;
+
+ psm2_mq_get_stats(mq, &stats);
+ _HFI_INFO("rx_user_bytes %lu\n", stats.rx_user_bytes);
+ _HFI_INFO("rx_user_num %lu\n", stats.rx_user_num);
+ _HFI_INFO("rx_sys_bytes %lu\n", stats.rx_sys_bytes);
+ _HFI_INFO("rx_sys_num %lu\n", stats.rx_sys_num);
+
+ _HFI_INFO("tx_num %lu\n", stats.tx_num);
+ _HFI_INFO("tx_eager_num %lu\n", stats.tx_eager_num);
+ _HFI_INFO("tx_eager_bytes %lu\n", stats.tx_eager_bytes);
+ _HFI_INFO("tx_rndv_num %lu\n", stats.tx_rndv_num);
+ _HFI_INFO("tx_rndv_bytes %lu\n", stats.tx_rndv_bytes);
+
+ _HFI_INFO("tx_shm_num %lu\n", stats.tx_shm_num);
+ _HFI_INFO("rx_shm_num %lu\n", stats.rx_shm_num);
+
+ _HFI_INFO("rx_sysbuf_num %lu\n", stats.rx_sysbuf_num);
+ _HFI_INFO("rx_sysbuf_bytes %lu\n", stats.rx_sysbuf_bytes);
+}
+
+/* Finalize an MQ: optionally dump statistics when PSM2_MQ_PRINT_STATS
+ * was enabled.  Always returns PSM2_OK once initialization checks pass. */
+psm2_error_t __psm2_mq_finalize(psm2_mq_t mq)
+{
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+
+ if (mq->print_stats != 0)
+ psmi_mq_print_stats(mq);
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_finalize)
+
+/* Copy the MQ's accumulated statistics into *stats (struct copy,
+ * equivalent to the previous memcpy of sizeof(psm2_mq_stats_t)). */
+void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats)
+{
+ PSM2_LOG_MSG("entering");
+ *stats = mq->stats;
+ PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_mq_get_stats)
+
+/* Allocate and initialize a new MQ structure with built-in defaults
+ * (thresholds chosen per CPU model; later overridden by
+ * psmi_mq_initialize_defaults).  Returns PSM2_OK and the MQ in *mqo,
+ * or PSM2_NO_MEMORY / request-pool init error on failure. */
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo)
+{
+ psm2_error_t err = PSM2_OK;
+
+ psm2_mq_t mq =
+ (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq));
+ if (mq == NULL) {
+ err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for mq endpoint");
+ goto fail;
+ }
+
+ mq->ep = NULL;
+ /*mq->unexpected_callback = NULL; */
+ mq->memmode = psmi_parse_memmode();
+
+ /* Empty all match queues and hash tables. */
+ memset(mq->unexpected_htab, 0,
+ NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+ memset(mq->expected_htab, 0,
+ NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+ memset(&mq->expected_q, 0, sizeof(struct mqq));
+ memset(&mq->unexpected_q, 0, sizeof(struct mqq));
+ memset(&mq->completed_q, 0, sizeof(struct mqq));
+ memset(&mq->outoforder_q, 0, sizeof(struct mqq));
+ STAILQ_INIT(&mq->eager_q);
+
+
+ /* The values are overwritten in initialize_defaults, they're just set to
+ * sensible defaults until then */
+ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M)
+ {
+ mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2;
+ mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2;
+ } else {
+ mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON;
+ mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON;
+ }
+ mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ mq->hfi_base_window_rv = MQ_HFI_THRESH_RNDV_CUDA;
+#endif
+ mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
+
+ memset(&mq->stats, 0, sizeof(psm2_mq_stats_t));
+ err = psmi_mq_req_init(mq);
+ if (err)
+ goto fail;
+
+ *mqo = mq;
+
+ return PSM2_OK;
+fail:
+ if (mq != NULL)
+ psmi_free(mq);
+ return err;
+}
+
+/* Override the MQ's built-in defaults from PSM2_MQ_* environment
+ * variables: tiny-packet threshold (capped at 8), eager-to-rendezvous
+ * switchovers for HFI and shm, rendezvous window size (capped at 4MB),
+ * and whether stats are printed at finalize. */
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq)
+{
+ union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv,
+ env_shmrv, env_stats;
+
+ psmi_getenv("PSM2_MQ_TINY_HFI_THRESH",
+ "hfi tiny packet switchover (max 8, default 8)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny);
+ mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8);
+
+ psmi_getenv("PSM2_MQ_RNDV_HFI_THRESH",
+ "hfi eager-to-rendezvous switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv);
+ mq->hfi_thresh_rv = env_hfirv.e_uint;
+
+ psmi_getenv("PSM2_MQ_RNDV_HFI_WINDOW",
+ "hfi rendezvous window size, max 4M",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin);
+ mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint);
+
+ /* Re-evaluate this since it may have changed after initializing the shm
+ * device */
+ mq->shm_thresh_rv = psmi_shm_mq_rv_thresh;
+ psmi_getenv("PSM2_MQ_RNDV_SHM_THRESH",
+ "shm eager-to-rendezvous switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
+ mq->shm_thresh_rv = env_shmrv.e_uint;
+
+ psmi_getenv("PSM2_MQ_PRINT_STATS",
+ "Print MQ stats during finalization",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val) 0, &env_stats);
+ mq->print_stats = env_stats.e_uint;
+
+ /* Start on the non-hashed matching fast path until the protocol
+ * layer decides otherwise. */
+ mq->nohash_fastpath = 1;
+ return PSM2_OK;
+}
+
+/* Tear down an MQ: release the request pools and unexpected-message
+ * system buffers, then free the MQ itself.  MOCKABLE so unit tests can
+ * substitute their own implementation. */
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq)
+{
+ psmi_mq_req_fini(mq);
+ psmi_mq_sysbuf_fini(mq);
+ psmi_free(mq);
+ return PSM2_OK;
+}
+MOCK_DEF_EPILOGUE(psmi_mq_free);
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
new file mode 100644
index 0000000..f20bf34
--- /dev/null
+++ b/psm_mq_internal.h
@@ -0,0 +1,639 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef MQ_INT_H
+#define MQ_INT_H
+
+/* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */
+#ifdef malloc
+#undef malloc
+#endif
+#ifdef free
+#undef free
+#endif
+#include <smmintrin.h>
+#include "psm_user.h"
+#include "psm_sysbuf.h"
+
+#include "psm2_mock_testing.h"
+
+#if 0
+typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t)
+ (psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr,
+ uint64_t tag, uint32_t send_msglen, const void *payload,
+ uint32_t paylen);
+#endif
+
+#define NUM_HASH_BUCKETS 64
+#define HASH_THRESHOLD 65
+#define NUM_HASH_CONFIGS 3
+#define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1)
+#define REMOVE_ENTRY 1
+
+enum psm2_mq_tag_pattern {
+ PSM2_TAG_SRC = 0,
+ PSM2_TAG_ANYSRC,
+ PSM2_ANYTAG_SRC,
+ PSM2_ANYTAG_ANYSRC,
+};
+
+/*
+ * Matched-queue instance state.  Owns the send/receive request pools,
+ * the expected/unexpected matching structures (flat lists plus three
+ * hash tables indexed by tag pattern), and accumulated statistics.
+ */
+struct psm2_mq {
+ psm2_ep_t ep; /**> ep back pointer */
+ mpool_t sreq_pool;
+ mpool_t rreq_pool;
+
+ /* Hash tables used once the flat-list fastpath is disabled; first index
+ is the psm2_mq_tag_pattern, second the bucket */
+ struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+ struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+
+ /* in case the compiler can't figure out how to preserve the hashed values
+ between mq_req_match() and mq_add_to_unexpected_hashes() ... */
+ unsigned hashvals[NUM_HASH_CONFIGS];
+
+ /*psm_mq_unexpected_callback_fn_t unexpected_callback; */
+ struct mqq expected_q; /**> Preposted (expected) queue */
+ struct mqq unexpected_q; /**> Unexpected queue */
+ struct mqq completed_q; /**> Completed queue */
+
+ struct mqq outoforder_q; /**> OutofOrder queue */
+ STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */
+
+ /* Protocol switchover thresholds; defaults and environment overrides
+ are applied in psmi_mq_initialize_defaults() */
+ uint32_t hfi_thresh_tiny;
+ uint32_t hfi_thresh_rv;
+ uint32_t shm_thresh_rv;
+ uint32_t hfi_base_window_rv; /**> this is a base rndv window size,
+ will be further trimmed down per-connection based
+ on the peer's MTU */
+ int memmode;
+
+ uint64_t timestamp;
+ psm2_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */
+ int print_stats; /**> dump stats at finalization (PSM2_MQ_PRINT_STATS) */
+ /* Non-zero while matching uses only the flat PSM2_ANYTAG_ANYSRC lists;
+ cleared when the unexpected list reaches HASH_THRESHOLD */
+ int nohash_fastpath;
+ unsigned unexpected_hash_len;
+ unsigned unexpected_list_len;
+ unsigned expected_hash_len;
+ unsigned expected_list_len;
+
+ psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS];
+ int mem_ctrl_is_init;
+ uint64_t mem_ctrl_total_bytes;
+
+ psmi_lock_t progress_lock;
+};
+
+#define MQ_HFI_THRESH_TINY 8
+#define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16000 /* Eager Xeon non-blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */
+
+#define MQ_HFI_THRESH_RNDV_PHI2 200000
+#define MQ_HFI_THRESH_RNDV_XEON 64000
+
+#define MQ_HFI_WINDOW_RNDV_PHI2 4194304
+#define MQ_HFI_WINDOW_RNDV_XEON 131072
+
+#ifdef PSM_CUDA
+#define MQ_HFI_THRESH_RNDV_CUDA 2097152
+#endif
+
+#define MQ_SHM_THRESH_RNDV 16000
+
+#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND)
+#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV)
+
+#define MQE_TYPE_SEND 0x1000
+#define MQE_TYPE_RECV 0x2000
+#define MQE_TYPE_FLAGMASK 0x0fff
+#define MQE_TYPE_WAITING 0x0001
+#define MQE_TYPE_WAITING_PEER 0x0004
+#define MQE_TYPE_EAGER_QUEUE 0x0008
+
+#define MQ_STATE_COMPLETE 0
+#define MQ_STATE_POSTED 1
+#define MQ_STATE_MATCHED 2
+#define MQ_STATE_UNEXP 3
+#define MQ_STATE_UNEXP_RV 4
+#define MQ_STATE_FREE 5
+
+/*
+ * These must match the ips protocol message opcode.
+ */
+#define MQ_MSG_TINY 0xc1
+#define MQ_MSG_SHORT 0xc2
+#define MQ_MSG_EAGER 0xc3
+#define MQ_MSG_LONGRTS 0xc4
+
+/*
+ * Descriptor allocation limits.
+ * The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure
+ */
+#define MQ_SENDREQ_LIMITS { \
+ .env = "PSM2_MQ_SENDREQS_MAX", \
+ .descr = "Max num of isend requests in flight", \
+ .env_level = PSMI_ENVVAR_LEVEL_USER, \
+ .minval = 1, \
+ .maxval = ~0, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \
+ }
+
+#define MQ_RECVREQ_LIMITS { \
+ .env = "PSM2_MQ_RECVREQS_MAX", \
+ .descr = "Max num of irecv requests in flight", \
+ .env_level = PSMI_ENVVAR_LEVEL_USER, \
+ .minval = 1, \
+ .maxval = ~0, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \
+ }
+
+typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted);
+typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req);
+
+
+/* If request is marked as internal, then it will not
+ be exposed to the user, will not be added to the mq->completed_q.
+ This flag is set if request is used by e.g. MPI_SEND */
+#define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0)
+
+#define psmi_is_req_internal(req) ((req)->flags & PSMI_REQ_FLAG_IS_INTERNAL)
+
+#define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \
+ (!psmi_is_req_internal(req)))
+
+/* MQ request descriptor (receive flavor is the default).  One request
+ * tracks a single send or receive through matching and completion; the
+ * next/prev arrays let the same request be linked simultaneously into
+ * each per-tag-pattern sublist plus the flat list. */
+struct psm2_mq_req {
+ struct {
+ psm2_mq_req_t next[NUM_MQ_SUBLISTS];
+ psm2_mq_req_t prev[NUM_MQ_SUBLISTS];
+ STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */
+ };
+ struct mqq *q[NUM_MQ_SUBLISTS]; /* back-pointer to containing queue per sublist */
+ uint64_t timestamp; /* arrival/post order; used for match tie-breaking */
+ uint32_t state; /* MQ_STATE_* */
+ uint32_t type; /* MQE_TYPE_* bits */
+ psm2_mq_t mq;
+
+ /* Tag matching vars */
+ psm2_epaddr_t peer;
+ psm2_mq_tag_t tag;
+ psm2_mq_tag_t tagsel; /* used for receives */
+
+ /* Some PTLs want to get notified when there's a test/wait event */
+ mq_testwait_callback_fn_t testwait_callback;
+
+ /* Buffer attached to request. May be a system buffer for unexpected
+ * messages or a user buffer when an expected message */
+ uint8_t *buf;
+ uint32_t buf_len;
+ uint32_t error_code;
+
+ uint16_t msg_seqnum; /* msg seq num for mctxt */
+ uint32_t recv_msglen; /* Message length we are ready to receive */
+ uint32_t send_msglen; /* Message length from sender */
+ uint32_t recv_msgoff; /* Message offset into buf */
+ union {
+ uint32_t send_msgoff; /* Bytes received so far.. can be larger than buf_len */
+ uint32_t recv_msgposted;
+ };
+ uint32_t rts_reqidx_peer;
+
+ uint64_t flags; /* PSMI_REQ_FLAG_* (e.g. IS_INTERNAL) */
+
+ /* Used for request to send messages */
+ void *context; /* user context associated to sends or receives */
+
+ /* Used to keep track of unexpected rendezvous */
+ mq_rts_callback_fn_t rts_callback;
+ psm2_epaddr_t rts_peer;
+ uintptr_t rts_sbuf;
+
+#ifdef PSM_CUDA
+ /* is_buf_gpu_mem - used to indicate if the send or receive is issued
+ * on a device/host buffer.
+ * is_sendbuf_gpu_mem - Used to always select TID path on the receiver
+ * when send is on a device buffer
+ */
+ uint8_t is_buf_gpu_mem;
+ uint8_t is_sendbuf_gpu_mem;
+ STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch;
+ uint32_t prefetch_send_msgoff;
+ int cuda_hostbuf_used;
+ cudaIpcMemHandle_t cuda_ipc_handle;
+ cudaEvent_t cuda_ipc_event;
+ uint8_t cuda_ipc_handle_attached;
+#endif
+
+ /* PTLs get to store their own per-request data. MQ manages the allocation
+ * by allocating psm2_mq_req so that ptl_req_data has enough space for all
+ * possible PTLs.
+ */
+ union {
+ void *ptl_req_ptr; /* when used by ptl as pointer */
+ uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */
+ };
+};
+
+/* Hash a 64-bit tag word with the SSE4.2 CRC32 instruction. */
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_64(uint64_t a))
+{
+ return _mm_crc32_u64(0, a);
+}
+/* Hash a 32-bit tag word with the SSE4.2 CRC32 instruction. */
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_32(uint32_t a))
+{
+ return _mm_crc32_u32(0, a);
+}
+
+void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars);
+MOCK_DCL_EPILOGUE(psmi_mq_mtucpy);
+void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars);
+
+#if defined(__x86_64__)
+void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars);
+#else
+#define psmi_mq_mtucpy_safe psmi_mq_mtucpy
+#endif
+
+/*
+ * Copy a "tiny" payload (optimized for 0-8 bytes, but any length works).
+ * 0/4/8 bytes are copied as whole words; 1-3 and 5-7 finish with a byte
+ * tail; anything larger falls back to psmi_mq_mtucpy().  The switch
+ * fallthroughs below are intentional.
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) {
+ /* NOTE(review): this inner branch is unreachable -- the outer
+ * condition already requires PSMI_IS_CUDA_ENABLED to be true */
+ if (!PSMI_IS_CUDA_ENABLED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Please enable PSM CUDA support when using GPU buffer \n");
+ return;
+ }
+ PSMI_CUDA_CALL(cudaMemcpy, dest, src, len, cudaMemcpyDefault);
+ return;
+ }
+#endif
+ switch (len) {
+ case 8:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 0:
+ return;
+ case 7:
+ case 6:
+ case 5:
+ *dest++ = *src++;
+ len -= 4;
+ /* fallthrough: 1-3 bytes remain */
+ case 3:
+ case 2:
+ case 1:
+ break;
+ default: /* greater than 8 */
+ psmi_mq_mtucpy(dest, src, len);
+ return;
+ }
+ /* copy the 1-3 byte tail */
+ uint8_t *dest1 = (uint8_t *) dest;
+ uint8_t *src1 = (uint8_t *) src;
+ switch (len) {
+ case 3:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 2:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 1:
+ *dest1++ = *src1++;
+ }
+}
+
+#ifdef PSM_CUDA
+typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len);
+
+/*
+ * Host-memory-only variant of mq_copy_tiny(): used when both buffers are
+ * known to reside in host memory, so the per-call CUDA pointer checks can
+ * be skipped.  Fallthroughs are intentional, as in mq_copy_tiny().
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+ switch (len) {
+ case 8:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 0:
+ return;
+ case 7:
+ case 6:
+ case 5:
+ *dest++ = *src++;
+ len -= 4;
+ /* fallthrough: 1-3 bytes remain */
+ case 3:
+ case 2:
+ case 1:
+ break;
+ default: /* greater than 8 */
+ /* Fix: stay on the host-memory path.  The generic
+ * psmi_mq_mtucpy() would redo the CUDA pointer checks
+ * this variant exists to avoid. */
+ psmi_mq_mtucpy_host_mem(dest, src, len);
+ return;
+ }
+ /* copy the 1-3 byte tail */
+ uint8_t *dest1 = (uint8_t *) dest;
+ uint8_t *src1 = (uint8_t *) src;
+ switch (len) {
+ case 3:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 2:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 1:
+ *dest1++ = *src1++;
+ }
+}
+#endif
+
+/* Typedef describing a function to populate a psm2_mq_status(2)_t given a
+ * matched request. The purpose of this typedef is to avoid duplicating
+ * code to handle both PSM v1 and v2 status objects. Outer routines pass in
+ * either mq_status_copy or mq_status2_copy and the inner routine calls that
+ * provided routine to fill in the correct status type.
+ */
+typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status);
+
+/*
+ * Given an req with buffer ubuf of length ubuf_len,
+ * fill in the req's status and return the amount of bytes the request
+ * can receive.
+ *
+ * The function sets status truncation errors. Basically what MPI_Status does.
+ */
+/* Fill a PSM v1 status object from a matched/completed request.  Only the
+ * low 64 bits of the tag are exposed in the v1 status format. */
+PSMI_ALWAYS_INLINE(
+void
+mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status))
+{
+ status->context = req->context;
+ status->error_code = req->error_code;
+ status->nbytes = req->recv_msglen;
+ status->msg_length = req->send_msglen;
+ status->msg_tag = *((uint64_t *) req->tag.tag);
+}
+
+/* Fill a PSM v2 status object from a matched/completed request; unlike the
+ * v1 variant this also reports the peer and the full tag. */
+PSMI_ALWAYS_INLINE(
+void
+mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status))
+{
+ status->context = req->context;
+ status->error_code = req->error_code;
+ status->nbytes = req->recv_msglen;
+ status->msg_length = req->send_msglen;
+ status->msg_tag = req->tag;
+ status->msg_peer = req->peer;
+}
+
+/*
+ * Record send/recv lengths on a matched request, flagging PSM2_MQ_TRUNCATION
+ * when the posted buffer (recvlen) is smaller than the incoming message
+ * (sendlen).  Returns the number of bytes that will actually be received --
+ * essentially what MPI_Status conveys.
+ */
+PSMI_ALWAYS_INLINE(
+uint32_t
+mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen))
+{
+ int truncated = (recvlen < sendlen);
+ uint32_t nbytes = truncated ? recvlen : sendlen;
+
+ req->send_msglen = sendlen;
+ req->recv_msglen = nbytes;
+ req->error_code = truncated ? PSM2_MQ_TRUNCATION : PSM2_OK;
+ return nbytes;
+}
+
+/*
+ * Of up to four candidate requests (NULL slots skipped), return the index
+ * of the one with the smallest timestamp, or -1 if all four are NULL.
+ */
+PSMI_ALWAYS_INLINE(
+int
+min_timestamp_4(psm2_mq_req_t *match))
+{
+ uint64_t best_ts = (uint64_t)-1;
+ int best_idx = -1;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (match[i] == NULL)
+ continue;
+ if (match[i]->timestamp < best_ts) {
+ best_ts = match[i]->timestamp;
+ best_idx = i;
+ }
+ }
+ return best_idx;
+}
+
+/* Append req to the tail of flat queue q (the PSM2_ANYTAG_ANYSRC sublist)
+ * and record the back-pointer.  The debug build uses a macro instead of an
+ * inline so __FILE__/__LINE__ in the trace point at the caller; it also
+ * asserts the request is not internal-only. */
+#ifndef PSM_DEBUG
+/*! Append to Queue */
+PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req))
+{
+ req->next[PSM2_ANYTAG_ANYSRC] = NULL;
+ req->prev[PSM2_ANYTAG_ANYSRC] = q->last;
+ if (q->last)
+ q->last->next[PSM2_ANYTAG_ANYSRC] = req;
+ else
+ q->first = req;
+ q->last = req;
+ req->q[PSM2_ANYTAG_ANYSRC] = q;
+}
+#else
+#define mq_qq_append(qq, req) \
+ do { \
+ psmi_assert_req_not_internal(req); \
+ (req)->next[PSM2_ANYTAG_ANYSRC] = NULL; \
+ (req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last; \
+ if ((qq)->last) \
+ (qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req); \
+ else \
+ (qq)->first = (req); \
+ (qq)->last = (req); \
+ (req)->q[PSM2_ANYTAG_ANYSRC] = (qq); \
+ if (qq == &(req)->mq->completed_q) \
+ _HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \
+ (req), __FILE__, __LINE__); \
+ } while (0)
+#endif
+/* Append req to the tail of hash bucket q[table][bucket], linking through
+ * the per-table next/prev slot and recording the back-pointer. */
+PSMI_ALWAYS_INLINE(
+void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS],
+ int table, int bucket, psm2_mq_req_t req))
+{
+ req->next[table] = NULL;
+ req->prev[table] = q[table][bucket].last;
+ if (q[table][bucket].last)
+ q[table][bucket].last->next[table] = req;
+ else
+ q[table][bucket].first = req;
+ q[table][bucket].last = req;
+ req->q[table] = &q[table][bucket];
+}
+/* Unlink req from flat queue q (PSM2_ANYTAG_ANYSRC sublist only).  The
+ * req->q[] back-pointer is left unchanged here. */
+PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req))
+{
+ if (req->next[PSM2_ANYTAG_ANYSRC] != NULL)
+ req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] =
+ req->prev[PSM2_ANYTAG_ANYSRC];
+ else
+ q->last = req->prev[PSM2_ANYTAG_ANYSRC];
+ if (req->prev[PSM2_ANYTAG_ANYSRC])
+ req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] =
+ req->next[PSM2_ANYTAG_ANYSRC];
+ else
+ q->first = req->next[PSM2_ANYTAG_ANYSRC];
+}
+/* Unlink req from whichever queue its per-table back-pointer names, and
+ * clear that back-pointer. */
+PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table))
+{
+ struct mqq *q = req->q[table];
+
+ req->q[table] = NULL;
+ if (req->next[table] != NULL)
+ req->next[table]->prev[table] = req->prev[table];
+ else
+ q->last = req->prev[table];
+ if (req->prev[table])
+ req->prev[table]->next[table] = req->next[table];
+ else
+ q->first = req->next[table];
+}
+
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq);
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq);
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type);
+MOCK_DCL_EPILOGUE(psmi_mq_req_alloc);
+#define psmi_mq_req_free(req) psmi_mpool_put(req)
+
+/*
+ * Main receive progress engine, for shmops and hfi, in mq.c
+ */
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo);
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq);
+
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq);
+MOCK_DCL_EPILOGUE(psmi_mq_free);
+
+/* Three functions that handle all MQ stuff */
+#define MQ_RET_MATCH_OK 0
+#define MQ_RET_UNEXP_OK 1
+#define MQ_RET_UNEXP_NO_RESOURCES 2
+#define MQ_RET_DATA_OK 3
+#define MQ_RET_DATA_OUT_OF_ORDER 4
+
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req);
+int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+ uint32_t offset, const void *payload, uint32_t paylen);
+int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t msglen, const void *payload, uint32_t paylen,
+ int msgorder, mq_rts_callback_fn_t cb,
+ psm2_mq_req_t *req_o);
+int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t msglen, uint32_t offset,
+ const void *payload, uint32_t paylen, int msgorder,
+ uint32_t opcode, psm2_mq_req_t *req_o);
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req);
+
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn);
+
+void psmi_mq_fastpath_disable(psm2_mq_t mq);
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq);
+
+/* Scan the out-of-order queue for a request whose ptl_req_ptr matches
+ * msgctl and whose sequence number matches msg_seqnum; on a hit the
+ * request is removed from the queue and returned, otherwise NULL. */
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum))
+{
+ psm2_mq_req_t *curp;
+ psm2_mq_req_t cur;
+
+ for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) {
+ if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) {
+ /* match! */
+ mq_qq_remove(q, cur);
+ return cur;
+ }
+ }
+ return NULL; /* no match */
+}
+
+/* Walk the eager queue looking for an in-flight request from the given
+ * peer with the given sequence number; the entry is NOT removed.
+ * Returns NULL when nothing matches. */
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum))
+{
+ psm2_mq_req_t req;
+
+ for (req = STAILQ_FIRST(&mq->eager_q); req != NULL;
+ req = STAILQ_NEXT(req, nextq)) {
+ if (req->ptl_req_ptr == peer && req->msg_seqnum == msg_seqnum)
+ return req;
+ }
+ return NULL; /* no match */
+}
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+ psm_mq_unexpected_callback_fn_t fn);
+#endif
+
+/* Account a completed rendezvous transfer in the owning MQ's statistics:
+ * the send-side counters for send requests, the user-receive counters
+ * otherwise. */
+PSMI_ALWAYS_INLINE(void psmi_mq_stats_rts_account(psm2_mq_req_t req))
+{
+ psm2_mq_t mq = req->mq;
+ if (MQE_TYPE_IS_SEND(req->type)) {
+ mq->stats.tx_num++;
+ mq->stats.tx_rndv_num++;
+ mq->stats.tx_rndv_bytes += req->send_msglen;
+ } else {
+ mq->stats.rx_user_num++;
+ mq->stats.rx_user_bytes += req->recv_msglen;
+ }
+ return;
+}
+
+#endif
diff --git a/psm_mq_recv.c b/psm_mq_recv.c
new file mode 100644
index 0000000..3217714
--- /dev/null
+++ b/psm_mq_recv.c
@@ -0,0 +1,593 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "ptl_ips/ips_proto_header.h"
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+ psm_mq_unexpected_callback_fn_t fn)
+{
+ psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback;
+ mq->unexpected_callback = fn;
+ return old_fn;
+}
+#endif
+
+/*
+ * Finish a rendezvous request: account it in the MQ stats, mark it
+ * COMPLETE, and (unless it is internal-only) append it to the completed
+ * queue so psm2_mq_test/wait can observe it.
+ */
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req)
+{
+ psm2_mq_t mq = req->mq;
+
+ /* Stats on rendez-vous messages */
+ psmi_mq_stats_rts_account(req);
+ req->state = MQ_STATE_COMPLETE;
+ /* barrier: publish the COMPLETE state before the request becomes
+ * visible on the completed queue */
+ ips_barrier();
+ if(!psmi_is_req_internal(req))
+ mq_qq_append(&mq->completed_q, req);
+#ifdef PSM_VALGRIND
+ if (MQE_TYPE_IS_RECV(req->type))
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ req->recv_msglen);
+ else
+ VALGRIND_MAKE_MEM_DEFINED(req->buf, req->buf_len);
+#endif
+ _HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n",
+ req, req->recv_msglen);
+ return;
+}
+
+/*
+ * Copy an incoming fragment of nbytes at message offset into the request's
+ * buffer, clamping the copy to recv_msglen (truncated receives).  Always
+ * advances send_msgoff by the full fragment size -- even for dropped data --
+ * so completion detection in psmi_mq_handle_data() stays correct.
+ */
+static void
+psmi_mq_req_copy(psm2_mq_req_t req,
+ uint32_t offset, const void *buf, uint32_t nbytes)
+{
+ /* recv_msglen may be changed by unexpected receive buf. */
+ uint32_t msglen_this, end;
+ uint8_t *msgptr = (uint8_t *) req->buf + offset;
+
+ /* out of receiving range. */
+ if (offset >= req->recv_msglen) {
+ req->send_msgoff += nbytes;
+ return;
+ }
+
+ end = offset + nbytes;
+ if (end > req->recv_msglen) {
+ msglen_this = req->recv_msglen - offset;
+ end = req->recv_msglen;
+ } else {
+ msglen_this = nbytes;
+ }
+
+ VALGRIND_MAKE_MEM_DEFINED(msgptr, msglen_this);
+ psmi_mq_mtucpy(msgptr, buf, msglen_this);
+
+ /* fragments can arrive out of order; only advance the high-water mark */
+ if (req->recv_msgoff < end) {
+ req->recv_msgoff = end;
+ }
+
+ req->send_msgoff += nbytes;
+ return;
+}
+
+/*
+ * Absorb a data fragment for an eager message already matched (or queued
+ * unexpected).  When the last fragment arrives the request is taken off
+ * the eager queue and completed.  Returns MQ_RET_MATCH_OK for matched
+ * requests, MQ_RET_UNEXP_OK for unexpected ones.
+ */
+int
+psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+ uint32_t offset, const void *buf, uint32_t nbytes)
+{
+ psmi_assert(req != NULL);
+ int rc;
+
+ if (req->state == MQ_STATE_MATCHED)
+ rc = MQ_RET_MATCH_OK;
+ else {
+ psmi_assert(req->state == MQ_STATE_UNEXP);
+ rc = MQ_RET_UNEXP_OK;
+ }
+
+ psmi_mq_req_copy(req, offset, buf, nbytes);
+
+ /*
+ * the reason to use >= is because send_msgoff
+ * may be DW pad included.
+ */
+ if (req->send_msgoff >= req->send_msglen) {
+ if (req->type & MQE_TYPE_EAGER_QUEUE) {
+ STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq);
+ }
+
+ if (req->state == MQ_STATE_MATCHED) {
+ req->state = MQ_STATE_COMPLETE;
+ /* publish COMPLETE before queue append (see
+ * psmi_mq_handle_rts_complete) */
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ } else { /* MQ_STATE_UNEXP */
+ req->state = MQ_STATE_COMPLETE;
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * File a new unexpected request: always onto the flat unexpected queue,
+ * and -- once the fastpath is off -- also into the three pattern hash
+ * tables using the hash values mq_req_match() stashed in mq->hashvals.
+ * The fastpath itself is disabled here when the flat list reaches
+ * HASH_THRESHOLD entries.
+ */
+static
+void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
+{
+ int table;
+ mq_qq_append(&mq->unexpected_q, req);
+ /* NOTE(review): mq_qq_append() already sets this back-pointer; the
+ * store below is redundant but harmless */
+ req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q;
+ mq->unexpected_list_len++;
+ if_pt (mq->nohash_fastpath) {
+ if_pf (mq->unexpected_list_len >= HASH_THRESHOLD)
+ psmi_mq_fastpath_disable(mq);
+ return;
+ }
+
+ for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+ mq_qq_append_which(mq->unexpected_htab,
+ table, mq->hashvals[table], req);
+ mq->unexpected_hash_len++;
+}
+
+
+/*
+ * Scan one sublist for the oldest request matching (src, tag) under each
+ * entry's tagsel mask, considering only entries older than *time_threshold.
+ * On a hit *time_threshold is lowered to the winner's timestamp, so
+ * successive calls across sublists converge on the globally oldest match.
+ */
+psm2_mq_req_t
+mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold)
+{
+ psm2_mq_req_t *curp, cur;
+
+ for (curp = &q->first;
+ ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold);
+ curp = &cur->next[which]) {
+ /* a request matches when its peer is ANY_ADDR or equal to src,
+ * and the tag agrees on every bit selected by tagsel */
+ if ((cur->peer == PSM2_MQ_ANY_ADDR || src == cur->peer) &&
+ !((tag->tag[0] ^ cur->tag.tag[0]) & cur->tagsel.tag[0]) &&
+ !((tag->tag[1] ^ cur->tag.tag[1]) & cur->tagsel.tag[1]) &&
+ !((tag->tag[2] ^ cur->tag.tag[2]) & cur->tagsel.tag[2])) {
+ *time_threshold = cur->timestamp;
+ return cur;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Find the oldest posted (expected) request matching (src, tag).  On the
+ * nohash fastpath only the flat list is scanned; otherwise all three hash
+ * buckets plus the flat list are scanned and the oldest of the candidates
+ * wins.  When remove is non-zero the winner is unlinked and the fastpath
+ * may be re-enabled.  The computed hash values are cached in mq->hashvals
+ * for a subsequent mq_add_to_unexpected_hashes().  Returns NULL when
+ * nothing matches.
+ */
+psm2_mq_req_t
+mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove)
+{
+ psm2_mq_req_t match[4];
+ int table;
+ uint64_t best_ts = -1;
+
+ if (mq->nohash_fastpath) {
+ table = PSM2_ANYTAG_ANYSRC;
+ match[table] =
+ mq_list_scan(&mq->expected_q,
+ src, tag, PSM2_ANYTAG_ANYSRC, &best_ts);
+ if (match[table] && remove) {
+ mq->expected_list_len--;
+ mq_qq_remove_which(match[table], table);
+ }
+ return match[table];
+ }
+
+ mq->hashvals[PSM2_TAG_SRC] = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS;
+ mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
+ mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
+
+ for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+ match[table] =
+ mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]],
+ src, tag, table, &best_ts);
+ table = PSM2_ANYTAG_ANYSRC;
+ match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts);
+
+ /* pick the oldest candidate across all four sublists */
+ table = min_timestamp_4(match);
+ if (table == -1)
+ return NULL;
+
+ if (remove) {
+ if_pt (table == PSM2_ANYTAG_ANYSRC)
+ mq->expected_list_len--;
+ else
+ mq->expected_hash_len--;
+ mq_qq_remove_which(match[table], table);
+ psmi_mq_fastpath_try_reenable(mq);
+ }
+ return match[table];
+}
+/*
+ * Handle a rendezvous (RTS) MPI envelope; the packet may carry the whole
+ * message payload or none of it.  Returns MQ_RET_MATCH_OK when a posted
+ * receive matched, MQ_RET_UNEXP_NO_RESOURCES to ask the caller to retry
+ * the packet later (first out-of-order attempt), or MQ_RET_UNEXP_OK when
+ * a new unexpected request was queued (*req_o set; rts_callback recorded).
+ */
+int
+psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t send_msglen, const void *payload, uint32_t paylen,
+ int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req;
+ uint32_t msglen;
+ int rc;
+
+ PSMI_LOCK_ASSERT(mq->progress_lock);
+
+ if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+ /* we have a match, no need to callback */
+ msglen = mq_set_msglen(req, req->buf_len, send_msglen);
+ /* reset send_msglen because sender only sends this many */
+ req->send_msglen = msglen;
+ req->state = MQ_STATE_MATCHED;
+ req->peer = src;
+ req->tag = *tag;
+
+ if (paylen > msglen) paylen = msglen;
+ if (paylen) {
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ }
+ req->recv_msgoff = req->send_msgoff = paylen;
+ *req_o = req; /* yes match */
+ PSM_LOG_EPM(OPCODE_LONG_RTS,PSM_LOG_EPM_RX,src->epid,mq->ep->epid,
+ "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+ rc = MQ_RET_MATCH_OK;
+ } else if (msgorder > 1) {
+ /* There is NO request match, and this is the first time
+ * to try to process this packet, we leave the packet in
+ * hardware queue for retry in hope there is a request
+ * match next time, this is for performance
+ * consideration.
+ */
+ rc = MQ_RET_UNEXP_NO_RESOURCES;
+ } else { /* No match, keep track of callback */
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ psmi_assert(req != NULL);
+ /* We don't know recv_msglen yet but we set it here for
+ * mq_iprobe */
+ req->send_msglen = req->recv_msglen = send_msglen;
+ PSM_LOG_EPM_COND(req->send_msglen > mq->hfi_thresh_rv,
+ OPCODE_LONG_RTS,PSM_LOG_EPM_RX,src->epid,mq->ep->epid,
+ "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+ req->state = MQ_STATE_UNEXP_RV;
+ req->peer = src;
+ req->tag = *tag;
+ req->rts_callback = cb;
+ /* stash any piggybacked payload in a system buffer */
+ if (paylen > send_msglen) paylen = send_msglen;
+ if (paylen) {
+ req->buf = psmi_mq_sysbuf_alloc(mq, paylen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ }
+ req->recv_msgoff = req->send_msgoff = paylen;
+
+ if (msgorder) {
+ mq_add_to_unexpected_hashes(mq, req);
+ }
+ /* caller will handle out of order case */
+ *req_o = req; /* no match, will callback */
+ rc = MQ_RET_UNEXP_OK;
+ }
+
+#ifdef PSM_DEBUG
+ if (req)
+ _HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d "
+ "sendlen=%d errcode=%d\n",
+ rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+ psmi_epaddr_get_name(src->epid),
+ req->tag.tag[0], req->tag.tag[1], req->tag.tag[2],
+ req->recv_msglen, req->send_msglen, req->error_code);
+ else
+ _HFI_VDBG("match=%s (req=%p) src=%s\n",
+ rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+ psmi_epaddr_get_name(src->epid));
+#endif /* #ifdef PSM_DEBUG */
+ return rc;
+}
+
+/*
+ * Handle a regular (i.e. non-rendezvous) MPI envelope: TINY, SHORT, or
+ * the first packet of an EAGER message.  Returns MQ_RET_MATCH_OK when a
+ * posted receive matched (request possibly completed on the spot),
+ * MQ_RET_UNEXP_NO_RESOURCES to ask the caller to retry the packet later,
+ * or MQ_RET_UNEXP_OK when a new unexpected request was queued.
+ */
+int
+psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t send_msglen, uint32_t offset,
+ const void *payload, uint32_t paylen, int msgorder,
+ uint32_t opcode, psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req;
+ uint32_t msglen;
+
+ if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+ /* we have a match */
+ psmi_assert(MQE_TYPE_IS_RECV(req->type));
+ req->peer = src;
+ req->tag = *tag;
+ msglen = mq_set_msglen(req, req->buf_len, send_msglen);
+
+ _HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x"
+ " msglen=%d paylen=%d\n", req, opcode,
+ psmi_epaddr_get_name(src->epid),
+ tag->tag[0], tag->tag[1], tag->tag[2], msglen,
+ paylen);
+
+ switch (opcode) {
+ case MQ_MSG_TINY:
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ msglen);
+ /* mq_copy_tiny() can handle zero byte */
+ mq_copy_tiny((uint32_t *) req->buf,
+ (uint32_t *) payload, msglen);
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_MSG_SHORT: /* message fits in 1 payload */
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ msglen);
+ if (msglen <= paylen) {
+ psmi_mq_mtucpy(req->buf, payload, msglen);
+ } else {
+ psmi_assert((msglen & ~0x3) == paylen);
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ /*
+ * there are nonDW bytes attached in header,
+ * copy after the DW payload.
+ */
+ mq_copy_tiny((uint32_t *)(req->buf+paylen),
+ (uint32_t *)&offset, msglen & 0x3);
+ }
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_MSG_EAGER:
+ /* multi-packet message: park on the eager queue until
+ * the remaining fragments arrive */
+ req->state = MQ_STATE_MATCHED;
+ req->type |= MQE_TYPE_EAGER_QUEUE;
+ req->send_msgoff = req->recv_msgoff = 0;
+ STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+ _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n",
+ msglen, paylen);
+ if (paylen > 0)
+ psmi_mq_handle_data(mq, req, offset, payload,
+ paylen);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Internal error, unknown packet 0x%x",
+ opcode);
+ }
+
+ mq->stats.rx_user_bytes += msglen;
+ mq->stats.rx_user_num++;
+
+ *req_o = req; /* yes match */
+ return MQ_RET_MATCH_OK;
+ }
+
+ /* unexpected message or out of order message. */
+
+#if 0
+ /*
+ * Keep a callback here in case we want to fit some other high-level
+ * protocols over MQ (i.e. shmem). These protocols would bypass the
+ * normal message handling and go to higher-level message handlers.
+ */
+ if (msgorder && mq->unexpected_callback) {
+ mq->unexpected_callback(mq, opcode, epaddr, tag, send_msglen,
+ payload, paylen);
+ *req_o = NULL;
+ return MQ_RET_UNEXP_OK;
+ }
+#endif
+
+ if (msgorder > 1) {
+ /* There is NO request match, and this is the first time
+ * to try to process this packet, we leave the packet in
+ * hardware queue for retry in hope there is a request
+ * match next time, this is for performance
+ * consideration.
+ */
+ return MQ_RET_UNEXP_NO_RESOURCES;
+ }
+
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ psmi_assert(req != NULL);
+
+ req->peer = src;
+ req->tag = *tag;
+ req->recv_msgoff = 0;
+ req->recv_msglen = req->send_msglen = req->buf_len = msglen =
+ send_msglen;
+
+ _HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x"
+ " send_msglen=%d\n", req, opcode,
+ psmi_epaddr_get_name(src->epid),
+ tag->tag[0], tag->tag[1], tag->tag[2], send_msglen);
+
+ /* unexpected path: buffer the payload in a system buffer until a
+ * matching receive is posted */
+ switch (opcode) {
+ case MQ_MSG_TINY:
+ if (msglen > 0) {
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ mq_copy_tiny((uint32_t *) req->buf,
+ (uint32_t *) payload, msglen);
+ } else
+ req->buf = NULL;
+ req->state = MQ_STATE_COMPLETE;
+ break;
+
+ case MQ_MSG_SHORT:
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ if (msglen <= paylen) {
+ psmi_mq_mtucpy(req->buf, payload, msglen);
+ } else {
+ psmi_assert((msglen & ~0x3) == paylen);
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ /*
+ * there are nonDW bytes attached in header,
+ * copy after the DW payload.
+ */
+ mq_copy_tiny((uint32_t *)(req->buf+paylen),
+ (uint32_t *)&offset, msglen & 0x3);
+ }
+ req->state = MQ_STATE_COMPLETE;
+ break;
+
+ case MQ_MSG_EAGER:
+ req->send_msgoff = 0;
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ req->state = MQ_STATE_UNEXP;
+ req->type |= MQE_TYPE_EAGER_QUEUE;
+ STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+ _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n",
+ msglen, paylen);
+ if (paylen > 0)
+ psmi_mq_handle_data(mq, req, offset, payload, paylen);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Internal error, unknown packet 0x%x",
+ opcode);
+ }
+
+ mq->stats.rx_sys_bytes += msglen;
+ mq->stats.rx_sys_num++;
+
+ if (msgorder) {
+ mq_add_to_unexpected_hashes(mq, req);
+ }
+ /* caller will handle out of order case */
+ *req_o = req; /* no match, will callback */
+ return MQ_RET_UNEXP_OK;
+}
+
+/*
+ * Re-process an unexpected request (ureq) that has now reached its in-order
+ * position: try to match it against a posted (expected) receive.
+ *
+ * If no posted receive matches, ureq is parked on the unexpected hashes and
+ * kept alive.  Otherwise its payload/state is transferred to the matched
+ * expected request (ereq) and ureq is freed.  Always returns 0.
+ */
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+{
+	psm2_mq_req_t ereq;
+	uint32_t msglen;
+
+	ereq = mq_req_match(mq, ureq->peer, &ureq->tag, 1);
+	if (ereq == NULL) {
+		/* No posted receive matches: keep ureq as unexpected. */
+		mq_add_to_unexpected_hashes(mq, ureq);
+		return 0;
+	}
+
+	psmi_assert(MQE_TYPE_IS_RECV(ereq->type));
+	ereq->peer = ureq->peer;
+	ereq->tag = ureq->tag;
+	msglen = mq_set_msglen(ereq, ereq->buf_len, ureq->send_msglen);
+
+	switch (ureq->state) {
+	case MQ_STATE_COMPLETE:
+		/* Whole message already landed in a system buffer: copy it
+		 * into the user buffer and complete the expected request. */
+		if (ureq->buf != NULL) {	/* 0-byte don't alloc a sysbuf */
+			psmi_mq_mtucpy(ereq->buf, (const void *)ureq->buf,
+				       msglen);
+			psmi_mq_sysbuf_free(mq, ureq->buf);
+		}
+		ereq->state = MQ_STATE_COMPLETE;
+		ips_barrier();
+		mq_qq_append(&mq->completed_q, ereq);
+		break;
+	case MQ_STATE_UNEXP:	/* not done yet */
+		/* Eager data still arriving: hand over matching state so the
+		 * remaining fragments land directly in the user buffer. */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->msg_seqnum = ureq->msg_seqnum;
+		ereq->ptl_req_ptr = ureq->ptl_req_ptr;
+		ereq->send_msgoff = ureq->send_msgoff;
+		/* Never copy past the posted buffer length. */
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->recv_msgoff) {
+			psmi_mq_mtucpy(ereq->buf,
+				       (const void *)ureq->buf,
+				       ereq->recv_msgoff);
+		}
+		psmi_mq_sysbuf_free(mq, ureq->buf);
+		ereq->type = ureq->type;
+		/* Replace ureq by ereq at the same position in eager_q:
+		 * insert after, then remove the old node. */
+		STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq);
+		STAILQ_REMOVE(&mq->eager_q, ureq, psm2_mq_req, nextq);
+		break;
+	case MQ_STATE_UNEXP_RV:	/* rendez-vous ... */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->rts_peer = ureq->rts_peer;
+		ereq->rts_sbuf = ureq->rts_sbuf;
+		ereq->send_msgoff = ureq->send_msgoff;
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->recv_msgoff) {
+			psmi_mq_mtucpy(ereq->buf,
+				       (const void *)ureq->buf,
+				       ereq->recv_msgoff);
+		}
+		/* send_msgoff != 0 appears to mean a sysbuf was used for the
+		 * eager prefix of the rendezvous — TODO confirm. */
+		if (ereq->send_msgoff) {
+			psmi_mq_sysbuf_free(mq, ureq->buf);
+		}
+		ereq->rts_callback = ureq->rts_callback;
+		ereq->rts_reqidx_peer = ureq->rts_reqidx_peer;
+		ereq->type = ureq->type;
+		/* Kick the rendezvous protocol for the matched request. */
+		ereq->rts_callback(ereq, 0);
+		break;
+	default:
+		fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state,
+			ureq);
+		fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+			ureq->type, ureq->mq, ureq->tag.tag[0],
+			ureq->tag.tag[1], ureq->tag.tag[2]);
+		abort();
+	}
+
+	psmi_mq_req_free(ureq);
+	return 0;
+}
diff --git a/psm_mq_utils.c b/psm_mq_utils.c
new file mode 100644
index 0000000..ff8a52a
--- /dev/null
+++ b/psm_mq_utils.c
@@ -0,0 +1,273 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * MQ request allocator
+ *
+ */
+
+/*
+ * Allocate an MQ request descriptor of the given type (MQE_TYPE_SEND or
+ * MQE_TYPE_RECV) from the corresponding mpool and reset it to a clean state.
+ * Pool exhaustion is treated as a fatal user/configuration error via
+ * psmi_handle_error(PSMI_EP_NORETURN, ...).
+ */
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
+{
+	psm2_mq_req_t req;
+
+	psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND);
+
+	if (type == MQE_TYPE_SEND)
+		req = psmi_mpool_get(mq->sreq_pool);
+	else
+		req = psmi_mpool_get(mq->rreq_pool);
+
+	if_pt(req != NULL) {
+		/* A while ago there were issues about forgetting to zero-out parts of the
+		 * structure, I'm leaving this as a debug-time option */
+#ifdef PSM_DEBUG
+		memset(req, 0, sizeof(struct psm2_mq_req));
+#endif
+		req->type = type;
+		req->state = MQ_STATE_FREE;
+		/* Clear all sublist linkage so the request sits on no queue. */
+		memset(req->next, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->prev, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->q, 0, NUM_MQ_SUBLISTS * sizeof(struct mqq *));
+		req->error_code = PSM2_OK;
+		req->mq = mq;
+		req->testwait_callback = NULL;
+		req->rts_peer = NULL;
+		req->peer = NULL;
+		req->ptl_req_ptr = NULL;
+		req->flags = 0;
+		return req;
+	} else { /* we're out of reqs */
+		int issend = (type == MQE_TYPE_SEND);
+		uint32_t reqmax, reqchunk;
+		psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool,
+					&reqchunk, &reqmax);
+
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR,
+				  "Exhausted %d MQ %s request descriptors, which usually indicates "
+				  "a user program error or insufficient request descriptors (%s=%d)",
+				  reqmax, issend ? "isend" : "irecv",
+				  issend ? "PSM2_MQ_SENDREQS_MAX" :
+				  "PSM2_MQ_RECVREQS_MAX", reqmax);
+		/* Presumably unreachable given PSMI_EP_NORETURN — TODO confirm. */
+		return NULL;
+	}
+}
+MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
+
+#ifdef PSM_CUDA
+/*
+ * mpool element-lifecycle callback for the receive-request pool:
+ * creates (is_alloc != 0) or destroys the per-request CUDA IPC event.
+ * 'context' is unused.
+ */
+void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
+	psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
+	if (is_alloc)
+		PSMI_CUDA_CALL(cudaEventCreate, &recvreq->cuda_ipc_event);
+	else
+		PSMI_CUDA_CALL(cudaEventDestroy, recvreq->cuda_ipc_event);
+	return;
+}
+#endif
+
+/*
+ * Create the send and receive MQ request mpools and warm each one up with a
+ * single alloc/free.  Pool limits come from MQ_SENDREQ_LIMITS /
+ * MQ_RECVREQ_LIMITS, possibly overridden via psmi_parse_mpool_env().
+ * Returns PSM2_OK on success or PSM2_NO_MEMORY / env-parse error on failure.
+ */
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
+{
+	psm2_mq_req_t warmup_req;
+	psm2_error_t err = PSM2_OK;
+
+	_HFI_VDBG("mq element sizes are %d bytes\n",
+		  (int)sizeof(struct psm2_mq_req));
+
+	/*
+	 * Send MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		if ((mq->sreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Receive MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+		/* The receive-request mpool gets a callback that creates
+		 * and destroys a CUDA event per request element.
+		 */
+#ifdef PSM_CUDA
+		if (PSMI_IS_CUDA_ENABLED) {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
+							maxsz, 0, DESCRIPTORS, NULL,
+							NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+		else {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+					       maxsz, 0, DESCRIPTORS, NULL,
+					       NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+#else
+		if ((mq->rreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+#endif
+	}
+
+	/* Warm up the allocators */
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+fail:
+	/* Success path also falls through here with err == PSM2_OK. */
+	return err;
+}
+
+/* Destroy the request pools created by psmi_mq_req_init(). */
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq)
+{
+	psmi_mpool_destroy(mq->rreq_pool);
+	psmi_mpool_destroy(mq->sreq_pool);
+	return PSM2_OK;
+}
+
+
+/*
+ * Hooks to plug into QLogic MPI stats
+ */
+
+/*
+ * mpspawn request callback: copy the current MQ statistics into
+ * args->stats.  Entry order must match the desc[] table set up in
+ * psmi_mq_stats_register().  If fewer than 8 slots were provided,
+ * nothing is written.
+ */
+static
+void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args)
+{
+	uint64_t *entry = args->stats;
+	psm2_mq_t mq = (psm2_mq_t) args->context;
+	psm2_mq_stats_t mqstats;
+
+	psm2_mq_get_stats(mq, &mqstats);
+
+	if (args->num < 8)
+		return;
+
+	entry[0] = mqstats.tx_eager_num;
+	entry[1] = mqstats.tx_eager_bytes;
+	entry[2] = mqstats.tx_rndv_num;
+	entry[3] = mqstats.tx_rndv_bytes;
+
+	entry[4] = mqstats.rx_user_num;
+	entry[5] = mqstats.rx_user_bytes;
+	entry[6] = mqstats.rx_sys_num;
+	entry[7] = mqstats.rx_sys_bytes;
+}
+
+/*
+ * Register the 8 MQ-level statistics with mpspawn via add_fn.  The desc[]
+ * order here must stay in sync with the entry[] order filled in by
+ * psmi_mq_stats_callback().  desc/flags live on the stack; add_fn is
+ * expected to consume (copy) them during the call.
+ */
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn)
+{
+	char *desc[8];
+	uint16_t flags[8];
+	int i;
+	struct mpspawn_stats_add_args mp_add;
+	/*
+	 * Hardcode flags until we correctly move mpspawn to its own repo.
+	 * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN;
+	 */
+	for (i = 0; i < 8; i++)
+		flags[i] = MPSPAWN_STATS_REDUCTION_ALL;
+
+	desc[0] = "Eager count sent";
+	desc[1] = "Eager bytes sent";
+	desc[2] = "Rendezvous count sent";
+	desc[3] = "Rendezvous bytes sent";
+	desc[4] = "Expected count received";
+	desc[5] = "Expected bytes received";
+	desc[6] = "Unexpect count received";
+	desc[7] = "Unexpect bytes received";
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = 8;
+	mp_add.header = "MPI Statistics Summary (max,min @ rank)";
+	mp_add.req_fn = psmi_mq_stats_callback;
+	mp_add.desc = desc;
+	mp_add.flags = flags;
+	mp_add.context = mq;
+
+	add_fn(&mp_add);
+}
diff --git a/psm_perf.c b/psm_perf.c
new file mode 100644
index 0000000..f3d7e94
--- /dev/null
+++ b/psm_perf.c
@@ -0,0 +1,246 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include "psm_user.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <asm/unistd.h>
+
+struct rdpmc_ctx global_rdpmc_ctx;
+
+u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+unsigned int global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE;
+unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+
+struct rdpmc_ctx {
+ int fd;
+ struct perf_event_mmap_page *buf;
+};
+
+typedef unsigned long long u64;
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#include "immintrin.h"
+#endif
+
+/**
+ * DOC: Ring 3 counting for CPU performance counters
+ *
+ * This library allows accessing CPU performance counters from ring 3
+ * using the perf_events subsystem. This is useful to measure specific
+ * parts of programs (e.g. excluding initialization code)
+ *
+ * Requires a Linux 3.3+ kernel
+ */
+
+/**
+ * rdpmc_open_attr - initialize a raw ring 3 readable performance counter
+ * @attr: perf struct %perf_event_attr for the counter
+ * @ctx: Pointer to struct %rdpmc_ctx that is initialized.
+ * @leader_ctx: context of group leader or NULL
+ *
+ * This allows more flexible setup with a custom &perf_event_attr.
+ * For simple uses rdpmc_open() should be used instead.
+ * Must be called for each thread using the counter.
+ * Must be closed with rdpmc_close()
+ */
+/* Returns 0 on success, -1 on failure (perf_event_open or mmap error,
+ * reported via perror).  On mmap failure the fd is closed before return. */
+PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx,
+		       struct rdpmc_ctx *leader_ctx))
+{
+	ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1,
+			  leader_ctx ? leader_ctx->fd : -1, 0);
+	if (ctx->fd < 0) {
+		perror("perf_event_open");
+		return -1;
+	}
+	/* Map the one-page self-monitoring area used by rdpmc_read(). */
+	ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0);
+	if (ctx->buf == MAP_FAILED) {
+		close(ctx->fd);
+		perror("mmap on perf fd");
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * rdpmc_open - initialize a simple ring 3 readable performance counter
+ * @counter: Raw event descriptor (UUEE UU unit mask EE event)
+ * @ctx: Pointer to struct &rdpmc_ctx that is initialized
+ *
+ * The counter will be set up to count CPU events excluding the kernel.
+ * Must be called for each thread using the counter.
+ * The caller must make sure counter is suitable for the running CPU.
+ * Only works in 3.3+ kernels.
+ * Must be closed with rdpmc_close()
+ */
+
+PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx))
+{
+	struct perf_event_attr attr = {
+		/* Heuristic: small values are treated as generic
+		 * PERF_TYPE_HARDWARE ids, larger ones as raw event codes
+		 * — TODO confirm the threshold choice of 10. */
+		.type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE,
+		.size = PERF_ATTR_SIZE_VER0,
+		.config = counter,
+		.sample_type = PERF_SAMPLE_READ,
+		.exclude_kernel = 1,
+	};
+	return rdpmc_open_attr(&attr, ctx, NULL);
+}
+
+/**
+ * rdpmc_close: free a ring 3 readable performance counter
+ * @ctx: Pointer to &rdpmc_ctx context.
+ *
+ * Must be called by each thread for each context it initialized.
+ */
+PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx))
+{
+	/* Release the fd and the one-page mmap made by rdpmc_open_attr(). */
+	close(ctx->fd);
+	munmap(ctx->buf, sysconf(_SC_PAGESIZE));
+}
+
+/**
+ * rdpmc_read: read a ring 3 readable performance counter
+ * @ctx: Pointer to initialized &rdpmc_ctx structure.
+ *
+ * Read the current value of a running performance counter.
+ */
+unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
+{
+	u64 val;
+	unsigned seq;
+	u64 offset = 0;
+
+	typeof (ctx->buf) buf = ctx->buf;
+	/* Seqlock-style retry loop against the kernel-updated mmap page:
+	 * re-read if buf->lock changed while we sampled. */
+	do {
+		seq = buf->lock;
+		ips_rmb();
+		/* index <= 0: event not currently scheduled on a PMU
+		 * counter; the kernel-maintained offset is the count. */
+		if (buf->index <= 0)
+			return buf->offset;
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+		val = _rdpmc(buf->index - 1);
+#else /* GCC */
+		val = __builtin_ia32_rdpmc(buf->index - 1);
+#endif
+		offset = buf->offset;
+		ips_rmb();
+	} while (buf->lock != seq);
+	return val + offset;
+}
+
+/*
+ * Initialize the global RDPMC counter context.  The event type/config can
+ * be overridden via the RDPMC_PERF_TYPE and RDPMC_PERF_CONFIG environment
+ * variables (hex); if either is missing, BOTH revert to the compiled-in
+ * defaults.  Exits the process if the counter cannot be opened.
+ */
+void psmi_rdpmc_perf_framework_init()
+{
+	int rdpmc_retval;
+
+	struct rdpmc_ctx *leader = NULL;
+
+	int env_result = 1;
+	char * env_type = NULL;
+	char * env_config = NULL;
+
+	env_type = getenv("RDPMC_PERF_TYPE");
+
+	if (env_type)
+	{
+		global_rdpmc_type = (int)strtoll(env_type, NULL, 16);
+	}
+	else
+	{
+		env_result = 0;
+	}
+
+	env_config = getenv("RDPMC_PERF_CONFIG");
+
+	if (env_config)
+	{
+		global_rdpmc_config = (int)strtoll(env_config, NULL, 16);
+	}
+	else
+	{
+		env_result = 0;
+	}
+
+	/* Both variables must be present for a custom event; otherwise
+	 * fall back to the defaults as a pair. */
+	if (env_result != 1)
+	{
+		global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE;
+		global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+	}
+
+	struct perf_event_attr attr = {
+		.type = global_rdpmc_type,
+		.size = sizeof(struct perf_event_attr),
+		.config = global_rdpmc_config,
+		.sample_type = PERF_SAMPLE_READ,
+	};
+
+	rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader);
+
+	if (rdpmc_retval < 0)
+	{
+		printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval);
+		exit(-1);
+	}
+}
+
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/psm_perf.h b/psm_perf.h
new file mode 100644
index 0000000..6fa06d2
--- /dev/null
+++ b/psm_perf.h
@@ -0,0 +1,142 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include <linux/perf_event.h>
+
+/* Configuration */
+
+#define RDPMC_PERF_DEFAULT_TYPE (PERF_TYPE_HARDWARE)
+#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
+
+#define RDPMC_PERF_MAX_SLOT_NUMBER (8)
+#define RDPMC_PERF_MAX_SLOT_NAME (256)
+
+/* RDPMC infrastructure */
+
+extern struct rdpmc_ctx global_rdpmc_ctx;
+
+typedef unsigned long long u64;
+
+extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+extern unsigned int global_rdpmc_type;
+extern unsigned int global_rdpmc_config;
+
+extern void psmi_rdpmc_perf_framework_init();
+
+extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx);
+
+/* Reset all measurement slots: sums, counts, begin values and names. */
+#define RDPMC_PERF_INIT() \
+{ \
+	int i; \
+	for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+	{ \
+		global_rdpmc_begin[i] = 0; \
+		global_rdpmc_summ[i] = 0; \
+		global_rdpmc_number[i] = 0; \
+		global_rdpmc_slot_name[i][0] = '\0'; \
+	} \
+}
+
+/* There is no slot_number max range check */
+
+/* Label a slot; name is truncated to RDPMC_PERF_MAX_SLOT_NAME-1 chars
+ * and always NUL-terminated. */
+#define RDPMC_PERF_SET_SLOT_NAME(slot_number, name) \
+{ \
+	strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \
+	global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0'; \
+}
+
+/* Snapshot the counter at the start of a measured region. */
+#define RDPMC_PERF_BEGIN(slot_number) \
+{ \
+	global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \
+}
+
+/* Accumulate the delta since the matching RDPMC_PERF_BEGIN and bump the
+ * sample count for the slot. */
+#define RDPMC_PERF_END(slot_number) \
+{ \
+	global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \
+	global_rdpmc_number[(slot_number)]++; \
+}
+
+/* Print the per-slot average (sum/count) for every named slot. */
+#define RDPMC_PERF_DUMP(stream) \
+{ \
+	int i; \
+	for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+	{ \
+		if (global_rdpmc_slot_name[i][0]) \
+		{ \
+			fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \
+				global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \
+				(double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \
+			fflush((stream)); \
+		} \
+	} \
+}
+
+#define GENERIC_PERF_INIT() RDPMC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number) RDPMC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number) RDPMC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream) RDPMC_PERF_DUMP(stream)
+#else /* RDPMC_PERF_FRAMEWORK */
+#define GENERIC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream)
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/psm_stats.c b/psm_stats.c
new file mode 100644
index 0000000..0015174
--- /dev/null
+++ b/psm_stats.c
@@ -0,0 +1,664 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+struct psmi_stats_type {
+ STAILQ_ENTRY(psmi_stats_type) next;
+ struct psmi_stats_entry *entries;
+
+ int num_entries;
+ void *heading;
+ uint32_t statstype;
+ void *context;
+};
+
+static STAILQ_HEAD(, psmi_stats_type) psmi_stats =
+STAILQ_HEAD_INITIALIZER(psmi_stats);
+
+/*
+ * Register a group of statistics entries under a heading.  The entry array
+ * is copied, so entries_i may live on the caller's stack; the heading
+ * string is NOT copied and must outlive the registration.
+ * Returns PSM2_OK, or an allocation error (PSMI_CHECKMEM presumably jumps
+ * to fail on NULL — confirm against the macro definition).
+ */
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries_i,
+			 int num_entries, void *context)
+{
+	struct psmi_stats_entry *entries;
+	struct psmi_stats_type *type;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	entries =
+	    psmi_calloc(PSMI_EP_NONE, STATS, num_entries,
+			sizeof(struct psmi_stats_entry));
+	type =
+	    psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type));
+	PSMI_CHECKMEM(err, entries);
+	PSMI_CHECKMEM(err, type);
+
+	type->entries = entries;
+	type->num_entries = num_entries;
+	type->statstype = statstype;
+	type->context = context;
+	type->heading = (char *)heading;
+
+	/* Copy the caller's entries; u.val aliases u.off in the union. */
+	for (i = 0; i < num_entries; i++) {
+		type->entries[i].desc = entries_i[i].desc;
+		type->entries[i].flags = entries_i[i].flags;
+		type->entries[i].getfn = entries_i[i].getfn;
+		type->entries[i].u.val = entries_i[i].u.val;
+	}
+
+	STAILQ_INSERT_TAIL(&psmi_stats, type, next);
+	return err;
+
+fail:
+	if (entries)
+		psmi_free(entries);
+	if (type)
+		psmi_free(type);
+	return err;
+}
+
+/* Free every registered stats type and its copied entry array. */
+psm2_error_t psmi_stats_deregister_all(void)
+{
+	struct psmi_stats_type *type;
+
+	/* Currently our mpi still reads stats after finalize so this isn't
+	 * safe yet */
+	while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) {
+		STAILQ_REMOVE_HEAD(&psmi_stats, next);
+		psmi_free(type->entries);
+		psmi_free(type);
+	}
+
+	return PSM2_OK;
+}
+
+/*
+ * Map a stats type name to its PSMI_STATSTYPE_* bit; returns 0 for unknown
+ * names.  Each strncasecmp length is strlen(literal)+1 so the terminating
+ * NUL participates in the comparison, i.e. these are exact
+ * (case-insensitive) matches, not prefix matches.
+ */
+static uint32_t typestring_to_type(const char *typestr)
+{
+	if (strncasecmp(typestr, "all", 4) == 0)
+		return PSMI_STATSTYPE_ALL;
+	else if (strncasecmp(typestr, "p2p", 4) == 0)
+		return PSMI_STATSTYPE_P2P;
+	else if (strncasecmp(typestr, "hfi", 6) == 0)
+		return PSMI_STATSTYPE_HFI;
+	else if (strncasecmp(typestr, "ips", 4) == 0)
+		return PSMI_STATSTYPE_IPSPROTO;
+	else if ((strncasecmp(typestr, "intr", 5) == 0) ||
+		 (strncasecmp(typestr, "thread", 7) == 0) ||
+		 (strncasecmp(typestr, "rcvthread", 10) == 0))
+		return PSMI_STATSTYPE_RCVTHREAD;
+	else if ((strncasecmp(typestr, "mq", 3) == 0) ||
+		 (strncasecmp(typestr, "mpi", 4) == 0))
+		return PSMI_STATSTYPE_MQ;
+	else if ((strncasecmp(typestr, "tid", 4) == 0) ||
+		 (strncasecmp(typestr, "tids", 5) == 0))
+		return PSMI_STATSTYPE_TIDS;
+	else if ((strncasecmp(typestr, "counter", 8) == 0) ||
+		 (strncasecmp(typestr, "counters", 9) == 0))
+		return PSMI_STATSTYPE_DEVCOUNTERS;
+	else if (strncasecmp(typestr, "devstats", 9) == 0)
+		return PSMI_STATSTYPE_DEVSTATS;
+	else if ((strncasecmp(typestr, "memory", 7) == 0) ||
+		 (strncasecmp(typestr, "alloc", 6) == 0) ||
+		 (strncasecmp(typestr, "malloc", 7) == 0))
+		return PSMI_STATSTYPE_MEMORY;
+	else
+		return 0;
+}
+
+/*
+ * Parse a list of stats type names separated by any of ',', '+', '.', '|'
+ * or ':' into an OR-ed mask of PSMI_STATSTYPE_* bits.  Tokens longer than
+ * 127 chars are truncated; unknown tokens contribute nothing.
+ */
+static uint32_t stats_parse_enabled_mask(const char *stats_string)
+{
+	char *b = (char *)stats_string;
+	char *e = b;
+	char buf[128];
+
+	uint32_t stats_enabled_mask = 0;
+
+	while (*e) {
+		b = e;
+		/* Advance e to the next delimiter or end of string. */
+		while (*e && *e != ',' && *e != '+' && *e != '.' &&
+		       *e != '|' && *e != ':')
+			e++;
+		if (e > b) {	/* something new to parse */
+			int len = ((e - b) > (sizeof(buf) - 1)) ?
+			    (sizeof(buf) - 1) : (e - b);
+			strncpy(buf, b, len);
+			buf[len] = '\0';	/* strncpy does not terminate here */
+			stats_enabled_mask |= typestring_to_type(buf);
+		}
+		if (*e)
+			e++;	/* skip delimiter */
+	}
+	return stats_enabled_mask;
+}
+
+/*
+ * mpspawn request callback: fill args->stats (args->num entries, which must
+ * equal the registered num_entries) for one stats type.
+ *   - Device counters/stats are read in bulk from the hfi layer into a
+ *     temporary array; each entry's u.off indexes into that array.  If the
+ *     hfi data is unavailable, every slot is set to MPSPAWN_NAN_U64.
+ *   - Memory stats read u.off offsets into psmi_stats_memory.
+ *   - Everything else uses the entry's getfn or dereferences u.val.
+ */
+static
+void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args)
+{
+	const struct psmi_stats_entry *entry;
+	struct psmi_stats_type *type = (struct psmi_stats_type *)args->context;
+	int i, num = args->num;
+	uint64_t *stats = args->stats;
+	uint64_t *c = NULL;
+	uint64_t *s = NULL;
+
+	psmi_assert(num == type->num_entries);
+
+	if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS ||
+	    type->statstype == PSMI_STATSTYPE_DEVSTATS) {
+		int unit_id = ((psm2_ep_t) type->context)->unit_id;
+		int portno = ((psm2_ep_t) type->context)->portnum;
+		uintptr_t off;
+		uint8_t *p = NULL;
+		int nc, npc, ns;
+		int nstats = hfi_get_stats_names_count();
+		int nctrs = hfi_get_ctrs_unit_names_count(unit_id);
+		int npctrs = hfi_get_ctrs_port_names_count(unit_id);
+
+		/* -1 counts mean the names are unavailable; skip the alloc. */
+		if (nctrs != -1 && npctrs != -1)
+			c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs + npctrs,
+					sizeof(uint64_t));
+		if (nstats != -1)
+			s = psmi_calloc(PSMI_EP_NONE, STATS, nstats,
+					sizeof(uint64_t));
+
+		/*
+		 * If hfifs is not loaded, we set NAN everywhere.  We don't want
+		 * stats to break just because 1 node didn't have hfi-stats
+		 */
+		if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) {
+			/* Unit counters first, then port counters appended. */
+			nc = hfi_get_ctrs_unit(unit_id, c, nctrs);
+			if (nc != -1 && nc == nctrs)
+				p = (uint8_t *) c;
+			if (nc == -1)
+				nc = 0;
+			npc =
+			    hfi_get_ctrs_port(unit_id, portno, c + nc, npctrs);
+			if (!p && npc > 0 && npc == npctrs)
+				p = (uint8_t *) c;
+		} else if (s != NULL) {
+			ns = hfi_get_stats(s, nstats);
+			if (ns != -1)
+				p = (uint8_t *) s;
+		}
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			if (p) {
+				off = (uintptr_t) entry->u.off;
+				stats[i] = *((uint64_t *) (p + off));
+			} else
+				stats[i] = MPSPAWN_NAN_U64;
+		}
+	} else if (type->statstype == PSMI_STATSTYPE_MEMORY) {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			stats[i] =
+			    *(uint64_t *) ((uintptr_t) &psmi_stats_memory +
+					   (uintptr_t) entry->u.off);
+		}
+	} else {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			if (entry->getfn != NULL)
+				stats[i] = entry->getfn(type->context);
+			else
+				stats[i] = *entry->u.val;
+		}
+	}
+
+	if (c != NULL)
+		psmi_free(c);
+	if (s != NULL)
+		psmi_free(s);
+}
+
+/*
+ * Register one stats type with mpspawn through add_fn.
+ *
+ * The desc/flags arrays are built on the stack (alloca) because mpspawn
+ * is only guaranteed to need them for the duration of the add_fn upcall.
+ * The add_fn return code is deliberately ignored: a failed registration
+ * only means mpspawn will not report these stats.
+ */
+static
+void
+stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn,
+      char *heading,
+      int num_entries,
+      struct psmi_stats_entry *entries,
+      mpspawn_stats_req_fn req_fn, void *context)
+{
+ int i;
+ struct mpspawn_stats_add_args mp_add;
+
+ mp_add.version = MPSPAWN_STATS_VERSION;
+ mp_add.num = num_entries;
+ mp_add.header = heading;
+ mp_add.req_fn = req_fn;
+ mp_add.context = context;
+
+ mp_add.desc = (char **)alloca(sizeof(char *) * num_entries);
+ psmi_assert_always(mp_add.desc != NULL);
+
+ /* fix: was sizeof(uint16_t *), over-allocating (the array element
+  * type is uint16_t, not a pointer) */
+ mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t) * num_entries);
+ psmi_assert_always(mp_add.flags != NULL);
+
+ for (i = 0; i < num_entries; i++) {
+ mp_add.desc[i] = (char *)entries[i].desc;
+ mp_add.flags[i] = entries[i].flags;
+ }
+
+ /* Ignore return code, doesn't matter to *us* if register failed */
+ add_fn(&mp_add);
+
+ return;
+}
+
+static void stats_register_hfi_counters(psm2_ep_t ep);
+static void stats_register_hfi_stats(psm2_ep_t ep);
+static void stats_register_mem_stats(psm2_ep_t ep);
+static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args
+ *args);
+
+/*
+ * Downcall from QLogic MPI into PSM, so we can register stats
+ *
+ * Parses the requested stats-type mask from args->stats_types, lets each
+ * PSM/hfi component register its stats internally, then forwards every
+ * enabled registered type to mpspawn via add_fn.  Always returns NULL.
+ */
+void *psmi_stats_register(struct mpspawn_stats_init_args *args)
+{
+ struct psmi_stats_type *type;
+ uint32_t statsmask;
+
+ /*
+ * Args has a version string in it, but we can ignore it since mpspawn
+ * will decide if it supports *our* version
+ */
+
+ /*
+ * Eventually, parse the stats_types to add various "flavours" of stats
+ */
+ if (args->stats_types == NULL)
+ return NULL;
+
+ statsmask = stats_parse_enabled_mask(args->stats_types);
+
+ /* MQ (MPI-level) statistics */
+ if (statsmask & PSMI_STATSTYPE_MQ)
+ psmi_mq_stats_register(args->mq, args->add_fn);
+
+ /* PSM and hfi level statistics */
+ if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS)
+ stats_register_hfi_counters(args->mq->ep);
+
+ if (statsmask & PSMI_STATSTYPE_DEVSTATS)
+ stats_register_hfi_stats(args->mq->ep);
+
+ if (statsmask & PSMI_STATSTYPE_MEMORY)
+ stats_register_mem_stats(args->mq->ep);
+
+ /*
+ * At this point all PSM and hfi-level components have registered stats
+ * with the PSM stats interface. We register with the mpspawn stats
+ * interface with an upcall in add_fn
+ */
+ STAILQ_FOREACH(type, &psmi_stats, next) {
+ if (type->statstype & statsmask)
+ stats_register_mpspawn_single(args->add_fn,
+ type->heading,
+ type->num_entries,
+ type->entries,
+ psmi_stats_mpspawn_callback,
+ type);
+ }
+
+ /*
+ * Special handling for per-endpoint statistics
+ * Only MPI knows what the endpoint-addresses are in the running program,
+ * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how
+ * many endpoints it anticipates having and PSM simply reserves that amount
+ * of stats entries X the amount of per-endpoint stats.
+ */
+ if (statsmask & PSMI_STATSTYPE_P2P)
+ psmi_stats_epaddr_register(args);
+
+ return NULL;
+}
+
+/*
+ * Context passed to psmi_stats_epaddr_callback: the local endpoint plus
+ * the MPI-supplied mapping from rank index to psm2_epaddr_t.
+ */
+struct stats_epaddr {
+ psm2_ep_t ep;
+ mpspawn_map_epaddr_fn epaddr_map_fn; /* rank index -> epaddr (or NULL) */
+ int num_ep; /* number of endpoints MPI anticipates */
+ int num_ep_stats; /* per-endpoint stats entry count */
+};
+
+/*
+ * mpspawn request callback for per-endpoint (P2P) statistics.
+ *
+ * The stats array is laid out as num_ep consecutive groups of
+ * num_ep_stats slots, one group per rank.  Every slot is first set to
+ * NAN; ranks whose epaddr cannot be mapped stay NAN.  For a mapped
+ * epaddr, each ptl device either fills its own slots (when it owns the
+ * epaddr) or advances the offset by its stats count so the layout stays
+ * aligned across devices.
+ */
+static
+void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args)
+{
+ int i, num, off;
+ uint64_t *statsp;
+ struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context;
+ psm2_ep_t ep = stats_ctx->ep;
+ psm2_epaddr_t epaddr;
+
+ num = stats_ctx->num_ep * stats_ctx->num_ep_stats;
+
+ /* First always NAN the entire stats request */
+ for (i = 0; i < num; i++) {
+ if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE)
+ args->stats[i] = MPSPAWN_NAN;
+ else
+ args->stats[i] = MPSPAWN_NAN_U64;
+ }
+
+ for (i = 0; i < stats_ctx->num_ep; i++) {
+ statsp = args->stats + i * stats_ctx->num_ep_stats;
+ off = 0;
+ epaddr = stats_ctx->epaddr_map_fn(i);
+ if (epaddr == NULL)
+ continue;
+
+ /* Self */
+ if (&ep->ptl_self == epaddr->ptlctl) {
+ if (ep->ptl_self.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_self.epaddr_stats_get(epaddr,
+       statsp + off);
+ } else {
+ /* not ours: skip this device's slots, leaving them NAN */
+ if (ep->ptl_self.epaddr_stats_num != NULL)
+ off += ep->ptl_self.epaddr_stats_num();
+ }
+
+ /* Shm */
+ if (&ep->ptl_amsh == epaddr->ptlctl) {
+ if (ep->ptl_amsh.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_amsh.epaddr_stats_get(epaddr,
+       statsp + off);
+ } else {
+ if (ep->ptl_amsh.epaddr_stats_num != NULL)
+ off += ep->ptl_amsh.epaddr_stats_num();
+ }
+
+ /* ips */
+ if (&ep->ptl_ips == epaddr->ptlctl) {
+ if (ep->ptl_ips.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_ips.epaddr_stats_get(epaddr,
+      statsp + off);
+ } else {
+ if (ep->ptl_ips.epaddr_stats_num != NULL)
+ off += ep->ptl_ips.epaddr_stats_num();
+ }
+ }
+ return;
+}
+
+/*
+ * Register per-endpoint (P2P) statistics with mpspawn.
+ *
+ * Collects the per-epaddr stat descriptions from every ptl device, then
+ * clones each description once per anticipated endpoint with a "<rank>"
+ * prefix of fixed width.  The cloned strings are freed before returning;
+ * stats_ctx intentionally survives as the callback context.
+ *
+ * Returns PSM2_OK, or PSM2_NO_MEMORY when an allocation fails.
+ */
+static
+psm2_error_t
+psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args)
+{
+ int i = 0, j;
+ int num_ep = args->num_epaddr;
+ int num_ep_stats = 0;
+ int nz;
+ char **desc, **desc_i;
+ uint16_t *flags, *flags_i;
+ char *p;
+ char buf[128];
+ psm2_ep_t ep;
+ struct mpspawn_stats_add_args mp_add;
+ struct stats_epaddr *stats_ctx;
+ psm2_error_t err = PSM2_OK;
+
+ if (args->mq == NULL)
+ return PSM2_OK;
+ ep = args->mq->ep;
+
+ /* Figure out how many stats there are in an endpoint from all devices */
+ if (ep->ptl_self.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_self.epaddr_stats_num();
+ if (ep->ptl_amsh.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_amsh.epaddr_stats_num();
+ if (ep->ptl_ips.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_ips.epaddr_stats_num();
+
+ /* Allocate desc and flags and let each device initialize their
+  * descriptions and flags.  The first num_ep_stats slots hold the
+  * device-provided templates; the remaining num_ep groups hold the
+  * per-rank clones. */
+ desc =
+     psmi_malloc(ep, STATS,
+ sizeof(char *) * num_ep_stats * (num_ep + 1));
+ if (desc == NULL)
+ return PSM2_NO_MEMORY;
+ flags =
+     psmi_malloc(ep, STATS,
+ sizeof(uint16_t) * num_ep_stats * (num_ep + 1));
+ if (flags == NULL) {
+ psmi_free(desc);
+ return PSM2_NO_MEMORY;
+ }
+
+ /* Get the descriptions/flags from each device */
+ i = 0;
+ i += ep->ptl_self.epaddr_stats_num != NULL ?
+     ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0;
+ i += ep->ptl_amsh.epaddr_stats_num != NULL ?
+     ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0;
+ i += ep->ptl_ips.epaddr_stats_num != NULL ?
+     ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0;
+ psmi_assert_always(i == num_ep_stats);
+
+ /*
+ * Clone the descriptions for each endpoint but append "rank %d" to it
+ * beforehand.  nz is the decimal width of the largest rank index
+ * (cheap log10).  Fix: the original repeated the 1000 threshold,
+ * making the width-4 branch unreachable.
+ */
+ nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */
+ (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 :
+       (num_ep <
+        100000 ? 5 : 6)))));
+
+ desc_i = desc + num_ep_stats;
+ flags_i = flags + num_ep_stats;
+ memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats);
+
+ for (i = 0; i < num_ep; i++) {
+ for (j = 0; j < num_ep_stats; j++) {
+ snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i,
+ desc[j]);
+ buf[sizeof(buf) - 1] = '\0';
+ p = psmi_strdup(ep, buf);
+ if (p == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto clean;
+ }
+ desc_i[i * num_ep_stats + j] = p;
+ flags_i[i * num_ep_stats + j] = flags[j];
+ }
+ }
+
+ mp_add.version = MPSPAWN_STATS_VERSION;
+ mp_add.num = num_ep_stats * num_ep;
+ mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)";
+ mp_add.req_fn = psmi_stats_epaddr_callback;
+ mp_add.desc = desc_i;
+ mp_add.flags = flags_i;
+ stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr));
+ if (stats_ctx == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto clean;
+ }
+ stats_ctx->ep = ep;
+ stats_ctx->epaddr_map_fn = args->epaddr_map_fn;
+ stats_ctx->num_ep = num_ep;
+ stats_ctx->num_ep_stats = num_ep_stats;
+ mp_add.context = stats_ctx;
+
+ args->add_fn(&mp_add);
+
+clean:
+ /* Now we can free all the descriptions (add_fn has copied what it
+  * needed; desc_i entries are NULL past any allocation failure) */
+ for (i = 0; i < num_ep; i++) {
+ for (j = 0; j < num_ep_stats; j++)
+ if (desc_i[i * num_ep_stats + j])
+ psmi_free(desc_i[i * num_ep_stats + j]);
+ }
+
+ psmi_free(desc);
+ psmi_free(flags);
+
+ return err;
+}
+
<br>
+/*
+ * Build and register the "OPA device counters" stats type for ep's unit:
+ * one entry per unit counter followed by one per port counter, each
+ * addressed by byte offset into the snapshot buffer used by
+ * psmi_stats_mpspawn_callback.  On any failure everything acquired so
+ * far is released and registration is silently skipped.
+ */
+static
+void stats_register_hfi_counters(psm2_ep_t ep)
+{
+ int i, nc, npc;
+ char *cnames = NULL, *pcnames = NULL;
+ struct psmi_stats_entry *entries = NULL;
+
+ nc = hfi_get_ctrs_unit_names(ep->unit_id, &cnames);
+ if (nc == -1 || cnames == NULL)
+ goto bail;
+ npc = hfi_get_ctrs_port_names(ep->unit_id, &pcnames);
+ if (npc == -1 || pcnames == NULL)
+ goto bail;
+ entries =
+     psmi_calloc(ep, STATS, nc + npc, sizeof(struct psmi_stats_entry));
+ if (entries == NULL)
+ goto bail;
+
+ for (i = 0; i < nc; i++) {
+ entries[i].desc = hfi_get_next_name(&cnames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ /* port counters continue at offset nc in the same snapshot buffer */
+ for (i = nc; i < nc + npc; i++) {
+ entries[i].desc = hfi_get_next_name(&pcnames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ psmi_stats_register_type("OPA device counters",
+ PSMI_STATSTYPE_DEVCOUNTERS,
+ entries, nc + npc, ep);
+ return;
+
+bail:
+ if (cnames != NULL)
+ hfi_release_names(cnames);
+ if (pcnames != NULL)
+ hfi_release_names(pcnames);
+ if (entries != NULL)
+ psmi_free(entries);
+}
+
+/*
+ * Build and register the "OPA device statistics" stats type: one entry
+ * per hfi statistic, addressed by byte offset into the snapshot buffer
+ * used by psmi_stats_mpspawn_callback.  Cleans up and skips registration
+ * on any failure.
+ */
+static
+void stats_register_hfi_stats(psm2_ep_t ep)
+{
+ int i, ns;
+ char *snames = NULL;
+ struct psmi_stats_entry *entries = NULL;
+
+ ns = hfi_get_stats_names(&snames);
+ if (ns == -1 || snames == NULL)
+ goto bail;
+ entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry));
+ if (entries == NULL)
+ goto bail;
+
+ for (i = 0; i < ns; i++) {
+ entries[i].desc = hfi_get_next_name(&snames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ psmi_stats_register_type("OPA device statistics",
+ PSMI_STATSTYPE_DEVSTATS, entries, ns, ep);
+ return;
+
+bail:
+ if (snames != NULL)
+ hfi_release_names(snames);
+ if (entries != NULL)
+ psmi_free(entries);
+}
+
+/* _SDECL: declare one memory-stat entry read by byte offset into the
+ * global psmi_stats_memory structure. */
+#undef _SDECL
+#define _SDECL(_desc, _param) { \
+     .desc = _desc, \
+     .flags = MPSPAWN_STATS_REDUCTION_ALL \
+       | MPSPAWN_STATS_SKIP_IF_ZERO, \
+     .getfn = NULL, \
+     .u.off = offsetof(struct psmi_stats_malloc, _param) \
+ }
+
+/*
+ * Register the "PSM memory allocation statistics" type: current/max
+ * byte counts per allocation category, sourced from psmi_stats_memory.
+ * The entries array is copied by psmi_stats_register_type, so a stack
+ * array is fine here.
+ */
+static
+void stats_register_mem_stats(psm2_ep_t ep)
+{
+ struct psmi_stats_entry entries[] = {
+ _SDECL("Total (current)", m_all_total),
+ _SDECL("Total (max)", m_all_max),
+ _SDECL("All Peers (current)", m_perpeer_total),
+ _SDECL("All Peers (max)", m_perpeer_max),
+ _SDECL("Network Buffers (current)", m_netbufs_total),
+ _SDECL("Network Buffers (max)", m_netbufs_max),
+ /* NOTE(review): "desctors" below looks like a typo for
+  * "descriptors", but the string is an exported stat label so it
+  * is left unchanged here — confirm before renaming. */
+ _SDECL("PSM desctors (current)", m_descriptors_total),
+ _SDECL("PSM desctors (max)", m_descriptors_max),
+ _SDECL("Unexp. buffers (current)", m_unexpbufs_total),
+ _SDECL("Unexp. Buffers (max)", m_unexpbufs_max),
+ _SDECL("Other (current)", m_undefined_total),
+ _SDECL("Other (max)", m_undefined_max),
+ };
+
+ psmi_stats_register_type("PSM memory allocation statistics",
+ PSMI_STATSTYPE_MEMORY,
+ entries, PSMI_STATS_HOWMANY(entries), ep);
+}
diff --git a/psm_stats.h b/psm_stats.h
new file mode 100644
index 0000000..9e9e0a9
--- /dev/null
+++ b/psm_stats.h
@@ -0,0 +1,120 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_stats.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_STATS_H
+#define _PSM_STATS_H
+
+#include "mpspawn_stats.h"
+
+#define PSMI_STATSTYPE_MQ 0x00001
+#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakups, ratio, etc. */
+#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */
+#define PSMI_STATSTYPE_TIDS 0x00400
+#define PSMI_STATSTYPE_MEMORY 0x01000
+#define PSMI_STATSTYPE_HFI (PSMI_STATSTYPE_RCVTHREAD| \
+ PSMI_STATSTYPE_IPSPROTO | \
+ PSMI_STATSTYPE_MEMORY | \
+ PSMI_STATSTYPE_TIDS)
+#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */
+#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000
+#define PSMI_STATSTYPE_DEVSTATS 0x20000
+#define PSMI_STATSTYPE_ALL 0xfffff
+#define _PSMI_STATSTYPE_DEVMASK 0xf0000
+
+/* Used to determine how many stats in static array decl. */
+#define PSMI_STATS_HOWMANY(entries) \
+ (sizeof(entries)/sizeof(entries[0]))
+
+#define PSMI_STATS_NO_HEADING NULL
+
+#define PSMI_STATS_DECL(_desc, _flags, _getfn, _val) \
+ { .desc = _desc, \
+ .flags = _flags, \
+ .getfn = _getfn, \
+ .u.val = _val, \
+ }
+
+#define PSMI_STATS_DECLU64(_desc, _val) \
+ PSMI_STATS_DECL(_desc, \
+ MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \
+ NULL, \
+ _val)
+
+/*
+ * One statistic.  The value is obtained either by calling getfn(context)
+ * when getfn is non-NULL, by dereferencing u.val, or — for stats types
+ * that snapshot a memory block (device counters/stats, memory stats) —
+ * by reading at byte offset u.off into that block.
+ */
+struct psmi_stats_entry {
+ const char *desc;
+ uint16_t flags;
+ uint64_t(*getfn) (void *context); /* optional fn ptr to get value */
+ union {
+ uint64_t *val; /* where value is stored if getfn is NULL */
+ uint64_t off; /* of offset if that makes more sense */
+ } u;
+};
+
+/*
+ * Copy the array of entries and keep track of the context
+ */
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+ uint32_t statstype,
+ const struct psmi_stats_entry *entries,
+ int num_entries, void *context);
+
+psm2_error_t psmi_stats_deregister_all(void);
+
+#endif /* PSM_STATS_H */
diff --git a/psm_sysbuf.c b/psm_sysbuf.c
new file mode 100644
index 0000000..04298f0
--- /dev/null
+++ b/psm_sysbuf.c
@@ -0,0 +1,234 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * System buffer (unexpected message) allocator
+ *
+ */
+
+#define MM_FLAG_NONE 0
+#define MM_FLAG_TRANSIENT 0x1
+
+/*
+ * Header prepended to every sysbuf block.  While the block is handed
+ * out, mem_handler records the owning pool (used by psmi_mq_sysbuf_free
+ * to return it); while the block sits on a pool's free list the same
+ * storage holds the next-free link.  The valgrind redzone sits between
+ * this header and the user-visible buffer.
+ */
+struct psmi_mem_block_ctrl {
+ union {
+ psmi_mem_ctrl_t *mem_handler;
+ struct psmi_mem_block_ctrl *next;
+ };
+ char _redzone[PSM_VALGRIND_REDZONE_SZ];
+};
+
+
+/* Per MQ allocators */
+void psmi_mq_sysbuf_init(psm2_mq_t mq)
+{
+ int i;
+ uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
+ uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
+
+ if (mq->mem_ctrl_is_init)
+ return;
+ mq->mem_ctrl_is_init = 1;
+
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ mq->handler_index[i].block_size = block_sizes[i];
+ mq->handler_index[i].current_available = 0;
+ mq->handler_index[i].free_list = NULL;
+ mq->handler_index[i].total_alloc = 0;
+ mq->handler_index[i].replenishing_rate = replenishing_rate[i];
+
+ if (block_sizes[i] == -1) {
+ psmi_assert_always(replenishing_rate[i] == 0);
+ mq->handler_index[i].flags = MM_FLAG_TRANSIENT;
+ }
+ else {
+ psmi_assert_always(replenishing_rate[i] > 0);
+ mq->handler_index[i].flags = MM_FLAG_NONE;
+ }
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mq, PSM_VALGRIND_REDZONE_SZ,
+ PSM_VALGRIND_MEM_UNDEFINED);
+
+ /* Hit once on each block size so we have a pool that's allocated */
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ void *ptr;
+ if (block_sizes[i] == -1)
+ continue;
+ ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]);
+ psmi_mq_sysbuf_free(mq, ptr);
+ }
+}
+
+void psmi_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently not used
+{
+ struct psmi_mem_block_ctrl *block;
+ int i;
+
+ if (mq->mem_ctrl_is_init == 0)
+ return;
+
+ VALGRIND_DESTROY_MEMPOOL(mq);
+
+ /* Drain every pool's free list; blocks still handed out to callers
+  * are not tracked here and are not freed by this function. */
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ while ((block = mq->handler_index[i].free_list) != NULL) {
+ mq->handler_index[i].free_list = block->next;
+ psmi_free(block);
+ }
+ }
+ mq->mem_ctrl_is_init = 0;
+}
+
+/*
+ * Write a human-readable summary of sysbuf byte consumption into buf.
+ *
+ * Fix: the previous snprintf(buf, len - 1, ...) underflowed to SIZE_MAX
+ * when len == 0 and then wrote buf[len - 1] out of bounds; snprintf
+ * already NUL-terminates for any len > 0, so the manual termination was
+ * also redundant.
+ */
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len)
+{
+ if (buf == NULL || len == 0)
+ return;
+ snprintf(buf, len, "Sysbuf consumption: %"PRIu64" bytes\n",
+  mq->mem_ctrl_total_bytes);
+ return;
+}
+
+/*
+ * Allocate a system (unexpected-message) buffer of at least alloc_size
+ * bytes from the smallest pool whose block size fits; returns NULL only
+ * if the underlying psmi_malloc() fails.  Oversize requests fall into
+ * the transient pool and are malloc'd individually rather than cached.
+ * The returned pointer is just past the psmi_mem_block_ctrl header.
+ */
+void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
+{
+ psmi_mem_ctrl_t *mm_handler = mq->handler_index;
+ struct psmi_mem_block_ctrl *new_block;
+ int replenishing;
+
+ /* There is a timing race with ips initialization, fix later.
+ * * XXX */
+ if (!mq->mem_ctrl_is_init)
+ psmi_mq_sysbuf_init(mq);
+
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += alloc_size;
+
+ /* Pools are in ascending block_size order and the last pool's size is
+  * (uint32_t)-1, so this scan always terminates. */
+ while (mm_handler->block_size < alloc_size)
+ mm_handler++;
+
+ replenishing = mm_handler->replenishing_rate;
+
+ if (mm_handler->current_available == 0) { // allocate more buffers
+ if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+ uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl)
+ + PSM_VALGRIND_REDZONE_SZ;
+ new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+ if (new_block) {
+ new_block->mem_handler = mm_handler;
+ new_block++;
+ mm_handler->total_alloc++;
+ mq->mem_ctrl_total_bytes += newsz;
+ VALGRIND_MEMPOOL_ALLOC(mq, new_block, alloc_size);
+ }
+ return new_block;
+ }
+
+ do {
+ uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl) +
+ PSM_VALGRIND_REDZONE_SZ;
+
+ new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+ if (new_block) {
+ /* Fix: account bytes only on success; the old
+  * code charged mem_ctrl_total_bytes even when
+  * psmi_malloc returned NULL. */
+ mq->mem_ctrl_total_bytes += newsz;
+ mm_handler->current_available++;
+ mm_handler->total_alloc++;
+
+ new_block->next = mm_handler->free_list;
+ mm_handler->free_list = new_block;
+ }
+
+ } while (--replenishing && new_block);
+ }
+
+ if (mm_handler->current_available) {
+ mm_handler->current_available--;
+
+ new_block = mm_handler->free_list;
+ mm_handler->free_list = new_block->next;
+
+ new_block->mem_handler = mm_handler;
+ new_block++;
+
+ VALGRIND_MEMPOOL_ALLOC(mq, new_block, mm_handler->block_size);
+ return new_block;
+ }
+ return NULL;
+}
+
+/*
+ * Return a buffer obtained from psmi_mq_sysbuf_alloc.  Steps back to the
+ * psmi_mem_block_ctrl header to find the owning pool: transient blocks
+ * are freed outright, pooled blocks are pushed back on the pool's free
+ * list for reuse.  mem_to_free must be a live sysbuf pointer.
+ */
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free)
+{
+ struct psmi_mem_block_ctrl * block_to_free;
+ psmi_mem_ctrl_t *mm_handler;
+
+ psmi_assert_always(mq->mem_ctrl_is_init);
+
+ block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1;
+ mm_handler = block_to_free->mem_handler;
+
+ VALGRIND_MEMPOOL_FREE(mq, mem_to_free);
+
+ if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+ psmi_free(block_to_free);
+ } else {
+ block_to_free->next = mm_handler->free_list;
+ mm_handler->free_list = block_to_free;
+ mm_handler->current_available++;
+ }
+
+ return;
+}
diff --git a/psm_sysbuf.h b/psm_sysbuf.h
new file mode 100644
index 0000000..07ab593
--- /dev/null
+++ b/psm_sysbuf.h
@@ -0,0 +1,81 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef SYSBUF_INT_H
+#define SYSBUF_INT_H
+
+#include "psm_user.h"
+
+#define MM_NUM_OF_POOLS 7
+
+/* One sysbuf pool: a free list of fixed-size blocks plus bookkeeping.
+ * The transient (oversize) pool uses block_size == (uint32_t)-1 and a
+ * replenishing_rate of 0. */
+typedef struct psmi_mem_ctrl {
+ struct psmi_mem_block_ctrl *free_list; /* cached, currently-unused blocks */
+ uint32_t total_alloc; /* blocks ever allocated for this pool */
+ uint32_t current_available; /* length of free_list */
+ uint32_t block_size; /* usable bytes per block */
+ uint32_t flags; /* MM_FLAG_* */
+ uint32_t replenishing_rate; /* blocks to pre-allocate when empty */
+} psmi_mem_ctrl_t;
+
+/*
+ * MQ unexpected buffer management
+ */
+void psmi_mq_sysbuf_init(psm2_mq_t mq);
+void psmi_mq_sysbuf_fini(psm2_mq_t mq);
+void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes);
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void *);
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len);
+
+#endif /* SYSBUF_INT_H */
diff --git a/psm_timer.c b/psm_timer.c
new file mode 100644
index 0000000..9a8dddd
--- /dev/null
+++ b/psm_timer.c
@@ -0,0 +1,198 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#if PSMI_TIMER_STATS
+# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++)
+# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++)
+#else
+# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl)
+# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl)
+#endif
+
+/* Initialize a timer control block: empty timer queue, no pending
+ * expiration, and zeroed insertion/traversal counters when the optional
+ * PSMI_TIMER_STATS instrumentation is compiled in.  Always PSM2_OK. */
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl)
+{
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+
+#if PSMI_TIMER_STATS
+ ctrl->num_insertions = 0;
+ ctrl->num_traversals = 0;
+#endif
+
+ TAILQ_INIT(&ctrl->timerq);
+ return PSM2_OK;
+}
+
+/* Tear down a timer control block.  With PSMI_TIMER_STATS enabled this
+ * logs the average queue traversals per insertion; otherwise it is a
+ * no-op.  Always returns PSM2_OK. */
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl)
+{
+#if PSMI_TIMER_STATS
+ if (ctrl->num_insertions > 0) {
+ _HFI_INFO("avg elem traversals/insertion = %3.2f %%\n",
+   100.0 * (double)ctrl->num_traversals /
+   ctrl->num_insertions);
+ }
+#endif
+ return PSM2_OK;
+}
+
+/*
+ * Insert t_insert (which must not already be pending) into ctrl's timer
+ * queue with expiration t_cyc_expire.  Expirations at or below
+ * PSMI_TIMER_PRIO_LAST take a tail-biased scan; later expirations take a
+ * head-biased scan.  ctrl->t_cyc_next_expire is updated when the new
+ * timer becomes the soonest to fire.
+ */
+void
+psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+     struct psmi_timer *t_insert, uint64_t t_cyc_expire)
+{
+ struct psmi_timer *t_cursor;
+
+ psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING));
+
+ t_insert->t_timeout = t_cyc_expire;
+ t_insert->flags |= PSMI_TIMER_FLAG_PENDING;
+
+ /*
+ * We keep the list from oldest (head) to newest (tail), with the
+ * assumption that insert and remove occur much more often than search
+ * (when the timer expires). Newly added timers are more likely to expire
+ * later rather than sooner, which is why the head is older.
+ */
+ PSMI_TIMER_STATS_ADD_INSERTION(ctrl);
+
+ if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */
+ TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+ ctrl->t_cyc_next_expire = t_cyc_expire;
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ return;
+ } else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) {
+ /* expected-late timer: walk from the head (oldest) forward */
+ TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) {
+ if (t_cursor->t_timeout <= t_cyc_expire) {
+ TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer);
+ return;
+ }
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ }
+ /* Got to the end of the list -- We're the next to expire */
+ ctrl->t_cyc_next_expire = t_cyc_expire;
+ TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+ return;
+ } else {
+ /* high-priority timer: walk from the tail (newest) backward */
+ TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) {
+ if (t_cursor->t_timeout >= t_cyc_expire) {
+ TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor,
+    t_insert, timer);
+ ctrl->t_cyc_next_expire =
+     min(t_cyc_expire, ctrl->t_cyc_next_expire);
+ return;
+ }
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ }
+ TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer);
+ /* No need to check if we inserted last, given first branch case */
+ /* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */
+ /* ctrl->t_cyc_next_expire = t_cyc_expire; */
+ return;
+ }
+
+ return;
+}
+
+/*
+ * Fire every pending timer whose timeout is <= t_cyc_expire, walking
+ * from the tail (soonest to expire) backward.  Each timer is removed
+ * and its PENDING flag cleared *before* its callback runs, so the
+ * callback may legally re-arm it.  Returns PSM2_OK if any timer fired,
+ * else PSM2_OK_NO_PROGRESS.
+ */
+psm2_error_t
+psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire)
+{
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq);
+
+ PSM2_LOG_MSG("entering");
+
+ while (t_cursor) {
+ if (t_cursor->t_timeout > t_cyc_expire)
+ break;
+
+ err = PSM2_OK;
+ psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING);
+ t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING;
+ TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer);
+ t_cursor->expire_callback(t_cursor, t_cyc_expire);
+ t_cursor = TAILQ_PREV(t_cursor, timerq, timer);
+ }
+
+ /* next expiration is whatever now sits at the tail */
+ if (TAILQ_EMPTY(&ctrl->timerq))
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+ else
+ ctrl->t_cyc_next_expire =
+     TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+/*
+ * Remove a pending timer from the queue and clear its PENDING flag.
+ * t_remove must currently be pending (asserted).  The next-expiration
+ * time is recomputed from the new tail, or set to PSMI_TIMER_INFINITE
+ * when the queue empties.
+ */
+void
+psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_remove)
+{
+
+ psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING);
+
+ t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING;
+ TAILQ_REMOVE(&ctrl->timerq, t_remove, timer);
+
+ /*
+ * If we're removing the last entry, we need to reset the
+ * expiration cycle time.
+ */
+ if (TAILQ_EMPTY(&ctrl->timerq))
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+ else
+ ctrl->t_cyc_next_expire =
+     TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+ return;
+}
diff --git a/psm_timer.h b/psm_timer.h
new file mode 100644
index 0000000..a57fd7a
--- /dev/null
+++ b/psm_timer.h
@@ -0,0 +1,164 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_timer.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_TIMER_H
+#define _PSMI_TIMER_H
+
+#include "psm_user.h"
+
+/* Keep timer stats */
+#define PSMI_TIMER_STATS 0
+
+typedef struct psmi_timer psmi_timer;
+typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *,
+ uint64_t);
+
+/* A single timer entry.  Fields marked "opaque" are owned by the timer
+ * core; users only set the callback/context via psmi_timer_entry_init(). */
+struct psmi_timer {
+	TAILQ_ENTRY(psmi_timer) timer;	/* opaque -- queue linkage */
+	uint64_t t_timeout;	/* opaque -- absolute expiry time */
+	uint8_t flags;		/* opaque -- PSMI_TIMER_FLAG_* bits */
+
+	psmi_timer_expire_callback_t expire_callback;	/* user -- callback fn */
+	void *context;		/* user -- callback param */
+};
+
+/* Per-queue control block.  timerq is sorted by t_timeout with the
+ * earliest-expiring entry at the tail; t_cyc_next_expire caches that
+ * tail timeout (PSMI_TIMER_INFINITE when the queue is empty). */
+struct psmi_timer_ctrl {
+	uint64_t t_cyc_next_expire;
+	TAILQ_HEAD(timerq, psmi_timer) timerq;
+
+#if PSMI_TIMER_STATS
+	uint64_t num_insertions;
+	uint64_t num_traversals;
+#endif
+};
+
+/*
+ * Some events need to be unconditionally enqueued at the beginning of the
+ * timerq -- they are not timers meant to expire but merely operations that
+ * need to be delayed. For delayed operations, there are 5 levels of
+ * priority.
+ */
+#define PSMI_TIMER_PRIO_0 0ULL
+#define PSMI_TIMER_PRIO_1 1ULL
+#define PSMI_TIMER_PRIO_2 2ULL
+#define PSMI_TIMER_PRIO_3 3ULL
+#define PSMI_TIMER_PRIO_4 4ULL
+#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4
+
+#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL
+#define PSMI_TIMER_FLAG_PENDING 0x01
+
+/*
+ * Timer control initialization and finalization
+ */
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl);
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl);
+
+/*
+ * Timer entry initialization (a timer must be initialized before it can be
+ * added to the timer request queue).
+ */
+
+/*
+ * Initialize a timer entry with its expire callback and callback
+ * argument.  Clears all flags (so the entry is NOT pending); the entry
+ * must be initialized before it is ever passed to psmi_timer_request().
+ */
+PSMI_ALWAYS_INLINE(
+void
+psmi_timer_entry_init(struct psmi_timer *t_init,
+		      psmi_timer_expire_callback_t expire_fn,
+		      void *context))
+{
+	t_init->flags = 0;
+	t_init->expire_callback = expire_fn;
+	t_init->context = context;
+	return;
+}
+
+/*
+ * Timer requests, conditional (macro) or unconditional
+ */
+#define psmi_timer_request(ctrl, t_insert, t_cyc) \
+ if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \
+ psmi_timer_request_always((ctrl), (t_insert), (t_cyc))
+
+void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_insert,
+ uint64_t t_cyc_expire);
+
+/*
+ * Timer cancelations, conditional (macro) only (cancel_inner is internal)
+ */
+#define psmi_timer_cancel(ctrl, t_remove) \
+ if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \
+ psmi_timer_cancel_inner(ctrl, t_remove)
+void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_remove);
+
+/*
+ * Timer processing, conditional or unconditional.
+ */
+#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \
+ (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? \
+ psmi_timer_process_expired(ctrl, t_cyc_expire) : \
+ PSM2_OK_NO_PROGRESS)
+
+#define psmi_timer_is_expired(ctrl, t_cyc_expire) \
+ ((ctrl)->t_cyc_next_expire <= (t_cyc_expire))
+
+psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl,
+ uint64_t t_cyc_expire);
+
+#endif /* _PSMI_TIMER_H */
diff --git a/psm_user.h b/psm_user.h
new file mode 100644
index 0000000..dd5384f
--- /dev/null
+++ b/psm_user.h
@@ -0,0 +1,500 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_USER_H
+#define _PSMI_USER_H
+
+#include <inttypes.h>
+#include <pthread.h>
+
+#include <sched.h>
+#include <numa.h>
+
+#include "psm2.h"
+#include "psm2_mq.h"
+
+#include "ptl.h"
+
+#include "opa_user.h"
+#include "opa_queue.h"
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "psm_log.h"
+#include "psm_perf.h"
+
+#ifdef PSM_VALGRIND
+#define PSM_VALGRIND_REDZONE_SZ 8
+#define PSM_VALGRIND_DEFINE_MQ_RECV(buf, posted_len, recv_len) do { \
+ VALGRIND_MAKE_MEM_DEFINED((void *)(buf), (posted_len)); \
+ if ((recv_len) < (posted_len)) \
+ VALGRIND_MAKE_MEM_UNDEFINED( \
+ (void *) ((uintptr_t) (buf) + (recv_len)), \
+ (posted_len) - (recv_len)); \
+ } while (0)
+
+#else
+#define PSM_VALGRIND_REDZONE_SZ 0
+#define PSM_VALGRIND_DEFINE_MQ_RECV(buf, posted_len, recv_len)
+#define VALGRIND_CREATE_MEMPOOL(ARG1,ARG2,ARG3)
+#define VALGRIND_MAKE_MEM_DEFINED(ARG1,ARG2)
+#define VALGRIND_DESTROY_MEMPOOL(ARG1)
+#define VALGRIND_MEMPOOL_ALLOC(ARG1,ARG2,ARG3)
+#define VALGRIND_MEMPOOL_FREE(ARG1,ARG2)
+#define VALGRIND_MAKE_MEM_NOACCESS(ARG1,ARG2)
+#endif
+
+/* Parameters for use in valgrind's "is_zeroed" */
+#define PSM_VALGRIND_MEM_DEFINED 1
+#define PSM_VALGRIND_MEM_UNDEFINED 0
+
+#define PSMI_LOCK_NO_OWNER ((pthread_t)(-1))
+
+#ifdef PSM_DEBUG
+#define PSMI_LOCK_IS_MUTEXLOCK_DEBUG
+#else
+#define PSMI_LOCK_IS_SPINLOCK
+/* #define PSMI_LOCK_IS_MUTEXLOCK */
+/* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */
+/* #define PSMI_PLOCK_IS_NOLOCK */
+#endif
+
+#define _PSMI_IN_USER_H
+#include "psm_help.h"
+#include "psm_error.h"
+#include "psm_context.h"
+#include "psm_utils.h"
+#include "psm_timer.h"
+#include "psm_mpool.h"
+#include "psm_ep.h"
+#include "psm_lock.h"
+#include "psm_stats.h"
+#include "psm2_mock_testing.h"
+#undef _PSMI_IN_USER_H
+
+#define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff))
+#define PSMI_VERNO PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR)
+#define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff)
+#define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff)
+
+int psmi_verno_client();
+int psmi_verno_isinteroperable(uint16_t verno);
+int MOCKABLE(psmi_isinitialized)();
+MOCK_DCL_EPILOGUE(psmi_isinitialized);
+
+psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh);
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq);
+
+int psmi_get_current_proc_location();
+
+extern int psmi_epid_ver;
+extern uint32_t non_dw_mul_sdma;
+extern psmi_lock_t psmi_creation_lock;
+
+extern psm2_ep_t psmi_opened_endpoint;
+
+/* Return the process-wide epid wire-format version (global psmi_epid_ver,
+ * see PSMI_EPID_V1/V2); exposed through the PSMI_EPID_VERSION macro. */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_get_epid_version()) {
+	return psmi_epid_ver;
+}
+
+#define PSMI_EPID_VERSION_SHM 0
+#define PSMI_EPID_SHM_ONLY 1
+#define PSMI_EPID_IPS_SHM 0
+#define PSMI_EPID_VERSION _psmi_get_epid_version()
+#define PSMI_MAX_EPID_VERNO_SUPPORTED 2
+#define PSMI_MIN_EPID_VERNO_SUPPORTED 1
+#define PSMI_EPID_VERNO_DEFAULT 2
+#define PSMI_EPID_V1 1
+#define PSMI_EPID_V2 2
+
+#define PSMI_EPID_GET_LID(epid) (PSMI_EPID_VERSION == PSMI_EPID_V1) ? \
+ (int)PSMI_EPID_GET_LID_V1(epid) \
+ : (int)PSMI_EPID_GET_LID_V2(epid)
+
+#define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffff)
+/*
+ * Default setting for Receive thread
+ *
+ * 0 disables rcvthread by default
+ * 0x1 enables ips receive thread by default
+ */
+#define PSMI_RCVTHREAD_FLAGS 0x1
+
+/*
+ * Define one of these below.
+ *
+ * Spinlock gives the best performance and makes sense with the progress thread
+ * only because the progress thread does a "trylock" and then goes back to
+ * sleep in a poll.
+ *
+ * Mutexlock should be used for experimentation while the more useful
+ * mutexlock-debug should be enabled during development to catch potential
+ * errors.
+ */
+#ifdef PSMI_LOCK_IS_SPINLOCK
+#define _PSMI_LOCK_INIT(pl) psmi_spin_init(&((pl).lock))
+#define _PSMI_LOCK_TRY(pl) psmi_spin_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) psmi_spin_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) psmi_spin_unlock(&((pl).lock))
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+
+/*
+ * Debug trylock: asserts the calling thread does not already hold the
+ * lock (recursive acquisition is a bug here), then records the caller
+ * as owner on success.  Returns pthread_mutex_trylock()'s result.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_trylock_inner(pthread_mutex_t *mutex,
+			  const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_trylock(mutex);
+	if (ret == 0)
+		*lock_owner = pthread_self();
+	return ret;
+}
+
+/*
+ * Debug lock: asserts against self-deadlock (caller must not already
+ * own the lock), acquires the mutex, double-checks for EDEADLK, and
+ * records ownership.  Returns pthread_mutex_lock()'s result.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_lock_inner(pthread_mutex_t *mutex,
+		       const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_lock(mutex);
+	psmi_assert_always_loc(ret != EDEADLK, curloc);
+	*lock_owner = pthread_self();
+	return ret;
+}
+
+/*
+ * Debug unlock: asserts the calling thread is the recorded owner,
+ * clears ownership BEFORE releasing (so a racing locker sees no stale
+ * owner), then unlocks, asserting the mutex was actually held (EPERM).
+ */
+PSMI_ALWAYS_INLINE(
+void
+_psmi_mutex_unlock_inner(pthread_mutex_t *mutex,
+			 const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner == pthread_self(),
+			       curloc);
+	*lock_owner = PSMI_LOCK_NO_OWNER;
+	psmi_assert_always_loc(pthread_mutex_unlock(mutex) !=
+			       EPERM, curloc);
+	return;
+}
+
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) \
+ _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+#define _PSMI_LOCK(pl) \
+ _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+#define _PSMI_UNLOCK(pl) \
+ _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+/* Ownership asserts for the debug-mutex build.  The macro parameter is
+ * parenthesized so any lvalue expression works, and the trailing
+ * semicolon is left to the call site so the macro can't produce an
+ * empty extra statement inside unbraced if/else bodies. */
+#define _PSMI_LOCK_ASSERT(pl)					\
+	psmi_assert_always((pl).lock_owner == pthread_self())
+#define _PSMI_UNLOCK_ASSERT(pl)					\
+	psmi_assert_always((pl).lock_owner != pthread_self())
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) pthread_mutex_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) pthread_mutex_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) pthread_mutex_unlock(&((pl).lock))
+#define PSMI_LOCK_DISABLED 0
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+
+#elif defined(PSMI_PLOCK_IS_NOLOCK)
+#define _PSMI_LOCK_TRY(pl) 0 /* 0 *only* so progress thread never succeeds */
+#define _PSMI_LOCK(pl)
+#define _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_DISABLED 1
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#else
+#error No LOCK lock type declared
+#endif
+
+#define PSMI_YIELD(pl) \
+ do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0)
+
+#ifdef PSM2_MOCK_TESTING
+/* If this is a mocking tests build, all the operations on the locks
+ * are routed through functions which may be mocked, if necessary. */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_init);
+
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_try);
+
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock);
+
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock);
+
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert);
+
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert);
+
+#define PSMI_LOCK_INIT(pl) psmi_mockable_lock_init(&(pl))
+#define PSMI_LOCK_TRY(pl) psmi_mockable_lock_try(&(pl))
+#define PSMI_LOCK(pl) psmi_mockable_lock(&(pl))
+#define PSMI_UNLOCK(pl) psmi_mockable_unlock(&(pl))
+#define PSMI_LOCK_ASSERT(pl) psmi_mockable_lock_assert(&(pl))
+#define PSMI_UNLOCK_ASSERT(pl) psmi_mockable_unlock_assert(&(pl))
+#else
+#define PSMI_LOCK_INIT(pl) _PSMI_LOCK_INIT(pl)
+#define PSMI_LOCK_TRY(pl) _PSMI_LOCK_TRY(pl)
+#define PSMI_LOCK(pl) _PSMI_LOCK(pl)
+#define PSMI_UNLOCK(pl) _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_ASSERT(pl) _PSMI_LOCK_ASSERT(pl)
+#define PSMI_UNLOCK_ASSERT(pl) _PSMI_UNLOCK_ASSERT(pl)
+#endif
+
+#ifdef PSM_PROFILE
+void psmi_profile_block() __attribute__ ((weak));
+void psmi_profile_unblock() __attribute__ ((weak));
+void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
+
+#define PSMI_PROFILE_BLOCK() psmi_profile_block()
+#define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock()
+#define PSMI_PROFILE_REBLOCK(noprog) psmi_profile_reblock(noprog)
+#else
+#define PSMI_PROFILE_BLOCK()
+#define PSMI_PROFILE_UNBLOCK()
+#define PSMI_PROFILE_REBLOCK(noprog)
+#endif
+
+#ifdef PSM_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <driver_types.h>
+
+#if CUDART_VERSION < 4010
+#error Please update CUDA runtime, required minimum version is 4.1
+#endif
+
+extern int is_cuda_enabled;
+extern int device_support_gpudirect;
+extern int cuda_runtime_version;
+
+extern CUcontext ctxt;
+/*
+ * Dynamically-resolved CUDA driver/runtime entry points and library
+ * handles, presumably filled in at init via dlopen/dlsym -- resolution
+ * code is not in this header; confirm against the loader.
+ *
+ * NOTE(review): these are tentative definitions in a header without
+ * 'extern' -- every translation unit that includes psm_user.h emits
+ * them, relying on the linker merging common symbols (breaks under
+ * -fno-common).  Consider extern declarations here plus a single
+ * definition in one .c file.
+ */
+void *psmi_cudart_lib;
+void *psmi_cuda_lib;
+CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c);
+CUresult (*psmi_cuCtxSetCurrent)(CUcontext c);
+CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+cudaError_t (*psmi_cudaRuntimeGetVersion)(int *runtime_version);
+cudaError_t (*psmi_cudaGetDeviceCount)(int *n);
+cudaError_t (*psmi_cudaGetDeviceProperties)(struct cudaDeviceProp *p, int d);
+cudaError_t (*psmi_cudaGetDevice)(int *n);
+cudaError_t (*psmi_cudaSetDevice)(int n);
+cudaError_t (*psmi_cudaStreamCreate)(cudaStream_t *s);
+cudaError_t (*psmi_cudaStreamCreateWithFlags)(cudaStream_t *s, unsigned f);
+cudaError_t (*psmi_cudaStreamSynchronize)(cudaStream_t s);
+cudaError_t (*psmi_cudaDeviceSynchronize)();
+cudaError_t (*psmi_cudaEventCreate)(cudaEvent_t *event);
+cudaError_t (*psmi_cudaEventDestroy)(cudaEvent_t event);
+cudaError_t (*psmi_cudaEventQuery)(cudaEvent_t event);
+cudaError_t (*psmi_cudaEventRecord)(cudaEvent_t event, cudaStream_t stream);
+cudaError_t (*psmi_cudaEventSynchronize)(cudaEvent_t event);
+cudaError_t (*psmi_cudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+cudaError_t (*psmi_cudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t s);
+cudaError_t (*psmi_cudaMalloc)(void **devPtr, size_t size);
+cudaError_t (*psmi_cudaHostAlloc)(void **devPtr, size_t size, unsigned int flags);
+cudaError_t (*psmi_cudaFreeHost)(void *ptr);
+
+cudaError_t (*psmi_cudaIpcGetMemHandle)(cudaIpcMemHandle_t* handle, void* devPtr);
+cudaError_t (*psmi_cudaIpcOpenMemHandle)(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+cudaError_t (*psmi_cudaIpcCloseMemHandle)(void* devPtr);
+
+/*
+ * Invoke a CUDA driver-API entry point through its psmi_ function
+ * pointer; any result other than CUDA_SUCCESS is fatal
+ * (psmi_handle_error with PSMI_EP_NORETURN).
+ *
+ * Fix: the adjacent string literals lacked separating spaces, so the
+ * messages rendered as "initializedbefore" and "(at file:NN)returned".
+ */
+#define PSMI_CUDA_DRIVER_API_CALL(func, args...) do {			\
+		CUresult cudaerr;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != CUDA_SUCCESS) {				\
+			if (ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if cuda runtime is initialized "	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d) "	\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+/*
+ * Invoke a CUDA runtime-API entry point through its psmi_ function
+ * pointer; any result other than cudaSuccess is fatal.
+ *
+ * Fix: the adjacent string literals lacked separating spaces, so the
+ * messages rendered as "initializedbefore" and "(at file:NN)returned".
+ */
+#define PSMI_CUDA_CALL(func, args...) do {				\
+		cudaError_t cudaerr;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != cudaSuccess) {				\
+			if (ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if cuda runtime is initialized "	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d) "	\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+/*
+ * Poll a CUDA event: stores cudaEventQuery()'s result in 'cudaerr'.
+ * cudaSuccess (done) and cudaErrorNotReady (still running) are both
+ * acceptable outcomes; anything else is fatal.
+ */
+#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
+		cudaerr = psmi_cudaEventQuery(event);			\
+		if ((cudaerr != cudaSuccess) &&				\
+		    (cudaerr != cudaErrorNotReady)) {			\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() returned %d\n",	\
+				"cudaEventQuery", cudaerr);		\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+
+
+/*
+ * Return 1 if 'ptr' is CUDA device memory, 0 otherwise.  Uses
+ * cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_MEMORY_TYPE); a query
+ * failure (e.g. an ordinary host pointer unknown to CUDA) is treated
+ * as "not device memory" rather than an error.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_mem(void *ptr))
+{
+	CUresult cres;
+	CUmemorytype mt;
+	cres = psmi_cuPointerGetAttribute(
+		&mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr);
+	if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE))
+		return 1;
+	else
+		return 0;
+}
+
+/* Return nonzero when CUDA support was enabled at init (global flag). */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_enabled())
+{
+	return is_cuda_enabled;
+}
+
+#define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
+
+#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p)
+/* XXX TODO: Getting the gpu page size from driver at init time */
+#define PSMI_GPU_PAGESIZE 65536
+
+struct ips_cuda_hostbuf {
+ STAILQ_ENTRY(ips_cuda_hostbuf) req_next;
+ STAILQ_ENTRY(ips_cuda_hostbuf) next;
+ uint32_t size, offset, bytes_read;
+ /* This flag indicates whether a chb is
+ * pulled from a mpool or dynamically
+ * allocated using calloc. */
+ uint8_t is_tempbuf;
+ cudaEvent_t copy_status;
+ psm2_mq_req_t req;
+ void *host_buf, *gpu_buf;
+};
+
+struct ips_cuda_hostbuf_mpool_cb_context {
+ unsigned bufsz;
+};
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+
+#define CUDA_HOSTBUFFER_LIMITS { \
+ .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \
+ .descr = "Max CUDA bounce buffers (in MB)", \
+ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \
+ .minval = 1, \
+ .maxval = 1<<30, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 16, 256 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \
+ }
+
+#define CUDA_SMALLHOSTBUF_SZ (256*1024)
+#define CUDA_WINDOW_PREFETCH_DEFAULT 2
+#define GPUDIRECT_THRESH_RV 3
+
+extern uint32_t gpudirect_send_threshold;
+extern uint32_t gpudirect_recv_threshold;
+
+enum psm2_chb_match_type {
+ /* Complete data found in a single chb */
+ PSMI_CUDA_FULL_MATCH_FOUND = 0,
+ /* Data is spread across two chb's */
+ PSMI_CUDA_SPLIT_MATCH_FOUND = 1,
+ /* Data is only partially prefetched */
+ PSMI_CUDA_PARTIAL_MATCH_FOUND = 2,
+ PSMI_CUDA_CONTINUE = 3
+};
+typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+
+#endif /* PSM_CUDA */
+#endif /* _PSMI_USER_H */
diff --git a/psm_utils.c b/psm_utils.c
new file mode 100644
index 0000000..37446a4
--- /dev/null
+++ b/psm_utils.c
@@ -0,0 +1,2553 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <netdb.h> /* gethostbyname */
+#include <malloc.h> /* malloc_usable_size */
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+struct psmi_epid_table psmi_epid_table;
+
+/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested
+ */
+/* Begin iterating the epid table, restricted to endpoint 'ep' (NULL for
+ * all endpoints).  Takes the table lock and HOLDS it until
+ * psmi_epid_itor_fini() -- callers must always pair init with fini. */
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep)
+{
+	itor->i = 0;
+	itor->ep = ep;
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+}
+
+/*
+ * Return the next live entry (skipping empty slots and EPADDR_DELETED
+ * tombstones, and entries of other endpoints when the iterator was
+ * bound to one), or NULL when the table is exhausted.  Relies on the
+ * table lock taken in psmi_epid_itor_init().
+ */
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor)
+{
+	int i;
+	struct psmi_epid_tabentry *e;
+
+	if (itor->i >= psmi_epid_table.tabsize)
+		return NULL;
+	for (i = itor->i; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+		if (!e->entry || e->entry == EPADDR_DELETED)
+			continue;
+		if (itor->ep && e->ep != itor->ep)
+			continue;
+		itor->i = i + 1;
+		return e->entry;
+	}
+	itor->i = psmi_epid_table.tabsize;	/* put at end of table */
+	return NULL;
+}
+
+/* End iteration: releases the table lock taken by psmi_epid_itor_init()
+ * and resets the cursor. */
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor)
+{
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	itor->i = 0;
+}
+
+/*
+ * 64-bit mixing step (appears to be Bob Jenkins' lookup8 mix64 --
+ * TODO confirm): scrambles a, b and c in place so every input bit
+ * influences the result.  Used only by hash_this() below; the caller
+ * reads the final value of its third argument.
+ */
+#define mix64(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>43); \
+	b -= c; b -= a; b ^= (a<<9);  \
+	c -= a; c -= b; c ^= (b>>8);  \
+	a -= b; a -= c; a ^= (c>>38); \
+	b -= c; b -= a; b ^= (a<<23); \
+	c -= a; c -= b; c ^= (b>>5);  \
+	a -= b; a -= c; a ^= (c>>35); \
+	b -= c; b -= a; b ^= (a<<49); \
+	c -= a; c -= b; c ^= (b>>11); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<18); \
+	c -= a; c -= b; c ^= (b>>22); \
+}
+
+/*
+ * Initialize the global epid hash table bookkeeping.  The table storage
+ * itself is allocated lazily by psmi_epid_add(); here we only reset the
+ * fields and create the recursive mutex that serializes table access.
+ *
+ * Always returns PSM2_OK.
+ *
+ * Fixes: the comma-operator statement is split into two plain
+ * assignments, and the stray ';' after the function's closing brace
+ * (an empty file-scope declaration) is removed.
+ */
+psm2_error_t psmi_epid_init()
+{
+	pthread_mutexattr_t attr;
+	psmi_epid_table.table = NULL;
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutex_init(&psmi_epid_table.tablock, &attr);
+	pthread_mutexattr_destroy(&attr);
+	return PSM2_OK;
+}
+
+/* Release the epid table storage and reset bookkeeping.  The table
+ * mutex is intentionally left alone (statically initialized storage).
+ * Always returns PSM2_OK. */
+psm2_error_t psmi_epid_fini()
+{
+	if (psmi_epid_table.table != NULL) {
+		psmi_free(psmi_epid_table.table);
+		psmi_epid_table.table = NULL;
+	}
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	return PSM2_OK;
+}
+
+/* Hash the (ep, epid) pair to a 64-bit key: the endpoint pointer and
+ * epid are mixed with a golden-ratio seed via mix64(); the scrambled
+ * third operand is the key. */
+PSMI_ALWAYS_INLINE(
+uint64_t
+hash_this(const psm2_ep_t ep, const psm2_epid_t epid))
+{
+	uint64_t ep_i = (uint64_t) (uintptr_t) ep;
+	uint64_t epid_i = (uint64_t) epid;
+	uint64_t hash = 0x9e3779b97f4a7c13LL;
+	mix64(ep_i, epid_i, hash);
+	return hash;
+}
+
+/*
+ * Core open-addressing lookup: linear-probe from key % tabsize until an
+ * empty slot (miss) or a matching key (hit).  EPADDR_DELETED tombstones
+ * keep probe chains intact across removals.  When 'remove' is set a hit
+ * is tombstoned instead of merely read.  Takes/releases the table lock;
+ * returns the stored entry or NULL.
+ *
+ * NOTE(review): a hit is decided by e->key == key alone (the 64-bit
+ * hash of (ep, epid)); distinct pairs colliding on the full hash would
+ * alias each other -- confirm this is an accepted risk.
+ */
+PSMI_ALWAYS_INLINE(
+void *
+psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove))
+{
+	uint64_t key = hash_this(ep, epid);
+	struct psmi_epid_tabentry *e;
+	void *entry = NULL;
+	int idx;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	if (!psmi_epid_table.table)
+		goto ret;
+	idx = (int)(key % psmi_epid_table.tabsize);
+	/* Probe loop terminates because psmi_epid_add() grows the table
+	 * before the load factor would let it fill completely. */
+	while (psmi_epid_table.table[idx].entry != NULL) {
+		/* An epid can be added twice if there's more than one opened endpoint,
+		 * but really we match on epid *and* on endpoint */
+		e = &psmi_epid_table.table[idx];
+		if (e->entry != EPADDR_DELETED && e->key == key) {
+			entry = e->entry;
+			if (remove)
+				psmi_epid_table.table[idx].entry =
+				    EPADDR_DELETED;
+			goto ret;
+		}
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+	}
+ret:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return entry;
+}
+
+/* Public lookup wrapper: find the entry for (ep, epid) without removing
+ * it.  Debug tracing is suppressed for the hostname pseudo-endpoint. */
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid)
+{
+	void *entry = psmi_epid_lookup_inner(ep, epid, 0);
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid,
+			  entry);
+	return entry;
+}
+
+/* Remove the entry for (ep, epid) from the table (tombstoning its slot)
+ * and return it, or NULL if absent.  Caller owns freeing the entry. */
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
+{
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid);
+	return psmi_epid_lookup_inner(ep, epid, 1);
+}
+
+/*
+ * Insert 'entry' for (ep, epid) into the hash table, growing and
+ * rehashing when the load factor would be exceeded.  Rehashing drops
+ * EPADDR_DELETED tombstones, reclaiming their slots.  Returns PSM2_OK
+ * or PSM2_NO_MEMORY.  Takes/releases the table lock.
+ *
+ * NOTE(review): tabsize_used is incremented before the allocation
+ * check and is not rolled back on the PSM2_NO_MEMORY path -- the
+ * counter drifts by one per failed add; confirm whether intentional.
+ */
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
+{
+	uint64_t key;
+	int idx, i, newsz;
+	struct psmi_epid_tabentry *e;
+	psm2_error_t err = PSM2_OK;
+
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid,
+			  entry);
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	/* Leave this here, mostly for sanity and for the fact that the epid
+	 * table is currently not used in the critical path */
+	if (++psmi_epid_table.tabsize_used >
+	    (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) {
+		struct psmi_epid_tabentry *newtab;
+		newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK;
+		newtab = (struct psmi_epid_tabentry *)
+		    psmi_calloc(ep, PER_PEER_ENDPOINT,
+				newsz, sizeof(struct psmi_epid_tabentry));
+		if (newtab == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		if (psmi_epid_table.table) {	/* rehash the table */
+			for (i = 0; i < psmi_epid_table.tabsize; i++) {
+				e = &psmi_epid_table.table[i];
+				if (e->entry == NULL)
+					continue;
+				/* When rehashing, mark deleted as free again */
+				if (e->entry == EPADDR_DELETED) {
+					psmi_epid_table.tabsize_used--;
+					continue;
+				}
+				idx = (int)(e->key % newsz);
+				while (newtab[idx].entry != NULL)
+					if (++idx == newsz)
+						idx = 0;
+				newtab[idx].entry = e->entry;
+				newtab[idx].key = e->key;
+				newtab[idx].ep = e->ep;
+				newtab[idx].epid = e->epid;
+			}
+			psmi_free(psmi_epid_table.table);
+		}
+		psmi_epid_table.table = newtab;
+		psmi_epid_table.tabsize = newsz;
+	}
+	key = hash_this(ep, epid);
+	idx = (int)(key % psmi_epid_table.tabsize);
+	e = &psmi_epid_table.table[idx];
+	/* First free or tombstoned slot on the probe chain takes the entry */
+	while (e->entry && e->entry != EPADDR_DELETED) {
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+		e = &psmi_epid_table.table[idx];
+	}
+	e->entry = entry;
+	e->key = key;
+	e->epid = epid;
+	e->ep = ep;
+
+fail:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return err;
+}
+
+/*
+ * Return the short (domain-stripped) hostname, cached in a static
+ * buffer on first use.
+ *
+ * XXX this will need a lock in a multi-threaded environment
+ *
+ * Fix: gethostname()'s return value was unchecked; on failure the
+ * buffer contents are unspecified, and the old code would cache and
+ * return that garbage forever.  We now reset the cache and return an
+ * empty string, so the next call retries.
+ */
+char *psmi_gethostname(void)
+{
+	/* XXX this will need a lock in a multi-threaded environment */
+	static char hostname[80] = { '\0' };
+	char *c;
+
+	if (hostname[0] == '\0') {
+		if (gethostname(hostname, sizeof(hostname)) != 0) {
+			hostname[0] = '\0';	/* don't cache garbage */
+			return hostname;
+		}
+		hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
+		if ((c = strchr(hostname, '.')))
+			*c = '\0';
+	}
+
+	return hostname;
+}
+
+/*
+ * Hostname stuff. We really only register the network portion of the epid
+ * since all epids from the same nid are assumed to have the same hostname.
+ */
+/*
+ * Register 'hostname' for network id 'nid' under the PSMI_EP_HOSTNAME
+ * pseudo-endpoint.  A NULL hostname is a no-op; an existing entry is
+ * kept unless 'overwrite' is set, in which case it is removed and
+ * freed first.  The stored copy is truncated to PSMI_EP_HOSTNAME_LEN.
+ *
+ * NOTE(review): if psmi_epid_add() fails, the freshly allocated copy
+ * 'h' is leaked -- consider freeing it on error.
+ */
+psm2_error_t
+psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite)
+{
+	size_t hlen;
+	char *h;
+	psm2_error_t err = PSM2_OK;
+
+	if (hostname == NULL)
+		return PSM2_OK;
+	/* First see if a hostname already exists */
+	if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) {
+		if (!overwrite)
+			return PSM2_OK;
+
+		h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid);
+		if (h != NULL)	/* free the previous hostname if so exists */
+			psmi_free(h);
+	}
+
+	hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1);
+	h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen);
+	if (h == NULL)
+		return PSM2_NO_MEMORY;
+	snprintf(h, hlen, "%s", hostname);
+	h[hlen - 1] = '\0';
+	err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h);
+	return err;
+}
+
+/* XXX These two functions are not thread safe, we'll use a rotating buffer
+ * trick whenever we need to make them thread safe */
+/*
+ * Return a printable host string for 'epid': the registered hostname if
+ * one exists, otherwise a synthesized "LID=lid:context.subcontext"
+ * string built in one of four rotating static buffers (so up to four
+ * results can be alive at once, e.g. in a single printf).  Not
+ * thread-safe (see note above).
+ */
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid)
+{
+	static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	uint64_t nid = psm2_epid_nid(epid);
+	char *h, *hostname;
+
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 4;
+
+	/* First, if we have registered a host for this epid, just return that, or
+	 * else try to return something with lid and context */
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid);
+	if (h != NULL)
+		return h;
+	else {
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d:%d.%d",
+			 (int)PSMI_EPID_GET_LID(epid),
+			 (int)PSMI_EPID_GET_CONTEXT(epid),
+			 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+		hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+		return hostname;
+	}
+}
+
+/* This one gives the hostname with a lid */
+/* Like psmi_epaddr_get_hostname(), but when a hostname is registered
+ * the result also carries the LID/context: "host (LID=l:c.s)".  Uses
+ * the same rotating-static-buffer scheme; not thread-safe. */
+const char *psmi_epaddr_get_name(psm2_epid_t epid)
+{
+	static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	char *h, *hostname;
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 4;
+
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid));
+	if (h == NULL)
+		return psmi_epaddr_get_hostname(epid);
+	else {
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1,
+			 "%s (LID=%d:%d.%d)", h,
+			 (int)PSMI_EPID_GET_LID(epid),
+			 (int)PSMI_EPID_GET_CONTEXT(epid),
+			 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+		hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+	}
+	return hostname;
+}
+
+/* Wrapper, in case we port to OS xyz that doesn't have sysconf */
+uintptr_t psmi_getpagesize(void)
+{
+ static uintptr_t pagesz = (uintptr_t) -1;
+ long sz;
+ if (pagesz != (uintptr_t) -1)
+ return pagesz;
+ sz = sysconf(_SC_PAGESIZE);
+ if (sz == -1) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Can't query system page size");
+ }
+
+ pagesz = (uintptr_t) sz;
+ return pagesz;
+}
+
/* If PSM2_VERBOSE_ENV is set in the environment, determine its verbosity
 * level once and cache it: 0 = unset/unparsable, 2 = extra verbose, any
 * other parsable integer = 1.  Returns non-zero when 'printlevel' is
 * within the cached verbosity. */
static int psmi_getenv_verblevel = -1;
static int psmi_getenv_is_verblevel(int printlevel)
{
	if (psmi_getenv_verblevel < 0) {
		const char *env = getenv("PSM2_VERBOSE_ENV");
		int level = 0;

		if (env != NULL && *env != '\0') {
			char *end;
			long parsed = strtol(env, &end, 0);

			if (end != env)	/* parsed something */
				level = (parsed == 2) ? 2 : 1;
		}
		psmi_getenv_verblevel = level;
	}
	return printlevel <= psmi_getenv_verblevel;
}
+
+#define GETENV_PRINTF(_level, _fmt, ...) \
+ do { \
+ int nlevel = _level; \
+ if (psmi_getenv_is_verblevel(nlevel)) \
+ nlevel = 0; \
+ _HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__); \
+ } while (0)
+
/*
 * Read environment variable 'name', parsed according to 'type', into
 * *newval.  'defval' is used when the variable is unset, empty, or fails
 * to parse.  'descr' and 'level' only affect the diagnostic echo through
 * GETENV_PRINTF (PSM2_VERBOSE_ENV).  Returns non-zero iff the default
 * was used, 0 when the environment supplied the value.
 */
int
MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
		      int type, union psmi_envvar_val defval,
		      union psmi_envvar_val *newval)
{
	int used_default = 0;
	union psmi_envvar_val tval;
	char *env = getenv(name);
#if _HFI_DEBUGGING
	/* FLAGS-typed variables are echoed in hexadecimal. */
	int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS ||
		     type == PSMI_ENVVAR_TYPE_UINT_FLAGS);
#endif

	/* If we're not using the default, always reset the print
	 * level to '1' so the changed value gets seen at low
	 * verbosity */
#define _GETENV_PRINT(used_default, fmt, val, defval) \
	do { \
		if (used_default) \
			GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt \
				"\n", level > 1 ? "*" : " ", name, \
				descr, ishex ? "0x" : " ", val); \
		else \
			GETENV_PRINTF(1, "%s%-25s %-40s =>%s" \
				fmt " (default was%s" fmt ")\n", \
				level > 1 ? "*" : " ", name, descr, \
				ishex ? " 0x" : " ", val, \
				ishex ? " 0x" : " ", defval); \
	} while (0)

	switch (type) {
	case PSMI_ENVVAR_TYPE_YESNO:
		/* Accept a leading y/Y or n/N; otherwise parse as an
		 * integer and collapse any non-zero value to 1. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else if (env[0] == 'Y' || env[0] == 'y')
			tval.e_int = 1;
		else if (env[0] == 'N' || env[0] == 'n')
			tval.e_int = 0;
		else {
			char *ep;
			tval.e_ulong = strtoul(env, &ep, 0);
			if (ep == env) {	/* nothing parsed */
				used_default = 1;
				tval = defval;
			} else if (tval.e_ulong != 0)
				tval.e_ulong = 1;
		}
		/* NOTE(review): reads tval.e_long although e_int/e_ulong was
		 * written above -- relies on union member aliasing of the
		 * small 0/1 values stored here. */
		_GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO",
			      defval.e_int ? "YES" : "NO");
		break;

	case PSMI_ENVVAR_TYPE_STR:
		/* Note: the caller receives a pointer into the process
		 * environment, not a copy. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else
			tval.e_str = env;
		_GETENV_PRINT(used_default, "%s", tval.e_str, defval.e_str);
		break;

	case PSMI_ENVVAR_TYPE_INT:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_int = (int)strtol(env, &ep, 0);
			if (ep == env) {	/* unparsable -> default */
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int);
		break;

	case PSMI_ENVVAR_TYPE_UINT:
	case PSMI_ENVVAR_TYPE_UINT_FLAGS:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			/* NOTE(review): stores into e_int rather than e_uint;
			 * same storage via the union, read back as e_uint. */
			tval.e_int = (unsigned int)strtoul(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS)
			_GETENV_PRINT(used_default, "%x", tval.e_uint,
				      defval.e_uint);
		else
			_GETENV_PRINT(used_default, "%u", tval.e_uint,
				      defval.e_uint);
		break;

	case PSMI_ENVVAR_TYPE_LONG:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_long = strtol(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long);
		break;
	case PSMI_ENVVAR_TYPE_ULONG_ULONG:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_ulonglong =
			    (unsigned long long)strtoull(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%llu",
			      tval.e_ulonglong, defval.e_ulonglong);
		break;
	case PSMI_ENVVAR_TYPE_ULONG:
	case PSMI_ENVVAR_TYPE_ULONG_FLAGS:
	default:
		/* Unknown types are treated as unsigned long. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_ulong = (unsigned long)strtoul(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS)
			_GETENV_PRINT(used_default, "%lx", tval.e_ulong,
				      defval.e_ulong);
		else
			_GETENV_PRINT(used_default, "%lu", tval.e_ulong,
				      defval.e_ulong);
		break;
	}
#undef _GETENV_PRINT
	*newval = tval;

	return used_default;
}
MOCK_DEF_EPILOGUE(psmi_getenv);
+
+/*
+ * Parsing int parameters set in string tuples.
+ * Output array int *vals should be able to store 'ntup' elements.
+ * Values are only overwritten if they are parsed.
+ * Tuples are always separated by colons ':'
+ */
+int psmi_parse_str_tuples(const char *string, int ntup, int *vals)
+{
+ char *b = (char *)string;
+ char *e = b;
+ int tup_i = 0;
+ int n_parsed = 0;
+ char *buf = psmi_strdup(NULL, string);
+ psmi_assert_always(buf != NULL);
+
+ while (*e && tup_i < ntup) {
+ b = e;
+ while (*e && *e != ':')
+ e++;
+ if (e > b) { /* something to parse */
+ char *ep;
+ int len = e - b;
+ long int l;
+ strncpy(buf, b, len);
+ buf[len] = '\0';
+ l = strtol(buf, &ep, 0);
+ if (ep != buf) { /* successful conversion */
+ vals[tup_i] = (int)l;
+ n_parsed++;
+ }
+ }
+ if (*e == ':')
+ e++; /* skip delimiter */
+ tup_i++;
+ }
+ psmi_free(buf);
+ return n_parsed;
+}
+
+/*
+ * Memory footprint/usage mode.
+ *
+ * This can be used for debug or for separating large installations from
+ * small/medium ones. The default is to assume a medium installation. Large
+ * is not that much larger in memory footprint, but we make a conscious effort
+ * an consuming only the amount of memory we need.
+ */
+int psmi_parse_memmode(void)
+{
+ union psmi_envvar_val env_mmode;
+ int used_default =
+ psmi_getenv("PSM2_MEMORY", "Memory usage mode (normal or large)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"normal", &env_mmode);
+ if (used_default || !strcasecmp(env_mmode.e_str, "normal"))
+ return PSMI_MEMMODE_NORMAL;
+ else if (!strcasecmp(env_mmode.e_str, "min"))
+ return PSMI_MEMMODE_MINIMAL;
+ else if (!strcasecmp(env_mmode.e_str, "large") ||
+ !strcasecmp(env_mmode.e_str, "big"))
+ return PSMI_MEMMODE_LARGE;
+ else {
+ _HFI_PRDBG("PSM2_MEMORY env value %s unrecognized, "
+ "using 'normal' memory mode instead\n",
+ env_mmode.e_str);
+ return PSMI_MEMMODE_NORMAL;
+ }
+}
+
+static
+const char *psmi_memmode_string(int mode)
+{
+ psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM);
+ switch (mode) {
+ case PSMI_MEMMODE_NORMAL:
+ return "normal";
+ case PSMI_MEMMODE_MINIMAL:
+ return "minimal";
+ case PSMI_MEMMODE_LARGE:
+ return "large";
+ default:
+ return "unknown";
+ }
+}
+
+psm2_error_t
+psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+ const struct psmi_rlimit_mpool *rlim,
+ uint32_t *valo, uint32_t *chunkszo)
+{
+ uint32_t val;
+ const char *env = rlim->env;
+ int mode = mq->memmode;
+ psm2_error_t err = PSM2_OK;
+ union psmi_envvar_val env_val;
+
+ psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL
+ && mode < PSMI_MEMMODE_NUM);
+
+ psmi_getenv(rlim->env, rlim->descr, rlim->env_level,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val);
+
+ val = env_val.e_uint;
+ if (val < rlim->minval || val > rlim->maxval) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Env. var %s=%u is invalid (valid settings in mode PSM2_MEMORY=%s"
+ " are inclusively between %u and %u)",
+ env, val, psmi_memmode_string(mode),
+ rlim->minval, rlim->maxval);
+ goto fail;
+ }
+
+ _HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n",
+ env, val, rlim->mode[mode].obj_chunk,
+ psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval);
+
+ *valo = val;
+ *chunkszo = rlim->mode[mode].obj_chunk;
+
+fail:
+ return err;
+}
+
/*
 * Convert a relative timeout in nanoseconds into the number of
 * timestamp-counter cycles still remaining, measured from 'start_cycles'.
 * Returns 0 when the timeout is negative or already expired, and ~0ULL
 * ("wait forever") when timeout_ns is 0.
 *
 * NOTE(review): the `timeout_ns == ~0ULL` comparison can never be true
 * here -- timeout_ns is signed, so -1 is caught by the `< 0` test first.
 * Preserved as-is to keep behavior identical to upstream.
 */
uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns)
{
	if (timeout_ns < 0)
		return 0ULL;
	else if (timeout_ns == 0ULL || timeout_ns == ~0ULL)
		return ~0ULL;
	else {
		uint64_t t_end = nanosecs_to_cycles(timeout_ns);
		uint64_t t_now = get_cycles() - start_cycles;

		if (t_now >= t_end)
			return 0ULL;
		else
			return (t_end - t_now);
	}
}
+
+uint32_t psmi_get_ipv4addr()
+{
+ struct hostent *he;
+ uint32_t addr = 0;
+
+ he = gethostbyname(psmi_gethostname());
+ if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) {
+ memcpy(&addr, he->h_addr, sizeof(uint32_t));
+ return addr;
+ } else
+ return 0;
+}
+
/* True when 'ptr' is a real endpoint pointer rather than one of the
 * special sentinel values at/above PSMI_EP_LOGEVENT. */
#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT)

/*
 * Emit a syslog message (optionally echoed to the console) tagged "PSM".
 * The first time a given endpoint logs through here, a one-shot context
 * line (uuid, unit, context, subcontext) is emitted so later messages can
 * be correlated with the endpoint.
 */
void
psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...)
{
	va_list ap;

	/* If we've never syslogged anything from this ep at the PSM level, make
	 * sure we log context information */
	if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) {
		char uuid_str[64];
		/* hfi != 0 when a HW context is attached; otherwise the
		 * unit/context fields are logged as -1. */
		int hfi = ep->context.ctrl != NULL;
		ep->did_syslog = 1;

		memset(&uuid_str, 0, sizeof(uuid_str));
		psmi_uuid_unparse(ep->uuid, uuid_str);
		hfi_syslog("PSM", 0, LOG_WARNING,
			   "uuid_key=%s,unit=%d,context=%d,subcontext=%d",
			   uuid_str,
			   hfi ? ep->context.ctrl->ctxt_info.unit : -1,
			   hfi ? ep->context.ctrl->ctxt_info.ctxt : -1,
			   hfi ? ep->context.ctrl->ctxt_info.subctxt : -1);
	}

	va_start(ap, format);
	hfi_vsyslog("PSM", to_console, level, format, ap);
	va_end(ap);
}
+
/* Table of CRCs of all 8-bit messages (CRC-32, reflected polynomial
 * 0xedb88320, as in zlib's sample code). */
static uint32_t crc_table[256];

/* Flag: has the table been computed? Initially false. */
static int crc_table_computed;

/* Lazily build the 256-entry lookup table for the byte-at-a-time CRC. */
static void make_crc_table(void)
{
	int n;

	for (n = 0; n < 256; n++) {
		uint32_t c = (uint32_t) n;
		int k;

		for (k = 0; k < 8; k++)
			c = (c & 1) ? (0xedb88320 ^ (c >> 1)) : (c >> 1);
		crc_table[n] = c;
	}
	crc_table_computed = 1;
}

/* Update a running CRC with the bytes buf[0..len-1].  The running CRC
 * must start as all 1's, and the transmitted value is the 1's complement
 * of the final running CRC (see psmi_crc() below). */
static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len)
{
	uint32_t c = crc;
	int n;

	if (!crc_table_computed)
		make_crc_table();
	for (n = 0; n < len; n++)
		c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
	return c;
}

/* Return the CRC of the bytes buf[0..len-1]. */
uint32_t psmi_crc(unsigned char *buf, int len)
{
	return update_crc(0xffffffff, buf, len) ^ 0xffffffff;
}
+
/* Return the HFI type being used for a context.  Only first-generation
 * Omni-Path (OPA1) is supported in this code base, so the 'context'
 * argument is currently unused and the constant is always returned. */
uint32_t psmi_get_hfi_type(const psmi_context_t *context)
{
	return PSMI_HFI_TYPE_OPA1;
}
+
#define PSMI_FAULTINJ_SPEC_NAMELEN 32
/* One named fault-injection point.  Faults fire pseudo-randomly based on
 * num/denom (see psmi_faultinj_is_fault()); num_faults/num_calls record
 * what actually happened for the end-of-run report. */
struct psmi_faultinj_spec {
	STAILQ_ENTRY(psmi_faultinj_spec) next;	/* list linkage */
	char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN];	/* NUL-terminated name */

	unsigned long long num_faults;	/* faults actually injected */
	unsigned long long num_calls;	/* times this point was evaluated */

	struct drand48_data drand48_data;	/* private PRNG stream */
	int num;	/* numerator of the fault ratio */
	int denom;	/* denominator of the fault ratio */

};
+
+int psmi_multi_ep_enabled = 0;
+void psmi_multi_ep_init()
+{
+ union psmi_envvar_val env_fi;
+
+ psmi_getenv("PSM2_MULTI_EP", "PSM2 Multiple Endpoints (yes/no)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+ PSMI_ENVVAR_VAL_NO, &env_fi);
+
+ psmi_multi_ep_enabled = env_fi.e_uint;
+}
+
+int psmi_faultinj_enabled = 0;
+int psmi_faultinj_verbose = 0;
+char *psmi_faultinj_outfile = NULL;
+
+static struct psmi_faultinj_spec psmi_faultinj_dummy;
+static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head =
+STAILQ_HEAD_INITIALIZER(psmi_faultinj_head);
+
+void psmi_faultinj_init()
+{
+ union psmi_envvar_val env_fi;
+
+ psmi_getenv("PSM2_FI", "PSM Fault Injection (yes/no)",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO,
+ PSMI_ENVVAR_VAL_NO, &env_fi);
+
+ psmi_faultinj_enabled = !!env_fi.e_uint;
+
+ if (psmi_faultinj_enabled) {
+ char *def = NULL;
+ if (!psmi_getenv
+ ("PSM2_FI_TRACEFILE", "PSM Fault Injection output file",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)def, &env_fi)) {
+ psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str);
+ }
+ }
+
+ return;
+}
+
+void psmi_faultinj_fini()
+{
+ struct psmi_faultinj_spec *fi;
+ FILE *fp;
+ int do_fclose = 0;
+
+ if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL)
+ return;
+
+ if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0)
+ fp = stdout;
+ else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0)
+ fp = stderr;
+ else {
+ char *c = psmi_faultinj_outfile;
+ char buf[192];
+ int append = 0;
+ if (*c == '+') {
+ append = 1;
+ ++c;
+ }
+ do_fclose = 1;
+ snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel());
+ buf[sizeof(buf) - 1] = '\0';
+ fp = fopen(buf, append ? "a" : "w");
+ }
+
+ if (fp != NULL) {
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ fprintf(fp, "%s:%s PSM2_FI_%-12s %2.3f%% => "
+ "%2.3f%% %10lld faults/%10lld events\n",
+ __progname, hfi_get_mylabel(), fi->spec_name,
+ (double)fi->num * 100.0 / fi->denom,
+ (double)fi->num_faults * 100.0 / fi->num_calls,
+ fi->num_faults, fi->num_calls);
+ }
+ fflush(fp);
+ if (do_fclose)
+ fclose(fp);
+ }
+
+ psmi_free(psmi_faultinj_outfile);
+ return;
+}
+
+/*
+ * Intended to be used only once, not in the critical path
+ */
+struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, int num,
+ int denom)
+{
+ struct psmi_faultinj_spec *fi;
+
+ if (!psmi_faultinj_enabled)
+ return &psmi_faultinj_dummy;
+
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ if (strcmp(fi->spec_name, spec_name) == 0)
+ return fi;
+ }
+
+ /* We got here, so no spec -- allocate one */
+ fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ sizeof(struct psmi_faultinj_spec));
+ psmi_assert_always(fi != NULL);
+ strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1);
+ fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0';
+ fi->num = num;
+ fi->denom = denom;
+ fi->num_faults = 0;
+ fi->num_calls = 0;
+
+ /*
+ * See if we get a hint from the environment.
+ * Format is
+ * <num:denom:initial_seed>
+ *
+ * By default, we chose the initial seed to be the 'pid'. If users need
+ * repeatability, they should set initial_seed to be the 'pid' when the
+ * error was observed or force the initial_seed to be a constant number in
+ * each running process. Using 'pid' is useful because core dumps store
+ * pids and our backtrace format does as well so if a crash is observed for
+ * a specific seed, programs can reuse the 'pid' to regenerate the same
+ * error condition.
+ */
+ {
+ int fvals[3] = { num, denom, (int)getpid() };
+ union psmi_envvar_val env_fi;
+ char fvals_str[128];
+ char fname[128];
+ char fdesc[256];
+
+ snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num,
+ denom);
+ fvals_str[sizeof(fvals_str) - 1] = '\0';
+ snprintf(fname, sizeof(fname) - 1, "PSM2_FI_%s", spec_name);
+ fname[sizeof(fname) - 1] = '\0';
+ snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection %s <%s>",
+ fname, fvals_str);
+
+ if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)fvals_str, &env_fi)) {
+ /* not using default values */
+ int n_parsed =
+ psmi_parse_str_tuples(env_fi.e_str, 3, fvals);
+ if (n_parsed >= 1)
+ fi->num = fvals[0];
+ if (n_parsed >= 2)
+ fi->denom = fvals[1];
+ if (n_parsed >= 3)
+ srand48_r((long int) fvals[2], &fi->drand48_data);
+ }
+ }
+
+ STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next);
+ return fi;
+}
+
/* Decide whether the given fault-injection point should fire this time.
 * Draws from the spec's private PRNG stream so runs are reproducible for
 * a given seed.  Never fires when fault injection is disabled or the
 * spec's numerator is 0.
 *
 * NOTE(review): the comparison uses `<=`, so the effective fault rate is
 * (num + 1)/denom rather than num/denom, and a zero 'denom' would divide
 * by zero.  Both quirks are preserved as upstream behavior. */
int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
{
	if (!psmi_faultinj_enabled)	/* never fault if disabled */
		return 0;
	if (fi->num == 0)
		return 0;

	fi->num_calls++;
	long int rnum;
	lrand48_r(&fi->drand48_data, &rnum);
	if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) {
		fi->num_faults++;
		return 1;
	} else
		return 0;
}
+
+/* For memory allocation, we kind of break the PSM error handling rules.
+ * If the caller gets NULL, it has to assume that the error has been handled
+ * and should always return PSM2_NO_MEMORY */
+
+/*
+ * Log memory increments or decrements of type memstats_t.
+ */
/* Hidden header prepended to tracked allocations when memory statistics
 * are enabled.  'magic' (always 0x8c) catches frees of untracked
 * pointers; 'original_allocation' records the pointer actually returned
 * by malloc/posix_memalign so free can undo any alignment offset. */
struct psmi_memtype_hdr {
	struct {
		uint64_t size:48;	/* allocation size incl. this header */
		uint64_t magic:8;	/* always 0x8c */
		uint64_t type:8;	/* psmi_memtype_t of the allocation */
	};
	void *original_allocation;
};

/* Global running-total / high-water-mark byte counters per memtype. */
struct psmi_stats_malloc psmi_stats_memory;
+
/* Account 'nbytes' (positive on allocation, negative on free) against the
 * per-memtype running total and high-water mark, and against the "all"
 * aggregate.  TOTAL-typed calls update only the aggregate.
 *
 * NOTE(review): the trailing 'psmi_stats_memory.m_all_max++' bumps the
 * aggregate high-water mark on every call, which looks suspicious, but it
 * is preserved from upstream. */
void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes)
{
/* Multi-statement macro deliberately left unwrapped (no do{}while(0));
 * each use below is a complete statement sequence within its case. */
#define _add_max_total(type, nbytes) \
	psmi_stats_memory.m_ ## type ## _total += (nbytes); \
	psmi_stats_memory.m_ ## type ## _max = max( \
		psmi_stats_memory.m_ ## type ## _total, \
		psmi_stats_memory.m_ ## type ## _max);

	switch (type) {
	case PER_PEER_ENDPOINT:
		_add_max_total(perpeer, nbytes);
		break;
	case NETWORK_BUFFERS:
		_add_max_total(netbufs, nbytes);
		break;
	case DESCRIPTORS:
		_add_max_total(descriptors, nbytes);
		break;
	case UNEXPECTED_BUFFERS:
		_add_max_total(unexpbufs, nbytes);
		break;
	case STATS:
		_add_max_total(stats, nbytes);
		break;
	case UNDEFINED:
		_add_max_total(undefined, nbytes);
		break;
	default:
		psmi_assert_always(type == TOTAL);
		break;
	}
	_add_max_total(all, nbytes);
	psmi_stats_memory.m_all_max++;
#undef _add_max_total

	return;
}
+
+// Memory stats will only be collected under debug builds
+
+#ifdef PSM_DEBUG
+#define psmi_stats_mask PSMI_STATSTYPE_MEMORY
+#else
+#define psmi_stats_mask 0
+#endif
+
+#ifdef malloc
+#undef malloc
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* PSM HEAP DEBUG documentation:
+
+ In the following code, the acronym: 'HD' is short for "Heap Debug".
+
+ Each actual heap allocation will have a header and a trailer surrounding it,
+ and the header itself may have some vacant space preceding it due to alignment
+ needs:
+
+ 0. This area is the actual return value of posix_memalign and is due to
+ alignment requirements. (This area does not exist for heap allocations
+ from malloc()).
+ 1. HD HEADER
+ 2. Actual allocation
+ 3. HD TRAILER
+
+ malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code,
+ then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to
+ the caller. Thereafter, the HD code will inspect areas 1 and 3 of all heap
+ allocations to make sure they have retained their integrity.
+
+ Surrounding the actual allocation like this enables:
+
+ 1. Checking for heap overrun / underrun of all allocations.
+ 2. Checking for double frees.
+ 3. Use of an area that has been freed.
+ 4. Identifying orphaned heap allocations.
+
+Constant no-mans-land written to areas that no-one should be writing to:
+
+ */
+
/* Byte value scribbled into guard areas and freed allocations; any read
 * of a live value equal to this in a guard region indicates corruption. */
#define HD_NO_MANS_LAND -15

/* The following is the declaration of the HD header. */

/* Heap debug header magic number type: */
typedef char HD_Hdr_Magic_Type[8];

typedef struct HD_Header_Struct
{
	HD_Hdr_Magic_Type magic1;	/* Magic number to ensure this
					   allocation has integrity.
					   (guards against heap
					   overrun from above). */
	const char *allocLoc;		/* Source file name/line
					   number where this heap
					   allocation was made. */
	const char *freeLoc;		/* Source filename/line number
					   where this heap allocation
					   was freed. */
	struct HD_Header_Struct *nextHD_header;	/* Creates a singly-linked
						   list of all heap
						   allocations. */
	uint64_t sizeOfAlloc;		/* size of this heap
					   allocation. */
	void *systemAlloc;		/* The actual return value
					   from malloc()/posix_memalign(). */
	uint64_t systemAllocSize;	/* The size that is actually allocated
					   by malloc()/posix_memalign(). */
	HD_Hdr_Magic_Type magic2;	/* Second magic number to
					   ensure this allocation
					   has integrity.
					   (guards against heap
					   underrun from the actual
					   allocation that follows). */
} __attribute__ ((packed)) HD_Header_Type;

/* Node of the freed-allocations list; freed blocks are kept (never
 * returned to the system) so use-after-free can be detected. */
typedef struct HD_free_list_struct
{
	HD_Header_Type *freedStuct;
	struct HD_free_list_struct *next_free_struct;
} HD_Free_Struct_Type;

static HD_Free_Struct_Type *HD_free_list_root = NULL;
static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root;

typedef char HD_Trlr_Magic_Type[16];

/* String literals are shorter than their array types; the remainder is
 * zero-filled and compared as part of the magic via memcmp(). */
static const HD_Hdr_Magic_Type HD_HDR_MGC_1 = "Eric";
static const HD_Hdr_Magic_Type HD_HDR_MGC_2 = "Emily";
static const HD_Trlr_Magic_Type HD_TRLR_MGC = "Erin&Elaine";
+
+/* Convert a pointer of an actual allocation to a pointer to its HD header: */
+static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa)
+{
+ char *p = (char*)aa;
+ return (HD_Header_Type*)(p - sizeof(HD_Header_Type));
+}
+
+/* Convert a pointer to an HD header to the actual allocation: */
+static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr)
+{
+ char *p = (char*)phdHdr;
+ return p + sizeof(HD_Header_Type);
+}
+
+/* Get the address of the trailer that follows the actual allocation: */
+static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr)
+{
+ char *p = (char*)HD_HDR_TO_AA(phdr);
+ return p + phdr->sizeOfAlloc;
+}
+
static HD_Header_Type * HD_root_of_list = NULL; /* Root of singly linked list
						   of all heap allocations */
static HD_Header_Type **HD_end_of_list = &HD_root_of_list; /* Pointer to the
	last pointer of the singly linked list of all heap allocations. */

/* Number of allocations in the list. Maintained to assert the integrity
   of the singly linked list of heap allocations. */
static int n_allocations = 0;

/* HD_check_one_struct() checks one heap allocation for integrity:
 * header/trailer magics, the alignment gap preceding the header, and --
 * when 'checkAA' is set (freed blocks) -- that the entire body still
 * holds the HD_NO_MANS_LAND scribble (detects use after free).
 * 'curloc' is only used in the diagnostic printed on failure. */
static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc)
{
	/* First check the magic values in the header and trailer: */
	psmi_assert_always(0 == memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)));
	psmi_assert_always(0 == memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)));
	psmi_assert_always(0 == memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)));

	/* Next, check the area between systemAlloc and the start of the header */
	signed char *pchr = (signed char *)p->systemAlloc;
	while (pchr < (signed char*)p)
	{
		psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND);
		pchr++;
	}

	/* Lastly, check the actual allocation area if directed to do so: */
	if (checkAA)
	{
		uint64_t i;
		signed char *pchr = HD_HDR_TO_AA(p);
		for (i=0;i < p->sizeOfAlloc;i++)
			if (pchr[i] != (signed char) HD_NO_MANS_LAND)
			{
				fprintf(stderr,
					"use after free; ptr: %p,\n"
					"   allocated from: %s,\n"
					"   validated from: %s\n"
					"   freed from: %s\n",
					pchr+i,p->allocLoc,curloc,p->freeLoc);
				fflush(0);
				psmi_assert_always(0);
			}
	}
}
+
+/* _HD_validate_heap_allocations() walks the singly linked list and inspects all
+ * heap allocations to ensure all of them have integrity still. */
+void _HD_validate_heap_allocations(const char *curloc)
+{
+ /* first check current allocation list: */
+ HD_Header_Type *p = HD_root_of_list;
+ int cnt = 0;
+
+ while (p)
+ {
+ HD_check_one_struct(p,0,curloc);
+ p = p->nextHD_header;
+ cnt++;
+ }
+ psmi_assert_always(cnt == n_allocations);
+ /* Next check free list */
+ HD_Free_Struct_Type *pfreestruct = HD_free_list_root;
+ while (pfreestruct)
+ {
+ HD_check_one_struct(pfreestruct->freedStuct,1,curloc);
+ pfreestruct = pfreestruct->next_free_struct;
+ }
+}
+
/* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds
 * the header and trailer to the allocation. Lastly, it validates the existing singly-linked
 * list for integrity. */
static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc,
			    void *systemAlloc,
			    uint64_t systemSize,
			    uint64_t actualSize,
			    const char *curloc)
{
#if 0
	/* if we use this block of code, psm hangs running mpistress. See JIRA STL-5244. */
	memset(systemAlloc,HD_NO_MANS_LAND,systemSize);
#else
	/* write HD_NO_MANS_LAND to the area between the system allocation and the start of the hd header. */
	signed char *pchr = systemAlloc;
	for (;pchr < (signed char*) hd_alloc;pchr++)
		*pchr = (signed char) HD_NO_MANS_LAND;
#endif
	/* Write the HD header info: */
	memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1));
	hd_alloc->allocLoc = curloc;	/* caller's file:line for reports */
	hd_alloc->freeLoc = NULL;	/* set by hd_free() when freed */
	hd_alloc->nextHD_header = NULL;
	hd_alloc->sizeOfAlloc = actualSize;
	hd_alloc->systemAlloc = systemAlloc;
	hd_alloc->systemAllocSize = systemSize;
	memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2));
	memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC));
	/* Append to the singly linked list of live allocations. */
	*HD_end_of_list = hd_alloc;
	HD_end_of_list = &hd_alloc->nextHD_header;
	n_allocations++;
	HD_validate_heap_allocations();
}
+
+/* hd_malloc() is the heap debug version of malloc that will create the header and trailer
+ * and link the allocation into the singly linked list. */
+static inline void *hd_malloc(size_t sz, const char *curloc)
+{
+ const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC);
+ HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize);
+
+ hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc);
+ return HD_HDR_TO_AA(hd_alloc);
+}
+
/* hd_memalign() is the heap debug version of posix_memalign().  It
 * over-allocates by the header, trailer and up to (alignment-1) bytes so
 * that the HD header can sit immediately before an aligned payload;
 * the gap below the header is filled with HD_NO_MANS_LAND by
 * hd_est_hdr_trlr().  Returns posix_memalign()'s result code. */
static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc)
{
	void *systemAlloc = NULL;
	const uint64_t alignMask = alignment - 1;	/* alignment must be a power of 2 */
	uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC);
	int rv = posix_memalign(&systemAlloc,alignment,systemSize);
	char *actualAlloc = NULL;
	const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize;

	if (rv)
		return rv;

	/* Round up past the header to the next 'alignment' boundary; the
	 * payload starts there and the header ends exactly at it. */
	uint64_t actualAllocu64 = (uint64_t) systemAlloc;
	actualAllocu64 += sizeof(HD_Header_Type) + alignMask;
	actualAllocu64 &= ~ alignMask;
	actualAlloc = (char*)actualAllocu64;
	psmi_assert_always((actualAllocu64 & alignMask) == 0);
	psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc);
	psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type));

	hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc);
	*ptr = actualAlloc;
	return rv;
}
+
/* hd_free() is the heap debug version of free(). First, hd_free() ensures that the ptr to be
 * freed in fact is known by the HD code. Next, hd_free() removes the ptr from the list. Then,
 * hd_free scribbles to the ptr's area and -- rather than returning it to the
 * system -- parks it on the free list so later use-after-free can be caught. */
static inline void hd_free(void *ptr,const char *curloc)
{
	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
	HD_Header_Type *p = HD_root_of_list, *q = NULL;

	HD_validate_heap_allocations();
	/* Linear search of the live list; q trails p for unlinking. */
	while (p)
	{
		if (p == hd_alloc)
		{
			/* first, fix the next pointers: */
			if (q)
			{
				q->nextHD_header = p->nextHD_header;
			}
			else
			{
				psmi_assert_always(p == HD_root_of_list);
				HD_root_of_list = p->nextHD_header;
			}
			/* Now, handle the case of removing the last entry in the list. */
			if (&p->nextHD_header == HD_end_of_list)
			{
				if (q)
				{
					q->nextHD_header = NULL;
					HD_end_of_list = &q->nextHD_header;
				}
				else
				{
					HD_root_of_list = NULL;
					HD_end_of_list = &HD_root_of_list;
				}
			}
			/* Scribble to the actual allocation to make further access to the heap
			   area unusable. */
			n_allocations--;
			memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc);
			hd_alloc->freeLoc = curloc;	/* recorded for use-after-free reports */
			/* Add this allocation to the free list.
			 * NOTE(review): this malloc() result is not checked;
			 * on OOM the deref below would crash -- TODO confirm
			 * whether that is acceptable for a debug-only build. */
			HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type));
			*HD_free_list_bottom = pfreestruct;
			HD_free_list_bottom = &pfreestruct->next_free_struct;
			pfreestruct->freedStuct = hd_alloc;
			pfreestruct->next_free_struct = NULL;
			HD_validate_heap_allocations();
			return;
		}
		q = p;
		p = p->nextHD_header;
	}
	/* trying to free a heap allocation that we did not allocate. */
	psmi_assert_always(0);
}
+
/* Heap debug version of malloc_usable_size().  'curloc' is unused.
 * NOTE(review): returns systemAllocSize, which includes the HD header,
 * trailer and alignment slack -- larger than the caller-usable payload;
 * psmi_realloc_internal() relies on it only as an upper bound. */
size_t hd_malloc_usable_size(void *ptr,const char *curloc)
{
	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
	return hd_alloc->systemAllocSize;
}
+
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* For HD code, we retarget the malloc, memaligh and free calls to the hd versions
+ * of the code. */
+
+#define my_malloc(SZ,CURLOC) hd_malloc(SZ,CURLOC)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC) hd_memalign(PTR,ALIGN,SZ,CURLOC)
+#define my_free(PTR,CURLOC) hd_free(PTR,CURLOC)
+#define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC)
+
+#else
+
+/* For non-HD code, we target the code to the usual functions: */
+#define my_malloc(SZ,CURLOC) malloc(SZ)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC) posix_memalign(PTR,ALIGN,SZ)
+#define my_free(PTR,CURLOC) free(PTR)
+#define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR)
+
+#endif
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+ size_t sz, const char *curloc)
+{
+ size_t newsz = sz;
+ void *newa;
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+ newsz += sizeof(struct psmi_memtype_hdr);
+
+ newa = my_malloc(newsz,curloc);
+ if (newa == NULL) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Out of memory for malloc at %s", curloc);
+ return NULL;
+ }
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+ struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa;
+ hdr->size = newsz;
+ hdr->type = type;
+ hdr->magic = 0x8c;
+ hdr->original_allocation = newa;
+ psmi_log_memstats(type, newsz);
+ newa = (void *)(hdr + 1);
+ /* _HFI_INFO("alloc is %p\n", newa); */
+ }
+ return newa;
+}
+
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+ void *ptr, size_t nsz, const char *curloc)
+{
+ if (ptr)
+ {
+ size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc);
+ if (nsz > existingSize)
+ {
+ void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc);
+
+ memcpy(newPtr,ptr,existingSize);
+ psmi_free_internal(ptr,curloc);
+ return newPtr;
+ }
+ else
+ /* We will not support shrinking virtual space
+ for performance reasons. */
+ return ptr;
+ }
+ else
+ return psmi_malloc_internal(ep,type,nsz,curloc);
+}
+
+#ifdef memalign
+#undef memalign
+#endif
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type,
+ size_t alignment, size_t sz, const char *curloc)
+{
+ size_t newsz = sz;
+ void *newa;
+ int ret, preambleSize = 0;
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+ {
+ if (sizeof(struct psmi_memtype_hdr) > alignment)
+ {
+ int n = sizeof(struct psmi_memtype_hdr) / alignment;
+ int r = sizeof(struct psmi_memtype_hdr) % alignment;
+ if (r)
+ n++;
+ preambleSize = n * alignment;
+ }
+ else
+ preambleSize = alignment;
+ newsz += preambleSize;
+ }
+
+ ret = my_memalign(&newa, alignment, newsz, curloc);
+ if (ret) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Out of memory for malloc at %s", curloc);
+ return NULL;
+ }
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+ void *rv = newa + preambleSize;
+ struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)(rv-sizeof(struct psmi_memtype_hdr));
+ hdr->size = newsz;
+ hdr->type = type;
+ hdr->magic = 0x8c;
+ hdr->original_allocation = newa;
+ psmi_log_memstats(type, newsz);
+ newa = rv;
+ /* _HFI_INFO("alloc is %p\n", newa); */
+ }
+ return newa;
+}
+
+#ifdef calloc
+#undef calloc
+#endif
+
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem,
+ size_t elemsz, const char *curloc)
+{
+ void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc);
+ if (newa == NULL) /* error handled above */
+ return NULL;
+ memset(newa, 0, nelem * elemsz);
+ return newa;
+}
+
+#ifdef strdup
+#undef strdup
+#endif
+
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc)
+{
+ size_t len = strlen(string) + 1;
+ void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc);
+ if (newa == NULL)
+ return NULL;
+ memcpy(newa, string, len); /* copy with \0 */
+ return newa;
+}
+
#ifdef free
#undef free
#endif

/* free() replacement for memory obtained from the psmi_*alloc_internal
   family.  When memory statistics are enabled, the bookkeeping header
   stored directly in front of 'ptr' is used to update the stats and to
   recover the original (possibly padded) allocation before releasing it. */
void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc)
{
	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
		/* The header sits immediately before the user pointer
		   (see psmi_memalign_internal, which writes it there). */
		struct psmi_memtype_hdr *hdr =
		    (struct psmi_memtype_hdr *)ptr - 1;
		/* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */
		psmi_memtype_t type = hdr->type;
		int64_t size = hdr->size;
		int magic = (int)hdr->magic;
		psmi_log_memstats(type, -size);
		/* 0x8c is written at allocation time; a mismatch means
		   'ptr' did not come from the psmi allocator or the
		   header was corrupted. */
		psmi_assert_always(magic == 0x8c);
		ptr = hdr->original_allocation;
	}
	my_free(ptr,curloc);
}
MOCK_DEF_EPILOGUE(psmi_free_internal);
+
#ifdef malloc_usable_size
#undef malloc_usable_size
#endif

/* malloc_usable_size() replacement: reports the usable size of an
   allocation made through the psmi allocation wrappers. */
size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc)
{
	const size_t usable = my_malloc_usable_size(ptr, curLoc);
	return usable;
}
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_coreopt_ctl(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen, int get))
+{
+ psm2_error_t err = PSM2_OK;
+
+ switch (optname) {
+ case PSM2_CORE_OPT_DEBUG:
+ /* Sanity check length */
+ if (*optlen < sizeof(unsigned)) {
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Option value length error");
+ *optlen = sizeof(unsigned);
+ return err;
+ }
+
+ if (get) {
+ *((unsigned *)optval) = hfi_debug;
+ } else
+ hfi_debug = *(unsigned *)optval;
+ break;
+ case PSM2_CORE_OPT_EP_CTXT:
+ {
+ /* core object is epaddr */
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+
+ /* Sanity check epaddr */
+ if (!epaddr) {
+ return psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Invalid endpoint address");
+ }
+
+ /* Sanity check length */
+ if (*optlen < sizeof(unsigned long)) {
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Option value length error");
+ *optlen = sizeof(void *);
+ return err;
+ }
+
+ if (get) {
+ *((unsigned long *)optval) =
+ (unsigned long)epaddr->usr_ep_ctxt;
+ } else
+ epaddr->usr_ep_ctxt = optval;
+ }
+ break;
+ default:
+ /* Unknown/unrecognized option */
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Unknown PSM2_CORE option %u.",
+ optname);
+ break;
+ }
+ return err;
+}
+
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1);
+}
+
+/* PSM AM component option handling */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_amopt_ctl(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen, int get))
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* AM object is a psm2_epaddr (or NULL for global minimum sz) */
+ /* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */
+
+ /* All AM options are read-only. */
+ if (!get) {
+ return err =
+ psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY,
+ "Attempted to set read-only option value");
+ }
+
+ /* Sanity check length -- all AM options are uint32_t. */
+ if (*optlen < sizeof(uint32_t)) {
+ *optlen = sizeof(uint32_t);
+ return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR,
+ "Option value length error");
+ }
+
+ switch (optname) {
+ case PSM2_AM_OPT_FRAG_SZ:
+ *((uint32_t *) optval) = psmi_am_parameters.max_request_short;
+ break;
+ case PSM2_AM_OPT_NARGS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_nargs;
+ break;
+ case PSM2_AM_OPT_HANDLERS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_handlers;
+ break;
+ default:
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown PSM2_AM option %u.", optname);
+ }
+
+ return err;
+}
+
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1);
+}
+
+#ifdef PSM_LOG
+
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include "ptl_ips/ips_proto_header.h"
+
/* A treeNode is used to store the list of Function Name Lists that
   are passed to the PSM_LOG facility via environment variables.
   See psm_log.h for more information.

   Note that treeNode is a node in a binary tree data structure. */
typedef struct _treeNode
{
	const char *name;		/* function name (strdup'd copy) */
	int line1,line2;		/* inclusive line-number range */
	struct _treeNode *left,*right;	/* BST children: ordered by name,
					   then by line range */
} treeNode;
+
/* An epmTreeNode is used to track the number of protocol packets
   that are sent/received, for a given opcode, and source epid
   to another epid. */
typedef struct _epmTreeNode
{
	int opcode,count,txrx;		/* packet opcode, tally, direction */
	uint64_t fromepid,toepid;	/* endpoint ids of the two sides */
	struct _epmTreeNode *left,*right;	/* BST children */
} epmTreeNode;
+
+
/* Grow the inclusive line range [*line1 .. *line2] to absorb 'line' when
   'line' falls inside the range or immediately abuts it (one before
   *line1 or one after *line2).  Returns 1 when the line was joinable
   (the range may be unchanged if 'line' was already inside it), 0 when
   it neither overlaps nor abuts.

   Examples, starting from the range [20 .. 30]:

	19  ->  [19 .. 30]   returns 1
	31  ->  [20 .. 31]   returns 1
	25  ->  [20 .. 30]   returns 1
	18  ->  unchanged    returns 0
	32  ->  unchanged    returns 0 */
static int joinOverlap(int *line1,int *line2,int line)
{
	/* Widen before the +/-1 arithmetic so INT_MIN/INT_MAX inputs
	   cannot overflow. */
	const long long candidate = line;

	if (candidate + 1 < *line1 || candidate - 1 > *line2)
		return 0;	/* neither inside nor abutting */

	if (line < *line1)
		*line1 = line;
	if (line > *line2)
		*line2 = line;
	return 1;
}
+
/* Merge the range [l1 .. l2] into [*line1 .. *line2] when they overlap
   or abut.  Returns the number of joinable endpoints (1 or 2) when a
   merge happened, 0 when the two ranges neither overlap nor abut.

   Examples for *line1=20, *line2=30:

	[20 30]  ->  [20 30]   2
	[19 30]  ->  [19 30]   2
	[19 20]  ->  [19 30]   2
	[10 15]  ->  unchanged 0
	[40 50]  ->  unchanged 0 */
static int joinOverlapRange(int *line1,int *line2,int l1,int l2)
{
	int joined = joinOverlap(line1, line2, l1);

	joined += joinOverlap(line1, line2, l2);
	return joined;
}
+
+/* inserts a new treeNode into the FNL tree, or, merges the lines that are already
+ present in the tree. */
+static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2)
+{
+ if (*root)
+ {
+ int c = strcmp(name,(*root)->name);
+ if (c < 0)
+ insertNodeInTree(&((*root)->left),name,line1,line2);
+ else if (c > 0)
+ insertNodeInTree(&((*root)->right),name,line1,line2);
+ else
+ {
+ if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2))
+ return;
+ else if (line1 < (*root)->line1)
+ insertNodeInTree(&((*root)->left),name,line1,line2);
+ else if (line2 > (*root)->line2)
+ insertNodeInTree(&((*root)->right),name,line1,line2);
+ else psmi_assert_always(0); /* should never happen. */
+ }
+ }
+ else
+ {
+ *root = malloc(sizeof(treeNode));
+ (*root)->name = strdup(name);
+ (*root)->line1 = line1;
+ (*root)->line2 = line2;
+ (*root)->left = (*root)->right = NULL;
+ }
+}
+
+/* Returns -1 if the data in the node is less than the data supplied as parameter, else
+ Returns 1 if the data in the node is greater than the data supplied as parameter, else
+ Returns 0.
+ */
+static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+#define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1
+ COMPARE_ONE(opcode);
+ COMPARE_ONE(txrx);
+ COMPARE_ONE(fromepid);
+ COMPARE_ONE(toepid);
+ return 0;
+}
+
+/* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree.
+ In either case, this code returns a pointer to the count in the node. */
+static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+ if (*root)
+ {
+ int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid);
+ if (a < 0)
+ return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid);
+ else if (a > 0)
+ return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid);
+ else
+ return &((*root)->count);
+ }
+ else
+ {
+ *root = malloc(sizeof(epmTreeNode));
+ (*root)->opcode = opcode;
+ (*root)->txrx = txrx;
+ (*root)->count = 0;
+ (*root)->fromepid = fromepid;
+ (*root)->toepid = toepid;
+ (*root)->left = (*root)->right = NULL;
+ return &((*root)->count);
+ }
+}
+
+/* returns 0, if the node is present, non-zero if it is absent. */
+static int lookupNodeInTree(const treeNode *root,const char *name,int line)
+{
+ if (root)
+ {
+ int c = strcmp(name,root->name);
+ if (c < 0)
+ return lookupNodeInTree(root->left,name,line);
+ else if (c > 0)
+ return lookupNodeInTree(root->right,name,line);
+ else
+ {
+ if (line < root->line1)
+ return lookupNodeInTree(root->left,name,line);
+ else if (line > root->line2)
+ return lookupNodeInTree(root->right,name,line);
+ else /* line must be >= root->line1 and line must be <= root->line2. */
+ return 0;
+ }
+ }
+ else
+ {
+ return 1;
+ }
+}
+
/* Declare a prototype for a parserFunc - referenced in the following code: */
typedef void parserFunc(char *,int,int,void *);

/* Splits 'ps' in place at every occurrence of the delimiter 'c' and
   invokes pf(substring, index, delimiterCount, ctx) for each piece —
   including the final piece after the last delimiter.  Note that 'ps'
   is mutated: each delimiter is overwritten with '\0'. */
static void parseString(char *ps,char c,parserFunc pf,void *ctx)
{
	int delims = 0;
	int piece;
	char *cursor;
	char *sep;

	/* Pre-count the delimiters; the count is handed to every callback. */
	for (cursor = ps; *cursor; cursor++)
		if (*cursor == c)
			delims++;

	/* Emit each delimiter-terminated piece in order. */
	piece = 0;
	cursor = ps;
	while (cursor && *cursor && (sep = strchr(cursor, c)) != NULL)
	{
		*sep = 0;
		pf(cursor, piece, delims, ctx);
		cursor = sep + 1;
		piece++;
	}
	/* finally, call pf on the final substring. */
	pf(cursor, piece, delims, ctx);
}
+
/* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */
typedef struct
{
	/* function name of the FN currently being parsed (points into
	   the scratch copy of the environment string). */
	const char *currentFuncName;
	/* lhs of a 'lineNumber1 - lineNumber2' range, cached until the
	   rhs is parsed. */
	int firstLineNumber;
	/* root of the FNL tree being built (include or exclude tree). */
	treeNode **root;
} funcNameCtx;
+
+/* This is the start of the parser code for parsing FNL's. Here is the grammar:
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number of a range of line numbers:
+(1) LINE1 :: lineNumber |
+(2) lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+(3) LINES :: LINE1 |
+(4) LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+(5) FN :: functionName |
+(6) functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+(7) FNL :: FN |
+(8) FN ':' FNL
+
+ # Examples:
+ foo:bar the two functions foo and bar
+ foo;1-10 lines 1 to 10 of function foo.
+ bar;1,3,5 lines 1, 3 and 5 of function bar
+
+*/
+
+/* p4() inserts a (function name and line number) pair into the FNL tree or a (function name and line number range) in the FNL tree.
+*/
+static void p4(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0) /* production (1) */
+ {
+ pfnc->firstLineNumber = atoi(s);
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber);
+ }
+ else if (n == 1) /* production (2) */
+ {
+ if (idx == 0) /* lhs of production (2) */
+ pfnc->firstLineNumber = atoi(s);
+ else /* rhs of production (2). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s));
+ }
+}
+
+/* p3 puts an entry into the FNL tree for all of the lines of a given functionname, or, it parses the list of line number ranges and
+ uses p4 to spill each individual range (or just one line number) into the tree */
+static void p3(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0 && *s == 0) /* production (5)/(7) */
+ {
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX);
+ }
+ else if (*s) /* production (2) */
+ {
+ /* breakdown the string into hyphen-delimited substrings, and further parses each substring with p4: */
+ parseString(s,'-',p4,ctx);
+ }
+}
+
+/* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */
+static void p2(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n)
+ {
+ if (idx == 0)
+ pfnc->currentFuncName = s;
+ else
+ {
+ /* production (4) */
+ /* breakdown the string into comma-delimited substrings, and further parses each substring with p3: */
+ parseString(s,',',p3,ctx);
+ }
+ }
+ else
+ {
+ /* production (7)/(5). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX);
+ }
+}
+
+/* p1 parses each function name and line range list. */
+static void p1(char *s,int idx,int n,void *ctx)
+{
+ /* production (5)/(6)) */
+ /* breakdown the string into semi-colon-delimited substrings, and further parses each substring with p2: */
+ parseString(s,';',p2,ctx);
+}
+
+static void parseAndInsertInTree(const char *buf,treeNode **root)
+{
+ funcNameCtx t;
+ t.root = root;
+ char *p = alloca(strlen(buf)+1);
+ strcpy(p,buf);
+ /* productions (7)/(8) */
+ /* separates string into colon-separated strings, and then parses each substring in p1: */
+ parseString(p,':',p1,(void*)&t);
+}
+
/* initialization code for the psmi log mechanism. */
/* One-time, thread-safe initialization: reads the PSM2_LOG_* environment
   variables and fills in the file-name stem, the optional format-string
   filter, and the include/exclude FNL trees.
   NOTE(review): this is classic double-checked locking on a plain
   'volatile int'; the unlocked first read is technically racy, but the
   mutex-protected re-check keeps initialization itself single-shot. */
static inline void psmi_initialize(const char **plmf_fileName_kernel,
				   const char **plmf_search_format_string,
				   treeNode **includeFunctionNamesTreeRoot,
				   treeNode **excludeFunctionNamesTreeRoot)
{
	static volatile int plmf_initialized = 0;

	if (!plmf_initialized)
	{
		static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER;

		if (pthread_mutex_lock(&plmf_init_mutex))
		{
			perror("cannot lock mutex for psmi_log_message facility");
			return;
		}
		/* CRITICAL SECTION BEGIN */
		if (!plmf_initialized)
		{
			/* initializing psmi log message facility here. */
			const char *env = getenv("PSM2_LOG_FILENAME");
			if (env)
				*plmf_fileName_kernel = env;
			env = getenv("PSM2_LOG_SRCH_FORMAT_STRING");
			if (env)
			{
				/* A format-string filter takes precedence:
				   the FNL trees are not built at all. */
				*plmf_search_format_string = env;
			}
			else
			{
				env = getenv("PSM2_LOG_INC_FUNCTION_NAMES");
				if (env)
				{
					parseAndInsertInTree(env,includeFunctionNamesTreeRoot);
				}
				env = getenv("PSM2_LOG_EXC_FUNCTION_NAMES");
				if (env)
				{
					parseAndInsertInTree(env,excludeFunctionNamesTreeRoot);
				}
			}
			/* initialization of psmi log message facility is completed. */
			plmf_initialized = 1;
		}
		/* CRITICAL SECTION END */
		if (pthread_mutex_unlock(&plmf_init_mutex))
		{
			perror("cannot unlock mutex for psmi_log_message facility");
			return;
		}
	}
}
+
+/* Utility function to map the integer txrx value to the given strings for emitting to the log file. */
+static const char * const TxRxString(int txrx)
+{
+ switch(txrx)
+ {
+ case PSM_LOG_EPM_TX: return "Sent";
+ case PSM_LOG_EPM_RX: return "Received";
+ default: return "Unknown";
+ }
+}
+
+/* Utility function to map an integer opcode value to the given strings for emitting to the log file. */
+static const char * const OpcodeString(int opcode)
+{
+ switch(opcode)
+ {
+ case OPCODE_LONG_RTS: return "RTS";
+ case OPCODE_LONG_CTS: return "CTS";
+ case OPCODE_LONG_DATA: return "DATA";
+ case OPCODE_EXPTID: return "EXPTID";
+ case OPCODE_EXPTID_COMPLETION: return "EXPTID_COMPLETION";
+ default: return "UNKNOWN";
+ }
+}
+
/* Log-facility state, overridden from the environment in psmi_initialize():
     PSM2_LOG_FILENAME               -> plmf_fileName_kernel (log path stem)
     PSM2_LOG_SRCH_FORMAT_STRING     -> plmf_search_format_string
     PSM2_LOG_INC/EXC_FUNCTION_NAMES -> include/exclude FNL trees */
static const char *plmf_fileName_kernel = "/tmp/psm2_log";
static const char *plmf_search_format_string = NULL;
static treeNode *includeFunctionNamesTreeRoot = NULL;
static treeNode *excludeFunctionNamesTreeRoot = NULL;
+
+void psmi_log_initialize(void)
+{
+ /* If not initialized, then, initialize in a single thread of execution. */
+ psmi_initialize(&plmf_fileName_kernel,
+ &plmf_search_format_string,
+ &includeFunctionNamesTreeRoot,
+ &excludeFunctionNamesTreeRoot);
+}
+
#ifdef PSM_LOG_FAST_IO

/* Per-thread buffered-logging state: with fast IO, log records are
   appended to an in-memory buffer and only written to disk when
   psmi_log_fini() flushes them. */
struct psmi_log_io_thread_info
{
	pthread_t thread_id;	/* owning thread */
	char *buff;		/* append-only in-memory log buffer */
	unsigned long max_buff_length, curr_buff_length;	/* capacity / used */
	pthread_mutex_t flags_mutex;	/* guards 'flags' */
	volatile int flags;
#define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1 /* io is currently in progress */
#define PSMI_LOG_IO_FLAG_IO_SHUTDOWN 2 /* we are shutting down logging. */
};
+
/* Please note that psmi_log_io_info is in thread local storage. */
/* Each thread owns one of these; it is registered in psmi_log_io_table
   on that thread's first log call (see psmi_log_register_tls()). */
static __thread struct psmi_log_io_thread_info psmi_log_io_info =
{
	.thread_id = 0,
	.buff = NULL,
	.max_buff_length = 0,
	.curr_buff_length = 0,
	.flags_mutex = PTHREAD_MUTEX_INITIALIZER,
	.flags = 0
};
+
/* Global registry of every thread's psmi_log_io_thread_info so that
   psmi_log_fini() can flush all per-thread buffers.  The table grows by
   doubling and is protected by table_mutex. */
static struct
{
	unsigned int nTableEntries,maxTableEntries;	/* used / capacity */
	pthread_mutex_t table_mutex;
	struct psmi_log_io_thread_info **table;
} psmi_log_io_table =
{
	.nTableEntries = 0,
	.maxTableEntries = 0,
	.table_mutex = PTHREAD_MUTEX_INITIALIZER,
	.table = NULL
};
+
/* Flush and tear down fast-IO logging: for every registered thread,
   signal shutdown, wait for any in-flight logging on that thread to
   drain, write its buffer to a per-thread "<stem>.<pid>.<tid>" file,
   and finally release the registry table. */
void psmi_log_fini()
{
	if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
	{
		perror("Cannot lock mutex for psmi_log_io_table");
		return;
	}
	/* Start critical section. */

	unsigned int i;
	for (i=0;i < psmi_log_io_table.nTableEntries;i++)
	{
		if (psmi_log_io_table.table[i])
		{
			struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i];
			int flags;

			if (pthread_mutex_lock(&pti->flags_mutex))
			{
				perror("can't lock the flags mutex.");
				continue;
			}
			/* critical section */
			flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN);
			/* end critical section */
			pthread_mutex_unlock(&pti->flags_mutex);
			/* if io is currently in progress, allow it to complete. */
			while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
			{
				sleep(1);
				if (pthread_mutex_lock(&pti->flags_mutex))
				{
					perror("can't lock the flags mutex.");
					continue;
				}
				flags = pti->flags;
				pthread_mutex_unlock(&pti->flags_mutex);
			}
			if (pti->buff)
			{
				char logFileName[256];
				FILE *fout;

				/* One output file per thread: stem.pid.tid */
				snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
					 plmf_fileName_kernel,getpid(),pti->thread_id);
				fout = fopen(logFileName,"w");
				if (!fout)
				{
					perror(logFileName);
					continue;
				}
				fwrite(pti->buff,pti->curr_buff_length,1,fout);
				fclose(fout);
			}
		}
		psmi_log_io_table.table[i] = NULL;
	}
	psmi_log_io_table.nTableEntries = 0;
	psmi_free(psmi_log_io_table.table);
	psmi_log_io_table.table = NULL;
	psmi_log_io_table.maxTableEntries = 0;
	/* End critical section. */
	pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
}
+
/* Register the calling thread's TLS logging state in the global table
   (on the thread's first log call) and try to mark IO as in-progress.
   Returns 0 when the caller may proceed to log into its buffer, -1 on
   error or when logging is shutting down. */
static int psmi_log_register_tls(void)
{
	if (psmi_log_io_info.thread_id != pthread_self())
	{
		/* First log call from this thread: add its TLS block to
		   the global registry, growing the table by doubling. */
		psmi_log_io_info.thread_id = pthread_self();
		if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
		{
			perror("cannot lock table mutex");
			return -1;
		}
		/* critical section start. */
		if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1)
		{
			if (psmi_log_io_table.maxTableEntries == 0)
			{
				psmi_log_io_table.maxTableEntries = 2;
				psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE,
								      PER_PEER_ENDPOINT,
								      psmi_log_io_table.maxTableEntries *
								      sizeof(struct psmi_log_io_thread_info *));
			}
			else
			{
				psmi_log_io_table.maxTableEntries *= 2;
				psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE,
								       PER_PEER_ENDPOINT,
								       psmi_log_io_table.table,
								       psmi_log_io_table.maxTableEntries *
								       sizeof(struct psmi_log_io_thread_info *));
			}
		}
		psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info;
		psmi_log_io_table.nTableEntries++;
		/* critical section end. */
		pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
	}
	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
	{
		perror("cannot lock table mutex");
		return -1;
	}
	/* critical section start. */
	int old_flags = psmi_log_io_info.flags;
	int new_flags = old_flags;
	/* Only claim IO_IN_PROGRESS when not shutting down; psmi_log_fini()
	   spins until this flag drops. */
	if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN))
		new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
	psmi_log_io_info.flags = new_flags;
	/* critical section end. */
	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
	if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
		return 0;
	return -1;
}
+
/* "fclose" replacement for fast-IO logging: clears this thread's
   IO_IN_PROGRESS flag so a concurrent psmi_log_fini() can proceed.
   'port' exists only to mirror the stdio-style interface; it is unused. */
static void psmi_buff_fclose(int port)
{
	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
	{
		perror("cannot lock table mutex");
		return;
	}
	/* critical section start. */
	psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
	/* critical section end. */
	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
}
+
+static void growBuff(size_t minExcess)
+{
+ while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length)
+ {
+ if (!psmi_log_io_info.buff)
+ psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.max_buff_length = 1 << 20);
+ else
+ {
+ psmi_log_io_info.max_buff_length *= 2;
+ psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.buff,
+ psmi_log_io_info.max_buff_length);
+ }
+ }
+}
+
+static int psmi_buff_vfprintf(int port, const char *format, va_list ap)
+{
+ int done = 0;
+ size_t excess = 1024;
+ int length;
+
+ while (!done)
+ {
+ growBuff(excess);
+
+ length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length,
+ excess, format, ap);
+ if (length >= excess)
+ excess *= 2;
+ else
+ done = 1;
+ }
+ psmi_log_io_info.curr_buff_length += length;
+ return length;
+}
+
/* printf-style front end to psmi_buff_vfprintf for the in-memory log
   buffer; returns the number of characters appended. */
static int psmi_buff_fprintf(int port,const char *format, ...)
{
	va_list args;
	int written;

	va_start(args, format);
	written = psmi_buff_vfprintf(port, format, args);
	va_end(args);

	return written;
}
+
+static int psmi_buff_fputc(int c, int port)
+{
+ growBuff(1024);
+ psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c;
+ psmi_log_io_info.curr_buff_length++;
+ return 1;
+}
+#endif
+
+
/* plmf is short for 'psm log message facility. All of the PSM_LOG macros defined in psm_log.h
   are serviced from this back end.

   Processing has two stages.  First, the message is filtered: either
   against the format-string pattern (PSM2_LOG_SRCH_FORMAT_STRING, via
   fnmatch) or against the include/exclude function-name trees.  Second,
   surviving messages are formatted and emitted — into the per-thread
   in-memory buffer under PSM_LOG_FAST_IO, or appended directly to a
   per-process/per-thread file otherwise.  Two sentinel values of
   'format' change the vararg calling convention: PSM_LOG_BT_MAGIC
   (backtrace dump) and PSM_LOG_EPM_MAGIC (protocol-packet accounting). */
void psmi_log_message(const char *fileName,
		      const char *functionName,
		      int lineNumber,
		      const char *format, ...)
{
	va_list ap;

	va_start(ap, format);

	/* Next, determine if this log message is signal or noise. */
	if (plmf_search_format_string)
	{
		if((format != PSM_LOG_BT_MAGIC) && (format != PSM_LOG_EPM_MAGIC))
		{
			if (fnmatch(plmf_search_format_string, format, 0))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}
	}
	else
	{
		if (includeFunctionNamesTreeRoot)
		{
			if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}

		if (excludeFunctionNamesTreeRoot)
		{
			if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}
	}

	/* At this point, we think that this may be a message that we want to emit to the log.
	   But, there is one more test, to apply to the two cases where (format == PSM_LOG_BT_MAGIC
	   and format == PSM_LOG_EPM_MAGIC. */
	{
		void **voidarray = NULL; /*va_arg(ap,void **);*/
		int nframes = 0; /*va_arg(ap,int);*/
		const char *newFormat = format; /*va_arg(ap,const char *);*/
		int opcode = 0;
		int txrx = 0;
		uint64_t fromepid = 0;
		uint64_t toepid = 0;

/* Select the output back end: in-memory buffer (fast IO) or stdio. */
#ifdef PSM_LOG_FAST_IO
#define IO_PORT 0
#define MY_FPRINTF psmi_buff_fprintf
#define MY_VFPRINTF psmi_buff_vfprintf
#define MY_FPUTC psmi_buff_fputc
#define MY_FCLOSE psmi_buff_fclose
#else
		char logFileName[256];
		FILE *fout;
#define IO_PORT fout
#define MY_FPRINTF fprintf
#define MY_VFPRINTF vfprintf
#define MY_FPUTC fputc
#define MY_FCLOSE fclose
#endif
		struct timespec tp;

		if (format == PSM_LOG_BT_MAGIC)
		{
			/* Backtrace message: the real varargs are
			   (void **frames, int nframes, const char *fmt, ...). */
			voidarray = va_arg(ap,void **);
			nframes = va_arg(ap,int);
			newFormat = va_arg(ap,const char *);
			/* One last test to make sure that this message is signal: */
			if (plmf_search_format_string)
			{
				{
					if (fnmatch(plmf_search_format_string, newFormat, 0))
					{
						va_end(ap);
						/* tis noise, return. */
						return;
					}
				}
			}
		}
		else if (format == PSM_LOG_EPM_MAGIC)
		{
			/* Protocol-packet message: the real varargs are
			   (int opcode, int txrx, uint64_t from, uint64_t to,
			    const char *fmt, ...). */
			opcode = va_arg(ap,int);
			txrx = va_arg(ap,int);
			fromepid = va_arg(ap,uint64_t);
			toepid = va_arg(ap,uint64_t);
			newFormat = va_arg(ap,const char *);
			/* One last test to make sure that this message is signal: */
			if (plmf_search_format_string)
			{
				{
					if (fnmatch(plmf_search_format_string, newFormat, 0))
					{
						va_end(ap);
						/* tis noise, return. */
						return;
					}
				}
			}
		}

#ifdef PSM_LOG_FAST_IO
		if (psmi_log_register_tls() != 0)
		{
			va_end(ap);
			return;
		}
#else
		/* At this point we know that the message is not noise, and it is going to be emitted to the log. */
		snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
			 plmf_fileName_kernel,getpid(),
			 pthread_self());
		fout = fopen(logFileName,"a");
		if (!fout)
		{
			va_end(ap);
			return;
		}
#endif

/* M1 emits the "<timestamp> <function> <file>:<line>: " record prefix. */
#define M1() clock_gettime(CLOCK_REALTIME, &tp);			\
	MY_FPRINTF(IO_PORT,"%f %s %s:%d: ",				\
		   (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0), \
		   functionName,fileName,lineNumber)

		M1();

		if ((format != PSM_LOG_BT_MAGIC) && (format != PSM_LOG_EPM_MAGIC))
		{
			/* Plain message: format the varargs directly. */
			MY_VFPRINTF(IO_PORT,format,ap);
			MY_FPUTC('\n',IO_PORT);
		}
		else if (format == PSM_LOG_BT_MAGIC)
		{
			void *newframes[PSM_LOG_BT_BUFFER_SIZE];
			int newframecnt = backtrace(newframes,
						    PSM_LOG_BT_BUFFER_SIZE);
			int pframes = min(newframecnt,nframes);

			MY_VFPRINTF(IO_PORT,newFormat,ap);
			MY_FPUTC('\n',IO_PORT);

			/* Only dump the backtrace when it differs from the
			   one cached in the caller-supplied array. */
			if (memcmp(voidarray,newframes,pframes * sizeof(void*)))
			{
				int i;
				char **strings;

				memcpy(voidarray,newframes,sizeof(newframes));
				M1();
				MY_FPRINTF(IO_PORT,
					   "backtrace() returned %d addresses\n",
					   newframecnt);

				strings = backtrace_symbols(voidarray, pframes);
				if (strings == NULL)
				{
					perror("backtrace_symbols");
					exit(EXIT_FAILURE);
				}

				for (i = 0; i < pframes; i++)
				{
					M1();
					MY_FPRINTF(IO_PORT,"%s\n", strings[i]);
				}

				/* backtrace_symbols() memory must be released
				   with the system free(), not a psmi wrapper. */
#undef free
				free(strings);
			}

		}
		else /* (format == PSM_LOG_EPM_MAGIC) */
		{
			/* Packet accounting: bump the counter for this
			   (opcode, direction, from, to) tuple and emit it. */
			static epmTreeNode *root = 0;
			static pthread_mutex_t plmf_epm_mutex =
				PTHREAD_MUTEX_INITIALIZER;
			int *pcount = 0;
			if (pthread_mutex_lock(&plmf_epm_mutex))
			{
				perror("cannot lock mutex for "
				       "psmi_log_message facility");
				va_end(ap);
				return;
			}
			/* START OF CRITICAL SECTION */
			pcount = insertNodeInEpmTree(&root,opcode,txrx,
						     fromepid,toepid);
			/* END OF CRITICAL SECTION */
			if (pthread_mutex_unlock(&plmf_epm_mutex))
			{
				perror("cannot unlock mutex for "
				       "psmi_log_message facility");
				va_end(ap);
				return;
			}
			(*pcount)++;
			MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64
				   ", to: %" PRIx64 ", count: %d, ",
				   TxRxString(txrx),OpcodeString(opcode),
				   fromepid,toepid,*pcount);
			MY_VFPRINTF(IO_PORT,newFormat,ap);
			MY_FPUTC('\n',IO_PORT);
		}
		MY_FCLOSE(IO_PORT);
	}

	va_end(ap);
}
+#endif /* #ifdef PSM_LOG */
diff --git a/psm_utils.h b/psm_utils.h
new file mode 100644
index 0000000..07d198b
--- /dev/null
+++ b/psm_utils.h
@@ -0,0 +1,379 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_IN_USER_H
+#error psm_utils.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_UTILS_H
+#define _PSMI_UTILS_H
+
+#include <arpa/inet.h> /* ipv4addr */
+#include <stdlib.h> /* malloc/free */
+
+/*
+ * Endpoint 'id' hash table, with iterator interface
+ */
+struct psmi_epid_table {
+ struct psmi_epid_tabentry *table;
+ int tabsize;
+ int tabsize_used;
+ pthread_mutex_t tablock;
+};
+/*
+ * Endpoint address hash table
+ */
+struct psmi_epid_tabentry {
+ void *entry;
+ uint64_t key;
+ psm2_ep_t ep;
+ psm2_epid_t epid;
+};
+
+extern struct psmi_epid_table psmi_epid_table;
+#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */
+#define PSMI_EPID_TABSIZE_CHUNK 128
+#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7)
+
+psm2_error_t psmi_epid_init();
+psm2_error_t psmi_epid_fini();
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
+#define PSMI_EP_HOSTNAME ((psm2_ep_t) -1) /* Special endpoint handle we use
+ * to register hostnames */
+#define PSMI_EP_CROSSTALK ((psm2_ep_t) -2) /* Second special endpoint handle
+ * to log which nodes we've seen
+ * crosstalk from */
+struct psmi_eptab_iterator {
+ int i; /* last index looked up */
+ psm2_ep_t ep;
+};
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep);
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor);
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor);
+
+uint64_t psmi_epid_version(psm2_epid_t epid);
+
+/*
+ * Hostname manipulation
+ */
+#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */
+#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */
+char *psmi_gethostname(void);
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid);
+const char *psmi_epaddr_get_name(psm2_epid_t epid);
+psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname,
+ int overwrite);
+
+/*
+ * Memory allocation, use macros only.
+ *
+ * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE
+ * if no endpoint is available.
+ *
+ * psmi_malloc_usable_size(void *ptr)
+ * psmi_malloc(ep, memtype, size)
+ * psmi_realloc(ep, memtype, ptr, newsize)
+ * psmi_memalign(ep, memtype, alignment, size)
+ * psmi_calloc(ep, memtype, elemsz, numelems)
+ * psmi_strdup(ep, memtype, ptr)
+ * psmi_free(ptr)
+ *
+ */
+typedef enum psmi_memtype {
+ TOTAL = 0, /* Logged automatically by malloc/calloc */
+ UNDEFINED, /* For tracking "other types" of allocations */
+ PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */
+ NETWORK_BUFFERS, /* For tracking network buffers */
+ DESCRIPTORS, /* For tracking send/recv descriptors */
+ UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */
+ STATS, /* For tracking stats-related allocs */
+} psmi_memtype_t;
+
+/*
+ * We track allocation stats.
+ */
+struct psmi_stats_malloc {
+ int64_t m_all_total;
+ int64_t m_all_max;
+ int64_t m_perpeer_total;
+ int64_t m_perpeer_max;
+ int64_t m_netbufs_total;
+ int64_t m_netbufs_max;
+ int64_t m_descriptors_total;
+ int64_t m_descriptors_max;
+ int64_t m_unexpbufs_total;
+ int64_t m_unexpbufs_max;
+ int64_t m_undefined_total;
+ int64_t m_undefined_max;
+ int64_t m_stats_total;
+ int64_t m_stats_max;
+};
+
+extern struct psmi_stats_malloc psmi_stats_memory;
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz,
+ const char *curloc);
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr,
+ size_t newSz, const char *curloc);
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment,
+ size_t sz, const char *curloc);
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num,
+ size_t sz, const char *curloc);
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc);
+
+void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc);
+MOCK_DCL_EPILOGUE(psmi_free_internal);
+
+size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc);
+
+#ifdef PSM_HEAP_DEBUG
+/* When heap debugging is enabled, calls to HD_validate_heap_allocations()
+   can be sprinkled through the code; each call examines all heap allocations
+   to verify their integrity. */
+void _HD_validate_heap_allocations(const char *curloc);
+
+#define HD_validate_heap_allocations() _HD_validate_heap_allocations(PSMI_CURLOC)
+
+#else
+
+#define HD_validate_heap_allocations() /* nothing */
+
+#endif
+
+#define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC)
+#define psmi_calloc(ep, mt, nelem, elemsz) \
+ psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC)
+#define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC)
+#define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC)
+#define psmi_memalign(ep, mt, al, sz) \
+ psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC)
+#define psmi_free(ptr) psmi_free_internal(ptr, PSMI_CURLOC)
+#define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC)
+#ifndef PSM_IS_TEST
+#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc
+#define realloc(ptr,nsz) _use_psmi_realloc_instead_of_plain_realloc
+#define memalign(sz) _use_psmi_memalign_instead_of_plain_memalign
+#define calloc(sz, nelm) _use_psmi_calloc_instead_of_plain_calloc
+#ifdef strdup
+#undef strdup
+#endif
+#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup
+#define free(ptr) _use_psmi_free_instead_of_plain_free
+#define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size
+#endif /* PSM_IS_TEST */
+
+void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes);
+
+/*
+ * Parsing int parameters set in string tuples.
+ */
+int psmi_parse_str_tuples(const char *str, int ntup, int *vals);
+
+/*
+ * Resource Limiting based on PSM memory mode.
+ */
+#define PSMI_MEMMODE_NORMAL 0
+#define PSMI_MEMMODE_MINIMAL 1
+#define PSMI_MEMMODE_LARGE 2
+#define PSMI_MEMMODE_NUM 3
+
+struct psmi_rlimit_mpool {
+ const char *env;
+ const char *descr;
+ int env_level;
+ uint32_t minval;
+ uint32_t maxval;
+ struct {
+ uint32_t obj_chunk;
+ uint32_t obj_max;
+ } mode[PSMI_MEMMODE_NUM];
+};
+psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+ const struct psmi_rlimit_mpool *rlim,
+ uint32_t *valo, uint32_t *chunkszo);
+int psmi_parse_memmode(void);
+
+/*
+ * Parsing environment variables
+ */
+
+union psmi_envvar_val {
+ void *e_void;
+ char *e_str;
+ int e_int;
+ unsigned int e_uint;
+ long e_long;
+ unsigned long e_ulong;
+ unsigned long long e_ulonglong;
+};
+
+#define PSMI_ENVVAR_LEVEL_USER 1
+#define PSMI_ENVVAR_LEVEL_HIDDEN 2
+
+#define PSMI_ENVVAR_TYPE_YESNO 0
+#define PSMI_ENVVAR_TYPE_STR 1
+#define PSMI_ENVVAR_TYPE_INT 2
+#define PSMI_ENVVAR_TYPE_UINT 3
+#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4
+#define PSMI_ENVVAR_TYPE_LONG 5
+#define PSMI_ENVVAR_TYPE_ULONG 6
+#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7
+#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8
+
+#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1)
+#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0)
+
+int
+MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
+ int type, union psmi_envvar_val defval,
+ union psmi_envvar_val *newval);
+MOCK_DCL_EPILOGUE(psmi_getenv);
+/*
+ * Misc functionality
+ */
+uintptr_t psmi_getpagesize(void);
+uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns);
+uint32_t psmi_get_ipv4addr();
+void psmi_syslog(psm2_ep_t ep, int to_console, int level,
+ const char *format, ...);
+void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out);
+int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB);
+void *psmi_memcpyo(void *dst, const void *src, size_t n);
+uint32_t psmi_crc(unsigned char *buf, int len);
+uint32_t psmi_get_hfi_type(const psmi_context_t *context);
+
+/*
+ * Internal CPUID detection
+ */
+#define CPUID_FAMILY_MASK 0x00000f00
+#define CPUID_MODEL_MASK 0x000000f0
+#define CPUID_EXMODEL_MASK 0x000f0000
+
+/*
+ * CPUID return values
+ */
+#define CPUID_FAMILY_XEON 0x00000600
+#define CPUID_MODEL_PHI_GEN2 87
+#define CPUID_MODEL_PHI_GEN2M 133
+/*
+ * cpuid function 0, returns "GenuineIntel" in EBX,ECX,EDX
+ * due to Little Endian and Hex it is not so obvious
+ */
+#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */
+#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "letn" - Little Endian "ntel" */
+#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "Ieni" - Little Endian "ineI" */
+
+/*
+ * These values are internal only, not real register values
+ */
+#define CPUID_GENUINE_INTEL 0xf0000000
+#define CPUID_MODEL_UNDEFINED -1
+
+/*
+ * Global model so we can tune defaults better for specific cpu's
+ */
+uint32_t psmi_cpu_model;
+
+/*
+ * Diagnostics, all in psm_diags.c
+ */
+int psmi_diags(void);
+
+/*
+ * Multiple Endpoints
+ */
+extern int psmi_multi_ep_enabled;
+void psmi_multi_ep_init();
+
+/*
+ * Fault injection
+ */
+struct psmi_faultinj_spec;
+int psmi_faultinj_enabled; /* use macro to test */
+#if 1 /* possible to disable at compile time */
+#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled)
+#else
+#define PSMI_FAULTINJ_ENABLED() 0
+#endif
+
+void psmi_faultinj_init();
+void psmi_faultinj_fini();
+struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name,
+ int num, int denom);
+#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \
+ static struct psmi_faultinj_spec *var; \
+ if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \
+ (var) = psmi_faultinj_getspec((spec_name), (num), (denom));
+int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec);
+
+/*
+ * PSM core component set/get options
+ */
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+/*
+ * PSM AM component set/get options
+ */
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+#endif /* _PSMI_UTILS_H */
diff --git a/psmi_wrappers.c b/psmi_wrappers.c
new file mode 100644
index 0000000..ea857fc
--- /dev/null
+++ b/psmi_wrappers.c
@@ -0,0 +1,94 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include "opa_common.h"
+#include <unistd.h>
+#include "psmi_wrappers.h"
+
+/* The following indirection wrappers for external functions
+ * are only created if this is a mocking tests build
+ */
+#ifdef PSM2_MOCK_TESTING
+
+void MOCKABLE(psmi_exit)(int status)
+{
+ exit(status);
+}
+MOCK_DEF_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count)
+{
+ return write(fd, buf, count);
+}
+MOCK_DEF_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg)
+{
+ return ioctl(fd, cmd, arg);
+}
+MOCK_DEF_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact)
+{
+ return sigaction(signum, act, oldact);
+}
+MOCK_DEF_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void)
+{
+ return ips_rmb();
+}
+MOCK_DEF_EPILOGUE(psmi_rmb);
+
+#endif /* def PSM2_MOCK_TESTING */
diff --git a/psmi_wrappers.h b/psmi_wrappers.h
new file mode 100644
index 0000000..68f11c8
--- /dev/null
+++ b/psmi_wrappers.h
@@ -0,0 +1,98 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_WRAPPERS_H
+#define _PSMI_WRAPPERS_H
+
+#include <signal.h>
+#include "psm2_mock_testing.h"
+#include "opa_intf.h"
+
+#if defined( IB_IOCTL_MAGIC )
+#include <sys/ioctl.h>
+#endif
+
+/* If this is a mocking tests build, we introduce "incision points"
+ * through which we can easily mock external dependencies.
+ * For non-mocking-tests build, we bypass those indirections
+ * for performance reasons.
+ */
+
+#ifdef PSM2_MOCK_TESTING
+void MOCKABLE(psmi_exit)(int status);
+MOCK_DCL_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count);
+MOCK_DCL_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg);
+MOCK_DCL_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact);
+MOCK_DCL_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void);
+MOCK_DCL_EPILOGUE(psmi_rmb);
+
+#else /* def PSM2_MOCK_TESTING */
+
+#define psmi_exit exit
+#define psmi_write write
+#define psmi_ioctl ioctl
+#define psmi_sigaction sigaction
+#define psmi_rmb ips_rmb
+
+#endif /* def PSM2_MOCK_TESTING */
+
+#endif // _PSMI_WRAPPERS_H
+
diff --git a/ptl.h b/ptl.h
new file mode 100644
index 0000000..88d6fc1
--- /dev/null
+++ b/ptl.h
@@ -0,0 +1,211 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* Interface implemented by Packet Transport layers such as
+ * ips and active messages.
+ *
+ * This interface can be volatile, it is never seen by PSM clients, and it will
+ * probably change as the AM ptl is developed.
+ */
+
+#ifndef PSM_PTL_H
+#define PSM_PTL_H
+#include <inttypes.h>
+#include <psm2.h>
+#include <psm2_mq.h>
+#include <psm2_am.h>
+
+/* We currently have 3 PTLs, 0 is reserved. */
+#define PTL_DEVID_IPS 1
+#define PTL_DEVID_AMSH 2
+#define PTL_DEVID_SELF 3
+
+/* We can currently initialize up to 3 PTLs */
+#define PTL_MAX_INIT 3
+
+struct ptl;
+typedef struct ptl ptl_t;
+
+struct ptl_ctl;
+typedef struct ptl_ctl ptl_ctl_t;
+
+struct ptl_mq_req;
+typedef struct ptl_mq_req ptl_mq_req_t;
+
+struct ips_proto;
+typedef struct ips_proto ips_proto_t;
+
+/* To be filled in statically by all PTLs */
+struct ptl_ctl_init {
+ size_t(*sizeof_ptl) (void);
+
+ psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl);
+
+ psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns);
+
+ psm2_error_t
+ (*setopt) (const void *component_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+ psm2_error_t
+ (*getopt) (const void *component_obj, int optname,
+ void *optval, uint64_t *optlen);
+};
+
+typedef
+struct ptl_arg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ void *uptr;
+ };
+} ptl_arg_t;
+
+#include "ptl_self/ptl_fwd.h"
+#include "ptl_ips/ptl_fwd.h"
+#include "ptl_am/ptl_fwd.h"
+
+/* To be filled in as part of ptl_init */
+struct ptl_ctl {
+ ptl_t *ptl; /* pointer to ptl */
+ psm2_ep_t ep; /* pointer to ep */
+
+ /* EP-specific stuff */
+ psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly);
+
+ /* PTL-level connect
+ *
+ * This PTL-level is slightly different from the top-level PSM connect.
+ *
+ * pre 1: Caller has masked off epids in epid array that are already
+ * connected at the PSM level.
+ *
+ * post 0: PTL has allocated all epaddrs and whatever internal ptladdr
+ * that ptl needs.
+ * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i]
+ * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't
+ * be connected before a timeout occurred.
+ * post 3: PTL returns OK if all epids are either OK or UNREACHABLE
+ * post 4: PTL defines content of epaddr[i] only if epaddr[i] is OK.
+ */
+ psm2_error_t(*ep_connect) (ptl_t *ptl,
+ int num_ep,
+ const psm2_epid_t input_array_of_epid[],
+ const int array_of_epid_mask[],
+ psm2_error_t output_array_of_errors[],
+ psm2_epaddr_t output_array_of_epddr[],
+ uint64_t timeout_ns);
+
+ psm2_error_t (*ep_disconnect)(ptl_t *ptl,
+ int force,
+ int num_ep,
+ psm2_epaddr_t input_array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t output_array_of_errors[],
+ uint64_t timeout_ns);
+
+ /* MQ stuff */
+ psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags, psm2_mq_tag_t *stag,
+ const void *buf, uint32_t len);
+ psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags, psm2_mq_tag_t *stag,
+ const void *buf, uint32_t len,
+ void *ctxt, psm2_mq_req_t *req);
+
+ int (*epaddr_stats_num) (void);
+ int (*epaddr_stats_init) (char *desc[], uint16_t *flags);
+ int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats);
+
+ /* AM stuff */
+ psm2_error_t(*am_get_parameters) (psm2_ep_t ep,
+ struct psm2_am_parameters *
+ parameters);
+ psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ psm2_error_t(*am_short_reply) (psm2_am_token_t token,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ /* Long messages currently unsupported */
+#if 0
+ psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, void *dest,
+ int flags);
+ psm2_error_t(*am_long_reply) (psm2_am_token_t token,
+ psm2_handler_t handler, psm2_amarg_t *args,
+ int nargs, void *src, size_t len,
+ void *dest, int flags);
+#endif
+};
+#endif
diff --git a/ptl_am/Makefile b/ptl_am/Makefile
new file mode 100644
index 0000000..5aa5a46
--- /dev/null
+++ b/ptl_am/Makefile
@@ -0,0 +1,91 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o cmarwu.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno; \
+ cd -; \
+ fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+ @echo "Nothing to do for install."
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
new file mode 100644
index 0000000..959dbc3
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -0,0 +1,316 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#include "psm_user.h"
+#include "am_cuda_memhandle_cache.h"
+#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
+#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length))
+#define RBTREE_ASSERT psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems)
+
+#include "rbtree.c"
+
+#ifdef PSM_DEBUG
+static int cache_hit_counter;
+static int cache_miss_counter;
+#endif
+
+/*
+ * Creating mempool for cuda memhandle cache nodes.
+ *
+ * memcache_size: number of cl_map_item_t elements the pool holds;
+ * this count includes the two sentinel items (Root and NIL) later
+ * consumed by am_cuda_memhandle_cache_map_init().  The value is also
+ * recorded in the global cuda_memhandle_cache_size, which the
+ * eviction check in am_cuda_memhandle_cache_register() compares
+ * against NELEMS.
+ *
+ * Returns PSM2_OK, or the result of psmi_handle_error() on allocation
+ * failure.
+ */
+psm2_error_t
+am_cuda_memhandle_mpool_init(uint32_t memcache_size)
+{
+	psm2_error_t err;
+	cuda_memhandle_cache_size = memcache_size;
+	/* Creating a memory pool of size PSM2_CUDA_MEMCACHE_SIZE
+	 * which includes the Root and NIL items
+	 */
+	cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t),
+					cuda_memhandle_cache_size,
+					cuda_memhandle_cache_size, 0,
+					UNDEFINED, NULL, NULL,
+					psmi_cuda_memhandle_cache_alloc_func,
+					NULL);
+	if (cuda_memhandle_mpool == NULL) {
+		err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+				"Couldn't allocate CUDA host receive buffer pool");
+		return err;
+	}
+	return PSM2_OK;
+}
+
+/*
+ * Initialize the rbtree that backs the cuda memhandle cache.
+ *
+ * Pulls two sentinel items (Root and NIL) from cuda_memhandle_mpool,
+ * zeroes the NIL payload fields the lookup path tests against (a
+ * search miss returns an item whose payload.start is 0), and marks
+ * the cache enabled.
+ *
+ * Returns PSM2_NO_MEMORY if either sentinel cannot be obtained.
+ */
+psm2_error_t am_cuda_memhandle_cache_map_init()
+{
+	cl_map_item_t *root, *nil_item;
+	root = (cl_map_item_t *)psmi_mpool_get(cuda_memhandle_mpool);
+	if (root == NULL)
+		return PSM2_NO_MEMORY;
+	nil_item = (cl_map_item_t *)psmi_mpool_get(cuda_memhandle_mpool);
+	if (nil_item == NULL) {
+		/* Return the root sentinel to the pool so it is not leaked. */
+		psmi_mpool_put(root);
+		return PSM2_NO_MEMORY;
+	}
+	/* NIL payload must read as "no entry" to the search path. */
+	nil_item->payload.start = 0;
+	nil_item->payload.epid = 0;
+	nil_item->payload.length = 0;
+	cuda_memhandle_cache_enabled = 1;
+	ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item);
+	NELEMS = 0;
+	return PSM2_OK;
+}
+
+/*
+ * Tear down the memhandle cache.  Destroying the mpool invokes
+ * psmi_cuda_memhandle_cache_alloc_func() on every item, which closes
+ * any CUDA IPC handle still open for cached entries.
+ */
+void am_cuda_memhandle_cache_map_fini()
+{
+#ifdef PSM_DEBUG
+	_HFI_DBG("cache hit counter: %d\n", cache_hit_counter);
+	_HFI_DBG("cache miss counter: %d\n", cache_miss_counter);
+#endif
+	if(cuda_memhandle_cache_enabled)
+		psmi_mpool_destroy(cuda_memhandle_mpool);
+	return;
+}
+
+/*
+ * Insert at the head of Idleq.
+ *
+ * The idle queue is threaded through the i_prev/i_next payload fields
+ * via the FIRST/LAST/INEXT/IPREV macros; FIRST is the most recently
+ * used item, LAST the least recently used (eviction candidate).
+ */
+static void
+am_cuda_idleq_insert(cl_map_item_t* memcache_item)
+{
+	if (FIRST == NULL) {
+		/* Queue was empty: the item is both ends. */
+		FIRST = memcache_item;
+		LAST = memcache_item;
+		return;
+	}
+	/* Link the old head to the new item and advance FIRST.
+	 * NOTE(review): INEXT(memcache_item) is not reset here; the code
+	 * appears to rely on the head's i_next never being read before it
+	 * is overwritten by the next insert -- confirm against
+	 * am_cuda_idleq_remove(). */
+	INEXT(FIRST) = memcache_item;
+	IPREV(memcache_item) = FIRST;
+	FIRST = memcache_item;
+	return;
+}
+
+/*
+ * Remove least recent used element.
+ * Caller guarantees memcache_item is the current LAST.
+ */
+static void
+am_cuda_idleq_remove_last(cl_map_item_t* memcache_item)
+{
+	if (!INEXT(memcache_item)) {
+		/* Sole element: queue becomes empty.
+		 * NOTE(review): relies on INEXT of a single inserted item
+		 * reading as NULL; am_cuda_idleq_insert() never writes it
+		 * on the empty-queue path -- confirm mpool items are
+		 * zero-initialized. */
+		LAST = NULL;
+		FIRST = NULL;
+		return;
+	}
+	/* Promote the next-least-recently-used item to LAST. */
+	LAST = INEXT(memcache_item);
+	IPREV(LAST) = NULL;
+	return;
+}
+
+/* Unlink an arbitrary item from the idle queue. */
+static void
+am_cuda_idleq_remove(cl_map_item_t* memcache_item)
+{
+	if (LAST == memcache_item) {
+		am_cuda_idleq_remove_last(memcache_item);
+		return;
+	}
+	/* No successor and not LAST: treated as the head item.
+	 * NOTE(review): FIRST is not updated on this path; confirm callers
+	 * only reach it via am_cuda_idleq_reorder(), which immediately
+	 * re-inserts the same item. */
+	if (INEXT(memcache_item) == NULL) {
+		INEXT(IPREV(memcache_item)) = NULL;
+		return;
+	}
+	/* Middle of the queue: splice the neighbours together. */
+	INEXT(IPREV(memcache_item)) = INEXT(memcache_item);
+	IPREV(INEXT(memcache_item)) = IPREV(memcache_item);
+	return;
+}
+
+/* Move an item to the head of the idle queue (mark most recently used). */
+static void
+am_cuda_idleq_reorder(cl_map_item_t* memcache_item)
+{
+	if (FIRST == memcache_item && LAST == memcache_item ) {
+		/* Sole element: already in position. */
+		return;
+	}
+	am_cuda_idleq_remove(memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return;
+}
+
+/*
+ * After a successful cache hit, item is validated by doing a
+ * memcmp on the handle stored and the handle we receive from the
+ * sender. If the validation fails the item is removed from the idleq,
+ * the rbtree, is put back into the mpool and IpcCloseMemHandle function
+ * is called.
+ *
+ * Returns PSM2_OK on a valid hit; PSM2_OK_NO_PROGRESS after purging a
+ * stale entry (caller then opens the handle afresh).
+ */
+static psm2_error_t
+am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item,
+				 uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				 uint32_t length, psm2_epid_t epid)
+{
+	if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle,
+			 sizeof(cudaIpcMemHandle_t )))
+			 && sbuf == memcache_item->payload.start
+			 && epid == memcache_item->payload.epid) {
+		return PSM2_OK;
+	}
+	/* Stale entry: same key but different handle/owner -- purge it. */
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item);
+	PSMI_CUDA_CALL(cudaIpcCloseMemHandle,
+		       memcache_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove(memcache_item);
+	psmi_mpool_put(memcache_item);
+	return PSM2_OK_NO_PROGRESS;
+}
+
+/*
+ * Current eviction policy: Least Recently Used.
+ * Precondition: the cache is non-empty (LAST != NULL); only invoked
+ * from am_cuda_memhandle_cache_register() when NELEMS has reached
+ * cuda_memhandle_cache_size.
+ */
+static void
+am_cuda_memhandle_cache_evict()
+{
+	cl_map_item_t *p_item = LAST;
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
+	PSMI_CUDA_CALL(cudaIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove_last(p_item);
+	psmi_mpool_put(p_item);
+	return;
+}
+
+/*
+ * Insert a newly opened IPC mapping into the cache, evicting the LRU
+ * entry first if the cache is full.
+ *
+ * NOTE(review): payload.length is declared uint16_t in the header
+ * while length here is uint32_t, so lengths >= 64KiB are silently
+ * truncated in the cached entry -- confirm whether the field should
+ * be widened.
+ */
+static psm2_error_t
+am_cuda_memhandle_cache_register(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				 uint32_t length, psm2_epid_t epid,
+				 void* cuda_ipc_dev_ptr)
+{
+	if (NELEMS == cuda_memhandle_cache_size)
+		am_cuda_memhandle_cache_evict();
+	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
+	/* memcache_item cannot be NULL as we evict
+	 * before the call to mpool_get. Check has
+	 * been fixed to help with klockwork analysis.
+	 */
+	if (memcache_item == NULL)
+		return PSM2_NO_MEMORY;
+	memcache_item->payload.start = sbuf;
+	memcache_item->payload.cuda_ipc_handle = *handle;
+	memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr;
+	memcache_item->payload.length = length;
+	memcache_item->payload.epid = epid;
+	ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return PSM2_OK;
+}
+
+/*
+ * The key used to search the cache is the senders buf address pointer.
+ * Upon a successful hit in the cache, additional validation is required
+ * as multiple senders could potentially send the same buf address value.
+ *
+ * Returns a locally usable CUDA device pointer for the sender's
+ * buffer.  With the cache disabled, every call opens the IPC handle
+ * directly (and am_cuda_memhandle_release() closes it).
+ */
+void*
+am_cuda_memhandle_acquire(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				uint32_t length, psm2_epid_t epid)
+{
+	void* cuda_ipc_dev_ptr;
+	if(cuda_memhandle_cache_enabled) {
+		cl_qmap_t *p_map = &cuda_memhandle_cachemap;
+		cl_map_item_t *p_item;
+		unsigned long start = (unsigned long)sbuf;
+		unsigned long end = start + length;
+		p_item = ips_cl_qmap_search(p_map, start, end);
+		/* A miss yields the NIL item, whose start is 0
+		 * (see am_cuda_memhandle_cache_map_init). */
+		if (p_item->payload.start) {
+			if (am_cuda_memhandle_cache_validate(p_item, sbuf,
+					handle, length, epid) == PSM2_OK) {
+#ifdef PSM_DEBUG
+				cache_hit_counter++;
+#endif
+				am_cuda_idleq_reorder(p_item);
+				return p_item->payload.cuda_ipc_dev_ptr;
+			}
+		}
+#ifdef PSM_DEBUG
+		/* Counts both true misses and invalidated stale hits. */
+		cache_miss_counter++;
+#endif
+		PSMI_CUDA_CALL(cudaIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+				 *handle, cudaIpcMemLazyEnablePeerAccess);
+		am_cuda_memhandle_cache_register(sbuf, handle,
+					   length, epid, cuda_ipc_dev_ptr);
+		return cuda_ipc_dev_ptr;
+	} else {
+		PSMI_CUDA_CALL(cudaIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+				 *handle, cudaIpcMemLazyEnablePeerAccess);
+		return cuda_ipc_dev_ptr;
+	}
+}
+
+/* Close the IPC handle now only when the cache is disabled; cached
+ * handles stay open until eviction or cache teardown. */
+void
+am_cuda_memhandle_release(void* cuda_ipc_dev_ptr)
+{
+	if(!cuda_memhandle_cache_enabled)
+		PSMI_CUDA_CALL(cudaIpcCloseMemHandle, cuda_ipc_dev_ptr);
+	return;
+}
+
+/*
+ * This is the callback function invoked when the mempool is resized or
+ * destroyed.  Upon calling cache fini the mpool is destroyed, which in
+ * turn calls this callback, which helps in closing all memhandles.
+ */
+void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+	if (!is_alloc) {
+		/* Only items holding a live mapping (start != 0). */
+		if(memcache_item->payload.start)
+			PSMI_CUDA_CALL(cudaIpcCloseMemHandle,
+				       memcache_item->payload.cuda_ipc_dev_ptr);
+	}
+}
+
+#endif
diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h
new file mode 100644
index 0000000..13c972b
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.h
@@ -0,0 +1,124 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#ifndef _AM_CUDA_MEMHANDLE_CACHE_H
+#define _AM_CUDA_MEMHANDLE_CACHE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+struct _cl_map_item;
+
+/* Payload carried in each rbtree node of the cuda memhandle cache. */
+typedef struct
+{
+	unsigned long start; /* start virtual address */
+	cudaIpcMemHandle_t cuda_ipc_handle; /* cuda ipc mem handle */
+	void* cuda_ipc_dev_ptr;/* Cuda device pointer */
+	uint16_t length; /* length*/
+	/* NOTE(review): am_cuda_memhandle_acquire() passes length as
+	 * uint32_t; storing it here truncates values >= 64KiB -- confirm
+	 * whether this field should be uint32_t. */
+	psm2_epid_t epid;
+	struct _cl_map_item* i_prev; /* idle queue previous */
+	struct _cl_map_item* i_next; /* idle queue next */
+}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
+
+typedef struct {
+	uint32_t nelems; /* number of elements in the cache */
+} rbtree_cuda_memhandle_cache_map_pl_t;
+
+/* Payload types consumed by the generic rbtree template (rbtree.h). */
+#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+
+#include "rbtree.h"
+
+/* NOTE(review): the following globals are *definitions* placed in a
+ * header; if this header is ever included from more than one
+ * translation unit this relies on tentative-definition merging --
+ * consider extern declarations with one definition in a .c file. */
+cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+uint8_t cuda_memhandle_cache_enabled;
+mpool_t cuda_memhandle_mpool;
+uint32_t cuda_memhandle_cache_size;
+#define CUDA_MEMHANDLE_CACHE_SIZE 64
+
+/*
+ * Macro definition for easy programming.
+ */
+
+#define NELEMS cuda_memhandle_cachemap.payload.nelems
+
+/*
+ * Macro for idle queue management.  The idle queue is threaded through
+ * the i_prev/i_next fields of the root item's payload; FIRST is the
+ * most recently used entry, LAST the eviction candidate.
+ */
+#define IHEAD cuda_memhandle_cachemap.root
+#define LAST IHEAD->payload.i_prev
+#define FIRST IHEAD->payload.i_next
+#define INEXT(x) x->payload.i_next
+#define IPREV(x) x->payload.i_prev
+
+
+/* Create the node mpool (memcache_size entries incl. sentinels). */
+psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+
+/* Initialize the rbtree and enable the cache. */
+psm2_error_t am_cuda_memhandle_cache_map_init();
+
+/* Translate a sender's (sbuf, handle, epid) into a local device pointer. */
+void*
+am_cuda_memhandle_acquire(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				uint32_t length, psm2_epid_t epid);
+/* Close the handle immediately when caching is disabled. */
+void
+am_cuda_memhandle_release(void* cuda_ipc_dev_ptr);
+
+/* mpool alloc/free callback; closes IPC handles on teardown. */
+void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj);
+
+void am_cuda_memhandle_cache_map_fini();
+
+#endif
+
+#endif
diff --git a/ptl_am/am_reqrep.c b/ptl_am/am_reqrep.c
new file mode 100644
index 0000000..5f90ec7
--- /dev/null
+++ b/ptl_am/am_reqrep.c
@@ -0,0 +1,118 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Shared-memory active-message short request.
+ *
+ * Prepends the handler index as arg[0], copies the caller's args
+ * after it, and issues a synchronous amsh short request; the
+ * completion callback (if any) is therefore invoked before return.
+ */
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+			   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			   void *src, size_t len, int flags,
+			   psm2_am_completion_fn_t completion_fn,
+			   void *completion_ctxt)
+{
+	psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC.
+	 * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly
+	 * used to optimize the IPS path though we could put a stricter interpretation
+	 * on it to disallow any replies.
+	 */
+
+	/* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry
+	 * the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	psmi_assert(epaddr->ptlctl->ptl != NULL);
+
+	req_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&req_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+	psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx,
+				req_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/*
+ * Shared-memory active-message short reply, issued from within a
+ * handler via its token.  Mirrors psmi_amsh_am_short_request():
+ * arg[0] carries the handler index, the caller's args follow, and the
+ * completion callback (if any) runs synchronously before return.
+ */
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+			 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			 void *src, size_t len, int flags,
+			 psm2_am_completion_fn_t completion_fn,
+			 void *completion_ctxt)
+{
+	psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry
+	 * the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	rep_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+
+	psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx,
+			      rep_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c
new file mode 100755
index 0000000..52d3ab2
--- /dev/null
+++ b/ptl_am/am_reqrep_shmem.c
@@ -0,0 +1,2590 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h> /* shm_open and signal handling */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+#include "psmi_wrappers.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/* Rendezvous threshold for the shm MQ path; lowered to the CMA value
+ * in psmi_shm_map_remote() when CMA kassist is available. */
+int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+
+/* AMLONG_SZ is the total size in memory of a bulk packet, including an
+ * am_pkt_bulk_t header struct.
+ * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */
+#define AMLONG_SZ 8192
+#define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t))
+
+/* Element counts for the request/reply short and long FIFOs. */
+static const amsh_qinfo_t amsh_qcounts = {
+	.qreqFifoShort = 1024,
+	.qreqFifoLong = 256,
+	.qrepFifoShort = 1024,
+	.qrepFifoLong = 256
+};
+
+/* Element sizes for the same FIFOs. */
+static const amsh_qinfo_t amsh_qelemsz = {
+	.qreqFifoShort = sizeof(am_pkt_short_t),
+	.qreqFifoLong = AMLONG_SZ,
+	.qrepFifoShort = sizeof(am_pkt_short_t),
+	.qrepFifoLong = AMLONG_SZ
+};
+
+/* State stashed for the SIGSEGV/SIGBUS fault handler: the remote shm
+ * mapping being probed and the signal handlers we displaced. */
+ustatic struct {
+	void *addr;
+	size_t len;
+	struct sigaction SIGSEGV_old_act;
+	struct sigaction SIGBUS_old_act;
+} action_stash;
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly);
+static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq);
+static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+
+/* Kassist helper functions */
+#if _HFI_DEBUGGING
+static const char *psmi_kassist_getmode(int mode);
+#endif
+static int psmi_get_kassist_mode();
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+/* Initialize a shared queue header: a process-shared spinlock plus an
+ * empty ring of elem_cnt elements of elem_sz bytes each. */
+static inline void
+am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz)
+{
+	pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED);
+	q->head = 0;
+	q->tail = 0;
+	q->elem_cnt = elem_cnt;
+	q->elem_sz = elem_sz;
+}
+
+/* Stamp each bulk packet in a contiguous region with its slot index,
+ * walking the region in elemsz-byte strides. */
+static void
+am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems)
+{
+	uintptr_t cursor = (uintptr_t) base_ptr;
+	int idx;
+
+	for (idx = 0; idx < nelems; idx++) {
+		((am_pkt_bulk_t *) cursor)->idx = idx;
+		cursor += elemsz;
+	}
+}
+
+/* _PA(type): page-aligned byte size of one FIFO (count * elemsz). */
+#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \
+			       PSMI_PAGESIZE)
+/* Total bytes of one endpoint's shm block: header, request control
+ * block + request FIFOs, reply control block + reply FIFOs; the whole
+ * thing rounded up to a page. */
+static inline uintptr_t am_ctl_sizeof_block()
+{
+	return PSMI_ALIGNUP(
+			PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) +
+			/* reqctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			_PA(reqFifoShort) + _PA(reqFifoLong) +
+			/* repctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			/* align to page size */
+			_PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE);
+}
+
+#undef _PA
+
+static void am_update_directory(struct am_ctl_nodeinfo *);
+
+/* atexit hook: unlink every opened endpoint's shm object so stale
+ * files are not left behind in /dev/shm.  Mutex-guarded so the
+ * cleanup runs at most once even if invoked from both atexit and the
+ * fault handler. */
+static
+void amsh_atexit()
+{
+	static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER;
+	static int atexit_once;
+	psm2_ep_t ep;
+	ptl_t *ptl;
+
+	pthread_mutex_lock(&mutex_once);
+	if (atexit_once) {
+		pthread_mutex_unlock(&mutex_once);
+		return;
+	} else
+		atexit_once = 1;
+	pthread_mutex_unlock(&mutex_once);
+
+	/* Walk the list of opened endpoints and unlink each shm file. */
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		ptl = ep->ptl_amsh.ptl;
+		if (ptl->self_nodeinfo &&
+		    ptl->amsh_keyname != NULL) {
+			_HFI_VDBG("unlinking shm file %s\n",
+				  ptl->amsh_keyname);
+			shm_unlink(ptl->amsh_keyname);
+		}
+		ep = ep->user_ep_next;
+	}
+
+	return;
+}
+
+/*
+ * SIGSEGV/SIGBUS handler installed while touching a freshly mmap'ed
+ * remote shm segment (see psmi_shm_map_remote).  A fault inside the
+ * stashed [addr, addr+len) window means the backing shm object is bad
+ * (e.g. stale or truncated): print advice, run cleanup, and exit.
+ * Faults elsewhere are forwarded to the previously installed handler,
+ * or re-raised under the default disposition.
+ */
+ustatic
+void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context)
+{
+	if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr &&
+	    (unsigned long int) siginfo->si_addr < (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) {
+
+		static char shm_errmsg[256];
+
+		snprintf(shm_errmsg, sizeof(shm_errmsg),
+			 "%s: Unable to allocate shared memory for intra-node messaging.\n"
+			 "%s: Delete stale shared memory files in /dev/shm.\n",
+			 psmi_gethostname(), psmi_gethostname());
+		amsh_atexit();
+		if (psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1)
+			psmi_exit(2);
+		else
+			psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */
+	} else {
+		if (signo == SIGSEGV) {
+			if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) {
+				/* Default disposition: restore it, re-raise,
+				 * then reinstall ourselves in case execution
+				 * resumes.
+				 * NOTE(review): act.sa_mask is never
+				 * initialized (no sigemptyset) before
+				 * psmi_sigaction -- confirm. */
+				psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+				raise(SIGSEGV);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGSEGV, &act, NULL);
+			} else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				/* Chain to the displaced handler. */
+				action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else if (signo == SIGBUS) {
+			if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) {
+				psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+				raise(SIGBUS);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGBUS, &act, NULL);
+			} else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else {
+			psmi_exit(signo);
+		}
+	}
+}
+
+/**
+ * Create endpoint shared-memory object, containing ep's info
+ * and message queues.
+ *
+ * On success ptl->self_nodeinfo points at the start of the mapping
+ * and ptl->amsh_keyname holds the shm object name (unlinked at exit
+ * by amsh_atexit()).  The fd is closed after mmap; the mapping is
+ * touched page-by-page via memset so faults surface here.
+ */
+psm2_error_t psmi_shm_create(ptl_t *ptl)
+{
+	psm2_ep_t ep = ptl->ep;
+	char shmbuf[256];
+	void *mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int shmfd;
+	char *amsh_keyname;
+	int iterator;
+	/* Get which kassist mode to use. */
+	ptl->psmi_kassist_mode = psmi_get_kassist_mode();
+
+	if (_HFI_PRDBG_ON) {
+		_HFI_PRDBG_ALWAYS
+			("kassist_mode %d %s use_kassist %d\n",
+			 ptl->psmi_kassist_mode,
+			 psmi_kassist_getmode(ptl->psmi_kassist_mode),
+			 (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF));
+	}
+
+	segsz = am_ctl_sizeof_block();
+	/* Probe for a free shm name: uid + epid + retry counter.  EACCES
+	 * means an object of that name belongs to another user (stale
+	 * file); bump the counter and try again.
+	 * NOTE(review): iterator <= INT_MAX means the final increment
+	 * overflows a signed int (UB) if the namespace is ever truly
+	 * exhausted -- confirm acceptable. */
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm2_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 ep->epid,
+			 iterator);
+		amsh_keyname = psmi_strdup(NULL, shmbuf);
+		if (amsh_keyname == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		shmfd =
+		    shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+		if (shmfd < 0) {
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error creating shared "
+							"memory object in "
+							"shm_open: %s",
+							strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object "
+							"with fstat: %s",
+							strerror(errno));
+				goto fail;
+			}
+			/* Only accept an object we own. */
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(shmfd);
+			}
+		}
+	}
+	if (err) {
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error creating shared memory object "
+					"in shm_open: namespace exhausted.");
+		goto fail;
+	}
+
+	/* Now register the atexit handler for cleanup, whether master or slave */
+	atexit(amsh_atexit);
+
+	_HFI_PRDBG("Opened shmfile %s\n", amsh_keyname);
+
+	if (ftruncate(shmfd, segsz) != 0) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+			"Error setting size of shared memory object to %u bytes in "
+			"ftruncate: %s\n",
+			(uint32_t) segsz,
+			strerror(errno));
+		goto fail;
+	}
+
+	mapptr = mmap(NULL, segsz,
+		      PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0);
+	if (mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping shared memory: %s",
+					strerror(errno));
+		goto fail;
+	}
+	close(shmfd);
+	memset((void *) mapptr, 0, segsz); /* touch all of my pages */
+
+	/* Our own ep's info for ptl_am resides at the start of the
+	   shm object. Other processes need some of this info to
+	   understand the rest of the queue structure and other details. */
+	ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr;
+	ptl->amsh_keyname = amsh_keyname;
+	ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr;
+
+fail:
+	return err;
+}
+
+/* Grow the per-peer directory by AMSH_DIRBLOCK_SIZE entries: copy the
+ * existing entries into a larger 64-byte-aligned array, zero the new
+ * tail, and free the old array.  On PSM2_NO_MEMORY the old directory
+ * is left untouched. */
+psm2_error_t psmi_epdir_extend(ptl_t *ptl)
+{
+	struct am_ctl_nodeinfo *new = NULL;
+
+	new = (struct am_ctl_nodeinfo *)
+		psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+			      (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) *
+			      sizeof(struct am_ctl_nodeinfo));
+	if (new == NULL)
+		return PSM2_NO_MEMORY;
+
+	memcpy(new, ptl->am_ep,
+	       ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+	memset(new + ptl->am_ep_size, 0,
+	       AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo));
+
+	psmi_free(ptl->am_ep);
+	ptl->am_ep = new;
+	ptl->am_ep_size += AMSH_DIRBLOCK_SIZE;
+
+	return PSM2_OK;
+}
+
+/**
+ * Unmap shm regions upon proper disconnect with other processes.
+ * Returns PSM2_OK on success, otherwise the error reported by
+ * psmi_handle_error() for the failed munmap.
+ */
+psm2_error_t psmi_do_unmap(uintptr_t shmbase)
+{
+	if (munmap((void *)shmbase, am_ctl_sizeof_block()) == 0)
+		return PSM2_OK;
+
+	return psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				 "Error with munmap of shared segment: %s",
+				 strerror(errno));
+}
+
+/**
+ * Map a remote process' shared memory object.
+ *
+ * If the remote process has a shared memory object available, add it to our own
+ * directory and return the shmidx. If the shared memory object does not exist,
+ * return -1, and the connect poll function will try to map again later.
+ *
+ * While the fresh mapping is probed, a SIGSEGV/SIGBUS handler
+ * (amsh_mmap_fault) guards against a stale/truncated backing file;
+ * the previous handlers are restored before returning.
+ */
+psm2_error_t psmi_shm_map_remote(ptl_t *ptl, psm2_epid_t epid, uint16_t *shmidx_o)
+{
+	int i;
+	int use_kassist;
+	uint16_t shmidx;
+	char shmbuf[256];
+	void *dest_mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int dest_shmfd;
+	struct am_ctl_nodeinfo *dest_nodeinfo;
+	int iterator;
+
+	shmidx = *shmidx_o = -1;
+
+	/* Already in the directory?  Return the existing index. */
+	for (i = 0; i <= ptl->max_ep_idx; i++) {
+		if (ptl->am_ep[i].epid == epid) {
+			*shmidx_o = shmidx = i;
+			return err;
+		}
+	}
+
+
+	use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF);
+
+	segsz = am_ctl_sizeof_block();
+	/* Probe the shm namespace with the same naming scheme used by
+	 * psmi_shm_create() until we find the object owned by our uid. */
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm2_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 epid,
+			 iterator);
+		dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU);
+		if (dest_shmfd < 0) {
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error opening remote "
+							"shared memory object "
+							"in shm_open: %s",
+							strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(dest_shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object "
+							"with fstat: %s",
+							strerror(errno));
+				goto fail;
+			}
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(dest_shmfd);
+			}
+		}
+	}
+	if (err) {
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error opening remote shared "
+					"memory object in shm_open: "
+					"namespace exhausted.");
+		goto fail;
+	}
+
+	dest_mapptr = mmap(NULL, segsz,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0);
+	if (dest_mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping remote shared memory: %s",
+					strerror(errno));
+		goto fail;
+	}
+	close(dest_shmfd);
+	dest_nodeinfo = (struct am_ctl_nodeinfo *)dest_mapptr;
+
+	/* We core dump right after here if we don't check the mmap */
+	action_stash.addr = dest_mapptr;
+	action_stash.len = segsz;
+
+	struct sigaction act;
+	act.sa_sigaction = amsh_mmap_fault;
+	act.sa_flags = SA_SIGINFO;
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+	{
+		/* Spin until the remote side has published its block. */
+		volatile uint16_t *is_init = &dest_nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		ips_sync_reads();
+		/* Fixed format string: was "size=%dn" (literal 'n', no
+		 * newline). */
+		_HFI_PRDBG("Got a published remote dirpage page at "
+			   "%p, size=%d\n", dest_mapptr, (int)segsz);
+	}
+
+	shmidx = -1;
+	/* Grow the directory if full, then refresh existing entries. */
+	if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) {
+		err = psmi_epdir_extend(ptl);
+		if (err)
+			goto fail;
+
+		for (i = 0; i <= ptl->max_ep_idx; i++) {
+			if (ptl->am_ep[i].epid != 0)
+				am_update_directory(&ptl->am_ep[i]);
+		}
+	}
+	/* Claim the first free directory slot for this epid. */
+	for (i = 0; i < ptl->am_ep_size; i++) {
+		psmi_assert(ptl->am_ep[i].epid != epid);
+		if (ptl->am_ep[i].epid == 0) {
+			ptl->am_ep[i].epid = epid;
+			ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno;
+			ptl->am_ep[i].pid = dest_nodeinfo->pid;
+			if (use_kassist) {
+				/* If we are able to use CMA assume everyone
+				 * else on the node can also use it.
+				 * Advertise that CMA is active via the
+				 * feature flag.
+				 */
+
+				if (cma_available()) {
+					ptl->am_ep[i].amsh_features |=
+						AMSH_HAVE_CMA;
+					psmi_shm_mq_rv_thresh =
+						PSMI_MQ_RV_THRESH_CMA;
+				} else {
+					ptl->psmi_kassist_mode =
+						PSMI_KASSIST_OFF;
+					use_kassist = 0;
+					psmi_shm_mq_rv_thresh =
+						PSMI_MQ_RV_THRESH_NO_KASSIST;
+				}
+			} else
+				psmi_shm_mq_rv_thresh =
+					PSMI_MQ_RV_THRESH_NO_KASSIST;
+			_HFI_PRDBG("KASSIST MODE: %s\n",
+				   psmi_kassist_getmode(ptl->psmi_kassist_mode));
+			shmidx = *shmidx_o = i;
+			_HFI_PRDBG("Mapped epid %lx into shmidx %d\n", epid, shmidx);
+			ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr;
+			ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes;
+			if (i > ptl->max_ep_idx)
+				ptl->max_ep_idx = i;
+			break;
+		}
+	}
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+	if (shmidx == (uint16_t)-1)
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Could not connect to local endpoint");
+fail:
+	return err;
+}
+
+/**
+ * Initialize pointer structure and locks for endpoint shared-memory AM.
+ */
+
+#define AMSH_QSIZE(type) \
+	PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \
+		     PSMI_PAGESIZE)
+
+/**
+ * Create this endpoint's shared-memory segment and lay out its short/long
+ * request and reply FIFOs.
+ *
+ * A SIGSEGV/SIGBUS handler is installed around the first touches of the
+ * freshly mapped block so that a bad mapping is reported instead of
+ * core dumping; the previous handlers are restored before returning.
+ *
+ * @param ptl  Shared-memory PTL; must have a valid ep/epaddr/epid.
+ * @return PSM2_OK, or the error from psmi_shm_create().
+ */
+static psm2_error_t amsh_init_segment(ptl_t *ptl)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Preconditions */
+	psmi_assert_always(ptl != NULL);
+	psmi_assert_always(ptl->ep != NULL);
+	psmi_assert_always(ptl->epaddr != NULL);
+	psmi_assert_always(ptl->ep->epid != 0);
+
+	if ((err = psmi_shm_create(ptl)))
+		goto fail;
+
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+
+	/* We core dump right after here if we don't check the mmap */
+
+	struct sigaction act;
+	/* BUGFIX: 'act' was previously passed to sigaction() with an
+	 * indeterminate sa_mask (and any other fields beyond the two set
+	 * below).  Zero the structure and start from an empty signal mask. */
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_sigaction = amsh_mmap_fault;
+	act.sa_flags = SA_SIGINFO;
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+	/*
+	 * Now that we know our epid, update it in the shmidx array
+	 */
+	ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL;
+	ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL;
+
+	am_update_directory(ptl->self_nodeinfo);
+
+	ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort));
+	ptl->reqH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) +
+		 amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort);
+
+	ptl->repH.head = ptl->repH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort));
+	ptl->repH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) +
+		 amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort);
+
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq,
+			 amsh_qcounts.qreqFifoShort,
+			 amsh_qelemsz.qreqFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq,
+			 amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq,
+			 amsh_qcounts.qrepFifoShort,
+			 amsh_qelemsz.qrepFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq,
+			 amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong);
+
+	/* Set bulkidx in every bulk packet */
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong,
+			    amsh_qelemsz.qreqFifoLong,
+			    amsh_qcounts.qreqFifoLong);
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong,
+			    amsh_qelemsz.qrepFifoLong,
+			    amsh_qcounts.qrepFifoLong);
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+fail:
+	return err;
+}
+
+/**
+ * Unlink and unmap this endpoint's own shared-memory segment.
+ *
+ * No-op if the segment was never created (self_nodeinfo == NULL).
+ * On success, self_nodeinfo is cleared so a second detach is harmless.
+ *
+ * @return PSM2_OK, or PSM2_SHMEM_SEGMENT_ERR if munmap() fails.
+ */
+psm2_error_t psmi_shm_detach(ptl_t *ptl)
+{
+	psm2_error_t err = PSM2_OK;
+	uintptr_t shmbase;
+
+	if (ptl->self_nodeinfo == NULL)
+		return err;
+
+	/* +1 skips the leading '/' of the shm object name for display */
+	_HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1);
+	shmbase = ptl->self_nodeinfo->amsh_shmbase;
+	shm_unlink(ptl->amsh_keyname);
+	psmi_free(ptl->amsh_keyname);
+	/* BUGFIX: clear the freed name so later code cannot dereference
+	 * or double-free a dangling pointer. */
+	ptl->amsh_keyname = NULL;
+
+	if (munmap((void *)shmbase, am_ctl_sizeof_block())) {
+		err =
+		    psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				      "Error with munmap of shared segment: %s",
+				      strerror(errno));
+		goto fail;
+	}
+	ptl->self_nodeinfo = NULL;
+	return PSM2_OK;
+
+fail:
+	return err;
+}
+
+/**
+ * Update locally shared-pointer directory. The directory must be
+ * updated when a new epaddr is connected to or on every epaddr already
+ * connected to whenever the shared memory segment is relocated via mremap.
+ *
+ * @param epaddr Endpoint address for which to update local directory.
+ */
+
+static
+void am_update_directory(struct am_ctl_nodeinfo *nodeinfo)
+{
+ uintptr_t base_this;
+
+ base_this = nodeinfo->amsh_shmbase +
+ AMSH_BLOCK_HEADER_SIZE;
+
+ /* Request queues */
+ nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this;
+ nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+
+ nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoShort +
+ nodeinfo->amsh_qsizes.qreqFifoShort);
+
+ /* Reply queues */
+ nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoLong +
+ nodeinfo->amsh_qsizes.qreqFifoLong);
+
+ nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+ nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepFifoShort +
+ nodeinfo->amsh_qsizes.qrepFifoShort);
+
+ _HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qreqH,
+ nodeinfo->qdir.qreqFifoShort,
+ nodeinfo->qdir.qreqFifoLong);
+ _HFI_VDBG("epaddr=%p Reply Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qrepH,
+ nodeinfo->qdir.qrepFifoShort,
+ nodeinfo->qdir.qrepFifoLong);
+
+ /* Sanity check */
+ uintptr_t base_next =
+ (uintptr_t) nodeinfo->qdir.qrepFifoLong +
+ nodeinfo->amsh_qsizes.qrepFifoLong;
+
+ psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block());
+}
+
+
+/* Thin wrapper over psm2_ep_epid_share_memory(): returns nonzero when
+ * 'epid' lives on the same node and is reachable via shared memory. */
+static
+int amsh_epid_reachable(ptl_t *ptl, psm2_epid_t epid)
+{
+	int shares_memory = 0;
+	psm2_error_t rc = psm2_ep_epid_share_memory(ptl->ep, epid,
+						    &shares_memory);
+	psmi_assert_always(rc == PSM2_OK);
+	return shares_memory;
+}
+
+/**
+ * Allocate and register an am_epaddr_t for peer 'epid' at directory
+ * slot 'shmidx'.
+ *
+ * On success *epaddr_o holds the new address, the directory slot is
+ * published and the epid is added to the ep lookup table.  On failure
+ * everything is rolled back and the allocation freed.
+ *
+ * @return PSM2_OK, PSM2_NO_MEMORY, or an error from the helpers below.
+ */
+static
+psm2_error_t
+amsh_epaddr_add(ptl_t *ptl, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o)
+{
+	psm2_epaddr_t epaddr;
+	am_epaddr_t *amaddr;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL);
+
+	/* The self PTL handles loopback communication. */
+	psmi_assert(epid != ptl->epid);
+
+	/* note the size of the memory is am_epaddr_t */
+	epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep,
+					     PER_PEER_ENDPOINT, 1,
+					     sizeof(am_epaddr_t));
+	if (epaddr == NULL) {
+		return PSM2_NO_MEMORY;
+	}
+	psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL);
+
+	if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid),
+					  psmi_gethostname(), 0)))
+		goto fail;
+
+	epaddr->ptlctl = ptl->ctl;
+	epaddr->epid = epid;
+
+	/* convert to am_epaddr_t */
+	amaddr = (am_epaddr_t *) epaddr;
+	/* tell the other endpoint their location in our directory */
+	amaddr->_shmidx = shmidx;
+	/* we haven't connected yet, so we can't give them the same hint */
+	amaddr->_return_shmidx = -1;
+	AMSH_CSTATE_OUTGOING_SET(amaddr, NONE);
+	AMSH_CSTATE_INCOMING_SET(amaddr, NONE);
+
+	/* other setup */
+	ptl->am_ep[shmidx].epaddr = epaddr;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	/* Finally, add to table */
+	if ((err = psmi_epid_add(ptl->ep, epid, epaddr)))
+		goto fail;
+	_HFI_VDBG("epaddr=%s added to ptl=%p\n",
+		  psmi_epaddr_get_name(epid), ptl);
+	*epaddr_o = epaddr;
+	return PSM2_OK;
+fail:
+	/* BUGFIX: if psmi_epid_add() failed, the directory slot was already
+	 * published above; clear it so it does not dangle after the free. */
+	if (ptl->am_ep[shmidx].epaddr == epaddr)
+		ptl->am_ep[shmidx].epaddr = NULL;
+	if (epaddr != ptl->epaddr)
+		psmi_free(epaddr);
+	return err;
+}
+
+/* Re-synchronize a connected peer's directory entry after its shm file
+ * was re-created (e.g. the peer restarted): reset the outgoing
+ * connection state, wait for the peer to publish its nodeinfo page
+ * again, then re-read version/pid/queue sizes and rebuild the
+ * directory pointers. */
+static
+void
+amsh_epaddr_update(ptl_t *ptl, psm2_epaddr_t epaddr)
+{
+	am_epaddr_t *amaddr;
+	uint16_t shmidx;
+	struct am_ctl_nodeinfo *nodeinfo;
+
+	amaddr = (am_epaddr_t *) epaddr;
+	shmidx = amaddr->_shmidx;
+	nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase;
+
+	/* restart the connection process */
+	amaddr->_return_shmidx = -1;
+	AMSH_CSTATE_OUTGOING_SET(amaddr, NONE);
+
+	/* wait for the other process to init again */
+	{
+		/* volatile read: is_init is written by the peer process */
+		volatile uint16_t *is_init = &nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		/* order the is_init read before reading the rest of the page */
+		ips_sync_reads();
+	}
+
+	/* get the updated values from the new nodeinfo page */
+	ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno;
+	ptl->am_ep[shmidx].pid = nodeinfo->pid;
+	ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	return;
+}
+
+/* Tracks one in-flight connect/disconnect request across the
+ * init/poll/fini state machine below.  One instance covers a whole
+ * array of peer epids. */
+struct ptl_connection_req {
+	int isdone;		/* all peers reached a terminal state */
+	int op;			/* connect or disconnect */
+	int numep;		/* total entries in the arrays below */
+	int numep_left;		/* entries not yet in a terminal state */
+	int phase;		/* ptl->connect_phase captured at init */
+
+	int *epid_mask;		/* per-peer AMSH_CMASK_* progress state */
+	const psm2_epid_t *epids;	/* input epid list */
+	psm2_epaddr_t *epaddr;
+	psm2_error_t *errors;	/* inout errors */
+
+	/* Used for connect/disconnect */
+	psm2_amarg_t args[4];
+};
+
+/* Remove 'epaddr' from its endpoint's epid lookup table and release
+ * its storage.  Caller must ensure no further references exist. */
+static
+void amsh_free_epaddr(psm2_epaddr_t epaddr)
+{
+	psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+	psmi_free(epaddr);
+	return;
+}
+
+/* Operation codes driving the connreq state machine below. */
+#define PTL_OP_CONNECT      0
+#define PTL_OP_DISCONNECT   1
+#define PTL_OP_ABORT        2
+
+/* First phase of the connect/disconnect state machine: classify every
+ * masked-in peer and build a ptl_connection_req.
+ *
+ * Each peer ends in one of: AMSH_CMASK_NONE (nothing to do, result
+ * already recorded in array_of_errors) or AMSH_CMASK_PREREQ (work
+ * remains for the poll phase).
+ *
+ * Returns PSM2_OK with *req_o == NULL when there is nothing left to do
+ * (the connect phase counter is bumped), PSM2_OK_NO_PROGRESS with a
+ * live request otherwise, or PSM2_NO_MEMORY. */
+static
+psm2_error_t
+amsh_ep_connreq_init(ptl_t *ptl, int op, /* connect, disconnect or abort */
+		     int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr,
+		     struct ptl_connection_req **req_o)
+{
+	int i, cstate;
+	psm2_epaddr_t epaddr;
+	psm2_epid_t epid;
+	struct ptl_connection_req *req = NULL;
+
+	req = (struct ptl_connection_req *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1,
+			sizeof(struct ptl_connection_req));
+	if (req == NULL)
+		return PSM2_NO_MEMORY;
+	req->isdone = 0;
+	req->op = op;
+	req->numep = numep;
+	req->numep_left = 0;
+	req->phase = ptl->connect_phase;
+	req->epid_mask = (int *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int));
+	if (req->epid_mask == NULL) {
+		psmi_free(req);
+		return PSM2_NO_MEMORY;
+	}
+	req->epaddr = array_of_epaddr;
+	req->epids = array_of_epid;
+	req->errors = array_of_errors;
+
+	/* First check if there's really something to connect/disconnect
+	 * for this PTL */
+	for (i = 0; i < numep; i++) {
+		req->epid_mask[i] = AMSH_CMASK_NONE;	/* no connect by default */
+		if (!array_of_epid_mask[i])
+			continue;
+		if (op == PTL_OP_CONNECT) {
+			epid = array_of_epid[i];
+
+			/* Connect only to other processes reachable by shared memory.
+			   The self PTL handles loopback communication, so explicitly
+			   refuse to connect to self. */
+			if (!amsh_epid_reachable(ptl, epid)
+			    || epid == ptl->epid) {
+				array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+				array_of_epaddr[i] = NULL;
+				continue;
+			}
+
+			_HFI_VDBG("looking at epid %llx\n",
+				  (unsigned long long)epid);
+			epaddr = psmi_epid_lookup(ptl->ep, epid);
+			if (epaddr != NULL) {
+				/* already known, but owned by another PTL? */
+				if (epaddr->ptlctl->ptl != ptl) {
+					array_of_errors[i] =
+					    PSM2_EPID_UNREACHABLE;
+					array_of_epaddr[i] = NULL;
+					continue;
+				}
+				cstate =
+				    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+				if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+					/* already connected: done immediately */
+					array_of_epaddr[i] = epaddr;
+					array_of_errors[i] = PSM2_OK;
+				} else {
+					/* stale epaddr: retry the handshake;
+					 * PSM2_TIMEOUT is the provisional result */
+					psmi_assert(cstate ==
+						    AMSH_CSTATE_OUTGOING_NONE);
+					array_of_errors[i] = PSM2_TIMEOUT;
+					array_of_epaddr[i] = epaddr;
+					req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				}
+			} else {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				array_of_epaddr[i] = NULL;
+			}
+		} else {	/* disc or abort */
+			epaddr = array_of_epaddr[i];
+			psmi_assert(epaddr != NULL);
+			cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+			if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				_HFI_VDBG
+				    ("Just set index %d to AMSH_CMASK_PREREQ\n",
+				     i);
+			}
+			/* XXX undef ? */
+		}
+		if (req->epid_mask[i] != AMSH_CMASK_NONE)
+			req->numep_left++;
+	}
+
+	if (req->numep_left == 0) {	/* nothing to do */
+		psmi_free(req->epid_mask);
+		psmi_free(req);
+		_HFI_VDBG("Nothing to connect, bump up phase\n");
+		ptl->connect_phase++;
+		*req_o = NULL;
+		return PSM2_OK;
+	} else {
+		*req_o = req;
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+/* Second phase of the connect/disconnect state machine: advance every
+ * peer through PREREQ -> POSTREQ -> DONE.
+ *
+ * Connect: in PREREQ, locate (or map, via psmi_shm_map_remote) the
+ * peer's shm segment, check version interoperability, create the
+ * epaddr if needed, and send a PSMI_AM_CONN_REQ; in POSTREQ, wait for
+ * the outgoing state to reach REPLIED, then mark ESTABLISHED.
+ *
+ * Disconnect/abort: in PREREQ, send a PSMI_AM_DISC_REQ (and unmap the
+ * peer segment if the peer already requested disconnect from us); in
+ * POSTREQ, wait for DISC_REPLIED.
+ *
+ * Returns PSM2_OK when every peer is done, PSM2_OK_NO_PROGRESS while
+ * work remains, or a hard error. */
+static
+psm2_error_t
+amsh_ep_connreq_poll(ptl_t *ptl, struct ptl_connection_req *req)
+{
+	int i, j, cstate;
+	uint16_t shmidx = (uint16_t)-1;
+	psm2_error_t err = PSM2_OK;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+
+	if (req == NULL || req->isdone)
+		return PSM2_OK;
+
+	psmi_assert_always(ptl->connect_phase == req->phase);
+
+	if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+		for (i = 0; i < req->numep; i++) {
+			if (req->epid_mask[i] == AMSH_CMASK_NONE ||
+			    req->epid_mask[i] == AMSH_CMASK_DONE)
+				continue;
+
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+				/* Make sure the target of the disconnect is still there */
+				if (ptl->am_ep[shmidx].
+				    epid != epaddr->epid) {
+					/* peer already gone: nothing to send */
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *)
+								 epaddr, NONE);
+				}
+			}
+
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				req->args[0].u32w0 = PSMI_AM_DISC_REQ;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				/* shmidx was set in the branch above */
+				psmi_assert(shmidx != (uint16_t)-1);
+				req->args[2].u16w0 = shmidx;
+				req->args[2].u32w1 = PSM2_OK;
+				req->args[3].u64w0 =
+				    (uint64_t) (uintptr_t) &req->errors[i];
+				psmi_amsh_short_request(ptl, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 DISC_REQUESTED);
+				/**
+				 * Only munmap if we have nothing more to
+				 * communicate with the other node, i.e. we
+				 * already received a disconnect req from the
+				 * other node.
+				 */
+				if (AMSH_CSTATE_INCOMING_GET((am_epaddr_t *) epaddr) ==
+					AMSH_CSTATE_INCOMING_DISC_REQUESTED)
+					err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+			} else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+				cstate =
+				    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+				if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) {
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *)
+								 epaddr, NONE);
+				}
+			}
+		}
+	} else {
+		/* First see if we've made progress on any postreqs */
+		int n_prereq = 0;
+		for (i = 0; i < req->numep; i++) {
+			/* NOTE(review): this inner 'cstate' shadows the one
+			 * declared at function scope. */
+			int cstate;
+			if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) {
+				if (req->epid_mask[i] == AMSH_CMASK_PREREQ)
+					n_prereq++;
+				continue;
+			}
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+
+			/* detect if a race has occurred on due to re-using an
+			 * old shm file - if so, restart the connection */
+			shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+			if (ptl->am_ep[shmidx].pid !=
+			    ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 NONE);
+				n_prereq++;
+				amsh_epaddr_update(ptl, epaddr);
+				continue;
+			}
+
+			cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 ESTABLISHED);
+				req->epid_mask[i] = AMSH_CMASK_DONE;
+				continue;
+			}
+		}
+		if (n_prereq > 0) {
+			psmi_assert(req->numep_left > 0);
+			/* Go through the list of peers we need to connect to and find out
+			 * if they each shared ep is mapped into shm */
+			for (i = 0; i < req->numep; i++) {
+				if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+					continue;
+				epid = req->epids[i];
+				epaddr = req->epaddr[i];
+				/* Go through mapped epids and find the epid we're looking for */
+				for (shmidx = -1, j = 0;
+				     j <= ptl->max_ep_idx; j++) {
+					/* epid is connected and ready to go */
+					if (ptl->am_ep[j].
+					    epid == epid) {
+						shmidx = j;
+						break;
+					}
+				}
+				if (shmidx == (uint16_t)-1) {
+					/* Couldn't find peer's epid in dirpage.
+					   Check shmdir to see if epid is up now. */
+					if ((err = psmi_shm_map_remote(ptl, epid, &shmidx))) {
+						return err;
+					}
+					continue;
+				}
+				/* Before we even send the request out, check to see if
+				 * versions are interoperable */
+				if (!psmi_verno_isinteroperable
+				    (ptl->am_ep[shmidx].
+				     psm_verno)) {
+					char buf[32];
+					uint16_t their_verno =
+					    ptl->am_ep[shmidx].
+					    psm_verno;
+					snprintf(buf, sizeof(buf), "%d.%d",
+						 PSMI_VERNO_GET_MAJOR
+						 (their_verno),
+						 PSMI_VERNO_GET_MINOR
+						 (their_verno));
+
+					_HFI_INFO("Local endpoint id %" PRIx64
+						  " has version %s "
+						  "which is not supported by library version %d.%d",
+						  epid, buf, PSM2_VERNO_MAJOR,
+						  PSM2_VERNO_MINOR);
+					req->errors[i] =
+					    PSM2_EPID_INVALID_VERSION;
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					continue;
+				}
+				if (epaddr != NULL) {
+					psmi_assert(((am_epaddr_t *) epaddr)->
+						    _shmidx == shmidx);
+				} else
+				    if ((epaddr =
+					 psmi_epid_lookup(ptl->ep,
+							  epid)) == NULL) {
+					if ((err =
+					     amsh_epaddr_add(ptl, epid, shmidx,
+							     &epaddr))) {
+						return err;
+					}
+				}
+				req->epaddr[i] = epaddr;
+				req->args[0].u32w0 = PSMI_AM_CONN_REQ;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				/* tell the other process its shmidx here */
+				req->args[2].u16w0 = shmidx;
+				req->args[2].u32w1 = PSM2_OK;
+				req->args[3].u64w0 =
+				    (uint64_t) (uintptr_t) &req->errors[i];
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+				psmi_amsh_short_request(ptl, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				_HFI_PRDBG("epaddr=%p, epid=%" PRIx64
+					   " at shmidx=%d\n", epaddr, epid,
+					   shmidx);
+			}
+		}
+	}
+
+	if (req->numep_left == 0) {	/* we're all done */
+		req->isdone = 1;
+		return PSM2_OK;
+	} else {
+		sched_yield();
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+/* Final phase of the connect/disconnect state machine: force every
+ * remaining peer into a terminal state, record timeouts for any that
+ * never progressed, aggregate the per-peer errors into the return
+ * value, and free the request.
+ *
+ * Bumping ptl->connect_phase first invalidates any replies still in
+ * flight so they cannot reference the freed request. */
+static
+psm2_error_t
+amsh_ep_connreq_fini(ptl_t *ptl, struct ptl_connection_req *req)
+{
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	/* Wherever we are at in our connect process, we've been instructed to
+	 * finish the connection process */
+	if (req == NULL)
+		return PSM2_OK;
+
+	/* This prevents future connect replies from referencing data structures
+	 * that disappeared */
+	ptl->connect_phase++;
+
+	/* First process any leftovers in postreq or prereq */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE)
+			continue;
+		else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+			int cstate;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+			cstate =
+			    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) (req->
+								      epaddr[i]));
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) (req->
+									  epaddr[i]),
+							 ESTABLISHED);
+			} else {	/* never actually got reply */
+				req->errors[i] = PSM2_TIMEOUT;
+			}
+		}
+		/* If we couldn't go from prereq to postreq, that means we couldn't
+		 * find the shmidx for an epid in time. This can only be a case of
+		 * time out */
+		else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+			req->errors[i] = PSM2_TIMEOUT;
+			req->numep_left--;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+		}
+	}
+
+	/* Whatever is left can only be in DONE or NONE state */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE)
+			continue;
+		psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE);
+
+		/* keep the most severe of the accumulated errors */
+		err = psmi_error_cmp(err, req->errors[i]);
+		/* XXX TODO: Report errors in connection. */
+		/* Only free epaddr if they have disconnected from us */
+		int cstate = AMSH_CSTATE_INCOMING_GET((am_epaddr_t *) req->epaddr[i]);
+		if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) {
+			if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+				psmi_assert(req->epaddr[i] != NULL);
+				amsh_free_epaddr(req->epaddr[i]);
+				req->epaddr[i] = NULL;
+			}
+		}
+	}
+
+	psmi_free(req->epid_mask);
+	psmi_free(req);
+
+	return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect. The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD  20
+/**
+ * Drive the init/poll/fini connect-or-disconnect state machine until
+ * completion or timeout.  PSM2_SHM_POLITE_ATTACH=1 in the environment
+ * enables yielding after repeated no-progress polls.
+ */
+static
+psm2_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl, int op,
+		     int numep,
+		     const psm2_epid_t *array_of_epid,
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	psm2_error_t err;
+	uint64_t t_start;
+	struct ptl_connection_req *req;
+	int num_polls_noprogress = 0;
+	static int shm_polite_attach = -1;
+
+	if (shm_polite_attach == -1) {
+		char *p = getenv("PSM2_SHM_POLITE_ATTACH");
+		if (p && *p && atoi(p) != 0) {
+			fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+				psmi_gethostname());
+			shm_polite_attach = 1;
+		} else {
+			/* BUGFIX: this assignment was unconditional, clobbering
+			 * the '1' set just above, so polite attach could never
+			 * actually engage. */
+			shm_polite_attach = 0;
+		}
+	}
+
+	/* Initialize */
+	err = amsh_ep_connreq_init(ptl, op, numep,
+				   array_of_epid, array_of_epid_mask,
+				   array_of_errors, array_of_epaddr, &req);
+	if (err != PSM2_OK_NO_PROGRESS)	/* Either we're all done with connect or
+					 * there was an error */
+		return err;
+
+	/* Poll until either
+	 * 1. We time out
+	 * 2. We are done with connecting
+	 */
+	t_start = get_cycles();
+	do {
+		psmi_poll_internal(ptl->ep, 1);
+		err = amsh_ep_connreq_poll(ptl, req);
+		if (err == PSM2_OK)
+			break;	/* Finished before timeout */
+		else if (err != PSM2_OK_NO_PROGRESS) {
+			psmi_free(req->epid_mask);
+			psmi_free(req);
+			goto fail;
+		} else if (shm_polite_attach &&
+			   ++num_polls_noprogress ==
+			   CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+			num_polls_noprogress = 0;
+			PSMI_YIELD(ptl->ep->mq->progress_lock);
+		}
+	}
+	while (psmi_cycles_left(t_start, timeout_ns));
+
+	err = amsh_ep_connreq_fini(ptl, req);
+
+fail:
+	return err;
+}
+
+/* PTL entry point for connection establishment: runs the generic
+ * connreq state machine with the CONNECT opcode. */
+static
+psm2_error_t
+amsh_ep_connect(ptl_t *ptl,
+		int numep,
+		const psm2_epid_t *array_of_epid,
+		const int array_of_epid_mask[],
+		psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	const int op = PTL_OP_CONNECT;
+	return amsh_ep_connreq_wrap(ptl, op, numep, array_of_epid,
+				    array_of_epid_mask, array_of_errors,
+				    array_of_epaddr, timeout_ns);
+}
+
+/* PTL entry point for teardown: 'force' selects an abortive close,
+ * otherwise an orderly disconnect handshake is performed. */
+static
+psm2_error_t
+amsh_ep_disconnect(ptl_t *ptl, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_ns)
+{
+	const int op = force ? PTL_OP_ABORT : PTL_OP_DISCONNECT;
+	return amsh_ep_connreq_wrap(ptl, op, numep, NULL,
+				    array_of_epaddr_mask, array_of_errors,
+				    array_of_epaddr, timeout_ns);
+}
+
+#undef CSWAP
+/* 32-bit compare-and-swap via x86 LOCK CMPXCHG: atomically stores
+ * new_value into *p iff *p == old_value.  Returns the value observed
+ * in *p, which equals old_value exactly when the swap happened.
+ * (CMPXCHG implicitly uses/updates EAX, hence the "+a" constraint.) */
+PSMI_ALWAYS_INLINE(
+int32_t
+cswap(volatile int32_t *p, int32_t old_value, int32_t new_value))
+{
+	asm volatile ("lock cmpxchg %2, %0" :
+		      "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+	return old_value;
+}
+
+/* Claim the next free slot from a circular packet FIFO, or return NULL
+ * if the slot at the tail is still in use.
+ *
+ * Two implementations: a pthread-spinlock path (default, CSWAP is
+ * #undef'd above) that backs off when the tail slot is busy, and a
+ * lock-free path that advances the tail with compare-and-swap and then
+ * spins until the claimed slot is actually free. */
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0))
+{
+	am_pkt_short_t *pkt;
+	uint32_t idx;
+#ifndef CSWAP
+	pthread_spin_lock(&shq->lock);
+	idx = shq->tail;
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	if (pkt->flag == QFREE) {
+		/* order the flag read before claiming and writing the slot */
+		ips_sync_reads();
+		pkt->flag = QUSED;
+		shq->tail += 1;
+		if (shq->tail == shq->elem_cnt)
+			shq->tail = 0;
+	} else {
+		/* tail slot still owned by the consumer: caller must retry */
+		pkt = 0;
+	}
+	pthread_spin_unlock(&shq->lock);
+#else
+	uint32_t idx_next;
+	do {
+		idx = shq->tail;
+		idx_next = (idx + 1 == shq->elem_cnt) ? 0 : idx + 1;
+	} while (cswap(&shq->tail, idx, idx_next) != idx);
+
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	/* busy-wait until the claimed slot is released by the consumer */
+	while (cswap(&pkt->flag, QFREE, QUSED) != QFREE);
+#endif
+	return pkt;
+}
+
+/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */
+#define am_ctl_getslot_bulkpkt_inner(shq, pkt0)	((am_pkt_bulk_t *) \
+	am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0)))
+
+/* Grab a free short-packet slot from peer 'shmidx', choosing the
+ * request or reply FIFO according to 'is_reply'.  NULL if full. */
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt(ptl_t *ptl, uint16_t shmidx, int is_reply))
+{
+	struct am_ctl_nodeinfo *peer = &ptl->am_ep[shmidx];
+	volatile am_ctl_qhdr_t *shq = is_reply
+	    ? &peer->qdir.qrepH->shortq
+	    : &peer->qdir.qreqH->shortq;
+	am_pkt_short_t *pkt0 = is_reply
+	    ? peer->qdir.qrepFifoShort
+	    : peer->qdir.qreqFifoShort;
+	return am_ctl_getslot_pkt_inner(shq, pkt0);
+}
+
+/* Grab a free long/bulk-packet slot from peer 'shmidx', choosing the
+ * request or reply FIFO according to 'is_reply'.  NULL if full. */
+PSMI_ALWAYS_INLINE(
+am_pkt_bulk_t *
+am_ctl_getslot_long(ptl_t *ptl, uint16_t shmidx, int is_reply))
+{
+	struct am_ctl_nodeinfo *peer = &ptl->am_ep[shmidx];
+	volatile am_ctl_qhdr_t *shq = is_reply
+	    ? &peer->qdir.qrepH->longbulkq
+	    : &peer->qdir.qreqH->longbulkq;
+	am_pkt_bulk_t *pkt0 = is_reply
+	    ? peer->qdir.qrepFifoLong
+	    : peer->qdir.qreqFifoLong;
+	return am_ctl_getslot_bulkpkt_inner(shq, pkt0);
+}
+
+/* Built-in AM handler table.  The array index of each entry is its
+ * wire handler index (e.g. amsh_conn_handler_hidx), so the order here
+ * is part of the protocol and must not change; slot 0 is reserved. */
+psmi_handlertab_t psmi_allhandlers[] = {
+	{0}
+	,
+	{amsh_conn_handler}
+	,
+	{psmi_am_mq_handler}
+	,
+	{psmi_am_mq_handler_data}
+	,
+	{psmi_am_mq_handler_rtsmatch}
+	,
+	{psmi_am_mq_handler_rtsdone}
+	,
+	{psmi_am_handler}
+};
+
+/* Release the slot just consumed and step the ring cursor, wrapping
+ * back to the base when it runs off the end. */
+PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr))
+{
+	QMARKFREE(hdr->head);
+	hdr->head = (hdr->head + 1 == hdr->end) ? hdr->base : hdr->head + 1;
+}
+
+#define AMSH_ZERO_POLLS_BEFORE_YIELD  64
+#define AMSH_POLLS_BEFORE_PSM_POLL    16
+
+/* XXX this can be made faster. Instead of checking the flag of the head, keep
+ * a cached copy of the integer value of the tail and compare it against the
+ * previous one we saw.
+ */
+/* Core progress routine: drain the reply FIFO, then (unless replyonly)
+ * the deferred request queue and the request FIFO.  When called
+ * internally it also applies yield backoff after repeated empty polls
+ * and periodically kicks the global PSM poll loop.
+ * Returns PSM2_OK if any packet was processed, PSM2_OK_NO_PROGRESS
+ * otherwise. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_poll_internal_inner(ptl_t *ptl, int replyonly,
+			 int is_internal))
+{
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+	/* poll replies */
+	if (!QISEMPTY(ptl->repH.head->flag)) {
+		do {
+			/* order the flag read before reading packet contents */
+			ips_sync_reads();
+			process_packet(ptl, (am_pkt_short_t *) ptl->repH.head,
+				       0);
+			advance_head(&ptl->repH);
+			err = PSM2_OK;
+		} while (!QISEMPTY(ptl->repH.head->flag));
+	}
+
+	if (!replyonly) {
+		/* Request queue not enable for 2.0, will be re-enabled to support long
+		 * replies */
+		if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+			psmi_am_reqq_drain(ptl);
+			err = PSM2_OK;
+		}
+		if (!QISEMPTY(ptl->reqH.head->flag)) {
+			do {
+				ips_sync_reads();
+				process_packet(ptl,
+					       (am_pkt_short_t *) ptl->reqH.
+					       head, 1);
+				advance_head(&ptl->reqH);
+				err = PSM2_OK;
+			} while (!QISEMPTY(ptl->reqH.head->flag));
+		}
+	}
+
+	if (is_internal) {
+		if (err == PSM2_OK)	/* some progress, no yields */
+			ptl->zero_polls = 0;
+		else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+			/* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+			sched_yield();
+			ptl->zero_polls = 0;
+		}
+
+		/* periodically give the rest of PSM a chance to progress */
+		if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+			psmi_poll_internal(ptl->ep, 0);
+			ptl->amsh_only_polls = 0;
+		}
+	}
+	return err;		/* if we actually did something */
+}
+
+/* Out-of-line internal poll: request-queue draining is skipped and the
+ * yield/PSM-poll backoff accounting is enabled. */
+static
+psm2_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+	const int is_internal = 1;
+	return amsh_poll_internal_inner(ptl, replyonly, is_internal);
+}
+
+/* Spin-poll the shared-memory PTL until 'cond' becomes true; the
+ * PSM_PROFILE build additionally accounts blocked time via the
+ * PSMI_PROFILE_* hooks. */
+#ifdef PSM_PROFILE
+#define AMSH_POLL_UNTIL(ptl, isreply, cond)	\
+	do {					\
+		PSMI_PROFILE_BLOCK();		\
+		while (!(cond)) {		\
+			PSMI_PROFILE_REBLOCK(	\
+				amsh_poll_internal(ptl, isreply) ==	\
+					PSM2_OK_NO_PROGRESS);		\
+		}				\
+		PSMI_PROFILE_UNBLOCK();		\
+	} while (0)
+#else
+#define AMSH_POLL_UNTIL(ptl, isreply, cond)	\
+	do {					\
+		while (!(cond)) {		\
+			amsh_poll_internal(ptl, isreply);	\
+		}				\
+	} while (0)
+#endif
+
+/* External poll entry point: is_internal=0, so the deferred request
+ * queue is drained and the backoff accounting is skipped. */
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly)
+{
+	const int is_internal = 0;
+	return amsh_poll_internal_inner(ptl, replyonly, is_internal);
+}
+
+/* Fill in and publish one short packet to peer 'destidx'.  Blocks
+ * (polling) until a FIFO slot is free.  At most NSHORT_ARGS arguments
+ * travel in the packet itself; extra args and payload ride in the bulk
+ * packet referenced by 'bulkidx'.  QMARKREADY must come last: it is
+ * what makes the packet visible to the consumer. */
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx,
+		  uint32_t bulkidx, uint16_t fmt, uint16_t nargs,
+		  uint16_t handleridx, psm2_amarg_t *args,
+		  const void *src, uint32_t len, int isreply))
+{
+	int i;
+	volatile am_pkt_short_t *pkt;
+	int copy_nargs;
+
+	AMSH_POLL_UNTIL(ptl, isreply,
+			(pkt =
+			 am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL);
+
+	/* got a free pkt... fill it in */
+	pkt->bulkidx = bulkidx;
+	pkt->shmidx = returnidx;
+	pkt->type = fmt;
+	pkt->nargs = nargs;
+	pkt->handleridx = handleridx;
+
+	/* Limit the number of args copied here to NSHORT_ARGS.  Additional args
+	   are carried in the bulkpkt. */
+	copy_nargs = nargs;
+	if (copy_nargs > NSHORT_ARGS) {
+		copy_nargs = NSHORT_ARGS;
+	}
+
+	for (i = 0; i < copy_nargs; i++)
+		pkt->args[i] = args[i];
+
+	/* inline payload is packed right after the used args */
+	if (fmt == AMFMT_SHORT_INLINE)
+		mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src,
+			     len);
+
+	_HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d,"
+		  "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx,
+		  pkt->flag, pkt->nargs, src, (int)len, (int)handleridx,
+		  src != NULL ? *((uint32_t *) src) : 0);
+	QMARKREADY(pkt);
+}
+
+#define amsh_shm_copy_short psmi_mq_mtucpy
+#define amsh_shm_copy_long  psmi_mq_mtucpy
+
+/* Workhorse for all shared-memory AM sends (request/reply, short/long).
+ *
+ * Short: payload that fits in the unused arg space goes inline
+ * (AMFMT_SHORT_INLINE, bulkidx carries the length); otherwise payload
+ * and overflow args go in one bulk packet (AMFMT_SHORT).
+ * Long: payload is streamed in AMLONG_MTU-sized bulk packets, each
+ * announced with its own short packet; the last one is tagged
+ * AMFMT_LONG_END.
+ *
+ * Always returns 1 (blocks, via AMSH_POLL_UNTIL, until slots free up). */
+PSMI_ALWAYS_INLINE(
+int
+psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, void *dst, int flags))
+{
+	uint16_t type;
+	uint32_t bulkidx;
+	uint16_t hidx = (uint16_t) handler;
+	int destidx = ((am_epaddr_t *) epaddr)->_shmidx;
+	int returnidx = ((am_epaddr_t *) epaddr)->_return_shmidx;
+	int is_reply = AM_IS_REPLY(amtype);
+	volatile am_pkt_bulk_t *bulkpkt;
+
+	_HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n",
+		  is_reply ? "reply" : "request",
+		  psmi_epaddr_get_name(epaddr->epid),
+		  ((am_epaddr_t *) epaddr)->_shmidx, amtype);
+	psmi_assert(epaddr != ptl->epaddr);
+
+	switch (amtype) {
+	case AMREQUEST_SHORT:
+	case AMREPLY_SHORT:
+		if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) {
+			/* Payload fits in args packet */
+			type = AMFMT_SHORT_INLINE;
+			bulkidx = len;
+		} else {
+			int i;
+
+			psmi_assert(len < amsh_qelemsz.qreqFifoLong);
+			psmi_assert(src != NULL || nargs > NSHORT_ARGS);
+			type = AMFMT_SHORT;
+
+			AMSH_POLL_UNTIL(ptl, is_reply,
+					(bulkpkt =
+					 am_ctl_getslot_long(ptl, destidx,
+							     is_reply)) !=
+					NULL);
+
+			bulkidx = bulkpkt->idx;
+			bulkpkt->len = len;
+			_HFI_VDBG("bulkpkt %p flag is %d from idx %d\n",
+				  bulkpkt, bulkpkt->flag, destidx);
+
+			/* overflow args beyond NSHORT_ARGS travel in the bulkpkt */
+			for (i = 0; i < nargs - NSHORT_ARGS; i++) {
+				bulkpkt->args[i] = args[i + NSHORT_ARGS];
+			}
+
+			amsh_shm_copy_short((void *)bulkpkt->payload, src,
+					    (uint32_t) len);
+			QMARKREADY(bulkpkt);
+		}
+		am_send_pkt_short(ptl, destidx, returnidx, bulkidx, type,
+				  nargs, hidx, args, src, len, is_reply);
+		break;
+
+	case AMREQUEST_LONG:
+	case AMREPLY_LONG:
+		{
+			uint32_t bytes_left = len;
+			uint8_t *src_this = (uint8_t *) src;
+			uint8_t *dst_this = (uint8_t *) dst;
+			uint32_t bytes_this;
+
+			type = AMFMT_LONG;
+
+			_HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n",
+				  is_reply ? "rep" : "req", src, dst,
+				  (uint32_t) len, hidx);
+			while (bytes_left) {
+				bytes_this = min(bytes_left, AMLONG_MTU);
+				AMSH_POLL_UNTIL(ptl, is_reply,
+						(bulkpkt =
+						 am_ctl_getslot_long(ptl,
+								     destidx,
+								     is_reply))
+						!= NULL);
+				bytes_left -= bytes_this;
+				/* the final chunk tells the receiver the
+				 * transfer is complete */
+				if (bytes_left == 0)
+					type = AMFMT_LONG_END;
+				bulkidx = bulkpkt->idx;
+				amsh_shm_copy_long((void *)bulkpkt->payload,
+						   src_this, bytes_this);
+
+				bulkpkt->dest = (uintptr_t) dst;
+				bulkpkt->dest_off =
+				    (uint32_t) ((uintptr_t) dst_this -
+						(uintptr_t) dst);
+				bulkpkt->len = bytes_this;
+				QMARKREADY(bulkpkt);
+				am_send_pkt_short(ptl, destidx, returnidx,
+						  bulkidx, type, nargs, hidx,
+						  args, NULL, 0, is_reply);
+				src_this += bytes_this;
+				dst_this += bytes_this;
+			}
+			break;
+		}
+	default:
+		break;
+	}
+	return 1;
+}
+
+/* A generic version that's not inlined */
+/* Out-of-line wrapper around psmi_amsh_generic_inner() so callers that do not
+ * want the always-inlined body (e.g. the deferred-request drain loop) can call
+ * through a single non-inlined entry point. Forwards all arguments verbatim. */
+int
+psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dst, int flags)
+{
+ return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args,
+ nargs, src, len, dst, flags);
+}
+
+/* Issue a short AM request (AMREQUEST_SHORT) over shared memory.
+ * No destination buffer is involved for short messages, so dst is NULL. */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags)
+{
+ return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler,
+ args, nargs, src, len, NULL, flags);
+}
+
+/* Issue a long AM request (AMREQUEST_LONG): the payload at src is delivered
+ * into the remote buffer dest, fragmented by the inner path as needed. */
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags)
+{
+ return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler,
+ args, nargs, src, len, dest, flags);
+}
+
+/* Send a short AM reply from within a handler. The destination endpoint is
+ * taken from the incoming token (tok->tok.epaddr_incoming), not passed in. */
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags)
+{
+ psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming,
+ handler, args, nargs, src, len, NULL, flags);
+ return;
+}
+
+/* Send a long AM reply from within a handler; payload src is copied into the
+ * remote buffer dest. Endpoint comes from the incoming token. */
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags)
+{
+ psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming,
+ handler, args, nargs, src, len, dest, flags);
+ return;
+}
+
+/* Reset the deferred-request FIFO to empty. lastp always points at the
+ * location where the next node's address will be stored (tail-pointer idiom),
+ * which is the head slot itself when the list is empty. */
+void psmi_am_reqq_init(ptl_t *ptl)
+{
+ ptl->psmi_am_reqq_fifo.first = NULL;
+ ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+}
+
+/* Replay every deferred AM request queued by psmi_am_reqq_add().
+ * Returns PSM2_OK if at least one request was processed, otherwise
+ * PSM2_OK_NO_PROGRESS. Frees each request node (and its temporary source
+ * copy, when AM_FLAG_SRC_TEMP is set) after dispatch. */
+psm2_error_t psmi_am_reqq_drain(ptl_t *ptl)
+{
+ am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first;
+ am_reqq_t *req;
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+
+ /* We're going to process the entire list, and running the generic handler
+ * below can cause other requests to be enqueued in the queue that we're
+ * processing. */
+ ptl->psmi_am_reqq_fifo.first = NULL;
+ ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+
+ while ((req = reqn) != NULL) {
+ err = PSM2_OK;
+ reqn = req->next;
+ _HFI_VDBG
+ ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n",
+ req, psmi_epaddr_get_hostname(req->epaddr->epid),
+ (void *)(uintptr_t) req->args[1].u64w0,
+ (void *)(uintptr_t) req->args[0].u64w0);
+ psmi_amsh_generic(req->amtype, req->ptl, req->epaddr,
+ req->handler, req->args, req->nargs, req->src,
+ req->len, req->dest, req->amflags);
+ if (req->flags & AM_FLAG_SRC_TEMP)
+ psmi_free(req->src);
+ psmi_free(req);
+ }
+ return err;
+}
+
+/* Enqueue an AM request for later replay by psmi_am_reqq_drain().
+ * Copies the nargs arguments (at most 8) into the node; appends the node at
+ * the FIFO tail via the lastp tail-pointer. Allocation failure is fatal
+ * (psmi_assert_always), not reported to the caller. */
+void
+psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, void *dest, int amflags)
+{
+ int i;
+ int flags = 0;
+ am_reqq_t *nreq =
+ (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t));
+ psmi_assert_always(nreq != NULL);
+ _HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, "
+ "localreq=%p, remotereq=%p\n", nreq,
+ psmi_epaddr_get_hostname(epaddr->epid), dest,
+ (int)len, (void *)(uintptr_t) args[1].u64w0,
+ (void *)(uintptr_t) args[0].u64w0);
+
+ psmi_assert(nargs <= 8);
+ nreq->next = NULL;
+ nreq->amtype = amtype;
+ nreq->ptl = ptl;
+ nreq->epaddr = epaddr;
+ nreq->handler = handler;
+ for (i = 0; i < nargs; i++)
+ nreq->args[i] = args[i];
+ nreq->nargs = nargs;
+ if (AM_IS_LONG(amtype) && src != NULL &&
+ len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) {
+ /* NOTE(review): abort() here makes the AM_FLAG_SRC_TEMP
+ * copy path below unreachable dead code — presumably a
+ * deliberate trap for an unsupported synchronous-source
+ * long enqueue, but confirm against upstream intent. */
+ abort();
+ flags |= AM_FLAG_SRC_TEMP;
+ nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len);
+ psmi_assert_always(nreq->src != NULL); /* XXX mem */
+ amsh_shm_copy_short(nreq->src, src, len);
+ } else
+ nreq->src = src;
+ nreq->len = len;
+ nreq->dest = dest;
+ nreq->amflags = amflags;
+ nreq->flags = flags;
+
+ nreq->next = NULL;
+ *(ptl->psmi_am_reqq_fifo.lastp) = nreq;
+ ptl->psmi_am_reqq_fifo.lastp = &nreq->next;
+}
+
+/* Dispatch one incoming shared-memory AM packet to its registered handler.
+ * isreq selects the request vs. reply bulk FIFO when the payload lives in a
+ * bulk slot. AMFMT_SHORT_INLINE payloads ride inside the args area of the
+ * short packet itself; AMFMT_SHORT/AMFMT_LONG[_END] dereference a bulk
+ * packet located by bulkidx in this process's own queue directory. */
+static
+void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq)
+{
+ amsh_am_token_t tok;
+ psmi_handler_fn_t fn;
+ psm2_amarg_t *args = pkt->args;
+ uint16_t shmidx = pkt->shmidx;
+ int nargs = pkt->nargs;
+
+ /* shmidx == (uint16_t)-1 means the sender is not yet known to us
+ * (pre-connection traffic), so no epaddr is attached to the token. */
+ tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0);
+ tok.ptl = ptl;
+ tok.mq = ptl->ep->mq;
+ tok.shmidx = shmidx;
+
+ uint16_t hidx = (uint16_t) pkt->handleridx;
+ uint32_t bulkidx = pkt->bulkidx;
+ uintptr_t bulkptr;
+ am_pkt_bulk_t *bulkpkt;
+
+ fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn;
+ psmi_assert(fn != NULL);
+ psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase);
+
+ if (pkt->type == AMFMT_SHORT_INLINE) {
+ _HFI_VDBG
+ ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n",
+ isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt,
+ hidx);
+
+ /* Inline payload immediately follows the args in the packet. */
+ fn(&tok, args, nargs, pkt->length > 0 ?
+ (void *)&args[nargs] : NULL, pkt->length);
+ } else {
+ int isend = 0;
+ switch (pkt->type) {
+ case AMFMT_LONG_END:
+ isend = 1;
+ /* fallthrough: LONG_END shares the bulk-slot lookup */
+ case AMFMT_LONG:
+ case AMFMT_SHORT:
+ if (isreq) {
+ bulkptr =
+ (uintptr_t) ptl->self_nodeinfo->qdir.
+ qreqFifoLong;
+ bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong;
+ } else {
+ bulkptr =
+ (uintptr_t) ptl->self_nodeinfo->qdir.
+ qrepFifoLong;
+ bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong;
+ }
+ break;
+ default:
+ bulkptr = 0;
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled packet type 0x%x",
+ pkt->type);
+ return;
+ }
+
+ bulkpkt = (am_pkt_bulk_t *) bulkptr;
+ _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d "
+ "from_idx=%d pkt=%p/%p hidx=%d\n",
+ ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag,
+ bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx);
+ psmi_assert(bulkpkt->flag == QREADY);
+
+ if (nargs > NSHORT_ARGS || isend == 1) {
+ /* Either there are more args in the bulkpkt, or this is the last
+ packet of a long payload. In either case, copy the args. */
+ int i;
+ args =
+ alloca((NSHORT_ARGS +
+ NBULK_ARGS) * sizeof(psm2_amarg_t));
+
+ for (i = 0; i < NSHORT_ARGS; i++) {
+ args[i] = pkt->args[i];
+ }
+
+ for (; i < nargs; i++) {
+ args[i] = bulkpkt->args[i - NSHORT_ARGS];
+ }
+ }
+
+ if (pkt->type == AMFMT_SHORT) {
+ fn(&tok, args, nargs,
+ (void *)bulkpkt->payload, bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ } else {
+ /* Long fragment: copy payload to its final destination
+ * at dest + dest_off, then free the bulk slot. */
+ amsh_shm_copy_long((void *)(bulkpkt->dest +
+ bulkpkt->dest_off),
+ bulkpkt->payload, bulkpkt->len);
+
+ /* If this is the last packet, copy args before running the
+ * handler */
+ if (isend) {
+ void *dest = (void *)bulkpkt->dest;
+ size_t len =
+ (size_t) (bulkpkt->dest_off + bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ fn(&tok, args, nargs, dest, len);
+ } else
+ QMARKFREE(bulkpkt);
+ }
+ }
+ return;
+}
+
+/* Start a shared-memory rendezvous send: transmit an RTS (MQ_MSG_LONGRTS)
+ * carrying length, tag, the local request pointer and the source buffer
+ * address. The receiver pulls the data (e.g. via CMA) and completes the
+ * request. With PSM_CUDA, a cudaIpcMemHandle for a GPU source buffer is
+ * sent as the RTS payload instead. */
+static
+psm2_error_t
+amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req,
+ psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf,
+ uint32_t len)
+{
+ psm2_amarg_t args[5];
+ psm2_error_t err = PSM2_OK;
+
+ args[0].u32w0 = MQ_MSG_LONGRTS;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ args[3].u64w0 = (uint64_t) (uintptr_t) req;
+ args[4].u64w0 = (uint64_t) (uintptr_t) buf;
+
+ psmi_assert(req != NULL);
+ req->type = MQE_TYPE_SEND;
+ req->buf = (void *)buf;
+ req->buf_len = len;
+ req->send_msglen = len;
+ req->send_msgoff = 0;
+
+#ifdef PSM_CUDA
+ /* If the send buffer is on gpu, we create a cuda IPC
+ * handle and send it as payload in the RTS
+ */
+ if (req->is_buf_gpu_mem) {
+ PSMI_CUDA_CALL(cudaIpcGetMemHandle,
+ (cudaIpcMemHandle_t *) &req->cuda_ipc_handle,
+ (void*) buf);
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, (void*)&req->cuda_ipc_handle,
+ sizeof(cudaIpcMemHandle_t), 0);
+ req->cuda_ipc_handle_attached = 1;
+ } else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, 0);
+#else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, 0);
+#endif
+
+ return err;
+}
+
+/*
+ * All shared am mq sends, req can be NULL
+ */
+/* Core shared-memory MQ send path, shared by blocking (req == NULL) and
+ * non-blocking sends. Protocol selection: tiny/short for len <= AMLONG_MTU
+ * (and no flags), eager fragmentation up to mq->shm_thresh_rv, rendezvous
+ * beyond that or when PSM2_MQ_FLAG_SENDSYNC is set. Eager completions are
+ * finalized here; rendezvous completion is deferred to the protocol. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf,
+ uint32_t len))
+{
+ psm2_amarg_t args[3];
+ psm2_error_t err = PSM2_OK;
+ int is_blocking = (req == NULL);
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ /* All sends from a gpu buffer use the rendezvous protocol */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ /* NOTE(review): this inner check can never be true — the
+ * enclosing condition already requires PSMI_IS_CUDA_ENABLED.
+ * Dead code; confirm against upstream before removing. */
+ if (!PSMI_IS_CUDA_ENABLED)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Please enable PSM CUDA support when using GPU buffer \n");
+ gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ gpu_mem = 0;
+#endif
+
+ if (!flags && len <= AMLONG_MTU) {
+ if (len <= 32)
+ args[0].u32w0 = MQ_MSG_TINY;
+ else
+ args[0].u32w0 = MQ_MSG_SHORT;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, ubuf, len, 0);
+ } else if (flags & PSM2_MQ_FLAG_SENDSYNC)
+ goto do_rendezvous;
+ else if (len <= mq->shm_thresh_rv) {
+ /* Eager: first fragment announces total length; subsequent
+ * fragments go to the data handler with args[2].u32w0 as the
+ * running byte offset. */
+ uint32_t bytes_left = len;
+ uint32_t bytes_this = min(bytes_left, AMLONG_MTU);
+ uint8_t *buf = (uint8_t *) ubuf;
+ args[0].u32w0 = MQ_MSG_EAGER;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, buf,
+ bytes_this, 0);
+ bytes_left -= bytes_this;
+ buf += bytes_this;
+ args[2].u32w0 = 0;
+ while (bytes_left) {
+ args[2].u32w0 += bytes_this;
+ bytes_this = min(bytes_left, AMLONG_MTU);
+ /* Here we kind of bend the rules, and assume that shared-memory
+ * active messages are delivered in order */
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_data_hidx, args,
+ 3, buf, bytes_this, 0);
+ buf += bytes_this;
+ bytes_left -= bytes_this;
+ }
+ } else {
+do_rendezvous:
+ if (is_blocking) {
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+ req->send_msglen = len;
+ req->tag = *tag;
+
+ /* Since SEND command is blocking, this request is
+ * entirely internal and we will not be exposed to user.
+ * Setting as internal so it will not be added to
+ * mq->completed_q */
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+ }
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (gpu_mem) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ err =
+ amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag,
+ ubuf, len);
+
+ if (err == PSM2_OK && is_blocking) { /* wait... */
+ err = psmi_mq_wait_internal(&req);
+ }
+ return err; /* skip eager accounting below */
+ }
+
+ /* All eager async sends are always "all done" */
+ if (req != NULL) {
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ }
+
+ mq->stats.tx_num++;
+ mq->stats.tx_shm_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/* Non-blocking MQ send entry point (ptl_ctl mq_isend hook): allocates a
+ * request, runs the common send path, and hands the request back to the
+ * caller via req_o. Returns PSM2_NO_MEMORY only on request allocation
+ * failure; send-path errors are not propagated here. */
+static
+psm2_error_t
+amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context,
+ psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->context = context;
+
+ _HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+ psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2]);
+
+ amsh_mq_send_inner(mq, req, epaddr, flags, tag, ubuf, len);
+
+ *req_o = req;
+ return PSM2_OK;
+}
+
+/* Blocking MQ send entry point (ptl_ctl mq_send hook): passes req == NULL so
+ * the inner path allocates an internal request and waits for completion. */
+static
+psm2_error_t
+amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+ _HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+ psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2]);
+
+ amsh_mq_send_inner(mq, NULL, epaddr, flags, tag, ubuf, len);
+
+ return PSM2_OK;
+}
+
+/* kassist-related handling */
+/* Return the OS pid of the peer process behind epaddr, looked up through the
+ * peer's shmidx slot in the local AM endpoint directory (used by kassist/CMA). */
+int psmi_epaddr_pid(psm2_epaddr_t epaddr)
+{
+ uint16_t shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+ return epaddr->ptlctl->ptl->am_ep[shmidx].pid;
+}
+#if _HFI_DEBUGGING
+/* Debug-only helper: human-readable name for a PSMI_KASSIST_* mode value. */
+static
+const char *psmi_kassist_getmode(int mode)
+{
+ switch (mode) {
+ case PSMI_KASSIST_OFF:
+ return "kassist off";
+ case PSMI_KASSIST_CMA_GET:
+ return "cma get";
+ case PSMI_KASSIST_CMA_PUT:
+ return "cma put";
+ default:
+ return "unknown";
+ }
+}
+#endif
+
+/* Determine the kernel-assist (CMA) mode to use. CUDA builds are pinned to
+ * CMA_GET; otherwise PSM2_KASSIST_MODE from the environment selects cma-put,
+ * cma-get, or off, defaulting to cma-get.
+ * NOTE(review): empty parameter list `()` — `(void)` would be the strictly
+ * correct C prototype; confirm against the file's prevailing style. */
+static
+int psmi_get_kassist_mode()
+{
+ int mode = PSMI_KASSIST_MODE_DEFAULT;
+ /* Cuda PSM only supports KASSIST_CMA_GET */
+#ifdef PSM_CUDA
+ mode = PSMI_KASSIST_CMA_GET;
+#else
+ union psmi_envvar_val env_kassist;
+
+ if (!psmi_getenv("PSM2_KASSIST_MODE",
+ "PSM Shared memory kernel assist mode "
+ "(cma-put, cma-get, none)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)
+ PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) {
+ char *s = env_kassist.e_str;
+ if (strcasecmp(s, "cma-put") == 0)
+ mode = PSMI_KASSIST_CMA_PUT;
+ else if (strcasecmp(s, "cma-get") == 0)
+ mode = PSMI_KASSIST_CMA_GET;
+ else
+ mode = PSMI_KASSIST_OFF;
+ } else {
+ /* cma-get is the fastest, so it's the default.
+ Availability of CMA is checked in psmi_shm_create();
+ if CMA is not available it falls back to 'none' there. */
+ mode = PSMI_KASSIST_CMA_GET;
+ }
+#endif
+ return mode;
+}
+
+/* Connection handling for shared memory AM.
+ *
+ * arg0 => conn_op, result (PSM error type)
+ * arg1 => epid (always)
+ * arg2 => version.
+ * arg3 => pointer to error for replies.
+ */
+/* AM handler for shared-memory connection management. Processes the four
+ * PSMI_AM_{CONN,DISC}_{REQ,REP} operations encoded in args[0].u32w0 (see the
+ * arg layout comment above): CONN_REQ maps the remote segment and replies
+ * in-place by rewriting args; CONN_REP/DISC_REP complete a locally-initiated
+ * (dis)connect, writing the result through the perr pointer the initiator
+ * embedded in args[3]; DISC_REQ replies and may unmap the peer's segment. */
+static
+void
+amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ int op = args[0].u32w0;
+ int phase = args[0].u32w1;
+ psm2_epid_t epid = args[1].u64w0;
+ int16_t return_shmidx = args[2].u16w0;
+ psm2_error_t err = (psm2_error_t) args[2].u32w1;
+ psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0;
+
+ psm2_epaddr_t epaddr;
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+ uint16_t shmidx = tok->shmidx;
+ int is_valid;
+ ptl_t *ptl = tok->ptl;
+ int cstate;
+
+ /* We do this because it's an assumption below */
+ psmi_assert_always(buf == NULL && len == 0);
+
+ _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n",
+ op, phase, (unsigned long long)epid, err);
+ switch (op) {
+ case PSMI_AM_CONN_REQ:
+ _HFI_VDBG("Connect from %d:%d\n",
+ (int)psm2_epid_nid(epid), (int)psm2_epid_context(epid));
+ epaddr = psmi_epid_lookup(ptl->ep, epid);
+ if (shmidx == (uint16_t)-1) {
+ /* incoming packet will never be from our shmidx slot 0
+ thus the other process doesn't know our return info.
+ attach_to will lookup or create the proper shmidx */
+ if ((err = psmi_shm_map_remote(ptl, epid, &shmidx))) {
+ psmi_handle_error(PSMI_EP_NORETURN, err,
+ "Fatal error in "
+ "connecting to shm segment");
+ }
+ am_update_directory(&ptl->am_ep[shmidx]);
+ tok->shmidx = shmidx;
+ }
+
+ if (epaddr == NULL) {
+ /* args points into the shared segment, which
+ * amsh_epaddr_add() may remap; recompute the pointer
+ * from the segment-relative offset afterwards. */
+ uintptr_t args_segoff =
+ (uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase;
+ if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr)))
+ /* Unfortunately, no way out of here yet */
+ psmi_handle_error(PSMI_EP_NORETURN, err,
+ "Fatal error "
+ "in connecting to shm segment");
+ args =
+ (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase +
+ args_segoff);
+ }
+
+ /* Rewrite args */
+ ptl->connect_incoming++;
+ args[0].u32w0 = PSMI_AM_CONN_REP;
+ args[1].u64w0 = (psm2_epid_t) ptl->epid;
+ /* and return our shmidx for the connecting process */
+ args[2].u16w0 = shmidx;
+ args[2].u32w1 = PSM2_OK;
+ AMSH_CSTATE_INCOMING_SET((am_epaddr_t *) epaddr, ESTABLISHED);
+ ((am_epaddr_t *)epaddr)->_return_shmidx = return_shmidx;
+ tok->tok.epaddr_incoming = epaddr; /* adjust token */
+ psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+ args, narg, NULL, 0, 0);
+ break;
+
+ case PSMI_AM_CONN_REP:
+ if (ptl->connect_phase != phase) {
+ _HFI_VDBG("Out of phase connect reply\n");
+ return;
+ }
+ epaddr = ptl->am_ep[shmidx].epaddr;
+ /* check if a race has occurred on shm-file reuse.
+ * if so, don't transition to the next state.
+ * the next call to connreq_poll() will restart the
+ * connection.
+ */
+ if (ptl->am_ep[shmidx].pid !=
+ ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid)
+ break;
+
+ *perr = err;
+ AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr, REPLIED);
+ ((am_epaddr_t *)epaddr)->_return_shmidx = return_shmidx;
+ ptl->connect_outgoing++;
+ _HFI_VDBG("CCC epaddr=%s connected to ptl=%p\n",
+ psmi_epaddr_get_name(epaddr->epid), ptl);
+ break;
+
+ case PSMI_AM_DISC_REQ:
+ epaddr = psmi_epid_lookup(ptl->ep, epid);
+ if (!epaddr) {
+ _HFI_VDBG("Dropping disconnect request from an epid that we are not connected to\n");
+ return;
+ }
+ args[0].u32w0 = PSMI_AM_DISC_REP;
+ args[2].u32w1 = PSM2_OK;
+ AMSH_CSTATE_INCOMING_SET((am_epaddr_t *) epaddr, DISC_REQUESTED);
+ ptl->connect_incoming--;
+ /* Before sending the reply, make sure the process
+ * is still connected */
+
+ if (ptl->am_ep[shmidx].epid != epaddr->epid)
+ is_valid = 0;
+ else
+ is_valid = 1;
+
+ if (is_valid) {
+ psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+ args, narg, NULL, 0, 0);
+ /**
+ * Only munmap if we have nothing more to
+ * communicate with the other node, i.e. we are
+ * already disconnected with the other node
+ * or have sent a disconnect request.
+ */
+ cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+ if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) {
+ err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+ psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+ }
+ }
+ break;
+
+ case PSMI_AM_DISC_REP:
+ if (ptl->connect_phase != phase) {
+ _HFI_VDBG("Out of phase disconnect reply\n");
+ return;
+ }
+ *perr = err;
+ epaddr = tok->tok.epaddr_incoming;
+ AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr, DISC_REPLIED);
+ ptl->connect_outgoing--;
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled connect handler op=%d",
+ op);
+ break;
+ }
+ return;
+}
+
+/* ptl_ctl_init hook: size the caller must allocate for this PTL's state. */
+static
+size_t amsh_sizeof(void)
+{
+ return sizeof(ptl_t);
+}
+
+/* Fill in AM capabilities parameters */
+/* Report the AM capability limits of the shared-memory transport.
+ * Returns PSM2_PARAM_ERR if parameters is NULL, PSM2_OK otherwise.
+ * Both request and reply payloads are capped at AMLONG_MTU. */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+ if (parameters == NULL) {
+ return PSM2_PARAM_ERR;
+ }
+
+ parameters->max_handlers = PSMI_AM_NUM_HANDLERS;
+ parameters->max_nargs = PSMI_AM_MAX_ARGS;
+ parameters->max_request_short = AMLONG_MTU;
+ parameters->max_reply_short = AMLONG_MTU;
+
+ return PSM2_OK;
+}
+
+/**
+ * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid.
+ * @param ptl Pointer to caller-allocated space for PTL (fill in)
+ * @param ctl Pointer to caller-allocated space for PTL-control
+ * structure (fill in)
+ */
+/* Initialize the shared-memory PTL: set up back pointers and counters,
+ * allocate the peer directory (am_ep), create/attach the shared segment,
+ * publish this process's nodeinfo (pid/epid/epaddr, is_init after a memory
+ * barrier), and fill in the ptl_ctl dispatch table. With PSM_CUDA, also
+ * configures the cuda IPC memhandle cache from environment variables. */
+static
+psm2_error_t
+amsh_init(psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* Preconditions */
+ psmi_assert_always(ep != NULL);
+ psmi_assert_always(ep->epaddr != NULL);
+ psmi_assert_always(ep->epid != 0);
+
+ ptl->ep = ep; /* back pointer */
+ ptl->epid = ep->epid; /* cache epid */
+ ptl->epaddr = ep->epaddr; /* cache a copy */
+ ptl->ctl = ctl;
+ ptl->zero_polls = 0;
+
+ ptl->connect_phase = 0;
+ ptl->connect_incoming = 0;
+ ptl->connect_outgoing = 0;
+
+ memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt));
+ memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo));
+
+ ptl->max_ep_idx = -1;
+ ptl->am_ep_size = AMSH_DIRBLOCK_SIZE;
+
+ ptl->am_ep = (struct am_ctl_nodeinfo *)
+ psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+ ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+ if (ptl->am_ep == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+ if ((err = amsh_init_segment(ptl)))
+ goto fail;
+
+ ptl->self_nodeinfo->psm_verno = PSMI_VERNO;
+ /* Advertise CMA only if it is actually usable; otherwise disable
+ * kassist and use the higher no-kassist rendezvous threshold. */
+ if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) {
+ if (cma_available()) {
+ ptl->self_nodeinfo->amsh_features |=
+ AMSH_HAVE_CMA;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_CMA;
+ } else {
+ ptl->psmi_kassist_mode =
+ PSMI_KASSIST_OFF;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ }
+ } else {
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ }
+ ptl->self_nodeinfo->pid = getpid();
+ ptl->self_nodeinfo->epid = ep->epid;
+ ptl->self_nodeinfo->epaddr = ep->epaddr;
+
+ /* Ensure all nodeinfo stores are visible before is_init is set,
+ * since peers poll is_init to detect readiness. */
+ ips_mb();
+ ptl->self_nodeinfo->is_init = 1;
+
+ psmi_am_reqq_init(ptl);
+ memset(ctl, 0, sizeof(*ctl));
+
+ /* Fill in the control structure */
+ ctl->ep = ep;
+ ctl->ptl = ptl;
+ ctl->ep_poll = amsh_poll;
+ ctl->ep_connect = amsh_ep_connect;
+ ctl->ep_disconnect = amsh_ep_disconnect;
+
+ ctl->mq_send = amsh_mq_send;
+ ctl->mq_isend = amsh_mq_isend;
+
+ ctl->am_get_parameters = psmi_amsh_am_get_parameters;
+ ctl->am_short_request = psmi_amsh_am_short_request;
+ ctl->am_short_reply = psmi_amsh_am_short_reply;
+
+ /* No stats in shm (for now...) */
+ ctl->epaddr_stats_num = NULL;
+ ctl->epaddr_stats_init = NULL;
+ ctl->epaddr_stats_get = NULL;
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_memcache_enabled;
+ psmi_getenv("PSM2_CUDA_MEMCACHE_ENABLED",
+ "PSM cuda ipc memhandle cache enabled (default is enabled)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)
+ 1, &env_memcache_enabled);
+ if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) {
+ union psmi_envvar_val env_memcache_size;
+ psmi_getenv("PSM2_CUDA_MEMCACHE_SIZE",
+ "Size of the cuda ipc memhandle cache ",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)
+ CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size);
+ if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint)
+ != PSM2_OK))
+ goto fail;
+ if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK))
+ goto fail;
+ }
+#endif
+fail:
+ return err;
+}
+
+/* Tear down the shared-memory PTL. Two-pass epid iteration: first count
+ * still-established outgoing connections, then collect them into arrays and
+ * disconnect in bulk via amsh_ep_disconnect(). Afterwards, poll until both
+ * connect counters drain or timeout_ns expires, detach the shared segment,
+ * and point the recv-queue heads at a local empty packet so late polls do
+ * not touch unmapped memory. */
+static psm2_error_t amsh_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+ struct psmi_eptab_iterator itor;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ psm2_error_t err_seg;
+ uint64_t t_start = get_cycles();
+ int i = 0;
+
+ /* Close whatever has been left open -- this will be factored out for 2.1 */
+ if (ptl->connect_outgoing > 0) {
+ int num_disc = 0;
+ int *mask;
+ psm2_error_t *errs;
+ psm2_epaddr_t *epaddr_array;
+
+ psmi_epid_itor_init(&itor, ptl->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl != ptl)
+ continue;
+ if (AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr) ==
+ AMSH_CSTATE_OUTGOING_ESTABLISHED)
+ num_disc++;
+ }
+ psmi_epid_itor_fini(&itor);
+
+ mask =
+ (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(int));
+ errs = (psm2_error_t *)
+ psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(psm2_error_t));
+ epaddr_array = (psm2_epaddr_t *)
+ psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(psm2_epaddr_t));
+
+ if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+ if (epaddr_array)
+ psmi_free(epaddr_array);
+ if (errs)
+ psmi_free(errs);
+ if (mask)
+ psmi_free(mask);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ psmi_epid_itor_init(&itor, ptl->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl == ptl) {
+ if (AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr)
+ == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+ mask[i] = 1;
+ epaddr_array[i] = epaddr;
+ i++;
+ }
+ }
+ }
+ psmi_epid_itor_fini(&itor);
+ psmi_assert(i == num_disc && num_disc > 0);
+ err = amsh_ep_disconnect(ptl, force, num_disc, epaddr_array,
+ mask, errs, timeout_ns);
+ psmi_free(mask);
+ psmi_free(errs);
+ psmi_free(epaddr_array);
+ }
+
+ if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+ while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+ if (!psmi_cycles_left(t_start, timeout_ns)) {
+ err = PSM2_TIMEOUT;
+ _HFI_VDBG("CCC timed out with from=%d,to=%d\n",
+ ptl->connect_incoming, ptl->connect_outgoing);
+ break;
+ }
+ psmi_poll_internal(ptl->ep, 1);
+ }
+ } else
+ _HFI_VDBG("CCC complete disconnect from=%d,to=%d\n",
+ ptl->connect_incoming, ptl->connect_outgoing);
+
+ if ((err_seg = psmi_shm_detach(ptl))) {
+ err = err_seg;
+ goto fail;
+ }
+
+ /* This prevents poll calls between now and the point where the endpoint is
+ * deallocated to reference memory that disappeared */
+ ptl->repH.head = &ptl->amsh_empty_shortpkt;
+ ptl->reqH.head = &ptl->amsh_empty_shortpkt;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ am_cuda_memhandle_cache_map_fini();
+#endif
+ return PSM2_OK;
+fail:
+ return err;
+
+}
+
+/* ptl_ctl setopt hook: the shm PTL exposes no options, so any optname is
+ * rejected with PSM2_PARAM_ERR. */
+static
+psm2_error_t
+amsh_setopt(const void *component_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ /* No options for AM PTL at the moment */
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown AM ptl option %u.", optname);
+}
+
+/* ptl_ctl getopt hook: mirror of amsh_setopt — no options are defined. */
+static
+psm2_error_t
+amsh_getopt(const void *component_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ /* No options for AM PTL at the moment */
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown AM ptl option %u.", optname);
+}
+
+/* Only symbol we expose out of here */
+/* PTL registration record: sizeof/init/fini/setopt/getopt entry points,
+ * in the order declared by struct ptl_ctl_init. */
+struct ptl_ctl_init
+psmi_ptl_amsh = {
+ amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt
+};
diff --git a/ptl_am/cmarw.h b/ptl_am/cmarw.h
new file mode 100644
index 0000000..0317ed4
--- /dev/null
+++ b/ptl_am/cmarw.h
@@ -0,0 +1,73 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+
+/*
+ * read from remote process pid
+ */
+int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n);
+
+/*
+ * write to remote process pid
+ */
+int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n);
+
+/*
+ * Test if CMA is available by trying a no-op call.
+ * Returns 1 if CMA is present, 0 if not.
+ */
+int cma_available(void);
diff --git a/ptl_am/cmarwu.c b/ptl_am/cmarwu.c
new file mode 100644
index 0000000..a9a1d83
--- /dev/null
+++ b/ptl_am/cmarwu.c
@@ -0,0 +1,207 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "cmarw.h"
+
+/* An iovec looks like this:
+ * struct iovec {
+ * void *iov_base; // Starting address
+ * size_t iov_len; // Number of bytes to transfer
+ * };
+ */
+
+#if 0
+#define __NR_process_vm_readv 310
+#define __NR_process_vm_writev 311
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+ syscall(__NR_process_vm_readv, \
+ pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+ syscall(__NR_process_vm_writev, \
+ pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+#endif
+
/*CMA syscall wrappers were added in glibc 2.15. For anything older than that,
  we need to define our own wrappers. Apparently older (and maybe newer?)
  (2.12 from RHEL6.3 definitely has this bug) glibcs only pass up to 5
  arguments via the generic syscall() function. These CMA functions, however,
  have 6 arguments. So for now, we hack our way around it by generating ASM
  code for doing a syscall directly.
*/

#if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15))

#ifdef __x86_64__

/* x86_64-specific syscall numbers for the CMA syscalls (Linux >= 3.2). */
#define __NR_process_vm_readv 310
#define __NR_process_vm_writev 311

/* Issue a raw 6-argument syscall using the x86_64 kernel calling
 * convention: number in rax, arguments in rdi, rsi, rdx, r10, r8, r9.
 *
 * NOTE(review): this returns the raw kernel result, i.e. a negative errno
 * value on failure, whereas the glibc wrappers return -1 and set errno.
 * Callers in this file only test for -1, so on pre-2.15 glibc other error
 * codes would slip through -- TODO confirm and align error handling. */
static inline ssize_t __x86_64_syscall6(int syscall,
					pid_t pid,
					const struct iovec *local_iov,
					unsigned long liovcnt,
					const struct iovec *remote_iov,
					unsigned long riovcnt,
					unsigned long flags)
{
	/*GCC inline ASM is annoying -- can't specify all the x86_64 registers
	   directly, so declare register-specific variables and use them. */
	register int64_t rax asm("rax") = syscall;
	register int64_t rdi asm("rdi") = pid;
	register int64_t rsi asm("rsi") = (intptr_t) local_iov;
	register int64_t rdx asm("rdx") = liovcnt;
	register int64_t r10 asm("r10") = (intptr_t) remote_iov;
	register int64_t r8 asm("r8") = riovcnt;
	register int64_t r9 asm("r9") = flags;

	/* The syscall instruction itself clobbers rcx and r11, hence the
	 * clobber list; "memory" because the kernel reads/writes the iovec
	 * target buffers behind the compiler's back. */
	asm volatile ("syscall\n" : "=a" (rax)
		      : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10),
		      "r"(r8), "r"(r9)
		      : "%rcx", "%r11", "cc", "memory");
	return rax;
}

#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
	__x86_64_syscall6(__NR_process_vm_readv, \
			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)

#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
	__x86_64_syscall6(__NR_process_vm_writev, \
			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)

#else /* ndef __x86_64__ */
#error "Can't compile CMA support for this architecture."
#endif /* __x86_64__ */
#endif /* __GLIBC__ < 2.15 */
+
/*
 * Read 'n' bytes from address 'src' in remote process 'pid' into local
 * buffer 'dst' using process_vm_readv() (Cross Memory Attach).
 *
 * Loops to cope with partial transfers. Returns the number of bytes read
 * (== n on full success), or -1 on syscall failure. A zero-byte transfer
 * (no forward progress) terminates the loop to avoid spinning forever and
 * returns the partial count -- the original code would loop indefinitely
 * in that case.
 */
int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n)
{
	int64_t sum = 0;
	struct iovec local = {
		.iov_base = dst,
		.iov_len = n
	};
	struct iovec remote = {
		.iov_base = (void *)src,
		.iov_len = n
	};

	while (sum != n) {
		ssize_t nr = process_vm_readv(pid, &local, 1, &remote, 1, 0);
		if (nr < 0)
			return -1;
		if (nr == 0)	/* no progress; bail out instead of spinning */
			break;
		sum += nr;
		/* Advance both iovecs past the transferred bytes.  Use
		 * char* arithmetic: arithmetic on void* is a GNU extension. */
		local.iov_base = (char *)local.iov_base + nr;
		local.iov_len -= nr;
		remote.iov_base = (char *)remote.iov_base + nr;
		remote.iov_len -= nr;
	}
	return sum;
}
+
/*
 * Write 'n' bytes from local buffer 'src' to address 'dst' in remote
 * process 'pid' using process_vm_writev() (Cross Memory Attach).
 *
 * Loops to cope with partial transfers. Returns the number of bytes
 * written (== n on full success), or -1 on syscall failure. A zero-byte
 * transfer (no forward progress) terminates the loop to avoid spinning
 * forever and returns the partial count -- the original code would loop
 * indefinitely in that case.
 */
int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n)
{
	int64_t sum = 0;
	struct iovec local = {
		.iov_base = (void *)src,
		.iov_len = n
	};
	struct iovec remote = {
		.iov_base = dst,
		.iov_len = n
	};

	while (sum != n) {
		ssize_t nr = process_vm_writev(pid, &local, 1, &remote, 1, 0);
		if (nr < 0)
			return -1;
		if (nr == 0)	/* no progress; bail out instead of spinning */
			break;
		sum += nr;
		/* Advance both iovecs past the transferred bytes.  Use
		 * char* arithmetic: arithmetic on void* is a GNU extension. */
		local.iov_base = (char *)local.iov_base + nr;
		local.iov_len -= nr;
		remote.iov_base = (char *)remote.iov_base + nr;
		remote.iov_len -= nr;
	}
	return sum;
}
+
/* Probe for CMA support with a zero-length process_vm_readv() on
 * ourselves.  A kernel that implements the syscall reports 0 bytes
 * transferred; anything else (e.g. -1/ENOSYS on kernels < 3.2) means
 * CMA is unusable.  Returns 1 when available, 0 otherwise. */
int cma_available(void)
{
	ssize_t probe = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0);

	return (probe == 0) ? 1 : 0;
}
diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000..a6ba9db
--- /dev/null
+++ b/ptl_am/psm_am_internal.h
@@ -0,0 +1,466 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef PSMI_AM_H
+#define PSMI_AM_H
+
+#include "../psm_am_internal.h"
+
/* Allocation granularity of the shm peer directory -- presumably entries
 * per directory block; confirm against the amsh directory code. */
#define AMSH_DIRBLOCK_SIZE 128

/* Per-peer address object for the shared-memory PTL. */
typedef
struct am_epaddr {
	/* must be the first field to be the same address */
	struct psm2_epaddr epaddr;
	/* PTL-private per-peer storage viewed as 16/32/64-bit slots; the
	 * _shmidx/_return_shmidx/_cstate macros below alias
	 * _ptladdr_u16[0..2]. */
	union {
		uint16_t _ptladdr_u16[4];
		uint32_t _ptladdr_u32[2];
		uint64_t _ptladdr_u64;
		uint8_t _ptladdr_data[0];
	};
} am_epaddr_t;

/* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining
   arguments are passed using space in am_pkt_bulk_t. One additional argument
   is added for passing the internal ptl_am handler index. */
#define NSHORT_ARGS 6
#define NBULK_ARGS (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1)
+
/* Token handed to AM handlers for messages received over shared memory;
 * wraps the generic psmi_am_token with shm-PTL specifics. */
typedef
struct amsh_am_token {
	struct psmi_am_token tok;

	ptl_t *ptl;	/**> What PTL was it received on */
	psm2_mq_t mq;	/**> What matched queue is this for ? */
	uint16_t shmidx;	/**> what shmidx sent this */
} amsh_am_token_t;

/* Signature of the internal ptl_am AM handlers (dispatched via the
 * *_hidx indices). */
typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs,
				   void *src, size_t len);

/* One entry of the internal handler dispatch table. */
typedef struct psmi_handlertab {
	psmi_handler_fn_t fn;
} psmi_handlertab_t;
+
/*
 * Can change the rendezvous threshold based on usage of cma (or not)
 */
#define PSMI_MQ_RV_THRESH_CMA 16000

/* If no kernel assisted copy is available this is the rendezvous threshold */
#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000

/* Shared-memory connection protocol opcodes. */
#define PSMI_AM_CONN_REQ 1
#define PSMI_AM_CONN_REP 2
#define PSMI_AM_DISC_REQ 3
#define PSMI_AM_DISC_REP 4

/* Kernel-assist (CMA) modes: in GET mode the receiver pulls with
 * process_vm_readv; in PUT mode the sender pushes with process_vm_writev. */
#define PSMI_KASSIST_OFF 0x0
#define PSMI_KASSIST_CMA_GET 0x1
#define PSMI_KASSIST_CMA_PUT 0x2

#define PSMI_KASSIST_CMA 0x3
#define PSMI_KASSIST_GET 0x1
#define PSMI_KASSIST_PUT 0x2
#define PSMI_KASSIST_MASK 0x3

#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET
#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get"

/* Resolve the OS pid behind a shared-memory epaddr (the CMA target). */
int psmi_epaddr_pid(psm2_epaddr_t epaddr);

/*
 * Eventually, we will allow users to register handlers as "don't reply", which
 * may save on some of the buffering requirements
 */
#define PSMI_HANDLER_NEEDS_REPLY(handler) 1
#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler))
+
+int psmi_amsh_poll(ptl_t *ptl, int replyonly);
+
+/* Shared memory AM, forward decls */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+
+/* AM over shared memory (forward decls) */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+#define amsh_conn_handler_hidx 1
+#define mq_handler_hidx 2
+#define mq_handler_data_hidx 3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx 5
+#define am_handler_hidx 6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG 1
+#define AMREPLY_SHORT 2
+#define AMREPLY_LONG 3
+#define AM_IS_REPLY(x) ((x)&0x2)
+#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x) ((x)&0x1)
+#define AM_IS_SHORT(x) (!AM_IS_LONG(x))
+
+#define AM_FLAG_SRC_ASYNC 0x1
+#define AM_FLAG_SRC_TEMP 0x2
+
/*
 * Request Fifo.
 *
 * Deferred AM request, queued when it cannot be sent immediately and
 * drained later by psmi_am_reqq_drain().
 */
typedef
struct am_reqq {
	struct am_reqq *next;	/* singly-linked FIFO chaining */

	ptl_t *ptl;
	psm2_epaddr_t epaddr;
	int amtype;	/* AMREQUEST_SHORT / AMREQUEST_LONG (see above) */
	psm2_handler_t handler;
	psm2_amarg_t args[8];
	int nargs;
	uint32_t len;
	void *src;
	void *dest;
	int amflags;
	int flags;
} am_reqq_t;

/* FIFO head; lastp presumably points at the tail's next-pointer for O(1)
 * append -- confirm against the reqq implementation. */
struct am_reqq_fifo_t {
	am_reqq_t *first;
	am_reqq_t **lastp;
};

/* Flush all queued requests for this ptl. */
psm2_error_t psmi_am_reqq_drain(ptl_t *ptl);
/* Queue a request for later transmission. */
void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
		      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted * with amsh_ep_) and others are
+ * specific to the single shared memory context * (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
+ */
+
+#define QFREE 0
+#define QUSED 1
+#define QREADY 2
+#define QREADYMED 3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QREADY)
+#if defined(__x86_64__) || defined(__i386__)
+# define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compilerfence */
+#else
+# error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag) \
+ do { \
+ _QMARK_FLAG_FENCE(); \
+ (pkt_ptr)->flag = (_flag); \
+ } while (0)
+
+#define QMARKFREE(pkt_ptr) _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr) _QMARK_FLAG(pkt_ptr, QUSED)
+
+#define AMFMT_SYSTEM 1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT 3
+#define AMFMT_LONG 4
+#define AMFMT_LONG_END 5
+
+#define _shmidx _ptladdr_u16[0]
+#define _return_shmidx _ptladdr_u16[1]
+#define _cstate _ptladdr_u16[2]
+
+#define AMSH_CMASK_NONE 0
+#define AMSH_CMASK_PREREQ 1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE 3
+
+#define AMSH_CSTATE_OUTGOING_MASK 0x0f
+#define AMSH_CSTATE_OUTGOING_NONE 0x01
+#define AMSH_CSTATE_OUTGOING_REPLIED 0x02
+#define AMSH_CSTATE_OUTGOING_ESTABLISHED 0x03
+#define AMSH_CSTATE_OUTGOING_DISC_REPLIED 0x04
+#define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 0x05
+#define AMSH_CSTATE_OUTGOING_GET(amaddr) ((amaddr)->_cstate & AMSH_CSTATE_OUTGOING_MASK)
+#define AMSH_CSTATE_OUTGOING_SET(amaddr, state) \
+ (amaddr)->_cstate = (((amaddr)->_cstate & ~AMSH_CSTATE_OUTGOING_MASK) | \
+ ((AMSH_CSTATE_OUTGOING_ ## state) & AMSH_CSTATE_OUTGOING_MASK))
+
+#define AMSH_CSTATE_INCOMING_MASK 0xf0
+#define AMSH_CSTATE_INCOMING_NONE 0x10
+#define AMSH_CSTATE_INCOMING_DISC_REQUESTED 0x40
+#define AMSH_CSTATE_INCOMING_ESTABLISHED 0x50
+#define AMSH_CSTATE_INCOMING_GET(amaddr) ((amaddr)->_cstate & AMSH_CSTATE_INCOMING_MASK)
+#define AMSH_CSTATE_INCOMING_SET(amaddr, state) \
+ (amaddr)->_cstate = (((amaddr)->_cstate & ~AMSH_CSTATE_INCOMING_MASK) | \
+ ((AMSH_CSTATE_INCOMING_ ## state) & AMSH_CSTATE_INCOMING_MASK))
+
/**********************************
 * Shared memory packet formats
 **********************************/
/* Fixed-size (one cache line) packet slot in the short queue. */
typedef
struct am_pkt_short {
	uint32_t flag;	/**> Packet state */
	union {
		uint32_t bulkidx;	/**> index in bulk packet queue */
		uint32_t length;	/**> length when no bulkidx used */
	};
	uint16_t shmidx;	/**> index in shared segment */
	uint16_t type;
	uint16_t nargs;
	uint16_t handleridx;

	psm2_amarg_t args[NSHORT_ARGS];	/* AM arguments */

	/* We eventually will expose up to 8 arguments, but this isn't implemented
	 * For now. >6 args will probably require a medium instead of a short */
} __attribute__ ((aligned(64)))
am_pkt_short_t;
PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64);

/* Variable-size packet used for medium/long transfers; args holds the
 * spillover beyond NSHORT_ARGS, payload is a flexible trailing buffer. */
typedef struct am_pkt_bulk {
	uint32_t flag;
	uint32_t idx;
	uintptr_t dest;		/* Destination pointer in "longs" */
	uint32_t dest_off;	/* Destination pointer offset */
	uint32_t len;		/* Destination length within offset */
	psm2_amarg_t args[NBULK_ARGS];	/* Additional "spillover" for >6 args */
	uint8_t payload[0];
} am_pkt_bulk_t;
/* No strict size decl, used for mediums and longs */
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
/* Each pkt queue has the same header format, although the queue
 * consumers don't use the 'head' index in the same manner. */
typedef struct am_ctl_qhdr {
	uint32_t head;		/* Touched only by 1 consumer */
	/* pad so 'head' sits alone in its cache line (no false sharing
	 * with the producer-side fields below) */
	uint8_t _pad0[64 - 4];

	pthread_spinlock_t lock;	/* serializes producers on 'tail' */
	uint32_t tail;		/* XXX candidate for fetch-and-incr */
	uint32_t elem_cnt;	/* number of slots in the queue */
	uint32_t elem_sz;	/* size of one slot, in bytes */
	uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)];
} am_ctl_qhdr_t;
PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128);

/* Each block reserves some space at the beginning to store auxiliary data */
#define AMSH_BLOCK_HEADER_SIZE 4096

/* Each process has a reply qhdr and a request qhdr */
typedef struct am_ctl_blockhdr {
	volatile am_ctl_qhdr_t shortq;
	volatile am_ctl_qhdr_t longbulkq;
} am_ctl_blockhdr_t;
PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2);

/* We cache the "shorts" because that's what we poll on in the critical path.
 * We take care to always update these pointers whenever the segment is remapped.
 */
typedef struct am_ctl_qshort_cache {
	volatile am_pkt_short_t *base;	/* first slot of the short queue */
	volatile am_pkt_short_t *head;	/* next slot to consume */
	volatile am_pkt_short_t *end;	/* one past the last slot */
} am_ctl_qshort_cache_t;
+
/******************************************
 * Shared segment local directory (global)
 ******************************************
 *
 * Each process keeps a directory for where request and reply structures are
 * located at its peers. This directory must be re-initialized every time the
 * shared segment moves in the VM, and the segment moves every time we remap()
 * for additional memory.
 */
struct amsh_qdirectory {
	am_ctl_blockhdr_t *qreqH;	/* request block header */
	am_pkt_short_t *qreqFifoShort;	/* request short-packet slots */
	am_pkt_bulk_t *qreqFifoLong;	/* request bulk-packet slots */

	am_ctl_blockhdr_t *qrepH;	/* reply block header */
	am_pkt_short_t *qrepFifoShort;	/* reply short-packet slots */
	am_pkt_bulk_t *qrepFifoLong;	/* reply bulk-packet slots */
} __attribute__ ((aligned(64)));

/* Feature bits advertised in am_ctl_nodeinfo.amsh_features.
 * NOTE(review): both flags share value 0x1 -- looks intentional (CMA is
 * the only kassist flavor here) but worth confirming. */
#define AMSH_HAVE_CMA   0x1
#define AMSH_HAVE_KASSIST 0x1

/******************************************
 * Shared fifo element counts and sizes
 ******************************************
 * These values are context-wide, they can only be set early on and can't be *
 * modified at runtime. All endpoints are expected to use the same values.
 */
typedef
struct amsh_qinfo {
	int qreqFifoShort;	/* # of request short-packet slots */
	int qreqFifoLong;	/* # of request bulk-packet slots */

	int qrepFifoShort;	/* # of reply short-packet slots */
	int qrepFifoLong;	/* # of reply bulk-packet slots */
} amsh_qinfo_t;
+
+/******************************************
+ * Per-endpoint structures (ep-local)
+ ******************************************
+ * Each endpoint keeps its own information as to where it resides in the
+ * directory, and maintains its own cached copies of where the short header
+ * resides in shared memory.
+ *
+ * This structure is carefully arranged to optimize cache locality and
+ * performance. Do not modify without careful and thorough analysis.
+ */
/* Per-process record published in the shared segment so peers can find
 * this endpoint's queues. */
struct am_ctl_nodeinfo {
	uint16_t psm_verno;
	volatile uint16_t is_init;	/* set once the entry is usable */
	volatile pid_t pid;		/* OS pid, used as the CMA target */
	psm2_epid_t epid;
	psm2_epaddr_t epaddr;
	uintptr_t amsh_shmbase;		/* base of this process's shm segment */
	amsh_qinfo_t amsh_qsizes;	/* fifo sizes advertised to peers */
	uint32_t amsh_features;		/* AMSH_HAVE_* feature bits */
	struct amsh_qdirectory qdir;	/* where this process's queues live */
} __attribute__((aligned(64)));

/* Shared-memory PTL instance state (one per endpoint). */
struct ptl {
	psm2_ep_t ep;
	psm2_epid_t epid;
	psm2_epaddr_t epaddr;
	ptl_ctl_t *ctl;

	int connect_phase;
	int connect_outgoing;	/* # of outgoing connections */
	int connect_incoming;	/* # of incoming connections */

	int zero_polls;		/* consecutive polls with no work */
	int amsh_only_polls;
	int max_ep_idx, am_ep_size;
	int psmi_kassist_mode;	/* PSMI_KASSIST_* (may be demoted to OFF
				 * at runtime on CMA failure) */
	char *amsh_keyname;	/* shm segment key */

	/* These three items carefully picked to fit in one cache line. */
	am_ctl_qshort_cache_t reqH;	/* cached view of our request shortq */
	am_ctl_qshort_cache_t repH;	/* cached view of our reply shortq */
	struct am_reqq_fifo_t psmi_am_reqq_fifo;	/* deferred requests */

	am_pkt_short_t amsh_empty_shortpkt;

	struct am_ctl_nodeinfo *self_nodeinfo;
	struct am_ctl_nodeinfo *am_ep;	/* array of peer nodeinfo entries */
} __attribute__((aligned(64)));
+
+#endif
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
new file mode 100644
index 0000000..1f20cdf
--- /dev/null
+++ b/ptl_am/ptl.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/**
+ * Callback function when a receive request is matched with the
+ * tag obtained from the RTS packet.
+ */
+static
+psm2_error_t
+ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
+ amsh_am_token_t *tok)
+{
+ psm2_amarg_t args[5];
+ psm2_epaddr_t epaddr = req->rts_peer;
+ ptl_t *ptl = epaddr->ptlctl->ptl;
+ int cma_succeed = 0;
+ int pid = 0, cuda_ipc_send_completion = 0;
+
+ PSM2_LOG_MSG("entering.");
+ psmi_assert((tok != NULL && was_posted)
+ || (tok == NULL && !was_posted));
+
+ _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n",
+ req, req->buf, req->recv_msglen, tok);
+#ifdef PSM_CUDA
+ if (req->cuda_ipc_handle_attached) {
+
+ void* cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(req->rts_sbuf,
+ (cudaIpcMemHandle_t*)&req->cuda_ipc_handle,
+ req->recv_msglen,
+ req->rts_peer->epid);
+ /* cudaMemcpy into the receive side buffer
+ * based on its location */
+ if (req->is_buf_gpu_mem) {
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_dev_ptr,
+ req->recv_msglen, cudaMemcpyDeviceToDevice);
+ PSMI_CUDA_CALL(cudaEventRecord, req->cuda_ipc_event, 0);
+ PSMI_CUDA_CALL(cudaEventSynchronize, req->cuda_ipc_event);
+ } else
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_dev_ptr,
+ req->recv_msglen, cudaMemcpyDeviceToHost);
+ cuda_ipc_send_completion = 1;
+ am_cuda_memhandle_release(cuda_ipc_dev_ptr);
+ req->cuda_ipc_handle_attached = 0;
+ goto send_cts;
+ }
+#endif
+
+ if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET)
+ && req->recv_msglen > 0
+ && (pid = psmi_epaddr_pid(epaddr))) {
+#ifdef PSM_CUDA
+ /* If the buffer on the send side is on the host,
+ * we alloc a bounce buffer, use kassist and then
+ * do a cudaMemcpy if the buffer on the recv side
+ * resides on the GPU
+ */
+ if (req->is_buf_gpu_mem) {
+ void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->recv_msglen);
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ cuda_ipc_bounce_buf, req->recv_msglen);
+ psmi_assert_always(nbytes == req->recv_msglen);
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_bounce_buf,
+ req->recv_msglen, cudaMemcpyHostToDevice);
+ /* Cuda library has recent optimizations where they do
+ * not guarantee synchronus nature for Host to Device
+ * copies for msg sizes less than 64k. The event record
+ * and synchronize calls are to guarentee completion.
+ */
+ PSMI_CUDA_CALL(cudaEventRecord, req->cuda_ipc_event, 0);
+ PSMI_CUDA_CALL(cudaEventSynchronize, req->cuda_ipc_event);
+ psmi_free(cuda_ipc_bounce_buf);
+ } else {
+ /* cma can be done in handler context or not. */
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ req->buf, req->recv_msglen);
+ psmi_assert_always(nbytes == req->recv_msglen);
+ }
+#else
+ /* cma can be done in handler context or not. */
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ req->buf, req->recv_msglen);
+ if (nbytes == -1) {
+ ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
+ _HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n");
+ }
+ else {
+ psmi_assert_always(nbytes == req->recv_msglen);
+ cma_succeed = 1;
+ }
+ psmi_assert_always(nbytes == req->recv_msglen);
+#endif
+ }
+
+#ifdef PSM_CUDA
+send_cts:
+#endif
+ args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr;
+ args[1].u64w0 = (uint64_t) (uintptr_t) req;
+ args[2].u64w0 = (uint64_t) (uintptr_t) req->buf;
+ args[3].u32w0 = req->recv_msglen;
+ args[3].u32w1 = tok != NULL ? 1 : 0;
+ args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process
+
+ if (tok != NULL) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl,
+ tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx,
+ args, 5, NULL, 0, NULL, 0);
+ } else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_rtsmatch_hidx,
+ args, 5, NULL, 0, 0);
+
+ /* 0-byte completion or we used kassist */
+ if (pid || cma_succeed ||
+ req->recv_msglen == 0 || cuda_ipc_send_completion == 1) {
+ psmi_mq_handle_rts_complete(req);
+ }
+ PSM2_LOG_MSG("leaving.");
+ return PSM2_OK;
+}
+
/* Non-handler-context entry point for RTS match completion (registered as
 * the rtsmatch callback in psmi_am_mq_handler's RTS path). */
static
psm2_error_t
ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted)
{
	/* was_posted == 0 allows us to assume that we're not running this callback
	 * within am handler context (i.e. we can poll) */
	psmi_assert(was_posted == 0);
	return ptl_handle_rtsmatch_request(req, 0, NULL);
}
+
/* AM handler for MQ envelopes arriving over shared memory.
 *
 * args layout (from the sender):
 *   args[0].u32w0 = opcode, args[0].u32w1 = total msglen (for non-short),
 *   args[1]/args[2].u32w1 = 96-bit tag,
 *   and for RTS: args[3] = sender request ptr, args[4] = sender buffer.
 */
void
psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
		   size_t len)
{
	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
	psm2_mq_req_t req;
	psm2_mq_tag_t tag;
	int rc;
	uint32_t opcode = args[0].u32w0;
	/* tiny/short messages fit in this packet, so their length is just
	 * the payload length; larger ones carry the total in args[0] */
	uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1;

	tag.tag[0] = args[1].u32w1;
	tag.tag[1] = args[1].u32w0;
	tag.tag[2] = args[2].u32w1;
	psmi_assert(toki != NULL);
	_HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n",
		  tok->mq, opcode, (int)len, msglen);

	switch (opcode) {
	case MQ_MSG_TINY:
	case MQ_MSG_SHORT:
	case MQ_MSG_EAGER:
		/* NOTE(review): rc is not checked here; presumably
		 * psmi_mq_handle_envelope always produces a req -- confirm. */
		rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming,
					     &tag, msglen, 0, buf,
					     (uint32_t) len, 1, opcode, &req);

		/* for eager matching */
		req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming;
		req->msg_seqnum = 0;	/* using seqnum 0 */
		break;
	default:{
			/* rendezvous: record the sender's request handle and
			 * source buffer so the CTS can refer back to them */
			void *sreq = (void *)(uintptr_t) args[3].u64w0;
			uintptr_t sbuf = (uintptr_t) args[4].u64w0;
			psmi_assert(narg == 5);
			psmi_assert_always(opcode == MQ_MSG_LONGRTS);
			rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming,
						&tag, msglen, NULL, 0, 1,
						ptl_handle_rtsmatch, &req);

			req->rts_peer = tok->tok.epaddr_incoming;
			req->ptl_req_ptr = sreq;
			req->rts_sbuf = sbuf;
#ifdef PSM_CUDA
			/* Payload in RTS would mean an IPC handle has been
			 * sent. This would also mean the sender has to
			 * send from a GPU buffer
			 */
			if (buf && len > 0) {
				req->cuda_ipc_handle = *((cudaIpcMemHandle_t*)buf);
				req->cuda_ipc_handle_attached = 1;
			}
#endif

			if (rc == MQ_RET_MATCH_OK)	/* we are in handler context, issue a reply */
				ptl_handle_rtsmatch_request(req, 1, tok);
			/* else will be called later */
			break;
		}
	}
	return;
}
+
+void
+psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+ psmi_assert(toki != NULL);
+
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming;
+ psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */
+ psmi_assert_always(req != NULL);
+ psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len);
+
+ return;
+}
+
+/**
+ * Function to handle CTS on the sender.
+ */
/**
 * Function to handle CTS on the sender.
 *
 * args (set by ptl_handle_rtsmatch_request on the receiver):
 *   args[0] = our send request, args[1] = receiver's request,
 *   args[2] = receiver's destination buffer, args[3].u32w0 = msglen,
 *   args[4].u32w0 = receiver's current kassist mode.
 */
void
psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf,
			    size_t len)
{
	amsh_am_token_t *tok = (amsh_am_token_t *) toki;

	psmi_assert(toki != NULL);

	ptl_t *ptl = tok->ptl;
	psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
#ifdef PSM_CUDA
	/* If send side req has a cuda ipc handle attached then we can
	 * assume the data has been copied as soon as we get a CTS
	 */
	if (sreq->cuda_ipc_handle_attached) {
		sreq->cuda_ipc_handle_attached = 0;
		psmi_mq_handle_rts_complete(sreq);
		return;
	}
#endif
	void *dest = (void *)(uintptr_t) args[2].u64w0;
	uint32_t msglen = args[3].u32w0;
	psm2_amarg_t rarg[1];

	_HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n",
		  sreq, (void *)(uintptr_t) args[1].u64w0, sreq->buf, dest,
		  msglen);

	if (msglen > 0) {
		rarg[0].u64w0 = args[1].u64w0;	/* rreq */
		int kassist_mode = ptl->psmi_kassist_mode;
		int kassist_mode_peer = args[4].u32w0;
		// In general, peer process(es) shall have the same kassist mode set,
		// but due to dynamic CMA failure detection, we must align local and remote state,
		// and make protocol to adopt to that potential change.
		if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) {
			ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
			goto no_kassist;
		}

		if (kassist_mode & PSMI_KASSIST_PUT) {
			int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming);
			/* NOTE(review): cma_put returns int64_t; storing it
			 * in size_t makes the -1 check rely on unsigned
			 * wrap-around to SIZE_MAX.  It works, but int64_t
			 * would be clearer -- TODO confirm/clean up. */
			size_t nbytes = cma_put(sreq->buf, pid, dest, msglen);
			if (nbytes == -1) {
				_HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n");
				ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
				/* jumps into the else-branch below: fall back
				 * to pushing the data in a long reply */
				goto no_kassist;
			}

			psmi_assert_always(nbytes == msglen);

			/* Send response that PUT is complete */
			psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx,
					      rarg, 1, NULL, 0, 0);
		} else if (!(kassist_mode & PSMI_KASSIST_MASK)) {
			/* Only transfer if kassist is off, i.e. neither GET nor PUT. */
no_kassist:
			psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg,
					     1, sreq->buf, msglen, dest, 0);
		}
	}
	psmi_mq_handle_rts_complete(sreq);
}
+
+void
+psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+ psmi_assert(narg == 1);
+ _HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->buf,
+ rreq->recv_msglen);
+ psmi_mq_handle_rts_complete(rreq);
+}
+
+void
+psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len)
+{
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+ psm2_am_handler_fn_t hfn;
+
+ psmi_assert(toki != NULL);
+
+ hfn = psm_am_get_handler_function(tok->mq->ep,
+ (psm2_handler_t) args[0].u32w0);
+
+ /* Invoke handler function. For AM we do not support break functionality */
+ hfn(toki, args + 1, narg - 1, buf, len);
+
+ return;
+}
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000..e1bd064
--- /dev/null
+++ b/ptl_am/ptl_fwd.h
@@ -0,0 +1,64 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_AMSH_H
+#define _PTL_FWD_AMSH_H
+
+/* Symbol in am ptl.  Declared 'extern' here: a header must only declare
+ * this object, not tentatively define it, or every translation unit that
+ * includes this file emits its own definition and the link fails with
+ * multiple-definition errors under -fno-common (the default since GCC 10).
+ * The single definition lives in the am ptl implementation. */
+extern struct ptl_ctl_init psmi_ptl_amsh;
+
+extern int psmi_shm_mq_rv_thresh;
+
+#endif
diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile
new file mode 100644
index 0000000..d48c883
--- /dev/null
+++ b/ptl_ips/Makefile
@@ -0,0 +1,96 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+# Resolve this directory to an absolute path so the shared build flags and
+# top-level includes work regardless of where make was invoked from.
+this_srcdir = $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+# Objects contributed to $(TARGLIB) by the ips ptl, rewritten below so each
+# one lands in $(OUTDIR).
+${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ipserror.o ips_recvq.o \
+		   ips_recvhdrq.o ips_spio.o ips_proto_recv.o ips_proto_connect.o \
+		   ips_proto_dump.o ips_proto_mq.o ips_subcontext.o \
+		   ips_writehdrq.o ips_proto_expected.o ips_tid.o \
+		   ips_scb.o ips_proto_am.o ips_opp_path_rec.o ips_tidflow.o \
+		   ips_epstate.o ips_crc32.o ips_path_rec.o ips_tidcache.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+# One generated .d dependency file per object.
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+# Generate auto-dependency files with the compiler (-MM skips system headers).
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+# Compile; order-only prerequisite on ${DEPS} ensures .d files exist first.
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_ips/ips_crc32.c b/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000..d6ed1bf
--- /dev/null
+++ b/ptl_ips/ips_crc32.c
@@ -0,0 +1,91 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+ modified from its original form to suit our requirements. The zlib
+ license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.3, July 18th, 2005
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup at gzip.org madler at alumni.caltech.edu
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64 at csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Lazily-built lookup table: the CRC of every possible byte value. */
+static uint32_t crc_table[256];
+
+/* Nonzero once crc_table has been filled in. */
+static int crc_table_computed;
+
+/* Populate crc_table for the reflected CRC-32 polynomial 0xedb88320
+ * (the same polynomial zlib uses), one bit at a time per table entry. */
+static void make_crc_table(void)
+{
+	int idx;
+
+	for (idx = 0; idx < 256; idx++) {
+		uint32_t rem = (uint32_t) idx;
+		int bit = 8;
+
+		while (bit-- > 0)
+			rem = (rem & 1) ? (0xedb88320 ^ (rem >> 1))
+					: (rem >> 1);
+		crc_table[idx] = rem;
+	}
+	crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see the
+ * crc() routine below)).
+ */
+
+/* Fold len bytes of data into the running CRC 'crc' and return the updated
+ * value.  Per the usual CRC-32 convention the caller seeds with all 1 bits
+ * and takes the one's complement of the final result.  The lookup table is
+ * built on first use. */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+	uint32_t remainder = crc;
+	uint32_t i;
+
+	if (!crc_table_computed)
+		make_crc_table();
+	for (i = 0; i < len; i++)
+		remainder = crc_table[(remainder ^ data[i]) & 0xff] ^
+		    (remainder >> 8);
+	return remainder;
+}
diff --git a/ptl_ips/ips_epstate.c b/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000..8206847
--- /dev/null
+++ b/ptl_ips/ips_epstate.c
@@ -0,0 +1,154 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver. Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index. This narrows the window in which
+ * two jobs communicating with the same set of indexes could suffer crosstalk.
+ */
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK 128
+
+/* Zero the endpoint-state table and derive a pseudo-random base index from
+ * the cycle counter; connection indexes are offsets from this base. */
+psm2_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+	memset(eps, 0, sizeof(*eps));
+	eps->context = context;
+	eps->eps_base_idx =
+	    ((ips_epstate_idx)get_cycles()) & (IPS_EPSTATE_CONNIDX_MAX - 1);
+	return PSM2_OK;
+}
+
+/* Release the slot table (if any) and wipe the bookkeeping structure. */
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps)
+{
+	if (eps->eps_tab != NULL)
+		psmi_free(eps->eps_tab);
+	memset(eps, 0, sizeof(*eps));
+	return PSM2_OK;
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'connidx'.
+ */
+psm2_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
+		ips_epstate_idx *connidx_o)
+{
+	int i, j;
+	ips_epstate_idx connidx;
+
+	/* Grow the table by PTL_EPADDR_ALLOC_CHUNK entries whenever the use
+	 * count would exceed the current capacity; existing entries are
+	 * copied into the fresh zeroed allocation. */
+	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
+		struct ips_epstate_entry *newtab;
+		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+		newtab = (struct ips_epstate_entry *)
+		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
+				eps->eps_tabsize,
+				sizeof(struct ips_epstate_entry));
+		if (newtab == NULL)
+			return PSM2_NO_MEMORY;
+		else if (eps->eps_tab) {	/* NOT first alloc */
+			for (i = 0;
+			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
+				newtab[i] = eps->eps_tab[i];	/* deep copy */
+			psmi_free(eps->eps_tab);
+		}
+		eps->eps_tab = newtab;
+	}
+	/* Find the next free hole. We can afford to do this since connect is not
+	 * in the critical path */
+	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+		if (j == eps->eps_tabsize)
+			j = 0;
+		if (eps->eps_tab[j].ipsaddr == NULL) {
+			eps->eps_tab_nextidx = j + 1;
+			if (eps->eps_tab_nextidx == eps->eps_tabsize)
+				eps->eps_tab_nextidx = 0;
+			break;
+		}
+	}
+	/* The grow-on-entry above guarantees at least one free slot exists. */
+	psmi_assert_always(i != eps->eps_tabsize);
+	/* Externally visible connidx is the table slot offset by the random
+	 * base chosen at init time, reduced mod IPS_EPSTATE_CONNIDX_MAX. */
+	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
+		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx,
+		  j);
+	/* NOTE(review): the slot is claimed here (and eps_tabsizeused was
+	 * already incremented) before the overflow check below; the error
+	 * path does not roll either back -- TODO confirm intentional. */
+	eps->eps_tab[j].ipsaddr = ipsaddr;
+	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
+		return psmi_handle_error(eps->context->ep,
+					 PSM2_TOO_MANY_ENDPOINTS,
+					 "Can't connect to more than %d non-local endpoints",
+					 IPS_EPSTATE_CONNIDX_MAX);
+	}
+	*connidx_o = connidx;
+	return PSM2_OK;
+}
+
+/* Clear the table slot backing 'connidx'.  The slot is only emptied; the
+ * table memory itself is never shrunk. */
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx)
+{
+	/* actual table index */
+	ips_epstate_idx idx =
+	    (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1);
+
+	psmi_assert_always(idx < eps->eps_tabsize);
+	_HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx);
+	eps->eps_tab[idx].ipsaddr = NULL;
+	/* We may eventually want to release memory, but probably not */
+	eps->eps_tabsizeused--;
+	return PSM2_OK;
+}
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000..7308040
--- /dev/null
+++ b/ptl_ips/ips_epstate.h
@@ -0,0 +1,100 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+/* Connection index: a peer's table slot expressed as an offset from
+ * eps_base_idx, always reduced modulo IPS_EPSTATE_CONNIDX_MAX. */
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_CONNIDX_MAX (1<<26)
+
+struct ips_epaddr;
+
+/* One table slot; an empty slot has ipsaddr == NULL. */
+struct ips_epstate_entry {
+	struct ips_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+	const psmi_context_t *context;
+	ips_epstate_idx eps_base_idx;	/* base offset (from cycle counter) */
+	int eps_tabsize;	/* allocated entries in eps_tab */
+	int eps_tabsizeused;	/* entries currently in use */
+	int eps_tab_nextidx;	/* hint for the next free-slot scan */
+
+	struct ips_epstate_entry *eps_tab;
+};
+
+psm2_error_t ips_epstate_init(struct ips_epstate *eps,
+			      const psmi_context_t *contextj);
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm2_error_t ips_epstate_add(struct ips_epstate *eps,
+			     struct ips_epaddr *ipsaddr,
+			     ips_epstate_idx *connidx);
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx);
+
+/* Translate a connection index back to its table entry; returns NULL when
+ * the index maps outside the allocated table. */
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	if (idx < eps->eps_tabsize)
+		return &eps->eps_tab[idx];
+	else
+		return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/ptl_ips/ips_expected_proto.h b/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000..a402b93
--- /dev/null
+++ b/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,397 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * Control and state structure for one instance of the expected protocol. The
+ * protocol depends on some upcalls from internal portions of the receive
+ * protocol (such as opcodes dedicated for expected protocol handling)
+ */
+
+/*
+ * Expected tid operations are carried out over "sessions". One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM2_MQ_RNDV_HFI_WINDOW). Since naks can cause
+ * retransmissions, each session has an session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify if retransmitted
+ * packets reference the correct session.
+ *
+ * index and generation count are each 4 bytes encoded in one ptl_arg. They
+ * could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+#define _desc_idx u32w0
+#define _desc_genc u32w1
+
+/*
+ * For debug and/or other reasons, we can log the state of each tid and
+ * optionally associate it to a particular receive descriptor
+ */
+
+#define TIDSTATE_FREE 0
+#define TIDSTATE_USED 1
+
+/* Per-TID bookkeeping: hardware TID value, FREE/USED state, and the
+ * receive descriptor the TID is currently associated with (if any). */
+struct ips_tidinfo {
+	uint32_t tid;
+	uint32_t state;		/* TIDSTATE_FREE or TIDSTATE_USED */
+	struct ips_tid_recv_desc *tidrecvc;
+};
+
+/* Generate an expected header every 16 packets */
+#define PSM_DEFAULT_EXPECTED_HEADER 16
+
+/* Control/state for one instance of the expected (TID) protocol; holds the
+ * descriptor pools, pending work queues and timers driving TID transfers. */
+struct ips_protoexp {
+	const struct ptl *ptl;
+	struct ips_proto *proto;
+	struct psmi_timer_ctrl *timerq;
+	struct ips_tid tidc;
+	struct ips_tf tfc;
+
+	psm_transfer_type_t ctrl_xfer_type;
+	psm_transfer_type_t tid_xfer_type;
+	struct ips_scbctrl tid_scbc_rv;
+	mpool_t tid_desc_send_pool;
+	mpool_t tid_getreq_pool;
+	mpool_t tid_sreq_pool;	/* backptr into proto->ep->mq */
+	mpool_t tid_rreq_pool;	/* backptr into proto->ep->mq */
+	struct drand48_data tidflow_drand48_data;
+	uint32_t tid_flags;
+	uint32_t tid_send_fragsize;
+	uint32_t tid_page_offset_mask;
+	uint64_t tid_page_mask;
+	uint32_t hdr_pkt_interval;
+	struct ips_tidinfo *tid_info;
+
+	STAILQ_HEAD(ips_tid_send_pend,	/* pending exp. sends */
+		    ips_tid_send_desc) pend_sendq;
+	struct psmi_timer timer_send;
+
+	STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq;	/* pending tid reqs */
+	struct psmi_timer timer_getreqs;
+
+#ifdef PSM_CUDA
+	STAILQ_HEAD(ips_tid_get_cudapend,	/* pending cuda transfers */
+		    ips_tid_get_request) cudapend_getreqsq;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg;
+	mpool_t cuda_hostbuf_pool_recv;
+	mpool_t cuda_hostbuf_pool_small_recv;
+	cudaStream_t cudastream_recv;
+#endif
+};
+
+/*
+ * TID member list format used in communication.
+ * Since the compiler does not make sure the bit fields order,
+ * we use mask and shift defined below.
+typedef struct {
+ uint32_t length:11; // in page unit, max 1024 pages
+ uint32_t reserved:9; // for future usage
+ uint32_t tidctrl:2; // hardware defined tidctrl value
+ uint32_t tid:10; // hardware only support 10bits
+}
+ips_tid_session_member;
+ */
+#define IPS_TIDINFO_LENGTH_SHIFT 0
+#define IPS_TIDINFO_LENGTH_MASK 0x7ff
+#define IPS_TIDINFO_TIDCTRL_SHIFT 20
+#define IPS_TIDINFO_TIDCTRL_MASK 0x3
+#define IPS_TIDINFO_TID_SHIFT 22
+#define IPS_TIDINFO_TID_MASK 0x3ff
+
+#define IPS_TIDINFO_GET_LENGTH(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK)
+#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK)
+#define IPS_TIDINFO_GET_TID(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK)
+
+/* On-the-wire TID grant header: unaligned head/tail byte counts, TID-pair
+ * count, offsets, then the packed tsess_list[] entries (decoded with the
+ * IPS_TIDINFO_GET_* accessors). */
+typedef struct {
+	uint8_t tsess_unaligned_start;	/* unaligned bytes at starting */
+	uint8_t tsess_unaligned_end;	/* unaligned bytes at ending */
+	uint16_t tsess_tidcount;	/* tid number for the session */
+	uint32_t tsess_tidoffset;	/* offset in first tid */
+	uint32_t tsess_srcoff;	/* source offset from beginning */
+	uint32_t tsess_length;	/* session length, including start/end */
+
+	uint32_t tsess_list[0];	/* must be last in struct */
+} ips_tid_session_list;
+
+/*
+ * Send-side expected send descriptors.
+ *
+ * Descriptors are allocated when tid grant requests are received (the 'target'
+ * side of an RDMA get request). Descriptors are added to a pending queue of
+ * expected sends and processed one at a time (scb's are requested and messages
+ * sent until all fragments of the descriptor's length are put on the wire).
+ *
+ */
+#define TIDSENDC_SDMA_VEC_DEFAULT	260
+
+/* Send-side state for one expected-send session (allocated when a tid
+ * grant request arrives; queued on pend_sendq and drained by progress). */
+struct ips_tid_send_desc {
+	struct ips_protoexp *protoexp;
+	STAILQ_ENTRY(ips_tid_send_desc) next;
+
+	/* Filled in at allocation time */
+	ptl_arg_t sdescid;	/* sender descid */
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	psm2_mq_req_t mqreq;
+
+	/* tidflow to send tid traffic */
+	struct ips_flow tidflow;
+
+	/* Iterated during send progress */
+	void *userbuf;		/* user provided buffer */
+	void *buffer;
+	uint32_t length;	/* total length, including start/end */
+
+	uint32_t tidbytes;	/* bytes sent over tid so far */
+	uint32_t remaining_tidbytes;
+	uint32_t offset_in_tid;	/* could be more than page */
+	uint32_t remaining_bytes_in_tid;
+
+	uint16_t frame_send;
+	uint16_t tid_idx;
+	uint16_t is_complete;
+	uint16_t frag_size;
+	/* bitmap of queued control messages for flow */
+	uint16_t ctrl_msg_queued;
+
+#ifdef PSM_CUDA
+	/* As size of cuda_hostbuf is less than equal to window size,
+	 * there is a guarantee that the maximum number of host bufs we
+	 * would need to attach to a tidsendc would be 2
+	 */
+	struct ips_cuda_hostbuf *cuda_hostbuf[2];
+	/* Number of hostbufs attached */
+	uint8_t cuda_num_buf;
+#endif
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit tid grant message into a single transfer
+	 * unit, either PIO or SDMA, PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			       sizeof(ips_tid_session_list)];
+	};
+};
+
+#define TIDRECVC_STATE_FREE 0
+#define TIDRECVC_STATE_BUSY 1
+
+/* Per-receive-descriptor error/retransmit counters for the expected
+ * protocol. */
+struct ips_expected_recv_stats {
+	uint32_t nSeqErr;	/* sequence errors observed */
+	uint32_t nGenErr;	/* generation errors observed */
+	uint32_t nReXmit;	/* retransmissions */
+	uint32_t nErrChkReceived;
+};
+
+/* Receive-side state for one expected-protocol window (the 'target' side
+ * of the two-sided RDMA get). */
+struct ips_tid_recv_desc {
+	const psmi_context_t *context;
+	struct ips_protoexp *protoexp;
+
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	struct ips_tid_get_request *getreq;
+
+	/* scb to send tid grant CTS */
+	ips_scb_t *grantscb;
+	/* scb to send tid data completion */
+	ips_scb_t *completescb;
+
+	/* tidflow to only send ctrl msg ACK and NAK */
+	struct ips_flow tidflow;
+
+	/* TF protocol state (recv) */
+	uint32_t state;
+	uint32_t tidflow_active_gen;
+	uint32_t tidflow_nswap_gen;
+	psmi_seqnum_t tidflow_genseq;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *cuda_hostbuf;
+	uint8_t is_ptr_gpu_backed;
+#endif
+
+	void *buffer;
+	uint32_t recv_msglen;
+	uint32_t recv_tidbytes;	/* exclude start/end trim */
+
+	struct ips_expected_recv_stats stats;
+
+	/* bitmap of queued control messages for */
+	uint16_t ctrl_msg_queued;
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit tid grant message into a single transfer
+	 * unit, either PIO or SDMA, PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			       sizeof(ips_tid_session_list)];
+	};
+};
+
+/*
+ * Get requests, issued by MQ when there's a match on a large message. Unlike
+ * an RDMA get, the initiator identifies the location of the data at the target
+ * using a 'send token' instead of a virtual address. This, of course, assumes
+ * that the target has already registered the token and communicated it to the
+ * initiator beforehand (it actually sends the token as part of the initial
+ * MQ message that contains the MQ tag).
+ *
+ * The operation is semantically a two-sided RDMA get.
+ */
+/* Completion callback invoked (with tidgr_ucontext) once a get request
+ * has fully transferred. */
+typedef void (*ips_tid_completion_callback_t) (void *);
+
+/* One MQ-issued get request; identifies remote data by send token rather
+ * than virtual address (see block comment above). */
+struct ips_tid_get_request {
+	STAILQ_ENTRY(ips_tid_get_request) tidgr_next;
+	struct ips_protoexp *tidgr_protoexp;
+	psm2_epaddr_t tidgr_epaddr;
+
+	void *tidgr_lbuf;	/* local buffer; presumably the get destination */
+	uint32_t tidgr_length;
+	uint32_t tidgr_rndv_winsz;
+	uint32_t tidgr_sendtoken;
+	ips_tid_completion_callback_t tidgr_callback;
+	void *tidgr_ucontext;	/* opaque user context for the callback */
+
+	uint32_t tidgr_offset;	/* offset in bytes */
+	uint32_t tidgr_bytesdone;
+	uint32_t tidgr_flags;
+
+#ifdef PSM_CUDA
+	int cuda_hostbuf_used;
+	uint32_t tidgr_cuda_bytesdone;
+	STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend,	/* pending exp. sends */
+		    ips_cuda_hostbuf) pend_cudabuf;
+#endif
+};
+
+/*
+ * Descriptor limits, structure contents of struct psmi_rlimit_mpool for
+ * normal, min and large configurations.
+ */
+#define TID_SENDSESSIONS_LIMITS { \
+ .env = "PSM2_TID_SENDSESSIONS_MAX", \
+ .descr = "Tid max send session descriptors", \
+ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \
+ .minval = 1, \
+ .maxval = 1<<30, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \
+ }
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to a MQ request. It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
+ */
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+ const struct ips_proto *proto,
+ uint32_t protoexp_flags, int num_of_send_bufs,
+ int num_of_send_desc,
+ struct ips_protoexp **protoexp_o);
+MOCK_DCL_EPILOGUE(ips_protoexp_init);
+
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp);
+void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev);
+
+int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev);
+int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev);
+
+psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc);
+
+/* Byte-wise copy for the unaligned head/tail of a TID transfer.  Copies
+ * from the highest byte down to the lowest; a zero len is a no-op. */
+PSMI_ALWAYS_INLINE(
+void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len))
+{
+	uint16_t i;
+
+	for (i = len; i > 0; i--)
+		dst[i - 1] = src[i - 1];
+}
+
+/*
+ * Peer is waiting (blocked) for this request
+ */
+#define IPS_PROTOEXP_TIDGET_WAIT 0x1
+#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2
+psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+ void *buf, uint32_t length,
+ psm2_epaddr_t epaddr,
+ uint32_t remote_tok, uint32_t flags,
+ ips_tid_completion_callback_t
+ callback, void *context);
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr, psm2_mq_req_t req,
+ ptl_arg_t rdescid, uint32_t tidflow_genseq,
+ ips_tid_session_list *tid_list,
+ uint32_t tid_list_size);
diff --git a/ptl_ips/ips_opp_path_rec.c b/ptl_ips/ips_opp_path_rec.c
new file mode 100644
index 0000000..b9c3904
--- /dev/null
+++ b/ptl_ips/ips_opp_path_rec.c
@@ -0,0 +1,602 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include <dlfcn.h>
+
+#define DF_OPP_LIBRARY "libopasadb.so.1.0.0"
+#define DATA_VFABRIC_OFFSET 8
+
+/* SLID and DLID are in network byte order */
+static psm2_error_t
+ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto,
+ uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+ ips_path_rec_t **ppath_rec)
+{
+ psm2_error_t err = PSM2_OK;
+ ibta_path_rec_t query, opp_response;
+#ifdef _HFI_DEBUGGING
+ int opp_response_set = 0;
+#endif
+ ips_path_rec_t *path_rec;
+ int opp_err;
+ ENTRY elid, *epath = NULL;
+ char eplid[128];
+ uint64_t timeout_ack_ms;
+
+ /* Query path record query cache first */
+ bzero(&query, sizeof(query));
+ bzero(eplid, sizeof(eplid));
+
+ /* Bulk service ID is control service id + 1 */
+ switch (type) {
+ case IPS_PATH_LOW_PRIORITY:
+ query.service_id =
+ __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET);
+ break;
+ case IPS_PATH_NORMAL_PRIORITY:
+ case IPS_PATH_HIGH_PRIORITY:
+ default:
+ query.service_id = __cpu_to_be64(proto->ep->service_id);
+ }
+
+ query.slid = slid;
+ query.dlid = dlid;
+
+ snprintf(eplid, sizeof(eplid), "%s_%x_%x",
+ (type == IPS_PATH_LOW_PRIORITY) ? "LOW" : "HIGH",
+ query.slid, query.dlid);
+ elid.key = eplid;
+ hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+ if (!epath) { /* Unable to find path record in cache */
+ elid.key =
+ psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+ path_rec = (ips_path_rec_t *)
+ psmi_calloc(proto->ep, UNDEFINED, 1,
+ sizeof(ips_path_rec_t));
+ if (!elid.key || !path_rec) {
+ if (elid.key)
+ psmi_free(elid.key);
+ if (path_rec)
+ psmi_free(path_rec);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ /* Get path record between local LID and remote */
+ opp_err =
+ proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt,
+ &query,
+ &opp_response);
+ if (opp_err) {
+ psmi_free(path_rec);
+ psmi_free(elid.key);
+ err = PSM2_EPID_PATH_RESOLUTION;
+ goto fail;
+ }
+#ifdef _HFI_DEBUGGING
+ opp_response_set = 1;
+#endif
+ /* Create path record */
+ path_rec->pr_slid = opp_response.slid;
+ path_rec->pr_dlid = opp_response.dlid;
+ path_rec->pr_mtu =
+ min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f),
+ proto->epinfo.ep_mtu);
+ path_rec->pr_pkey = ntohs(opp_response.pkey);
+ path_rec->pr_sl = ntohs(opp_response.qos_class_sl);
+ path_rec->pr_static_ipd =
+ proto->ips_ipd_delay[opp_response.rate & 0x3f];
+
+ /* Setup CCA parameters for path */
+ if (path_rec->pr_sl > PSMI_SL_MAX) {
+ psmi_free(path_rec);
+ psmi_free(elid.key);
+ err = PSM2_INTERNAL_ERR;
+ goto fail;
+ }
+ if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) {
+ _HFI_CCADBG("No CCA for sl %d, disable CCA\n",
+ path_rec->pr_sl);
+ proto->flags &= ~IPS_PROTO_FLAG_CCA;
+ proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+ }
+ if (!(proto->ep->context.runtime_flags &
+ HFI1_CAP_STATIC_RATE_CTRL)) {
+ _HFI_CCADBG("No Static-Rate-Control, disable CCA\n");
+ proto->flags &= ~IPS_PROTO_FLAG_CCA;
+ proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+ }
+
+ path_rec->proto = proto;
+ path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min;
+ path_rec->pr_timer_cca = NULL;
+
+ /* Determine active IPD for path. Is max of static rate and CCT table */
+ if (!(proto->flags & IPS_PROTO_FLAG_CCA)) {
+ path_rec->pr_active_ipd = 0;
+ path_rec->pr_cca_divisor = 0;
+ } else if ((path_rec->pr_static_ipd) &&
+ ((path_rec->pr_static_ipd + 1) >
+ (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
+ path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
+ path_rec->pr_cca_divisor = 0; /*Static rate has no CCA divisor */
+ } else {
+ /* Pick it from the CCT table */
+ path_rec->pr_active_ipd =
+ proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
+ path_rec->pr_cca_divisor =
+ proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;
+ }
+
+ /* Compute max timeout based on pkt life time for path */
+ timeout_ack_ms =
+ ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) /
+ 1000000UL);
+ timeout_ack_ms =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT +
+ timeout_ack_ms);
+ if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms)
+ proto->epinfo.ep_timeout_ack_max = timeout_ack_ms;
+
+ /* Add path record into cache */
+ strcpy(elid.key, eplid);
+ elid.data = (void *)path_rec;
+ hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+ } else /* Path record found in cache */
+ path_rec = (ips_path_rec_t *) epath->data;
+
+#ifdef _HFI_DEBUGGING
+ /* Dump path record stats */
+ _HFI_PRDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n",
+ (uint64_t) __be64_to_cpu(query.service_id),
+ __be16_to_cpu(slid), __be16_to_cpu(dlid));
+ if (opp_response_set)
+ {
+ _HFI_PRDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f),
+ path_rec->pr_mtu);
+ _HFI_PRDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey));
+ _HFI_PRDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl));
+ _HFI_PRDBG("Rate: %x, IPD: %x\n", (opp_response.rate & 0x3f),
+ path_rec->pr_static_ipd);
+ }
+ _HFI_PRDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n",
+ proto->epinfo.ep_timeout_ack,
+ proto->epinfo.ep_timeout_ack_max);
+#endif
+ /* Return the IPS path record */
+ *ppath_rec = path_rec;
+
+fail:
+ return err;
+}
+
/*
 * Build (or fetch from cache) the path group for a <slid, dlid> pair by
 * querying OPP for high, normal and low priority paths across all LMC
 * LIDs.  On success *ppathgrp points at the cached group.  Returns
 * PSM2_OK, PSM2_NO_MEMORY, PSM2_EPID_PATH_RESOLUTION (no usable path
 * for some priority), or PSM2_EP_DEVICE_FAILURE (pkey programming).
 */
static psm2_error_t
ips_opp_path_rec(struct ips_proto *proto,
		 uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
		 unsigned long timeout, ips_path_grp_t **ppathgrp)
{
	psm2_error_t err = PSM2_OK;
	uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
	ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
	ips_path_rec_t *path;
	ips_path_grp_t *pathgrp;
	uint16_t path_slid, path_dlid;
	ENTRY elid, *epath = NULL;
	char eplid[128];

	/*
	 * High Priority Path
	 * ------------------
	 *
	 * Uses the "base" Service ID. For now there exists only 1 high priority
	 * path between nodes even for non zero LMC fabrics.
	 *
	 * Normal/Low Priority Paths
	 * -------------------------
	 *
	 * Currently these paths are the same i.e. they are queried for the same
	 * Service ID/vFabric which is the Base Service ID for High Priority + 1.
	 *
	 * Use case Scenarios
	 * ------------------
	 *
	 * Since with vFabrics we have the capability to define different QoS
	 * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is
	 * setup in a separate vFabric for high priority traffic. The NORMAL paths
	 * are setup in a separate vFabric optimized for high bandwidth. This allows
	 * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked
	 * by bulk transfer data. All control messages (ACKs, NAKs, TID_GRANT etc.)
	 * also use the high priority control vFabric.
	 *
	 * NOTE: In order to distinguish between the different vFabrics the user
	 * specifies the service ID to use via mpirun (or environment variable).
	 * This is the service ID for the high priority control traffic. The bulk
	 * data vFabric is identified by service ID + 1. So for each MPI application
	 * one should specify two service IDs for the high priority and bulk data.
	 * Both these service IDs can be placed in the same vFabric which can be
	 * configured for high priority or bandwidth traffic giving us the default
	 * behavior up to the Infinipath 2.5 release.
	 *
	 * NOTE: All of the above would have really helped if the S20 silicon could
	 * correctly support IBTA QoS features. Due to S20 design we can only have
	 * high priority VLarb table (low priority VLarb table results in round
	 * robin arbitration ignoring the weights!). But if this is fixed in a
	 * subsequent chip respin then this may potentially help our scalability
	 * on large fabrics.
	 *
	 * Mesh/Torus and DOR routed networks
	 * ----------------------------------
	 *
	 * In a mesh/torus fabric we always have a non zero LMC (at least 1 can be
	 * more). We would like to take advantage of dispersive routing on these
	 * fabrics as well to obtain better "worst case/congested" bandwidth. For
	 * these networks currently the base LIDs are used for UPDN routing which
	 * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR
	 * routing (Dimension Ordered Routing) to avoid deadlocks and provide
	 * higher performance. If a fabric is disrupted then only the base UPDN
	 * routing is available. PSM should continue to operate in this environment
	 * albeit with degraded performance. In disrupted fabric the OPP path
	 * record queries may fail for some DOR routed LIDs i.e. no path exists.
	 * PSM should hence ignore path record failures as they indicate a disrupted
	 * fabric and only use valid paths that are returned from the replica. This
	 * will degenerate to only using the UPDN paths on disrupted fabrics and DOR
	 * routes only for fully configured fabrics. Note: For a clean fabric the
	 * base LIDs that are configured for UPDN route will not exist in the replica
	 * as DOR routes are preferred. Hence we will only dispersively route across
	 * the DOR routes only using the UPDN route for disrupted fabrics.
	 *
	 * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN
	 * TAKE PLACE.
	 */

	/* Check if this path grp is already in hash table */
	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
	elid.key = eplid;
	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);

	if (epath) {		/* Found path group in cache */
		*ppathgrp = (ips_path_grp_t *) epath->data;
		return err;
	}

	/* If base lids are only used then reset num_path to 1 */
	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
		num_path = 1;

	/* Allocate a new pathgroup; the path pointer array for all
	 * num_path x priority slots is tacked onto the end. */
	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
	pathgrp = (ips_path_grp_t *)
	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
			num_path * IPS_PATH_MAX_PRIORITY *
			sizeof(ips_path_rec_t *));
	if (!elid.key || !pathgrp) {
		if (elid.key)
			psmi_free(elid.key);
		if (pathgrp)
			psmi_free(pathgrp);
		err = PSM2_NO_MEMORY;
		goto fail;
	}

	/* dlid is the peer base lid */
	pathgrp->pg_base_lid = __be16_to_cpu(dlid);

	pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] =
	    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] =
	    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0;

	/* For now there is always only one high priority path between nodes:
	 * scan LMC LIDs until the first one resolves. */
	for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

		err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
					   path_slid, path_dlid,
					   desthfi_type, &path);

		if (err == PSM2_OK) {	/* Valid high priority path found */
			/* Resolved high priority path successfully */
			pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++;
			pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path;

			/* Increment current path index */
			cpath++;
		}
	}

	/* Make sure we have at least 1 high priority path */
	if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Once we have the high-priority path, set the partition key */
	if (hfi_set_pkey(proto->ep->context.ctrl,
			 (uint16_t) pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey) != 0) {
		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
					"Couldn't set device pkey 0x%x: %s",
					(int)pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey,
					strerror(errno));
		psmi_free(elid.key);
		psmi_free(pathgrp);
		goto fail;
	}


	/* Next setup the bulk paths. If the subnet administrator has misconfigured
	 * or rather not configured two separate service IDs we place the bulk
	 * paths in the same vFabric as the control paths.
	 */

	path_type = IPS_PATH_NORMAL_PRIORITY;
	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

retry_normal_path_res:
		err = ips_opp_get_path_rec(path_type, proto,
					   path_slid, path_dlid, desthfi_type,
					   &path);
		if (err != PSM2_OK) {
			if (path_type == IPS_PATH_NORMAL_PRIORITY) {
				/* Subnet may only be configured for one service ID/vFabric. Default
				 * to using the control vFabric/service ID for bulk data as well.
				 */
				path_type = IPS_PATH_HIGH_PRIORITY;
				goto retry_normal_path_res;
			}

			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
			 * for disrupted fabrics using DOR routing so continue to acquire paths
			 */
			err = PSM2_OK;
			continue;
		}

		/* Valid path. */
		pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path;
		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
		cpath++;
	}

	/* Make sure we have at least a single bulk data transfer path */
	if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Low priority paths: same procedure as the normal priority loop
	 * above, falling back to the control vFabric on failure. */
	path_type = IPS_PATH_LOW_PRIORITY;
	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

retry_low_path_res:
		err = ips_opp_get_path_rec(path_type, proto,
					   path_slid, path_dlid, desthfi_type,
					   &path);
		if (err != PSM2_OK) {
			if (path_type == IPS_PATH_LOW_PRIORITY) {
				/* Subnet may only be configured for one service ID/vFabric. Default
				 * to using the control vFabric/service ID for bulk data as well.
				 */
				path_type = IPS_PATH_HIGH_PRIORITY;
				goto retry_low_path_res;
			}

			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
			 * for disrupted fabrics using DOR routing so continue to acquire paths
			 */
			err = PSM2_OK;
			continue;
		}

		/* Valid path. */
		pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path;
		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++;
		cpath++;
	}

	/* Make sure we have at least a single low priority path */
	if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Adaptive policy: stagger each endpoint's starting path by its
	 * context number to spread load across the available paths. */
	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
		    proto->epinfo.ep_context %
		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
		    proto->epinfo.ep_context %
		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
	}

	/* Add path group into cache */
	strcpy(elid.key, eplid);
	elid.data = (void *)pathgrp;
	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);

	*ppathgrp = pathgrp;

fail:
	if (err != PSM2_OK)
		_HFI_PRDBG
		    ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n",
		     slid, dlid);
	return err;
}
+
+static psm2_error_t ips_opp_fini(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+
+ if (proto->opp_lib)
+ dlclose(proto->opp_lib);
+
+ return err;
+}
+
+psm2_error_t ips_opp_init(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+ char hfiName[32];
+
+ proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+ if (!proto->opp_lib) {
+ char *err = dlerror();
+ _HFI_ERROR
+ ("Unable to open OFED Plus Plus library %s. Error: %s\n",
+ DF_OPP_LIBRARY, err ? err : "no dlerror()");
+ goto fail;
+ }
+
+ /* Resolve symbols that we require within opp library */
+ proto->opp_fn.op_path_find_hca =
+ dlsym(proto->opp_lib, "op_path_find_hfi");
+ proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+ proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+ proto->opp_fn.op_path_get_path_by_rec =
+ dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+ /* If we can't resovle any symbol then fail to load opp module */
+ if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+ !proto->opp_fn.op_path_close
+ || !proto->opp_fn.op_path_get_path_by_rec) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in OPP library. Unloading.\n");
+ goto fail;
+ }
+
+ /* If PSM2_IDENTIFY is set display the OPP library location being used. */
+ if (getenv("PSM2_IDENTIFY")) {
+ Dl_info info_opp;
+ printf
+ ("PSM2 path record queries using OFED Plus Plus (%s) from %s\n",
+ DF_OPP_LIBRARY, dladdr(proto->opp_fn.op_path_open,
+ &info_opp) ? info_opp.
+ dli_fname :
+ "Unknown/unsupported version of OPP library found!");
+ }
+
+ /* Obtain handle to hfi (requires verbs on node) */
+ snprintf(hfiName, sizeof(hfiName), "hfi1_%d",
+ proto->ep->context.ctrl->__hfi_unit);
+ proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device);
+ if (!proto->hndl) {
+ _HFI_ERROR
+ ("OPP: Unable to find HFI %s. Disabling OPP interface for path record queries.\n",
+ hfiName);
+ goto fail;
+ }
+
+ /* Get OPP context */
+ proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1);
+ if (!proto->opp_ctxt) {
+ _HFI_ERROR
+ ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n");
+ goto fail;
+ }
+
+ /* Setup default errorcheck timeout. OPP may change it later. */
+ proto->epinfo.ep_timeout_ack =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+ proto->epinfo.ep_timeout_ack_max =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+ proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT;
+
+ /* OPP initialized successfully */
+ proto->ibta.get_path_rec = ips_opp_path_rec;
+ proto->ibta.fini = ips_opp_fini;
+ proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC;
+
+ return err;
+
+fail:
+ _HFI_ERROR("Make sure SM is running...\n");
+ _HFI_ERROR("Make sure service ibacm is running...\n");
+ _HFI_ERROR("to start ibacm: service ibacm start\n");
+ _HFI_ERROR("or enable it at boot time: opaconfig -E ibacm\n\n");
+
+ err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+ "Unable to initialize OFED Plus library successfully.\n");
+
+ if (proto->opp_lib)
+ dlclose(proto->opp_lib);
+
+ return err;
+}
diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c
new file mode 100644
index 0000000..647b111
--- /dev/null
+++ b/ptl_ips/ips_path_rec.c
@@ -0,0 +1,791 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+static void ips_gen_ipd_table(struct ips_proto *proto)
+{
+ uint8_t delay = 0, step = 1;
+ /* Based on our current link rate setup the IPD table */
+ memset(proto->ips_ipd_delay, 0xFF, sizeof(proto->ips_ipd_delay));
+
+ /*
+ * Based on the starting rate of the link, we let the code to
+ * fall through to next rate without 'break' in the code. The
+ * decrement is doubled at each rate level...
+ */
+ switch (proto->epinfo.ep_link_rate) {
+ case IBV_RATE_300_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_200_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_168_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_120_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ case IBV_RATE_112_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ case IBV_RATE_100_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_80_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_80_GBPS] = delay;
+ case IBV_RATE_60_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_60_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_40_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_40_GBPS] = delay;
+ case IBV_RATE_30_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_30_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_25_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_25_GBPS] = delay;
+ case IBV_RATE_20_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_20_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_10_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_10_GBPS] = delay;
+ case IBV_RATE_5_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_5_GBPS] = delay;
+ default:
+ break;
+ }
+}
+
+static psm2_error_t ips_gen_cct_table(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t cca_divisor, ipdidx, ipdval = 1;
+ uint16_t *cct_table;
+
+ /* The CCT table is static currently. If it's already created then return */
+ if (proto->cct)
+ goto fail;
+
+ /* Allocate the CCT table */
+ cct_table = psmi_calloc(proto->ep, UNDEFINED,
+ proto->ccti_size, sizeof(uint16_t));
+ if (!cct_table) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ if (proto->ccti_size)
+ {
+ /* The first table entry is always 0 i.e. no IPD delay */
+ cct_table[0] = 0;
+ }
+
+ /* Generate the remaining CCT table entries */
+ for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++)
+ for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) {
+ if ((ipdidx + cca_divisor) == proto->ccti_size)
+ break;
+ cct_table[ipdidx + cca_divisor] =
+ (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) |
+ (ipdval & 0x3FFF));
+ _HFI_CCADBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n",
+ ipdidx + cca_divisor,
+ cct_table[ipdidx + cca_divisor],
+ (cct_table[ipdidx + cca_divisor] >>
+ CCA_DIVISOR_SHIFT),
+ cct_table[ipdidx +
+ cca_divisor] & CCA_IPD_MASK);
+ }
+
+ /* On link up/down CCT is re-generated. If CCT table is previously created
+ * free it
+ */
+ if (proto->cct) {
+ psmi_free(proto->cct);
+ proto->cct = NULL;
+ }
+
+ /* Update to the new CCT table */
+ proto->cct = cct_table;
+
+fail:
+ return err;
+}
+
+static opa_rate ips_default_hfi_rate(uint16_t hfi_type)
+{
+ opa_rate rate;
+
+ switch (hfi_type) {
+ case PSMI_HFI_TYPE_OPA1:
+ rate = IBV_RATE_100_GBPS;
+ break;
+ case PSMI_HFI_TYPE_OPA2:
+ rate = IBV_RATE_120_GBPS;
+ break;
+ default:
+ rate = IBV_RATE_MAX;
+ }
+
+ return rate;
+}
+
+static opa_rate ips_rate_to_enum(int link_rate)
+{
+ opa_rate rate;
+
+ switch (link_rate) {
+ case 300:
+ rate = IBV_RATE_300_GBPS;
+ break;
+ case 200:
+ rate = IBV_RATE_200_GBPS;
+ break;
+ case 100:
+ rate = IBV_RATE_100_GBPS;
+ break;
+ case 25:
+ rate = IBV_RATE_25_GBPS;
+ break;
+ case 168:
+ rate = IBV_RATE_168_GBPS;
+ break;
+ case 112:
+ rate = IBV_RATE_112_GBPS;
+ break;
+ case 56:
+ rate = IBV_RATE_56_GBPS;
+ break;
+ case 14:
+ rate = IBV_RATE_14_GBPS;
+ break;
+ case 120:
+ rate = IBV_RATE_120_GBPS;
+ break;
+ case 80:
+ rate = IBV_RATE_80_GBPS;
+ break;
+ case 60:
+ rate = IBV_RATE_60_GBPS;
+ break;
+ case 40:
+ rate = IBV_RATE_40_GBPS;
+ break;
+ case 30:
+ rate = IBV_RATE_30_GBPS;
+ break;
+ case 20:
+ rate = IBV_RATE_20_GBPS;
+ break;
+ case 10:
+ rate = IBV_RATE_10_GBPS;
+ break;
+ case 5:
+ rate = IBV_RATE_5_GBPS;
+ break;
+ default:
+ rate = IBV_RATE_MAX;
+ }
+
+ return rate;
+}
+
/*
 * "None" (no SM query) path record resolution: synthesize a single path
 * record for <slid, dlid> from the local endpoint's own parameters
 * (MTU, pkey, SL) and cache it in proto->ips_path_rec_hash.
 *
 * slid/dlid are stored as given (network byte order from the callers).
 * 'timeout' is accepted for interface symmetry but unused here.
 *
 * Returns PSM2_OK, PSM2_NO_MEMORY, or PSM2_INTERNAL_ERR (SL > PSMI_SL_MAX).
 */
static psm2_error_t
ips_none_get_path_rec(struct ips_proto *proto,
		      uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
		      unsigned long timeout, ips_path_rec_t **ppath_rec)
{
	psm2_error_t err = PSM2_OK;
	ips_path_rec_t *path_rec;
	ENTRY elid, *epath = NULL;
	char eplid[128];

	/* Query the path record cache */
	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
	elid.key = eplid;
	hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);

	if (!epath) {
		elid.key =
		    psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
		path_rec = (ips_path_rec_t *)
		    psmi_calloc(proto->ep, UNDEFINED, 1,
				sizeof(ips_path_rec_t));
		if (!elid.key || !path_rec) {
			if (elid.key)
				psmi_free(elid.key);
			if (path_rec)
				psmi_free(path_rec);
			return PSM2_NO_MEMORY;
		}

		/* Create path record from local endpoint parameters (no SM
		 * to ask, so assume symmetric configuration). */
		path_rec->pr_slid = slid;
		path_rec->pr_dlid = dlid;
		path_rec->pr_mtu = proto->epinfo.ep_mtu;
		path_rec->pr_pkey = proto->epinfo.ep_pkey;
		path_rec->pr_sl = proto->epinfo.ep_sl;

		/* Determine the IPD based on our local link rate and default link rate for
		 * remote hfi type.
		 */
		path_rec->pr_static_ipd =
		    proto->ips_ipd_delay[ips_default_hfi_rate(desthfi_type)];

		_HFI_CCADBG("pr_static_ipd = %d\n", (int) path_rec->pr_static_ipd);

		/* Setup CCA parameters for path */
		if (path_rec->pr_sl > PSMI_SL_MAX) {
			psmi_free(elid.key);
			psmi_free(path_rec);
			return PSM2_INTERNAL_ERR;
		}
		/* NOTE: these disable CCA for the whole proto instance, not
		 * just this path. */
		if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) {
			_HFI_CCADBG("No CCA for sl %d, disable CCA\n",
				    path_rec->pr_sl);
			proto->flags &= ~IPS_PROTO_FLAG_CCA;
			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
		}
		if (!(proto->ep->context.runtime_flags &
		      HFI1_CAP_STATIC_RATE_CTRL)) {
			_HFI_CCADBG("No Static-Rate-Control, disable CCA\n");
			proto->flags &= ~IPS_PROTO_FLAG_CCA;
			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
		}

		path_rec->proto = proto;
		path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min;
		path_rec->pr_timer_cca = NULL;

		/* Determine active IPD for path. Is max of static rate and CCT table */
		if (!(proto->flags & IPS_PROTO_FLAG_CCA)) {
			_HFI_CCADBG("No IPS_PROTO_FLAG_CCA\n");

			path_rec->pr_active_ipd = 0;
			path_rec->pr_cca_divisor = 0;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		} else if ((path_rec->pr_static_ipd) &&
			   ((path_rec->pr_static_ipd + 1) >
			    (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
			_HFI_CCADBG("IPS_PROTO_FLAG_CCA set, Setting pr_active_ipd.\n");

			path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
			path_rec->pr_cca_divisor = 0;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		} else {
			/* Pick it from the CCT table */
			_HFI_CCADBG("Picking up active IPD from CCT table, index %d, value 0x%x\n",
				    (int) path_rec->pr_ccti, (int) proto->cct[path_rec->pr_ccti]);

			path_rec->pr_active_ipd =
			    proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
			path_rec->pr_cca_divisor =
			    proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		}

		/* Add path record into cache */
		strcpy(elid.key, eplid);
		elid.data = (void *)path_rec;
		hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
	} else
		path_rec = (ips_path_rec_t *) epath->data;

	/* Return IPS path record */
	*ppath_rec = path_rec;

	return err;
}
+
+/*
+ * Build (or fetch from cache) the path group between slid and dlid for the
+ * "none" (no external path record query) resolution mode.
+ *
+ * slid/dlid are base LIDs in network byte order; desthfi_type and timeout
+ * are forwarded to ips_none_get_path_rec() for each generated path.  On
+ * success *ppathgrp points at the (possibly cached) group.
+ */
+static psm2_error_t
+ips_none_path_rec(struct ips_proto *proto,
+		  uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+		  unsigned long timeout, ips_path_grp_t **ppathgrp)
+{
+	psm2_error_t err = PSM2_OK;
+	uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc);
+	uint16_t base_slid, base_dlid;
+	ips_path_rec_t *path;
+	ips_path_grp_t *pathgrp;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	/* For the "none" path record resolution all paths are assumed to be of equal
+	 * priority however since we want to isolate all control traffic (acks, naks)
+	 * to a separate path for non zero LMC subnets the "first path" between a
+	 * pair of endpoints is always the "higher" priority paths. The rest of the
+	 * paths are the normal (and low priority) paths.
+	 */
+
+	/* Query the path record cache */
+	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);
+
+	if (epath) {	/* Find path group in cache */
+		*ppathgrp = (ips_path_grp_t *) epath->data;
+		return err;
+	}
+
+	/* If base lids are only used then reset num_path to 1 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		num_path = 1;
+
+	/* Allocate a new pathgroup.  The group struct and its trailing
+	 * pg_path pointer matrix (num_path x IPS_PATH_MAX_PRIORITY) are
+	 * allocated as one chunk; elid.key is a separate heap copy of the
+	 * cache key whose ownership passes to the hash table on ENTER. */
+	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+	pathgrp = (ips_path_grp_t *)
+	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
+			num_path * IPS_PATH_MAX_PRIORITY *
+			sizeof(ips_path_rec_t *));
+	if (!elid.key || !pathgrp) {
+		if (elid.key)
+			psmi_free(elid.key);
+		if (pathgrp)
+			psmi_free(pathgrp);
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	/* dlid is the peer base lid */
+	pathgrp->pg_base_lid = __be16_to_cpu(dlid);
+
+	if (num_path > 1) {
+		/* One control path and (num_path - 1) norm and low priority paths */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1;
+	} else {
+		/* LMC of 0. Use the same path for all priorities */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1;
+	}
+
+	/* For "none" path record we just setup 2^lmc paths. To get better load
+	 * balance
+	 */
+	for (pidx = 0; pidx < num_path; pidx++) {
+		/* Each path offsets both base LIDs by pidx (LMC routing). */
+		base_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		base_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+		err =
+		    ips_none_get_path_rec(proto, base_slid, base_dlid,
+					  desthfi_type, timeout, &path);
+		if (err != PSM2_OK) {
+			psmi_free(elid.key);
+			psmi_free(pathgrp);
+			goto fail;
+		}
+
+		if (num_path > 1) {
+			if (pidx == 0) {
+				/* First path is always the high priority path */
+				pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] =
+				    path;
+			} else {
+				/* Remaining paths fill slots 0..num_path-2 of
+				 * the normal and low priority rows. */
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_NORMAL_PRIORITY] =
+				    path;
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_LOW_PRIORITY] =
+				    path;
+			}
+		} else {
+			pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path;
+		}
+	}
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* Stagger each endpoint's starting path by its context index
+		 * so concurrent endpoints spread load across paths. */
+		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
+		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
+	}
+
+	/* Add path record into cache.
+	 * NOTE(review): the hsearch_r ENTER result is not checked; if the
+	 * table is full, elid.key and pathgrp would leak — confirm intended. */
+	strcpy(elid.key, eplid);
+	elid.data = (void *)pathgrp;
+	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);
+
+	*ppathgrp = pathgrp;
+
+fail:
+	if (err != PSM2_OK)
+		_HFI_PRDBG
+		    ("Unable to get path record for LID %x <---> DLID %x.\n",
+		     slid, dlid);
+	return err;
+}
+
+/*
+ * Initialize "none" (query-less) path record resolution for this proto:
+ * copy SL/PKEY from the endpoint, parse error-check timeout tuning from the
+ * environment, install the get_path_rec callback and program the pkey into
+ * the device.
+ */
+static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* SL and PKEY were already resolved onto the endpoint (from the
+	 * environment, e.g. HFI_SL / PSM_KEY); cache them in epinfo. */
+	proto->epinfo.ep_sl = proto->ep->out_sl;
+	proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey;
+
+	/*
+	 * Parse the err_chk settings from the environment.
+	 * <min_timeout>:<max_timeout>:<timeout_factor>
+	 */
+	{
+		union psmi_envvar_val env_to;
+		char *errchk_to = PSM_TID_TIMEOUT_DEFAULT;
+		int tvals[3] = {
+			IPS_PROTO_ERRCHK_MS_MIN_DEFAULT,
+			IPS_PROTO_ERRCHK_MS_MAX_DEFAULT,
+			IPS_PROTO_ERRCHK_FACTOR_DEFAULT
+		};
+
+		if (!psmi_getenv("PSM2_ERRCHK_TIMEOUT",
+				 "Errchk timeouts in mS <min:max:factor>",
+				 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				 (union psmi_envvar_val)errchk_to, &env_to)) {
+			/* Not using default values, parse what we can */
+			errchk_to = env_to.e_str;
+			psmi_parse_str_tuples(errchk_to, 3, tvals);
+			/* Adjust for max smaller than min, things would break */
+			if (tvals[1] < tvals[0])
+				tvals[1] = tvals[0];
+		}
+
+		/* Store the min/max timeouts as cycle counts; the factor is
+		 * a plain multiplier. */
+		proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]);
+		proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]);
+		proto->epinfo.ep_timeout_ack_factor = tvals[2];
+	}
+
+	/* Hook up the "none" resolver; it needs no teardown. */
+	proto->ibta.get_path_rec = ips_none_path_rec;
+	proto->ibta.fini = NULL;
+
+	/* With no path records queries set pkey manually */
+	if (hfi_set_pkey(proto->ep->context.ctrl,
+			 (uint16_t) proto->ep->network_pkey) != 0) {
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Couldn't set device pkey 0x%x: %s",
+					(int)proto->ep->network_pkey,
+					strerror(errno));
+	}
+
+	return err;
+}
+
+/* (Re)load the SL2VL table */
+psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
+{
+	int idx;
+
+	/* Refresh the SL -> SC mapping for this unit/port; fall back to the
+	 * default SC for any entry the driver cannot report. */
+	for (idx = 0; idx < 32; idx++) {
+		int val = hfi_get_port_sl2sc(proto->ep->context.ctrl->__hfi_unit,
+					     proto->ep->context.ctrl->__hfi_port,
+					     (uint8_t) idx);
+
+		if (val < 0)
+			val = PSMI_SC_DEFAULT;
+		proto->sl2sc[idx] = (uint16_t) val;
+	}
+
+	/* Likewise refresh the SC -> VL mapping. */
+	for (idx = 0; idx < 32; idx++) {
+		int val = hfi_get_port_sc2vl(proto->ep->context.ctrl->__hfi_unit,
+					     proto->ep->context.ctrl->__hfi_port,
+					     (uint8_t) idx);
+
+		if (val < 0)
+			val = PSMI_VL_DEFAULT;
+		proto->sc2vl[idx] = (uint16_t) val;
+	}
+
+	return PSM2_OK;
+}
+
+/*
+ * Refresh link-dependent protocol state.  Invoked at startup and on link
+ * up/down events: re-reads the base LID, LMC and link rate (all of which
+ * may change across a link bounce) and regenerates the SL2SC/SC2VL, IPD
+ * and CCT tables that depend on them.
+ */
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	int ret;
+
+	/* Get base lid, lmc and rate as these may have changed if the link bounced */
+	proto->epinfo.ep_base_lid =
+	    __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid));
+
+	if ((ret = hfi_get_port_lmc(proto->ep->context.ctrl->__hfi_unit,
+				    proto->ep->context.ctrl->__hfi_port)) < 0) {
+		/* Fixed message: was "Could obtain LMC..." */
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Could not obtain LMC for unit %u:%u. Error: %s",
+					proto->ep->context.ctrl->__hfi_unit,
+					proto->ep->context.ctrl->__hfi_port,
+					strerror(errno));
+		goto fail;
+	}
+	/* Cap the LMC to the number of path bits PSM supports. */
+	proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC);
+
+	if ((ret = hfi_get_port_rate(proto->ep->context.ctrl->__hfi_unit,
+				     proto->ep->context.ctrl->__hfi_port)) <
+	    0) {
+		/* Fixed message: was "Could obtain link rate..." */
+		err =
+		    psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+				      "Could not obtain link rate for unit %u:%u. Error: %s",
+				      proto->ep->context.ctrl->__hfi_unit,
+				      proto->ep->context.ctrl->__hfi_port,
+				      strerror(errno));
+		goto fail;
+	}
+	proto->epinfo.ep_link_rate = ips_rate_to_enum(ret);
+
+	/* Load the SL2SC2VL table (always returns PSM2_OK). */
+	ips_ibta_init_sl2sc2vl_table(proto);
+
+	/* Regenerate new IPD table for the updated link rate. */
+	ips_gen_ipd_table(proto);
+
+	/* Generate the CCT table. */
+	err = ips_gen_cct_table(proto);
+
+fail:
+	return err;
+}
+
+/*
+ * One-time IBTA-related setup for a proto instance: select the path
+ * selection policy, configure CCA (Congestion Control Architecture) from
+ * driver-provided settings when available, create the path record/group
+ * hash tables, seed link state, and install the path record query backend.
+ */
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	union psmi_envvar_val psm_path_policy;
+	union psmi_envvar_val disable_cca;
+	union psmi_envvar_val cca_prescan;
+
+	/* Get the path selection policy */
+	psmi_getenv("PSM2_PATH_SELECTION",
+		    "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base. Default is adaptive.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)"adaptive", &psm_path_policy);
+
+	if (!strcasecmp((const char *)psm_path_policy.e_str, "adaptive"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE;
+	else if (!strcasecmp((const char *)psm_path_policy.e_str, "static_src"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC;
+	else if (!strcasecmp
+		 ((const char *)psm_path_policy.e_str, "static_dest"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST;
+	else if (!strcasecmp
+		 ((const char *)psm_path_policy.e_str, "static_base"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE)
+		_HFI_PRDBG("Using adaptive path selection.\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		_HFI_PRDBG("Static path selection: Src Context\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		_HFI_PRDBG("Static path selection: Dest Context\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		_HFI_PRDBG("Static path selection: Base LID\n");
+
+	/* Fixed typo in help text: "Architecure" -> "Architecture". */
+	psmi_getenv("PSM2_DISABLE_CCA",
+		    "Disable use of Congestion Control Architecture (CCA) [enabled] ",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &disable_cca);
+	if (disable_cca.e_uint)
+		_HFI_CCADBG("CCA is disabled for congestion control.\n");
+	else {
+		int i;
+		char ccabuf[256];
+		uint8_t *p;
+
+		proto->flags |= IPS_PROTO_FLAG_CCA;
+/*
+ * If user set any environment variable, use self CCA.
+ */
+		if (getenv("PSM2_CCTI_INCREMENT") || getenv("PSM2_CCTI_TIMER")
+		    || getenv("PSM2_CCTI_TABLE_SIZE")) {
+			goto disablecca;
+		}
+
+		psmi_getenv("PSM2_CCA_PRESCAN",
+			    "Enable Congestion Control Prescanning (disabled by default) ",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			    (union psmi_envvar_val)0, &cca_prescan);
+
+		if (cca_prescan.e_uint)
+			proto->flags |= IPS_PROTO_FLAG_CCA_PRESCAN;
+
+/*
+ * Check qib driver CCA setting, and try to use it if available.
+ * Fall to self CCA setting if errors.
+ */
+		i = hfi_get_cc_settings_bin(proto->ep->context.ctrl->__hfi_unit,
+					    proto->ep->context.ctrl->__hfi_port,
+					    ccabuf);
+		if (i <= 0) {
+			goto disablecca;
+		}
+		/* Binary layout: 4-byte ctrlmap, 2-byte portctrl, then 32
+		 * congestion entries of 6 bytes each
+		 * (increase, reserved, timer[2], threshold, min). */
+		p = (uint8_t *) ccabuf;
+		memcpy(&proto->ccti_ctrlmap, p, 4);
+		p += 4;
+		memcpy(&proto->ccti_portctrl, p, 2);
+		p += 2;
+		for (i = 0; i < 32; i++) {
+			proto->cace[i].ccti_increase = *p;
+			p++;
+			/* skip reserved u8 */
+			p++;
+			memcpy(&proto->cace[i].ccti_timer_cycles, p, 2);
+			p += 2;
+			proto->cace[i].ccti_timer_cycles =
+			    us_2_cycles(proto->cace[i].ccti_timer_cycles);
+			proto->cace[i].ccti_threshold = *p;
+			p++;
+			proto->cace[i].ccti_min = *p;
+			p++;
+		}
+
+		/* Returns the CCT limit (>0), 0 when no table, <0 on error. */
+		i = hfi_get_cc_table_bin(proto->ep->context.ctrl->__hfi_unit,
+					 proto->ep->context.ctrl->__hfi_port,
+					 &proto->cct);
+		if (i < 0) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		} else if (i == 0) {
+			goto disablecca;
+		}
+		proto->ccti_limit = i;
+		proto->ccti_size = proto->ccti_limit + 1;
+
+		_HFI_CCADBG("ccti_limit = %d\n", (int) proto->ccti_limit);
+		for (i = 0; i < proto->ccti_limit; i++)
+			_HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]);
+
+
+		goto finishcca;
+
+/*
+ * Disable CCA.
+ */
+disablecca:
+		proto->flags &= ~IPS_PROTO_FLAG_CCA;
+		proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+	}
+
+finishcca:
+	/* Initialize path record/group hash table.
+	 * NOTE(review): hcreate_r returns 0 on failure; results are not
+	 * checked here — confirm an allocation failure is acceptable to
+	 * surface later via hsearch_r misses. */
+	hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash);
+	hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash);
+
+	/* On startup treat it as a link up/down event to setup state . */
+	if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK)
+		goto fail;
+
+	/* Setup the appropriate query interface for the endpoint */
+	switch (proto->ep->path_res_type) {
+	case PSM2_PATH_RES_OPP:
+		err = ips_opp_init(proto);
+		if (err != PSM2_OK)
+			_HFI_ERROR
+			    ("Unable to use OFED Plus Plus for path record queries.\n");
+		break;
+	case PSM2_PATH_RES_UMAD:
+		_HFI_ERROR
+		    ("Path record queries using UMAD is not supported in PSM version %d.%dx\n",
+		     PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+		err = PSM2_EPID_PATH_RESOLUTION;
+		break;
+	case PSM2_PATH_RES_NONE:
+	default:
+		err = ips_none_path_rec_init(proto);
+	}
+
+fail:
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_ibta_init);
+
+/* Tear down IBTA state: run the backend's fini hook (if any), then destroy
+ * the path record/group hash tables created in ips_ibta_init(). */
+psm2_error_t ips_ibta_fini(struct ips_proto *proto)
+{
+	psm2_error_t rv = PSM2_OK;
+
+	/* Give the active path record backend a chance to clean up first. */
+	if (proto->ibta.fini != NULL)
+		rv = proto->ibta.fini(proto);
+
+	/* Destroy the path record/group hash */
+	hdestroy_r(&proto->ips_path_rec_hash);
+	hdestroy_r(&proto->ips_path_grp_hash);
+
+	return rv;
+}
diff --git a/ptl_ips/ips_path_rec.h b/ptl_ips/ips_path_rec.h
new file mode 100644
index 0000000..21cbef5
--- /dev/null
+++ b/ptl_ips/ips_path_rec.h
@@ -0,0 +1,185 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. */
+
+
+#ifndef _IPS_PATH_REC_H_
+#define _IPS_PATH_REC_H_
+
+#include <search.h>
+
+/* Default size of path record hash table */
+#define DF_PATH_REC_HASH_SIZE 2047
+
+/* Default size of path group hash table */
+#define DF_PATH_GRP_HASH_SIZE 255
+
+/* Default size of CCT table. Must be multiple of 64 */
+#define DF_CCT_TABLE_SIZE 128
+
+/* CCT max IPD delay. */
+#define DF_CCT_MAX_IPD_DELAY_US 21
+
+/* CCA divisor shift */
+#define CCA_DIVISOR_SHIFT 14
+
+/* CCA ipd mask */
+#define CCA_IPD_MASK 0x3FFF
+
+/* A lot of these are IBTA specific defines that are available in other header
+ * files. To minimize dependencies with PSM build process they are listed
+ * here. Most of this is used to implement IBTA compliance features with PSM
+ * like path record query etc.
+ */
+
+/* Wire encodings for MTU sizes: values 1-5 are the IBTA encodings
+ * (256..4096 bytes); 6 and 7 are OPA extensions (8192 and 10240). */
+enum opa_mtu {
+	IBTA_MTU_256 = 1,
+	IBTA_MTU_512 = 2,
+	IBTA_MTU_1024 = 3,
+	IBTA_MTU_2048 = 4,
+	IBTA_MTU_4096 = 5,
+	OPA_MTU_8192 = 6,	/* OPA-only */
+	OPA_MTU_10240 = 7,	/* OPA-only */
+	IBTA_MTU_MIN = IBTA_MTU_256,
+	OPA_MTU_MIN = IBTA_MTU_256,
+	OPA_MTU_MAX = OPA_MTU_10240,
+};
+
+/* Static link rate encodings, mirroring the verbs enum ibv_rate values
+ * (names are the nominal link speeds in Gbps). */
+typedef enum {
+	IBV_RATE_MAX = 0,
+	IBV_RATE_2_5_GBPS = 2,
+	IBV_RATE_5_GBPS = 5,
+	IBV_RATE_10_GBPS = 3,
+	IBV_RATE_20_GBPS = 6,
+	IBV_RATE_30_GBPS = 4,
+	IBV_RATE_40_GBPS = 7,
+	IBV_RATE_60_GBPS = 8,
+	IBV_RATE_80_GBPS = 9,
+	IBV_RATE_120_GBPS = 10,
+	IBV_RATE_14_GBPS = 11,
+	IBV_RATE_56_GBPS = 12,
+	IBV_RATE_112_GBPS = 13,
+	IBV_RATE_168_GBPS = 14,
+	IBV_RATE_25_GBPS = 15,
+	IBV_RATE_100_GBPS = 16,
+	IBV_RATE_200_GBPS = 17,
+	IBV_RATE_300_GBPS = 18
+} opa_rate;
+
+/* Convert an IBTA/OPA MTU encoding to its size in bytes; -1 if the
+ * encoding is not a recognized value. */
+static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
+{
+	/* Byte sizes indexed by the wire encoding (1..7); slot 0 unused. */
+	static const int mtu_bytes[] = {
+		0, 256, 512, 1024, 2048, 4096, 8192, 10240
+	};
+
+	if (mtu < IBTA_MTU_MIN || mtu > OPA_MTU_MAX)
+		return -1;
+	return mtu_bytes[mtu];
+}
+
+/* This is same as ib_path_rec from ib_types.h. Listed here to be self
+ * contained to minimize dependencies during build etc.
+ * All multi-byte fields marked "net order" are big-endian on the wire.
+ */
+typedef struct _ibta_path_rec {
+	uint64_t service_id;	/* net order */
+	uint8_t dgid[16];	/* destination GID (IBTA layout) */
+	uint8_t sgid[16];	/* source GID (IBTA layout) */
+	uint16_t dlid;		/* net order */
+	uint16_t slid;		/* net order */
+	uint32_t hop_flow_raw;	/* net order */
+	uint8_t tclass;
+	uint8_t num_path;
+	uint16_t pkey;		/* net order */
+	uint16_t qos_class_sl;	/* net order */
+	uint8_t mtu;		/* IBTA encoded */
+	uint8_t rate;		/* IBTA encoded */
+	uint8_t pkt_life;	/* IBTA encoded */
+	uint8_t preference;
+	uint8_t resv2[6];	/* reserved/padding */
+} ibta_path_rec_t;
+
+/*
+ * PSM IPS path record components for endpoint.
+ * One instance per (slid, dlid) pair, cached in proto->ips_path_rec_hash.
+ */
+struct ips_proto;
+typedef struct ips_path_rec {
+	uint16_t pr_slid;	/* For Torus/non zero LMC fabrics this can be diff */
+	uint16_t pr_dlid;
+	uint16_t pr_mtu;	/* < Path's MTU */
+	uint16_t pr_pkey;
+	uint16_t pr_static_ipd;	/* Static rate IPD from path record */
+	uint8_t pr_sl;		/* service level for this path */
+
+	/* IBTA CCA parameters per path */
+	uint8_t pr_cca_divisor;	/* CCA divisor [14:15] in CCT entry */
+	uint16_t pr_active_ipd;	/* The current active IPD. max(static,cct) */
+	uint16_t pr_ccti;	/* CCA table index */
+	psmi_timer *pr_timer_cca;	/* Congestion timer for epr_ccti increment. */
+	struct ips_proto *proto;	/* for global info */
+} ips_path_rec_t;
+
+psm2_error_t ips_opp_init(struct ips_proto *proto);
+
+#endif
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
new file mode 100644
index 0000000..150bda1
--- /dev/null
+++ b/ptl_ips/ips_proto.c
@@ -0,0 +1,2348 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * IPS - Interconnect Protocol Stack.
+ */
+
+#include <assert.h>
+#include <sys/uio.h> /* writev */
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_proto_help.h"
+#include "psmi_wrappers.h"
+
+/*
+ * Control message types have their own flag to determine whether a message of
+ * that type is queued or not. These flags are kept in a state bitfield.
+ */
+#define CTRL_MSG_ACK_QUEUED 0x0001
+#define CTRL_MSG_NAK_QUEUED 0x0002
+#define CTRL_MSG_BECN_QUEUED 0x0004
+#define CTRL_MSG_ERR_CHK_QUEUED 0x0008
+#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x0010
+#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020
+#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040
+#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080
+#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100
+
+#ifdef PSM_CUDA
+uint32_t gpudirect_send_threshold;
+uint32_t gpudirect_recv_threshold;
+#endif
+
+static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto);
+static psm2_error_t proto_sdma_init(struct ips_proto *proto,
+ const psmi_context_t *context);
+
+#ifdef PSM_CUDA
+/*
+ * mpool constructor/destructor callback for CUDA host bounce buffers.
+ * is_alloc != 0: allocate a pinned (portable) host buffer of ctxt->bufsz
+ * bytes and a CUDA event for tracking async copies into it.
+ * is_alloc == 0: release both, if the buffer was ever allocated.
+ */
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj)
+{
+	struct ips_cuda_hostbuf *icb;
+	struct ips_cuda_hostbuf_mpool_cb_context *ctxt =
+		(struct ips_cuda_hostbuf_mpool_cb_context *) context;
+
+	icb = (struct ips_cuda_hostbuf *)obj;
+	if (is_alloc) {
+		PSMI_CUDA_CALL(cudaHostAlloc,
+			       (void **) &icb->host_buf,
+			       ctxt->bufsz,
+			       cudaHostAllocPortable);
+		PSMI_CUDA_CALL(cudaEventCreate, &icb->copy_status);
+	} else {
+		/* host_buf may be NULL if allocation never ran/failed. */
+		if (icb->host_buf) {
+			PSMI_CUDA_CALL(cudaFreeHost, icb->host_buf);
+			PSMI_CUDA_CALL(cudaEventDestroy, icb->copy_status);
+		}
+	}
+	return;
+}
+#endif
+
+psm2_error_t
+ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
+ int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size,
+ const struct psmi_timer_ctrl *timerq,
+ const struct ips_epstate *epstate,
+ const struct ips_spio *spioc, struct ips_proto *proto)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+ uint32_t protoexp_flags, cksum_sz;
+ union psmi_envvar_val env_tid, env_cksum, env_mtu;
+ psm2_error_t err = PSM2_OK;
+
+ /*
+ * Checksum packets within PSM. Default is off.
+ * This is heavy weight and done in software so not recommended for
+ * production runs.
+ */
+
+ psmi_getenv("PSM2_CHECKSUM",
+ "Enable checksum of messages (0 disables checksum)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, &env_cksum);
+
+ memset(proto, 0, sizeof(struct ips_proto));
+ proto->ptl = (ptl_t *) ptl;
+ proto->ep = context->ep; /* cached */
+ proto->mq = context->ep->mq; /* cached */
+ proto->fd = context->fd; /* cached */
+ proto->pend_sends.proto = proto;
+ psmi_timer_entry_init(&proto->pend_sends.timer,
+ ips_proto_timer_pendq_callback,
+ &proto->pend_sends);
+ STAILQ_INIT(&proto->pend_sends.pendq);
+ proto->epstate = (struct ips_epstate *)epstate;
+ proto->timerq = (struct psmi_timer_ctrl *)timerq;
+ proto->spioc = (struct ips_spio *)spioc;
+
+ proto->epinfo.ep_baseqp = base_info->bthqp;
+ proto->epinfo.ep_context = ctxt_info->ctxt; /* "real" context */
+ proto->epinfo.ep_subcontext = ctxt_info->subctxt;
+ proto->epinfo.ep_hfi_type = psmi_get_hfi_type(context);
+ proto->epinfo.ep_jkey = base_info->jkey;
+
+ /* If checksums enabled we insert checksum at end of packet */
+ cksum_sz = env_cksum.e_uint ? PSM_CRC_SIZE_IN_BYTES : 0;
+ proto->epinfo.ep_mtu = context->ep->mtu;
+ /* Decrement checksum */
+ proto->epinfo.ep_mtu -= cksum_sz;
+
+ /* See if user specifies a lower MTU to use */
+ if (!psmi_getenv
+ ("PSM2_MTU", "MTU specified by user: 1-7,256-8192,10240]",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_mtu)) {
+ if (env_mtu.e_int != 256 && env_mtu.e_int != 512
+ && env_mtu.e_int != 1024 && env_mtu.e_int != 2048
+ && env_mtu.e_int != 4096 && env_mtu.e_int != 8192
+ && env_mtu.e_int != 10240) {
+ if (env_mtu.e_int < OPA_MTU_MIN ||
+ env_mtu.e_int > OPA_MTU_MAX)
+ env_mtu.e_int = OPA_MTU_8192;
+ env_mtu.e_int =
+ opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int);
+ }
+ if (proto->epinfo.ep_mtu > env_mtu.e_int)
+ proto->epinfo.ep_mtu = env_mtu.e_int;
+ }
+
+ /*
+ * The PIO size should not include the ICRC because it is
+ * stripped by HW before delivering to receiving buffer.
+ * We decide to use minimum 2 PIO buffers so that PSM has
+ * turn-around time to do PIO transfer. Each credit is a
+ * block of 64 bytes. Also PIO buffer size must not be
+ * bigger than MTU.
+ */
+ proto->epinfo.ep_piosize = (ctxt_info->credits / 2) * 64 -
+ (sizeof(struct ips_message_header) + HFI_PCB_SIZE_IN_BYTES +
+ cksum_sz);
+ proto->epinfo.ep_piosize =
+ min(proto->epinfo.ep_piosize, proto->epinfo.ep_mtu);
+
+ /* Keep PIO as multiple of cache line size */
+ if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES)
+ proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1);
+
+ /* Save back to hfi level. */
+ context->ctrl->__hfi_mtusize = proto->epinfo.ep_mtu;
+ context->ctrl->__hfi_piosize = proto->epinfo.ep_piosize;
+
+ /* sdma completion queue */
+ proto->sdma_comp_queue =
+ (struct hfi1_sdma_comp_entry *) base_info->sdma_comp_bufbase;
+ proto->sdma_queue_size = ctxt_info->sdma_ring_size;
+ /* don't use the last slot */
+
+ {
+ /* configure sdma_avail_counter */
+ union psmi_envvar_val env_sdma_avail;
+ int tmp_queue_size = proto->sdma_queue_size - 1;
+
+ psmi_getenv("PSM2_MAX_PENDING_SDMA_REQS",
+ "PSM maximum pending SDMA requests",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val) tmp_queue_size,
+ &env_sdma_avail);
+
+ if ((env_sdma_avail.e_int < 8) || (env_sdma_avail.e_int > proto->sdma_queue_size - 1))
+ proto->sdma_avail_counter = proto->sdma_queue_size - 1;
+ else
+ proto->sdma_avail_counter = env_sdma_avail.e_int;
+ }
+
+
+ proto->sdma_fill_index = 0;
+ proto->sdma_done_index = 0;
+ proto->sdma_scb_queue = (struct ips_scb **)
+ psmi_calloc(proto->ep, UNDEFINED,
+ proto->sdma_queue_size, sizeof(struct ips_scb *));
+ if (proto->sdma_scb_queue == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT);
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U;
+ proto->t_init = get_cycles();
+ proto->t_fini = 0;
+ proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0;
+ proto->runid_key = getpid();
+
+ proto->num_connected_outgoing = 0;
+ proto->num_connected_incoming = 0;
+ proto->num_disconnect_requests = 0;
+ proto->stray_warn_interval = (uint64_t) -1;
+ proto->done_warning = 0;
+ proto->done_once = 0;
+ proto->num_bogus_warnings = 0;
+ proto->psmi_logevent_tid_send_reqs.interval_secs = 15;
+ proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+ proto->psmi_logevent_tid_send_reqs.count = 0;
+#ifdef PSM_CUDA
+ /*
+ * We will need to add two extra bytes to iov_len
+ * when passing sdma hdr info to driver due to
+ * the new flags member in the struct.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ proto->ips_extra_sdmahdr_size = sizeof(struct sdma_req_info) -
+ sizeof(struct sdma_req_info_v6_3);
+ else
+#endif
+ if (sizeof(struct sdma_req_info) != sizeof(struct sdma_req_info_v6_3))
+ proto->ips_extra_sdmahdr_size = sizeof(struct sdma_req_info) -
+ sizeof(struct sdma_req_info_v6_3);
+ else
+ proto->ips_extra_sdmahdr_size = 0;
+
+ /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */
+ if ((err = ips_ibta_init(proto)))
+ goto fail;
+
+ {
+ /* User asks for HFI loopback? */
+ union psmi_envvar_val env_loopback;
+
+ psmi_getenv("PSM2_HFI_LOOPBACK",
+ "PSM uses HFI loopback (default is disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_loopback);
+
+ if (env_loopback.e_uint)
+ proto->flags |= IPS_PROTO_FLAG_LOOPBACK;
+ }
+
+ {
+ /* Disable coalesced ACKs? */
+ union psmi_envvar_val env_coalesce_acks;
+
+ psmi_getenv("PSM2_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, /* Enabled by default */
+ &env_coalesce_acks);
+
+ if (env_coalesce_acks.e_uint)
+ proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS;
+ }
+
+ {
+ /* Number of credits per flow */
+ union psmi_envvar_val env_flow_credits;
+ int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc);
+
+ psmi_getenv("PSM2_FLOW_CREDITS",
+ "Number of unacked packets (credits) per flow (default is 64)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)df_flow_credits,
+ &env_flow_credits);
+ proto->flow_credits = env_flow_credits.e_uint;
+ }
+
+ /*
+ * Pre-calculate the PSN mask to support 24 or 31 bits PSN.
+ */
+ if ((context->runtime_flags & HFI1_CAP_EXTENDED_PSN)) {
+ proto->psn_mask = 0x7FFFFFFF;
+ } else {
+ proto->psn_mask = 0xFFFFFF;
+ }
+
+ /*
+ * Initialize SDMA, otherwise, turn on all PIO.
+ */
+ if ((context->runtime_flags & HFI1_CAP_SDMA)) {
+ if ((err = proto_sdma_init(proto, context)))
+ goto fail;
+ } else {
+ proto->flags |= IPS_PROTO_FLAG_SPIO;
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ ~0U;
+ }
+
+ /*
+ * Setup the protocol wide short message ep flow.
+ */
+ if (proto->flags & IPS_PROTO_FLAG_SDMA) {
+ proto->msgflowid = EP_FLOW_GO_BACK_N_DMA;
+ } else {
+ proto->msgflowid = EP_FLOW_GO_BACK_N_PIO;
+ }
+
+ /*
+ * Clone sendreq mpool configuration for pend sends config
+ */
+ {
+ uint32_t chunks, maxsz;
+
+ psmi_assert_always(proto->ep->mq->sreq_pool != NULL);
+ psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks,
+ &maxsz);
+
+ proto->pend_sends_pool =
+ psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks,
+ maxsz, 0, DESCRIPTORS, NULL, NULL);
+ if (proto->pend_sends_pool == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ }
+
+ /*
+ * Create a pool of CCA timers for path_rec. The timers should not
+ * exceed the scb number num_of_send_desc(default 4K).
+ */
+ {
+ uint32_t chunks, maxsz;
+
+ chunks = 256;
+ maxsz = num_of_send_desc;
+
+ proto->timer_pool =
+ psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz,
+ 0, DESCRIPTORS, NULL, NULL);
+ if (proto->timer_pool == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ }
+
+ /*
+ * Register ips protocol statistics
+ *
+ * We put a (*) in the output to denote stats that may cause a drop in
+ * performance.
+ *
+ * We put a (**) in the output of those stats that "should never happen"
+ */
+ {
+ struct psmi_stats_entry entries[] = {
+ PSMI_STATS_DECLU64("pio busy count",
+ &proto->stats.pio_busy_cnt),
+ /* Throttling by kernel */
+ PSMI_STATS_DECLU64("writev busy cnt",
+ &proto->stats.writev_busy_cnt),
+ /* When local dma completion is in the way... */
+ PSMI_STATS_DECLU64("writev compl. eagain",
+ &proto->stats.writev_compl_eagain),
+ /* When remote completion happens before local completion */
+ PSMI_STATS_DECLU64("writev compl. delay (*)",
+ &proto->stats.writev_compl_delay),
+ PSMI_STATS_DECLU64("scb unavail eager count",
+ &proto->stats.scb_egr_unavail_cnt),
+ PSMI_STATS_DECLU64("scb unavail exp count",
+ &proto->stats.scb_exp_unavail_cnt),
+ PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */
+ &proto->stats.hdr_overflow),
+ PSMI_STATS_DECLU64("rcveager overflows",
+ &proto->stats.egr_overflow),
+ PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */
+ &proto->stats.lid_zero_errs),
+ PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */
+ &proto->stats.unknown_packets),
+ PSMI_STATS_DECLU64("stray packets (*)",
+ &proto->stats.stray_packets),
+ PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */
+ &proto->spioc->spio_num_stall_total),
+ PSMI_STATS_DECLU64("ICRC error (*)",
+ &proto->error_stats.num_icrc_err),
+ PSMI_STATS_DECLU64("ECC error ",
+ &proto->error_stats.num_ecc_err),
+ PSMI_STATS_DECLU64("Len error",
+ &proto->error_stats.num_len_err),
+ PSMI_STATS_DECLU64("TID error ",
+ &proto->error_stats.num_tid_err),
+ PSMI_STATS_DECLU64("DC error ",
+ &proto->error_stats.num_dc_err),
+ PSMI_STATS_DECLU64("DCUNC error ",
+ &proto->error_stats.num_dcunc_err),
+ PSMI_STATS_DECLU64("KHDRLEN error ",
+ &proto->error_stats.num_khdrlen_err),
+
+ };
+
+ err =
+ psmi_stats_register_type
+ ("OPA low-level protocol stats",
+ PSMI_STATSTYPE_IPSPROTO, entries,
+ PSMI_STATS_HOWMANY(entries), NULL);
+ if (err != PSM2_OK)
+ goto fail;
+ }
+
+ /*
+ * Control Queue and messaging
+ */
+ ctrlq_init(&proto->ctrlq, proto);
+
+ /*
+ * Receive-side handling
+ */
+ if ((err = ips_proto_recv_init(proto)))
+ goto fail;
+
+ /*
+ * Eager buffers. We don't care to receive a callback when eager buffers
+ * are newly released since we actively poll for new bufs.
+ */
+ {
+ /* configure PSM bounce buffer size */
+ union psmi_envvar_val env_bbs;
+
+ psmi_getenv("PSM2_BOUNCE_SZ",
+ "PSM bounce buffer size (default is 8192B)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)8192,
+ &env_bbs);
+
+ proto->scb_bufsize = env_bbs.e_uint;
+ }
+
+ if ((err = ips_scbctrl_init(context, num_of_send_desc,
+ num_of_send_bufs, imm_size,
+ proto->scb_bufsize, NULL, NULL,
+ &proto->scbc_egr)))
+ goto fail;
+
+ /*
+ * Expected protocol handling.
+ * If we enable tid-based expected rendezvous, the expected protocol code
+ * handles its own rv scb buffers. If not, we have to enable eager-based
+ * rendezvous and we allocate scb buffers for it.
+ */
+ psmi_getenv("PSM2_TID",
+ "Tid proto flags (0 disables protocol)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)IPS_PROTOEXP_FLAGS_DEFAULT,
+ &env_tid);
+ protoexp_flags = env_tid.e_uint;
+
+ if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED) {
+ if (cuda_runtime_version >= 7000) {
+ PSMI_CUDA_CALL(cudaStreamCreateWithFlags,
+ &proto->cudastream_send, cudaStreamNonBlocking);
+ } else {
+ PSMI_CUDA_CALL(cudaStreamCreate,
+ &proto->cudastream_send);
+ }
+ }
+#endif
+ proto->scbc_rv = NULL;
+ if ((err = ips_protoexp_init(context, proto, protoexp_flags,
+ num_of_send_bufs, num_of_send_desc,
+ &proto->protoexp)))
+ goto fail;
+ } else {
+ proto->protoexp = NULL;
+ proto->scbc_rv = (struct ips_scbctrl *)
+ psmi_calloc(proto->ep, DESCRIPTORS,
+ 1, sizeof(struct ips_scbctrl));
+ if (proto->scbc_rv == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /*
+ * Rendezvous buffers. We want to get a callback for rendezvous bufs
+ * since we asynchronously try to make progress on these sends and only
+ * schedule them on the timerq if there are pending sends and available
+ * bufs.
+ */
+ if ((err =
+ ips_scbctrl_init(context, num_of_send_desc,
+ 0 /* no bufs */ ,
+ 0, 0 /* bufsize==0 */ ,
+ ips_proto_rv_scbavail_callback,
+ proto, proto->scbc_rv)))
+ goto fail;
+ }
+
+ /*
+ * Parse the tid error settings from the environment.
+ * <interval_secs>:<max_count_before_exit>
+ */
+ {
+ int tvals[2];
+ char *tid_err;
+ union psmi_envvar_val env_tiderr;
+
+ tid_err = "-1:0"; /* no tiderr warnings, never exits */
+ tvals[0] = -1;
+ tvals[1] = 0;
+
+ if (!psmi_getenv("PSM2_TID_ERROR",
+ "Tid error control <intval_secs:max_errors>",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)tid_err, &env_tiderr)) {
+ /* not using default values */
+ tid_err = env_tiderr.e_str;
+ psmi_parse_str_tuples(tid_err, 2, tvals);
+ }
+ if (tvals[0] >= 0)
+ proto->tiderr_warn_interval = sec_2_cycles(tvals[0]);
+ else
+ proto->tiderr_warn_interval = UINT64_MAX;
+ proto->tiderr_max = tvals[1];
+ _HFI_PRDBG("Tid error control: warning every %d secs%s, "
+ "fatal error after %d tid errors%s\n",
+ tvals[0], (tvals[0] < 0) ? " (no warnings)" : "",
+ tvals[1], (tvals[1] == 0) ? " (never fatal)" : "");
+ }
+
+ /* Active Message interface. AM requests compete with MQ for eager
+ * buffers, since request establish the amount of buffering in the
+ * network (maximum number of requests in flight). The AM init function
+ * does not allow the number of send buffers to be set separately from
+ * the number of send descriptors, because otherwise it would have to
+ * impose extremely arcane constraints on the relative amounts to avoid
+ * a deadlock scenario. Thus, it handles it internally. The constraint
+ * is: In a node pair, the number of reply send buffers on at least one
+ * of the nodes must be at least double the number (optimal: double + 1)
+ * of send descriptors on the other node. */
+ if ((err = ips_proto_am_init(proto,
+ min(num_of_send_bufs, num_of_send_desc),
+ imm_size,
+ &proto->proto_am)))
+ goto fail;
+
+#if 0
+ if (!host_pid) {
+ char ipbuf[INET_ADDRSTRLEN], *p;
+ host_pid = (uint32_t) getpid();
+ host_ipv4addr = psmi_get_ipv4addr(); /* already be */
+ if (host_ipv4addr == 0) {
+ _HFI_DBG("Unable to obtain local IP address, "
+ "not fatal but some features may be disabled\n");
+ } else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) {
+ _HFI_INFO("Localhost IP address is set to the "
+ "loopback address 127.0.0.1, "
+ "not fatal but some features may be disabled\n");
+ } else {
+ p = (char *)inet_ntop(AF_INET,
+ (const void *)&host_ipv4addr,
+ ipbuf, sizeof(ipbuf));
+ _HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p,
+ host_pid);
+ }
+
+ /* Store in big endian for use in ERR_CHK */
+ host_pid = __cpu_to_be32(host_pid);
+ }
+#endif
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_gpudirect_rdma;
+ psmi_getenv("PSM2_GPUDIRECT",
+ "Use GPUDirect RDMA support to allow the HFI to directly read"
+ " from the GPU for SDMA. Requires driver support.(default is "
+ " disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma);
+
+ /* Default Send threshold for Gpu-direct set to 30000 */
+ union psmi_envvar_val env_gpudirect_send_thresh;
+ psmi_getenv("PSM2_GPUDIRECT_SEND_THRESH",
+ "Threshold to switch off Gpu-Direct feature on send side",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)30000, &env_gpudirect_send_thresh);
+ gpudirect_send_threshold = env_gpudirect_send_thresh.e_uint;
+
+ union psmi_envvar_val env_gpudirect_recv_thresh;
+ psmi_getenv("PSM2_GPUDIRECT_RECV_THRESH",
+ "Threshold to switch off Gpu-Direct feature on receive side",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &env_gpudirect_recv_thresh);
+ gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint;
+
+ if (env_gpudirect_rdma.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ /* All pio, No SDMA*/
+ (proto->flags & IPS_PROTO_FLAG_SPIO) ||
+ !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) ||
+ !PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Requires hfi1 driver with GPU-Direct feature enabled.\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV;
+ } else {
+ /* The following environment variables are here for internal
+ * experimentation and will not be documented for any customers.
+ */
+ /* Use GPUDirect RDMA for SDMA send? */
+ union psmi_envvar_val env_gpudirect_rdma_send;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " read from the GPU for SDMA. Requires driver"
+ " support.(default is disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_send);
+
+ if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ /* All pio, No SDMA*/
+ (proto->flags & IPS_PROTO_FLAG_SPIO))
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unable to start run as PSM would require cuda, sdma"
+ "and TID support\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
+ }
+ /* Use GPUDirect RDMA for recv? */
+ union psmi_envvar_val env_gpudirect_rdma_recv;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " write into GPU. Requires driver support.(default is"
+ " disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_recv);
+
+ if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED))
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unable to start run as PSM would require cuda,"
+ " sdma and TID support\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV;
+ }
+ }
+
+ if (PSMI_IS_CUDA_ENABLED &&
+ (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) {
+ struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+ uint32_t maxsz, chunksz, max_elements;
+
+ if ((err = psmi_parse_mpool_env(proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ /* the maxsz is the amount in MB, not the number of entries,
+ * since the element size depends on the window size */
+ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+ /* mpool requires max_elements to be power of 2. round down. */
+ max_elements = 1 << (31 - __builtin_clz(max_elements));
+ proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv;
+ proto->cuda_hostbuf_pool_send =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &proto->cuda_hostbuf_send_cfg);
+
+ if (proto->cuda_hostbuf_pool_send == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host send buffer pool");
+ goto fail;
+ }
+
+ /* use the same number of elements for the small pool */
+ proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ;
+ proto->cuda_hostbuf_pool_small_send =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &proto->cuda_hostbuf_small_send_cfg);
+
+ if (proto->cuda_hostbuf_pool_small_send == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host small send buffer pool");
+ goto fail;
+ }
+
+ /* Configure the amount of prefetching */
+ union psmi_envvar_val env_prefetch_limit;
+
+ psmi_getenv("PSM2_CUDA_PREFETCH_LIMIT",
+ "How many TID windows to prefetch at RTS time(default is 2)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT,
+ &env_prefetch_limit);
+ proto->cuda_prefetch_limit = env_prefetch_limit.e_uint;
+ }
+#endif
+fail:
+ return err;
+}
+
+/*
+ * Finalize the ips protocol instance.
+ *
+ * Disconnects any peers still connected on this ptl, then busy-waits
+ * through a configurable "grace" period so that late traffic from peers
+ * (incoming disconnects) can still be processed, and finally tears down
+ * the protocol sub-components (ibta, active messages, scb controllers,
+ * receive side, mpools).
+ *
+ * proto      - protocol instance being finalized
+ * force      - passed through to ips_proto_disconnect()
+ * timeout_in - overall close time-out in cycles; 0 propagates the
+ *              infinite time-out case
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY if the temporary
+ * disconnect bookkeeping arrays cannot be allocated, or the first
+ * error returned by a fini helper.
+ */
+psm2_error_t
+ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in)
+{
+ struct psmi_eptab_iterator itor;
+ uint64_t t_start;
+ uint64_t t_grace_start, t_grace_time, t_grace_interval;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ int i;
+ union psmi_envvar_val grace_intval;
+
+ psmi_getenv("PSM2_CLOSE_GRACE_PERIOD",
+ "Additional grace period in seconds for closing end-point.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &grace_intval);
+
+ /* The env var only takes effect when explicitly set; otherwise the
+ * grace period defaults to half the close time-out (or infinite). */
+ if (getenv("PSM2_CLOSE_GRACE_PERIOD")) {
+ t_grace_time = grace_intval.e_uint * SEC_ULL;
+ } else if (timeout_in > 0) {
+ /* default to half of the close time-out */
+ t_grace_time = timeout_in / 2;
+ } else {
+ /* propagate the infinite time-out case */
+ t_grace_time = 0;
+ }
+
+ if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT)
+ t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+ /* At close we will busy wait for the grace interval to see if any
+ * receive progress is made. If progress is made we will wait for
+ * another grace interval, until either no progress is made or the
+ * entire grace period has passed. If the grace interval is too low
+ * we may miss traffic and exit too early. If the grace interval is
+ * too large the additional time spent while closing the program
+ * will become visible to the user. */
+ psmi_getenv("PSM2_CLOSE_GRACE_INTERVAL",
+ "Grace interval in seconds for closing end-point.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &grace_intval);
+
+ if (getenv("PSM2_CLOSE_GRACE_INTERVAL")) {
+ t_grace_interval = grace_intval.e_uint * SEC_ULL;
+ } else {
+ /* A heuristic is used to scale up the timeout linearly with
+ * the number of endpoints, and we allow one second per 1000
+ * endpoints. */
+ t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000;
+ }
+
+ /* Clamp the interval into its sane operating range */
+ if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL)
+ t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL;
+ if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL)
+ t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ t_start = proto->t_fini = get_cycles();
+
+ /* Close whatever has been left open */
+ if (proto->num_connected_outgoing > 0) {
+ int num_disc = 0;
+ int *mask;
+ psm2_error_t *errs;
+ psm2_epaddr_t *epaddr_array;
+
+ /* First pass: count endpoints on this ptl to size the arrays */
+ psmi_epid_itor_init(&itor, proto->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl == proto->ptl)
+ num_disc++;
+ }
+ psmi_epid_itor_fini(&itor);
+ mask =
+ (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(int));
+ errs = (psm2_error_t *)
+ psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(psm2_error_t));
+ epaddr_array = (psm2_epaddr_t *)
+ psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(psm2_epaddr_t));
+
+ /* On any allocation failure, free whatever did allocate */
+ if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+ if (epaddr_array)
+ psmi_free(epaddr_array);
+ if (errs)
+ psmi_free(errs);
+ if (mask)
+ psmi_free(mask);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /* Second pass: collect the epaddrs to disconnect */
+ psmi_epid_itor_init(&itor, proto->ep);
+ i = 0;
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ /*
+ * if cstate_outgoing is CSTATE_NONE, then we know it
+ * is an uni-directional connect, in that the peer
+ * sent a connect request to us, but we never sent one
+ * out to the peer epid. Ignore handling those in
+ * ips_proto_disconnect() as we will do the right thing
+ * when a disconnect request for the epaddr comes in from the peer.
+ */
+ if (epaddr->ptlctl->ptl == proto->ptl &&
+ ((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) {
+ mask[i] = 1;
+ epaddr_array[i] = epaddr;
+ i++;
+ IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr);
+ }
+ }
+ psmi_epid_itor_fini(&itor);
+ err = ips_proto_disconnect(proto, force, num_disc, epaddr_array,
+ mask, errs, timeout_in);
+ psmi_free(mask);
+ psmi_free(errs);
+ psmi_free(epaddr_array);
+ }
+
+ t_grace_start = get_cycles();
+
+ /* Wait out the grace period, one interval at a time, as long as
+ * disconnect requests keep arriving from peers. */
+ while (psmi_cycles_left(t_grace_start, t_grace_time)) {
+ uint64_t t_grace_interval_start = get_cycles();
+ int num_disconnect_requests = proto->num_disconnect_requests;
+ PSMI_BLOCKUNTIL(
+ proto->ep, err,
+ proto->num_connected_incoming == 0 ||
+ (!psmi_cycles_left(t_start, timeout_in) &&
+ (!psmi_cycles_left(t_grace_interval_start,
+ t_grace_interval) ||
+ !psmi_cycles_left(t_grace_start, t_grace_time))));
+ if (num_disconnect_requests == proto->num_disconnect_requests) {
+ /* nothing happened in this grace interval so break out early */
+ break;
+ }
+ }
+
+#if _HFI_DEBUGGING
+ if (_HFI_PRDBG_ON) {
+ uint64_t t_grace_finish = get_cycles();
+
+ _HFI_PRDBG_ALWAYS(
+ "Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n",
+ proto->num_connected_outgoing, proto->num_connected_incoming,
+ (int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) /
+ MSEC_ULL), (int)(t_grace_time / MSEC_ULL));
+ }
+#endif
+
+ /* Tear down sub-components in reverse order of initialization */
+ if ((err = ips_ibta_fini(proto)))
+ goto fail;
+
+ if ((err = ips_proto_am_fini(&proto->proto_am)))
+ goto fail;
+
+ if ((err = ips_scbctrl_fini(&proto->scbc_egr)))
+ goto fail;
+
+ ips_proto_recv_fini(proto);
+
+ /* scbc_rv is only allocated when the expected protocol is disabled
+ * (see init); free whichever of the two paths was taken. */
+ if (proto->protoexp) {
+ if ((err = ips_protoexp_fini(proto->protoexp)))
+ goto fail;
+ } else {
+ ips_scbctrl_fini(proto->scbc_rv);
+ psmi_free(proto->scbc_rv);
+ }
+
+ psmi_mpool_destroy(proto->pend_sends_pool);
+ psmi_mpool_destroy(proto->timer_pool);
+
+ psmi_free(proto->sdma_scb_queue);
+
+fail:
+ proto->t_fini = proto->t_init = 0;
+ return err;
+}
+
+/*
+ * Configure the PIO/SDMA send-path selection for this protocol instance.
+ *
+ * Parses PSM2_SDMA (0 = pio only, 2 = sdma only, 1/default = both) into
+ * proto->flags, then sets the iovec thresholds that decide at which
+ * message size sends switch from PIO to SDMA:
+ *   - both paths enabled: CPU-model-specific defaults, overridable via
+ *     PSM2_MQ_EAGER_SDMA_SZ;
+ *   - all-sdma: thresholds 0 (every size goes through SDMA);
+ *   - all-pio: thresholds ~0U (SDMA is never selected).
+ *
+ * Caller must only invoke this when the context advertises SDMA
+ * capability (asserted below). Always returns PSM2_OK.
+ */
+static
+psm2_error_t
+proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context)
+{
+ union psmi_envvar_val env_sdma, env_hfiegr;
+ psm2_error_t err = PSM2_OK;
+
+ /*
+ * Only initialize if RUNTIME_SDMA is enabled.
+ */
+ psmi_assert_always(context->runtime_flags & HFI1_CAP_SDMA);
+
+ psmi_getenv("PSM2_SDMA",
+ "hfi send dma flags (0 disables send dma, 2 disables send pio, "
+ "1 for both sdma/spio, default 1)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)1, &env_sdma);
+ if (env_sdma.e_uint == 0)
+ proto->flags |= IPS_PROTO_FLAG_SPIO;
+ else if (env_sdma.e_uint == 2)
+ proto->flags |= IPS_PROTO_FLAG_SDMA;
+
+ if (!(proto->flags & (IPS_PROTO_FLAG_SDMA | IPS_PROTO_FLAG_SPIO))) {
+ /* use both spio and sdma */
+ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M)
+ {
+ proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2;
+ proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_PHI2;
+ } else {
+ proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_XEON;
+ proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_XEON;
+ }
+
+ /* Explicit env override applies to both thresholds at once */
+ if (!psmi_getenv("PSM2_MQ_EAGER_SDMA_SZ",
+ "hfi pio-to-sdma eager switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val) proto->iovec_thresh_eager,
+ &env_hfiegr)) {
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ env_hfiegr.e_uint;
+ }
+ } else if (proto->flags & IPS_PROTO_FLAG_SDMA) { /* all sdma */
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ 0;
+ } else if (proto->flags & IPS_PROTO_FLAG_SPIO) { /* all spio */
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ ~0U;
+ }
+
+ return err;
+}
+
+/*
+ * Initialize the pending control-message queue for this protocol.
+ *
+ * Zeroes the queue, builds the opcode -> queue-mask-bit lookup table
+ * used by message_type2index(), selects which message types may be
+ * queued when send resources are exhausted (ACK/NAK/BECN only — all
+ * payload-free), and arms the timer entry that later drains the queue
+ * via ips_proto_timer_ctrlq_callback().
+ */
+static
+void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto)
+{
+ /* clear the ctrl send queue */
+ memset(ctrlq, 0, sizeof(*ctrlq));
+
+ /* Map each control opcode to its "queued" mask bit */
+ proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED;
+ proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED;
+ proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED;
+ proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED;
+ proto->message_type_to_index[OPCODE_ERR_CHK_GEN] =
+ CTRL_MSG_ERR_CHK_GEN_QUEUED;
+ proto->message_type_to_index[OPCODE_CONNECT_REQUEST] =
+ CTRL_MSG_CONNECT_REQUEST_QUEUED;
+ proto->message_type_to_index[OPCODE_CONNECT_REPLY] =
+ CTRL_MSG_CONNECT_REPLY_QUEUED;
+ proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] =
+ CTRL_MSG_DISCONNECT_REQUEST_QUEUED;
+ proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] =
+ CTRL_MSG_DISCONNECT_REPLY_QUEUED;
+
+ ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0;
+ ctrlq->ctrlq_overflow = 0;
+ ctrlq->ctrlq_proto = proto;
+
+ /*
+ * We never enqueue ctrl messages with real payload. If we do,
+ * the queue 'elem_payload' size needs to be big enough.
+ * Note: enqueue nak/ack is very important for performance.
+ */
+ proto->ctrl_msg_queue_enqueue =
+ CTRL_MSG_ACK_QUEUED |
+ CTRL_MSG_NAK_QUEUED |
+ CTRL_MSG_BECN_QUEUED;
+
+ psmi_timer_entry_init(&ctrlq->ctrlq_timer,
+ ips_proto_timer_ctrlq_callback, ctrlq);
+
+ return;
+}
+
+/*
+ * Fill in the packet headers (LRH, BTH, KDETH) of a control-message scb.
+ *
+ * Control messages always travel over the high-priority control path of
+ * the destination's path group. Under the adaptive path policy the
+ * high-priority path index is rotated round-robin on every call.
+ * If the flow is flagged for congestion notification, a BECN bit is set
+ * in BTH[1] and the flag is cleared.
+ *
+ * paylen is the control payload size in bytes; the LRH packet length is
+ * expressed in dwords including header and CRC.
+ */
+static __inline__ void _build_ctrl_message(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ ips_scb_t *ctrlscb, uint32_t paylen)
+{
+ /* total packet length in dwords: header + CRC + payload */
+ uint32_t tot_paywords = (sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT;
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_message_header *p_hdr = &ctrlscb->ips_lrh;
+ ips_path_rec_t *ctrl_path =
+ ipsaddr->pathgrp->pg_path[ipsaddr->
+ hpp_index][IPS_PATH_HIGH_PRIORITY];
+
+ /* Adaptive policy: advance (and wrap) the high-priority path index */
+ if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) &&
+ (++ipsaddr->hpp_index >=
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]))
+ ipsaddr->hpp_index = 0;
+
+ /* Control messages go over the control path. */
+ p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+ ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) <<
+ HFI_LRH_SL_SHIFT) |
+ ((proto->sl2sc[ctrl_path->pr_sl] &
+ HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT));
+ p_hdr->lrh[1] = ctrl_path->pr_dlid;
+ p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK);
+ p_hdr->lrh[3] = ctrl_path->pr_slid;
+
+ /* BTH[0]: partition key + control opcode */
+ p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey |
+ (message_type << HFI_BTH_OPCODE_SHIFT));
+
+ /* If flow is congested then generate a BECN for path. */
+ if_pf(flow->flags & IPS_FLOW_FLAG_GEN_BECN) {
+ p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+ ipsaddr->
+ subcontext <<
+ HFI_BTH_SUBCTXT_SHIFT | flow->
+ flowid << HFI_BTH_FLOWID_SHIFT |
+ proto->epinfo.
+ ep_baseqp << HFI_BTH_QP_SHIFT | 1
+ << HFI_BTH_BECN_SHIFT);
+ flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN;
+ }
+ else {
+ p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+ ipsaddr->
+ subcontext <<
+ HFI_BTH_SUBCTXT_SHIFT | flow->
+ flowid << HFI_BTH_FLOWID_SHIFT |
+ proto->epinfo.
+ ep_baseqp << HFI_BTH_QP_SHIFT);
+ }
+
+ /* p_hdr->bth[2] already set by caller, or don't care */
+ /* p_hdr->ack_seq_num already set by caller, or don't care */
+
+ p_hdr->connidx = ipsaddr->connidx_outgoing;
+ p_hdr->flags = 0;
+
+ /* KDETH: interrupt flag + protocol version, plus the job key */
+ p_hdr->khdr.kdeth0 = __cpu_to_le32(
+ (ctrlscb->flags & IPS_SEND_FLAG_INTR) |
+ (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
+ p_hdr->khdr.kdeth1 = __cpu_to_le32(proto->epinfo.ep_jkey);
+
+ return;
+}
+
+/*
+ * Timer callback that drains the pending control-message queue.
+ *
+ * Walks the queue from the tail, transmitting each queued entry over
+ * its flow's transfer method (PIO or DMA). On success the entry's bit
+ * is cleared from the owner's message-queue mask and the tail advances.
+ * On PSM2_EP_NO_RESOURCES the appropriate busy counter is bumped and
+ * the timer is re-requested so the remaining entries are retried later.
+ *
+ * Always returns PSM2_OK (resource exhaustion is handled by re-arming,
+ * not by propagating an error).
+ */
+psm2_error_t
+ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire)
+{
+ struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context;
+ struct ips_proto *proto = ctrlq->ctrlq_proto;
+ struct ips_ctrlq_elem *cqe;
+ uint32_t have_cksum;
+ psm2_error_t err;
+
+ have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+ /* service ctrl send queue first */
+ while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) {
+ cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail];
+
+ if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) {
+ err = ips_spio_transfer_frame(proto,
+ cqe->msg_scb.flow, &cqe->msg_scb.pbc,
+ cqe->msg_scb.cksum, 0, PSMI_TRUE,
+ have_cksum, cqe->msg_scb.cksum[0]
+#ifdef PSM_CUDA
+ , 0
+#endif
+ );
+ } else {
+ err = ips_dma_transfer_frame(proto,
+ cqe->msg_scb.flow, &cqe->msg_scb,
+ cqe->msg_scb.cksum, 0,
+ have_cksum, cqe->msg_scb.cksum[0]);
+ }
+
+ if (err == PSM2_OK) {
+ /* Sent: clear the owner's queued bit and free the slot */
+ ips_proto_epaddr_stats_set(proto, cqe->message_type);
+ *cqe->msg_queue_mask &=
+ ~message_type2index(proto, cqe->message_type);
+ cqe->msg_queue_mask = NULL;
+ ctrlq->ctrlq_tail =
+ (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE;
+ } else {
+ psmi_assert(err == PSM2_EP_NO_RESOURCES);
+
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ proto->stats.writev_busy_cnt++;
+ else
+ proto->stats.pio_busy_cnt++;
+ /* re-request a timer expiration */
+ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+ PSMI_TIMER_PRIO_0);
+ return PSM2_OK;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/* Update cqe struct which is a single element from pending control message queue.
+ * Records the message type and the owner's queue-mask pointer, and copies
+ * the prebuilt header (and checksum word) into the queue element so the
+ * queued entry no longer depends on the caller's ctrlscb. */
+PSMI_ALWAYS_INLINE(
+void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask,
+ struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){
+
+ cqe->message_type = message_type;
+ cqe->msg_queue_mask = msg_queue_mask;
+ psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh,
+ &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh));
+ cqe->msg_scb.flow = flow;
+ cqe->msg_scb.cksum[0] = ctrlscb->cksum[0];
+}
+
+/*
+ * Build and transmit a single control message on a flow.
+ *
+ * Finishes the header via _build_ctrl_message(), optionally checksums
+ * the packet, and sends it over PIO or DMA depending on the flow's
+ * transfer type. ACK/NAK/BECN are redirected to the endpoint's fast
+ * message flow. If the send fails with PSM2_EP_NO_RESOURCES and the
+ * message type is queueable (payload-free ACK/NAK/BECN, per
+ * ctrlq_init()), the message is parked on the pending control queue and
+ * the drain timer is requested; a repeat ACK overwrites the previously
+ * queued ACK entry instead of taking a second slot.
+ *
+ * Returns PSM2_OK when sent or successfully queued,
+ * PSM2_EP_NO_RESOURCES when it could neither be sent nor queued
+ * (overflow counter incremented), or a transfer-layer error.
+ */
+psm2_error_t
+ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask, ips_scb_t *ctrlscb,
+ void *payload, uint32_t paylen)
+{
+ psm2_error_t err = PSM2_EP_NO_RESOURCES;
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+ struct ips_ctrlq *ctrlq = &proto->ctrlq;
+ struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe;
+ uint32_t have_cksum;
+
+ psmi_assert(message_type >= OPCODE_ACK &&
+ message_type <= OPCODE_DISCONNECT_REPLY);
+ psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */
+ psmi_assert(flow->frag_size >=
+ (paylen + PSM_CRC_SIZE_IN_BYTES));
+
+ /* Drain queue if non-empty */
+ if (cqe[ctrlq->ctrlq_tail].msg_queue_mask)
+ ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL);
+
+ /* finish setup control message header */
+ _build_ctrl_message(proto, flow, message_type, ctrlscb, paylen);
+
+ /* If enabled checksum control message */
+ have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+ if (have_cksum) {
+ ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+ ips_do_cksum(proto, &ctrlscb->ips_lrh,
+ payload, paylen, ctrlscb->cksum);
+ }
+
+ /*
+ * for ACK/NAK/BECN, we use the fast flow to send over, otherwise,
+ * we use the original flow
+ */
+ if (message_type == OPCODE_ACK ||
+ message_type == OPCODE_NAK ||
+ message_type == OPCODE_BECN)
+ {
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[proto->msgflowid];
+ }
+
+ /* Attempt the immediate send over the flow's transfer method */
+ switch (flow->transfer) {
+ case PSM_TRANSFER_PIO:
+ err = ips_spio_transfer_frame(proto, flow,
+ &ctrlscb->pbc, payload, paylen,
+ PSMI_TRUE, have_cksum, ctrlscb->cksum[0]
+#ifdef PSM_CUDA
+ , 0
+#endif
+ );
+ break;
+ case PSM_TRANSFER_DMA:
+ err = ips_dma_transfer_frame(proto, flow,
+ ctrlscb, payload, paylen,
+ have_cksum, ctrlscb->cksum[0]);
+ break;
+ default:
+ err = PSM2_INTERNAL_ERR;
+ break;
+ }
+
+ if (err == PSM2_OK)
+ ips_proto_epaddr_stats_set(proto, message_type);
+
+ _HFI_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d,"
+ "src=%p,len=%d returns %d\n",
+ (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh),
+ __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err);
+
+ /* Anything other than resource exhaustion is final (success or a
+ * hard error) — no queueing is attempted. */
+ if (err != PSM2_EP_NO_RESOURCES)
+ return err;
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ proto->stats.writev_busy_cnt++;
+ else
+ proto->stats.pio_busy_cnt++;
+
+ if (proto->ctrl_msg_queue_enqueue & proto->
+ message_type_to_index[message_type]) {
+ /* We only queue control msg without payload */
+ psmi_assert(paylen == 0);
+
+ if ((*msg_queue_mask) & proto->
+ message_type_to_index[message_type]) {
+
+ if (message_type == OPCODE_ACK) {
+ /* Pending queue should contain latest ACK type message,
+ * overwrite the previous one. */
+ ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask,
+ flow, ctrlscb, message_type);
+ }
+
+ /* Already queued: treat as success */
+ err = PSM2_OK;
+ } else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) {
+ /* entry is free */
+ if (message_type == OPCODE_ACK) {
+ /* Track the index of last ACK type message in queue*/
+ flow->ack_index = ctrlq->ctrlq_head;
+ }
+
+ *msg_queue_mask |=
+ message_type2index(proto, message_type);
+
+ ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask,
+ flow, ctrlscb, message_type);
+
+ ctrlq->ctrlq_head =
+ (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE;
+ /* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */
+ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+ PSMI_TIMER_PRIO_0);
+
+ err = PSM2_OK;
+ } else {
+ /* Queue full: count the overflow; err stays
+ * PSM2_EP_NO_RESOURCES for the caller to retry. */
+ proto->ctrl_msg_queue_overflow++;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Enqueue an scb onto a flow for (later) transmission.
+ *
+ * Prepares the scb's headers for the flow, checksums the packet when
+ * PSM2_CKSUM is enabled (single-fragment, non-tidctrl scbs only),
+ * adopts the scb's ack/send timers as the flow's timers on first use,
+ * seeds the pending list if it was empty, and appends the scb to the
+ * flow's unacked queue. Actual transmission happens in the flush paths
+ * (ips_proto_flow_flush_pio / _dma).
+ */
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb)
+{
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+ ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb);
+ if ((proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (scb->tidctrl == 0) && (scb->nfrag == 1)) {
+ scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+ ips_do_cksum(proto, &scb->ips_lrh,
+ ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]);
+ }
+
+ /* If this is the first scb on flow, pull in both timers. */
+ if (flow->timer_ack == NULL) {
+ psmi_assert(flow->timer_send == NULL);
+ flow->timer_ack = scb->timer_ack;
+ flow->timer_send = scb->timer_send;
+ }
+ psmi_assert(flow->timer_ack != NULL);
+ psmi_assert(flow->timer_send != NULL);
+
+ /* Every flow has a pending head that points into the unacked queue.
+ * If sends are already pending, process those first */
+ if (SLIST_EMPTY(&flow->scb_pend))
+ SLIST_FIRST(&flow->scb_pend) = scb;
+
+ /* Insert scb into flow's unacked queue */
+ STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq);
+
+#ifdef PSM_DEBUG
+ /* update scb counters in flow. */
+ flow->scb_num_pending++;
+ flow->scb_num_unacked++;
+#endif
+}
+MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue);
+
+/*
+ * This function attempts to flush the current list of pending
+ * packets through PIO.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Packet triggered through PIO.
+ * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ */
+psm2_error_t
+ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
+{
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
+ int num_sent = 0;
+ uint64_t t_cyc;
+ ips_scb_t *scb;
+ psm2_error_t err = PSM2_OK;
+
+ /* Caller must only flush a flow that actually has pending scbs */
+ psmi_assert(!SLIST_EMPTY(scb_pend));
+
+ /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */
+ if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) {
+ if (nflushed)
+ *nflushed = 0;
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* Send pending scbs one at a time while flow credits last; each
+ * scb must be a single fragment on the PIO path. */
+ while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) {
+ scb = SLIST_FIRST(scb_pend);
+ psmi_assert(scb->nfrag == 1);
+
+ if ((err = ips_spio_transfer_frame(proto, flow, &scb->pbc,
+ ips_scb_buffer(scb),
+ scb->payload_size,
+ PSMI_FALSE,
+ scb->ips_lrh.
+ flags &
+ IPS_SEND_FLAG_PKTCKSUM,
+ scb->cksum[0]
+#ifdef PSM_CUDA
+ , IS_TRANSFER_BUF_GPU_MEM(scb)
+#endif
+ )) == PSM2_OK) {
+ /* Sent: arm the ack timer, consume a credit, and pop
+ * the scb from the pending list (it stays on the
+ * unacked queue until acknowledged). */
+ t_cyc = get_cycles();
+ scb->flags &= ~IPS_SEND_FLAG_PENDING;
+ scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+ scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc;
+ psmi_timer_request(proto->timerq, flow->timer_ack,
+ scb->abs_timeout);
+ num_sent++;
+ flow->credits--;
+ SLIST_REMOVE_HEAD(scb_pend, next);
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+
+ } else
+ break;
+ }
+
+ /* If out of flow credits re-schedule send timer */
+ if (!SLIST_EMPTY(scb_pend)) {
+ proto->stats.pio_busy_cnt++;
+ psmi_timer_request(proto->timerq, flow->timer_send,
+ get_cycles() + proto->timeout_send);
+ }
+
+ if (nflushed != NULL)
+ *nflushed = num_sent;
+
+ return err;
+}
+
+/*
+ * Forward declaration: scb_dma_send() flushes a flow's pending packets
+ * via send DMA. It is defined later in this file and used below by
+ * ips_proto_flow_flush_dma().
+ */
+static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb_pendlist *slist, int *num_sent);
+
+/*
+ * Flush all packets queued up on a flow via send DMA.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Able to flush entire pending queue for DMA.
+ * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA.
+ * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma
+ * queue).
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure,
+ * rxe/txe parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+psm2_error_t
+ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed)
+{
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
+ ips_scb_t *scb = NULL;
+ psm2_error_t err = PSM2_OK;
+ int nsent = 0;
+
+ psmi_assert(!SLIST_EMPTY(scb_pend));
+
+ /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */
+ if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) {
+ if (nflushed)
+ *nflushed = 0;
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ err = scb_dma_send(proto, flow, scb_pend, &nsent);
+ if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES &&
+ err != PSM2_OK_NO_PROGRESS)
+ goto fail;
+
+ if (nsent > 0) {
+ uint64_t t_cyc = get_cycles();
+ int i = 0;
+ /*
+ * inflight counter proto->iovec_cntr_next_inflight should not drift
+ * from completion counter proto->iovec_cntr_last_completed away too
+ * far because we only have very small scb counter compared with
+ * uint32_t counter value.
+ */
+#ifdef PSM_DEBUG
+ flow->scb_num_pending -= nsent;
+#endif
+ /* Walk the first 'nsent' entries: clear PENDING, arm ack timeouts,
+ * and register each scb in the sdma completion ring slot that
+ * scb_dma_send assigned (comp_idx == sdma_fill_index here). */
+ SLIST_FOREACH(scb, scb_pend, next) {
+ if (++i > nsent)
+ break;
+ scb->flags &= ~IPS_SEND_FLAG_PENDING;
+ scb->ack_timeout =
+ scb->nfrag * proto->epinfo.ep_timeout_ack;
+ scb->abs_timeout =
+ scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc;
+
+ psmi_assert(proto->sdma_scb_queue
+ [proto->sdma_fill_index] == NULL);
+ proto->sdma_scb_queue[proto->sdma_fill_index] = scb;
+ scb->dma_complete = 0;
+
+ proto->sdma_avail_counter--;
+ proto->sdma_fill_index++;
+ if (proto->sdma_fill_index == proto->sdma_queue_size)
+ proto->sdma_fill_index = 0;
+
+ /* Flow credits can temporarily go to negative for
+ * packets tracking purpose, because we have sdma
+ * chunk processing which can't send exact number
+ * of packets as the number of credits.
+ */
+ flow->credits -= scb->nfrag;
+ }
+ /* scb now points at the first unsent entry (NULL if the loop
+ * consumed the whole list); it becomes the new pending head. */
+ SLIST_FIRST(scb_pend) = scb;
+ }
+
+ if (SLIST_FIRST(scb_pend) != NULL) {
+ psmi_assert(flow->scb_num_pending > 0);
+
+ switch (flow->protocol) {
+ case PSM_PROTOCOL_TIDFLOW:
+ /* For Tidflow we can cancel the ack timer if we have flow credits
+ * available and schedule the send timer. If we are out of flow
+ * credits then the ack timer is scheduled as we are waiting for
+ * an ACK to reclaim credits. This is required since multiple
+ * tidflows may be active concurrently.
+ */
+ if (flow->credits > 0) {
+ /* Cancel ack timer and reschedule send timer. Increment
+ * writev_busy_cnt as this really is DMA buffer exhaustion.
+ */
+ psmi_timer_cancel(proto->timerq,
+ flow->timer_ack);
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Re-instate ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+
+ break;
+ case PSM_PROTOCOL_GO_BACK_N:
+ default:
+ if (flow->credits > 0) {
+ /* Schedule send timer and increment writev_busy_cnt */
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Schedule ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+ break;
+ }
+ } else {
+ /* Schedule ack timer */
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ psmi_timer_request(proto->timerq, flow->timer_ack,
+ get_cycles() + proto->epinfo.ep_timeout_ack);
+ }
+
+ /* We overwrite error with its new meaning for flushing packets.
+ * Braced explicitly: the unbraced nested if/else form is a classic
+ * dangling-else hazard under future edits. */
+ if (nsent > 0) {
+ if (scb)
+ err = PSM2_OK_NO_PROGRESS; /* partial flush */
+ else
+ err = PSM2_OK; /* complete flush */
+ } else {
+ err = PSM2_EP_NO_RESOURCES; /* no flush at all */
+ }
+
+fail:
+ if (nflushed)
+ *nflushed = nsent;
+
+ return err;
+}
+
+/*
+ * Fault injection in dma sends. Since DMA through writev() is all-or-nothing,
+ * we don't inject faults on a packet-per-packet basis since the code gets
+ * quite complex. Instead, each call to flush_dma or transfer_frame is treated
+ * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND
+ * setting.
+ *
+ * The effect is as if the event was successful but dropped on the wire
+ * somewhere.
+ *
+ * Returns nonzero when this event should be treated as lost.
+ */
+PSMI_ALWAYS_INLINE(int dma_do_fault(void))
+{
+ /* (void) rather than (): an empty list is an old-style unprototyped
+ * declaration in C and defeats argument checking. */
+ if_pf(PSMI_FAULTINJ_ENABLED()) {
+ PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1,
+ IPS_FAULTINJ_DMALOST);
+ return psmi_faultinj_is_fault(fi);
+ }
+ else
+ return 0;
+}
+
+/*
+ * Driver defines the following sdma completion error code, returned
+ * as negative value:
+ * #define SDMA_TXREQ_S_OK 0
+ * #define SDMA_TXREQ_S_SENDERROR 1
+ * #define SDMA_TXREQ_S_ABORTED 2
+ * #define SDMA_TXREQ_S_SHUTDOWN 3
+ *
+ * When hfi is in freeze mode, driver will complete all the pending
+ * sdma request as aborted. Since PSM needs to recover from hfi
+ * freeze mode, this routine ignore aborted error.
+ */
+psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto)
+{
+ ips_scb_t *scb;
+ struct hfi1_sdma_comp_entry *comp;
+ uint32_t status;
+
+ /* Reap driver completions in ring order: done_index chases fill_index
+ * through the sdma completion queue. */
+ while (proto->sdma_done_index != proto->sdma_fill_index) {
+ comp = &proto->sdma_comp_queue[proto->sdma_done_index];
+ status = comp->status;
+ /* Read barrier: order the status load before reading errcode
+ * (written by the driver alongside the status). */
+ psmi_rmb();
+
+ /* Oldest entry still queued in the driver; later entries cannot
+ * be complete either, so stop reaping. */
+ if (status == QUEUED)
+ return PSM2_OK;
+
+ /* Mark sdma request is complete */
+ scb = proto->sdma_scb_queue[proto->sdma_done_index];
+ if (scb) {
+ scb->dma_complete = 1;
+ proto->sdma_scb_queue[proto->sdma_done_index] = NULL;
+ }
+
+ /* errcode is returned negated; -2 == SDMA_TXREQ_S_ABORTED is
+ * deliberately ignored so PSM can recover from hfi freeze mode
+ * (see the comment block above this function). */
+ if (status == ERROR && ((int)comp->errcode) != -2) {
+ psm2_error_t err =
+ psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+ "SDMA completion error: %d (fd=%d, index=%d)",
+ 0 - comp->errcode,
+ proto->fd,
+ proto->sdma_done_index);
+ return err;
+ }
+
+ /* Slot reclaimed: bump availability and advance (with wrap). */
+ proto->sdma_avail_counter++;
+ proto->sdma_done_index++;
+ if (proto->sdma_done_index == proto->sdma_queue_size)
+ proto->sdma_done_index = 0;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Handle ENOMEM from an SDMA writev(): try to free kernel memory by
+ * evicting idle TID-cache entries and retrying; otherwise report no
+ * progress, escalating to a device failure after ~30 seconds.
+ */
+static inline
+psm2_error_t
+handle_ENOMEM_on_DMA_completion(struct ips_proto *proto)
+{
+ psm2_error_t err;
+ time_t now = time(NULL);
+
+ /* If the TID cache has idle entries, evicting them may release the
+ * kernel memory writev() needs; a successful eviction asks the
+ * caller to retry the writev immediately. */
+ if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) {
+ uint64_t lengthEvicted =
+ ips_tidcache_evict(&proto->protoexp->tidc, -1);
+
+ /* Start the failure clock (used for the 30s cutoff below). */
+ if (!proto->writevFailTime)
+ proto->writevFailTime = now;
+
+ if (lengthEvicted)
+ return PSM2_OK; /* signals a retry of the writev command. */
+ else
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+ else if (!proto->writevFailTime)
+ {
+ /* First ENOMEM and nothing to evict: record when it started. */
+ proto->writevFailTime = now;
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+ else
+ {
+ /* ENOMEM persists; after 30 seconds give up and report a
+ * device failure instead of retrying forever. */
+ static const double thirtySeconds = 30.0;
+
+ if (difftime(now, proto->writevFailTime) >
+ thirtySeconds) {
+ err = psmi_handle_error(
+ proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "SDMA completion error: out of "
+ "memory (fd=%d, index=%d)",
+ proto->fd,
+ proto->sdma_done_index);
+ return err;
+ }
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+}
+
+/* ips_dma_transfer_frame is used only for control messages, and is
+ * not enabled by default, and not tested by QA; expected send
+ * dma goes through scb_dma_send().
+ *
+ * Sends one frame (header iovec, plus optional payload iovec) through the
+ * driver via writev(), then synchronously waits for local completion when
+ * a payload buffer was referenced. */
+psm2_error_t
+ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
+ ips_scb_t *scb, void *payload, uint32_t paylen,
+ uint32_t have_cksum, uint32_t cksum)
+{
+ ssize_t ret;
+ psm2_error_t err;
+ struct sdma_req_info *sdmahdr;
+ uint16_t iovcnt;
+ struct iovec iovec[2];
+
+ /* See comments above for fault injection */
+ if_pf(dma_do_fault())
+ return PSM2_OK;
+
+ /*
+ * Check if there is a sdma queue slot.
+ */
+ if (proto->sdma_avail_counter == 0) {
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ return err;
+
+ if (proto->sdma_avail_counter == 0) {
+ return PSM2_EP_NO_RESOURCES;
+ }
+ }
+
+ /*
+ * If we have checksum, put to the end of payload. We make sure
+ * there is enough space in payload for us to put 8 bytes checksum.
+ * for control message, payload is internal PSM buffer, not user buffer.
+ */
+ if (have_cksum) {
+ uint32_t *ckptr = (uint32_t *) ((char *)payload + paylen);
+ *ckptr = cksum;
+ ckptr++;
+ *ckptr = cksum;
+ paylen += PSM_CRC_SIZE_IN_BYTES;
+ }
+
+ /*
+ * Setup PBC.
+ */
+ ips_proto_pbc_update(proto, flow, PSMI_TRUE,
+ &scb->pbc, HFI_MESSAGE_HDR_SIZE, paylen);
+
+ /*
+ * Setup SDMA header and io vector.
+ */
+ sdmahdr = (struct sdma_req_info *)
+ psmi_get_sdma_req_info(scb, proto->ips_extra_sdmahdr_size);
+ sdmahdr->npkts = 1;
+ sdmahdr->fragsize = flow->frag_size;
+
+ sdmahdr->comp_idx = proto->sdma_fill_index;
+ psmi_assert(proto->sdma_comp_queue
+ [proto->sdma_fill_index].status != QUEUED);
+
+ iovcnt = 1;
+ iovec[0].iov_base = sdmahdr;
+ iovec[0].iov_len = HFI_SDMA_HDR_SIZE +
+ proto->ips_extra_sdmahdr_size;
+ if (paylen > 0) {
+ iovcnt++;
+ iovec[1].iov_base = payload;
+ iovec[1].iov_len = paylen;
+ }
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ /* ctrl=2 tells a GPUDirect-capable driver that the extended
+ * request layout is in use. */
+ sdmahdr->ctrl = 2 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+
+ /*
+ * Write into driver to do SDMA work.
+ */
+retry:
+ ret = hfi_cmd_writev(proto->fd, iovec, iovcnt);
+
+ if (ret > 0) {
+ proto->writevFailTime = 0;
+ psmi_assert_always(ret == 1);
+
+ proto->sdma_avail_counter--;
+ proto->sdma_fill_index++;
+ if (proto->sdma_fill_index == proto->sdma_queue_size)
+ proto->sdma_fill_index = 0;
+
+ /*
+ * Wait for completion of this control message if
+ * stack buffer payload is used. This should not be
+ * a performance issue because sdma control message
+ * is not a performance code path.
+ */
+ if (iovcnt > 1) {
+ /* Setup scb ready for completion. */
+ psmi_assert(proto->sdma_scb_queue
+ [sdmahdr->comp_idx] == NULL);
+ proto->sdma_scb_queue[sdmahdr->comp_idx] = scb;
+ scb->dma_complete = 0;
+
+ /* Wait for completion */
+ err = ips_proto_dma_wait_until(proto, scb);
+ } else
+ err = PSM2_OK;
+ } else {
+ /*
+ * ret == 0: Driver did not queue packet. Try later.
+ * ENOMEM: No kernel memory to queue request, try later?
+ * ECOMM: Link may have gone down
+ * EINTR: Got interrupt while in writev
+ */
+ if (errno == ENOMEM) {
+ err = handle_ENOMEM_on_DMA_completion(proto);
+ if (err == PSM2_OK)
+ goto retry;
+ } else if (ret == 0 || errno == ECOMM || errno == EINTR) {
+ err = psmi_context_check_status(
+ (const psmi_context_t *)&proto->ep->context);
+ /*
+ * During a link bounce the err returned from
+ * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case
+ * the error code which we need to return to the calling flush
+ * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to
+ * signal it to restart the timers to flush the packets.
+ * Not doing so would leave the packet on the unacked and
+ * pending q without the sdma descriptors ever being updated.
+ */
+ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK)
+ err = PSM2_EP_NO_RESOURCES;
+ } else
+ /* Report the actual iovec pointer and count (was a
+ * hard-coded len=1 and &iovec, which misreported the
+ * two-iovec case); matches scb_dma_send()'s message. */
+ err = psmi_handle_error(proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Unhandled error in writev(): "
+ "%s (fd=%d,iovec=%p,len=%d)",
+ strerror(errno),
+ proto->fd,
+ iovec,
+ iovcnt);
+ }
+
+ return err;
+}
+
+/*
+ * Caller still expects num_sent to always be correctly set in case of an
+ * error.
+ *
+ * Recoverable errors:
+ * PSM2_OK: At least one packet was successfully queued up for DMA.
+ * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma
+ * queue).
+ * PSM2_OK_NO_PROGRESS: Cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected
+ * error in calling writev(), or chip failure, rxe/txe
+ * parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+static
+psm2_error_t
+scb_dma_send(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb_pendlist *slist, int *num_sent)
+{
+ psm2_error_t err = PSM2_OK;
+ struct sdma_req_info *sdmahdr;
+ struct ips_scb *scb;
+ struct iovec *iovec;
+ uint16_t iovcnt;
+
+ unsigned int vec_idx = 0;
+ unsigned int scb_idx = 0, scb_sent = 0;
+ unsigned int num = 0, max_elem;
+ uint32_t have_cksum;
+ uint32_t fillidx;
+ /* signed on purpose: credits may go negative within one scb (see
+ * the flow-credits comment in ips_proto_flow_flush_dma). */
+ int16_t credits;
+ ssize_t ret;
+
+ /* See comments above for fault injection */
+ if_pf(dma_do_fault()) goto fail;
+
+ /* Check how many SCBs to send based on flow credits */
+ credits = flow->credits;
+ psmi_assert(SLIST_FIRST(slist) != NULL);
+ SLIST_FOREACH(scb, slist, next) {
+ num++;
+ credits -= scb->nfrag;
+ if (credits <= 0)
+ break;
+ }
+ if (proto->sdma_avail_counter < num) {
+ /* if there is not enough sdma slot,
+ * update and use what we have.
+ */
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ goto fail;
+ if (proto->sdma_avail_counter == 0) {
+ err = PSM2_EP_NO_RESOURCES;
+ goto fail;
+ }
+ if (proto->sdma_avail_counter < num)
+ num = proto->sdma_avail_counter;
+ }
+
+ /* header, payload, checksum, tidarray */
+ max_elem = 4 * num;
+ iovec = alloca(sizeof(struct iovec) * max_elem);
+
+ /* NOTE(review): alloca() cannot return NULL (stack exhaustion is
+ * undefined behavior instead), so this check is purely defensive. */
+ if_pf(iovec == NULL) {
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_NO_MEMORY,
+ "alloca for %d bytes failed in writev",
+ (int)(sizeof(struct iovec) * max_elem));
+ goto fail;
+ }
+
+ /* Local shadow of the fill index: comp_idx slots are assigned here,
+ * but proto->sdma_fill_index itself is only advanced by the caller
+ * (ips_proto_flow_flush_dma) for entries writev actually queued. */
+ fillidx = proto->sdma_fill_index;
+ SLIST_FOREACH(scb, slist, next) {
+ /* Can't exceed posix max writev count */
+ if (vec_idx + (int)!!(scb->payload_size > 0) >= UIO_MAXIOV)
+ break;
+
+ psmi_assert(vec_idx < max_elem);
+ psmi_assert_always(((scb->payload_size & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == non_dw_mul_sdma));
+
+ /* Checksum all eager packets */
+ have_cksum = scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM;
+
+ /*
+ * Setup PBC.
+ */
+ ips_proto_pbc_update(
+ proto,
+ flow,
+ PSMI_FALSE,
+ &scb->pbc,
+ HFI_MESSAGE_HDR_SIZE,
+ scb->payload_size +
+ (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0));
+
+ sdmahdr = (struct sdma_req_info *)
+ psmi_get_sdma_req_info(scb, proto->ips_extra_sdmahdr_size);
+
+ sdmahdr->npkts =
+ scb->nfrag > 1 ? scb->nfrag_remaining : scb->nfrag;
+ sdmahdr->fragsize =
+ scb->frag_size ? scb->frag_size : flow->frag_size;
+
+ sdmahdr->comp_idx = fillidx;
+ psmi_assert(proto->sdma_comp_queue[fillidx].status != QUEUED);
+ fillidx++;
+ if (fillidx == proto->sdma_queue_size)
+ fillidx = 0;
+
+ /*
+ * Setup io vector.
+ */
+ iovec[vec_idx].iov_base = sdmahdr;
+ iovec[vec_idx].iov_len = HFI_SDMA_HDR_SIZE +
+ proto->ips_extra_sdmahdr_size;
+ vec_idx++;
+ iovcnt = 1;
+ _HFI_VDBG("hdr=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+
+ if (scb->payload_size > 0) {
+ /*
+ * OPA1 supports byte-aligned payload. If it is
+ * single packet per scb, use payload_size, else
+ * multi-packets per scb, use remaining chunk_size.
+ * payload_size is the remaining chunk first packet
+ * length.
+ */
+ iovec[vec_idx].iov_base = ips_scb_buffer(scb);
+ iovec[vec_idx].iov_len = scb->nfrag > 1
+ ? scb->chunk_size_remaining
+ : scb->payload_size;
+ vec_idx++;
+ iovcnt++;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && IS_TRANSFER_BUF_GPU_MEM(scb)) {
+ /* without this attr, CUDA memory accesses
+ * do not synchronize with gpudirect-rdma accesses.
+ * We set this field only if the currently loaded driver
+ * supports this field. If not, we have other problems
+ * where we have a non gpu-direct enabled driver loaded
+ * and PSM2 is trying to use GPU features.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ sdmahdr->flags = HFI1_BUF_GPU_MEM;
+ else
+ sdmahdr->flags = 0;
+ } else if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ sdmahdr->flags = 0;
+#endif
+
+ _HFI_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n",
+ scb->seq_num.psn_num,
+ iovec[vec_idx - 2].iov_base,
+ (int)iovec[vec_idx - 2].iov_len,
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ }
+
+ /* If checksum then update checksum */
+ if (have_cksum) {
+ scb->cksum[1] = scb->cksum[0];
+ iovec[vec_idx].iov_base = scb->cksum;
+ iovec[vec_idx].iov_len = PSM_CRC_SIZE_IN_BYTES;
+ vec_idx++;
+ iovcnt++;
+
+ _HFI_VDBG("chsum=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ }
+
+ /*
+ * If it is TID receive, attached tid info.
+ */
+ if (scb->tidctrl) {
+ iovec[vec_idx].iov_base = scb->tsess;
+ iovec[vec_idx].iov_len = scb->tsess_length;
+ vec_idx++;
+ iovcnt++;
+
+#ifdef PSM_CUDA
+ /*
+ * The driver knows to check for "flags" field in
+ * sdma_req_info only if ctrl=2.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ sdmahdr->ctrl = 2 |
+ (EXPECTED << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EXPECTED << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+ _HFI_VDBG("tid-info=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ } else {
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ sdmahdr->ctrl = 2 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+ }
+
+ /* Can bound the number to send by 'num' */
+ if (++scb_idx == num)
+ break;
+ }
+ psmi_assert(vec_idx > 0);
+retry:
+ ret = hfi_cmd_writev(proto->fd, iovec, vec_idx);
+
+ if (ret > 0) {
+ proto->writevFailTime = 0;
+ /* No need for inflight system call, we can infer it's value
+ * from
+ * writev's return value */
+ scb_sent += ret;
+ } else {
+ /*
+ * ret == 0: Driver did not queue packet. Try later.
+ * ENOMEM: No kernel memory to queue request, try later?
+ * ECOMM: Link may have gone down
+ * EINTR: Got interrupt while in writev
+ */
+ if (errno == ENOMEM) {
+ err = handle_ENOMEM_on_DMA_completion(proto);
+ if (err == PSM2_OK)
+ goto retry;
+ } else if (ret == 0 || errno == ECOMM || errno == EINTR) {
+ err = psmi_context_check_status(
+ (const psmi_context_t *)&proto->ep->context);
+ /*
+ * During a link bounce the err returned from
+ * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case
+ * the error code which we need to return to the calling flush
+ * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to
+ * signal the caller to restart the timers to flush the packets.
+ * Not doing so would leave the packet on the unacked and
+ * pending q without the sdma descriptors ever being updated.
+ */
+ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK)
+ err = PSM2_EP_NO_RESOURCES;
+ } else {
+ err = psmi_handle_error(
+ proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Unexpected error in writev(): %s (errno=%d) "
+ "(fd=%d,iovec=%p,len=%d)",
+ strerror(errno),
+ errno,
+ proto->fd,
+ iovec,
+ vec_idx);
+ goto fail;
+ }
+ }
+
+fail:
+ /* Caller depends on *num_sent being valid on every exit path,
+ * including errors (see the contract comment above). */
+ *num_sent = scb_sent;
+ psmi_assert(*num_sent <= num && *num_sent >= 0);
+ return err;
+}
+
+/*
+ * Because we only lazily reap send dma completions, it's possible that we
+ * receive a packet's remote acknowledgement before seeing that packet's local
+ * completion. As part of processing ack packets and releasing scbs, we issue
+ * a wait for the local completion if the scb is marked as having been sent via
+ * send dma.
+ */
+psm2_error_t
+ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb)
+{
+ psm2_error_t err = PSM2_OK;
+ int spins = 0;
+ int yielded = 0;
+
+ PSMI_PROFILE_BLOCK();
+
+ /* Spin on the completion queue until this scb's send DMA is locally
+ * complete; the queue is reaped at least once before checking. */
+ for (;;) {
+ if (spins++ == proto->ep->yield_spin_cnt) {
+ /* Have to yield holding the PSM lock, mostly because we don't
+ * support another thread changing internal state at this point in
+ * the code.
+ */
+ yielded = 1;
+ spins = 0;
+ sched_yield();
+ }
+
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ return err;
+
+ if (scb->dma_complete)
+ break;
+ }
+
+ if (yielded)
+ proto->stats.writev_compl_delay++;
+
+ PSMI_PROFILE_UNBLOCK();
+
+ return err;
+}
+
+psm2_error_t
+ips_proto_timer_ack_callback(struct psmi_timer *current_timer,
+ uint64_t current)
+{
+ /* Fires when the oldest unacked scb on a flow may have timed out:
+ * sends an ERR_CHK (or ERR_CHK_GEN for tidflow) to solicit an
+ * ACK/NAK, with exponential backoff on the ack timeout. */
+ struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow;
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ uint64_t t_cyc_next = get_cycles();
+ psmi_seqnum_t err_chk_seq;
+ ips_scb_t *scb, ctrlscb; /* ctrlscb is a stack-allocated control scb */
+ uint8_t message_type;
+
+ if (STAILQ_EMPTY(&flow->scb_unacked))
+ return PSM2_OK;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+
+ if (current >= scb->abs_timeout) {
+ int done_local = 0;
+
+ /* We have to ensure that the send is at least locally complete before
+ * sending an error check or else earlier data can get to the
+ * destination *after* we pio or dma this err_chk.
+ */
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ /* error is caught inside this routine */
+ ips_proto_dma_completion_update(proto);
+
+ if (scb->dma_complete)
+ done_local = 1;
+ else
+ proto->stats.writev_compl_eagain++;
+ } else
+ done_local = 1; /* Always done for PIO flows */
+
+ /* Exponential backoff: multiply the ack timeout, capped at
+ * ep_timeout_ack_max. */
+ scb->ack_timeout =
+ min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor,
+ proto->epinfo.ep_timeout_ack_max);
+ scb->abs_timeout = t_cyc_next + scb->ack_timeout;
+
+ if (done_local) {
+ _HFI_VDBG
+ ("sending err_chk flow=%d with first=%d,last=%d\n",
+ flow->flowid,
+ STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num,
+ STAILQ_LAST(&flow->scb_unacked, ips_scb,
+ nextq)->seq_num.psn_num);
+
+ ctrlscb.flags = 0;
+ if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD)
+ ctrlscb.flags |= IPS_SEND_FLAG_INTR;
+
+ /* The err_chk sequence is just before the first pending
+ * (unsent) scb, or the next transmit psn when nothing
+ * is pending. */
+ err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ?
+ flow->xmit_seq_num :
+ SLIST_FIRST(&flow->scb_pend)->seq_num;
+
+ if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+ message_type = OPCODE_ERR_CHK_GEN;
+ err_chk_seq.psn_seq -= 1;
+ /* Receive descriptor index */
+ ctrlscb.ips_lrh.data[0].u64 =
+ scb->tidsendc->rdescid.u64;
+ /* Send descriptor index */
+ ctrlscb.ips_lrh.data[1].u64 =
+ scb->tidsendc->sdescid.u64;
+ } else {
+ PSM2_LOG_MSG("sending ERR_CHK message");
+ message_type = OPCODE_ERR_CHK;
+ err_chk_seq.psn_num = (err_chk_seq.psn_num - 1)
+ & proto->psn_mask;
+ }
+ ctrlscb.ips_lrh.bth[2] =
+ __cpu_to_be32(err_chk_seq.psn_num);
+
+ ips_proto_send_ctrl_message(flow, message_type,
+ &flow->ipsaddr->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ t_cyc_next = get_cycles() + scb->ack_timeout;
+ } else
+ /* Timer fired early: re-arm for the remaining interval. */
+ t_cyc_next += (scb->abs_timeout - current);
+
+ psmi_timer_request(proto->timerq, current_timer, t_cyc_next);
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+ips_proto_timer_send_callback(struct psmi_timer *current_timer,
+ uint64_t current)
+{
+ ips_scb_t *ctx_scb = (ips_scb_t *)current_timer->context;
+ struct ips_flow *flow = ctx_scb->flow;
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+
+ /* A congestion NAK marked this flow congested (see process-nak):
+ * clear the flag and decrease the injection rate before flushing. */
+ if_pf(flow->flags & IPS_FLOW_FLAG_CONGESTED) {
+ int incr = proto->cace[flow->path->pr_sl].ccti_increase;
+
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ if ((flow->path->pr_ccti + incr) <= proto->ccti_limit)
+ ips_cca_adjust_rate(flow->path, incr);
+ }
+
+ /* Push out whatever is still pending on this flow. */
+ if (!SLIST_EMPTY(&flow->scb_pend))
+ flow->flush(flow, NULL);
+
+ return PSM2_OK;
+}
+
+/* Adjust the CCA (congestion control) injection rate for a path by moving
+ * its CCT index: callers pass a positive increment on congestion and -1
+ * (from ips_cca_timer_callback) to recover. */
+psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment)
+{
+ struct ips_proto *proto = path_rec->proto;
+
+ /* Increment/decrement ccti for path */
+ psmi_assert_always(path_rec->pr_ccti >=
+ proto->cace[path_rec->pr_sl].ccti_min);
+ path_rec->pr_ccti += cct_increment;
+
+ /* Determine new active IPD. */
+#if _HFI_DEBUGGING
+ uint16_t prev_ipd = 0;
+ uint16_t prev_divisor = 0;
+ if (_HFI_CCADBG_ON) {
+ prev_ipd = path_rec->pr_active_ipd;
+ prev_divisor = path_rec->pr_cca_divisor;
+ }
+#endif
+ /* Use the static IPD when it is more conservative than the CCT
+ * table entry for the new ccti; otherwise take IPD and divisor
+ * from the CCT table. */
+ if ((path_rec->pr_static_ipd) &&
+ ((path_rec->pr_static_ipd + 1) >
+ (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
+ path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
+ path_rec->pr_cca_divisor = 0;
+ } else {
+ path_rec->pr_active_ipd =
+ proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
+ path_rec->pr_cca_divisor =
+ proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;
+ }
+
+#if _HFI_DEBUGGING
+ if (_HFI_CCADBG_ON) {
+ _HFI_CCADBG_ALWAYS("CCA: %s injection rate to <%x.%x> from <%x.%x>\n",
+ (cct_increment > 0) ? "Decreasing" : "Increasing",
+ path_rec->pr_cca_divisor, path_rec->pr_active_ipd,
+ prev_divisor, prev_ipd);
+ }
+#endif
+
+ /* Reschedule CCA timer if this path is still marked as congested */
+ if (path_rec->pr_ccti > proto->cace[path_rec->pr_sl].ccti_min) {
+ /* Lazily allocate the per-path CCA timer from the pool. */
+ if (path_rec->pr_timer_cca == NULL) {
+ path_rec->pr_timer_cca =
+ (struct psmi_timer *)psmi_mpool_get(proto->
+ timer_pool);
+ psmi_assert(path_rec->pr_timer_cca != NULL);
+ psmi_timer_entry_init(path_rec->pr_timer_cca,
+ ips_cca_timer_callback, path_rec);
+ }
+ psmi_timer_request(proto->timerq,
+ path_rec->pr_timer_cca,
+ get_cycles() +
+ proto->cace[path_rec->pr_sl].
+ ccti_timer_cycles);
+ } else if (path_rec->pr_timer_cca) {
+ /* Back at the minimum ccti: return the timer to the pool. */
+ psmi_mpool_put(path_rec->pr_timer_cca);
+ path_rec->pr_timer_cca = NULL;
+ }
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current)
+{
+ ips_path_rec_t *pr = (ips_path_rec_t *) current_timer->context;
+
+ /* Still above the CCTI floor: raise the injection rate one step by
+ * decrementing CCTI; adjust_rate reschedules this timer if needed. */
+ if (pr->pr_ccti > pr->proto->cace[pr->pr_sl].ccti_min)
+ return ips_cca_adjust_rate(pr, -1);
+
+ /* Path has fully recovered: release the CCA timer back to the pool. */
+ psmi_mpool_put(pr->pr_timer_cca);
+ pr->pr_timer_cca = NULL;
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
new file mode 100644
index 0000000..00da753
--- /dev/null
+++ b/ptl_ips/ips_proto.h
@@ -0,0 +1,687 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_H
+#define _IPS_PROTO_H
+
+#include "psm_user.h"
+
+#include "ips_recvhdrq.h"
+#include "ips_tid.h"
+#include "ips_scb.h"
+#include "ips_epstate.h"
+#include "ips_spio.h"
+#include "ips_stats.h"
+#include "ips_proto_am.h"
+#include "ips_tidflow.h"
+#include "ips_path_rec.h"
+
+/* Path priority classes used to index per-priority path arrays. */
+typedef enum ips_path_type {
+ IPS_PATH_LOW_PRIORITY,
+ IPS_PATH_NORMAL_PRIORITY,
+ IPS_PATH_HIGH_PRIORITY,
+ IPS_PATH_MAX_PRIORITY /* count of classes; used as array dimension below */
+} ips_path_type_t;
+
+/*
+ * Local Endpoint info.
+ *
+ * Contains information necessary for composing packets for the local endpoint
+ */
+struct ips_epinfo {
+ uint16_t ep_base_lid;
+ uint8_t ep_baseqp;
+ uint8_t ep_lmc;
+ opa_rate ep_link_rate;
+ uint16_t ep_context;
+ uint16_t ep_subcontext;
+ uint16_t ep_hfi_type;
+ uint16_t ep_sl; /* HFI_SL only when path record not used */
+ uint16_t ep_mtu;
+ uint16_t ep_piosize;
+ uint16_t ep_pkey; /* PSM2_PKEY only when path record not used */
+ uint16_t ep_jkey;
+ uint64_t ep_timeout_ack; /* PSM2_ERRCHK_TIMEOUT if no path record */
+ uint64_t ep_timeout_ack_max; /* cap for exponential ack-timeout backoff */
+ uint32_t ep_timeout_ack_factor; /* multiplier for ack-timeout backoff */
+};
+
+/*
+ * Remote Endpoint info.
+ *
+ * Contains information necessary for composing packets for a remote endpoint
+ */
+#define IPS_MAX_PATH_LMC 3
+typedef struct ips_path_grp {
+ /* For LMC/Torus keep list of base and max dlid. Used for pkt verification */
+ uint16_t pg_base_lid;
+ uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY]; /* paths available per priority */
+ uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY]; /* presumably a rotation cursor per priority — confirm at use sites */
+ /* flexible tail: one row of per-priority path pointers per path */
+ ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY];
+} ips_path_grp_t;
+
+/*
+ * Control messages.
+ *
+ * ips low-level control messages to ensure reliability of eager packets.
+ *
+ */
+struct ips_proto;
+psm2_error_t ips_proto_init(const psmi_context_t *context, const struct ptl *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, const struct psmi_timer_ctrl *timerq, /* PTL's timerq */
+ const struct ips_epstate *epstate, /* PTL's epstate */
+ const struct ips_spio *spioc, /* PTL's spio control */
+ struct ips_proto *proto); /* output protocol */
+
+psm2_error_t ips_proto_fini(struct ips_proto *proto, int force,
+ uint64_t timeout);
+
+/*
+ * Control message structures
+ */
+#define CTRL_MSG_QEUEUE_SIZE 64 /* power of two */
+
+/* One queued control message (held while pio is busy). */
+struct ips_ctrlq_elem {
+ uint8_t message_type;
+ uint16_t *msg_queue_mask;
+ ips_scb_t msg_scb;
+};
+
+struct ips_ctrlq {
+ /* Queued control messages, queued when pio is busy */
+ struct ips_proto *ctrlq_proto; /* back pointer to owning proto */
+
+ uint32_t ctrlq_head;
+ uint32_t ctrlq_tail;
+ uint32_t ctrlq_overflow;
+
+ struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN;
+ struct psmi_timer ctrlq_timer; /* when in timerq */
+};
+
+/* Connect/disconnect, as implemented by ips */
+
+/*
+ * Connections are not pairwise but we keep a single 'epaddr' for messages-from
+ * and messages-to a remote 'epaddr'. State transitions for connecting TO and
+ * FROM 'epaddrs' are the following:
+ * Connect TO (Connect OUTGOING):
+ * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE
+ *
+ * Connect FROM (we receive a connect request - Connect INCOMING)
+ * NONE -> ESTABLISHED -> NONE
+ */
+#define CSTATE_ESTABLISHED 1
+#define CSTATE_NONE 2
+#define CSTATE_OUTGOING_DISCONNECTED 3
+#define CSTATE_OUTGOING_WAITING 4
+#define CSTATE_OUTGOING_WAITING_DISC 5
+
+psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep,
+ const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask,
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr,
+ uint64_t timeout_in);
+
+psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+ psm2_epaddr_t array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t array_of_errors[],
+ uint64_t timeout_in);
+
+int ips_proto_isconnected(struct ips_epaddr *ipsaddr);
+
+/*
+ * Pending operation structures
+ */
+/* One pending send operation queued for later processing. */
+struct ips_pend_sreq {
+ STAILQ_ENTRY(ips_pend_sreq) next;
+ psm2_mq_req_t req;
+ uint32_t type; /* one of the IPS_PENDSEND_* values below */
+};
+
+#define IPS_PENDSEND_EAGER_DATA 1
+#define IPS_PENDSEND_EAGER_REQ 2
+#define IPS_PENDSEND_EXP_TIDS 3
+#define IPS_PENDSEND_EXP_SENDS 4
+
+STAILQ_HEAD(ips_pendsendq, ips_pend_sreq);
+
+/* Per-proto queue of pending sends, flushed from a timer. */
+struct ips_pend_sends {
+ struct ips_proto *proto; /* back ptr */
+ struct psmi_timer timer;
+ struct ips_pendsendq pendq;
+};
+
+/*
+ * One instance of the protocol
+ */
+
+struct ips_protoexp;
+
+struct ips_proto_stats {
+ uint64_t pio_busy_cnt;
+ uint64_t writev_busy_cnt; /* DMA writev lacked room; send was rescheduled */
+ uint64_t writev_compl_eagain; /* DMA not locally complete at err_chk time */
+ uint64_t writev_compl_delay; /* had to sched_yield waiting for completion */
+ uint64_t scb_egr_unavail_cnt;
+ uint64_t scb_exp_unavail_cnt;
+ uint64_t hdr_overflow;
+ uint64_t egr_overflow;
+ uint64_t lid_zero_errs;
+ uint64_t unknown_packets;
+ uint64_t stray_packets;
+};
+
+/* Receive-side packet error counters. */
+struct ips_proto_error_stats {
+ uint64_t num_icrc_err;
+ uint64_t num_ecc_err;
+ uint64_t num_len_err;
+ uint64_t num_tid_err;
+ uint64_t num_dc_err;
+ uint64_t num_dcunc_err;
+ uint64_t num_khdrlen_err;
+};
+
+/*
+ * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init
+ */
+struct ips_proto_epaddr_stats {
+ uint64_t err_chk_send;
+ uint64_t err_chk_recv;
+ uint64_t nak_send;
+ uint64_t nak_recv;
+ uint64_t connect_req;
+ uint64_t disconnect_req;
+ uint64_t tids_grant_send;
+ uint64_t tids_grant_recv;
+ uint64_t send_rexmit;
+ uint64_t congestion_pkts; /* IB CCA FECN packets */
+};
+
+/* OPP support structure. */
+struct opp_api {
+ void *(*op_path_find_hca) (const char *name, void **device);
+ void *(*op_path_open) (void *device, int port_num);
+ void (*op_path_close) (void *context);
+ int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query,
+ ibta_path_rec_t *response);
+};
+
+struct ips_ibta_compliance_fn {
+ psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid,
+ uint16_t dlid, uint16_t desthfi_type,
+ unsigned long timeout,
+ ips_path_grp_t **ppathgrp);
+ psm2_error_t(*fini) (struct ips_proto *proto);
+};
+
+/* please don't change the flow id order -- the numeric values are used
+ * as indices (e.g. ips_epaddr.flows[]) and appear on the wire/headers. */
+typedef enum ips_epaddr_flow {
+ EP_FLOW_GO_BACK_N_PIO,
+ EP_FLOW_GO_BACK_N_DMA,
+ EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */
+ EP_FLOW_LAST /* Keep this the last endpoint flow */
+} ips_epaddr_flow_t;
+
+/* How payload leaves the node: programmed I/O or send DMA. */
+typedef enum psm_transfer_type {
+ PSM_TRANSFER_PIO,
+ PSM_TRANSFER_DMA,
+ PSM_TRANSFER_LAST /* Keep this the last transfer type */
+} psm_transfer_type_t;
+
+/* Reliability protocol carried by a flow. */
+typedef enum psm_protocol_type {
+ PSM_PROTOCOL_GO_BACK_N,
+ PSM_PROTOCOL_TIDFLOW,
+ PSM_PROTOCOL_LAST /* Keep this the last protocol type */
+} psm_protocol_type_t;
+
+/* Central per-endpoint protocol state: cached handles, send-side
+ * resources (SCB pools, SDMA completion ring), timers, statistics,
+ * CCA tables and path-record caches. One instance per ips ptl. */
+struct ips_proto {
+ struct ptl *ptl; /* cached */
+ psm2_ep_t ep; /* cached, for errors */
+ psm2_mq_t mq; /* cached, for mq handling */
+ int fd; /* cached, for writev ops */
+
+ /* Pending sends */
+ struct ips_pend_sends pend_sends;
+ struct ips_epstate *epstate;
+ struct psmi_timer_ctrl *timerq;
+
+ struct ips_protoexp *protoexp;
+ struct ips_scbctrl *scbc_rv;
+ struct ips_spio *spioc;
+ struct ips_scbctrl scbc_egr;
+ struct ips_epinfo epinfo;
+
+ /* SDMA completion ring: fill/done indices walk sdma_comp_queue;
+ * avail_counter tracks free slots (presumably -- confirm against
+ * ips_proto_dma_completion_update). */
+ ips_scb_t **sdma_scb_queue;
+ struct hfi1_sdma_comp_entry *sdma_comp_queue;
+ uint16_t sdma_queue_size;
+ uint16_t sdma_fill_index;
+ uint16_t sdma_done_index;
+ uint16_t sdma_avail_counter;
+
+ uint64_t timeout_send;
+ uint32_t flags; /* < if IPS_PROTO_FLAG_SDMA is NOT set, SPIO flow will be initialized
+ * < if IPS_PROTO_FLAG_SPIO is NOT set, SDMA flow will be initialized
+ * < so both flows (SDMA and PIO) will be initialized if both of the
+ * < IPS_PROTO_FLAG_S{DMA,PIO} are CLEARED
+ */
+ uint32_t iovec_thresh_eager;
+ uint32_t iovec_thresh_eager_blocking;
+ uint32_t psn_mask;
+ uint32_t scb_bufsize;
+ uint16_t flow_credits;
+ mpool_t pend_sends_pool;
+ mpool_t timer_pool;
+ struct ips_ibta_compliance_fn ibta;
+ struct ips_proto_stats stats;
+ struct ips_proto_error_stats error_stats;
+ struct ips_proto_epaddr_stats epaddr_stats;
+
+ struct ips_proto_am proto_am;
+
+ struct ips_ctrlq ctrlq;
+ /* pure sdma mode, use dma flow, otherwise, use pio flow */
+ ips_epaddr_flow_t msgflowid;
+
+ /* Handling tid errors */
+ uint32_t tiderr_cnt;
+ uint32_t tiderr_max;
+ uint64_t tiderr_tnext;
+ uint64_t tiderr_warn_interval;
+
+ uint64_t t_init;
+ uint64_t t_fini;
+ uint32_t runid_key;
+
+ int num_connected_outgoing;
+ int num_connected_incoming;
+ int num_disconnect_requests;
+
+ /* misc state variables. */
+
+ /* Smallest interval in cycles between which we warn about stray
+ * messages This is a per-endpoint quantity, overridable with
+ * PSM_STRAY_WARN_INTERVAL We use the same interval to send the "die"
+ * message.
+ */
+ uint64_t stray_warn_interval;
+ int done_warning;
+ int done_once;
+ int num_bogus_warnings;
+ struct {
+ uint32_t interval_secs;
+ uint64_t next_warning;
+ uint64_t count;
+ } psmi_logevent_tid_send_reqs;
+
+ /* SL2SC and SC2VL table for protocol */
+ uint16_t sl2sc[32];
+ uint16_t sc2vl[32];
+
+ /* CCA per port */
+ uint16_t *cct; /* cct table */
+ uint16_t ccti_size; /* ccti table size */
+ uint16_t ccti_limit; /* should be <= size-1 */
+
+ uint16_t ccti_portctrl; /* QP or SL CC */
+ uint32_t ccti_ctrlmap; /* map for valid sl */
+ struct cace { /* CACongestionEntry */
+ uint8_t ccti_increase; /* steps to increase */
+ /* uint16_t ccti_timer;*/ /* CCTI Timer in units of 1.024 usec */
+ uint64_t ccti_timer_cycles; /* converted from us_2_cycles() */
+ uint8_t ccti_threshold; /* threshold to make log */
+ uint8_t ccti_min; /* min value for ccti */
+ } cace[32]; /* 32 service levels */
+
+ /* Path record support */
+ uint8_t ips_ipd_delay[IBV_RATE_300_GBPS + 1];
+ struct hsearch_data ips_path_rec_hash;
+ struct hsearch_data ips_path_grp_hash;
+ void *opp_lib;
+ void *hndl;
+ void *device;
+ void *opp_ctxt;
+ struct opp_api opp_fn;
+
+#ifdef PSM_CUDA
+ struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg;
+ struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg;
+ mpool_t cuda_hostbuf_pool_send;
+ mpool_t cuda_hostbuf_pool_small_send;
+ cudaStream_t cudastream_send;
+ unsigned cuda_prefetch_limit;
+#endif
+ int ips_extra_sdmahdr_size;
+/*
+ * Control message queue for pending messages.
+ *
+ * Control messages are queued as pending when no PIO is available for sending
+ * the message. They are composed on the fly and do not need buffering.
+ *
+ * Variables here are write once (at init) and read afterwards (except the msg
+ * queue overflow counters).
+ */
+ uint32_t ctrl_msg_queue_overflow;
+ uint32_t ctrl_msg_queue_enqueue;
+ uint32_t message_type_to_index[256];
+#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)])
+
+ time_t writevFailTime;
+};
+
+/*
+ * Endpoint address, encapsulates per-endpoint protocol metadata
+ *
+ * Directly implements the ptl epaddr.
+ */
+typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed);
+
+/**
+ * ips_flow is a structure that combines all information regarding a send
+ * from one endpoint to another one. Specifically, it is the place where
+ * the Maximum Transmission Unit for a send is calculated, given how many
+ * factors could possibly influence the MTU calculation. See ips_flow_init
+ * documentation for more details.
+ */
+struct ips_flow {
+ SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */
+ ips_flow_flush_fn_t flush; /* flush function for this flow */
+
+ struct ips_epaddr *ipsaddr; /* back pointer, remote endpoint */
+ ips_path_rec_t *path; /* Path to use for flow */
+
+ uint16_t frag_size; /* < This flow's fragment size, calculated as the
+ < minimum of all relevant MTUs involved */
+
+ uint16_t flowid:2; /* flow id: pio(0) or dma(1) or tidflow(2) */
+ uint16_t transfer:3; /* spio or sdma */
+ uint16_t protocol:3; /* go-back-n or tidflow */
+ uint16_t flags:8; /* flow state flags */
+
+ uint16_t cca_ooo_pkts; /* cca out of order packets */
+ uint16_t cwin; /* Size of congestion window */
+ uint16_t ack_interval; /* interval to ack packets */
+ uint16_t ack_counter; /* counter to ack packets */
+ int16_t credits; /* Current credits available to send on flow */
+ uint32_t ack_index; /* Index of the last ACK message type in pending message queue */
+
+ psmi_seqnum_t xmit_seq_num; /* transmit packet sequence number */
+ psmi_seqnum_t xmit_ack_num; /* acked packet sequence number */
+ psmi_seqnum_t recv_seq_num; /* recieved packet sequence number */
+
+ psmi_timer *timer_send; /* timer for frames that got a busy PIO */
+ psmi_timer *timer_ack; /* timer for unacked frames */
+
+ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked; /* unacked queue */
+ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend; /* pending queue */
+
+#ifdef PSM_DEBUG
+ uint32_t scb_num_pending; /* pending scb counter */
+ uint32_t scb_num_unacked; /* unacked scb counter */
+#endif
+};
+
+#define IPS_FLOW_MSG_TOGGLE_OOO_MASK (1 << 0) /* ooo msg check */
+#define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK (1 << 1) /* unexp msg check */
+/*
+ * Make sure ips_epaddr_t and psm2_epaddr_t can be converted each other.
+ * (struct psm2_epaddr must remain the first member so casts work.)
+ */
+struct ips_epaddr {
+ struct psm2_epaddr epaddr; /* inlined psm level epaddr */
+ struct ips_msgctl *msgctl; /* ips level msg control */
+
+ struct ips_epaddr *next; /* circular linklist (see IPS_MCTXT_APPEND) */
+
+ struct ips_flow flows[EP_FLOW_LAST - 1]; /* pio and dma */
+ ips_path_grp_t *pathgrp; /* pointer to slid/dlid group in hash */
+
+ uint32_t connidx_outgoing; /* peer's connection idx */
+ uint32_t connidx_incoming; /* my connection idx */
+
+ uint16_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */
+ uint32_t window_rv; /* RNDV window size per connection */
+
+ uint8_t hpp_index; /* high priority index */
+ uint8_t context; /* real context value */
+ uint8_t subcontext; /* sub context, 3 bits, 5 bits for future */
+ uint8_t msg_toggle; /* only 2 bits used, 6 bits for future */
+
+ /* this portion is only for connect/disconnect */
+ uint64_t s_timeout; /* used as a time in close */
+ uint32_t runid_key; /* peer process pid */
+ uint32_t credit:2; /* credit to connect/disconnect: 0 or 1 */
+ uint32_t cstate_outgoing:3; /* connection state to, max 7 */
+ uint32_t cstate_incoming:3; /* connection state from, max 7 */
+ uint32_t delay_in_ms:8; /* disconnect delay in ms */
+ uint32_t cerror_outgoing:8; /* error code during connection */
+ uint32_t cerror_incoming:8; /* error code during connection */
+};
+
+/*
+ * ips_msgctl_t is per connection struct.
+ * Shared by all rails of one connection; holds the message sequence
+ * counters used for in-order delivery across rails.
+ */
+struct ips_msgctl {
+ struct ips_epaddr master_epaddr; /* Master rail's epaddr */
+
+ struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */
+ uint16_t mq_send_seqnum; /* next sending message sequence */
+ uint16_t mq_recv_seqnum; /* next receiving message sequence */
+ uint16_t am_send_seqnum; /* next sending message sequence */
+ uint16_t am_recv_seqnum; /* next receiving message sequence */
+ uint16_t ipsaddr_count; /* number of ipsaddr to use */
+ uint16_t outoforder_count; /* number of outoforder messages */
+};
+
+/* Insert 'node' into the circular singly-linked rail list just before
+ * 'head'. Requires that 'head' is already on a well-formed circular
+ * list (a single element points to itself); O(n) in list length. */
+static inline __attribute__ ((unused))
+void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node)
+{
+ ips_epaddr_t *cur;
+
+ /* The new node is inserted before head. */
+ node->next = head;
+
+ /* Circle around the linked list to head's predecessor and update. */
+ for (cur = head; cur->next != head; cur = cur->next);
+ cur->next = node;
+}
+
+/* Unlink 'node' from its circular rail list, leaving it self-linked
+ * (a valid one-element circular list). O(n) predecessor walk. */
+static inline __attribute__ ((unused))
+void IPS_MCTXT_REMOVE(ips_epaddr_t *node)
+{
+ ips_epaddr_t *cur;
+
+ /* Circle around to node's predecessor and update. */
+ for (cur = node; cur->next != node; cur = cur->next);
+ cur->next = node->next;
+ node->next = node;
+}
+
+/*
+ * Initialize a flow, setting its attributes. Selects the path the flow will
+ * use as well as calculates the flow's fragment size defined as:
+ * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends
+ * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends
+ */
+void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index);
+MOCK_DCL_EPILOGUE(ips_flow_init);
+
+void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr,
+ struct ips_flow *flow);
+
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb);
+MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue);
+
+psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed);
+psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed);
+
+/* Wrapper for enqueue + flush */
+psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb);
+
+void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb);
+psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, int *nflushed);
+psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb);
+psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto);
+
+psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow, ips_scb_t *scb,
+ void *payload, uint32_t paylen,
+ uint32_t have_cksum, uint32_t cksum);
+
+/*
+ * Protocol receive processing
+ *
+ */
+/* Error handling for unknown packet, packet is unknown when epid doesn't match
+ * in epstate table */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev);
+/* Exposed for fastpath only */
+int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev);
+/* Handling error cases */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * Protocol exception handling and frame dumps
+ */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len);
+void ips_proto_dump_err_stats(struct ips_proto *proto);
+void ips_proto_show_rhf_errors(const uint32_t *rhdr);
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg);
+void ips_proto_dump_frame(void *frame, int lenght, char *message);
+void ips_proto_dump_data(void *data, int data_length);
+void ips_proto_dump_eager(uint32_t *curr_rcv_hdr);
+
+/*
+ * Checksum of ips packets
+ */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);
+
+/*
+ * Matched-Queue processing and sends
+ */
+psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto,
+ psm2_mq_req_t req);
+psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto,
+ psm2_mq_req_t req);
+int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev);
+void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl);
+int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev);
+
+psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag,
+ const void *ubuf, uint32_t len);
+
+psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag,
+ const void *ubuf, uint32_t len, void *context,
+ psm2_mq_req_t *req_o);
+
+#define IPS_NON_DW_MUL_NOT_ALLOWED 0
+#define IPS_NON_DW_MUL_ALLOWED 1
+void ips_proto_mq_set_non_dw_mul_sdma(uint32_t mode);
+
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * IPS packet service routine table.
+ */
+typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev);
+extern ips_packet_service_fn_t
+ ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED];
+
+/* IBTA feature related functions (path record, sl2sc2vl etc.) */
+psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto);
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto);
+
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto);
+MOCK_DCL_EPILOGUE(ips_ibta_init);
+
+psm2_error_t ips_ibta_fini(struct ips_proto *proto);
+
+/* Return the address of the sdma_req_info that precedes the scb's PBC
+ * in memory. With GPUDirect enabled, an additional sdmahdr_extra_bytes
+ * gap is reserved between the req info and the PBC; otherwise the extra
+ * argument is ignored. */
+PSMI_ALWAYS_INLINE(
+void* psmi_get_sdma_req_info(struct ips_scb *scb, int sdmahdr_extra_bytes))
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ return (void *)(((char*)&scb->pbc) - sizeof(struct sdma_req_info_v6_3) -
+ sdmahdr_extra_bytes);
+#endif
+ return (void *)(((char*)&scb->pbc) - sizeof(struct sdma_req_info_v6_3));
+}
+
+#ifdef PSM_CUDA
+/* Size of the next transfer window: the remaining bytes (len - offset)
+ * clamped to max_window. Caller must ensure offset <= len. */
+PSMI_ALWAYS_INLINE(
+uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset,
+ uint32_t len))
+{
+ uint32_t window_len;
+ window_len = len - offset;
+ if (window_len >= max_window)
+ window_len = max_window;
+ return window_len;
+}
+#endif
+
+#endif /* _IPS_PROTO_H */
diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c
new file mode 100644
index 0000000..98a7460
--- /dev/null
+++ b/ptl_ips/ips_proto_am.c
@@ -0,0 +1,595 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Token handed to AM handlers; wraps the generic psmi token plus the
+ * rail and proto_am needed to send a reply on the same rail. */
+struct ips_am_token {
+ struct psmi_am_token tok;
+
+ /* ptl-specific token stuff */
+ struct ips_epaddr *epaddr_rail;
+ struct ips_proto_am *proto_am;
+};
+
+/* A buffered out-of-order AM: copied header + sysbuf payload, queued
+ * until its seqnum matches the endpoint's am_recv_seqnum. */
+struct ips_am_message {
+ struct ips_message_header p_hdr;
+ struct ips_am_message *next;
+ struct ips_epaddr *ipsaddr;
+ struct ips_proto_am *proto_am;
+ uint64_t *payload;
+ uint32_t paylen;
+ uint16_t seqnum;
+};
+
+/* These variables are shared for all packet flows in a PSM process; they are
+ * shared across multiple rails. There is no single AM object to hang these
+ * off of, so they are declared here as globals. */
+static struct {
+ struct ips_am_message head; /* sentinel; real list starts at head.next */
+ struct ips_am_message *tail;
+} ips_am_outoforder_q;
+
+static mpool_t ips_am_msg_pool;
+
+/* This calculation ensures that the number of reply slots will always be at
+ * least twice as large + 1 as the number of request slots. This is optimal: the
+ * minimum amount required is actually only twice as many, but it is much
+ * slower. */
+#define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1)
+
+/* Initialize AM state for one proto: split num_send_slots into request
+ * and reply scb pools (replies get the larger share, see
+ * calc_optimal_num_reply_slots) and lazily create the process-wide
+ * out-of-order message pool on first call. Returns PSM2_OK or the
+ * scbctrl init error. */
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+ int num_send_slots,
+ uint32_t imm_size,
+ struct ips_proto_am *proto_am)
+{
+ psm2_error_t err = PSM2_OK;
+ int send_buf_size = proto->ep->context.ctrl->__hfi_piosize;
+ int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots);
+ int num_req_slots = num_send_slots - num_rep_slots;
+
+ proto_am->proto = proto;
+
+ /* In a node pair, the number of reply send buffers on at least one of
+ * the nodes must be at least double the number (optimal: double + 1) of
+ * send descriptors on the other node. While this constraint applies
+ * only to the reply send buffers, allowing the caller to tune only the
+ * number of request send buffers would be awkward, as they have no
+ * knowledge of the subdivision of the memory into separate mempools for
+ * requests and replies. It's an internal concern at this point. */
+ if ((err = ips_scbctrl_init(&proto->ep->context,
+ num_req_slots,
+ num_req_slots,
+ imm_size,
+ send_buf_size,
+ NULL,
+ NULL,
+ &proto_am->scbc_request)))
+ goto fail;
+
+ /* NOTE(review): if this second init fails, scbc_request is not
+ * torn down here -- presumably the caller's error path handles
+ * overall cleanup; confirm. */
+ if ((err = ips_scbctrl_init(&proto->ep->context,
+ num_rep_slots,
+ num_rep_slots,
+ imm_size,
+ send_buf_size,
+ NULL,
+ NULL,
+ &proto_am->scbc_reply)))
+ goto fail;
+
+ /* First proto to initialize creates the shared OOO queue + pool. */
+ if (ips_am_msg_pool == NULL) {
+ union psmi_envvar_val max_msgs;
+
+ ips_am_outoforder_q.head.next = NULL;
+ ips_am_outoforder_q.tail = &ips_am_outoforder_q.head;
+
+ psmi_getenv("PSM2_AM_MAX_OOO_MSGS",
+ "Maximum number of OOO Active Messages to queue before dropping.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)1024, &max_msgs);
+
+ ips_am_msg_pool = psmi_mpool_create(
+ sizeof(struct ips_am_message),
+ 32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL);
+ }
+fail:
+ return err;
+}
+MOCK_DEF_EPILOGUE(ips_proto_am_init);
+
+/* Tear down AM state: free both scb pools and, if still present, the
+ * process-wide OOO message pool (NULLed so a later init re-creates it).
+ * Always returns PSM2_OK. */
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am)
+{
+ ips_scbctrl_fini(&proto_am->scbc_request);
+ ips_scbctrl_fini(&proto_am->scbc_reply);
+ if (ips_am_msg_pool != NULL) {
+ psmi_mpool_destroy(ips_am_msg_pool);
+ ips_am_msg_pool = NULL;
+ }
+
+ return PSM2_OK;
+}
+
+/* Fill in AM capabilities parameters */
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+ int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS);
+ int max_payload =
+ ep->context.ctrl->__hfi_piosize -
+ ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t));
+
+ if (parameters == NULL) {
+ return PSM2_PARAM_ERR;
+ }
+
+ parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS;
+ parameters->max_nargs = max_nargs;
+ parameters->max_request_short = max_payload;
+ parameters->max_reply_short = max_payload;
+
+ return PSM2_OK;
+}
+
+/* Common send path for AM requests and replies.
+ * Packs args into the header qwords, then places the payload either
+ * inline in the remaining header space (AMISTINY), or in the scb buffer
+ * (copied, or attached when the buffer slot is empty). Finally stamps
+ * the opcode + AM sequence number and enqueues/flushes on the message
+ * flow. Always returns PSM2_OK. */
+static
+psm2_error_t
+am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr,
+ psm2_amarg_t *args, int nargs, uint8_t opcode,
+ void *src, size_t len, int flags, int pad_bytes)
+{
+ int i, hdr_qwords = IPS_AM_HDR_NARGS;
+ struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+
+ /* There are a limited number of bits for nargs in the header, making
+ overflow very easy. Make sure the values match. */
+ psmi_assert(nargs == scb->ips_lrh.amhdr_nargs);
+
+ _HFI_VDBG("%s src=%p len=%d, nargs=%d\n",
+ ((opcode == OPCODE_AM_REQUEST) ||
+ (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? "req" : "rep",
+ src, (int)len, nargs);
+
+ if (nargs == 1) { /* fastpath */
+ scb->ips_lrh.data[0].u64w0 = args[0].u64w0;
+ hdr_qwords--;
+ } else if (nargs > 1) {
+ /* Easily unrollable but leave as is in case we can increase
+ * qwords on the chip in the near future */
+ for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--)
+ scb->ips_lrh.data[i].u64w0 = args[i].u64w0;
+
+ if (nargs > IPS_AM_HDR_NARGS) {
+ /* Slow case -- we don't have iovec and not enough
+ * space in the message header, so we have to copy the
+ * user's arguments even if the payload is marked ASYNC
+ */
+ uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb);
+ size_t arg_payload_len =
+ sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS);
+
+ psmi_mq_mtucpy((void *)bufp,
+ &args[IPS_AM_HDR_NARGS],
+ arg_payload_len);
+ bufp += arg_payload_len;
+ scb->payload_size = arg_payload_len;
+
+ if (src != NULL && len > 0) {
+ psmi_mq_mtucpy((void *)bufp, src, len);
+ scb->payload_size += len;
+ }
+
+ /* amhdr_len carries only the dword padding here; the
+ * receiver subtracts it from the payload length. */
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+ scb->payload_size += pad_bytes;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ goto send_scb;
+ }
+ }
+
+ if (len == 0) {
+ scb->payload_size = 0;
+ scb->ips_lrh.amhdr_len = 0;
+ } else if (len <= (hdr_qwords << 3)) {
+ /* Inline the payload into the header. */
+ /* This path CANNOT handle length = 0 due to limited space
+ in the header. If IPS_SEND_FLAG_AMISTINY is set, an
+ amhdr_len value of 0 means a full payload, i.e.
+ 1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */
+ psmi_assert(len > 0);
+
+ psmi_mq_mtucpy(&scb->ips_lrh.
+ data[IPS_AM_HDR_NARGS - hdr_qwords], src, len);
+ scb->payload_size = 0;
+ psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS));
+ scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1);
+ scb->flags |= IPS_SEND_FLAG_AMISTINY;
+ } else { /* Whatever's left requires a separate payload */
+ if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */
+ ips_scb_buffer(scb) = src;
+ else /* May need to re-xmit user data, keep it around */
+ psmi_mq_mtucpy(ips_scb_buffer(scb), src, len);
+
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+ scb->payload_size = len + pad_bytes;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ }
+
+send_scb:
+ ips_scb_opcode(scb) = opcode;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++;
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ return PSM2_OK;
+}
+
+/* Number of bytes (0..3) needed to round len up to a dword boundary. */
+static inline int
+calculate_pad_bytes(size_t len)
+{
+ /* Align to dword (4 bytes) */
+ size_t dword_aligned_len = (len + 3) & ~3;
+ return dword_aligned_len - len;
+}
+
+/* Populate the AM fields of a freshly-allocated scb: handler index,
+ * arg count, pad byte count (provisional amhdr_len -- am_short_reqrep
+ * may overwrite it) and the completion callback. An ACK is requested
+ * whenever a completion callback is supplied. */
+static inline
+void
+ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs,
+ int pad_bytes,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+
+ scb->completion_am = completion_fn;
+ scb->cb_param = completion_ctxt;
+ scb->ips_lrh.amhdr_hidx = handler;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ scb->ips_lrh.amhdr_nargs = nargs;
+ scb->ips_lrh.flags = 0;
+ if (completion_fn)
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ return;
+}
+
+/* Send a short AM request.
+ * Sizes the scb from nargs (and len unless PSM2_AM_FLAG_ASYNC defers
+ * the payload), blocking until an scb is available, then round-robins
+ * across rails and hands off to am_short_reqrep. NOREPLY requests use
+ * a distinct opcode so the peer knows no reply slot is needed. */
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ struct ips_proto_am *proto_am = &epaddr->proto->proto_am;
+ psm2_error_t err;
+ ips_scb_t *scb;
+ ips_epaddr_t *ipsaddr;
+ int pad_bytes = calculate_pad_bytes(len);
+ int payload_sz = (nargs << 3);
+
+ if_pt(!(flags & PSM2_AM_FLAG_ASYNC))
+ payload_sz += len;
+
+ if (payload_sz > (IPS_AM_HDR_NARGS << 3)) {
+ /* Payload can't fit in header, allocate buffer to carry data */
+ int arg_sz = (nargs > IPS_AM_HDR_NARGS) ?
+ ((nargs - IPS_AM_HDR_NARGS) << 3) : 0;
+
+ /* len + pad_bytes + overflow_args */
+ PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+ err,
+ ((scb = ips_scbctrl_alloc(
+ &proto_am->scbc_request,
+ 1,
+ len + pad_bytes + arg_sz,
+ IPS_SCB_FLAG_ADD_BUFFER)) != NULL));
+ } else {
+ PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+ err,
+ ((scb = ips_scbctrl_alloc_tiny(
+ &proto_am->scbc_request)) != NULL));
+ }
+
+ psmi_assert_always(scb != NULL);
+ ips_am_scb_init(scb, handler, nargs, pad_bytes,
+ completion_fn, completion_ctxt);
+
+ /* Select the next ipsaddr for multi-rail */
+ ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+
+ return am_short_reqrep(scb, ipsaddr, args,
+ nargs,
+ (flags & PSM2_AM_FLAG_NOREPLY) ?
+ OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST,
+ src, len, flags, pad_bytes);
+}
+
+/* Send a short AM reply from inside a request handler.
+ * Only legal when the token came from OPCODE_AM_REQUEST (can_reply);
+ * the reply scb is drawn from scbc_reply, whose availability was
+ * guaranteed before the request handler ran (see ips_proto_am), so the
+ * allocation here is expected to succeed. The reply is sent on the
+ * same rail the request arrived on. */
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+ struct ips_am_token *token = (struct ips_am_token *)tok;
+ struct ips_proto_am *proto_am = token->proto_am;
+ struct ips_epaddr *ipsaddr = token->epaddr_rail;
+ int pad_bytes = calculate_pad_bytes(len);
+ int scb_flags = 0;
+ ips_scb_t *scb;
+
+ if (!token->tok.can_reply) {
+ _HFI_ERROR("Invalid AM reply for request!");
+ return PSM2_AM_INVALID_REPLY;
+ }
+
+ psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply));
+
+ if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) {
+ scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply);
+ } else {
+ int payload_sz = (nargs << 3);
+
+ payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ?
+ 0 : (len + pad_bytes);
+ scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ?
+ IPS_SCB_FLAG_ADD_BUFFER : 0;
+
+ scb =
+ ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz,
+ scb_flags);
+ }
+
+ psmi_assert_always(scb != NULL);
+ ips_am_scb_init(scb, handler, nargs, pad_bytes,
+ completion_fn, completion_ctxt);
+ am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY,
+ src, len, flags, pad_bytes);
+ return PSM2_OK;
+}
+
+/* Prepares and runs a handler from a receive event. */
+static int
+ips_am_run_handler(const struct ips_message_header *p_hdr,
+ struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am,
+ uint64_t *payload,
+ uint32_t paylen)
+{
+ struct ips_am_token token;
+ int nargs = p_hdr->amhdr_nargs;
+ psm2_am_handler_fn_t hfn;
+ psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data;
+
+ token.tok.flags = p_hdr->flags;
+ token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr;
+ token.tok.can_reply =
+ (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST);
+ token.epaddr_rail = ipsaddr;
+ token.proto_am = proto_am;
+
+ if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) {
+ /* Payload is packed into header after args */
+ payload = (uint64_t *)&p_hdr->data[nargs].u64;
+ paylen = p_hdr->amhdr_len;
+ /* Interpret amhdr_len == 0 as 16 bytes of payload */
+ if (paylen == 0)
+ paylen = 1 << IPS_AM_HDR_LEN_BITS;
+ } else {
+ if (nargs > IPS_AM_HDR_NARGS) {
+ /* Args are split across header and payload */
+ int payload_args_len =
+ (nargs - IPS_AM_HDR_NARGS) *
+ sizeof(psm2_amarg_t);
+
+ args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t));
+
+ args[0].u64 = p_hdr->data[0].u64;
+ args[1].u64 = p_hdr->data[1].u64;
+
+ memcpy(&args[2], payload, payload_args_len);
+
+ payload += nargs - IPS_AM_HDR_NARGS;
+ paylen -= payload_args_len;
+ }
+
+ /* Subtract off padding bytes (dword padding) for non-TINY. */
+ paylen -= p_hdr->amhdr_len;
+ }
+
+ hfn = psm_am_get_handler_function(proto_am->proto->ep,
+ p_hdr->amhdr_hidx);
+
+ int ret = hfn(&token, args, nargs, payload, paylen);
+ return ret;
+}
+
+/* Drain the shared out-of-order queue: run (and unlink/free) every
+ * queued message whose seqnum now matches its endpoint's expected
+ * am_recv_seqnum; leave the rest queued. Returns IPS_RECVHDRQ_BREAK if
+ * any handler asked to break, else IPS_RECVHDRQ_CONTINUE. */
+static int
+ips_proto_am_handle_outoforder_queue()
+{
+ struct ips_am_message *msg, *prev;
+ int ret = IPS_RECVHDRQ_CONTINUE;
+
+ prev = &ips_am_outoforder_q.head;
+ msg = ips_am_outoforder_q.head.next;
+
+ while (msg != NULL) {
+ struct ips_epaddr *ipsaddr = msg->ipsaddr;
+ if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) {
+ prev = msg;
+ msg = msg->next;
+ continue;
+ }
+
+ ipsaddr->msgctl->am_recv_seqnum++;
+
+ if (ips_am_run_handler(&msg->p_hdr,
+ ipsaddr, msg->proto_am,
+ msg->payload, msg->paylen))
+ ret = IPS_RECVHDRQ_BREAK;
+
+ /* Unlink; keep the tail pointer valid if we removed the
+ * last element. */
+ prev->next = msg->next;
+ if (prev->next == NULL)
+ ips_am_outoforder_q.tail = prev;
+
+ psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload);
+ psmi_mpool_put(msg);
+
+ msg = prev->next;
+ }
+
+ return ret;
+}
+
+/* Append a buffered out-of-order message to the shared FIFO queue. */
+static void
+ips_proto_am_queue_msg(struct ips_am_message *msg)
+{
+ msg->next = NULL;
+ ips_am_outoforder_q.tail->next = msg;
+ ips_am_outoforder_q.tail = msg;
+}
+
+/* Receive-side entry point for AM packets (installed in the packet
+ * service table). Enforces message ordering: in-order messages run
+ * their handler immediately and then drain the OOO queue; future
+ * messages are either revisited later or copied into the OOO queue.
+ * Returns an IPS_RECVHDRQ_* code to the receive loop. */
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ struct ips_am_message *msg = NULL;
+ int ret = IPS_RECVHDRQ_CONTINUE;
+ enum ips_msg_order msgorder;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ /*
+ * Based on AM request/reply traffic pattern, if we don't have a reply
+ * scb slot then we can't process the request packet, we just silently
+ * drop it. Otherwise, it will be a deadlock. note:
+ * ips_proto_is_expected_or_nak() can not be called in this case.
+ */
+ if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST &&
+ !ips_scbctrl_avail(&proto_am->scbc_reply))
+ return IPS_RECVHDRQ_CONTINUE;
+
+ if (!ips_proto_is_expected_or_nak(rcv_ev))
+ return IPS_RECVHDRQ_CONTINUE;
+
+ uint16_t send_msgseq =
+ __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+ msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq,
+ &ipsaddr->msgctl->am_recv_seqnum);
+
+ if (msgorder == IPS_MSG_ORDER_FUTURE)
+ return IPS_RECVHDRQ_REVISIT;
+ else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+ /* Packet-order OK but message arrived early: buffer a copy
+ * (header + payload) on the shared OOO queue. */
+ uint64_t *msg_payload;
+ uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(paylen == 0 || payload);
+ msg = psmi_mpool_get(ips_am_msg_pool);
+ if (unlikely(msg == NULL)) {
+ /* Out of memory, drop the packet; rewinding
+ * recv_seq_num forces the peer to resend it. */
+ flow->recv_seq_num.psn_num =
+ (flow->recv_seq_num.psn_num - 1) &
+ rcv_ev->proto->psn_mask;
+ return IPS_RECVHDRQ_BREAK;
+ }
+ msg_payload = psmi_mq_sysbuf_alloc(
+ msg->proto_am->proto->mq,
+ ips_recvhdrq_event_paylen(rcv_ev));
+ if (unlikely(msg_payload == NULL)) {
+ /* Out of memory, drop the packet. */
+ flow->recv_seq_num.psn_num =
+ (flow->recv_seq_num.psn_num - 1) &
+ rcv_ev->proto->psn_mask;
+ psmi_mpool_put(msg);
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header));
+ memcpy(msg_payload, payload, paylen);
+
+ msg->payload = msg_payload;
+ msg->ipsaddr = ipsaddr;
+ msg->proto_am = proto_am;
+ msg->paylen = paylen;
+ msg->seqnum =
+ __le32_to_cpu(p_hdr->khdr.kdeth0) &
+ HFI_KHDR_MSGSEQ_MASK;
+
+ ips_proto_am_queue_msg(msg);
+ } else if ((msgorder == IPS_MSG_ORDER_EXPECTED) ||
+ (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) {
+ uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(paylen == 0 || payload);
+ if (ips_am_run_handler(p_hdr, ipsaddr, proto_am,
+ payload, paylen))
+ ret = IPS_RECVHDRQ_BREAK;
+
+ /* This message may have unblocked queued successors. */
+ ips_proto_am_handle_outoforder_queue();
+ }
+
+ /* Look if the handler replied, if it didn't, ack the request */
+ if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+ (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+ ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+ ips_proto_process_ack(rcv_ev);
+ return ret;
+}
diff --git a/ptl_ips/ips_proto_am.h b/ptl_ips/ips_proto_am.h
new file mode 100644
index 0000000..3e0a271
--- /dev/null
+++ b/ptl_ips/ips_proto_am.h
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_AM_H
+#define _IPS_PROTO_AM_H
+
+#include "psm_user.h"
+#include "ips_scb.h"
+
+/*
+ * Per-proto active-message (AM) state: a back pointer to the owning
+ * ips_proto plus two scb (send control block) controllers, named for
+ * outgoing AM requests and AM replies respectively.
+ */
+struct ips_proto_am {
+ struct ips_proto *proto; /* back pointer */
+ struct ips_scbctrl scbc_request;
+ struct ips_scbctrl scbc_reply;
+};
+
+/* Report the AM limits (arg count / message sizes) for endpoint 'ep'. */
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+/* Send a short AM reply for the request identified by token 'tok'. */
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt);
+
+/* Send a short AM request to the peer at 'epaddr'. */
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* Initialize 'proto_am'; MOCKABLE for unit testing (psm2_mock_testing.h). */
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+ int num_send_slots,
+ uint32_t imm_size,
+ struct ips_proto_am *proto_am);
+MOCK_DCL_EPILOGUE(ips_proto_am_init);
+
+/* Release resources held by 'proto_am'. */
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am);
+
+#endif /* _IPS_PROTO_AM_H */
diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c
new file mode 100644
index 0000000..e537d10
--- /dev/null
+++ b/ptl_ips/ips_proto_connect.c
@@ -0,0 +1,1551 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * define connection version. this is the basic version, optimized
+ * version will be added later for scalability.
+ */
+#define IPS_CONNECT_VERNO 0x0001
+
+/*
+ * Wire header common to all connect-protocol messages (connect and
+ * disconnect, request and reply). Disconnects carry only this header.
+ */
+struct ips_connect_hdr {
+ uint16_t connect_verno; /* should be ver 1 */
+ uint16_t psm_verno; /* should be 2.0 */
+ uint32_t connidx; /* ignore if 0xffffffff */
+ uint64_t epid; /* epid of connector process */
+};
+
+/*
+ * Full wire payload for connect request/reply. The first four fields
+ * must remain layout-identical to ips_connect_hdr (see comment below)
+ * so the receiver can decode the common header before dispatching.
+ * Additional (gid_hi, epid) uint64 pairs for extra rails may follow
+ * this struct in the packet payload.
+ */
+struct ips_connect_reqrep {
+ uint16_t connect_verno; /* should be ver 1 */
+ uint16_t psm_verno; /* should be 2.0 */
+ uint32_t connidx; /* ignore if 0xffffffff */
+ uint64_t epid; /* epid of connector process */
+ /* above should be same as ips_connect_hdr */
+
+ uint16_t connect_result; /* error code */
+ uint16_t sl; /* service level for matching */
+ uint16_t mtu; /* receive payload */
+ uint16_t job_pkey; /* partition key for verification */
+
+ uint32_t runid_key; /* one-time stamp connect key */
+ uint32_t initpsn; /* initial psn for flow */
+
+ char hostname[128]; /* sender's hostname string */
+};
+
+/* Startup protocol in PSM/IPS
+ *
+ * Start timer.
+ *
+ * For all nodes to connect to:
+ * Grab connect lock
+ * Look up epid in table
+ * MATCH.
+ * assert cstate_outgoing != CONNECT_WAITING (no re-entrancy)
+ * If cstate_outgoing == CONNECT_DONE
+ * return the already connected address.
+ * else
+ * assert cstate_outgoing == CONNECT_NONE
+ * assert cstate_incoming == CONNECT_DONE
+ * cstate_outgoing := CONNECT_WAITING
+ * assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN
+ * req->connidx := epaddr->connidx_incoming
+ * add to list of pending connect.
+ * NO MATCH
+ * allocate epaddr and put in table
+ * cstate_outgoing := CONNECT_WAITING
+ * cstate_incoming := CONNECT_NONE
+ * connidx_outgoing := UNKNOWN
+ * req->connidx := epaddr->connidx_incoming := NEW connidx integer
+ * add to list of pending connect
+ * Release connect lock
+ *
+ * expected_connect_count = ep->total_connect_count + num_to_connect
+ * while (expected_connect_count != ep->total_connect_count)
+ * check for timeout
+ * progress();
+ *
+ * For all connection requests received (within progress loop)
+ * If uuid doesn't match, NAK the connect and skip request
+ * Grab connect lock
+ * Lock up epid in table
+ * MATCH
+ * if cstate_incoming == CONNECT_DONE
+ * req->connidx := epaddr->connidx_incoming
+ * compose reply and send again (this is a dupe request).
+ * else
+ * assert cstate_incoming == CONNECT_NONE
+ * assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE)
+ * cstate_incoming := CONNECT_DONE
+ * epaddr->connidx_outgoing := req->connidx
+ * req->connidx := epaddr->connidx_incoming
+ * NO MATCH
+ * allocate epaddr and put in table
+ * cstate_incoming := CONNECT_DONE
+ * epaddr->connidx_outgoing = req->connidx;
+ * rep->connidx := epaddr->connidx_incoming := NEW connidx integer
+ * compose connect reply and send
+ * Release connect lock
+ *
+ * For all connection replies received:
+ * If connect_result != 0, process error and skip.
+ * assert cstate_outgoing == CONNECT_WAITING
+ * if cstate_incoming == CONNECT_DONE
+ * assert rep->connidx == epaddr->connidx_outgoing
+ * else
+ * epaddr->connidx_outgoing := rep->connidx
+ * cstate_outgoing := CONNECT_DONE
+ * ep->total_connect_count ++
+ *
+ * * Fill in a connection request:
+ * 1. Set connect protocol version and PSM versions
+ * 2. Set the uuid attached to current endpoint and add the job_pkey
+ * the node wishes to communicate post-connect.
+ * 3. Set our mtu, bitwidth and endianess to detect inconsistencies
+ *
+ */
+
+/**
+ * Configure flows for an ipsaddr.
+ *
+ * @arg ipsaddr - the ipsaddr to configure the flows for
+ * @arg proto - the protocol used
+ *
+ * @pre proto's flags must be set
+ *
+ * Flows should be configured:
+ * - immediately upon creation of an ipsaddr
+ * - whenever a connection is established and the receiver's characteristics
+ * (e.g. mtu) become known
+ */
+/* (Re)initializes both go-back-N flows; called at ipsaddr creation and
+ * again once the connect handshake has established the peer's MTU (see
+ * block comment above). */
+ustatic
+void
+ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+ /* PIO flow uses the normal priority path, to separate low
+ * priority path for bulk sdma data packets
+ */
+ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto,
+ ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO);
+
+ /* DMA flow uses the low priority path, multi MTU sized eager
+ * message uses the same flow to transfer to avoid out of order.
+ */
+ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], proto,
+ ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_DMA);
+}
+
+/*
+ * Teardown any unnecessary timers that could still be active and assign NULL
+ * to pointers in flow structs. We do this mainly for PIO and DMA flows.
+ * TidFlow teardowns are conducted in ips_protoexp_fini()
+ */
+static
+void
+ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+ struct ips_flow *flow;
+ int i;
+
+ /* Only the PIO/DMA flows (indices below EP_FLOW_TIDFLOW) are torn
+ * down here; TidFlow teardown happens in ips_protoexp_fini() (see
+ * block comment above). */
+ for (i = 0; i < EP_FLOW_TIDFLOW; i++) {
+ flow = &ipsaddr->flows[i];
+
+ /* Cancel any stale flow->timers in flight */
+ if (flow->timer_ack) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ }
+
+ if (flow->timer_send) {
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+ }
+
+ /* Clear back references so a stale flow pointer cannot reach
+ * state that is about to be freed. */
+ flow->flush = NULL;
+ flow->path = NULL;
+ flow->ipsaddr = NULL;
+ }
+}
+
+/* Forward declaration; implementation below. */
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+ const char *hostname, uint16_t hfi_type, unsigned long timeout);
+
+/*
+ * Given a connection request, set mtu, communication index and hdr length
+ * parameters.
+ *
+ * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu
+ * is our connecting peer's declared mtu (which may not be the same as our
+ * mtu). The approach is to take the smaller of both mtus when communicating
+ * with that peer. Also, when using pio, the size can be further restricted by
+ * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers).
+ */
+static
+psm2_error_t
+ips_ipsaddr_set_req_params(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr,
+ const struct ips_connect_reqrep *req,
+ uint32_t paylen)
+{
+ psm2_ep_t ep;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ int i, start, count;
+ uint64_t *data;
+ psmi_assert_always(req->mtu > 0);
+ /* Use the smaller of the peer's advertised MTU and our own. */
+ uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu);
+ int ptype, pidx;
+
+ /*
+ * Make RNDV window size being dependent on MTU size;
+ * This is due to fact that number of send packets
+ * within a given window must not exceed 2048 (@ref PSM_TID_MAX_PKTS).
+ * Use smaller of two values:
+ * unified MTU * PSM_TID_MAX_PKTS vs already configured window size.
+ */
+ ipsaddr->window_rv = min(common_mtu * PSM_TID_MAX_PKTS, proto->mq->hfi_base_window_rv);
+
+ /*
+ * For static routes i.e. "none" path resolution update all paths to
+ * have the same profile (mtu, sl etc.).
+ *
+ * For path record queries the epr_mtu and epr_sl are setup correctly
+ * from the path itself.
+ */
+ for (ptype = IPS_PATH_LOW_PRIORITY;
+ ptype < IPS_PATH_MAX_PRIORITY; ptype++)
+ for (pidx = 0;
+ pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) {
+ if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) {
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+ common_mtu;
+ } else {
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+ min(common_mtu,
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu);
+ }
+ }
+
+ /*
+ * We've got updated mtu/path records, need to re-initialize the flows to take
+ * into account _real_ (updated) remote endpoint characteristics
+ */
+ ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+ /*
+ * Save peer's info.
+ */
+ ipsaddr->connidx_outgoing = req->connidx;
+ ipsaddr->runid_key = req->runid_key;
+ /* ipsaddr->initpsn = req->initpsn; */
+
+ err =
+ psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid),
+ (char *)req->hostname, 0);
+ if (err)
+ return err;
+
+ /*
+ * Check if there is other rails to setup.
+ */
+ paylen -= sizeof(struct ips_connect_reqrep);
+ if (paylen == 0)
+ return PSM2_OK;
+
+ /*
+ * Yes, other rail's gid/epid is attached.
+ * The trailing payload is a flat array of (gid_hi, epid) uint64
+ * pairs, one pair per additional rail; reject anything that is not
+ * an exact multiple of a pair, or more pairs than HFI_MAX_RAILS.
+ */
+ if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) {
+ return PSM2_INTERNAL_ERR;
+ }
+ count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t));
+ if (count > HFI_MAX_RAILS)
+ return PSM2_INTERNAL_ERR;
+
+ /*
+ * Both side are ordered, so just search from small to big.
+ */
+ start = 0;
+ data = (uint64_t *) (req + 1);
+ ep = proto->ep->mctxt_next;
+
+ /* Seed a per-pair PRNG so both sides derive the same rail choice
+ * from the two epids. */
+ struct drand48_data drand48_data;
+ srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data);
+
+ /* Loop over all slave endpoints */
+ while (ep != ep->mctxt_master) {
+ for (i = start; i < count; i++) {
+
+ /* There is a gid match, create the epaddr */
+ if (data[2 * i] == ep->gid_hi) {
+
+ epaddr =
+ ips_alloc_epaddr(&ep->ptl_ips.ptl->proto, 0,
+ data[2 * i + 1], NULL,
+ PSMI_HFI_TYPE_OPA1,
+ 5000);
+ if (epaddr == NULL)
+ return PSM2_NO_MEMORY;
+
+ /* link the ipsaddr */
+ IPS_MCTXT_APPEND(ipsaddr,
+ (ips_epaddr_t *) epaddr);
+
+ /* Setup message control info to the same struct */
+ ((ips_epaddr_t *) epaddr)->msgctl =
+ ipsaddr->msgctl;
+ ipsaddr->msgctl->ipsaddr_count++;
+
+ /* randomize the rail to start traffic */
+ long int rnum;
+ lrand48_r(&drand48_data, &rnum);
+ if ((rnum % count) == i) {
+ ipsaddr->msgctl->ipsaddr_next =
+ (ips_epaddr_t *) epaddr;
+ }
+
+ /* update the starting point,
+ * all previous ones are not valid anymore */
+ start = i + 1;
+ break;
+ }
+ }
+
+ ep = ep->mctxt_next;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Build a connect-protocol message for flow->ipsaddr and retry sending
+ * it, polling the endpoint for progress between attempts, until it is
+ * sent, a poll error occurs, or 'timeout' (absolute, in cycles) passes.
+ */
+static psm2_error_t
+ips_proto_send_ctrl_message_request(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask, uint64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ips_scb_t ctrlscb;
+ /* msg header plus gid+epid for all rails plus checksum */
+ char payload[sizeof(struct ips_connect_reqrep) +
+ 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES];
+ uint32_t paylen;
+
+ ctrlscb.flags = 0;
+ paylen = ips_proto_build_connect_message(proto,
+ flow->ipsaddr, message_type, payload);
+ psmi_assert_always(paylen <= sizeof(payload));
+
+ do {
+ err = ips_proto_send_ctrl_message(flow, message_type,
+ msg_queue_mask, &ctrlscb, payload, paylen);
+ if (err == PSM2_OK) {
+ break;
+ }
+ if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) {
+ break;
+ }
+ } while (get_cycles() < timeout);
+
+ return err;
+}
+
+static psm2_error_t
+ips_proto_send_ctrl_message_reply(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask)
+{
+ /* This will try up to 100 times until the message is sent. The code
+ * is persistent because dropping replies will lead to a lack of
+ * overall progress on the connection/disconnection. We do not want
+ * to poll from here, and we cannot afford a lengthy timeout, since
+ * this is called from the receive path.
+ */
+ psm2_error_t err = PSM2_OK;
+ int i;
+ ips_scb_t ctrlscb;
+ /* msg header plus gid+epid for all rails plus checksum */
+ char payload[sizeof(struct ips_connect_reqrep) +
+ 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES];
+ uint32_t paylen;
+
+ ctrlscb.flags = 0;
+ paylen = ips_proto_build_connect_message(proto,
+ flow->ipsaddr, message_type, payload);
+ psmi_assert_always(paylen <= sizeof(payload));
+
+ /* Bounded inline retry (no polling) -- see comment above. */
+ for (i = 0; i < 100; i++) {
+ err = ips_proto_send_ctrl_message(flow, message_type,
+ msg_queue_mask, &ctrlscb, payload, paylen);
+ if (err == PSM2_OK) {
+ break;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Fill 'payload' with the wire message for the given connect opcode and
+ * return the number of payload bytes written. Connect request/reply
+ * carry a full ips_connect_reqrep (plus one (gid_hi, epid) uint64 pair
+ * per additional rail when this is the multi-context master endpoint);
+ * disconnect request/reply carry only the ips_connect_hdr.
+ */
+int
+ips_proto_build_connect_message(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr,
+ uint8_t opcode, void *payload)
+{
+ struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+ struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload;
+ uint32_t paylen = 0;
+
+ psmi_assert_always(proto != NULL);
+
+ /* Common header, present in every message variant. */
+ hdr->connect_verno = IPS_CONNECT_VERNO;
+ hdr->psm_verno = PSMI_VERNO;
+ hdr->connidx = (uint32_t) ipsaddr->connidx_incoming;
+ hdr->epid = proto->ep->epid;
+
+ switch (opcode) {
+ case OPCODE_CONNECT_REPLY:
+ case OPCODE_CONNECT_REQUEST:
+ if (opcode == OPCODE_CONNECT_REQUEST) {
+ req->connect_result = PSM2_OK;
+ req->runid_key = proto->runid_key;
+ } else {
+ req->connect_result = ipsaddr->cerror_incoming;
+ req->runid_key = ipsaddr->runid_key;
+ }
+
+ req->sl = proto->epinfo.ep_sl;
+ req->mtu = proto->epinfo.ep_mtu;
+ req->job_pkey = proto->epinfo.ep_pkey;
+
+ /* Explicitly NUL-terminated below, so strncpy is safe here. */
+ strncpy(req->hostname, psmi_gethostname(),
+ sizeof(req->hostname) - 1);
+ req->hostname[sizeof(req->hostname) - 1] = '\0';
+
+ paylen = sizeof(struct ips_connect_reqrep);
+
+ /* Attach all multi-context subnetids and epids. */
+ if (proto->ep->mctxt_master == proto->ep) {
+ psm2_ep_t ep = proto->ep->mctxt_next;
+ uint64_t *data = (uint64_t *) (req + 1);
+ while (ep != proto->ep) {
+ *data = ep->gid_hi;
+ paylen += sizeof(uint64_t);
+ data++;
+ *data = ep->epid;
+ paylen += sizeof(uint64_t);
+ data++;
+ ep = ep->mctxt_next;
+ }
+ }
+
+ break;
+
+ case OPCODE_DISCONNECT_REQUEST:
+ case OPCODE_DISCONNECT_REPLY:
+ paylen = sizeof(struct ips_connect_hdr);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unexpected/unhandled connection opcode 0x%x\n",
+ opcode);
+ break;
+ }
+
+ return paylen;
+}
+
+/*
+ * Initialize a single flow on 'ipsaddr': choose the flush routine (PIO
+ * vs DMA), select a path of the requested type, derive the fragment
+ * size, and reset sequence numbers, credits, flags and timers.
+ * MOCKABLE for unit testing (psm2_mock_testing.h).
+ */
+void
+MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index)
+{
+ psmi_assert_always(protocol < PSM_PROTOCOL_LAST);
+ psmi_assert_always(flow_index < EP_FLOW_LAST);
+
+ SLIST_NEXT(flow, next) = NULL;
+ if (transfer_type == PSM_TRANSFER_PIO) {
+ flow->flush = ips_proto_flow_flush_pio;
+ } else {
+ flow->flush = ips_proto_flow_flush_dma;
+ }
+
+ flow->path =
+ ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp);
+
+ /* Select the fragment size for this flow. Flow is the common
+ * denominator between the local endpoint, the remote endpoint,
+ * the path between those and whether it's a PIO or DMA send.
+ * Hence, it "owns" the maximum transmission unit in its frag_size
+ * member.
+ */
+
+ /* min of local MTU and path MTU */
+ flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu);
+ /* if PIO, need to consider local pio buffer size */
+ if (transfer_type == PSM_TRANSFER_PIO) {
+ flow->frag_size = min(flow->frag_size, proto->epinfo.ep_piosize);
+ _HFI_VDBG("[ipsaddr=%p] PIO flow->frag_size: %u = min("
+ "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u), proto->epinfo.ep_piosize(%u))\n",
+ ipsaddr, flow->frag_size, proto->epinfo.ep_mtu,
+ flow->path->pr_mtu, proto->epinfo.ep_piosize);
+ } else {
+ _HFI_VDBG("[ipsaddr=%p] SDMA flow->frag_size: %u = min("
+ "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n",
+ ipsaddr, flow->frag_size, proto->epinfo.ep_mtu,
+ flow->path->pr_mtu);
+ }
+
+ /* Reset all go-back-N bookkeeping to its initial state. */
+ flow->ipsaddr = ipsaddr;
+ flow->transfer = transfer_type;
+ flow->protocol = protocol;
+ flow->flowid = flow_index;
+ flow->xmit_seq_num.psn_val = 0;
+ flow->recv_seq_num.psn_val = 0;
+ flow->xmit_ack_num.psn_val = 0;
+ flow->flags = 0;
+ flow->cca_ooo_pkts = 0;
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1);
+ flow->ack_counter = 0;
+#ifdef PSM_DEBUG
+ flow->scb_num_pending = 0;
+ flow->scb_num_unacked = 0;
+#endif
+
+ flow->timer_ack = NULL;
+ flow->timer_send = NULL;
+
+ STAILQ_INIT(&flow->scb_unacked);
+ SLIST_INIT(&flow->scb_pend);
+ return;
+}
+MOCK_DEF_EPILOGUE(ips_flow_init);
+
+/*
+ * Allocate and initialize the ips-level epaddr for peer 'epid'.
+ * When 'master' is set, the per-peer ips_msgctl container (which embeds
+ * the master epaddr) is allocated; otherwise a bare ips_epaddr is
+ * allocated for an additional rail. Returns NULL on allocation or
+ * path-record failure; on success the epaddr is registered in the
+ * PSM epid table.
+ */
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+ const char *hostname, uint16_t hfi_type, unsigned long timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ ips_path_grp_t *pathgrp;
+ uint16_t lid;
+
+ /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl
+ * structures are collocated in memory for performance reasons -- this is
+ * why ips allocates memory for all three together.
+ *
+ * The PSM/PTL structure data is filled in upon successfully ep connect in
+ * ips_ptl_connect().
+ */
+ if (master) {
+ struct ips_msgctl *msgctl;
+
+ /* Although an ips_msgtl is allocated here, it can be safely casted to
+ both an ips_epaddr and a psm2_epaddr. It is eventually freed as an
+ ips_epaddr. */
+ msgctl =
+ (struct ips_msgctl *)psmi_calloc(proto->ep,
+ PER_PEER_ENDPOINT, 1,
+ sizeof(struct ips_msgctl));
+ if (msgctl == NULL)
+ return NULL;
+
+ ipsaddr = &msgctl->master_epaddr;
+ epaddr = (psm2_epaddr_t) ipsaddr;
+
+ ipsaddr->msgctl = msgctl;
+
+ /* initialize items in ips_msgctl_t */
+ msgctl->ipsaddr_next = ipsaddr;
+ msgctl->mq_send_seqnum = 0;
+ msgctl->mq_recv_seqnum = 0;
+ msgctl->am_send_seqnum = 0;
+ msgctl->am_recv_seqnum = 0;
+ msgctl->ipsaddr_count = 1;
+ msgctl->outoforder_count = 0;
+ } else {
+ epaddr =
+ (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1,
+ sizeof(struct ips_epaddr));
+ /* NOTE(review): unlike the master branch above, a calloc
+ * failure here trips an assert instead of returning NULL --
+ * confirm this asymmetry is intentional. */
+ psmi_assert_always(epaddr);
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ }
+
+ epaddr->ptlctl = proto->ptl->ctl;
+ epaddr->proto = proto;
+ epaddr->epid = epid;
+
+ /* IPS-level epaddr */
+ ipsaddr->next = ipsaddr;
+
+ ipsaddr->ctrl_msg_queued = 0;
+ ipsaddr->msg_toggle = 0;
+
+ /* Actual context of peer */
+ ipsaddr->context = PSMI_EPID_GET_CONTEXT(epid);
+ /* Subcontext */
+ ipsaddr->subcontext = PSMI_EPID_GET_SUBCONTEXT(epid);
+
+ /* Get path record for <service, slid, dlid> tuple */
+ lid = PSMI_EPID_GET_LID(epid);
+ err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid,
+ __cpu_to_be16(lid), hfi_type, timeout,
+ &pathgrp);
+ if (err != PSM2_OK) {
+ psmi_free(epaddr);
+ return NULL;
+ }
+ ipsaddr->pathgrp = pathgrp;
+
+ /* Setup high priority path index, control messages use the high
+ * priority CONTROL path.
+ */
+ if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE)
+ ipsaddr->hpp_index = 0;
+ else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+ ipsaddr->hpp_index = ipsaddr->context %
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+ else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+ ipsaddr->hpp_index = proto->epinfo.ep_context %
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+ else /* Base LID */
+ ipsaddr->hpp_index = 0;
+
+ /*
+ * Set up the flows on this ipsaddr
+ */
+ ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+ /* clear connection state. */
+ ipsaddr->cstate_outgoing = CSTATE_NONE;
+ ipsaddr->cstate_incoming = CSTATE_NONE;
+
+ /* Add epaddr to PSM's epid table */
+ psmi_epid_add(proto->ep, epaddr->epid, epaddr);
+ psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr);
+
+ return epaddr;
+}
+
+/*
+ * Tear down the flows, remove the epaddr from the PSM epid table and
+ * the epstate table, and free the allocation (for a master epaddr this
+ * is the enclosing ips_msgctl; see ips_alloc_epaddr).
+ */
+static
+void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto)
+{
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+ ips_flow_fini(ipsaddr, proto);
+
+ _HFI_VDBG("epaddr=%p,ipsaddr=%p,connidx_incoming=%d\n", epaddr, ipsaddr,
+ ipsaddr->connidx_incoming);
+ psmi_epid_remove(epaddr->proto->ep, epaddr->epid);
+ ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming);
+ psmi_free(epaddr);
+ return;
+}
+
+/* Forward declaration; implementation below. */
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto,
+ psm2_epaddr_t epaddr, struct ips_connect_reqrep *req,
+ uint32_t paylen);
+
+/*
+ * Dispatch an incoming connect-protocol packet (connect/disconnect,
+ * request/reply). 'payload' begins with an ips_connect_hdr carrying
+ * the sender's epid, which is used to look up any existing epaddr.
+ * Must be called with the progress lock held (asserted below).
+ */
+psm2_error_t
+ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode,
+ struct ips_message_header *p_hdr, void *payload,
+ uint32_t paylen)
+{
+ struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ psm2_error_t err = PSM2_OK;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ epaddr = psmi_epid_lookup(proto->ep, hdr->epid);
+ ipsaddr = epaddr ? (ips_epaddr_t *) epaddr : NULL;
+
+ switch (opcode) {
+ case OPCODE_CONNECT_REQUEST:
+ err = ptl_handle_connect_req(proto, epaddr,
+ (struct ips_connect_reqrep *)hdr,
+ paylen);
+ break;
+
+ case OPCODE_CONNECT_REPLY:
+ {
+ struct ips_connect_reqrep *req =
+ (struct ips_connect_reqrep *)payload;
+
+ if (!ipsaddr || req->runid_key != proto->runid_key) {
+ _HFI_PRDBG
+ ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid %d:%d:%d\n",
+ ipsaddr, req->runid_key, proto->runid_key,
+ (int)PSMI_EPID_GET_LID(hdr->epid),
+ (int)PSMI_EPID_GET_CONTEXT(hdr->epid),
+ (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid));
+ } else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) {
+ /* possible dupe */
+ _HFI_VDBG("connect dupe, expected %d got %d\n",
+ CSTATE_OUTGOING_WAITING,
+ ipsaddr->cstate_outgoing);
+ } else {
+ /* Reply to our request for connection (i.e. outgoing connection) */
+ if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) {
+ err =
+ ips_ipsaddr_set_req_params(proto,
+ ipsaddr,
+ req,
+ paylen);
+ if (err)
+ goto fail;
+ }
+ ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED;
+ ipsaddr->cerror_outgoing = req->connect_result;
+ }
+ }
+ break;
+
+ case OPCODE_DISCONNECT_REQUEST:
+ {
+ ips_epaddr_t ipsaddr_f; /* fake a ptl addr */
+ int epaddr_do_free = 0;
+ psmi_assert_always(paylen ==
+ sizeof(struct ips_connect_hdr));
+ _HFI_VDBG("Got a disconnect from %s\n",
+ psmi_epaddr_get_name(hdr->epid));
+ proto->num_disconnect_requests++;
+ /* It's possible to get a disconnection request on a ipsaddr that
+ * we've since removed if the request is a dupe. Instead of
+ * silently dropping the packet, we "echo" the request in the
+ * reply. */
+ if (ipsaddr == NULL) {
+ ips_path_grp_t *pathgrp;
+ uint16_t lid;
+
+ /* Build a minimal stack-local ipsaddr, just good
+ * enough to send one reply from. */
+ ipsaddr = &ipsaddr_f;
+ memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t));
+ ipsaddr_f.context =
+ PSMI_EPID_GET_CONTEXT(hdr->epid);
+ ipsaddr_f.subcontext =
+ PSMI_EPID_GET_SUBCONTEXT(hdr->epid);
+
+ /* Get path record for peer */
+ lid = PSMI_EPID_GET_LID(hdr->epid);
+ err = proto->ibta.get_path_rec(proto,
+ proto->epinfo.
+ ep_base_lid,
+ __cpu_to_be16(lid),
+ PSMI_HFI_TYPE_OPA1,
+ 3000, &pathgrp);
+ if (err != PSM2_OK)
+ goto fail;
+
+ ipsaddr_f.pathgrp = pathgrp;
+ ((psm2_epaddr_t) &ipsaddr_f)->ptlctl =
+ proto->ptl->ctl;
+ ((psm2_epaddr_t) &ipsaddr_f)->proto = proto;
+ /* If the send fails because of pio_busy, don't let ips queue
+ * the request on an invalid ipsaddr, just drop the reply */
+ ipsaddr_f.ctrl_msg_queued = ~0;
+
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+
+ ips_flow_init(&ipsaddr_f.
+ flows[proto->msgflowid], proto,
+ &ipsaddr_f, PSM_TRANSFER_PIO,
+ PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_LOW_PRIORITY,
+ EP_FLOW_GO_BACK_N_PIO);
+ _HFI_VDBG
+ ("Disconnect on unknown epaddr, just echo request\n");
+ } else if (ipsaddr->cstate_incoming != CSTATE_NONE) {
+ ipsaddr->cstate_incoming = CSTATE_NONE;
+ proto->num_connected_incoming--;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ epaddr_do_free = 1;
+ }
+ }
+
+ ips_proto_send_ctrl_message_reply(proto, &ipsaddr->
+ flows[proto->
+ msgflowid],
+ OPCODE_DISCONNECT_REPLY,
+ &ipsaddr->
+ ctrl_msg_queued);
+ /* We can safely free the ipsaddr if required since disconnect
+ * messages are never enqueued so no reference to ipsaddr is kept */
+ if (epaddr_do_free) {
+ ips_free_epaddr(epaddr, proto);
+ epaddr = NULL;
+ }
+ }
+ break;
+
+ case OPCODE_DISCONNECT_REPLY:
+ if (!ipsaddr) {
+ _HFI_VDBG
+ ("Unknown disconnect reply from epid %d:%d.%d\n",
+ (int)PSMI_EPID_GET_LID(hdr->epid),
+ (int)PSMI_EPID_GET_CONTEXT(hdr->epid),
+ (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid));
+ break;
+ } else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) {
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+ /* Freed in disconnect() if cstate_incoming == NONE */
+ } /* else dupe reply */
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unexpected/unhandled connect opcode 0x%x\n",
+ opcode);
+ }
+
+fail:
+ return err;
+}
+
+/*
+ * Handle an incoming connection request: abort on locally-detected
+ * duplicate LIDs, allocate a new epaddr when the peer is unknown,
+ * re-reply to duplicate requests, drop out-of-context requests,
+ * validate connect/psm versions, pkey and service level, then mark
+ * the incoming side established and send a CONNECT_REPLY.
+ */
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr,
+ struct ips_connect_reqrep *req, uint32_t paylen)
+{
+ ips_epaddr_t *ipsaddr;
+ psm2_error_t err = PSM2_OK;
+ uint16_t connect_result;
+ int newconnect = 0;
+
+ if (req->epid == proto->ep->epid) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR,
+ "Network connectivity problem: Locally detected duplicate "
+ "LIDs 0x%04x on hosts %s and %s. (Exiting)",
+ (uint32_t) psm2_epid_nid(req->epid),
+ psmi_epaddr_get_hostname(req->epid),
+ psmi_gethostname());
+ /* XXX no return */
+ abort();
+ } else if (epaddr == NULL) { /* new ep connect before we call into connect */
+ newconnect = 1;
+ if ((epaddr =
+ ips_alloc_epaddr(proto, 1, req->epid, req->hostname,
+ PSMI_HFI_TYPE_OPA1,
+ 5000)) == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) {
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ /* Duplicate lid detection. */
+ if (ipsaddr->runid_key == req->runid_key)
+ goto do_reply; /* duplicate request, not duplicate lid */
+ else { /* Some out of context message. Just drop it */
+ if (!proto->done_warning) {
+ psmi_syslog(proto->ep, 1, LOG_INFO,
+ "Non-fatal connection problem: Received an out-of-context "
+ "connection message from host %s LID=0x%x context=%d. (Ignoring)",
+ req->hostname,
+ (int)psm2_epid_nid(req->epid),
+ psm2_epid_context(req->epid));
+ proto->done_warning = 1;
+ }
+ goto no_reply;
+ }
+ } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) {
+ /* pre-created epaddr in multi-rail */
+ psmi_assert_always(epaddr->proto->ep !=
+ epaddr->proto->ep->mctxt_master);
+ newconnect = 1;
+ }
+
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE);
+
+ /* Check connect version and psm version */
+ if (req->connect_verno < 0x0001) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION,
+ "Connect protocol (%x,%x) is obsolete and incompatible",
+ (req->connect_verno >> 8) & 0xff,
+ req->connect_verno & 0xff);
+ connect_result = PSM2_EPID_INVALID_CONNECT;
+ } else if (!psmi_verno_isinteroperable(req->psm_verno)) {
+ connect_result = PSM2_EPID_INVALID_VERSION;
+ } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) &&
+ proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY &&
+ proto->epinfo.ep_pkey != req->job_pkey) {
+ connect_result = PSM2_EPID_INVALID_PKEY;
+ } else if (req->sl != proto->epinfo.ep_sl) {
+ connect_result = PSM2_EPID_INVALID_CONNECT;
+ _HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl);
+ } else {
+ connect_result = PSM2_OK;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ ips_epstate_idx idx;
+ psmi_assert_always(newconnect == 1);
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ }
+ }
+
+ /* Incoming connection request */
+ if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) {
+ err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen);
+ if (err)
+ goto fail;
+ }
+ ipsaddr->cstate_incoming = CSTATE_ESTABLISHED;
+ ipsaddr->cerror_incoming = connect_result;
+
+ ipsaddr->runid_key = req->runid_key;
+
+ proto->num_connected_incoming++;
+
+do_reply:
+ /* The reply carries 'connect_result' via cerror_incoming so the
+ * peer learns whether validation succeeded. */
+ ips_proto_send_ctrl_message_reply(proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_CONNECT_REPLY,
+ &ipsaddr->ctrl_msg_queued);
+no_reply:
+fail:
+ return err;
+}
+
+/*
+ * Actively connect to the peers listed in array_of_epid (only entries
+ * whose array_of_epid_mask[i] is nonzero are considered).  On return,
+ * array_of_errors[i] holds the per-peer result and array_of_epaddr[i]
+ * the endpoint address (NULL when the connect failed).  timeout_in
+ * bounds the retry loop (see psmi_cycles_left); caller must hold the
+ * MQ progress lock.  Returns the worst per-peer error encountered.
+ */
+psm2_error_t
+ips_proto_connect(struct ips_proto *proto, int numep,
+ const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+ int i, n, n_first;
+ psm2_error_t err = PSM2_OK;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ ips_epstate_idx idx;
+ int numep_toconnect = 0, numep_left;
+ union psmi_envvar_val credits_intval;
+ int connect_credits;
+
+ /* Credits bound how many connect requests may be in flight at once. */
+ psmi_getenv("PSM2_CONNECT_CREDITS",
+ "End-point connect request credits.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)100, &credits_intval);
+
+ connect_credits = credits_intval.e_uint;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ /* All timeout values are in cycles */
+ uint64_t t_start = get_cycles();
+ /* Print a timeout at the warning interval */
+ union psmi_envvar_val warn_intval;
+ uint64_t to_warning_interval;
+ uint64_t to_warning_next;
+
+ /* Setup warning interval */
+ psmi_getenv("PSM2_CONNECT_WARN_INTERVAL",
+ "Period in seconds to warn if connections are not completed."
+ "Default is 300 seconds, 0 to disable",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)300, &warn_intval);
+
+ to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL);
+ to_warning_next = t_start + to_warning_interval;
+
+ /* Some sanity checks */
+ psmi_assert_always(array_of_epid_mask != NULL);
+
+ /* First pass: make sure array of errors is at least fully defined */
+ for (i = 0; i < numep; i++) {
+ _HFI_VDBG("epid-connect=%s connect to %d:%d:%d\n",
+ array_of_epid_mask[i] ? "YES" : " NO",
+ (int)PSMI_EPID_GET_LID(array_of_epid[i]),
+ (int)PSMI_EPID_GET_CONTEXT(array_of_epid[i]),
+ (int)PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i]));
+ if (array_of_epid_mask[i]) {
+ array_of_errors[i] = PSM2_EPID_UNKNOWN;
+ array_of_epaddr[i] = NULL;
+ }
+ }
+
+ /* Second pass: see what to connect and what is connectable. */
+ for (i = 0, numep_toconnect = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i])
+ continue;
+
+ /* Can't send to epid on same lid if not loopback */
+ if ((psm2_epid_nid(proto->ep->epid) ==
+ psm2_epid_nid(array_of_epid[i])) &&
+ !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) {
+ array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+ continue;
+ }
+
+ if ((PSMI_EPID_VERSION == PSMI_EPID_V2)
+ && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) !=
+ PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Trying to connect to a HFI (subnet id - %"PRIu64")on a"
+ " different subnet - %"PRIu64" \n",
+ PSMI_GET_SUBNET_ID(proto->ep->gid_hi),
+ PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]));
+ }
+
+ epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]);
+ if (epaddr == NULL) {
+ /* We're sending a connect request message before some other node
+ * has sent its connect message */
+ epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i],
+ NULL,
+ PSMI_HFI_TYPE_OPA1,
+ (timeout_in / 1000000UL));
+ if (epaddr == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { /* already connected */
+ psmi_assert_always(((ips_epaddr_t *) epaddr)->
+ cstate_outgoing == CSTATE_ESTABLISHED);
+ array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED;
+ array_of_epaddr[i] = epaddr;
+ continue;
+ } else if (((ips_epaddr_t *) epaddr)->cstate_incoming ==
+ CSTATE_NONE) {
+ /* pre-created epaddr in multi-rail */
+ psmi_assert_always(epaddr->proto->ep !=
+ epaddr->proto->ep->mctxt_master);
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ } else {
+ /* We've already received a connect request message from a remote
+ * peer, it's time to send our own. */
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ /* No re-entrancy sanity check and makes sure we are not connected
+ * twice (caller's precondition) */
+ psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE);
+ psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE);
+ }
+
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING;
+ ipsaddr->cerror_outgoing = PSM2_OK;
+ array_of_epaddr[i] = epaddr;
+ ipsaddr->s_timeout = get_cycles();
+ ipsaddr->delay_in_ms = 1;
+ ipsaddr->credit = 0;
+ numep_toconnect++;
+ }
+
+ /* Second pass: do the actual connect.
+ * PSM2_EPID_UNKNOWN: Not connected yet.
+ * PSM2_EPID_UNREACHABLE: Not to be connected.
+ * PSM2_OK: Successfully connected.
+ * Start sending connect messages at a random index between 0 and numep-1
+ */
+ numep_left = numep_toconnect;
+ n_first = ((uint32_t) get_cycles()) % numep;
+ while (numep_left > 0) {
+ for (n = 0; n < numep; n++) {
+ int keep_polling = 1;
+ i = (n_first + n) % numep;
+ if (!array_of_epid_mask[i])
+ continue;
+ switch (array_of_errors[i]) {
+ case PSM2_EPID_UNREACHABLE:
+ case PSM2_EPID_ALREADY_CONNECTED:
+ case PSM2_OK:
+ continue;
+ default:
+ break;
+ }
+ psmi_assert_always(array_of_epaddr[i] != NULL);
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+ /* This is not the real error code, we only set OK here
+ * so we know to stop polling for the reply. The actual
+ * error is in ipsaddr->cerror_outgoing */
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ connect_credits++;
+ ipsaddr->credit = 0;
+ continue;
+ }
+ while (keep_polling) {
+ if (!psmi_cycles_left(t_start, timeout_in)) {
+ err = PSM2_TIMEOUT;
+ goto err_timeout;
+ }
+ if (to_warning_interval
+ && get_cycles() >= to_warning_next) {
+#if _HFI_DEBUGGING
+ uint64_t waiting_time = 0;
+ if (_HFI_INFO_ON) {
+ waiting_time = cycles_to_nanosecs(
+ get_cycles() -
+ t_start) / SEC_ULL;
+ }
+#endif
+ const char *first_name = NULL;
+ int num_waiting = 0;
+
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i] ||
+ array_of_errors[i] !=
+ PSM2_EPID_UNKNOWN)
+ continue;
+ if (!first_name)
+ first_name =
+ psmi_epaddr_get_name
+ (array_of_epid[i]);
+ num_waiting++;
+ }
+ if (_HFI_INFO_ON) {
+ if (first_name) {
+ _HFI_INFO_ALWAYS
+ ("Couldn't connect to %s (and %d others). "
+ "Time elapsed %02i:%02i:%02i. Still trying...\n",
+ first_name, num_waiting,
+ (int)(waiting_time / 3600),
+ (int)((waiting_time / 60) -
+ ((waiting_time /
+ 3600) * 60)),
+ (int)(waiting_time -
+ ((waiting_time /
+ 60) * 60)));
+ }
+ }
+ to_warning_next =
+ get_cycles() + to_warning_interval;
+ }
+
+ /* Retry timer expired for this peer: (re)send its request,
+ * pacing resends with exponential backoff capped at 100ms. */
+ if (get_cycles() > ipsaddr->s_timeout) {
+ if (!ipsaddr->credit && connect_credits) {
+ ipsaddr->credit = 1;
+ connect_credits--;
+ }
+ if (ipsaddr->credit) {
+ _HFI_VDBG
+ ("Connect req to %u:%u:%u\n",
+ __be16_to_cpu(ipsaddr->
+ pathgrp->
+ pg_base_lid),
+ ipsaddr->context,
+ ipsaddr->subcontext);
+ if (
+ ips_proto_send_ctrl_message_request
+ (proto, &ipsaddr->
+ flows[proto->msgflowid],
+ OPCODE_CONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ 0) == PSM2_OK) {
+ keep_polling = 0;
+ ipsaddr->delay_in_ms =
+ min(100,
+ ipsaddr->
+ delay_in_ms <<
+ 1);
+ ipsaddr->s_timeout =
+ get_cycles() +
+ nanosecs_to_cycles
+ (ipsaddr->
+ delay_in_ms *
+ MSEC_ULL);
+ }
+ /* If not, send got "busy", keep trying */
+ } else {
+ keep_polling = 0;
+ }
+ }
+
+ if ((err =
+ psmi_err_only(psmi_poll_internal
+ (proto->ep, 1))))
+ goto fail;
+
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+ /* This is not the real error code, we only set OK here
+ * so we know to stop polling for the reply. The actual
+ * error is in ipsaddr->cerror_outgoing */
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ connect_credits++;
+ ipsaddr->credit = 0;
+ break;
+ }
+ }
+ }
+ }
+
+err_timeout:
+ /* Find the worst error to report */
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i])
+ continue;
+ switch (array_of_errors[i]) {
+ /* These are benign */
+ case PSM2_EPID_UNREACHABLE:
+ case PSM2_EPID_ALREADY_CONNECTED:
+ break;
+ case PSM2_EPID_UNKNOWN:
+ array_of_errors[i] = PSM2_TIMEOUT;
+ err = psmi_error_cmp(err, PSM2_TIMEOUT);
+ break;
+ case PSM2_OK:
+ /* Restore the real connect error */
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ array_of_errors[i] = ipsaddr->cerror_outgoing;
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ if (ipsaddr->cerror_outgoing != PSM2_OK) {
+ err = psmi_error_cmp(err, ipsaddr->cerror_outgoing);
+ ips_free_epaddr(array_of_epaddr[i], proto);
+ array_of_epaddr[i] = NULL;
+ } else {
+ proto->num_connected_outgoing++;
+ psmi_assert_always(ipsaddr->pathgrp->
+ pg_path[0]
+ [IPS_PATH_HIGH_PRIORITY]->
+ pr_mtu > 0);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+fail:
+ return err;
+}
+
+/* Repercussions on MQ.
+ *
+ * If num_connected==0, everything that exists in the posted queue should
+ * complete and the error must be marked epid_was_closed.
+ *
+ */
+
+/*
+ * Disconnect the peers in array_of_epaddr (entries whose
+ * array_of_epaddr_mask[i] is nonzero).  With force==0 a graceful close
+ * is attempted: wait for unacked sends to drain, then exchange
+ * DISCONNECT_REQUEST messages until the peer transitions to
+ * DISCONNECTED or timeout_in (nanoseconds; 0 means no limit) expires.
+ * With force!=0 pending timers are cancelled and a single best-effort
+ * DISCONNECT_REQUEST is sent before forcing the local state to
+ * DISCONNECTED.  Per-peer results land in array_of_errors.  Caller
+ * must hold the MQ progress lock.
+ */
+psm2_error_t
+ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+ psm2_epaddr_t array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+ ips_epaddr_t *ipsaddr;
+ int numep_left, numep_todisc, i, n;
+ int n_first;
+ int has_pending;
+ uint64_t timeout;
+ psm2_error_t err = PSM2_OK;
+ uint64_t reqs_sent = 0;
+ union psmi_envvar_val credits_intval;
+ int disconnect_credits;
+ uint64_t t_warning, t_start;
+ union psmi_envvar_val warn_intval;
+ unsigned warning_secs;
+
+ /* In case of a forced close, we cancel whatever timers are pending
+ * on the proto so that we don't have zombie timers coming back
+ * after the internal structures of PSM2 have been destroyed
+ */
+ if (force) {
+ struct psmi_timer *t_cursor;
+ TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) {
+ psmi_timer_cancel(proto->timerq, t_cursor);
+ }
+ }
+
+ psmi_assert_always(numep > 0);
+
+ /* Credits bound how many disconnect requests may be in flight at once. */
+ psmi_getenv("PSM2_DISCONNECT_CREDITS",
+ "End-point disconnect request credits.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)100, &credits_intval);
+
+ disconnect_credits = credits_intval.e_uint;
+
+ /* Setup warning interval */
+ psmi_getenv("PSM2_DISCONNECT_WARN_INTERVAL",
+ "Period in seconds to warn if disconnections are not completed."
+ "Default is 300 seconds, 0 to disable.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)300, &warn_intval);
+
+ warning_secs = warn_intval.e_uint;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ /* First pass: see what to disconnect and what is disconnectable */
+ for (i = 0, numep_todisc = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i])
+ continue;
+ psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl ==
+ proto->ptl);
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ ipsaddr->credit = 0;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ array_of_errors[i] = PSM2_OK;
+ continue;
+ } else {
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ }
+ _HFI_VDBG("disconnecting %p\n", array_of_epaddr[i]);
+ array_of_errors[i] = PSM2_EPID_UNKNOWN;
+ numep_todisc++;
+ }
+ if (numep_todisc == 0)
+ goto success;
+
+ /* Wait for everyone to ack previous packets before putting */
+ if (timeout_in == 0)
+ timeout = ~0ULL;
+ else
+ timeout = get_cycles() + nanosecs_to_cycles(timeout_in);
+
+ t_start = get_cycles();
+ t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL);
+
+ /* Start at a random peer index to spread load across peers. */
+ n_first = ((uint32_t) get_cycles()) % numep;
+ if (!force) {
+ numep_left = numep_todisc;
+ do {
+ for (n = 0; n < numep; n++) {
+ i = (n_first + n) % numep;
+ if (!array_of_epaddr_mask[i]
+ || array_of_errors[i] == PSM2_OK)
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ switch (ipsaddr->cstate_outgoing) {
+ case CSTATE_OUTGOING_DISCONNECTED:
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ disconnect_credits++;
+ ipsaddr->credit = 0;
+ continue;
+ case CSTATE_OUTGOING_WAITING_DISC:
+ if (ipsaddr->s_timeout > get_cycles())
+ continue;
+ ipsaddr->delay_in_ms =
+ min(100, ipsaddr->delay_in_ms << 1);
+ ipsaddr->s_timeout = get_cycles() +
+ nanosecs_to_cycles(ipsaddr->
+ delay_in_ms *
+ MSEC_ULL);
+ ips_proto_send_ctrl_message_request
+ (proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ timeout);
+ reqs_sent++;
+ break;
+ case CSTATE_ESTABLISHED:
+ /* Still pending acks, hold off for now */
+ has_pending =
+ !STAILQ_EMPTY(&ipsaddr->flows
+ [EP_FLOW_GO_BACK_N_PIO].
+ scb_unacked)
+ ||
+ !STAILQ_EMPTY(&ipsaddr->flows
+ [EP_FLOW_GO_BACK_N_DMA].
+ scb_unacked);
+ if (has_pending)
+ continue;
+ if (!ipsaddr->credit
+ && disconnect_credits) {
+ ipsaddr->credit = 1;
+ disconnect_credits--;
+ }
+ if (!ipsaddr->credit)
+ continue;
+ ipsaddr->delay_in_ms = 1;
+ ipsaddr->cstate_outgoing =
+ CSTATE_OUTGOING_WAITING_DISC;
+ ipsaddr->s_timeout =
+ get_cycles() +
+ nanosecs_to_cycles(MSEC_ULL);
+ ips_proto_send_ctrl_message_request
+ (proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ timeout);
+ reqs_sent++;
+ break;
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unhandled/unknown close state %d",
+ ipsaddr->cstate_outgoing);
+ break;
+ }
+ }
+ if (numep_left == 0)
+ break;
+
+ if ((err =
+ psmi_err_only(psmi_poll_internal(proto->ep, 1))))
+ goto fail;
+
+ if (warning_secs && get_cycles() > t_warning) {
+ _HFI_INFO
+ ("graceful close in progress for %d/%d peers "
+ "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+ numep_left, numep_todisc,
+ (int)(cycles_to_nanosecs
+ (get_cycles() - t_start) / MSEC_ULL),
+ (int)(timeout_in / MSEC_ULL),
+ (unsigned long long)reqs_sent);
+ t_warning =
+ get_cycles() +
+ nanosecs_to_cycles(warning_secs * SEC_ULL);
+ }
+ }
+ while (timeout > get_cycles());
+
+ if (numep_left > 0) {
+ err = PSM2_TIMEOUT;
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNKNOWN) {
+ array_of_errors[i] = PSM2_TIMEOUT;
+ _HFI_VDBG
+ ("disc timeout on index %d, epaddr %s\n",
+ i,
+ psmi_epaddr_get_name
+ (array_of_epaddr[i]->epid));
+ }
+ }
+ _HFI_PRDBG("graceful close incomplete for %d/%d peers "
+ "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+ numep_left, numep_todisc,
+ (int)(cycles_to_nanosecs
+ (get_cycles() - t_start) / MSEC_ULL),
+ (int)(timeout_in / MSEC_ULL),
+ (unsigned long long)reqs_sent);
+ } else
+ _HFI_PRDBG
+ ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n",
+ numep_todisc,
+ (int)(cycles_to_nanosecs(get_cycles() - t_start) /
+ MSEC_ULL), (unsigned long long)reqs_sent);
+ } else {
+ for (n = 0; n < numep; n++) {
+ i = (n_first + n) % numep;
+ if (!array_of_epaddr_mask[i])
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ ips_proto_send_ctrl_message_request(proto, &ipsaddr->
+ flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ 0);
+ /* Force state to DISCONNECTED */
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+ array_of_errors[i] = PSM2_OK;
+ }
+ _HFI_VDBG("non-graceful close complete from %d peers\n", numep);
+ }
+
+ /* Final pass: tear down local state for peers that disconnected. */
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK)
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE)
+ continue;
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_OUTGOING_DISCONNECTED);
+ proto->num_connected_outgoing--;
+ /* Remote disconnect req arrived already, remove this epid. If it
+ * hasn't arrived yet, that's okay, we'll pick it up later and just
+ * mark our connect-to status as being "none". */
+ if (ipsaddr->cstate_incoming == CSTATE_NONE) {
+ ips_free_epaddr(array_of_epaddr[i], proto);
+ array_of_epaddr[i] = NULL;
+ } else
+ ipsaddr->cstate_outgoing = CSTATE_NONE;
+ }
+
+fail:
+success:
+ return err;
+}
+
+int ips_proto_isconnected(ips_epaddr_t *ipsaddr)
+{
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED ||
+ ipsaddr->cstate_incoming == CSTATE_ESTABLISHED)
+ return 1;
+ else
+ return 0;
+}
diff --git a/ptl_ips/ips_proto_dump.c b/ptl_ips/ips_proto_dump.c
new file mode 100644
index 0000000..3e3e8e7
--- /dev/null
+++ b/ptl_ips/ips_proto_dump.c
@@ -0,0 +1,255 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_proto_header.h"
+#include "ips_proto_help.h"
+
/*
 * Hex-dump 'length' bytes starting at 'frame' to stdout, 16 bytes per
 * row with an extra gap every 4 bytes.  'message' optionally labels the
 * dump; NULL selects a default placeholder.
 *
 * Fix: parameter was misspelled "lenght"; renamed to "length" (local
 * name only, no effect on callers or ABI).
 */
void ips_proto_dump_frame(void *frame, int length, char *message)
{
	uint8_t *raw_frame = frame;
	int counter;
	char default_message[] = "<UNKNOWN>";

	if (!message)
		message = default_message;

	printf("\nHex dump of %i bytes at %p from %s\n", length, frame,
	       message);

	for (counter = 0; counter < length; counter++) {
		/* Row break every 16 bytes. */
		if ((counter % 16) == 0)
			printf("\n");

		/* Column gap every 4 bytes. */
		if ((counter % 4) == 0)
			printf(" ");

		printf("%02X ", raw_frame[counter]);
	}
	printf("\n");
}
+
/* Hex-dump an arbitrary payload to stdout: 16 bytes per row, each row
 * prefixed with its decimal offset, with a gap every 4 bytes. */
void ips_proto_dump_data(void *data, int data_length)
{
	uint8_t *bytes = (uint8_t *) data;
	int idx;

	printf("\nHex dump of data, length = %i\n", data_length);

	for (idx = 0; idx < data_length; idx++) {
		/* New row of 16 bytes, labeled with its starting offset. */
		if (idx % 16 == 0)
			printf("\n %04d: ", idx);

		/* Extra gap every 4 bytes for readability. */
		if (idx % 4 == 0)
			printf(" ");

		printf("%02X ", bytes[idx]);
	}
	printf("\n");
}
+
+/*
+ * Pretty-print the fields of an ips_message_header to stdout for
+ * debugging: the IB LRH and BTH words, the KDETH words, and the
+ * PSM-specific trailer fields.  'msg' is an optional label.  Values
+ * are byte-swapped from wire order before printing.
+ */
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg)
+{
+ psmi_seqnum_t ack_seq_num;
+
+ printf("\nHeader decoding in hex: %s\n", msg ? msg : "");
+
+ printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n",
+ __be16_to_cpu(p_hdr->lrh[0]));
+ printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1]));
+ printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2]));
+ printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3]));
+
+ printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n",
+ __be32_to_cpu(p_hdr->bth[0]));
+ printf("BTH: F1-B1-Res6-DestQP24 %x\n", __be32_to_cpu(p_hdr->bth[1]));
+ printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2]));
+
+ printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1));
+ printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n",
+ __le32_to_cpu(p_hdr->khdr.kdeth0));
+
+ printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr));
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ /* Expected (TID) packets carry flow/gen/seq in the BTH; TID-flow
+ * traffic splits ack_seq_num into gen/seq; everything else uses the
+ * plain PSN number. */
+ if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+ printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n",
+ (__be32_to_cpu(p_hdr->bth[1]) >>
+ HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK,
+ (__be32_to_cpu(p_hdr->bth[2]) >>
+ HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK,
+ (__be32_to_cpu(p_hdr->bth[2]) >>
+ HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK);
+ else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW)
+ printf("ack_seq_num gen %x, seq %x\n",
+ ack_seq_num.psn_gen, ack_seq_num.psn_seq);
+ else
+ printf("ack_seq_num %x\n", ack_seq_num.psn_num);
+
+ printf("src_rank/connidx %x\n", p_hdr->connidx);
+ if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+ printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc);
+ printf("flags %x\n", p_hdr->flags);
+}
+
/*
 * Minimal strlcat() replacement (glibc historically lacked it): append
 * 's' to 'd' without writing more than 'l' bytes total, always
 * NUL-terminating when there is room.  Used rarely, for short
 * diagnostic strings only.
 *
 * Not fully standards conforming: on truncation this returns the
 * existing length plus the bytes actually appended plus the
 * terminator, whereas BSD strlcat() returns the length it tried to
 * create.  Callers in this file ignore the return value.
 *
 * Fix: use size_t throughout instead of int — the original compared a
 * signed 'dlen' against the unsigned 'l' and truncated strlen() results
 * into int.  Behavior (including the return value) is unchanged.
 */
static size_t strlcat(char *d, const char *s, size_t l)
{
	size_t dlen = strlen(d), slen, max;

	/* Destination already fills (or exceeds) the buffer: caller bug,
	 * leave 'd' untouched. */
	if (l <= dlen)
		return l;
	slen = strlen(s);
	max = l - (dlen + 1);	/* room left, excluding the terminator */
	if (slen > max)
		slen = max;	/* truncate the source to fit */
	memcpy(d + dlen, s, slen);
	d[dlen + slen] = '\0';
	return dlen + slen + 1;	/* see conformance note above */
}
+
+/* decode RHF errors; only used one place now, may want more later */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len)
+{
+ *msg = '\0'; /* if no errors, and so don't need to check what's first */
+
+ if (err & HFI_RHF_ICRCERR)
+ strlcat(msg, "icrcerr ", len);
+ if (err & HFI_RHF_ECCERR)
+ strlcat(msg, "eccerr ", len);
+ if (err & HFI_RHF_LENERR)
+ strlcat(msg, "lenerr ", len);
+ if (err & HFI_RHF_TIDERR)
+ strlcat(msg, "tiderr ", len);
+ if (err & HFI_RHF_DCERR)
+ strlcat(msg, "dcerr ", len);
+ if (err & HFI_RHF_DCUNCERR)
+ strlcat(msg, "dcuncerr ", len);
+ if (err & HFI_RHF_KHDRLENERR)
+ strlcat(msg, "khdrlenerr ", len);
+}
+
+void ips_proto_dump_err_stats(struct ips_proto *proto)
+{
+ char err_stat_msg[2048];
+ char tmp_buf[128];
+ int len = sizeof(err_stat_msg);
+
+ if (!(hfi_debug & __HFI_PKTDBG))
+ return;
+
+ *err_stat_msg = '\0';
+
+ if (proto->error_stats.num_icrc_err ||
+ proto->error_stats.num_ecc_err ||
+ proto->error_stats.num_len_err ||
+ proto->error_stats.num_tid_err ||
+ proto->error_stats.num_dc_err ||
+ proto->error_stats.num_dcunc_err ||
+ proto->error_stats.num_khdrlen_err) {
+
+ snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: ");
+
+ if (proto->error_stats.num_icrc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %" PRIu64 " ",
+ proto->error_stats.num_icrc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_ecc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %" PRIu64 " ",
+ proto->error_stats.num_ecc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_len_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %" PRIu64 " ",
+ proto->error_stats.num_len_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_tid_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "TID: %" PRIu64 " ",
+ proto->error_stats.num_tid_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_dc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "DC: %" PRIu64 " ",
+ proto->error_stats.num_dc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_dcunc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf),
+ "DCUNC: %" PRIu64 " ",
+ proto->error_stats.num_dcunc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_khdrlen_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf),
+ "KHDRLEN: %" PRIu64 " ",
+ proto->error_stats.num_khdrlen_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+ strlcat(err_stat_msg, "\n", len);
+ } else
+ strlcat(err_stat_msg, "No previous errors.\n", len);
+
+ _HFI_ERROR("%s", err_stat_msg);
+}
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
new file mode 100644
index 0000000..c0ca988
--- /dev/null
+++ b/ptl_ips/ips_proto_expected.c
@@ -0,0 +1,2957 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Nonzero once the hfi1 user API version has been probed and found to be
+ * >= 6.3, i.e. the driver supports SDMA without header suppression for
+ * messages of 8 dwords or less.  Latched once at protoexp init time. */
+static uint32_t hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw = 0;
+
+/* Probe the hfi1 user API major/minor version and latch the capability
+ * flag above.  Called from ips_protoexp_init(). */
+void
+ips_protoexp_hfi1_check_dma_no_hdrsupp_for_msgs_leq_8dw(void)
+{
+ if ((hfi_get_user_major_version() > 6) ||
+ (hfi_get_user_major_version() == 6 &&
+ hfi_get_user_minor_version() >= 3)) {
+ hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw = 1;
+ }
+}
+
+/*
+ * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _HFI_EXP _HFI_VDBG
+
+/*
+ * Timer callbacks. When we need work to be done out of the receive process
+ * loop, we schedule work on timers to be done at a later time.
+ */
+static psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static void
+ips_protoexp_do_tf_seqerr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr);
+static void
+ips_protoexp_do_tf_generr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr);
+
+static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+static void ips_tid_avail_callback(struct ips_tid *tidc, void *context);
+static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context);
+
+/* Defined at the ptl-level (breaks abstractions but needed for shared vs
+ * non-shared contexts */
+extern int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc);
+
+#ifdef PSM_CUDA
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+ struct ips_tid_send_desc *tidsendc);
+static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+ psm2_mq_req_t req,
+ struct ips_tid_send_desc *tidsendc,
+ struct ips_cuda_hostbuf *chb_prev,
+ uint32_t tsess_srcoff,
+ uint32_t tsess_length,
+ uint32_t tsess_unaligned_start,
+ psm2_chb_match_type_t type);
+#endif
+
+/*
+ * Allocate and initialize the expected-protocol (TID) state for an endpoint.
+ *
+ * On success *protoexp_o receives the new ips_protoexp and PSM2_OK is
+ * returned.  On failure everything allocated so far is torn down via the
+ * 'fail' label and an error code is returned.  Initialization order matters:
+ * tidflow control, then tid control, then scb control, then the send/getreq
+ * memory pools, then the requeue timers.
+ *
+ * NOTE(review): num_of_send_bufs is accepted but not referenced in this
+ * function body - presumably kept for interface compatibility; confirm
+ * against callers before removing.
+ */
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+ const struct ips_proto *proto,
+ uint32_t protoexp_flags,
+ int num_of_send_bufs,
+ int num_of_send_desc, struct ips_protoexp **protoexp_o)
+{
+ ips_protoexp_hfi1_check_dma_no_hdrsupp_for_msgs_leq_8dw();
+
+ struct ips_protoexp *protoexp = NULL;
+ uint32_t tidmtu_max;
+ psm2_error_t err = PSM2_OK;
+
+ protoexp = (struct ips_protoexp *)
+ psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+ if (protoexp == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ *protoexp_o = protoexp;
+
+ protoexp->ptl = (const struct ptl *)proto->ptl;
+ protoexp->proto = (struct ips_proto *)proto;
+ protoexp->timerq = proto->timerq;
+ /* Seed the per-protoexp reentrant PRNG (used for tidflow generation). */
+ srand48_r((long int) getpid(), &protoexp->tidflow_drand48_data);
+ protoexp->tid_flags = protoexp_flags;
+ /* Header suppression: honored only if the hardware/driver advertises
+ * the capability; PSM2_HDRSUPP=0 lets the user force it off, in which
+ * case the context's tidflow-valid flag is cleared as well. */
+ if (context->runtime_flags & HFI1_CAP_HDRSUPP) {
+ union psmi_envvar_val env_hdrsupp;
+
+ psmi_getenv("PSM2_HDRSUPP",
+ "header suppression(0 disables it)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)1, &env_hdrsupp);
+ if (env_hdrsupp.e_uint)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_HDR_SUPP;
+ else
+ /* user wants to turn off header suppression */
+ context->ctrl->__hfi_tfvalid = 0;
+ }
+
+ if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) {
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED;
+ }
+
+ {
+ /*
+ * Adjust the session window size so that tid-grant message can
+ * fit into a single frag size packet for single transfer, PSM
+ * must send tid-grant message with a single packet.
+ */
+ uint32_t fragsize, winsize;
+
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ fragsize = proto->epinfo.ep_mtu;
+ else
+ fragsize = proto->epinfo.ep_piosize;
+
+ winsize = 2 * PSMI_PAGESIZE /* bytes per tid-pair */
+ /* space in packet */
+ * min((fragsize - sizeof(ips_tid_session_list)),
+ /* space in tidsendc/tidrecvc descriptor */
+ PSM_TIDLIST_BUFSIZE)
+ / sizeof(uint32_t); /* convert to tid-pair */
+
+ if (proto->mq->hfi_base_window_rv > winsize)
+ proto->mq->hfi_base_window_rv = winsize;
+ }
+
+ /* Must be initialized already */
+ /* Commented out because of Klocwork scanning critical error. CQ 11/16/2012
+ psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+ proto->ep->mq->rreq_pool != NULL &&
+ proto->ep->mq->sreq_pool != NULL);
+ */
+ psmi_assert_always(proto->timerq != NULL);
+ /* Make sure pbc is at the right place before the message header */
+ psmi_assert_always(sizeof(struct hfi_pbc) == (size_t)
+ (offsetof(struct ips_scb, ips_lrh) -
+ offsetof(struct ips_scb, pbc)));
+
+ /* These request pools are managed by the MQ component */
+ protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+ protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+ /* tid traffic xfer type */
+ if (proto->flags & IPS_PROTO_FLAG_SPIO)
+ protoexp->tid_xfer_type = PSM_TRANSFER_PIO;
+ else
+ protoexp->tid_xfer_type = PSM_TRANSFER_DMA;
+
+ /* ctrl ack/nak xfer type */
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ protoexp->ctrl_xfer_type = PSM_TRANSFER_DMA;
+ else
+ protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO;
+
+ /* Initialize tid flow control. */
+ err = ips_tf_init(protoexp, context, &protoexp->tfc,
+ ips_tidflow_avail_callback);
+ if (err != PSM2_OK)
+ goto fail;
+
+ if (proto->flags & IPS_PROTO_FLAG_SPIO)
+ tidmtu_max = proto->epinfo.ep_piosize;
+ else
+ tidmtu_max = proto->epinfo.ep_mtu;
+
+ protoexp->tid_send_fragsize = tidmtu_max;
+
+ if ((err = ips_tid_init(context, protoexp,
+ ips_tid_avail_callback, protoexp)))
+ goto fail;
+
+ if ((err = ips_scbctrl_init(context, num_of_send_desc, 0,
+ 0, 0, ips_tid_scbavail_callback,
+ protoexp, &protoexp->tid_scbc_rv)))
+ goto fail;
+
+ {
+ /* Determine interval to generate headers (relevant only when header
+ * suppression is enabled) else headers will always be generated.
+ *
+ * The PSM2_EXPECTED_HEADERS environment variable can specify the
+ * packet interval to generate headers at. Else a header packet is
+ * generated every
+ * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize).
+ * Note: A header is always generated for the last packet in the flow.
+ */
+
+ union psmi_envvar_val env_exp_hdr;
+ uint32_t defval = min(PSM_DEFAULT_EXPECTED_HEADER,
+ proto->mq->hfi_base_window_rv /
+ protoexp->tid_send_fragsize);
+
+ psmi_getenv("PSM2_EXPECTED_HEADERS",
+ "Interval to generate expected protocol headers",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)defval, &env_exp_hdr);
+
+ protoexp->hdr_pkt_interval = env_exp_hdr.e_uint;
+ /* Account for flow credits - Should try to have at least 4 headers
+ * generated per window.
+ */
+ protoexp->hdr_pkt_interval =
+ max(min
+ (protoexp->hdr_pkt_interval, proto->flow_credits >> 2),
+ 1);
+
+ if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) {
+ _HFI_VDBG
+ ("Overriding PSM2_EXPECTED_HEADERS=%u to be '%u'\n",
+ env_exp_hdr.e_uint, protoexp->hdr_pkt_interval);
+ }
+
+ }
+
+ {
+ union psmi_envvar_val env_rts_cts_interleave;
+
+ psmi_getenv("PSM2_RTS_CTS_INTERLEAVE",
+ "Interleave the handling of RTS to provide a fair distribution between multiple senders",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, &env_rts_cts_interleave);
+ if (env_rts_cts_interleave.e_uint)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE;
+ }
+
+ /* Send descriptors.
+ *
+ * There can be up to 2^32 of these send descriptors. We conservatively
+ * allocate 256 but large node configurations can allocate up to sdesc_num
+ * of these (they are about 2k each).
+ * We impose a theoretical limit of 2^30.
+ */
+ {
+ struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS;
+ uint32_t maxsz, chunksz;
+
+ if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ protoexp->tid_desc_send_pool =
+ psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz,
+ maxsz, 0, DESCRIPTORS, NULL, NULL);
+
+ if (protoexp->tid_desc_send_pool == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate tid descriptor memory pool");
+ goto fail;
+ }
+ }
+
+ /* Receive descriptors are an array in tidflow structure. */
+
+ /* This pool can never be smaller than the max number of rreqs that can be
+ * allocated. */
+ {
+ uint32_t rreq_per_chunk, rreq_max;
+
+ psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL);
+
+ psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool,
+ &rreq_per_chunk, &rreq_max);
+
+ protoexp->tid_getreq_pool =
+ psmi_mpool_create(sizeof(struct ips_tid_get_request),
+ rreq_per_chunk, rreq_max, 0, DESCRIPTORS,
+ NULL, NULL);
+
+ if (protoexp->tid_getreq_pool == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate getreq descriptor memory pool");
+ goto fail;
+ }
+ }
+
+ /* Timers to handle requeueing of work out of the receive path */
+ psmi_timer_entry_init(&protoexp->timer_send,
+ ips_tid_pendsend_timer_callback, protoexp);
+ STAILQ_INIT(&protoexp->pend_sendq);
+ psmi_timer_entry_init(&protoexp->timer_getreqs,
+ ips_tid_pendtids_timer_callback, protoexp);
+ STAILQ_INIT(&protoexp->pend_getreqsq);
+
+ protoexp->tid_page_offset_mask = PSMI_PAGESIZE - 1;
+ protoexp->tid_page_mask = ~(PSMI_PAGESIZE - 1);
+
+ /*
+ * After ips_tid_init(), we know if we use tidcache or not.
+ * if tid cache is used, we can't use tid debug.
+ */
+#ifdef PSM_DEBUG
+ if (protoexp->tidc.tid_array == NULL)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_TID_DEBUG;
+#endif
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int i;
+ protoexp->tid_info = (struct ips_tidinfo *)
+ psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS,
+ sizeof(struct ips_tidinfo));
+ if (protoexp->tid_info == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ for (i = 0; i < IPS_TID_MAX_TIDS; i++) {
+ protoexp->tid_info[i].state = TIDSTATE_FREE;
+ protoexp->tid_info[i].tidrecvc = NULL;
+ protoexp->tid_info[i].tid = 0xFFFFFFFF;
+ }
+ } else
+ protoexp->tid_info = NULL;
+
+#ifdef PSM_CUDA
+ {
+ /* Host bounce-buffer pools are only needed when CUDA is enabled
+ * but GPUDirect RDMA receive is not available/selected. */
+ if (PSMI_IS_CUDA_ENABLED &&
+ !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+ struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+ uint32_t maxsz, chunksz, max_elements;
+
+ if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ /* the maxsz is the amount in MB, not the number of entries,
+ * since the element size depends on the window size */
+ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+ /* mpool requires max_elements to be power of 2. round down. */
+ max_elements = 1 << (31 - __builtin_clz(max_elements));
+ protoexp->cuda_hostbuf_recv_cfg.bufsz =
+ proto->mq->hfi_base_window_rv;
+
+ protoexp->cuda_hostbuf_pool_recv =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &protoexp->cuda_hostbuf_recv_cfg);
+
+ if (protoexp->cuda_hostbuf_pool_recv == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host receive buffer pool");
+ goto fail;
+ }
+
+ protoexp->cuda_hostbuf_small_recv_cfg.bufsz =
+ CUDA_SMALLHOSTBUF_SZ;
+ protoexp->cuda_hostbuf_pool_small_recv =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &protoexp->cuda_hostbuf_small_recv_cfg);
+
+ if (protoexp->cuda_hostbuf_pool_small_recv == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host small receive buffer pool");
+ goto fail;
+ }
+
+ /* cudaStreamCreateWithFlags(cudaStreamNonBlocking) requires
+ * CUDA runtime >= 7.0; fall back to the default stream ctor. */
+ if (cuda_runtime_version >= 7000) {
+ PSMI_CUDA_CALL(cudaStreamCreateWithFlags,
+ &protoexp->cudastream_recv,
+ cudaStreamNonBlocking);
+ } else {
+ PSMI_CUDA_CALL(cudaStreamCreate,
+ &protoexp->cudastream_recv);
+ }
+ STAILQ_INIT(&protoexp->cudapend_getreqsq);
+ } else {
+ protoexp->cuda_hostbuf_pool_recv = NULL;
+ protoexp->cuda_hostbuf_pool_small_recv = NULL;
+ }
+ }
+#endif
+ psmi_assert(err == PSM2_OK);
+ return err;
+
+fail:
+ /* Unified error exit: tear down whatever was successfully created.
+ * psmi_calloc zeroed the struct, so unallocated pointers are NULL. */
+#ifdef PSM_CUDA
+ if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL)
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+ if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL)
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+#endif
+ if (protoexp != NULL && protoexp->tid_getreq_pool != NULL)
+ psmi_mpool_destroy(protoexp->tid_getreq_pool);
+ if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL)
+ psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+ if (protoexp != NULL)
+ ips_scbctrl_fini(&protoexp->tid_scbc_rv);
+ if (protoexp != NULL)
+ psmi_free(protoexp);
+ return err;
+}
+MOCK_DEF_EPILOGUE(ips_protoexp_init);
+
+/*
+ * Tear down the expected-protocol state created by ips_protoexp_init(),
+ * in roughly the reverse order of construction: CUDA host-buffer pools
+ * (when they were created), getreq/send descriptor pools, scb control,
+ * tid control, tidflow control, optional tid debug array, and finally the
+ * protoexp struct itself.  Returns the first error encountered; the
+ * 'fail' label is also the shared success exit (falls through with
+ * err == PSM2_OK).
+ */
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp)
+{
+ psm2_error_t err = PSM2_OK;
+
+#ifdef PSM_CUDA
+ /* Pools exist only under the same condition they were created in init. */
+ if(PSMI_IS_CUDA_ENABLED &&
+ !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+ }
+#endif
+ psmi_mpool_destroy(protoexp->tid_getreq_pool);
+ psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+
+ if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv)))
+ goto fail;
+
+ if ((err = ips_tid_fini(&protoexp->tidc)))
+ goto fail;
+
+ if ((err = ips_tf_fini(&protoexp->tfc)))
+ goto fail;
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG)
+ psmi_free(protoexp->tid_info);
+
+ psmi_free(protoexp);
+
+fail:
+ return err;
+}
+
+/* New scbs now available. If we have pending sends or pending get requests,
+ * turn on the corresponding timer so the work is processed outside the
+ * receive path.  'scbc' is unused; the protoexp is recovered from the
+ * opaque callback context. */
+static
+void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_sendq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_send, PSMI_TIMER_PRIO_1);
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+/* New Tids are available. If there are pending get requests put the
+ * get timer on the timerq so it can be processed.  'tidc' is unused;
+ * the protoexp is recovered from the opaque callback context. */
+static
+void ips_tid_avail_callback(struct ips_tid *tidc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+/* New Tid Flows are available. If there are pending get requests put the
+ * get timer on the timerq so it can be processed.  'tfc' is unused;
+ * the protoexp is recovered from the opaque callback context. */
+static
+void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ }
+ return;
+}
+
+/*
+ * The tid get request is always issued from within the receive progress loop,
+ * which is why we always enqueue the request instead of issuing it directly.
+ * Eventually, if we expose tid_get to users, we will want to differentiate
+ * when the request comes from the receive progress loop from cases where the
+ * tid_get is issued directly from user code.
+ *
+ */
+/*
+ * Queue a TID get request for 'length' bytes into 'buf' from the peer
+ * 'epaddr', identified on the remote side by 'remote_tok'.  The request is
+ * always enqueued on pend_getreqsq (see block comment above); if both tids
+ * and tidflows are currently available the pending-tids callback is invoked
+ * inline, otherwise the getreqs timer is scheduled (unless either resource
+ * reported -1).  'callback' fires on completion with 'context' (which, in
+ * CUDA builds, is assumed to be the psm2_mq_req_t).  Always returns PSM2_OK;
+ * running out of getreq descriptors is treated as fatal (PSMI_EP_NORETURN)
+ * because the pool is sized to match the receive request pool.
+ */
+psm2_error_t
+ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+ void *buf,
+ uint32_t length,
+ psm2_epaddr_t epaddr,
+ uint32_t remote_tok,
+ uint32_t flags,
+ ips_tid_completion_callback_t callback,
+ void *context)
+{
+ struct ips_tid_get_request *getreq;
+ int count, nbytes, tids, tidflows;
+
+ PSM2_LOG_MSG("entering");
+ /* Rendezvous window must be page aligned. */
+ psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0);
+ getreq = (struct ips_tid_get_request *)
+ psmi_mpool_get(protoexp->tid_getreq_pool);
+
+ /* We can't *really* run out of these here because we always allocate as
+ * much as available receive reqs */
+ if_pf(getreq == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Ran out of 'getreq' descriptors");
+ }
+
+ getreq->tidgr_protoexp = protoexp;
+ getreq->tidgr_epaddr = epaddr;
+ getreq->tidgr_lbuf = buf;
+ getreq->tidgr_length = length;
+ getreq->tidgr_sendtoken = remote_tok;
+ getreq->tidgr_ucontext = context;
+ getreq->tidgr_callback = callback;
+ getreq->tidgr_offset = 0;
+ getreq->tidgr_bytesdone = 0;
+ getreq->tidgr_flags = flags;
+
+#ifdef PSM_CUDA
+ /* Use host bounce buffers when the destination is GPU memory and either
+ * GPUDirect RDMA recv is off, or the message exceeds the configured
+ * gpudirect receive threshold. */
+ psm2_mq_req_t req = (psm2_mq_req_t)context;
+ if ((req->is_buf_gpu_mem &&
+ !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) ||
+ ((req->is_buf_gpu_mem &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) &&
+ gpudirect_recv_threshold &&
+ length > gpudirect_recv_threshold))) {
+ getreq->cuda_hostbuf_used = 1;
+ getreq->tidgr_cuda_bytesdone = 0;
+ STAILQ_INIT(&getreq->pend_cudabuf);
+ } else
+ getreq->cuda_hostbuf_used = 0;
+#endif
+
+ /* nbytes is the bytes each channel should transfer. */
+ count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count;
+#ifdef PSM_CUDA
+ if (req->is_buf_gpu_mem)
+ nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE);
+ else
+#endif
+ nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE);
+ getreq->tidgr_rndv_winsz =
+ min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv);
+ /* must be within the tid window size */
+ if (getreq->tidgr_rndv_winsz > PSM_TID_WINSIZE)
+ getreq->tidgr_rndv_winsz = PSM_TID_WINSIZE;
+
+ STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next);
+ tids = ips_tid_num_available(&protoexp->tidc);
+ tidflows = ips_tf_available(&protoexp->tfc);
+
+ /* Make progress now if both resources are available; otherwise defer to
+ * the timer (unless a resource reported -1). */
+ if (tids > 0 && tidflows > 0)
+ ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0);
+ else if (tids != -1 && tidflows != -1)
+ psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/* List of perf events */
+#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */
+
+#define ips_logevent_id(event) _ips_logeventid_ ## event
+#define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr)
+
+/*
+ * Emit a rate-limited warning for a protocol event.  Currently only
+ * handles tid_send_reqs (exhaustion of tid send descriptors): increments
+ * the event counter and, if the per-event warning interval has elapsed,
+ * logs a non-fatal message with elapsed time, the peer's LID/context and
+ * the cumulative count, then arms the next warning deadline.
+ * 'context' is the peer psm2_epaddr_t for tid_send_reqs.
+ */
+static
+void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context)
+{
+ uint64_t t_now = get_cycles();
+
+ switch (eventid) {
+ case ips_logevent_id(tid_send_reqs):{
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) context;
+ proto->psmi_logevent_tid_send_reqs.count++;
+
+ if (t_now >=
+ proto->psmi_logevent_tid_send_reqs.next_warning) {
+ psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK,
+ "Non-fatal temporary exhaustion of send tid dma descriptors "
+ "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)",
+ (double)
+ cycles_to_nanosecs(t_now -
+ proto->
+ t_init) /
+ 1.0e9,
+ (int)psm2_epid_nid(epaddr->
+ epid),
+ (int)psm2_epid_context(epaddr->
+ epid),
+ (long long)proto->
+ psmi_logevent_tid_send_reqs.
+ count);
+ proto->psmi_logevent_tid_send_reqs.
+ next_warning =
+ t_now +
+ sec_2_cycles(proto->
+ psmi_logevent_tid_send_reqs.
+ interval_secs);
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return;
+}
+
+/*
+ * Expected Protocol.
+ *
+ * We're granted tids (as part of a tid get request) and expected to fulfill
+ * the request by associating the request's sendtoken to a tid send descriptor.
+ *
+ * It's possible to be out of tid send descriptors when somehow all allocated
+ * descriptors can't complete all of their sends. For example, the targets of
+ * the sends may be busy in computation loops and not processing incoming
+ * packets.
+ */
+
+/*
+ * Send the tid grant (CTS) for a receive descriptor.  Fills the
+ * pre-allocated grantscb with OPCODE_LONG_CTS: the current tidflow
+ * gen/seq in mdata, the receive descriptor id, the total get length and
+ * the sender's token, with the tid list as payload; then enqueues it on
+ * the peer's message flow and flushes immediately.
+ */
+void
+ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc)
+{
+ ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr;
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ ips_scb_t *scb;
+
+ scb = tidrecvc->grantscb;
+
+ ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val;
+ scb->ips_lrh.data[0] = tidrecvc->rdescid;
+ scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length;
+ scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken;
+
+ /* The tid list travels as the CTS payload. */
+ ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list;
+ ips_scb_length(scb) = tidrecvc->tsess_tidlist_length;
+
+ PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d",
+ tidrecvc->getreq->tidgr_sendtoken);
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+}
+
+/*
+ * Notify the sender that the tid transfer identified by 'sdescid' is
+ * complete.  Fills the pre-allocated completescb with
+ * OPCODE_EXPTID_COMPLETION, the sender's descriptor id and the current
+ * tidflow gen/seq, enqueues it on the peer's message flow and flushes.
+ * Under CTS_SERIALIZED mode, also clears the skip-CTS flag and kicks the
+ * pending-tids callback so the next CTS can make progress.
+ */
+void
+ips_protoexp_send_tid_completion(struct ips_tid_recv_desc *tidrecvc,
+ ptl_arg_t sdescid)
+{
+ ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr;
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ ips_scb_t *scb;
+
+ PSM_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"sdescid._desc_idx: %d",
+ sdescid._desc_idx);
+ scb = tidrecvc->completescb;
+
+ ips_scb_opcode(scb) = OPCODE_EXPTID_COMPLETION;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.data[0] = sdescid;
+
+ /* Attached tidflow gen/seq */
+ scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ flow->flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */
+ ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */
+ }
+}
+
+#ifdef PSM_CUDA
+/* Release a CUDA host bounce buffer: free the pinned host allocation and
+ * its copy-completion event, then the descriptor itself.  Used for
+ * temporary (is_tempbuf) buffers that are not returned to an mpool. */
+static
+void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb)
+{
+ PSMI_CUDA_CALL(cudaFreeHost, chb->host_buf);
+ PSMI_CUDA_CALL(cudaEventDestroy, chb->copy_status);
+ psmi_free(chb);
+ return;
+}
+#endif
+
+/*
+ * Handle an incoming OPCODE_EXPTID_COMPLETION packet (receiver telling us
+ * a tid transfer finished).  Validates expected/NAK state, acks if
+ * requested, looks up the send descriptor by index and checks its
+ * generation is still live, force-completes the associated tidflow if it
+ * still has unacked scbs, recycles CUDA host buffers where applicable,
+ * and completes the MQ request once all of its bytes are accounted for.
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ struct ips_tid_send_desc *tidsendc;
+
+ PSM2_LOG_MSG("entering");
+ PSM_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+ rcv_ev->proto->ep->mq->ep->epid,"desc_id._desc_idx: %d",desc_id._desc_idx);
+
+ if (!ips_proto_is_expected_or_nak(rcv_ev))
+ {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+
+ /* Honor a piggybacked ACK request before processing the completion. */
+ if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ)
+ ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq,
+ &ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+ ips_proto_process_ack(rcv_ev);
+
+ /*
+ * Get the session send descriptor and complete.
+ */
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+ desc_id._desc_idx);
+ _HFI_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc);
+ if (tidsendc == NULL) {
+ _HFI_ERROR
+ ("exptid comp: Index %d is out of range\n",
+ desc_id._desc_idx);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ ptl_arg_t desc_tidsendc;
+
+ psmi_mpool_get_obj_index_gen_count(tidsendc,
+ &desc_tidsendc._desc_idx,
+ &desc_tidsendc._desc_genc);
+
+ _HFI_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n",
+ desc_id._desc_idx, desc_id._desc_genc,
+ desc_tidsendc._desc_idx, desc_tidsendc._desc_genc);
+
+ /* See if the reference is still live and valid */
+ if (desc_tidsendc.u64 != desc_id.u64) {
+ _HFI_ERROR("exptid comp: Genc %d does not match\n",
+ desc_id._desc_genc);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ }
+
+ if (!STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)) {
+ struct ips_message_header hdr;
+
+ /* Hack to handle the tidflow: fabricate a header carrying the
+ * receiver's final gen/seq so the ack path drains the flow. */
+ hdr.data[0] = rcv_ev->p_hdr->data[0];
+ hdr.ack_seq_num = rcv_ev->p_hdr->mdata;
+ hdr.khdr.kdeth0 = __cpu_to_le32(3 << HFI_KHDR_TIDCTRL_SHIFT);
+ rcv_ev->p_hdr = &hdr;
+
+ /*
+ * This call should directly complete the tidflow
+ * and free all scb on the unacked queue.
+ */
+ ips_proto_process_ack(rcv_ev);
+
+ /* Keep KW happy. */
+ rcv_ev->p_hdr = NULL;
+ psmi_assert(STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked));
+ }
+
+ psm2_mq_req_t req = tidsendc->mqreq;
+ /* Check if we can complete the send request. */
+ req->send_msgoff += tidsendc->length;
+
+#ifdef PSM_CUDA
+ /* Recycle or free the CUDA host bounce buffer once all of its bytes
+ * have been read, and keep the prefetcher pipeline moving. */
+ if (req->cuda_hostbuf_used) {
+ if (tidsendc->cuda_num_buf == 1) {
+ tidsendc->cuda_hostbuf[0]->bytes_read +=
+ tidsendc->tid_list.tsess_length;
+ if(tidsendc->cuda_hostbuf[0]->bytes_read ==
+ tidsendc->cuda_hostbuf[0]->size){
+ STAILQ_REMOVE(&req->sendreq_prefetch,
+ tidsendc->cuda_hostbuf[0],
+ ips_cuda_hostbuf, req_next);
+ if (tidsendc->cuda_hostbuf[0]->is_tempbuf)
+ psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]);
+ else {
+ tidsendc->cuda_hostbuf[0]->req = NULL;
+ tidsendc->cuda_hostbuf[0]->offset = 0;
+ tidsendc->cuda_hostbuf[0]->bytes_read = 0;
+ psmi_mpool_put(tidsendc->cuda_hostbuf[0]);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ } else
+ psmi_free(tidsendc->userbuf);
+ }
+#endif
+ if (req->send_msgoff == req->send_msglen) {
+ psmi_mq_handle_rts_complete(req);
+ }
+
+ psmi_mpool_put(tidsendc);
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ struct ips_protoexp *protoexp = proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t desc_id;
+ psmi_seqnum_t sequence_num, tf_sequence_num;
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ PSM2_LOG_MSG("entering");
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ PSM_LOG_EPM(OPCODE_EXPTID,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+ proto->ep->mq->ep->epid,"desc_id._desc_idx: %d", desc_id._desc_idx);
+
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE; /* skip */
+ }
+
+ /* IBTA CCA handling for expected flow. */
+ if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) {
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+ /* Update stats for congestion encountered */
+ proto->epaddr_stats.congestion_pkts++;
+ /* Clear FECN event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+ }
+
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ if_pf(protoexp->tid_flags & IPS_PROTOEXP_FLAG_HDR_SUPP) {
+ /* Drop packet if generation number does not match. There
+ * is a window that before we program the hardware tidflow
+ * table with new gen/seq, hardware might receive some
+ * packets with the old generation.
+ */
+ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen)
+ {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+
+#ifdef PSM_DEBUG
+ /* Check if new packet falls into expected seq range, we need
+ * to deal with wrap around of the seq value from 2047 to 0
+ * because seq is only 11 bits. */
+ int16_t seq_off = (int16_t)(sequence_num.psn_seq -
+ tidrecvc->tidflow_genseq.psn_seq);
+ if (seq_off < 0)
+ seq_off += 2048; /* seq is 11 bits */
+ psmi_assert(seq_off < 1024);
+#endif
+ /* NOTE: with RSM in use, we should not automatically update
+ * our PSN from the HFI's PSN. The HFI doesn't know about
+ * RSM interceptions.
+ */
+ /* (DON'T!) Update the shadow tidflow_genseq */
+ /* tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; */
+
+ }
+ /* Always check the sequence number if we get a header, even if SH. */
+ if_pt(sequence_num.psn_num == tidrecvc->tidflow_genseq.psn_num) {
+ /* Update the shadow tidflow_genseq */
+ tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1;
+
+ /* update the fake tidflow table with new seq, this is for
+ * seqerr and err_chk_gen processing to get the latest
+ * valid sequence number */
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+ } else {
+ /* Generation mismatch */
+ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) {
+ ips_protoexp_do_tf_generr(protoexp,
+ tidrecvc, p_hdr);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ /* Possible sequence mismatch error */
+ /* First, check if this is a recoverable SeqErr -
+ * caused by a good packet arriving in a tidflow that
+ * has had a FECN bit set on some earlier packet.
+ */
+
+ /* If this is the first RSM packet, our own PSN state
+ * is probably old. Pull from the HFI if it has
+ * newer data.
+ */
+ tf_sequence_num.psn_val =
+ hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+ if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+
+ /* Now re-check the sequence numbers. */
+ if (sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq) {
+ /* It really was a sequence error. Restart. */
+ ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ /* False SeqErr. We can accept this packet. */
+ if (sequence_num.psn_seq == tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq++;
+ }
+ }
+ }
+
+ /* Reset the swapped generation count as we received a valid packet */
+ tidrecvc->tidflow_nswap_gen = 0;
+
+ /* Do some sanity checking */
+ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY);
+ int recv_completion = (tidrecvc->recv_tidbytes ==
+ (p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev)));
+
+ /* If sender requested an ACK with the packet and it is not the last
+ * packet, or if the incoming flow faced congestion, respond with an
+ * ACK packet. The ACK when congested will have the BECN bit set.
+ */
+ if (((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) &&
+ !recv_completion) ||
+ (tidrecvc->tidflow.flags & IPS_FLOW_FLAG_GEN_BECN)) {
+ ips_scb_t ctrlscb;
+
+ /* Ack sender with descriptor index */
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid;
+ ctrlscb.ips_lrh.ack_seq_num = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ OPCODE_ACK,
+ &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ /* If RSM has found a TID packet marked with FECN, the payload
+ * will be written to the eager buffer, and we will have a payload
+ * pointer here. In that case, copy the payload into the user's
+ * buffer. If RSM did not intercept this EXPTID packet, the HFI
+ * will handle the packet payload.
+ * Possibly should assert(0 < paylen < MTU).
+ */
+ if (ips_recvhdrq_event_payload(rcv_ev) &&
+ ips_recvhdrq_event_paylen(rcv_ev))
+ psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset,
+ ips_recvhdrq_event_payload(rcv_ev),
+ ips_recvhdrq_event_paylen(rcv_ev));
+
+ /* If last packet then we are done. We send a tid transfer completion
+ * packet back to sender, free all tids and close the current tidflow
+ * as well as tidrecvc descriptor.
+ * Note: If we were out of tidflow, this will invoke the callback to
+ * schedule pending transfer.
+ */
+ if (recv_completion) {
+ /* copy unaligned data if any */
+ uint8_t *dst, *src;
+
+ if (tidrecvc->tid_list.tsess_unaligned_start) {
+ dst = (uint8_t *)tidrecvc->buffer;
+ src = (uint8_t *)p_hdr->exp_ustart;
+#ifdef PSM_CUDA
+ if (tidrecvc->is_ptr_gpu_backed) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidrecvc->tid_list.tsess_unaligned_start,
+ cudaMemcpyHostToDevice);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidrecvc->tid_list.tsess_unaligned_start);
+ }
+
+ if (tidrecvc->tid_list.tsess_unaligned_end) {
+ dst = (uint8_t *)tidrecvc->buffer +
+ tidrecvc->recv_msglen -
+ tidrecvc->tid_list.tsess_unaligned_end;
+ src = (uint8_t *)p_hdr->exp_uend;
+#ifdef PSM_CUDA
+ if (tidrecvc->is_ptr_gpu_backed) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidrecvc->tid_list.tsess_unaligned_end,
+ cudaMemcpyHostToDevice);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidrecvc->tid_list.tsess_unaligned_end);
+ }
+
+ /* reply tid transfer completion packet to sender */
+ ips_protoexp_send_tid_completion(tidrecvc, p_hdr->exp_sdescid);
+
+ /* Mark receive as done */
+ ips_tid_recv_free(tidrecvc);
+ }
+ PSM2_LOG_MSG("leaving");
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+#ifndef PSM_DEBUG
+# define ips_dump_tids(tid_list, msg, ...)
+#else
+/* Debug-only helper: format "msg" (printf-style) followed by the
+ * comma-separated TID indices of tid_list into a bounded local buffer,
+ * then emit the line through the verbose-debug log channel.  Output is
+ * silently truncated once the 256-byte buffer fills up. */
+static
+void ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...)
+{
+ char line[256];
+ size_t used = 0;
+ int idx = 0;
+ int count = tid_list->tsess_tidcount;
+
+ va_list ap;
+ va_start(ap, msg);
+ used += vsnprintf(line, sizeof(line) - used, msg, ap);
+ va_end(ap);
+
+ /* Append "tid" or "tid," per entry; the final entry gets no comma. */
+ while (idx < count && used < (sizeof(line) - 1)) {
+ const char *sep = (idx == count - 1) ? "" : ",";
+ used += snprintf(line + used, sizeof(line) - used, "%d%s",
+ IPS_TIDINFO_GET_TID(tid_list->tsess_list[idx]), sep);
+ idx++;
+ }
+
+ _HFI_VDBG("%s\n", line);
+}
+#endif
+
+/* Fatal-error reporting path for an out-of-range tid index on a send
+ * descriptor: formats the remaining byte count, the offending tid_idx and
+ * the whole tid list into a local buffer, then aborts through
+ * psmi_handle_error(PSMI_EP_NORETURN, ...) — this function never returns
+ * to normal operation. */
+static
+void ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc)
+{
+ char buf[256];
+ size_t off = 0;
+ int i;
+
+ off += snprintf(buf + off, sizeof(buf) - off,
+ "Remaining bytes: %d Member id %d is not in tid_session_id=%d :",
+ tidsendc->remaining_tidbytes, tidsendc->tid_idx,
+ tidsendc->rdescid._desc_idx);
+
+ /* NOTE(review): the loop bound is tsess_tidcount + 1, so one slot past
+ * the valid tid list is also printed.  tsess_list is a fixed-capacity
+ * array so this stays in bounds, but the extra value is not a real tid
+ * — confirm whether the +1 is intentional diagnostic output. */
+ for (i = 0; i < tidsendc->tid_list.tsess_tidcount + 1; i++)
+ off += snprintf(buf + off, sizeof(buf) - off, "%d,",
+ IPS_TIDINFO_GET_TID(tidsendc->tid_list.
+ tsess_list[i]));
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Trying to use tid idx %d and there are %d members: %s\n",
+ tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount,
+ buf);
+ return;
+}
+
+#ifdef PSM_CUDA
+/* Walk the getreq's pending CUDA device-to-host copy queue in FIFO order
+ * and reclaim every host bounce buffer whose CUDA event has completed.
+ * Stops at the first copy still in flight, re-arms the getreqs timer and
+ * returns PSM2_OK_NO_PROGRESS; returns PSM2_OK once the queue is empty. */
+static
+psm2_error_t
+psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq)
+{
+ struct ips_protoexp *protoexp = getreq->tidgr_protoexp;
+ struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead =
+ &getreq->pend_cudabuf;
+ struct ips_cuda_hostbuf *chb;
+ cudaError_t status;
+
+ /* Get the getreq's first memcpy op */
+ while (!STAILQ_EMPTY(cmemcpyhead)) {
+ chb = STAILQ_FIRST(cmemcpyhead);
+ PSMI_CUDA_CHECK_EVENT(chb->copy_status, status);
+ if (status != cudaSuccess) {
+ /* At least one of the copies is still
+ * in progress. Schedule the timer,
+ * then leave the CUDA progress phase
+ * and check for other pending TID work.
+ */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ return PSM2_OK_NO_PROGRESS;
+ }
+ /* The getreq's oldest cudabuf is done. Reclaim it. */
+ getreq->tidgr_cuda_bytesdone += chb->size;
+ STAILQ_REMOVE_HEAD(cmemcpyhead, next);
+ psmi_mpool_put(chb);
+ }
+ return PSM2_OK;
+}
+
+/* Allocate a fresh ips_cuda_hostbuf tracking structure plus a pinned
+ * (cudaHostAllocPortable) host buffer of window_len bytes and a CUDA
+ * event used to track async copies into it.  psmi_handle_error() is
+ * invoked with PSMI_EP_NORETURN, so a failed calloc does not fall
+ * through to the CUDA calls on a NULL chb. */
+static
+struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len)
+{
+ struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*)
+ psmi_calloc(PSMI_EP_NONE,
+ UNDEFINED, 1,
+ sizeof(struct ips_cuda_hostbuf));
+ if (chb == NULL) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Couldn't allocate cuda host buffers ");
+ }
+ PSMI_CUDA_CALL(cudaHostAlloc,
+ (void **) &chb->host_buf,
+ window_len,
+ cudaHostAllocPortable);
+ PSMI_CUDA_CALL(cudaEventCreate, &chb->copy_status);
+ return chb;
+}
+
+/* Advance the GPU send prefetcher by at most one window: if unsent data
+ * remains on the request, grab a host bounce buffer from the small or
+ * large send pool, start an async device-to-host copy of the next window
+ * and record a CUDA event on it, then append the buffer to the request's
+ * prefetch list.  Best-effort: returns silently (without advancing
+ * prefetch_send_msgoff) when no pool buffer is available. */
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+ struct ips_tid_send_desc *tidsendc)
+{
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_cuda_hostbuf *chb = NULL;
+ psm2_mq_req_t req = tidsendc->mqreq;
+ uint32_t offset, window_len;
+
+ /* try to push the prefetcher forward */
+ if (req->prefetch_send_msgoff < req->send_msglen) {
+ /* some data remains to be sent */
+ offset = req->prefetch_send_msgoff;
+ window_len =
+ ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+ offset, req->buf_len);
+ /* Prefer the small-buffer pool for small windows; fall back
+ * to the regular send pool if the small pool is exhausted. */
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+ /* were any buffers available for the prefetcher? */
+ if (chb == NULL)
+ return;
+ req->prefetch_send_msgoff += window_len;
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ /* void-pointer arithmetic — GNU extension used file-wide */
+ chb->gpu_buf = (void *) req->buf + offset;
+ chb->bytes_read = 0;
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+ return;
+ }
+ return;
+}
+
+/* Drive the prefetcher until the window covering [tsess_srcoff,
+ * tsess_srcoff + tsess_length) is resident in host bounce buffers, then
+ * bind those buffers to the tidsendc.  On a PARTIAL match (chb_prev
+ * already covers the head of the range) the first freshly prefetched
+ * buffer overlapping the range completes a two-buffer attach.  Otherwise
+ * the loop searches for a single buffer containing the whole range, or a
+ * pair of adjacent buffers spanning it; a spanning pair gets a malloc'd
+ * bounce buffer (tidsendc->userbuf) that ips_tid_send_exp() later fills
+ * by stitching the two host buffers together.
+ * When both pools are exhausted a temporary buffer is force-allocated
+ * (chb->is_tempbuf) so progress is always possible. */
+static
+void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+ psm2_mq_req_t req,
+ struct ips_tid_send_desc *tidsendc,
+ struct ips_cuda_hostbuf *chb_prev,
+ uint32_t tsess_srcoff,
+ uint32_t tsess_length,
+ uint32_t tsess_unaligned_start,
+ psm2_chb_match_type_t type)
+{
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_cuda_hostbuf *chb = NULL;
+ uint32_t offset, window_len, attached=0;
+
+ /* try to push the prefetcher forward */
+ while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) {
+ /* some data remains to be sent */
+ offset = req->prefetch_send_msgoff;
+ window_len =
+ ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+ offset, req->buf_len);
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+
+ /* were any buffers available? If not force allocate */
+ if (chb == NULL) {
+ chb = psmi_allocate_chb(window_len);
+ chb->is_tempbuf = 1;
+ }
+ req->prefetch_send_msgoff += window_len;
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ chb->gpu_buf = (void *) req->buf + offset;
+ chb->bytes_read = 0;
+ /* Kick off the async copy for this window and record its
+ * completion event before queueing the buffer. */
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ cudaMemcpyDeviceToHost == cudaMemcpyDeviceToHost ?
+ proto->cudastream_send : proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+ if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+ if ((tsess_srcoff < chb->offset)
+ && ((tsess_srcoff + tsess_length) > chb->offset)) {
+ tidsendc->cuda_hostbuf[0] = chb_prev;
+ tidsendc->cuda_hostbuf[1] = chb;
+ tidsendc->cuda_num_buf = 2;
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start);
+ return;
+ }
+ } else {
+ if (attached) {
+ tidsendc->cuda_hostbuf[0] = chb_prev;
+ tidsendc->cuda_hostbuf[1] = chb;
+ tidsendc->cuda_num_buf = 2;
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start);
+ attached = 0;
+ return;
+ }
+ /* Range starts inside this buffer but spills past its
+ * end: remember it and pair it with the next window. */
+ if ((tsess_srcoff > chb->offset)
+ && (tsess_srcoff < (chb->offset + chb->size))
+ && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) {
+ chb_prev = chb;
+ attached = 1;
+ chb = NULL;
+ continue;
+ } else if ((chb->offset <= tsess_srcoff) &&
+ ((tsess_srcoff + tsess_length) <=
+ (chb->offset+chb->size))) {
+ /* Whole range inside one buffer: single attach. */
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 1;
+ tidsendc->userbuf =
+ (void *)((uintptr_t) chb->host_buf +
+ tsess_srcoff - chb->offset);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start );
+ return;
+ } else
+ chb = NULL;
+ }
+ }
+}
+
+
+
+/* Classify how a tid list's source range [tsess_srcoff, +tsess_length)
+ * relates to an already-prefetched host buffer:
+ *   FULL    — entirely inside this chb;
+ *   SPLIT   — spans this chb and (by the size arithmetic) the next one,
+ *             and the whole range is already prefetched;
+ *   PARTIAL — starts in this chb but extends beyond what has been
+ *             prefetched so far;
+ *   CONTINUE — no overlap; caller keeps scanning.
+ * (The misspelling "prefeteched" is part of the established name; callers
+ * elsewhere in this file use it, so it is kept.) */
+static
+psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb,
+ ips_tid_session_list *tid_list,
+ uint32_t prefetch_send_msgoff)
+{
+ /* To get a match:
+ * 1. Tid list offset + length is contained within a chb
+ * 2. Tid list offset + length is contained within
+ * the prefetched offset of this req.
+ * 3. Tid list offset + length is partially prefetched
+ * within one chb. (A partial match)
+ */
+ if (chb->offset <= tid_list->tsess_srcoff) {
+ if ((chb->offset + chb->size) >=
+ (tid_list->tsess_srcoff + tid_list->tsess_length)) {
+ return PSMI_CUDA_FULL_MATCH_FOUND;
+ } else {
+ if((chb->offset + chb->size) > tid_list->tsess_srcoff){
+ if(((chb->offset + (2 * chb->size)) >
+ (tid_list->tsess_srcoff + tid_list->tsess_length)) &&
+ ((prefetch_send_msgoff) >=
+ (tid_list->tsess_srcoff + tid_list->tsess_length))){
+ return PSMI_CUDA_SPLIT_MATCH_FOUND;
+ } else if((tid_list->tsess_srcoff + tid_list->tsess_length)
+ > prefetch_send_msgoff) {
+ return PSMI_CUDA_PARTIAL_MATCH_FOUND;
+ }
+ }
+ }
+ }
+ return PSMI_CUDA_CONTINUE;
+}
+#endif
+
+
+/* Handle an incoming TID request (CTS) from the receiver: allocate a send
+ * descriptor, copy/merge the receiver's tid list, set up the tidflow with
+ * the receiver-supplied generation/sequence, locate the source data
+ * (including CUDA prefetched host buffers when applicable) and start the
+ * expected send via ips_tid_send_exp().
+ * Returns PSM2_OK on success, or PSM2_EP_NO_RESOURCES when no send
+ * descriptor is available (the event is logged and the caller retries). */
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr,
+ psm2_mq_req_t req,
+ ptl_arg_t rdescid,
+ uint32_t tidflow_genseq,
+ ips_tid_session_list *tid_list,
+ uint32_t tid_list_size)
+{
+ struct ips_tid_send_desc *tidsendc;
+ uint32_t i, j, *src, *dst;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(tid_list_size > sizeof(ips_tid_session_list));
+ psmi_assert(tid_list_size <= sizeof(tidsendc->filler));
+ psmi_assert(tid_list->tsess_tidcount > 0);
+ psmi_assert((rdescid._desc_genc>>16) == 0);
+
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_get(protoexp->tid_desc_send_pool);
+ if (tidsendc == NULL) {
+ PSM2_LOG_MSG("leaving");
+ ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr);
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ req->ptl_req_ptr = (void *)tidsendc;
+ tidsendc->protoexp = protoexp;
+
+ /* Uniquely identify this send descriptor in space and time */
+ tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+ tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+ tidsendc->rdescid = rdescid;
+ tidsendc->ipsaddr = ipsaddr;
+ tidsendc->mqreq = req;
+
+ /*
+ * Copy received tidinfo to local tidsendc buffer.
+ * while doing the copy, we try to merge the tids based on
+ * following rules:
+ * 1. both tids are virtually contiguous(i and i+1 in the array);
+ * 2. both tids have the same tidpair value;
+ * 3. first tid (i) has tidctrl=1;
+ * 4. second tid (i+1) has tidctrl=2;
+ * 5. total length does not exceed 512 pages (2M);
+ *
+ * The restriction of 512 pages comes from the limited number
+ * of bits we have for KDETH.OFFSET:
+ * - The entire mapping space provided through TIDs is to be
+ * viewed as a zero-based address mapping.
+ * - We have 15 bits in KDETH offset field through which we
+ * can address upto a maximum of 2MB.
+ * (with 64-byte offset mode or KDETH.OM = 1)
+ * - Assuming a 4KB page size, 2MB/4KB = 512 pages.
+ */
+ psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list,
+ sizeof(ips_tid_session_list));
+ ips_dump_tids(tid_list, "Received %d tids: ",
+ tid_list->tsess_tidcount);
+
+ /* In-place merge: 'j' tracks the last written (possibly merged)
+ * entry in dst, 'i' scans the remaining source entries. */
+ src = tid_list->tsess_list;
+ dst = tidsendc->tid_list.tsess_list;
+ dst[0] = src[0];
+ j = 0; i = 1;
+ while (i < tid_list->tsess_tidcount) {
+ if ((((dst[j]>>IPS_TIDINFO_TIDCTRL_SHIFT)+1) ==
+ (src[i]>>IPS_TIDINFO_TIDCTRL_SHIFT)) &&
+ (((dst[j]&IPS_TIDINFO_LENGTH_MASK)+
+ (src[i]&IPS_TIDINFO_LENGTH_MASK)) <=
+ PSM_MAX_NUM_PAGES_IN_TIDPAIR)) {
+ /*
+ * merge 'i' to 'j'
+ * (We need to specify "tidctrl" value as 3
+ * if we merge the individual tid-pairs.
+ * Doing that here)
+ */
+ dst[j] += (2 << IPS_TIDINFO_TIDCTRL_SHIFT) +
+ (src[i] & IPS_TIDINFO_LENGTH_MASK);
+ i++;
+ if (i == tid_list->tsess_tidcount) break;
+ }
+ j++;
+ /* copy 'i' to 'j' */
+ dst[j] = src[i];
+ i++;
+ }
+ tidsendc->tid_list.tsess_tidcount = j + 1;
+ /* From here on, operate on the merged local copy only. */
+ tid_list = &tidsendc->tid_list;
+
+ /* Initialize tidflow for window. Use path requested by remote endpoint */
+ ips_flow_init(&tidsendc->tidflow, protoexp->proto, ipsaddr,
+ protoexp->tid_xfer_type, PSM_PROTOCOL_TIDFLOW,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW);
+ tidsendc->tidflow.xmit_seq_num.psn_val = tidflow_genseq;
+ tidsendc->tidflow.xmit_ack_num.psn_val = tidflow_genseq;
+
+ tidsendc->userbuf =
+ (void *)((uintptr_t) req->buf + tid_list->tsess_srcoff);
+ tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ tidsendc->length = tid_list->tsess_length;
+ tidsendc->ctrl_msg_queued = 0;
+ tidsendc->frag_size = min(protoexp->tid_send_fragsize,
+ tidsendc->tidflow.frag_size);
+
+#ifdef PSM_CUDA
+ /* Matching on previous prefetches and initiating next prefetch */
+ struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL;
+ psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE;
+
+ /* check if the prefetcher has a buffer ready to use */
+ tidsendc->cuda_hostbuf[0] = NULL;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 0;
+ if (req->cuda_hostbuf_used) {
+ /* To get a match:
+ * 1. Tid list offset + length is contained within a chb
+ * 2. Tid list offset + length is contained within
+ * the prefetched offset of this req.
+ * 3. Tid list offset + length is partially prefetched
+ * within one chb. (A partial match)
+ */
+ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) {
+ rc = psmi_find_match_in_prefeteched_chb(chb,
+ tid_list,
+ req->prefetch_send_msgoff);
+ if (rc < PSMI_CUDA_CONTINUE)
+ break;
+ }
+ if (rc == PSMI_CUDA_FULL_MATCH_FOUND) {
+ tidsendc->userbuf =
+ (void *)((uintptr_t) chb->host_buf+
+ tid_list->tsess_srcoff - chb->offset);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ /* now associate the buffer with the tidsendc */
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 1;
+ } else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){
+ /* Range spans two prefetched buffers; allocate a
+ * bounce buffer that ips_tid_send_exp() will fill. */
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tid_list->tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ chb_next = STAILQ_NEXT(chb, req_next);
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = chb_next;
+ tidsendc->cuda_num_buf = 2;
+ } else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+ psmi_attach_chb_to_tidsendc(protoexp, req,
+ tidsendc,
+ chb,
+ tid_list->tsess_srcoff,
+ tid_list->tsess_length,
+ tid_list->tsess_unaligned_start,
+ rc);
+ } else {
+ psmi_attach_chb_to_tidsendc(protoexp, req,
+ tidsendc,
+ NULL,
+ tid_list->tsess_srcoff,
+ tid_list->tsess_length,
+ tid_list->tsess_unaligned_start,
+ PSMI_CUDA_CONTINUE);
+ }
+ }
+#endif
+
+ /* frag size must be 64B multiples */
+ tidsendc->frag_size &= (~63);
+ tidsendc->is_complete = 0;
+ tidsendc->tid_idx = 0;
+ tidsendc->frame_send = 0;
+
+ /* Payload accounting: unaligned head/tail bytes travel in the
+ * completion header, not through the tid-mapped pages. */
+ tidsendc->tidbytes = 0;
+ tidsendc->remaining_tidbytes = tid_list->tsess_length -
+ tid_list->tsess_unaligned_start - tid_list->tsess_unaligned_end;
+ tidsendc->remaining_bytes_in_tid =
+ (IPS_TIDINFO_GET_LENGTH(tid_list->tsess_list[0]) << 12) -
+ tid_list->tsess_tidoffset;
+ tidsendc->offset_in_tid = tid_list->tsess_tidoffset;
+
+ _HFI_EXP
+ ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n",
+ tidsendc->sdescid._desc_idx, rdescid._desc_idx,
+ tid_list->tsess_srcoff, tid_list->tsess_length,
+ tid_list->tsess_unaligned_start, tid_list->tsess_unaligned_end);
+
+ ips_tid_send_exp(tidsendc);
+
+ /* Add as a pending op and ring up the timer */
+ if (tidsendc->is_complete == 0) {
+ STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+ psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+ PSMI_TIMER_PRIO_1);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ /* Consider breaking out of progress engine here */
+ return PSM2_OK;
+}
+
+
+/* Build one scb for the next chunk of an expected (TID) send: compute the
+ * KDETH offset/OM/TID fields for the first packet, walk the tid list to
+ * count how many frags (packets) this scb carries and their total payload,
+ * attach unaligned head/tail bytes on the final chunk, and decide per-scb
+ * header suppression vs. ACK-request flags.  Returns NULL when no scb is
+ * available (caller treats that as PSM2_EP_NO_RESOURCES). */
+static
+ips_scb_t *
+ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
+ struct ips_tid_send_desc *tidsendc)
+{
+ struct ips_protoexp *protoexp = tidsendc->protoexp;
+ uint32_t *tsess_list = tidsendc->tid_list.tsess_list;
+ uint32_t tid, omode, offset, chunk_size;
+ uint32_t startidx, endidx;
+ uint32_t frame_len, nfrag;
+ uint8_t *bufptr = tidsendc->buffer;
+ ips_scb_t *scb;
+ uint8_t is_payload_per_frag_leq_8dw = 0;
+ /* If payload in the first and last nfrag is less then or equal
+ * to 8DW we disable header suppression so as to detect uncorrectable
+ * errors which will otherwise be non-detectable(since header is
+ * suppressed we lose RHF.EccErr)
+ */
+ if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL)
+ return NULL;
+
+ /*
+ * Make sure the next offset is in 64B multiples with the tid.
+ */
+ frame_len =
+ min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes);
+ if (frame_len > tidsendc->frag_size) {
+ frame_len =
+ tidsendc->frag_size - (tidsendc->offset_in_tid & 63);
+ }
+ /*
+ * Frame length is the amount of payload to be included in a particular
+ * frag of the scb, so we check if frame len is less than or equal
+ * to 8DW. If length is less then then or equal to 8DW for the first
+ * frag then we avoid header suppression
+ */
+ if (frame_len <= 32)
+ is_payload_per_frag_leq_8dw = 1;
+
+ /*
+ * Using large offset mode based on offset length.
+ */
+ if (tidsendc->offset_in_tid < 131072) { /* 2^15 * 4 */
+ /* OM=0: offset expressed in dwords (4B units) */
+ psmi_assert((tidsendc->offset_in_tid % 4) == 0);
+ offset = tidsendc->offset_in_tid / 4;
+ omode = 0;
+ } else {
+ /* OM=1: offset expressed in 64B units */
+ psmi_assert((tidsendc->offset_in_tid % 64) == 0);
+ offset = tidsendc->offset_in_tid / 64;
+ omode = 1;
+ }
+ startidx = tidsendc->tid_idx;
+ tid = IPS_TIDINFO_GET_TID(tsess_list[startidx]);
+ scb->ips_lrh.khdr.kdeth0 = (offset & HFI_KHDR_OFFSET_MASK) |
+ (omode << HFI_KHDR_OM_SHIFT) | (tid << HFI_KHDR_TID_SHIFT);
+
+ scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(tsess_list[startidx]);
+ scb->tsess = (uint32_t *) &tsess_list[startidx];
+
+ /*
+ * Payload and buffer address for current packet. payload_size
+ * must be the first packet size because it is used to initialize
+ * the packet header.
+ */
+ scb->payload_size = frame_len;
+ ips_scb_buffer(scb) = (void *)bufptr;
+ scb->frag_size = tidsendc->frag_size;
+
+ /*
+ * Other packet fields.
+ */
+ PSM_LOG_EPM(OPCODE_EXPTID,PSM_LOG_EPM_TX, protoexp->proto->ep->epid,
+ flow->ipsaddr->epaddr.epid,
+ "psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid._desc_idx: %d, tidsendc->sdescid._desc_idx: %d",
+ psmi_mpool_get_obj_index(tidsendc->mqreq),tidsendc->rdescid._desc_idx,tidsendc->sdescid._desc_idx);
+ ips_scb_opcode(scb) = OPCODE_EXPTID;
+ scb->ips_lrh.exp_sdescid = tidsendc->sdescid;
+ scb->ips_lrh.exp_rdescid_genc = (uint16_t)tidsendc->rdescid._desc_genc;
+ scb->ips_lrh.exp_offset = tidsendc->tidbytes;
+
+ scb->tidsendc = tidsendc;
+ SLIST_NEXT(scb, next) = NULL;
+
+ /*
+ * Loop over the tid session list, count the frag number and payload size.
+ */
+ nfrag = 1;
+ chunk_size = frame_len;
+ while (1) {
+ /* Record last tididx used */
+ endidx = tidsendc->tid_idx;
+ /* Check if all tidbytes are done */
+ tidsendc->remaining_tidbytes -= frame_len;
+ if (!tidsendc->remaining_tidbytes) {
+ /* We do another frame length check for the last frag */
+ if (frame_len <= 32)
+ is_payload_per_frag_leq_8dw = 1;
+ break;
+ }
+
+ /* Update in current tid */
+ tidsendc->remaining_bytes_in_tid -= frame_len;
+ tidsendc->offset_in_tid += frame_len;
+ psmi_assert((tidsendc->offset_in_tid % 64) == 0);
+
+ /* Done with this tid, move on to the next tid */
+ if (!tidsendc->remaining_bytes_in_tid) {
+ tidsendc->tid_idx++;
+ psmi_assert(tidsendc->tid_idx <
+ tidsendc->tid_list.tsess_tidcount);
+ tidsendc->remaining_bytes_in_tid =
+ IPS_TIDINFO_GET_LENGTH(tsess_list
+ [tidsendc->tid_idx]) << 12;
+ tidsendc->offset_in_tid = 0;
+ }
+
+ /* For PIO, only single packet per scb allowed */
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ break;
+ }
+
+ frame_len =
+ min(tidsendc->remaining_bytes_in_tid,
+ tidsendc->remaining_tidbytes);
+ if (frame_len > tidsendc->frag_size)
+ frame_len = tidsendc->frag_size;
+ nfrag++;
+ chunk_size += frame_len;
+ }
+
+ scb->nfrag = nfrag;
+ if (nfrag > 1) {
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size = scb->chunk_size_remaining = chunk_size;
+ }
+ scb->tsess_length = (endidx - startidx + 1) * sizeof(uint32_t);
+
+ /* Keep track of latest buffer location so we restart at the
+ * right location, if we don't complete the transfer */
+ tidsendc->buffer = bufptr + chunk_size;
+ tidsendc->tidbytes += chunk_size;
+
+ /* On DMA flows with the driver workaround present, suppression is
+ * safe even for <=8DW payloads. */
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw) {
+ is_payload_per_frag_leq_8dw = 0;
+ }
+
+ /* If last packet, we want a completion notification */
+ if (!tidsendc->remaining_tidbytes) {
+ /* last packet/chunk, attach unaligned data */
+ uint8_t *dst, *src;
+
+ if (tidsendc->tid_list.tsess_unaligned_start) {
+ dst = (uint8_t *)scb->ips_lrh.exp_ustart;
+ src = (uint8_t *)tidsendc->userbuf;
+#ifdef PSM_CUDA
+ /* userbuf is a GPU address when no host bounce buffer
+ * was used; copy the head bytes down to the header. */
+ if (!tidsendc->mqreq->cuda_hostbuf_used) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidsendc->tid_list.tsess_unaligned_start,
+ cudaMemcpyDeviceToHost);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidsendc->tid_list.tsess_unaligned_start);
+
+ }
+
+ if (tidsendc->tid_list.tsess_unaligned_end) {
+ dst = (uint8_t *)&scb->ips_lrh.exp_uend;
+ src = (uint8_t *)tidsendc->userbuf +
+ tidsendc->length -
+ tidsendc->tid_list.tsess_unaligned_end;
+#ifdef PSM_CUDA
+ if (!tidsendc->mqreq->cuda_hostbuf_used) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidsendc->tid_list.tsess_unaligned_end,
+ cudaMemcpyDeviceToHost);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidsendc->tid_list.tsess_unaligned_end);
+ }
+ /*
+ * If the number of fragments is greater then one and
+ * "no header suppression" flag is unset then we go
+ * ahead and suppress the header */
+ if ((scb->nfrag > 1) && (!is_payload_per_frag_leq_8dw))
+ scb->flags |= IPS_SEND_FLAG_HDRSUPP;
+ else
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ tidsendc->is_complete = 1;
+ } else {
+ /* Do not suppress header every hdr_pkt_interval */
+ if ((++tidsendc->frame_send %
+ protoexp->hdr_pkt_interval) == 0)
+ /* Request an ACK */
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ else {
+ if (!is_payload_per_frag_leq_8dw) {
+ /* Request hdr supp */
+ scb->flags |= IPS_SEND_FLAG_HDRSUPP;
+ }
+ }
+ /* assert only single packet per scb */
+ psmi_assert(scb->nfrag == 1);
+ }
+
+#ifdef PSM_CUDA
+ if (tidsendc->mqreq->is_buf_gpu_mem && /* request's buffer comes from GPU realm */
+ !tidsendc->mqreq->cuda_hostbuf_used) { /* and it was NOT moved to HOST memory */
+ scb->mq_req = tidsendc->mqreq; /* so let's mark it per scb, not to check its locality again */
+ }
+#endif
+
+ return scb;
+}
+
+/*
+ * Returns:
+ *
+ * PSM2_OK: scb was allocated for at least one frame, the packet may be queued
+ * or actually sent.
+ *
+ * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ * to be enqueued before polling receive queue.
+ *
+ * PSM2_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ * scbs become available.
+ *
+ * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+static
+psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+ ips_scb_t *scb = NULL;
+ psm2_error_t err = PSM2_OK, err_f;
+ struct ips_protoexp *protoexp = tidsendc->protoexp;
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_flow *flow = &tidsendc->tidflow;
+
+#ifdef PSM_CUDA
+ /* Before sending, make sure every CUDA device-to-host copy backing
+ * this descriptor has completed; otherwise bail out with
+ * PSM2_OK_NO_PROGRESS so the timer retries later. */
+ struct ips_cuda_hostbuf *chb, *chb_next;
+ cudaError_t chb_status;
+ uint32_t offset_in_chb, i;
+ for (i = 0; i < tidsendc->cuda_num_buf; i++) {
+ chb = tidsendc->cuda_hostbuf[i];
+ if (chb) {
+ PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status);
+ if (chb_status != cudaSuccess) {
+ err = PSM2_OK_NO_PROGRESS;
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+ }
+ }
+
+ if (tidsendc->cuda_num_buf == 2) {
+ chb = tidsendc->cuda_hostbuf[0];
+ chb_next = tidsendc->cuda_hostbuf[1];
+ offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset;
+ /* Copying data from multiple cuda
+ * host buffers into a bounce buffer.
+ */
+ memcpy(tidsendc->buffer, chb->host_buf +
+ offset_in_chb, chb->size-offset_in_chb);
+ memcpy(tidsendc->buffer+ chb->size -
+ offset_in_chb, chb_next->host_buf,
+ tidsendc->tid_list.tsess_srcoff +
+ tidsendc->tid_list.tsess_length - chb_next->offset);
+
+ chb->bytes_read += chb->size - offset_in_chb;
+ chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff +
+ tidsendc->tid_list.tsess_length -
+ chb_next->offset;
+ /* Recycle (or free, for force-allocated temps) each host
+ * buffer once every byte in it has been consumed, then let
+ * the prefetcher reuse the freed capacity. */
+ if(chb->bytes_read == chb->size) {
+ STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb,
+ ips_cuda_hostbuf, req_next);
+ if (chb->is_tempbuf)
+ psmi_deallocate_chb(chb);
+ else {
+ chb->req = NULL;
+ chb->offset = 0;
+ chb->bytes_read = 0;
+ psmi_mpool_put(chb);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ if(chb_next->bytes_read == chb_next->size) {
+ STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next,
+ ips_cuda_hostbuf, req_next);
+ if (chb_next->is_tempbuf)
+ psmi_deallocate_chb(chb_next);
+ else{
+ chb_next->req = NULL;
+ chb_next->offset = 0;
+ chb_next->bytes_read = 0;
+ psmi_mpool_put(chb_next);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ }
+#endif
+ /*
+ * We aggressively try to grab as many scbs as possible, enqueue them to a
+ * flow and flush them when either we're out of scbs our we've completely
+ * filled the send request.
+ */
+ while (!tidsendc->is_complete) {
+ if_pf(tidsendc->tid_list.tsess_tidcount &&
+ (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount ||
+ tidsendc->tid_idx < 0))
+ ips_expsend_tiderr(tidsendc);
+
+ if ((scb =
+ ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) {
+ proto->stats.scb_exp_unavail_cnt++;
+ err = PSM2_EP_NO_RESOURCES;
+ break;
+ } else {
+ ips_proto_flow_enqueue(flow, scb);
+ }
+ }
+
+ if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */
+ int num_sent;
+
+ err_f = flow->flush(flow, &num_sent);
+
+ if (err != PSM2_EP_NO_RESOURCES) {
+ /* PSM2_EP_NO_RESOURCES is reserved for out-of-scbs */
+ if (err_f == PSM2_EP_NO_RESOURCES)
+ err = PSM2_TIMEOUT; /* force a resend reschedule */
+ else if (err_f == PSM2_OK && num_sent > 0 &&
+ !ips_ptl_recvq_isempty(protoexp->ptl))
+ err = PSM2_OK_NO_PROGRESS; /* force a rcvhdrq service */
+ }
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+
+/* Timer callback servicing the pending expected-send queue: retry each
+ * queued send descriptor in order, removing completed ones.  On resource
+ * exhaustion or busy hardware the loop stops and the timer is re-armed
+ * (or left to the sendbuf-availability callback).  Always returns
+ * PSM2_OK; per-descriptor errors only control rescheduling.  The `timer`
+ * and `current` parameters follow the psmi_timer callback signature;
+ * only timer->context is used here. */
+static
+psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context;
+ struct ips_tid_send_pend *phead = &protoexp->pend_sendq;
+ struct ips_tid_send_desc *tidsendc;
+ psm2_error_t err = PSM2_OK;
+
+ while (!STAILQ_EMPTY(phead)) {
+ tidsendc = STAILQ_FIRST(phead);
+
+ err = ips_tid_send_exp(tidsendc);
+
+ if (tidsendc->is_complete)
+ STAILQ_REMOVE_HEAD(phead, next);
+
+ if (err == PSM2_OK) {
+ /* Was able to complete the send, keep going */
+ } else if (err == PSM2_EP_NO_RESOURCES) {
+ /* No more sendbufs available, sendbuf callback will requeue this
+ * timer */
+ break;
+ } else if (err == PSM2_TIMEOUT) {
+ /* Always a case of try later:
+ * On PIO flow, means no send pio bufs available
+ * On DMA flow, means kernel can't queue request or would have to block
+ */
+ psmi_timer_request(protoexp->proto->timerq,
+ &protoexp->timer_send,
+ get_cycles() +
+ protoexp->proto->timeout_send);
+ break;
+ } else {
+ /* Forced to reschedule later so we can check receive queue */
+ psmi_assert(err == PSM2_OK_NO_PROGRESS);
+ psmi_timer_request(protoexp->proto->timerq,
+ &protoexp->timer_send,
+ PSMI_TIMER_PRIO_1);
+ break;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/* Right now, in the kernel we are allowing for virtually non-contiguous pages,
+ in a single call, and we are therefore locking one page at a time, but since
+ the intended use of this routine is for a single group of
+ virtually contiguous pages, that should change to improve
+ performance. That means possibly changing the calling MPI code.
+ Doing so gets rid of some of the loop stuff here, and in the driver,
+ and allows for a single call to the core VM code in the kernel,
+ rather than one per page, definitely improving performance. */
+
+static
+psm2_error_t
+ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ uint32_t nbytes_this)
+{
+ ips_tid_session_list *tid_list = &tidrecvc->tid_list;
+ uintptr_t bufptr = (uintptr_t) tidrecvc->buffer;
+ uint32_t size = nbytes_this;
+ psm2_error_t err = PSM2_OK;
+ uintptr_t pageaddr;
+ uint32_t tidoff, pageoff, pagelen, reglen, num_tids;
+
+ psmi_assert(size >= 4);
+
+ /*
+ * The following calculation does not work when size < 4
+ * and bufptr is byte aligned, it can get negative value.
+ */
+ tid_list->tsess_unaligned_start = (bufptr & 3) ? (4 - (bufptr & 3)) : 0;
+ size -= tid_list->tsess_unaligned_start;
+ bufptr += tid_list->tsess_unaligned_start;
+
+ tid_list->tsess_unaligned_end = size & 3;
+ size -= tid_list->tsess_unaligned_end;
+
+ psmi_assert(size > 0);
+
+#ifdef PSM_CUDA
+ /* Driver pins GPU pages when using GPU Direct RDMA for TID receives;
+ * to accommodate this the calculations of pageaddr, pagelen
+ * and pageoff have been modified to take the GPU page size into
+ * consideration.
+ */
+ if (tidrecvc->is_ptr_gpu_backed) {
+ uint64_t page_mask = ~(PSMI_GPU_PAGESIZE -1);
+ uint32_t page_offset_mask = (PSMI_GPU_PAGESIZE -1);
+ pageaddr = bufptr & page_mask;
+ pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
+ ((bufptr + size - 1) & page_mask) -
+ (bufptr & page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
+ } else {
+ pageaddr = bufptr & protoexp->tid_page_mask;
+ pagelen = (uint32_t) (PSMI_PAGESIZE +
+ ((bufptr + size - 1) & protoexp->tid_page_mask) -
+ (bufptr & protoexp->tid_page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+ }
+#else
+ pageaddr = bufptr & protoexp->tid_page_mask;
+ pagelen = (uint32_t) (PSMI_PAGESIZE +
+ ((bufptr + size - 1) & protoexp->tid_page_mask) -
+ (bufptr & protoexp->tid_page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+#endif
+
+ /* Acquire tids for the page-aligned region; the cache path is used
+ * when the tid registration cache is enabled (tid_array non-NULL). */
+ reglen = pagelen;
+ if (protoexp->tidc.tid_array) {
+ if ((err = ips_tidcache_acquire(&protoexp->tidc,
+ (void *)pageaddr, &reglen,
+ (uint32_t *) tid_list->tsess_list, &num_tids,
+ &tidoff
+#ifdef PSM_CUDA
+ , tidrecvc->is_ptr_gpu_backed
+#endif
+ )))
+ goto fail;
+ } else {
+ if ((err = ips_tid_acquire(&protoexp->tidc,
+ (void *)pageaddr, &reglen,
+ (uint32_t *) tid_list->tsess_list, &num_tids
+#ifdef PSM_CUDA
+ , tidrecvc->is_ptr_gpu_backed
+#endif
+ )))
+ goto fail;
+ }
+
+ /*
+ * PSM2 currently provides storage space enough to hold up to
+ * 1024 tids. (PSM_TIDLIST_BUFSIZE). So, make sure we
+ * don't get more than what we can hold from the tidcache here.
+ *
+ * The reason for 1024 tids comes from the PSM_TID_WINSIZE value
+ * (currently 4MB. So, if in future, there is a change to this macro,
+ * then you would need a change to PSM_TIDLIST_BUFSIZE as well).
+ *
+ * Assuming a 4KB page size, to be able to receive
+ * a message of 4MB size, we'd need a maximum of 4MB/4KB = 1024 tids.
+ */
+ psmi_assert(num_tids > 0);
+ psmi_assert(num_tids <= (PSM_TID_WINSIZE/PSM_TIDLIST_BUFSIZE));
+ if (reglen > pagelen) {
+ err = psmi_handle_error(protoexp->tidc.context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "PSM tid registration: "
+ "register more pages than asked");
+ goto fail;
+ } else if (reglen < pagelen) {
+ /*
+ * driver registered less pages, update PSM records.
+ */
+ tid_list->tsess_unaligned_end = 0;
+ tidrecvc->recv_tidbytes = reglen - pageoff;
+ tidrecvc->recv_msglen = tid_list->tsess_unaligned_start +
+ tidrecvc->recv_tidbytes;
+ } else {
+ tidrecvc->recv_tidbytes = size;
+ tidrecvc->recv_msglen = nbytes_this;
+ }
+
+ tid_list->tsess_tidcount = num_tids;
+ tid_list->tsess_tidoffset = tidoff;
+
+ ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);
+
+fail:
+ return err;
+}
+
+/*
+ * Allocate every resource needed to receive one rendezvous window:
+ * a grant scb, a complete scb, a tidflow entry, (with PSM_CUDA) an
+ * optional host bounce buffer, and finally the TIDs themselves.
+ * All-or-nothing: on any failure every resource acquired so far is
+ * released and the operation is rescheduled via the getreqs timer.
+ * On success the fully initialized descriptor is returned through
+ * *ptidrecvc.
+ */
+static
+psm2_error_t
+ips_tid_recv_alloc(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr,
+ const struct ips_tid_get_request *getreq,
+ uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc)
+{
+ psm2_error_t err;
+ ips_scb_t *grantscb, *completescb;
+ struct ips_tid_recv_desc *tidrecvc;
+
+ PSM2_LOG_MSG("entering");
+ /* Allocate all necessary resources. */
+
+ /* 1. allocate a tid grant scb. */
+ grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+ if (grantscb == NULL) {
+ /* ips_tid_scbavail_callback() will reschedule */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* 2. allocate a tid complete scb. */
+ completescb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+ if (completescb == NULL) {
+ ips_scbctrl_free(grantscb);
+ /* ips_tid_scbavail_callback() will reschedule */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* 3. allocate a tid flow entry. */
+ err = ips_tf_allocate(&protoexp->tfc, &tidrecvc);
+ if (err != PSM2_OK) {
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ /* Unable to get a tidflow for expected protocol. */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+
+#ifdef PSM_CUDA
+ psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext;
+
+ /* GPU Direct RDMA receive: target pointer is GPU memory only when no
+ * host bounce buffer is staged in between. */
+ if (req->is_buf_gpu_mem)
+ tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used;
+ else
+ tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem;
+
+ /* 4. allocate a cuda bounce buffer, if required */
+ struct ips_cuda_hostbuf *chb = NULL;
+ if (getreq->cuda_hostbuf_used) {
+ /* Try the small pool first for small windows, fall back to the
+ * regular pool. */
+ if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ protoexp->cuda_hostbuf_pool_small_recv);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ protoexp->cuda_hostbuf_pool_recv);
+ if (chb == NULL) {
+ /* Unable to get a cudahostbuf for TID.
+ * Release the resources we're holding and reschedule.*/
+ ips_tf_deallocate(&protoexp->tfc,
+ tidrecvc->rdescid._desc_idx);
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ tidrecvc->cuda_hostbuf = chb;
+ tidrecvc->buffer = chb->host_buf;
+ chb->size = 0;
+ chb->gpu_buf = (void *)((uintptr_t) getreq->tidgr_lbuf +
+ getreq->tidgr_offset);
+ } else {
+ chb = NULL;
+ tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf +
+ getreq->tidgr_offset);
+ tidrecvc->cuda_hostbuf = NULL;
+ }
+#else
+ tidrecvc->buffer =
+ (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset);
+#endif
+
+ /* 5. allocate some tids from driver. */
+ err = ips_tid_recv_alloc_frag(protoexp, tidrecvc, nbytes_this);
+ if (err != PSM2_OK) {
+#ifdef PSM_CUDA
+ if (chb)
+ psmi_mpool_put(chb);
+#endif
+ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ /* Unable to register tids */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+
+ /* TID_DEBUG bookkeeping: mark every acquired TID as used and record
+ * which descriptor owns it, asserting it was previously free. */
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int num_tids = tidrecvc->tid_list.tsess_tidcount;
+ int tid, i;
+ for (i = 0; i < num_tids; i++) {
+ tid =
+ IPS_TIDINFO_GET_TID(tidrecvc->tid_list.
+ tsess_list[i]) * 2 +
+ IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list.
+ tsess_list[i]) - 1;
+ psmi_assert(protoexp->tid_info[tid].state ==
+ TIDSTATE_FREE);
+ psmi_assert(protoexp->tid_info[tid].tidrecvc == NULL);
+ psmi_assert(protoexp->tid_info[tid].tid == 0xFFFFFFFF);
+ protoexp->tid_info[tid].state = TIDSTATE_USED;
+ protoexp->tid_info[tid].tidrecvc = tidrecvc;
+ protoexp->tid_info[tid].tid =
+ tidrecvc->tid_list.tsess_list[i];
+ }
+ }
+
+ /* Initialize recv descriptor */
+ tidrecvc->ipsaddr = ipsaddr;
+ tidrecvc->getreq = (struct ips_tid_get_request *)getreq;
+
+ /* Initialize tidflow, instead calling generic routine:
+ ips_flow_init(&tidrecvc->tidflow, protoexp->proto, ipsaddr,
+ protoexp->ctrl_xfer_type, PSM_PROTOCOL_TIDFLOW,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW);
+ * only reset following necessary field. */
+ tidrecvc->tidflow.ipsaddr = ipsaddr;
+ tidrecvc->tidflow.flags = 0;
+
+ tidrecvc->tidflow_nswap_gen = 0;
+ tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen;
+ tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero),
+ in order to prevent wraparound sequence numbers */
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+
+ tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset;
+ tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen;
+
+ tidrecvc->ctrl_msg_queued = 0;
+ tidrecvc->state = TIDRECVC_STATE_BUSY;
+
+ tidrecvc->stats.nSeqErr = 0;
+ tidrecvc->stats.nGenErr = 0;
+ tidrecvc->stats.nReXmit = 0;
+ tidrecvc->stats.nErrChkReceived = 0;
+
+ /* This gets sent out as a control message, so we need to force 4-byte IB
+ * alignment */
+ tidrecvc->tsess_tidlist_length = (uint16_t)
+ PSMI_ALIGNUP((sizeof(ips_tid_session_list) +
+ (tidrecvc->tid_list.tsess_tidcount *
+ sizeof(uint32_t))), 4);
+
+ _HFI_EXP("alloc tidrecv=%d, paylen=%d, ntid=%d\n",
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tsess_tidlist_length,
+ tidrecvc->tid_list.tsess_tidcount);
+
+ tidrecvc->grantscb = grantscb;
+ tidrecvc->completescb = completescb;
+
+ *ptidrecvc = tidrecvc; /* return to caller */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Timer callback that drains the pending TID-get request queue: for each
+ * queued request it carves out the next rendezvous window, allocates the
+ * receive resources (ips_tid_recv_alloc) and issues the TID grant (CTS).
+ * With PSM_CUDA it first reclaims idle host bounce buffers and retires
+ * requests whose device copies have completed.  Requests that cannot make
+ * progress remain queued; a later callback (scb/tid/tidflow free) resumes
+ * them.  'current' is unused here.
+ */
+static
+psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+ struct ips_tid_get_pend *phead =
+ &((struct ips_protoexp *)timer->context)->pend_getreqsq;
+ struct ips_protoexp *protoexp;
+ struct ips_tid_get_request *getreq;
+ struct ips_tid_recv_desc *tidrecvc;
+ ips_epaddr_t *ipsaddr;
+ uint32_t nbytes_this, count;
+ int ret;
+
+ PSM2_LOG_MSG("entering");
+
+#ifdef PSM_CUDA
+ if (!(((struct ips_protoexp *)timer->context)->proto->flags
+ & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) ||
+ ((((struct ips_protoexp *)timer->context)->proto->flags &
+ IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) &&
+ gpudirect_recv_threshold)) {
+ /* Before processing pending TID requests, first try to free up
+ * any CUDA host buffers that are now idle. */
+ struct ips_tid_get_cudapend *cphead =
+ &((struct ips_protoexp *)timer->context)->cudapend_getreqsq;
+ psm2_error_t err;
+
+ /* See if any CUDA memcpys are in progress. Grab the first getreq... */
+ while (!STAILQ_EMPTY(cphead)) {
+ getreq = STAILQ_FIRST(cphead);
+
+ err = psmi_cuda_reclaim_hostbufs(getreq);
+ if (err == PSM2_OK_NO_PROGRESS)
+ goto cudapend_exit;
+
+ /* This pending cuda getreq has no more CUDA ops queued up.
+ * Either it's completely done, or the CUDA copies have caught
+ * up with the TID data xfer, but the TID xfer itself is not
+ * finished.
+ */
+ if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) {
+ /* TID xfer is done.
+ * We should only get here if:
+ * this involved a cuda copy, and
+ * the TID xfer is done.
+ */
+ psmi_assert(getreq->cuda_hostbuf_used);
+ psmi_assert(getreq->tidgr_length ==
+ getreq->tidgr_offset);
+
+ /* Remove from the cudapend list, and reclaim */
+ getreq->tidgr_protoexp = NULL;
+ getreq->tidgr_epaddr = NULL;
+ STAILQ_REMOVE_HEAD(cphead, tidgr_next);
+
+ /* mark the req as done */
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+ } else
+ break; /* CUDA xfers in progress. Leave. */
+ }
+ }
+cudapend_exit:
+#endif
+
+ while (!STAILQ_EMPTY(phead)) {
+ getreq = STAILQ_FIRST(phead);
+ ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr);
+ count = ipsaddr->msgctl->ipsaddr_count;
+
+ipsaddr_next:
+ /* Round-robin over the rails (ipsaddrs) of this message control
+ * group; 'count' bounds the number of rails tried per request. */
+ ipsaddr = ipsaddr->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp;
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid];
+ if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) {
+ break; /* skip sending next CTS */
+ }
+ }
+
+#ifdef PSM_CUDA
+ if (getreq->cuda_hostbuf_used) {
+ /* If this is a large transfer, we may be able to
+ * start reclaiming before all of the data is sent. */
+ psmi_cuda_reclaim_hostbufs(getreq);
+ }
+#endif
+ /*
+ * Calculate the next window size, avoid the last
+ * window too small.
+ */
+ nbytes_this = getreq->tidgr_length - getreq->tidgr_offset;
+ if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz)
+ nbytes_this = getreq->tidgr_rndv_winsz;
+ else if (nbytes_this > getreq->tidgr_rndv_winsz)
+ nbytes_this /= 2;
+
+ /*
+ * If there is a next window and the next window
+ * length is greater than PAGESIZE, make sure the window
+ * starts on a page boundary.
+ */
+#ifdef PSM_CUDA
+ psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext;
+ if (req->is_buf_gpu_mem){
+ if (((getreq->tidgr_offset + nbytes_this) <
+ getreq->tidgr_length) &&
+ nbytes_this > PSMI_GPU_PAGESIZE) {
+ uint32_t pageoff =
+ (((uintptr_t)getreq->tidgr_lbuf) &
+ (PSMI_GPU_PAGESIZE - 1)) +
+ getreq->tidgr_offset + nbytes_this;
+ nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
+ }
+ } else {
+#endif
+ if ((getreq->tidgr_offset + nbytes_this) <
+ getreq->tidgr_length &&
+ nbytes_this > PSMI_PAGESIZE) {
+ uint32_t pageoff =
+ (((uintptr_t)getreq->tidgr_lbuf) &
+ (PSMI_PAGESIZE - 1)) +
+ getreq->tidgr_offset + nbytes_this;
+ nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
+ }
+#ifdef PSM_CUDA
+ }
+#endif
+
+ psmi_assert(nbytes_this >= 4);
+ psmi_assert(nbytes_this <= PSM_TID_WINSIZE);
+
+ if ((ret = ips_tid_num_available(&protoexp->tidc)) <= 0) {
+ /* We're out of tids. If this process used all the resource,
+ * the free callback will reschedule the operation, otherwise,
+ * we reschedule it here */
+ if (ret == 0)
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) {
+ /* We're out of tidflow. If this process used all the resource,
+ * the free callback will reschedule the operation, otherwise,
+ * we reschedule it here */
+ if (ret == 0)
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else if (ips_tid_recv_alloc(protoexp, ipsaddr,
+ getreq, nbytes_this, &tidrecvc) == PSM2_OK) {
+ ips_protoexp_send_tid_grant(tidrecvc);
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ /*
+ * Once the CTS was sent, we mark it per 'flow' object
+ * not to proceed with next CTSes until that one is done.
+ */
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ flow->flags |= IPS_FLOW_FLAG_SKIP_CTS;
+ }
+
+ /*
+ * nbytes_this is the asked length for this session,
+ * ips_tid_recv_alloc() might register less pages, the
+ * real length is in tidrecvc->recv_msglen.
+ */
+ getreq->tidgr_offset += tidrecvc->recv_msglen;
+ psmi_assert(getreq->tidgr_offset <=
+ getreq->tidgr_length);
+ _HFI_VDBG("GRANT tididx=%d srcoff=%d nbytes=%d/%d\n",
+ tidrecvc->rdescid._desc_idx,
+ getreq->tidgr_offset, tidrecvc->recv_msglen,
+ getreq->tidgr_length);
+
+ if (getreq->tidgr_offset == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+ if (getreq->cuda_hostbuf_used) {
+ /* this completes the tid xfer setup.
+ move to the pending cuda ops queue,
+ set the timer to catch completion */
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ STAILQ_INSERT_TAIL(
+ &getreq->tidgr_protoexp->cudapend_getreqsq,
+ getreq, tidgr_next);
+ psmi_timer_request(getreq->tidgr_protoexp->timerq,
+ &getreq->tidgr_protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ continue;
+ }
+#endif
+ getreq->tidgr_protoexp = NULL;
+ getreq->tidgr_epaddr = NULL;
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ continue; /* try next grant request */
+ }
+ else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) {
+ /* In case of multi rail, PSM sends one CTS per request
+ * per card after which the request is moved to the end
+ * of the queue.
+ */
+ count--;
+ if (count)
+ goto ipsaddr_next;
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ STAILQ_INSERT_TAIL(phead, getreq ,tidgr_next);
+ continue;
+ }
+
+ /* created a tidrecvc, reset count */
+ count = ipsaddr->msgctl->ipsaddr_count;
+ goto ipsaddr_next; /* try next fragment on next ipsaddr */
+ }
+
+ /*
+ * We need to loop until we can't get a tidrecvc on all
+ * ipsaddrs, then the callbacks on the home protoexp where
+ * getreq is linked can resume this routine. Otherwise, we
+ * might make this getreq to be orphaned and cause deadlock.
+ */
+ count--;
+ if (count)
+ goto ipsaddr_next;
+ break;
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK; /* XXX err-broken */
+}
+
+#ifdef PSM_CUDA
+/*
+ * Queue an asynchronous copy of the received TID payload (including the
+ * unaligned head/tail bytes) from the host bounce buffer to the
+ * destination GPU buffer, record a CUDA event to detect completion, and
+ * move the bounce buffer onto the getreq's pending-copy list.  Finally
+ * kicks the pendtids callback directly so the window freed here can be
+ * granted again without waiting for the next timer tick.
+ */
+static
+void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc)
+{
+ struct ips_protoexp *protoexp = tidrecvc->protoexp;
+ struct ips_cuda_hostbuf *chb;
+
+ chb = tidrecvc->cuda_hostbuf;
+ chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
+ tidrecvc->tid_list.tsess_unaligned_end;
+
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->gpu_buf, chb->host_buf,
+ tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
+ tidrecvc->tid_list.tsess_unaligned_end,
+ cudaMemcpyHostToDevice,
+ protoexp->cudastream_recv);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ protoexp->cudastream_recv);
+
+ STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next);
+ tidrecvc->cuda_hostbuf = NULL;
+ ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0);
+}
+#endif
+
+/*
+ * Tear down a completed TID receive descriptor: release its TIDs (via
+ * cache or directly), update the owning request's byte accounting, free
+ * the tidflow, invoke the request's completion callback when the whole
+ * transfer is done (CUDA completion is deferred to the async-copy path),
+ * and reschedule any requests still waiting on TID resources.
+ */
+static
+psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc)
+{
+ struct ips_protoexp *protoexp = tidrecvc->protoexp;
+ struct ips_tid_get_request *getreq = tidrecvc->getreq;
+ int tidcount = tidrecvc->tid_list.tsess_tidcount;
+ psm2_error_t err = PSM2_OK;
+
+ psmi_assert(getreq != NULL);
+ psmi_assert(tidcount > 0);
+ psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+#ifdef PSM_CUDA
+ /* Payload landed in a host bounce buffer; start the async copy back
+ * to the GPU before releasing the TIDs. */
+ if (tidrecvc->cuda_hostbuf)
+ psmi_cudamemcpy_tid_to_device(tidrecvc);
+#endif
+
+ /* TID_DEBUG bookkeeping: verify this descriptor owned each TID, then
+ * mark it free again (mirror of the marking in ips_tid_recv_alloc). */
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int tid, i;
+
+ for (i = 0; i < tidcount; i++) {
+ tid =
+ IPS_TIDINFO_GET_TID(tidrecvc->tid_list.
+ tsess_list[i]) * 2 +
+ IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list.
+ tsess_list[i]) - 1;
+ psmi_assert(protoexp->tid_info[tid].state ==
+ TIDSTATE_USED);
+ psmi_assert(protoexp->tid_info[tid].tidrecvc ==
+ tidrecvc);
+ psmi_assert(protoexp->tid_info[tid].tid ==
+ tidrecvc->tid_list.tsess_list[i]);
+ protoexp->tid_info[tid].state = TIDSTATE_FREE;
+ protoexp->tid_info[tid].tidrecvc = NULL;
+ protoexp->tid_info[tid].tid = 0xFFFFFFFF;
+ }
+ }
+
+ ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ",
+ tidrecvc->tid_list.tsess_tidcount);
+
+ if (protoexp->tidc.tid_array) {
+ if ((err = ips_tidcache_release(&protoexp->tidc,
+ tidrecvc->tid_list.tsess_list, tidcount)))
+ goto fail;
+ } else {
+ if ((err = ips_tid_release(&protoexp->tidc,
+ tidrecvc->tid_list.tsess_list, tidcount)))
+ goto fail;
+ }
+
+ getreq->tidgr_bytesdone += tidrecvc->recv_msglen;
+
+ _HFI_EXP("req=%p bytes=%d/%d\n",
+ getreq->tidgr_ucontext,
+ getreq->tidgr_bytesdone, getreq->tidgr_length);
+
+ tidrecvc->state = TIDRECVC_STATE_FREE;
+
+ /* finally free the tidflow */
+ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+
+ if (getreq->tidgr_bytesdone == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+ /* if cuda, we handle callbacks when the cuda xfer is done */
+ if (!getreq->cuda_hostbuf_used) {
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+ }
+#else
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+#endif
+ } else {
+ /* We just released some tids.
+ * If requests are waiting on tids to be
+ * freed, queue up the timer */
+ if (getreq->tidgr_offset < getreq->tidgr_length) {
+ ips_tid_pendtids_timer_callback(&getreq->
+ tidgr_protoexp->
+ timer_getreqs, 0);
+ }
+ }
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+
+fail:
+ return err;
+}
+
+/*
+ * RHF TID-error handler, active only when IPS_PROTOEXP_FLAG_TID_DEBUG is
+ * set: decodes the TID pair/tidctrl named in the packet's KDETH header,
+ * cross-checks it against the per-TID debug state table, and logs any
+ * inconsistency (unknown TID, wrong owner, wrong generation).  Purely
+ * diagnostic; no recovery is attempted here.
+ */
+void
+ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_tid_recv_desc *tidrecvc;
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+
+ ptl_arg_t desc_id;
+ int tidpair = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK;
+ int tidctrl = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TIDCTRL_SHIFT) & HFI_KHDR_TIDCTRL_MASK;
+ int tid0, tid1, tid;
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Expected sends not enabled */
+ if (protoexp == NULL)
+ return;
+
+ /* Not doing extra tid debugging or not really a tiderr */
+ if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) ||
+ !(rcv_ev->error_flags & HFI_RHF_TIDERR))
+ return;
+
+ if (rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) {
+ _HFI_ERROR("receive type %d is not "
+ "expected in tid debugging\n", rcv_ev->ptype);
+ return;
+ }
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ /* tidctrl == 3 names both TIDs of the pair; otherwise a single TID. */
+ if (tidctrl != 3)
+ tid0 = tid1 = tidpair * 2 + tidctrl - 1;
+ else {
+ tid0 = tidpair * 2;
+ tid1 = tid0 + 1;
+ }
+
+ for (tid = tid0; tid <= tid1; tid++) {
+ if (protoexp->tid_info[tid].state == TIDSTATE_USED)
+ continue;
+
+ char buf[128];
+ char *s = "invalid (not even in table)";
+
+ if (tidrecvc->rdescid._desc_genc ==
+ desc_id._desc_genc)
+ s = "valid";
+ else {
+ snprintf(buf, sizeof(buf) - 1,
+ "wrong generation (gen=%d,received=%d)",
+ tidrecvc->rdescid._desc_genc,
+ desc_id._desc_genc);
+ buf[sizeof(buf) - 1] = '\0';
+ s = buf;
+ }
+
+ if (protoexp->tid_info[tid].tidrecvc != tidrecvc) {
+ _HFI_ERROR
+ ("tid %d not a known member of tidsess %d\n",
+ tid, desc_id._desc_idx);
+ }
+
+ _HFI_ERROR("tid %d is marked unused (session=%d): %s\n", tid,
+ desc_id._desc_idx, s);
+ }
+ return;
+}
+
+/*
+ * Handle an RHF data error on an expected-TID packet.  Header-length
+ * errors are only logged.  Otherwise the packet is matched against its
+ * receive descriptor: stale generation or an already-completed rendezvous
+ * is dropped quietly; an error within the currently active generation
+ * triggers tidflow recovery via ips_protoexp_do_tf_seqerr(), while a
+ * generation mismatch is merely logged (NAK already in flight).
+ */
+void
+ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_tid_recv_desc *tidrecvc;
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ int hdr_err = rcv_ev->error_flags & HFI_RHF_KHDRLENERR;
+ uint8_t op_code = _get_proto_hfi_opcode(p_hdr);
+ char pktmsg[128];
+ char errmsg[256];
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Expected sends not enabled */
+ if (protoexp == NULL)
+ return;
+
+ ips_proto_get_rhf_errstring(rcv_ev->error_flags, pktmsg,
+ sizeof(pktmsg));
+
+ snprintf(errmsg, sizeof(errmsg),
+ "%s pkt type opcode 0x%x at hd=0x%x %s\n",
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "Eager" :
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "Expected" :
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "Non-kd" :
+ "<Error>", op_code, rcv_ev->recvq->state->hdrq_head, pktmsg);
+
+ if (!hdr_err) {
+ ptl_arg_t desc_id;
+ psmi_seqnum_t sequence_num;
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ /* Print this at very verbose level. Noisy links can have a few of
+ * these! */
+ _HFI_VDBG
+ ("Data Error Pkt and Recv Generation Mismatch: %s",
+ errmsg);
+ return; /* skip */
+ }
+
+ if (tidrecvc->state == TIDRECVC_STATE_FREE) {
+ _HFI_EPDBG
+ ("Data Error Pkt for a Completed Rendezvous: %s",
+ errmsg);
+ return; /* skip */
+ }
+
+ /* See if CRC error for a previous packet */
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+ if (sequence_num.psn_gen == tidrecvc->tidflow_genseq.psn_gen) {
+ /* Try to recover the flow by restarting from previous known good
+ * sequence (possible if the packet with CRC error is after the "known
+ * good PSN" else we can't restart the flow.
+ */
+ return ips_protoexp_do_tf_seqerr(protoexp,
+ tidrecvc, p_hdr);
+ } else {
+ /* Print this at very verbose level */
+ _HFI_VDBG
+ ("Data Error Packet. GenMismatch: Yes. Tidrecvc: %p. "
+ "Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n",
+ tidrecvc, sequence_num.psn_gen,
+ sequence_num.psn_seq,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq, errmsg);
+ }
+
+ } else {
+ _HFI_VDBG("HDR_ERROR: %s\n", errmsg);
+ }
+
+}
+
+psm2_error_t
+ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc)
+{
+ const uint32_t flowidx = tidrecvc->rdescid._desc_idx;
+
+ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+ /* Obtain the next generation number for this tidflow slot. */
+ ips_tfgen_allocate(&tidrecvc->protoexp->tfc, flowidx,
+ &tidrecvc->tidflow_active_gen);
+
+ /* Mirror the new generation into our expected gen/seq state and into
+ * the hardware tidflow table (sequence number is left as-is). */
+ tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen;
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl, flowidx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+
+ /* Account for one more generation swap on this flow. */
+ tidrecvc->tidflow_nswap_gen++;
+
+ return PSM2_OK;
+}
+
+void
+ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t descid;
+
+ psmi_assert_always(protoexp != NULL);
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Locate the receive descriptor named by the packet header. */
+ descid._desc_idx = ips_proto_flowid(p_hdr);
+ descid._desc_genc = p_hdr->exp_rdescid_genc;
+ tidrecvc = &protoexp->tfc.tidrecvc[descid._desc_idx];
+
+ /* Stale generation or no longer an active rendezvous: drop quietly. */
+ if (tidrecvc->rdescid._desc_genc != descid._desc_genc)
+ return;
+ if (tidrecvc->state != TIDRECVC_STATE_BUSY)
+ return;
+
+ ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr);
+}
+
+/*
+ * Recover a tidflow after a sequence error: swap the flow to a new
+ * generation, optionally request congestion control (BECN) when the flow
+ * has swapped generations repeatedly, and NAK the sender with the last
+ * known-good sequence number so it can restart from there.  Only errors
+ * for the currently active generation are acted upon.
+ */
+static
+void ips_protoexp_do_tf_seqerr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr)
+{
+ psmi_seqnum_t sequence_num, tf_sequence_num;
+ ips_scb_t ctrlscb;
+
+ /* Update stats for sequence errors */
+ tidrecvc->stats.nSeqErr++;
+
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ /* Only care about sequence error for currently active generation */
+ if (tidrecvc->tidflow_active_gen != sequence_num.psn_gen)
+ return;
+
+ /* If a "large" number of swapped generation we are losing packets
+ * for this flow. Request throttling of tidflow by generating a
+ * BECN. With header suppression we will miss some FECN packet
+ * on OPA hence keeping track of swapped generation is another
+ * mechanism to do congestion control for tidflows.
+ *
+ * For mismatched sender/receiver/link speeds we can get into a
+ * deadly embrace where minimal progress is made due to generation
+ * mismatch errors. This can occur if we wrap around the generation
+ * count without making progress. Hence in cases where the swapped
+ * generation count is > 254 stop sending BECN (and the NAK) so the
+ * send -> receiver pipeline is flushed with an error check and things
+ * can sync up. This should be an extremely rare event.
+ */
+
+ if_pf(tidrecvc->tidflow_nswap_gen >= 254)
+ return; /* Do not send NAK. Let error check kick in. */
+
+ if_pf((tidrecvc->tidflow_nswap_gen > 4) &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+ _HFI_CCADBG("Generating BECN. Number of swapped gen: %d.\n",
+ tidrecvc->tidflow_nswap_gen);
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ /* Update stats for congestion encountered */
+ protoexp->proto->epaddr_stats.congestion_pkts++;
+ }
+
+ /* Get the latest seq from hardware tidflow table, if that value is
+ * reliable. The value is not reliable if context sharing is used,
+ * because context sharing might drop packet even though hardware
+ * has received it successfully. The hardware table may also be
+ * incorrect if RSM is intercepting TID & FECN & SH packets.
+ * We can handle this condition by taking the most recent PSN whether
+ * it comes from the tidflow table or from PSM's own accounting.
+ */
+ if (!tidrecvc->context->tf_ctrl) {
+ tf_sequence_num.psn_val =
+ hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+ /* Compare like fields: psn_seq against psn_seq (the original
+ * compared the whole psn_val word with the seq bitfield, which
+ * is only correct while the non-seq bits happen to be zero). */
+ if (tf_sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+ }
+
+ /* Swap generation for the flow. */
+ ips_protoexp_flow_newgen(tidrecvc);
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid;
+ /* Keep peer generation but use my last received sequence */
+ sequence_num.psn_seq = tidrecvc->tidflow_genseq.psn_seq;
+ ctrlscb.ips_lrh.ack_seq_num = sequence_num.psn_val;
+
+ /* My new generation and last received sequence */
+ ctrlscb.ips_lrh.data[1].u32w0 = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ OPCODE_NAK,
+ &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+
+ /* Update stats for retransmit */
+ tidrecvc->stats.nReXmit++;
+
+ return;
+}
+
+void
+ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t descid;
+
+ psmi_assert_always(protoexp != NULL);
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* A generation error means our NAK crossed on the wire or this packet
+ * is stale; error recovery will resynchronize things, so any packet
+ * that does not match a live descriptor is simply dropped. */
+ descid._desc_idx = ips_proto_flowid(p_hdr);
+ descid._desc_genc = p_hdr->exp_rdescid_genc;
+ tidrecvc = &protoexp->tfc.tidrecvc[descid._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != descid._desc_genc)
+ return;
+ if (tidrecvc->state != TIDRECVC_STATE_BUSY)
+ return;
+
+ ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr);
+}
+
+static
+void ips_protoexp_do_tf_generr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr)
+{
+ /* Bookkeeping only: count the generation error. If the packet faced
+ * congestion we may eventually want to generate a CN packet here to
+ * rate control the sender. */
+ tidrecvc->stats.nGenErr++;
+}
diff --git a/ptl_ips/ips_proto_header.h b/ptl_ips/ips_proto_header.h
new file mode 100644
index 0000000..6677162
--- /dev/null
+++ b/ptl_ips/ips_proto_header.h
@@ -0,0 +1,181 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HEADER_H
+#define _IPS_PROTO_HEADER_H
+
+/* Although defined as macros, the *_BITS values below are NOT meant to be
+ changed. They are defined this way so that their values are written in
+ exactly one place. These macros are used in struct ips_message_header
+ below, as well as in the active messages code for the purpose of
+ establishing how many arguments/handlers are supported, and to assert that
+ values written into the header fields are not too large for the number of
+ bits available. The preprocessor check below ensures less than 32 bits are
+ used.
+ */
+
+/* Number of bits to use for the amhdr_len field. */
+#define IPS_AM_HDR_LEN_BITS 4
+
+/* Number of bits to use for the amhdr_hidx field. Bounds the number of
+ * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */
+#define IPS_AM_HDR_HIDX_BITS 8
+
+/* Number of bits to use for the amhdr_nargs field. Bounds the number of
+   arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */
+#define IPS_AM_HDR_NARGS_BITS 4
+
+/* The three AM fields above share one 32-bit header word (see the amhdr_*
+   bitfields in struct ips_message_header); enforce that at compile time. */
+#if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32
+#error "Bad IPS header definition: AM fields must use 32 bits or less"
+#endif
+
+/* Number of AM arguments that can be packed into struct ips_message_header.
+   Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in
+   the data payload. */
+#define IPS_AM_HDR_NARGS \
+ (sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t))
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    OPA words contain LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol)
+ *    IPS hardware words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol)
+ *    IPS user words contain ips-protocol-specific data
+ *
+ * The header message size is computed as IWORDS + HWORDS + UWORDS
+ */
+/*
+ * On-the-wire IPS message header: OPA LRH (big-endian) + BTH (big-endian)
+ * followed by host-byte-order KDETH and ips-protocol words.  The trailing
+ * union overlays three interpretations of the same words: generic
+ * control/AM data, message-header packets, and expected-TID packets.
+ */
+struct ips_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+
+	/* fields below this point are in host byte order */
+	struct hfi_kdeth khdr;
+
+	struct {
+		__u32 flags:6;
+		__u32 connidx:26;	/* connection idx */
+	};
+
+	union {
+		struct {
+			struct {
+				__u32 ack_seq_num:31;
+				__u32 reserved:1;
+			};
+
+			union {
+				struct {	/* for active message */
+					__u32 amhdr_len:IPS_AM_HDR_LEN_BITS;
+					__u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS;
+					__u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS;
+				};
+				__u32 mdata;	/* for misc data */
+			};
+
+			/* Inline arguments and/or message payload */
+			union {
+				ptl_arg_t data[2];
+				__u32 uwords[4];
+			};
+		};
+
+		/* for message header packet only */
+		struct {
+			__u32 pad1;
+			__u32 tag[3];	/* 96 bits psm tag */
+			ptl_arg_t hdr_data;
+		};
+
+		/* for expected tid packet only */
+		struct {
+			__u8 exp_ustart[3];	/* unaligned start bytes */
+			__u8 exp_uend[3];	/* unaligned end bytes */
+			__u16 exp_rdescid_genc;	/* tidrecvc gen count */
+			ptl_arg_t exp_sdescid;	/* sender descriptor id */
+			__u32 exp_cksum;	/* optional checksum */
+			__u32 exp_offset;	/* packet offset */
+		};
+	};
+};
+
+/*
+ * OpCodes in BTH[0], 24-31 bits. Order is important!!!
+ * Packet dispatch indexes ips_packet_service_routine[] by
+ * (opcode - OPCODE_RESERVED), so the values must stay contiguous
+ * (see ips_proto_process_packet in ips_proto_help.h).
+ */
+#define OPCODE_RESERVED			0xC0	/* reserved */
+#define OPCODE_TINY			0xC1	/* 0 <= msglen <= 8 */
+#define OPCODE_SHORT			0xC2	/* 8 < msglen <= MTU */
+#define OPCODE_EAGER			0xC3	/* eager packet */
+#define OPCODE_LONG_RTS			0xC4	/* ready to send */
+#define OPCODE_LONG_CTS			0xC5	/* confirm to send */
+#define OPCODE_LONG_DATA		0xC6	/* long data packets */
+#define OPCODE_EXPTID			0xC7	/* expected tid data */
+#define OPCODE_EXPTID_COMPLETION	0xC8	/* expected tid completion */
+#define OPCODE_ACK			0xC9	/* explicit ACK packet */
+#define OPCODE_NAK			0xCA	/* explicit NAK packet */
+#define OPCODE_BECN			0xCB	/* congestion control */
+#define OPCODE_ERR_CHK			0xCC	/* query eager receiving */
+#define OPCODE_ERR_CHK_GEN		0xCD	/* query tid receiving */
+#define OPCODE_CONNECT_REQUEST		0xCE	/* connect request */
+#define OPCODE_CONNECT_REPLY		0xCF	/* connect reply */
+#define OPCODE_DISCONNECT_REQUEST	0xD0	/* disconnect request */
+#define OPCODE_DISCONNECT_REPLY		0xD1	/* disconnect reply */
+#define OPCODE_AM_REQUEST_NOREPLY	0xD2	/* AM request w/o reply */
+#define OPCODE_AM_REQUEST		0xD3	/* AM request */
+#define OPCODE_AM_REPLY			0xD4	/* AM reply */
+#define OPCODE_FUTURE_FROM		0xD5	/* reserved for expansion */
+#define OPCODE_FUTURE_TO		0xDF	/* reserved for expansion */
diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h
new file mode 100644
index 0000000..5434b02
--- /dev/null
+++ b/ptl_ips/ips_proto_help.h
@@ -0,0 +1,705 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HELP_H
+#define _IPS_PROTO_HELP_H
+
+#include "ips_recvhdrq.h"
+#include "ips_proto.h"
+#include "ipserror.h"
+#include "psm_mq_internal.h" /* psmi_mq_handle_tiny_envelope */
+#include "ptl_ips.h"
+
+/* hfi_opcode is not the ips-level opcode: this extracts the ips-level
+ * OPCODE_* value from the opcode field of big-endian BTH[0]. */
+PSMI_ALWAYS_INLINE(
+uint8_t
+_get_proto_hfi_opcode(const struct ips_message_header *p_hdr))
+{
+	return ((__be32_to_cpu(p_hdr->bth[0]) >>
+		 HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK);
+}
+
+/*
+ * Decide whether this scb should request an ACK and return the protocol
+ * flag bits to place in the packet header.  Multi-fragment sends (nfrag > 1)
+ * and scbs already marked ACKREQ reset the flow's ack counter; otherwise an
+ * ACK is requested once every ack_interval packets.
+ */
+PSMI_ALWAYS_INLINE(
+uint8_t
+ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow))
+{
+	/*
+	 * Setup ACK request if more than ack_interval packets
+	 * have not been requested an ACK
+	 */
+	if (scb->flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) {
+		flow->ack_counter = 0;
+	} else {
+		flow->ack_counter++;
+		if (flow->ack_counter > flow->ack_interval) {
+			flow->ack_counter = 0;
+			scb->flags |= IPS_SEND_FLAG_ACKREQ;
+		}
+	}
+
+	/* Bottom 6 bits wind up in protocol header fields, other bits
+	 * control other aspects of packet composition */
+	return (uint8_t) (scb->flags & IPS_SEND_FLAG_PROTO_OPTS);
+}
+
+/* Extract the flow id from the flowid field of big-endian BTH[1]. */
+PSMI_ALWAYS_INLINE(
+ips_epaddr_flow_t
+ips_proto_flowid(struct ips_message_header *p_hdr))
+{
+	return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >>
+				     HFI_BTH_FLOWID_SHIFT) &
+				    HFI_BTH_FLOWID_MASK);
+}
+
+/*
+ * Compute a software CRC over the packet header and (optional) payload,
+ * returned through *cksum.  Also rewrites lrh[2] to include the extra
+ * PSM_CRC_SIZE_IN_BYTES in the packet-length field so the wire length
+ * accounts for the appended checksum.  Always returns 0.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr,
+	     void *payload, uint32_t paylen, uint32_t *cksum))
+{
+	uint16_t paywords;
+
+	/* Update the payload words in header */
+	paywords = (sizeof(struct ips_message_header) + paylen +
+		    PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+	/* Need to regenerate KDETH checksum after updating payload length */
+	/* ips_kdeth_cksum(p_hdr); */
+
+	/* Seed CRC with all-ones, then fold in header and payload. */
+	*cksum = 0xffffffff;
+
+	/* Checksum header */
+	*cksum = ips_crc_calculate(sizeof(struct ips_message_header),
+				   (uint8_t *) p_hdr, *cksum);
+
+	/* Checksum payload (if any) */
+	if (paylen) {
+		psmi_assert_always(payload);
+		*cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum);
+	}
+
+	return 0;
+}
+
+/* Get pbc static rate value for flow for a given message length.
+ * Result is clamped to 16 bits; returns 0 for HFI types with no static
+ * rate-control mechanism. */
+PSMI_ALWAYS_INLINE(
+uint16_t
+ips_proto_pbc_static_rate(struct ips_proto *proto, struct ips_flow *flow,
+			  uint32_t msgLen))
+{
+	uint32_t rate = 0;
+
+	/* The PBC rate is based on which HFI type as different media have different
+	 * mechanism for static rate control.
+	 */
+
+	switch (proto->epinfo.ep_hfi_type) {
+	case PSMI_HFI_TYPE_OPA1:
+		{
+			/*
+			 * time_to_send is:
+			 *
+			 *  (packet_length) [bits] / (pkt_egress_rate) [bits/sec]
+			 *  -----------------------------------------------------
+			 *       fabric_clock_period == (1 / 805 * 10^6) [1/sec]
+			 *
+			 * (where pkt_egress_rate is assumed to be 100 Gbit/s.)
+			 */
+			uint32_t time_to_send = (8 * msgLen * 805) / (100000);
+			/* Scale by the path's inter-packet delay and CCA divisor. */
+			rate = (time_to_send >> flow->path->pr_cca_divisor) *
+			    (flow->path->pr_active_ipd);
+
+			/* Clamp to the 16-bit PbcStaticRateControlCnt field. */
+			if (rate > 65535)
+				rate = 65535;
+
+		}
+		break;
+
+	default:
+		rate = 0;
+	}
+
+	return (uint16_t) rate;
+}
+
+/* This is a helper function to convert Per Buffer Control to little-endian,
+ * converting each PBC field in place from CPU byte order. */
+PSMI_ALWAYS_INLINE(
+void ips_proto_pbc_to_le(struct hfi_pbc *pbc))
+{
+	pbc->pbc0 = __cpu_to_le32(pbc->pbc0);
+	pbc->PbcStaticRateControlCnt = __cpu_to_le16(pbc->PbcStaticRateControlCnt);
+	pbc->fill1 = __cpu_to_le16(pbc->fill1);
+}
+
+/* This is only used for SDMA cases; pbc is really a pointer to
+ * struct ips_pbc_header * or the equivalent un-named structure
+ * in ips_scb. Fills in packet length (in dwords, including the PBC
+ * itself), VL/SC routing bits and the static rate (control messages
+ * are never rate limited). Please note pcb will be in little-endian
+ * byte order on return */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow,
+		     uint32_t isCtrlMsg, struct hfi_pbc *pbc, uint32_t hdrlen,
+		     uint32_t paylen))
+{
+	int dw = (sizeof(struct hfi_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT;
+	int sc = proto->sl2sc[flow->path->pr_sl];
+	int vl = proto->sc2vl[sc];
+	uint16_t static_rate = 0;
+
+	if_pf(!isCtrlMsg && flow->path->pr_active_ipd)
+	    static_rate =
+	    ips_proto_pbc_static_rate(proto, flow, hdrlen + paylen);
+
+	pbc->pbc0 = (dw & HFI_PBC_LENGTHDWS_MASK) |
+	    ((vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT) |
+	    (((sc >> HFI_PBC_SC4_SHIFT) &
+	      HFI_PBC_SC4_MASK) << HFI_PBC_DCINFO_SHIFT);
+
+	pbc->PbcStaticRateControlCnt = static_rate & HFI_PBC_STATICRCC_MASK;
+
+	/* Per Buffer Control must be in little-endian */
+	ips_proto_pbc_to_le(pbc);
+
+	return;
+}
+
+/* Extract the destination context (low 8 bits) from big-endian BTH[1]. */
+PSMI_ALWAYS_INLINE(
+uint32_t
+ips_proto_dest_context_from_header(struct ips_proto *proto,
+				   struct ips_message_header *p_hdr))
+{
+	return (__be32_to_cpu(p_hdr->bth[1]) & 0xFF);
+}
+
+/*
+ * Compose the full packet header (LRH, BTH, KDETH and ips fields) for
+ * scb on the given flow.  Expected-TID packets (scb->tidctrl set) carry
+ * the TID send-descriptor index as their flowid and extra KDETH flags;
+ * eager packets use the flow's flowid and piggyback the current receive
+ * PSN in ack_seq_num.  Records flow in scb->flow on return.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+	      struct ips_flow *flow, ips_scb_t *scb, uint8_t flags))
+{
+	/* Packet length in dwords, including the trailing hardware CRC. */
+	uint32_t paywords = (sizeof(struct ips_message_header) +
+			     scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	struct ips_message_header *p_hdr = &scb->ips_lrh;
+
+#if 0
+	/*
+	 * This scb has been used by this connection last time,
+	 * so some of the header fields are already set.
+	 */
+	if (scb->flow == flow) {
+		p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+		p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+					      (scb->
+					       opcode << BTH_OPCODE_SHIFT) |
+					      (extra_bytes <<
+					       BTH_EXTRA_BYTE_SHIFT));
+		p_hdr->bth[2] =
+		    __cpu_to_be32(flow->xmit_seq_num.
+				  psn | (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset |
+						   (scb->
+						    offset_mode <<
+						    HFI_KHDR_OM_SHIFT) | (scb->
+									  tid <<
+									  HFI_KHDR_TID_SHIFT)
+						   | (scb->
+						      tidctrl <<
+						      HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (scb->
+						      flags &
+						      IPS_SEND_FLAG_HDR_SUPPRESS)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		/* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */
+
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn;
+		p_hdr->flags = flags;
+
+		return;
+	}
+#endif
+
+	/* Setup LRH fields */
+	p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+				      ((flow->path->pr_sl & HFI_LRH_SL_MASK) <<
+				       HFI_LRH_SL_SHIFT) |
+				      ((proto->sl2sc[flow->path->pr_sl] &
+					HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT));
+	p_hdr->lrh[1] = flow->path->pr_dlid;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+	p_hdr->lrh[3] = flow->path->pr_slid;
+
+	/* Setup BTH fields */
+	p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+				      (scb->opcode << HFI_BTH_OPCODE_SHIFT));
+	p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num |
+				      (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+	if (scb->tidctrl) {	/* expected receive packet */
+		p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+					      (ipsaddr->
+					       subcontext <<
+					       HFI_BTH_SUBCTXT_SHIFT) |
+					      (scb->tidsendc->
+					       rdescid._desc_idx
+					       << HFI_BTH_FLOWID_SHIFT)
+					      | (proto->epinfo.
+						 ep_baseqp <<
+						 HFI_BTH_QP_SHIFT));
+
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->
+						    tidctrl <<
+						    HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (scb->
+						      flags &
+						      IPS_SEND_FLAG_HDRSUPP) |
+						   (IPS_PROTO_VERSION <<
+						    HFI_KHDR_KVER_SHIFT));
+	} else {		/* eager receive packet */
+		p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+					      (ipsaddr->
+					       subcontext <<
+					       HFI_BTH_SUBCTXT_SHIFT) |
+					      (flow->flowid
+					       << HFI_BTH_FLOWID_SHIFT)
+					      | (proto->epinfo.
+						 ep_baseqp <<
+						 HFI_BTH_QP_SHIFT));
+
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		/* Piggyback an ACK of the last in-order received PSN. */
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn_num;
+	}
+
+	p_hdr->khdr.job_key = __cpu_to_le32(proto->epinfo.ep_jkey);
+	p_hdr->connidx = ipsaddr->connidx_outgoing;
+	p_hdr->flags = flags;
+
+	scb->flow = flow;
+
+	return;
+}
+
+/*
+ * Assumes that the following fields are already set in scb:
+ * payload
+ * payload_size
+ * flags
+ *
+ * Builds the packet header, arms the ACK timeout, marks the scb pending,
+ * and advances the flow's transmit PSN by scb->nfrag (tidflow PSNs use the
+ * seq subfield; go-back-N flows wrap under proto->psn_mask).  scb->seq_num
+ * records the PSN of the scb's last fragment.
+ */
+PSMI_INLINE(
+void
+ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+			   struct ips_flow *flow, ips_scb_t *scb))
+{
+	psmi_assert((scb->payload_size & 3) == 0);
+	ips_proto_hdr(proto, ipsaddr, flow, scb,
+		      ips_flow_gen_ackflags(scb, flow));
+
+	scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+	scb->abs_timeout = TIMEOUT_INFINITE;
+	scb->flags |= IPS_SEND_FLAG_PENDING;
+
+	if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+		flow->xmit_seq_num.psn_seq += scb->nfrag;
+		scb->seq_num = flow->xmit_seq_num;
+		scb->seq_num.psn_seq--;
+	} else {
+		flow->xmit_seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask;
+		scb->seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask;
+	}
+
+	return;
+}
+
+/*
+ * Bump the per-endpoint statistics counter matching the control-message
+ * type being sent.  OPCODE_ACK and unknown types are intentionally not
+ * counted.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype))
+{
+	switch (msgtype) {
+	case OPCODE_ACK:
+		break;
+	case OPCODE_ERR_CHK:
+	case OPCODE_ERR_CHK_GEN:
+		proto->epaddr_stats.err_chk_send++;
+		break;
+	case OPCODE_NAK:
+		proto->epaddr_stats.nak_send++;
+		break;
+	case OPCODE_CONNECT_REQUEST:
+		proto->epaddr_stats.connect_req++;
+		break;
+	case OPCODE_DISCONNECT_REQUEST:
+		proto->epaddr_stats.disconnect_req++;
+		break;
+	default:
+		break;
+	}
+	return;
+}
+
+/*
+ * Exported here solely for inlining is_expected_or_nak and mq_tiny handling
+ */
+extern
+psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow,
+ uint8_t message_type, uint16_t *msg_queue_mask,
+ ips_scb_t *ctrlscb, void *payload, uint32_t paylen);
+
+/*
+ * Queue (or immediately send) an ACK on @flow.  With ACK coalescing
+ * enabled the flow is put on the recvq's pending_acks list and a pending
+ * NAK, if any, is cancelled (an ACK supersedes it); otherwise an OPCODE_ACK
+ * control message carrying the current receive PSN is sent right away.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;	/* ACK clears NAK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_ACK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send ACK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/*
+ * Queue (or immediately send) a NAK on @flow.  Mirror image of
+ * ips_proto_send_ack: with coalescing enabled a pending ACK is cancelled
+ * (the NAK supersedes it) and the flow joins the pending_acks list;
+ * otherwise an OPCODE_NAK control message is sent right away.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;	/* NAK clears ACK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_NAK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send NAK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/* return 1 if packet is next expected in flow
+ * return 0 if packet is not next expected in flow (and nak packet).
+ *
+ * Also implements the congestion-control side effects: a FECN-marked
+ * out-of-order packet schedules a BECN, and with IPS_PROTO_FLAG_CCA a
+ * BECN control message is generated once the out-of-order run exceeds
+ * the flow's ack interval.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev))
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	psmi_seqnum_t sequence_num;
+
+	/* Only go-back-N (PIO/DMA) flows carry sequenced eager traffic. */
+	psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) ||
+		    (flowid == EP_FLOW_GO_BACK_N_DMA)
+	    );
+	flow = &ipsaddr->flows[flowid];
+	/* If packet faced congestion generate BECN in NAK. */
+	if_pf((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) &&
+	      ((flow->cca_ooo_pkts & 0xf) == 0)) {
+		/* Generate a BECN for every 16th OOO packet marked with a FECN. */
+		flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+		flow->cca_ooo_pkts++;
+		rcv_ev->proto->epaddr_stats.congestion_pkts++;
+		rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;	/* Clear FECN event */
+	}
+
+	sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+	if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) {
+		/* In-order packet: advance the expected PSN and clear any
+		 * outstanding NAK state. */
+		flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND;
+
+		flow->recv_seq_num.psn_num =
+		    (flow->recv_seq_num.psn_num + 1) & proto->psn_mask;
+		flow->cca_ooo_pkts = 0;
+
+		/* don't process ack, caller will do it. */
+		return 1;
+
+	}
+
+	/* Signed 16-bit difference distinguishes future (diff > 0) from
+	 * stale/duplicate packets. */
+	int16_t diff = (int16_t) (sequence_num.psn_num -
+				  flow->recv_seq_num.psn_num);
+	if (diff > 0) {
+		if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) {
+			/* Queue/Send NAK to peer */
+			ips_proto_send_nak((struct ips_recvhdrq *)
+					   rcv_ev->recvq, flow);
+			flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+			flow->cca_ooo_pkts = 0;
+		} else if (proto->flags & IPS_PROTO_FLAG_CCA) {
+			flow->cca_ooo_pkts = diff;
+			if (flow->cca_ooo_pkts > flow->ack_interval) {
+				ips_scb_t ctrlscb;
+
+				rcv_ev->proto->epaddr_stats.congestion_pkts++;
+				flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+				_HFI_CCADBG
+				    ("BECN Generation. Expected: %d, Got: %d.\n",
+				     flow->recv_seq_num.psn_num,
+				     sequence_num.psn_num);
+
+				ctrlscb.flags = 0;
+				ctrlscb.ips_lrh.data[0].u32w0 =
+				    flow->cca_ooo_pkts;
+				/* Send Control message to throttle flow. Will clear flow flag and
+				 * reset cca_ooo_pkts.
+				 */
+				ips_proto_send_ctrl_message(flow,
+							    OPCODE_BECN,
+							    &flow->ipsaddr->
+							    ctrl_msg_queued,
+							    &ctrlscb, ctrlscb.cksum, 0);
+			}
+		}
+	}
+
+	/* process ack if packet is not in sequence. */
+	ips_proto_process_ack(rcv_ev);
+
+	return 0;
+}
+
+/*
+ * Note, some code depends on the literal values specified in this enum.
+ * Returned by ips_proto_check_msg_order to classify an incoming message
+ * relative to the flow's expected sequence number.
+ */
+enum ips_msg_order {
+	IPS_MSG_ORDER_PAST = 3,	/* Old message, recv & drop */
+	IPS_MSG_ORDER_EXPECTED_MATCH = 2,	/* Expected message, recv on match */
+	IPS_MSG_ORDER_EXPECTED = 1,	/* Expected message, always recv */
+	IPS_MSG_ORDER_FUTURE_RECV = 0,	/* Future message, buffer in OOO Q */
+	IPS_MSG_ORDER_FUTURE = -1,	/* Future message, leave on RHQ */
+};
+
+/*
+ * Classify an incoming message's sequence number against the flow's
+ * expected one (see enum ips_msg_order).  On the expected match the
+ * receive seqnum is advanced; msg_toggle bits alternate the treatment of
+ * repeated expected/out-of-order sightings.  The first sighting of a
+ * future message reverts the flow's receive PSN so the packet stays on
+ * the receive header queue and is retried.
+ */
+PSMI_ALWAYS_INLINE(
+enum ips_msg_order
+ips_proto_check_msg_order(ips_epaddr_t *ipsaddr,
+			  struct ips_flow *flow,
+			  uint16_t send_seqnum,
+			  uint16_t *recv_seqnum))
+
+{
+	int16_t diff = (int16_t) (*recv_seqnum - send_seqnum);
+
+	if (likely(diff == 0)) {
+		*recv_seqnum += 1;
+
+		ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK;
+		if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK)
+			return IPS_MSG_ORDER_EXPECTED_MATCH;
+
+		return IPS_MSG_ORDER_EXPECTED;
+	} else if (diff > 0) {
+		return IPS_MSG_ORDER_PAST;
+	}
+
+	ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK;
+	if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) {
+		/*
+		 * Second time to see the same ooo message, receive and put
+		 * into OOO queue.
+		 */
+		return IPS_MSG_ORDER_FUTURE_RECV;
+	}
+
+	/* The first time to see an OOO message, leave it there and try
+	 * next time. But we need to revert back the receiving flow PSN. */
+	uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+	flow->recv_seq_num.psn_num =
+	    (flow->recv_seq_num.psn_num - 1) & psn_mask;
+	return IPS_MSG_ORDER_FUTURE;
+}
+
+/*
+ * Dispatch a received packet to its opcode-specific service routine.
+ * The dispatch table is indexed by (opcode - OPCODE_RESERVED); opcodes
+ * outside [OPCODE_RESERVED, OPCODE_FUTURE_FROM) fall back to slot 0.
+ * Optional fault injection can drop packets before dispatch.
+ */
+PSMI_INLINE(
+int
+ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
+{
+	uint32_t index;
+
+	/* NOTE: Fault injection will currently not work with hardware
+	 * suppression. See note below for reason why as we currently
+	 * do not update the hardware tidflow table if FI is dropping
+	 * the packet.
+	 *
+	 * We need to look into the packet before dropping it and
+	 * if it's an expected packet AND we have hardware suppression
+	 * then we need to update the hardware tidflow table and the
+	 * associated tidrecvc state to fake having received a packet
+	 * until some point in the window defined by the loss rate.
+	 * This way the subsequent err chk will be NAKd and we can resync
+	 * the flow with the sender.
+	 *
+	 * Note: For real errors the hardware generates seq/gen errors
+	 * which are handled appropriately by the protocol.
+	 */
+
+	if_pf(PSMI_FAULTINJ_ENABLED()) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1,
+					  IPS_FAULTINJ_RECVLOST);
+		if (psmi_faultinj_is_fault(fi_recv))
+			return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	/* see file ips_proto_header.h for details */
+	index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
+	if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
+		index = 0;
+
+	return ips_packet_service_routine[index]
+	    ((struct ips_recvhdrq_event *)rcv_ev);
+}
+
+/*
+ * Breaks header encapsulation but needed in mq sends so we can pay
+ * "near-equal" attention to putting sends on the wire and servicing the
+ * receive queue.
+ */
+
+/*
+ * If a send failed with PSM2_EP_NO_RESOURCES, poll the PTL once to make
+ * receive progress and report PSM2_OK so the caller retries; any other
+ * error is passed through unchanged.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_recv_progress_if_busy(ptl_t *ptl, psm2_error_t err))
+{
+	if (err == PSM2_EP_NO_RESOURCES) {
+		ptl->ctl->ep_poll(ptl, 0);
+		return PSM2_OK;
+	} else
+		return err;
+}
+
+/* Find next lowest power of a two for a 32 bit number, i.e. the largest
+ * power of two <= v (e.g. 5 -> 4, 8 -> 8).  Note v == 0 yields 1. */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+	/* Bit masks and shift amounts for a binary-search style reduction:
+	 * each set group in v shifts its magnitude into r. */
+	const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 };
+	const unsigned int S[] = { 1, 2, 4, 8, 16 };
+	register unsigned int r = 1;
+	int i;
+
+	for (i = 4; i >= 0; i--) {
+		if (v & b[i]) {
+			v >>= S[i];
+			r <<= S[i];
+		}
+	}
+
+	return r;
+}
+
+/*
+ * Pick a path record of the given type from @pathgrp according to the
+ * protocol's path policy: adaptive round-robin, static keyed on the
+ * destination or source context, or (default) the first/base-LID path.
+ */
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *
+ips_select_path(struct ips_proto *proto, ips_path_type_t path_type,
+		ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp))
+{
+	uint32_t path_idx;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* If dispersive routes are configured then select the routes in round
+		 * robin order. We may want to use congestion information to select the
+		 * least lightly loaded path.
+		 */
+		path_idx = pathgrp->pg_next_path[path_type];
+		if (++pathgrp->pg_next_path[path_type] >=
+		    pathgrp->pg_num_paths[path_type])
+			pathgrp->pg_next_path[path_type] = 0;
+	} else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		path_idx =	/* Key on destination context */
+		    ipsaddr->context % pathgrp->pg_num_paths[path_type];
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		path_idx =	/* Key off src context */
+		    proto->epinfo.ep_context % pathgrp->pg_num_paths[path_type];
+	else			/* Base LID routed - Default in Infinhfi 2.5 (Oct 09). */
+		path_idx = 0;
+
+	return pathgrp->pg_path[path_idx][path_type];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/ptl_ips/ips_proto_internal.h b/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000..fb46d63
--- /dev/null
+++ b/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,96 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_INTERNAL_H
+#define _IPS_PROTO_INTERNAL_H
+
+#include "ips_proto_header.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+/*
+ * Connect protocol.
+ *
+ * On receive, handled by upcalling into the connect interface.
+ * On send, handled by ips_proto by having connect compose the message.
+ */
+psm2_error_t ips_proto_process_connect(struct ips_proto *proto,
+ uint8_t opcode,
+ struct ips_message_header *p_hdr,
+ void *payload, uint32_t paylen);
+int ips_proto_build_connect_message(struct ips_proto *proto,
+ ips_epaddr_t *ptladdr,
+ uint8_t opcode, void *payload);
+
+psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer,
+ uint64_t current);
+
+psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment);
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto);
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto);
+
+int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev);
+
+#endif /* _IPS_PROTO_INTERNAL_H */
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
new file mode 100644
index 0000000..e36492f
--- /dev/null
+++ b/ptl_ips/ips_proto_mq.c
@@ -0,0 +1,1733 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Nonzero when the driver permits SDMA payloads whose size is not a
+ * double-word (4-byte) multiple; consulted by ips_ptl_mq_eager() to decide
+ * whether per-packet padding is required. */
+uint32_t non_dw_mul_sdma = 0;
+
+/* Record the driver's non-DW-multiple SDMA capability (set once at init). */
+void
+ips_proto_mq_set_non_dw_mul_sdma(uint32_t mode)
+{
+ non_dw_mul_sdma = mode;
+}
+
+/*
+ * Slow path of scb allocation: count the egress-unavailable event, then
+ * block (making progress via PSMI_BLOCKUNTIL) until an scb of the requested
+ * kind — tiny or regular — can be allocated.  Kept out-of-line so the
+ * callers' fast paths stay small.
+ */
+PSMI_NEVER_INLINE(ips_scb_t *
+ ips_poll_scb(struct ips_proto *proto,
+ int npkts, int len, uint32_t flags, int istiny))
+{
+ ips_scb_t *scb = NULL;
+ psmi_assert(npkts > 0);
+ psm2_error_t err;
+
+ proto->stats.scb_egr_unavail_cnt++;
+
+ PSMI_BLOCKUNTIL(proto->ep, err,
+ ((scb =
+ (istiny ?
+ ips_scbctrl_alloc_tiny(&proto->scbc_egr) :
+ ips_scbctrl_alloc(&proto->scbc_egr, npkts, len,
+ flags))) != NULL));
+ psmi_assert(scb != NULL);
+ return scb;
+}
+
+/* Allocate a tiny scb, falling back to the blocking ips_poll_scb() slow
+ * path when the pool is momentarily exhausted.  Always returns non-NULL. */
+PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto))
+{
+ ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr);
+ /* common case should branch right through */
+ if_pt(scb != NULL)
+ return scb;
+ else
+ return ips_poll_scb(proto, 1, 0, 0, 1);
+}
+
+/* Allocate a regular scb (possibly with an attached buffer, per 'len' and
+ * 'flags'); blocks in ips_poll_scb() if none is available.  Always returns
+ * non-NULL. */
+PSMI_ALWAYS_INLINE(
+ips_scb_t *
+mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags))
+{
+ psmi_assert(npkts > 0);
+ ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags);
+ if_pt(scb != NULL) {
+ return scb;
+ }
+ else {
+ return ips_poll_scb(proto, npkts, len, flags,
+ 0 /* not tiny scb */);
+ }
+}
+
+/*
+ * scb completion callback for eager sends: advance the request's send
+ * offset by 'nbytes' and, once the whole message is accounted for, mark
+ * the request complete (appending it to the completed queue unless it is
+ * an internal request).  Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+static
+int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes)
+{
+ psm2_mq_req_t req = (psm2_mq_req_t) reqp;
+
+ /* This code path is executed when the send is on a device buffer
+ * and the receive is completed using eager buffers. As there is no
+ * completion notification sent to the sender, this is the only place
+ * where send side chb's can be freed and put back into the mpool.
+ */
+#ifdef PSM_CUDA
+ struct ips_cuda_hostbuf *chb;
+ if (req->cuda_hostbuf_used) {
+ while (!STAILQ_EMPTY(&req->sendreq_prefetch)) {
+ /* If any prefetched buffers weren't used, they
+ must be reclaimed here. */
+ chb = STAILQ_FIRST(&req->sendreq_prefetch);
+ STAILQ_REMOVE_HEAD(&req->sendreq_prefetch,
+ req_next);
+ psmi_mpool_put(chb);
+ }
+ }
+#endif
+
+ req->send_msgoff += nbytes;
+ /*
+ * the reason to use >= is because
+ * we may have DW pad in nbytes.
+ */
+ if (req->send_msgoff >= req->send_msglen) {
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ if(!psmi_is_req_internal(req))
+ mq_qq_append(&req->mq->completed_q, req);
+ }
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* Completion hook for rendezvous transfers: hand the request to the MQ
+ * layer's RTS-complete handling.  Always returns IPS_RECVHDRQ_CONTINUE. */
+static
+int ips_proto_mq_rv_complete(void *reqp)
+{
+ psm2_mq_req_t req = (psm2_mq_req_t) reqp;
+ psmi_mq_handle_rts_complete(req);
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* void-returning adapter so ips_proto_mq_rv_complete() can be used where
+ * the expected-protocol layer wants a void(*)(void *) callback. */
+static
+void ips_proto_mq_rv_complete_exp(void *reqp)
+{
+ ips_proto_mq_rv_complete(reqp);
+ return;
+}
+
+/*
+ * Copy 'nchars' bytes from vsrc to vdest: bulk of the copy is done a
+ * double word at a time via hfi_dwordcpy(), then the 0-3 trailing bytes
+ * are copied individually.  With PSM_CUDA, device memory on either side
+ * is routed through cudaMemcpy instead.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars))
+{
+ unsigned char *dest = vdest;
+ const unsigned char *src = vsrc;
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
+ PSMI_CUDA_CALL(cudaMemcpy,
+ vdest, vsrc, nchars, cudaMemcpyDefault);
+ return;
+ }
+#endif
+
+ if (nchars >> 2)
+ hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+ dest += (nchars >> 2) << 2;
+ src += (nchars >> 2) << 2;
+ switch (nchars & 0x03) {
+ case 3:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 2:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 1:
+ *dest++ = *src++;
+ }
+ return;
+}
+
+#ifdef PSM_CUDA
+/* Host-memory-only variant of ips_shortcpy(): identical dword-then-tail
+ * copy, but skips the CUDA pointer check for callers that already know
+ * both buffers are in host memory. */
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars))
+{
+ unsigned char *dest = vdest;
+ const unsigned char *src = vsrc;
+
+ if (nchars >> 2)
+ hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+ dest += (nchars >> 2) << 2;
+ src += (nchars >> 2) << 2;
+ switch (nchars & 0x03) {
+ case 3:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 2:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 1:
+ *dest++ = *src++;
+ }
+ return;
+}
+#endif
+
+extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope
+ *
+ * Recoverable errors:
+ * PSM2_OK: If PIO, envelope is sent.
+ * If DMA, all queued up packets on flow were flushed.
+ *
+ * Recoverable errors converted to PSM2_OK just before return:
+ * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets.
+ * PSM2_EP_NO_RESOURCES:
+ * If PIO, no pio available or cable currently pulled.
+ * If DMA, can be that no scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma queue).
+ *
+ * Unrecoverable errors (PIO or DMA).
+ * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure,
+ * rxe/txe parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+/* Enqueue 'scb' on 'flow' and flush it (always for PIO; for DMA only when
+ * 'do_flush' is set).  Recoverable errors are folded to PSM2_OK per the
+ * error model described in the comment above. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb *scb, int do_flush))
+{
+ psm2_error_t err = PSM2_OK;
+
+ ips_proto_flow_enqueue(flow, scb);
+
+ if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush)
+ err = flow->flush(flow, NULL);
+
+ /* when flushing, also try to make receive progress if the send
+ * side is backed up */
+ if (do_flush)
+ err = ips_recv_progress_if_busy(proto->ptl, err);
+
+ /* As per the PSM error model (or lack thereof), PSM clients expect to see
+ * only PSM2_OK as a recoverable error */
+ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS)
+ err = PSM2_OK;
+ return err;
+}
+
+/*
+ * We don't use message striping for middle message protocol,
+ * Tests on sandy-bridge two HFIs show lower bandwidth if
+ * message striping is used.
+ */
+/*
+ * Segment an eager-protocol message of 'len' bytes into EAGER packets on
+ * 'flow' and enqueue/flush them.  For DMA flows a chunk may span many
+ * frags (up to the rendezvous window); for PIO each chunk is one frag.
+ * When non-DW-multiple SDMA is not allowed, a trailing non-DW remainder is
+ * handled by shortening one packet and padding it back up (the receiver
+ * discards the pad via padcnt in the header).
+ */
+ustatic
+psm2_error_t
+ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req,
+ struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf,
+ uint32_t len)
+{
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ psm2_error_t err = PSM2_OK;
+ uintptr_t buf = (uintptr_t) ubuf;
+ uint32_t nbytes_left, pktlen, offset, chunk_size;
+ uint16_t msgseq, padding;
+ ips_scb_t *scb;
+ uint32_t is_non_dw_mul_allowed = IPS_NON_DW_MUL_NOT_ALLOWED;
+
+ psmi_assert(len > 0);
+ psmi_assert(req != NULL);
+
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ /* max chunk size is the rv window size */
+ chunk_size = ipsaddr->window_rv;
+ is_non_dw_mul_allowed = non_dw_mul_sdma;
+ } else {
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ chunk_size = flow->frag_size;
+ }
+ /* one message sequence number for all packets of this message */
+ msgseq = ipsaddr->msgctl->mq_send_seqnum++;
+
+ nbytes_left = len;
+ offset = 0;
+ do {
+ if (is_non_dw_mul_allowed) {
+ // no need to care about padding if non-double word multiple message size is allowed.
+ padding = 0;
+ } else {
+ padding = nbytes_left & 0x3;
+ }
+
+ if (padding) {
+ psmi_assert(nbytes_left > flow->frag_size);
+ /* over reading should be OK on sender because
+ * the padding area is within the whole buffer,
+ * receiver will discard the extra bytes via
+ * padcnt in packet header
+ */
+ padding = 4 - padding;
+ pktlen = flow->frag_size - padding;
+ } else {
+ pktlen = min(chunk_size, nbytes_left);
+ psmi_assert(((pktlen & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == is_non_dw_mul_allowed));
+ }
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb != NULL);
+
+ ips_scb_opcode(scb) = OPCODE_EAGER;
+ scb->ips_lrh.khdr.kdeth0 = msgseq;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_hdrdata(scb).u32w0 = offset; /* initial offset */
+
+ _HFI_VDBG
+ ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n",
+ (void *)buf, pktlen, flow->frag_size, nbytes_left);
+ ips_scb_buffer(scb) = (void *)buf;
+
+ buf += pktlen;
+ offset += pktlen;
+ nbytes_left -= pktlen;
+
+ /* pad the wire length back to a DW multiple (padding is 0
+ * when non-DW-multiple sizes are allowed) */
+ pktlen += padding;
+ psmi_assert(((pktlen & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == is_non_dw_mul_allowed));
+
+ scb->frag_size = flow->frag_size;
+ scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size;
+ if (scb->nfrag > 1) {
+ ips_scb_length(scb) = flow->frag_size;
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size =
+ scb->chunk_size_remaining = pktlen;
+ } else
+ ips_scb_length(scb) = pktlen;
+
+ if (nbytes_left == 0) { /* last segment/packet */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+
+ /* Set ACKREQ if single packet per scb. For multi
+ * packets per scb, it is SDMA, driver will set
+ * ACKREQ in last packet, we only need ACK for
+ * last packet.
+ */
+ if (scb->nfrag == 1)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ } else {
+ req->send_msgoff += pktlen;
+ }
+
+ ips_proto_flow_enqueue(flow, scb);
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ /* we need to flush the pio pending queue as quick as possible */
+ err = flow->flush(flow, NULL);
+ }
+
+ } while (nbytes_left);
+
+ /* after all sdma setup, flush sdma queue,
+ * we want one system call to handle as many scbs as possible.
+ */
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ err = flow->flush(flow, NULL);
+ }
+
+ /* before return, try to make some progress. */
+ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) {
+ err =
+ ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES);
+ }
+
+ return err;
+}
+
+/*
+ * Start a rendezvous send: build and send the LONG_RTS envelope for 'req'.
+ * Small DW-multiple host payloads ride inside the RTS itself; otherwise
+ * the data moves later via CTS/LONG_DATA or expected-TID protocol.  With
+ * PSM_CUDA, large device-buffer sends are prefetched into intermediate
+ * host buffers here (bounded by cuda_prefetch_limit).
+ */
+static
+psm2_error_t
+ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
+ ips_epaddr_t *ipsaddr, const void *buf, uint32_t len)
+{
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ psm2_error_t err = PSM2_OK;
+ ips_scb_t *scb;
+
+ PSM2_LOG_MSG("entering");
+ req->buf = (void *)buf;
+ req->buf_len = len;
+ req->send_msglen = len;
+ req->recv_msgoff = 0;
+ req->rts_peer = (psm2_epaddr_t) ipsaddr;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_LONG_RTS;
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ if (req->type & MQE_TYPE_WAITING)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING;
+
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, req->tag.tag);
+ ips_scb_hdrdata(scb).u32w1 = len;
+ /* receiver echoes this index back in the CTS to locate 'req' */
+ ips_scb_hdrdata(scb).u32w0 = psmi_mpool_get_obj_index(req);
+
+ /* piggy-back the payload on the RTS when it fits in one frag, is
+ * host memory, and is a DW multiple */
+ if (len <= flow->frag_size &&
+#ifdef PSM_CUDA
+ !req->is_buf_gpu_mem &&
+#endif
+ !(len & 0x3)) {
+ ips_scb_buffer(scb) = (void *)buf;
+ ips_scb_length(scb) = len;
+ req->send_msgoff = len;
+ } else {
+ ips_scb_length(scb) = 0;
+ req->send_msgoff = 0;
+ }
+
+#ifdef PSM_CUDA
+ /* Used to indicate to the receiver that the send
+ * is issued on a device buffer. This helps the
+ * receiver select TID instead of using eager buffers.
+ */
+ if (req->is_buf_gpu_mem) {
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_GPU_BUF;
+ scb->mq_req = req; /* request comes from GPU domain (device) ... */
+ }
+ req->cuda_hostbuf_used = 0;
+ if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) &&
+ req->is_buf_gpu_mem &&
+ (len > GPUDIRECT_THRESH_RV)) ||
+ ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) &&
+ req->is_buf_gpu_mem &&
+ (len > gpudirect_send_threshold))) {
+ /* send from intermediate host buffer */
+ struct ips_cuda_hostbuf *chb;
+ uint32_t offset, window_len;
+ int prefetch_lookahead = 0;
+
+ STAILQ_INIT(&req->sendreq_prefetch);
+ offset = 0;
+ req->cuda_hostbuf_used = 1;
+ scb->mq_req = NULL; /* ... but it is transferred to host memory,
+ so setting req = NULL lets us take a faster
+ decision on scb's locality while sending
+ (see IS_CUDA_BUF() macro) */
+
+ /* start prefetching */
+ req->prefetch_send_msgoff = 0;
+ while ((offset < len) &&
+ (prefetch_lookahead < proto->cuda_prefetch_limit)) {
+ chb = NULL;
+ window_len =
+ ips_cuda_next_window(ipsaddr->window_rv,
+ offset, len);
+
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+
+ /* any buffers available? */
+ if (chb == NULL)
+ break;
+
+ req->prefetch_send_msgoff += window_len;
+
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ chb->gpu_buf = (void *) buf + offset;
+ chb->bytes_read = 0;
+
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord,
+ chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb,
+ req_next);
+ offset += window_len;
+ prefetch_lookahead++;
+ }
+ }
+#endif
+
+ PSM_LOG_EPM_COND(len > proto->mq->hfi_thresh_rv && proto->protoexp,OPCODE_LONG_RTS,PSM_LOG_EPM_TX,proto->ep->epid, req->rts_peer->epid,
+ "ips_scb_hdrdata(scb).u32w0: %d",ips_scb_hdrdata(scb).u32w0);
+
+ if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE)))
+ goto fail;
+
+ /* Assume that we already put a few rndv requests in flight. This helps
+ * for bibw microbenchmarks and doesn't hurt the 'blocking' case since
+ * we're going to poll anyway */
+ psmi_poll_internal(proto->ep, 1);
+
+fail:
+ _HFI_VDBG
+ ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n",
+ psmi_epaddr_get_name(proto->ep->epid),
+ psmi_epaddr_get_name(req->rts_peer->epid), buf, len,
+ req->tag.tag[0], req->tag.tag[1], req->tag.tag[2], req,
+ psmi_mpool_get_obj_index(req), psm2_error_get_string(err));
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+/*
+ * Non-blocking MQ send.  Picks a protocol by message size:
+ *   tiny  (len <= hfi_thresh_tiny)  — payload copied into the header scb;
+ *   short (len <= one frag)         — single packet from the user buffer;
+ *   eager (len <= hfi_thresh_rv)    — multi-packet via ips_ptl_mq_eager();
+ *   rendezvous (otherwise, SENDSYNC, or CUDA device buffer).
+ * Returns the request in *req_o; completion is reported asynchronously.
+ */
+psm2_error_t
+ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len,
+ void *context, psm2_mq_req_t *req_o)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_proto *proto;
+ struct ips_flow *flow;
+ ips_epaddr_t *ipsaddr;
+ ips_scb_t *scb;
+ psm2_mq_req_t req;
+
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+
+ /* round-robin over the endpoint addresses in this message control
+ * group */
+ ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->context = context;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+ goto do_rendezvous;
+ } else if (len <= mq->hfi_thresh_tiny) {
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = mq_alloc_tiny(proto);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_TINY;
+ /* tiny length is encoded in kdeth0 next to the msg seqnum */
+ scb->ips_lrh.khdr.kdeth0 =
+ ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+ ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *) &ips_scb_hdrdata(scb),
+ (uint32_t *) ubuf, len);
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /* We can mark this op complete since all the data is now copied
+ * into an SCB that remains live until it is remotely acked */
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ _HFI_VDBG
+ ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else if (len <= ipsaddr->flows[proto->msgflowid].frag_size) {
+ /* single-packet short: DW-aligned part goes as payload, the
+ * 0-3 trailing bytes ride in the header word */
+ uint32_t paylen = len & ~0x3;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+
+ ips_scb_opcode(scb) = OPCODE_SHORT;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+ ips_scb_buffer(scb) = (void *)ubuf;
+ ips_scb_length(scb) = paylen;
+ if (len > paylen) {
+ /* there are nonDW bytes, copy to header */
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
+ (uint32_t *)((uintptr_t)ubuf + paylen),
+ len - paylen);
+
+ /* for complete callback */
+ req->send_msgoff = len - paylen;
+ } else {
+ req->send_msgoff = 0;
+ }
+
+ /*
+ * Need ack for send side completion because we
+ * send from user buffer.
+ */
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+
+ flow = &ipsaddr->flows[proto->msgflowid];
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /*
+ * It should be OK to check the buffer address in
+ * 'scb' to be changed, when this scb is done, the
+ * address is set to NULL when scb is put back to
+ * scb pool. Even if the same scb is re-used, it
+ * is not possible to set to this 'buf' address.
+ */
+ if (ips_scb_buffer(scb) == (void *)ubuf) {
+ /* continue to send from user buffer */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+ } else {
+ /* mark the message done */
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ }
+ _HFI_VDBG
+ ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else if (len <= mq->hfi_thresh_rv) {
+ if (len <= proto->iovec_thresh_eager) {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ } else {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ }
+
+ req->send_msgoff = 0;
+ err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+
+ _HFI_VDBG
+ ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else { /* skip eager accounting below */
+do_rendezvous:
+ err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+ *req_o = req;
+ return err;
+ }
+
+ *req_o = req;
+ mq->stats.tx_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/*
+ * Blocking MQ send.  Same size-based protocol selection as
+ * ips_proto_mq_isend() (tiny / short / eager / rendezvous), but waits
+ * for completion before returning: short sends either copy into a bounce
+ * buffer or block until the scb releases the user buffer; eager and
+ * rendezvous paths allocate an internal request and wait on it.
+ */
+psm2_error_t
+ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_proto *proto;
+ struct ips_flow *flow;
+ ips_epaddr_t *ipsaddr;
+ ips_scb_t *scb;
+
+ /* round-robin over the endpoint addresses in this message control
+ * group */
+ ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ gpu_mem = 0;
+#endif
+
+ if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+ goto do_rendezvous;
+ } else if (len <= mq->hfi_thresh_tiny) {
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = mq_alloc_tiny(proto);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_TINY;
+ /* tiny length is encoded in kdeth0 next to the msg seqnum */
+ scb->ips_lrh.khdr.kdeth0 =
+ ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+ ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *) &ips_scb_hdrdata(scb),
+ (uint32_t *) ubuf, len);
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ _HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else if (len <= ipsaddr->flows[proto->msgflowid].frag_size) {
+ /* single-packet short: DW-aligned part goes as payload, the
+ * 0-3 trailing bytes ride in the header word */
+ uint32_t paylen = len & ~0x3;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+
+ ips_scb_opcode(scb) = OPCODE_SHORT;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+ ips_scb_buffer(scb) = (void *)ubuf;
+ ips_scb_length(scb) = paylen;
+ if (len > paylen) {
+ /* there are nonDW bytes, copy to header */
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
+ (uint32_t *)((uintptr_t)ubuf + paylen),
+ len - paylen);
+ }
+
+ /*
+ * Need ack for send side completion because we
+ * send from user buffer.
+ */
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+
+ flow = &ipsaddr->flows[proto->msgflowid];
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /*
+ * It should be OK to check the buffer address in
+ * 'scb' to be changed, when this scb is done, the
+ * address is set to NULL when scb is put back to
+ * scb pool. Even if the same scb is re-used, it
+ * is not possible to set to this 'ubuf' address.
+ */
+ if (ips_scb_buffer(scb) == (void *)ubuf) {
+ if (flow->transfer != PSM_TRANSFER_PIO ||
+ paylen > proto->scb_bufsize ||
+ !ips_scbctrl_bufalloc(scb)) {
+ /* sdma transfer (can't change user buffer),
+ * or, payload is larger than bounce buffer,
+ * or, can't allocate bounce buffer,
+ * send from user buffer till complete */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ ips_scb_buffer(scb) != (void*)ubuf);
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+ err = PSM2_OK;
+ } else {
+ /* copy to bounce buffer */
+#ifdef PSM_CUDA
+ ips_shortcpy_host_mem
+#else
+ ips_shortcpy
+#endif
+ (ips_scb_buffer(scb),
+ (void*)ubuf, paylen);
+ }
+ }
+ _HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else if (len <= mq->hfi_thresh_rv) {
+ psm2_mq_req_t req;
+
+ if (len <= proto->iovec_thresh_eager_blocking) {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ } else {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ }
+
+ /* Block until we can get a req */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ (req =
+ psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ req->type |= MQE_TYPE_WAITING;
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->send_msgoff = 0;
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+ err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+
+ psmi_mq_wait_internal(&req);
+
+ _HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else {
+ psm2_mq_req_t req;
+do_rendezvous:
+ /* Block until we can get a req */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ req->type |= MQE_TYPE_WAITING;
+ req->tag = *tag;
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (gpu_mem) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+ psmi_mq_wait_internal(&req);
+ return err; /* skip accounting, done separately at completion time */
+ }
+
+ mq->stats.tx_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/*
+ * Called when a posted (or unexpected-matched) receive matches an RTS:
+ * either push a CTS so the sender completes the transfer via eager/
+ * LONG_DATA (queuing the CTS for retry if it cannot be sent now), or hand
+ * the request to the expected-TID protocol for large transfers.
+ */
+static
+psm2_error_t
+ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted)
+{
+ psm2_epaddr_t epaddr = req->rts_peer;
+ struct ips_proto *proto = epaddr->proto;
+
+ /* We have a match.
+ * We may already set with first packet,
+ * If we're doing eager-based r-v, just send back the sreq and length and
+ * have the sender complete the send.
+ */
+ PSM2_LOG_MSG("entering");
+#ifdef PSM_CUDA
+ /* Cases where we do not use TIDs:
+ * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes
+ * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv
+ * 3) Recv on gpu buf and len is less than 3 bytes
+ * 4) Expected protocol not initialized.
+ */
+ if ((!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem &&
+ req->recv_msglen <= GPUDIRECT_THRESH_RV)||
+ (!req->is_sendbuf_gpu_mem &&
+ req->recv_msglen <= proto->mq->hfi_thresh_rv))) ||
+ (req->is_buf_gpu_mem && req->recv_msglen <= GPUDIRECT_THRESH_RV) ||
+ proto->protoexp == NULL) { /* no expected tid receive */
+#else
+ if (req->recv_msglen <= proto->mq->hfi_thresh_rv ||/* less rv threshold */
+ proto->protoexp == NULL) { /* no expected tid receive */
+#endif
+ /* there is no order requirement, try to push CTS request
+ * directly, if fails, then queue it for later try. */
+ if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) {
+ struct ips_pend_sends *pends = &proto->pend_sends;
+ struct ips_pend_sreq *sreq =
+ psmi_mpool_get(proto->pend_sends_pool);
+ psmi_assert(sreq != NULL);
+ if (sreq == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_NO_MEMORY;
+ }
+ sreq->type = IPS_PENDSEND_EAGER_REQ;
+ sreq->req = req;
+
+ STAILQ_INSERT_TAIL(&pends->pendq, sreq, next);
+ psmi_timer_request(proto->timerq, &pends->timer,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else {
+ ips_protoexp_tid_get_from_token(proto->protoexp, req->buf,
+ req->recv_msglen, epaddr,
+ req->rts_reqidx_peer,
+ req->
+ type & MQE_TYPE_WAITING_PEER ?
+ IPS_PROTOEXP_TIDGET_PEERWAIT :
+ 0, ips_proto_mq_rv_complete_exp,
+ req);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Build and send a LONG_CTS for a matched rendezvous receive, telling the
+ * sender where (receiver req index) and how much (recv_msglen) to send.
+ * Returns PSM2_OK_NO_PROGRESS when no scb is available so the caller can
+ * queue the CTS for a later retry.
+ */
+psm2_error_t
+ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req)
+{
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+ struct ips_flow *flow;
+ ips_scb_t *scb;
+ ptl_arg_t *args;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0);
+ if (scb == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ args = (ptl_arg_t *) ips_scb_uwords(scb);
+
+ ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ args[0].u32w0 = psmi_mpool_get_obj_index(req);
+ args[1].u32w1 = req->recv_msglen;
+ args[1].u32w0 = req->rts_reqidx_peer;
+
+ PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d",
+ req->rts_reqidx_peer);
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ /* have already received enough bytes */
+ if (req->recv_msgoff == req->recv_msglen) {
+ ips_proto_mq_rv_complete(req);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Sender side of the eager-based rendezvous path: after receiving a CTS,
+ * push the remaining message bytes as LONG_DATA packets.  Any 1-3
+ * unaligned leading bytes are carried in the packet header.  Returns
+ * PSM2_OK_NO_PROGRESS if scb allocation stalls (caller retries later).
+ */
+psm2_error_t
+ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req)
+{
+ psm2_error_t err = PSM2_OK;
+ uintptr_t buf = (uintptr_t) req->buf + req->recv_msgoff;
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+ uint32_t nbytes_left = req->send_msglen - req->recv_msgoff;
+ uint32_t nbytes_sent = 0;
+ uint32_t nbytes_this, chunk_size;
+ uint16_t frag_size, unaligned_bytes;
+ struct ips_flow *flow;
+ ips_scb_t *scb;
+
+ psmi_assert(nbytes_left > 0);
+
+ PSM2_LOG_MSG("entering.");
+ if (
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem ||
+#endif
+ req->send_msglen > proto->iovec_thresh_eager) {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ frag_size = flow->path->pr_mtu;
+ /* max chunk size is the rv window size */
+ chunk_size = ipsaddr->window_rv;
+ } else {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ chunk_size = frag_size = flow->frag_size;
+ }
+
+ do {
+ /*
+ * don't try to call progression routine such as:
+ * ips_recv_progress_if_busy() in this loop,
+ * it will cause recursive call of this function.
+ */
+
+ /*
+ * When tid code path is enabled, we don't allocate scbc_rv
+ * objects. If the message is less than the hfi_thresh_rv,
+ * we normally use eager protocol to do the transfer.
+ * However, if it is sync send, we use the rendezvous
+ * rts/cts/rts-data protocol.
+ * In this case, because scbc_rv is null,
+ * we use scbc_egr instead.
+ */
+
+ scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv
+ : &proto->scbc_egr, 1, 0, 0);
+ if (scb == NULL) {
+ err = PSM2_OK_NO_PROGRESS;
+ break;
+ }
+
+ ips_scb_opcode(scb) = OPCODE_LONG_DATA;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer;
+ scb->ips_lrh.data[1].u32w1 = req->send_msglen;
+
+ /* attached unaligned bytes into packet header */
+ unaligned_bytes = nbytes_left & 0x3;
+ if (unaligned_bytes) {
+ mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata,
+ (uint32_t *)buf, unaligned_bytes);
+
+ /* position to send */
+ buf += unaligned_bytes;
+ req->recv_msgoff += unaligned_bytes;
+ psmi_assert(req->recv_msgoff < 4);
+
+ /* for complete callback */
+ req->send_msgoff += unaligned_bytes;
+
+ nbytes_left -= unaligned_bytes;
+ nbytes_sent += unaligned_bytes;
+ }
+ scb->ips_lrh.data[1].u32w0 = req->recv_msgoff;
+ ips_scb_buffer(scb) = (void *)buf;
+
+ scb->frag_size = frag_size;
+ nbytes_this = min(chunk_size, nbytes_left);
+ if (nbytes_this > 0)
+ scb->nfrag = (nbytes_this + frag_size - 1) / frag_size;
+ else
+ scb->nfrag = 1;
+
+ if (scb->nfrag > 1) {
+ ips_scb_length(scb) = frag_size;
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size =
+ scb->chunk_size_remaining = nbytes_this;
+ } else
+ ips_scb_length(scb) = nbytes_this;
+
+ buf += nbytes_this;
+ req->recv_msgoff += nbytes_this;
+ nbytes_sent += nbytes_this;
+ nbytes_left -= nbytes_this;
+ if (nbytes_left == 0) {
+ /* because of scb callback, use eager complete */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+
+ /* Set ACKREQ if single packet per scb. For multi
+ * packets per scb, it is SDMA, driver will set
+ * ACKREQ in last packet, we only need ACK for
+ * last packet.
+ */
+ if (scb->nfrag == 1)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ } else {
+ req->send_msgoff += nbytes_this;
+ }
+
+ ips_proto_flow_enqueue(flow, scb);
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ /* we need to flush the pio pending queue as quick as possible */
+ flow->flush(flow, NULL);
+ }
+
+ } while (nbytes_left);
+
+ /* for sdma, if some bytes are queued, flush them */
+ if (flow->transfer == PSM_TRANSFER_DMA && nbytes_sent) {
+ flow->flush(flow, NULL);
+ }
+
+ PSM2_LOG_MSG("leaving.");
+
+ return err;
+}
+
+/*
+ * Receive handler for a rendezvous CTS (clear-to-send) packet.
+ *
+ * Two cases, distinguished by the presence of a payload:
+ *  - payload present: it is a tid session list, and the transfer is
+ *    handed to the expected-tid send path via ips_tid_send_handle_tidreq().
+ *  - no payload: the peer asks us to push the data with the eager
+ *    LONG_DATA protocol; we push immediately or queue for the timer.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE normally, or PSM2_EP_NO_RESOURCES when a
+ * tid send descriptor could not be allocated (the CTS is NAKed so the
+ * peer retransmits it later).
+ */
+int
+ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+	psm2_mq_t mq = proto->ep->mq;
+	struct ips_flow *flow;
+	psm2_mq_req_t req;
+	uint32_t paylen;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+	/* locate our local send request from the index echoed by the peer */
+	req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0);
+	psmi_assert(req != NULL);
+
+	/*
+	 * if there is payload, it is expected tid protocol
+	 * with tid session info as the payload.
+	 */
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	if (paylen > 0) {
+		ips_tid_session_list *payload =
+			ips_recvhdrq_event_payload(rcv_ev);
+		psmi_assert(paylen == 0 || payload);
+		PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+			    mq->ep->epid,"p_hdr->data[1].u32w0 %d",
+			    p_hdr->data[1].u32w0);
+		proto->epaddr_stats.tids_grant_recv++;
+
+		psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv);
+		psmi_assert(proto->protoexp != NULL);
+
+		/* ptl_req_ptr will be set to each tidsendc */
+		if (req->ptl_req_ptr == NULL) {
+			req->send_msglen = p_hdr->data[1].u32w1;
+		}
+		psmi_assert(req->send_msglen == p_hdr->data[1].u32w1);
+
+		if (ips_tid_send_handle_tidreq(proto->protoexp,
+			    rcv_ev->ipsaddr, req, p_hdr->data[0],
+			    p_hdr->mdata, payload, paylen) == 0) {
+			proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+		} else {
+			/* No tidsendc available: NAK the CTS so the peer
+			 * retransmits it once resources free up. */
+			flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+			flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */
+			ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+			static unsigned int msg_cnt = 0;
+			if (msg_cnt++ == 0) { /* Report the message only once */
+				_HFI_INFO("PSM2 memory shortage detected. Please consider modifying PSM2_MEMORY setting\n");
+			}
+			return PSM2_EP_NO_RESOURCES;
+		}
+	} else {
+		req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */
+		req->send_msglen = p_hdr->data[1].u32w1;
+
+		if (req->send_msgoff >= req->send_msglen) {
+			/* already sent enough bytes, may truncate so using >= */
+			ips_proto_mq_rv_complete(req);
+		} else if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) {
+			/* there is no order requirement, tried to push RTS data
+			 * directly and not done, so queue it for later try. */
+			struct ips_pend_sreq *sreq =
+				psmi_mpool_get(proto->pend_sends_pool);
+			psmi_assert(sreq != NULL);
+
+			sreq->type = IPS_PENDSEND_EAGER_DATA;
+			sreq->req = req;
+			STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next);
+			/* Make sure it's processed by timer */
+			psmi_timer_request(proto->timerq, &proto->pend_sends.timer,
+					   PSMI_TIMER_PRIO_1);
+		}
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Receive handler for a rendezvous RTS (request-to-send) packet.
+ *
+ * Checks PSN and message ordering, hands the envelope to the MQ layer
+ * for tag matching, and either runs the match callback immediately
+ * (in-order match) or queues the request on the out-of-order queue.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	/* either no payload or whole message */
+	psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n",
+		  (long long)p_hdr->data[0].u64,
+		  p_hdr->data[1].u32w0, p_hdr->data[1].u32w1);
+
+	int rc = psmi_mq_handle_rts(mq,
+				    (psm2_epaddr_t) &ipsaddr->msgctl->
+				    master_epaddr,
+				    (psm2_mq_tag_t *) p_hdr->tag,
+				    p_hdr->data[1].u32w1, payload, paylen,
+				    msgorder, ips_proto_mq_rts_match_callback,
+				    &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back both the flow PSN and the message sequence
+		 * number so the revisited packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	req->rts_peer = (psm2_epaddr_t) ipsaddr;
+	req->rts_reqidx_peer = p_hdr->data[1].u32w0;
+	if (req->send_msglen > mq->hfi_thresh_rv)
+	{
+		PSM_LOG_EPM(OPCODE_LONG_RTS,PSM_LOG_EPM_RX,req->rts_peer->epid,mq->ep->epid,
+			    "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+	}
+	if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING)
+		req->type |= MQE_TYPE_WAITING_PEER;
+
+#ifdef PSM_CUDA
+	if (p_hdr->flags & IPS_SEND_FLAG_GPU_BUF)
+		req->is_sendbuf_gpu_mem = 1;
+	else
+		req->is_sendbuf_gpu_mem = 0;
+#endif
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		if (rc == MQ_RET_MATCH_OK)
+			ips_proto_mq_rts_match_callback(req, 1);
+
+		/* XXX if blocking, break out of progress loop */
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return ret;
+}
+
+/*
+ * Receive handler for a TINY packet: the entire user payload travels
+ * inside the packet header (p_hdr->hdr_data), with its length encoded
+ * in the KDETH tinylen field, so there is no separate payload buffer.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	/* tiny data is carried in the header itself */
+	payload = (void *)&p_hdr->hdr_data;
+	paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+		  HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK;
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_TINY, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag, paylen, 0,
+		payload, paylen, msgorder, OPCODE_TINY, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Receive handler for a SHORT packet: a single-packet message whose
+ * payload follows the header; hdr_data.u32w1 carries the message length
+ * and hdr_data.u32w0 the offset word passed to the envelope handler.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_SHORT, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag,
+		p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+		payload, paylen, msgorder, OPCODE_SHORT, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Receive handler for an EAGER packet (multi-packet eager message).
+ *
+ * Unlike tiny/short, a past or future-recv sequence may legitimately
+ * match an already-started eager receive (continuation packets), so an
+ * eager-queue match is tried first and, if found, only the data is
+ * consumed.  Otherwise the packet is treated as a first packet and goes
+ * through envelope matching.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	if (msgorder == IPS_MSG_ORDER_PAST ||
+	    msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+		req = mq_eager_match(mq, msgctl,
+			__le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK);
+		/*
+		 * It is future message sequence or past message sequence,
+		 * and there is request matching in eager queue, we handle
+		 * the packet data and return. We can't go continue to
+		 * match envelope.
+		 * Past message sequence must always have a matching!!!
+		 * error is caught below.
+		 */
+		if (req) {
+			/* continuation packet: append data to the
+			 * in-progress eager receive */
+			psmi_mq_handle_data(mq, req,
+				p_hdr->data[1].u32w0, payload, paylen);
+
+			if (msgorder == IPS_MSG_ORDER_FUTURE_RECV)
+				ret = IPS_RECVHDRQ_BREAK;
+
+			if ((__be32_to_cpu(p_hdr->bth[2]) &
+			     IPS_SEND_FLAG_ACKREQ) ||
+			    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+				ips_proto_send_ack((struct ips_recvhdrq *)
+						   rcv_ev->recvq, flow);
+
+			ips_proto_process_ack(rcv_ev);
+
+			return ret;
+		}
+
+		psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV);
+		/*
+		 * For future message sequence, since there is no eager
+		 * queue matching yet, this must be the first packet for
+		 * the message sequence. And of course, expected message
+		 * sequence is always the first packet for the sequence.
+		 */
+	}
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_EAGER, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag,
+		p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+		payload, paylen, msgorder, OPCODE_EAGER, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	/* for both outoforder matching and eager matching */
+	req->msg_seqnum =
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+	req->ptl_req_ptr = (void *)msgctl;
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Drain the out-of-order queue for this message-control block: deliver
+ * every queued request whose sequence number matches the current
+ * expected receive sequence number, advancing the sequence number per
+ * delivery, until no match is found or the queue count reaches zero.
+ */
+void
+ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl)
+{
+	for (;;) {
+		psm2_mq_req_t match =
+			mq_ooo_match(&mq->outoforder_q, msgctl,
+				     msgctl->mq_recv_seqnum);
+		if (match == NULL)
+			break;
+
+		msgctl->outoforder_count--;
+		msgctl->mq_recv_seqnum++;
+		psmi_mq_handle_outoforder(mq, match);
+
+		if (!(msgctl->outoforder_count > 0))
+			break;
+	}
+}
+
+/*
+ * Receive handler for a LONG_DATA packet (rendezvous data pushed by
+ * the sender after our CTS).  data[0].u32w0 carries our receive request
+ * index, data[1].u32w0 the byte offset of this packet within the
+ * message, and data[1].u32w1 the total message length.
+ *
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+	struct ips_flow *flow;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	/* locate our local receive request from the echoed index */
+	req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0);
+	psmi_assert(req != NULL);
+	psmi_assert(p_hdr->data[1].u32w1 == req->send_msglen);
+
+	/*
+	 * if a packet has very small offset, it must have unaligned data
+	 * attached in the packet header, and this must be the first packet
+	 * for that message.
+	 */
+	if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) {
+		psmi_assert(p_hdr->data[1].u32w0 == (req->send_msglen&0x3));
+		/* the sender stuffed the 1-3 unaligned leading bytes
+		 * into the header's mdata field */
+		mq_copy_tiny((uint32_t *)req->buf,
+			     (uint32_t *)&p_hdr->mdata,
+			     p_hdr->data[1].u32w0);
+		req->send_msgoff += p_hdr->data[1].u32w0;
+	}
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen);
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h
new file mode 100644
index 0000000..6e5e49a
--- /dev/null
+++ b/ptl_ips/ips_proto_params.h
@@ -0,0 +1,264 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_PARAMS_H
+#define _IPS_PROTO_PARAMS_H
+
+/*
+ * send method: dma, pio;
+ * recv method: tid, egr;
+ *
+ * send-recv mode combinations: 1=on, 0=off
+ * A: dma:1, pio=1, tid=1, egr=1;
+ * B: dma:0, pio=1, tid=1, egr=1;
+ * C: dma:1, pio=0, tid=1, egr=1;
+ * D: dma:1, pio=1, tid=0, egr=1;
+ * E: dma:0, pio=1, tid=0, egr=1;
+ * F: dma:1, pio=0, tid=0, egr=1;
+ *
+ * message packet type:
+ * T: tiny; S: short; E: eager;
+ * LR: long rts; LC: long cts; LD: long data;
+ * ED: expected data; EC: expected completion;
+ * C: ctrl msg;
+ *
+ * send,recv method for each packet type and each send-recv mode
+ * -------------------------------------------------------------------
+ * | | A | B | C | D | E | F |
+ * -------------------------------------------------------------------
+ * | T | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | S | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | E | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | E | dma,egr | pio,egr | dma,egr | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | LR | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LC | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LD | x | x | x | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | LD | x | x | x | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | ED | dma,tid | pio,tid | dma,tid | x | x | x |
+ * -------------------------------------------------------------------
+ * | EC | pio,egr | pio,egr | dma,egr | x | x | x |
+ * -------------------------------------------------------------------
+ * | C | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ */
+
+/* Constants */
+#define BYTE2DWORD_SHIFT 2
+#define LOWER_16_BITS 0xFFFF
+#define PSM_CACHE_LINE_BYTES 64
+#define PSM2_FLOW_CREDITS 64
+#define PSM_CRC_SIZE_IN_BYTES 8
+
+/*
+ * version of protocol header (known to chip also).
+ * This value for OPA is defined in spec.
+ */
+#define IPS_PROTO_VERSION 0x1
+
+/* Send retransmission */
+#define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */
+
+#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 160 /* in millisecs */
+#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 640 /* in millisecs */
+#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2
+#define PSM_TID_TIMEOUT_DEFAULT "160:640:2" /* update from above params */
+
+/* time conversion macros */
+#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us))
+#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms))
+#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec))
+
+/* Per-flow flags */
+#define IPS_FLOW_FLAG_NAK_SEND 0x01
+#define IPS_FLOW_FLAG_PENDING_ACK 0x02
+#define IPS_FLOW_FLAG_PENDING_NAK 0x04
+#define IPS_FLOW_FLAG_GEN_BECN 0x08
+#define IPS_FLOW_FLAG_CONGESTED 0x10
+#define IPS_FLOW_FLAG_SKIP_CTS 0x20
+
+/* tid session expected send flags */
+#define EXP_SEND_FLAG_CLEAR_ALL 0x00
+#define EXP_SEND_FLAG_FREE_TIDS 0x01
+
+#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */
+
+/*
+ * scb flags for wire,
+ * Only the lower 6 bits are wire-protocol options
+ */
+#define IPS_SEND_FLAG_NONE 0x00
+#define IPS_SEND_FLAG_BLOCKING 0x01 /* blocking send */
+#define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */
+#define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate to the receiver when
+ * the send is issued on a device buffer. This helps in
+ * selecting the TID path on the receive side regardless of
+ * the receive buffer's locality. It is used
+ * in a special case where the send is on a device
+ * buffer and the receive is on a host buffer.
+ */
+#define IPS_SEND_FLAG_GPU_BUF 0x08
+#endif
+
+#define IPS_SEND_FLAG_PROTO_OPTS 0x3f /* only 6bits wire flags */
+
+/* scb flags */
+#define IPS_SEND_FLAG_PENDING 0x0100
+#define IPS_SEND_FLAG_PERSISTENT 0x0200
+
+/* 0x10000000, interrupt when done */
+#define IPS_SEND_FLAG_INTR (1<<HFI_KHDR_INTR_SHIFT)
+/* 0x20000000, header suppression */
+#define IPS_SEND_FLAG_HDRSUPP (1<<HFI_KHDR_SH_SHIFT)
+/* 0x80000000, request ack (normal) */
+#define IPS_SEND_FLAG_ACKREQ (1<<HFI_BTH_ACK_SHIFT)
+
+/* proto flags */
+#define IPS_PROTO_FLAG_SDMA 0x01 /* all sdma, no pio */
+#define IPS_PROTO_FLAG_SPIO 0x02 /* all spio, no dma */
+#define IPS_PROTO_FLAG_RCVTHREAD 0x04 /* psm recv thread is on */
+#define IPS_PROTO_FLAG_LOOPBACK 0x08 /* psm loopback over hfi */
+#define IPS_PROTO_FLAG_CKSUM 0x10 /* psm checksum is on */
+
+/* Coalesced ACKs (On by default) */
+#define IPS_PROTO_FLAG_COALESCE_ACKS 0x20
+
+/* Use Path Record query (off by default) */
+#define IPS_PROTO_FLAG_QUERY_PATH_REC 0x40
+
+/* Path selection policies:
+ *
+ * (a) Adaptive - Dynamically determine the least loaded paths using various
+ * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs.
+ *
+ * (b) Static schemes -
+ * (i) static_src - Use path keyed off source context
+ * (ii) static_dest - Use path keyed off destination context
+ * (iii) static_base - Use only the base lid path - default till Oct'09.
+ *
+ * The default is adaptive. If a zero lmc network is used then there exists
+ * just one path between endpoints the (b)(iii) case above.
+ *
+ */
+
+#define IPS_PROTO_FLAG_PPOLICY_ADAPTIVE 0x200
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_SRC 0x400
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_DST 0x800
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_BASE 0x1000
+
+/* All static policies */
+#define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00
+
+/* IBTA CCA Protocol support */
+#define IPS_PROTO_FLAG_CCA 0x2000
+#define IPS_PROTO_FLAG_CCA_PRESCAN 0x4000 /* Enable RAPID CCA prescanning */
+
+#ifdef PSM_CUDA
+/* Use RNDV (TID) for all message sizes */
+#define IPS_PROTO_FLAG_ALWAYS_RNDV 0x10000
+/* Use GPUDirect RDMA for SDMA */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND 0x20000
+/* Use GPUDirect RDMA for TID */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV 0x40000
+#endif
+
+#define IPS_PROTOEXP_FLAG_ENABLED 0x01 /* default */
+#define IPS_PROTOEXP_FLAG_HDR_SUPP 0x02 /* Header suppression enabled */
+#define IPS_PROTOEXP_FLAG_TID_DEBUG 0x04 /* *not* default */
+#define IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE 0x08 /* Interleave RTS handling. */
+#define IPS_PROTOEXP_FLAG_CTS_SERIALIZED 0x10 /* CTS serialized */
+#define IPS_PROTOEXP_FLAGS_DEFAULT IPS_PROTOEXP_FLAG_ENABLED
+
+
+/* We have to get an MTU of at least 2K, or else this breaks some assumptions
+ * in the packets that handle tid descriptors
+ */
+#define IPS_PROTOEXP_MIN_MTU 2048
+
+/* Fault injection, becomes parameters to psmi_faultinj_getspec so
+ * a comma-delimited list of
+ * "spec_name", num, denom
+ * Where num/denom means fault num out of every denom.
+ * The defines set 'denom' and assume that num is set to 1
+ *
+ * These values are all defaults, each is overridable via
+ * PSM2_FI_<spec_name> in the environment (and yes, spec_name is in lowercase
+ * *in the environment* just to minimize it appearing in the wild). The format
+ * there is <num:denom:initial_seed> so the same thing except that one can set
+ * a specific seed to the random number generator.
+ */
+#if 1
+#define IPS_FAULTINJ_DMALOST 20 /* 1 every 20 dma writev get lost */
+#define IPS_FAULTINJ_PIOLOST 100 /* 1 every 100 pio writes get lost */
+#define IPS_FAULTINJ_PIOBUSY 10 /* 1 every 10 pio sends get busy */
+#define IPS_FAULTINJ_RECVLOST 200 /* 1 every 200 pkts dropped at recv */
+#else
+#define IPS_FAULTINJ_DMALOST 500 /* 1 every 500 dma writev get lost */
+#define IPS_FAULTINJ_PIOLOST 3000 /* 1 every 3000 pio writes get lost */
+#define IPS_FAULTINJ_PIOBUSY 100 /* 1 every 100 pio sends get busy */
+#define IPS_FAULTINJ_RECVLOST 500 /* 1 every 500 pkts dropped at recv */
+#endif
+
+#endif /* _IPS_PROTO_PARAMS_H */
diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c
new file mode 100644
index 0000000..c55a57c
--- /dev/null
+++ b/ptl_ips/ips_proto_recv.c
@@ -0,0 +1,1447 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* receive service routine for each packet opcode */
+ips_packet_service_fn_t
+ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED] = {
+ips_proto_process_unknown_opcode, /* 0xC0 */
+ips_proto_mq_handle_tiny, /* OPCODE_TINY */
+ips_proto_mq_handle_short,
+ips_proto_mq_handle_eager,
+ips_proto_mq_handle_rts, /* RTS */
+ips_proto_mq_handle_cts, /* CTS */
+ips_proto_mq_handle_data, /* DATA */
+ips_protoexp_data, /* EXPTID */
+ips_protoexp_recv_tid_completion, /* EXPTID_COMPLETION */
+ips_proto_process_ack,
+ips_proto_process_nak,
+ips_proto_process_becn,
+ips_proto_process_err_chk,
+ips_proto_process_err_chk_gen,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_am,
+ips_proto_am,
+ips_proto_am /* OPCODE_AM_REPLY */
+};
+
+#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30
+static void ips_report_strays(struct ips_proto *proto);
+
+#define INC_TIME_SPEND(timer)
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto)
+{
+ uint32_t interval_secs;
+ union psmi_envvar_val env_stray;
+
+ psmi_getenv("PSM2_STRAY_WARNINTERVAL",
+ "min secs between stray process warnings",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS,
+ &env_stray);
+ interval_secs = env_stray.e_uint;
+ if (interval_secs > 0)
+ proto->stray_warn_interval = sec_2_cycles(interval_secs);
+ else
+ proto->stray_warn_interval = 0;
+
+ return PSM2_OK;
+}
+
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto)
+{
+ ips_report_strays(proto);
+ return PSM2_OK;
+}
+
+#define cycles_to_sec_f(cycles) \
+ (((double)cycles_to_nanosecs(cycles)) / 1000000000.0)
+
+struct ips_stray_epid {
+ psm2_epid_t epid;
+ uint32_t err_check_bad_sent;
+ uint32_t ipv4_addr;
+ uint32_t pid;
+ uint32_t num_messages;
+ uint64_t t_warn_next;
+ uint64_t t_first;
+ uint64_t t_last;
+};
+
+static
+void ips_report_strays(struct ips_proto *proto)
+{
+ struct ips_stray_epid *sepid;
+ struct psmi_eptab_iterator itor;
+ psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK);
+
+#if _HFI_DEBUGGING
+ double t_first = 0;
+ double t_last = 0;
+ double t_runtime = 0;
+ if (_HFI_INFO_ON) {
+ t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init);
+ }
+#endif
+
+ while ((sepid = psmi_epid_itor_next(&itor))) {
+ char ipbuf[INET_ADDRSTRLEN], *ip = NULL;
+ char bufpid[32];
+ uint32_t lid = psm2_epid_nid(sepid->epid);
+#if _HFI_DEBUGGING
+ if (_HFI_INFO_ON) {
+ t_first =
+ cycles_to_sec_f(sepid->t_first - proto->t_init);
+ t_last =
+ cycles_to_sec_f(sepid->t_last - proto->t_init);
+ }
+#endif
+ if (sepid->ipv4_addr)
+ ip = (char *)
+ inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf,
+ sizeof(ipbuf));
+ if (!ip)
+ snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid);
+
+ if (sepid->pid)
+ snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid);
+ else
+ snprintf(bufpid, sizeof(bufpid), "PID unknown");
+
+ if (_HFI_INFO_ON) {
+ _HFI_INFO_ALWAYS
+ ("Process %s on host %s=%s sent %d stray message(s) and "
+ "was told so %d time(s) (first stray message at %.1fs "
+ "(%d%%), last at %.1fs (%d%%) into application run)\n",
+ bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages,
+ sepid->err_check_bad_sent, t_first,
+ (int)(t_first * 100.0 / t_runtime), t_last,
+ (int)(t_last * 100.0 / t_runtime));
+ }
+
+ psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid);
+ psmi_free(sepid);
+ }
+ psmi_epid_itor_fini(&itor);
+ return;
+}
+
+/* New scbs now available. If we have pending sends because we were out of
+ * scbs, put the pendq on the timerq so it can be processed. */
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+ struct ips_proto *proto = (struct ips_proto *)context;
+ struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq);
+ if (sreq != NULL)
+ psmi_timer_request(proto->timerq,
+ &proto->pend_sends.timer, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+psm2_error_t
+ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_pend_sends *pend_sends =
+ (struct ips_pend_sends *)timer->context;
+ struct ips_pendsendq *phead = &pend_sends->pendq;
+ struct ips_proto *proto = (struct ips_proto *)pend_sends->proto;
+ struct ips_pend_sreq *sreq;
+
+ while (!STAILQ_EMPTY(phead)) {
+ sreq = STAILQ_FIRST(phead);
+ switch (sreq->type) {
+ case IPS_PENDSEND_EAGER_REQ:
+ err = ips_proto_mq_push_cts_req(proto, sreq->req);
+ break;
+ case IPS_PENDSEND_EAGER_DATA:
+ err = ips_proto_mq_push_rts_data(proto, sreq->req);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown pendq state %d\n",
+ sreq->type);
+ }
+
+ if (err == PSM2_OK) {
+ STAILQ_REMOVE_HEAD(phead, next);
+ psmi_mpool_put(sreq);
+ } else { /* out of scbs. wait for the next scb_avail callback */
+ /* printf("!!!!! breaking out of pendq progress\n"); */
+ break;
+ }
+ }
+
+ return err;
+}
+
+PSMI_INLINE(
+int
+between(int first_seq, int last_seq, int seq))
+{
+ if (last_seq >= first_seq) {
+ if (seq < first_seq || seq > last_seq) {
+ return 0;
+ }
+ } else {
+ if (seq > last_seq && seq < first_seq) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+PSMI_INLINE(
+int
+pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow,
+ psmi_seqnum_t ack_seq_num))
+{
+ uint32_t last_num;
+ struct ips_scb_unackedq *unackedq = &flow->scb_unacked;
+
+ if (STAILQ_EMPTY(unackedq))
+ return 0;
+
+ /* scb_pend will be moved back when an nak is received, but
+ * the packet may actually be received and acked after the nak,
+ * so we use the tail of unacked queue, which may include packets
+ * not being sent out yet, this is over do, but it should be OK. */
+ last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num;
+
+ return between(flow->xmit_ack_num.psn_num,
+ last_num, ack_seq_num.psn_num);
+}
+
+PSMI_INLINE(
+struct ips_flow *
+get_tidflow(struct ips_proto *proto, ips_epaddr_t *ipsaddr,
+ struct ips_message_header *p_hdr, psmi_seqnum_t ack_seq_num))
+{
+ struct ips_protoexp *protoexp = proto->protoexp;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ struct ips_tid_send_desc *tidsendc;
+ ptl_arg_t desc_tidsendc;
+ struct ips_flow *flow;
+ uint32_t last_seq;
+ struct ips_scb_unackedq *unackedq;
+
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+ desc_id._desc_idx);
+ if (tidsendc == NULL) {
+ _HFI_ERROR
+ ("OPCODE_ACK: Index %d is out of range in tidflow ack\n",
+ desc_id._desc_idx);
+ return NULL;
+ }
+
+ /* Ensure generation matches */
+ psmi_mpool_get_obj_index_gen_count(tidsendc,
+ &desc_tidsendc._desc_idx,
+ &desc_tidsendc._desc_genc);
+ if (desc_tidsendc.u64 != desc_id.u64)
+ return NULL;
+
+ /* Ensure ack is within window */
+ flow = &tidsendc->tidflow;
+ unackedq = &flow->scb_unacked;
+
+ /* No unacked scbs */
+ if (STAILQ_EMPTY(unackedq))
+ return NULL;
+
+ /* Generation for ack should match */
+ if (STAILQ_FIRST(unackedq)->seq_num.psn_gen != ack_seq_num.psn_gen)
+ return NULL;
+
+ /* scb_pend will be moved back when an nak is received, but
+ * the packet may actually be received and acked after the nak,
+ * so we use the tail of unacked queue, which may include packets
+ * not being sent out yet, this is over do, but it should be OK. */
+ last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_seq;
+
+ if (between(flow->xmit_ack_num.psn_seq,
+ last_seq, ack_seq_num.psn_seq) == 0)
+ return NULL;
+
+ return flow;
+}
+
+/* NAK post process for tid flow */
+void ips_tidflow_nak_post_process(struct ips_proto *proto,
+ struct ips_flow *flow)
+{
+ ips_scb_t *scb;
+ uint32_t first_seq, ack_seq;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+ first_seq = __be32_to_cpu(scb->ips_lrh.bth[2]) & HFI_BTH_SEQ_MASK;
+ ack_seq = (flow->xmit_ack_num.psn_seq - 1) & HFI_BTH_SEQ_MASK;
+
+ /* If the ack SEQ falls into a multi-packets scb,
+ * don't re-send the packets already acked. */
+ if (scb->nfrag > 1 &&
+ between(first_seq, scb->seq_num.psn_seq, ack_seq)) {
+ uint32_t om, offset_in_tid, remaining_bytes_in_tid;
+ uint32_t npkt, pktlen, nbytes;
+ uint32_t idx, loop;
+
+ /* how many packets acked in this scb */
+ npkt = ((ack_seq - first_seq) & HFI_BTH_SEQ_MASK) + 1;
+
+ /* Get offset/om from current packet header */
+ offset_in_tid = __le32_to_cpu(scb->ips_lrh.khdr.kdeth0) &
+ HFI_KHDR_OFFSET_MASK;
+ om = (__le32_to_cpu(scb->ips_lrh.khdr.kdeth0) >>
+ HFI_KHDR_OM_SHIFT) & 0x1;
+ if (om)
+ offset_in_tid *= 64;
+ else
+ offset_in_tid *= 4;
+ /* bytes remaining in current tid */
+ remaining_bytes_in_tid =
+ (IPS_TIDINFO_GET_LENGTH(scb->tsess[0]) << 12) -
+ offset_in_tid;
+
+ /* packet length in current header */
+ pktlen = scb->payload_size;
+ psmi_assert(min(remaining_bytes_in_tid,
+ scb->frag_size) >= pktlen);
+ psmi_assert((((__be16_to_cpu(scb->ips_lrh.lrh[2]) &
+ HFI_LRH_PKTLEN_MASK) << BYTE2DWORD_SHIFT) -
+ sizeof(struct ips_message_header) -
+ HFI_CRC_SIZE_IN_BYTES) == pktlen);
+
+ /* Loop to find the position to start */
+ idx = 0;
+ nbytes = 0;
+ loop = npkt;
+ while (loop) {
+ remaining_bytes_in_tid -= pktlen;
+ offset_in_tid += pktlen;
+ nbytes += pktlen;
+ first_seq++;
+ loop--;
+
+ if (remaining_bytes_in_tid == 0) {
+ idx++;
+ remaining_bytes_in_tid =
+ IPS_TIDINFO_GET_LENGTH(scb->
+ tsess[idx]) << 12;
+ offset_in_tid = 0;
+ }
+
+ pktlen = min(remaining_bytes_in_tid, scb->frag_size);
+ }
+ psmi_assert((first_seq & HFI_BTH_SEQ_MASK) ==
+ ((ack_seq + 1) & HFI_BTH_SEQ_MASK));
+
+ /* 0. update scb info */
+ psmi_assert(scb->nfrag_remaining > npkt);
+ scb->nfrag_remaining -= npkt;
+ psmi_assert(scb->chunk_size_remaining > nbytes);
+ scb->chunk_size_remaining -= nbytes;
+ ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes);
+
+ /* 1. if last packet in sequence, set ACK, clear SH */
+ if (scb->nfrag_remaining == 1) {
+ psmi_assert(scb->chunk_size_remaining <=
+ scb->frag_size);
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ scb->flags &= ~IPS_SEND_FLAG_HDRSUPP;
+
+ /* last packet is what remaining */
+ pktlen = scb->chunk_size_remaining;
+ }
+
+ /* 2. set new packet sequence number */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ ((first_seq & HFI_BTH_SEQ_MASK) << HFI_BTH_SEQ_SHIFT) |
+ ((scb->seq_num.psn_gen &
+ HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT) |
+ (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+ /* 3. set new packet offset */
+ scb->ips_lrh.exp_offset += nbytes;
+
+ /* 4. if packet length is changed, set new length */
+ if (scb->payload_size != pktlen) {
+ scb->payload_size = pktlen;
+ scb->ips_lrh.lrh[2] = __cpu_to_be16((
+ (scb->payload_size +
+ sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES) >>
+ BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK);
+ }
+
+ /* 5. set new tidctrl and tidinfo array */
+ scb->tsess = &scb->tsess[idx];
+ scb->tsess_length -= idx * sizeof(uint32_t);
+ scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(scb->tsess[0]);
+
+ /* 6. calculate new offset mode */
+ if (offset_in_tid < 131072) { /* 2^15 * 4 */
+ offset_in_tid /= 4;
+ om = 0;
+ } else {
+ offset_in_tid /= 64;
+ om = 1;
+ }
+
+ /* 7. set new tidinfo */
+ scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32(
+ (offset_in_tid & HFI_KHDR_OFFSET_MASK) |
+ (om << HFI_KHDR_OM_SHIFT) |
+ (IPS_TIDINFO_GET_TID(scb->tsess[0])
+ << HFI_KHDR_TID_SHIFT) |
+ (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) |
+ (scb->flags & IPS_SEND_FLAG_INTR) |
+ (scb->flags & IPS_SEND_FLAG_HDRSUPP) |
+ (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
+ }
+
+ /* Update unacked scb's to use the new generation */
+ while (scb) {
+ /* update with new generation */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ (__be32_to_cpu(scb->ips_lrh.bth[2]) &
+ (~(HFI_BTH_GEN_MASK << HFI_BTH_GEN_SHIFT))) |
+ ((flow->xmit_seq_num.psn_gen &
+ HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT));
+ scb->seq_num.psn_gen = flow->xmit_seq_num.psn_gen;
+ scb = SLIST_NEXT(scb, next);
+ }
+}
+
+/* NAK post process for dma flow */
+void ips_dmaflow_nak_post_process(struct ips_proto *proto,
+ struct ips_flow *flow)
+{
+ ips_scb_t *scb;
+ uint32_t first_num, ack_num;
+ uint16_t padding = 0;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+ first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask;
+ ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask;
+
+
+ /* If the ack PSN falls into a multi-packets scb,
+ * don't re-send the packets already acked. */
+ psmi_assert(scb->nfrag > 1);
+ if (between(first_num, scb->seq_num.psn_num, ack_num)) {
+ uint32_t npkt, pktlen, nbytes;
+
+ /* how many packets acked in this scb */
+ npkt = ((ack_num - first_num) & proto->psn_mask) + 1;
+
+ /* how many bytes already acked in this scb, for eager receive
+ * packets, all payload size is frag_size except the last packet
+ * which is not acked yet */
+ pktlen = scb->frag_size;
+ nbytes = (((ack_num - first_num) &
+ proto->psn_mask) + 1) * pktlen;
+
+ /* 0. update scb info */
+ psmi_assert(scb->nfrag_remaining > npkt);
+ scb->nfrag_remaining -= npkt;
+ psmi_assert(scb->chunk_size_remaining > nbytes);
+ scb->chunk_size_remaining -= nbytes;
+ ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes);
+
+ /* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */
+ if (scb->chunk_size_remaining <= scb->frag_size) {
+ psmi_assert(scb->nfrag_remaining == 1);
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ /* last packet is what remaining */
+ /* check if padding is required*/
+ padding = scb->chunk_size_remaining & 0x3;
+ if_pf(padding) {
+ /* how much to pad with also equals how many bytes we need
+ * to rewind the source buffer offset by to keep it dw aligned */
+ padding = 4 - padding;
+ ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding);
+ scb->chunk_size_remaining += padding;
+ }
+ pktlen = scb->chunk_size_remaining;
+ }
+
+ /* 2. set new packet sequence number */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ ((ack_num + 1) & proto->psn_mask) |
+ (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+ /* 3. set new packet offset adjusted with padding */
+ ips_scb_hdrdata(scb).u32w0 += nbytes - padding;
+
+ /* 4. if packet length is changed, set new length */
+ if (scb->payload_size != pktlen) {
+ scb->payload_size = pktlen;
+ scb->ips_lrh.lrh[2] = __cpu_to_be16((
+ (scb->payload_size +
+ sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES) >>
+ BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK);
+ }
+ }
+}
+
+/* process an incoming ack message. Separate function to allow */
+/* for better optimization by compiler */
+int
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_flow *flow = NULL;
+ struct ips_scb_unackedq *unackedq;
+ struct ips_scb_pendlist *scb_pend;
+ psmi_seqnum_t ack_seq_num, last_seq_num;
+ ips_epaddr_flow_t flowid;
+ ips_scb_t *scb;
+ uint32_t tidctrl;
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+ if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+ ack_seq_num.psn_num =
+ (ack_seq_num.psn_num - 1) & proto->psn_mask;
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+ goto ret;
+ } else {
+ ack_seq_num.psn_seq -= 1;
+ flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num);
+ if (!flow) /* Invalid ack for flow */
+ goto ret;
+ }
+ flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+
+ unackedq = &flow->scb_unacked;
+ scb_pend = &flow->scb_pend;
+
+ if (STAILQ_EMPTY(unackedq))
+ goto ret;
+
+ last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+ INC_TIME_SPEND(TIME_SPEND_USER2);
+
+ /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+ * we can used general psn_num to compare the PSN. */
+ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+ last_seq_num.psn_num, ack_seq_num.psn_num)
+ ) {
+
+ /* take it out of the xmit queue and .. */
+ if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+ SLIST_REMOVE_HEAD(scb_pend, next);
+ }
+
+ STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+ flow->scb_num_unacked--;
+ psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+ flow->credits += scb->nfrag;
+
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ if (scb->callback)
+ (*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+ scb->chunk_size : scb->payload_size);
+
+ if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT))
+ ips_scbctrl_free(scb);
+
+ /* set all index pointer to NULL if all frames have been
+ * acked */
+ if (STAILQ_EMPTY(unackedq)) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+
+ SLIST_FIRST(scb_pend) = NULL;
+ psmi_assert(flow->scb_num_pending == 0);
+ /* Reset congestion window - all packets ACK'd */
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ goto ret;
+ } else if (flow->timer_ack == scb->timer_ack) {
+ /*
+ * Exchange timers with last scb on unackedq.
+ * timer in scb is used by flow, cancelling current
+ * timer and then requesting a new timer takes more
+ * time, instead, we exchange the timer between current
+ * freeing scb and the last scb on unacked queue.
+ */
+ psmi_timer *timer;
+ ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+ timer = scb->timer_ack;
+ scb->timer_ack = last->timer_ack;
+ last->timer_ack = timer;
+ timer = scb->timer_send;
+ scb->timer_send = last->timer_send;
+ last->timer_send = timer;
+
+ scb->timer_ack->context = scb;
+ scb->timer_send->context = scb;
+ last->timer_ack->context = last;
+ last->timer_send->context = last;
+ }
+ }
+
+ psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */
+
+ /* CCA: If flow is congested adjust rate */
+ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) {
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <=
+ proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].
+ ccti_increase);
+ /* Clear congestion event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+ }
+ else {
+ /* Increase congestion window if flow is not congested */
+ if_pf(flow->cwin < proto->flow_credits) {
+ flow->credits +=
+ min(flow->cwin << 1,
+ proto->flow_credits) - flow->cwin;
+ flow->cwin = min(flow->cwin << 1, proto->flow_credits);
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ }
+ }
+
+ /* Reclaimed some credits - attempt to flush flow */
+ if (!SLIST_EMPTY(scb_pend))
+ flow->flush(flow, NULL);
+
+ /*
+ * If the next packet has not even been put on the wire, cancel the
+ * retransmission timer since we're still presumably waiting on free
+ * pio bufs
+ */
+ if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE)
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+ret:
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* process an incoming nack message. Separate function to allow */
+/* for better optimization by compiler */
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_flow *flow = NULL;
+ struct ips_scb_unackedq *unackedq;
+ struct ips_scb_pendlist *scb_pend;
+ psmi_seqnum_t ack_seq_num, last_seq_num;
+ psm_protocol_type_t protocol;
+ ips_epaddr_flow_t flowid;
+ ips_scb_t *scb;
+ uint32_t tidctrl;
+
+ INC_TIME_SPEND(TIME_SPEND_USER3);
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+ if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+ protocol = PSM_PROTOCOL_GO_BACK_N;
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+ goto ret;
+ ack_seq_num.psn_num =
+ (ack_seq_num.psn_num - 1) & proto->psn_mask;
+ flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+ } else {
+ protocol = PSM_PROTOCOL_TIDFLOW;
+ flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num);
+ if (!flow)
+ goto ret; /* Invalid ack for flow */
+ ack_seq_num.psn_seq--;
+
+ psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen);
+ psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen);
+ /* Update xmit_ack_num with both new generation and new
+ * acked sequence; update xmit_seq_num with the new flow
+ * generation, don't change the sequence number. */
+ flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0;
+ flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen;
+ psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen);
+ }
+
+ unackedq = &flow->scb_unacked;
+ scb_pend = &flow->scb_pend;
+
+ if (STAILQ_EMPTY(unackedq))
+ goto ret;
+
+ last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+ proto->epaddr_stats.nak_recv++;
+
+ _HFI_VDBG("got a nack %d on flow %d, "
+ "first is %d, last is %d\n", ack_seq_num.psn_num,
+ flow->flowid,
+ STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num.
+ psn_num, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq,
+ ips_scb,
+ nextq)->
+ seq_num.psn_num);
+
+ /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+ * we can used general psn_num to compare the PSN. */
+ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+ last_seq_num.psn_num, ack_seq_num.psn_num)
+ ) {
+ /* take it out of the xmit queue and .. */
+ if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+ SLIST_REMOVE_HEAD(scb_pend, next);
+ }
+
+ STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+ flow->scb_num_unacked--;
+ psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ if (scb->callback)
+ (*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+ scb->chunk_size : scb->payload_size);
+
+ if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT))
+ ips_scbctrl_free(scb);
+
+ /* set all index pointer to NULL if all frames has been acked */
+ if (STAILQ_EMPTY(unackedq)) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+
+ SLIST_FIRST(scb_pend) = NULL;
+ psmi_assert(flow->scb_num_pending == 0);
+ /* Reset congestion window if all packets acknowledged */
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ goto ret;
+ } else if (flow->timer_ack == scb->timer_ack) {
+ /*
+ * Exchange timers with last scb on unackedq.
+ * timer in scb is used by flow, cancelling current
+ * timer and then requesting a new timer takes more
+ * time, instead, we exchange the timer between current
+ * freeing scb and the last scb on unacked queue.
+ */
+ psmi_timer *timer;
+ ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+ timer = scb->timer_ack;
+ scb->timer_ack = last->timer_ack;
+ last->timer_ack = timer;
+ timer = scb->timer_send;
+ scb->timer_send = last->timer_send;
+ last->timer_send = timer;
+
+ scb->timer_ack->context = scb;
+ scb->timer_send->context = scb;
+ last->timer_ack->context = last;
+ last->timer_send->context = last;
+ }
+ }
+
+ psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */
+
+ if (protocol == PSM_PROTOCOL_TIDFLOW)
+ ips_tidflow_nak_post_process(proto, flow);
+ else if (scb->nfrag > 1)
+ ips_dmaflow_nak_post_process(proto, flow);
+
+ /* Always cancel ACK timer as we are going to restart the flow */
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+ /* What's now pending is all that was unacked */
+ SLIST_FIRST(scb_pend) = scb;
+#ifdef PSM_DEBUG
+ flow->scb_num_pending = flow->scb_num_unacked;
+#endif
+ while (scb && !(scb->flags & IPS_SEND_FLAG_PENDING)) {
+ /* Wait for the previous dma completion */
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ scb->flags |= IPS_SEND_FLAG_PENDING;
+ scb = SLIST_NEXT(scb, next);
+ }
+
+ /* If NAK with congestion bit set - delay re-transmitting and THEN adjust
+ * CCA rate.
+ */
+ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) {
+ uint64_t offset;
+
+ /* Clear congestion event and mark flow as congested */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ flow->flags |= IPS_FLOW_FLAG_CONGESTED;
+
+ /* For congested flow use slow start i.e. reduce congestion window.
+ * For TIDFLOW we cannot reduce congestion window as peer expects
+ * header packets at regular intervals (protoexp->hdr_pkt_interval).
+ */
+ if (flow->protocol != PSM_PROTOCOL_TIDFLOW)
+ flow->credits = flow->cwin = 1;
+ else
+ flow->credits = flow->cwin;
+
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+
+ /* During congestion cancel send timer and delay retransmission by
+ * random interval
+ */
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE)
+ offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1);
+ else
+ offset = 0;
+ struct drand48_data drand48_data;
+ srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data);
+ double rnum;
+ drand48_r(&drand48_data, &rnum);
+ psmi_timer_request(proto->timerq, flow->timer_send,
+ (get_cycles() +
+ (uint64_t) (offset *
+ (rnum + 1.0))));
+ }
+ else {
+ int num_resent = 0;
+
+ /* Reclaim all credits upto congestion window only */
+ flow->credits = flow->cwin;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+
+ /* Flush pending scb's */
+ flow->flush(flow, &num_resent);
+
+ proto->epaddr_stats.send_rexmit += num_resent;
+ }
+
+ret:
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ psmi_seqnum_t seq_num;
+ int16_t seq_off;
+
+ INC_TIME_SPEND(TIME_SPEND_USER4);
+ PSM2_LOG_MSG("entering");
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ recvq->proto->epaddr_stats.err_chk_recv++;
+ /* Ignore FECN bit since this is the control path */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+ seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num);
+
+ if_pf(seq_off <= 0) {
+ _HFI_VDBG("naking for seq=%d, off=%d on flowid %d\n",
+ seq_num.psn_num, seq_off, flowid);
+
+ if (seq_off < -flow->ack_interval)
+ flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ ips_proto_send_nak(recvq, flow);
+ flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+ }
+ else {
+ ips_scb_t ctrlscb;
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+
+ ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+ &ipsaddr->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_protoexp *protoexp = recvq->proto->protoexp;
+ struct ips_tid_recv_desc *tidrecvc;
+ ips_scb_t ctrlscb;
+ psmi_seqnum_t err_seqnum, recvseq;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ ptl_arg_t send_desc_id = p_hdr->data[1];
+ int16_t seq_off;
+ uint8_t ack_type;
+
+ INC_TIME_SPEND(TIME_SPEND_USER4);
+ PSM2_LOG_MSG("entering");
+ recvq->proto->epaddr_stats.err_chk_recv++;
+
+ /* Ignore FECN bit since this is the control path */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ /* Get the flowgenseq for err chk gen */
+ err_seqnum.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ /* Get receive descriptor */
+ psmi_assert(desc_id._desc_idx < HFI_TF_NFLOWS);
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ /* Receive descriptor mismatch in time and space.
+ * Stale err chk gen, drop packet
+ */
+ _HFI_DBG
+ ("ERR_CHK_GEN: gen mismatch Pkt: 0x%x, Current: 0x%x\n",
+ desc_id._desc_genc, tidrecvc->rdescid._desc_genc);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+ /*
+ * We change tidrecvc->tidflow_genseq here only when a new generation
+ * is allocated and programmed into hardware. Otherwise we use local
+ * variable recvseq to create the reply.
+ */
+ recvseq = tidrecvc->tidflow_genseq;
+
+ /* Get the latest seq from hardware tidflow table. But
+ * only do this when context sharing is not used, because
+ * context sharing might drop packet even though hardware
+ * has received it successfully.
+ */
+ if (!tidrecvc->context->tf_ctrl)
+ recvseq.psn_seq = hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+
+ if (err_seqnum.psn_gen != recvseq.psn_gen) {
+ ack_type = OPCODE_NAK;
+ /* NAK without allocating a new generation */
+
+ /* My current generation and last received seq */
+ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val;
+ } else {
+ /* Either lost packets or lost ack, we need to deal
+ * with wrap around of the seq value from 2047 to 0
+ * because seq is only 11 bits */
+ seq_off = (int16_t)(err_seqnum.psn_seq - recvseq.psn_seq);
+ if (seq_off < 0)
+ seq_off += 2048; /* seq is 11 bits */
+
+ if (seq_off < 1024) {
+ ack_type = OPCODE_NAK;
+ /* NAK with allocating a new generation */
+
+ /* set latest seq */
+ tidrecvc->tidflow_genseq.psn_seq = recvseq.psn_seq;
+ /* allocate and set a new generation */
+ ips_protoexp_flow_newgen(tidrecvc);
+ /* get the new generation */
+ recvseq.psn_gen = tidrecvc->tidflow_genseq.psn_gen;
+
+ /* My new generation and last received seq */
+ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val;
+ } else
+ /* ACK with last received seq,
+ * no need to set ips_lrh.data[1].u32w0 */
+ ack_type = OPCODE_ACK;
+ }
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0].u64 = send_desc_id.u64;
+ /* Keep peer generation but use my last received sequence */
+ err_seqnum.psn_seq = recvseq.psn_seq;
+ ctrlscb.ips_lrh.ack_seq_num = err_seqnum.psn_val;
+
+ /* May want to generate a BECN if a lot of swapped generations */
+ if_pf((tidrecvc->tidflow_nswap_gen > 4) &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+ _HFI_CCADBG
+ ("ERR_CHK_GEN: Generating BECN. Number of swapped generations: %d.\n",
+ tidrecvc->tidflow_nswap_gen);
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ /* Update stats for congestion encountered */
+ recvq->proto->epaddr_stats.congestion_pkts++;
+ }
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ ack_type, &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+
+ /* Update stats for expected window */
+ tidrecvc->stats.nErrChkReceived++;
+ if (ack_type == OPCODE_NAK)
+ tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Handle a received BECN (Backward Explicit Congestion Notification).
+ * Looks up the flow the packet belongs to and, while the flow's CCA
+ * congestion-control table index can still be raised without exceeding
+ * proto->ccti_limit, reduces the flow's rate via ips_cca_adjust_rate()
+ * and clears the BECN bit from the event so it is not acted on twice.
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ int flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ /* Only adjust while below the CCA table limit for this SL */
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].ccti_increase);
+ /* Clear congestion event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Log a packet whose opcode is not recognized.  Emits a debug message
+ * and, when __HFI_DBG verbosity is enabled, dumps the full protocol
+ * header for post-mortem analysis.  The packet itself is discarded by
+ * the caller.
+ */
+static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto)
+{
+ _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code);
+
+ if (hfi_debug & __HFI_DBG) {
+ ips_proto_show_header(proto, "received bad opcode");
+ ips_proto_dump_frame(proto, sizeof(struct ips_message_header),
+ "Opcode error protocol header dump");
+ }
+}
+
+/*
+ * Receive-path handler for packets carrying an opcode with no registered
+ * handler: bump the unknown-packet counter and log the offending header
+ * via ips_bad_opcode().  The packet is dropped; processing continues.
+ */
+int
+ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *protocol_header = rcv_ev->p_hdr;
+ struct ips_proto *proto = rcv_ev->proto;
+
+ proto->stats.unknown_packets++;
+ ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header);
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Dispatch a connect/disconnect request or reply to
+ * ips_proto_process_connect().  These control packets always carry a
+ * payload (asserted below).  Any processing error is fatal: it is
+ * reported through psmi_handle_error() with PSMI_EP_NORETURN.
+ */
+int
+ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev)
+{
+ psm2_error_t err = PSM2_OK;
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(payload);
+ err = ips_proto_process_connect(rcv_ev->proto,
+ _get_proto_hfi_opcode(rcv_ev->p_hdr),
+ rcv_ev->p_hdr,
+ payload,
+ paylen);
+ if (err != PSM2_OK)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Process connect/disconnect error: %d, opcode %d\n",
+ err, _get_proto_hfi_opcode(rcv_ev->p_hdr));
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* Handler for packets arriving from a peer with no known epaddr.
+ * Connect/disconnect opcodes are forwarded to the connect machinery;
+ * anything else is treated as crosstalk/stray traffic and reported.
+ */
+/* Return 1 if packet is ok. */
+/* Return 0 if packet should be skipped */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ uint8_t ptype = rcv_ev->ptype;
+ struct ips_proto *proto = rcv_ev->proto;
+ psm2_ep_t ep_err;
+ char *pkt_type;
+ int opcode = (int)_get_proto_hfi_opcode(p_hdr);
+
+ /*
+ * If the protocol is disabled or not yet enabled, no processing happens
+ * We set t_init to 0 when disabling the protocol
+ */
+ if (proto->t_init == 0)
+ return IPS_RECVHDRQ_CONTINUE;
+
+ /* Connect messages don't have to be from a known epaddr */
+ switch (opcode) {
+ case OPCODE_CONNECT_REQUEST:
+ case OPCODE_CONNECT_REPLY:
+ case OPCODE_DISCONNECT_REQUEST:
+ case OPCODE_DISCONNECT_REPLY:
+ ips_proto_connect_disconnect(
+ (struct ips_recvhdrq_event *)rcv_ev);
+ return IPS_RECVHDRQ_CONTINUE;
+ default:
+ break;
+ }
+
+ /* Packet from "unknown" peer. Log the packet and payload if at appropriate
+ * verbose level.
+ */
+ {
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+ ips_proto_dump_err_stats(proto);
+
+ if (hfi_debug & __HFI_PKTDBG) {
+ ips_proto_dump_frame(rcv_ev->p_hdr,
+ HFI_MESSAGE_HDR_SIZE, "header");
+ if (paylen)
+ ips_proto_dump_frame(payload, paylen, "data");
+ }
+ }
+
+ /* Other messages are definitely crosstalk. */
+ /* out-of-context expected messages are always fatal */
+ if (ptype == RCVHQ_RCV_TYPE_EXPECTED) {
+ ep_err = PSMI_EP_NORETURN;
+ pkt_type = "expected";
+ } else if (ptype == RCVHQ_RCV_TYPE_EAGER) {
+ ep_err = PSMI_EP_LOGEVENT;
+ pkt_type = "eager";
+ } else {
+ ep_err = PSMI_EP_NORETURN;
+ pkt_type = "unknown";
+ }
+
+ proto->stats.stray_packets++;
+
+ /* If we have debug mode, print the complete packet every time */
+ if (hfi_debug & __HFI_PKTDBG)
+ ips_proto_show_header(p_hdr, "invalid connidx");
+
+ /* At this point we are out of luck. */
+ psmi_handle_error(ep_err, PSM2_EPID_NETWORK_ERROR,
+ "Received %s message(s) ptype=0x%x opcode=0x%x"
+ " from an unknown process", pkt_type, ptype, opcode);
+
+ return 0; /* Always skip this packet unless the above call was a noreturn
+ * call */
+}
+
+/* get the error string as a number and a string */
+/* Formats "RHFerror <hex>: <decoded string>" into msg (size msglen).
+ * If the numeric prefix does not fit, the decode overwrites msg from
+ * the start instead.
+ */
+static void rhf_errnum_string(char *msg, size_t msglen, long err)
+{
+ int len;
+ char *errmsg;
+
+ len = snprintf(msg, msglen, "RHFerror %lx: ", err);
+ /* NOTE(review): int vs size_t comparison; safe here because len > 0
+ * is checked first, so len promotes to a small positive value. */
+ if (len > 0 && len < msglen) {
+ errmsg = msg + len;
+ msglen -= len;
+ } else
+ errmsg = msg;
+ *errmsg = 0;
+ ips_proto_get_rhf_errstring(err, errmsg, msglen);
+}
+
+/*
+ * Error handling
+ *
+ * Central dispatcher for packets flagged with RHF errors.  Classifies
+ * the error_flags into TID errors, tidflow gen/seq errors and data
+ * (CRC/ECC/length/...) errors, routes each to the matching protoexp
+ * handler, and emits rate-limited diagnostics.  Always returns 0 so
+ * the caller skips the packet.
+ */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ int pkt_verbose_err = hfi_debug & __HFI_PKTDBG;
+ int tiderr = rcv_ev->error_flags & HFI_RHF_TIDERR;
+ int tf_seqerr = rcv_ev->error_flags & HFI_RHF_TFSEQERR;
+ int tf_generr = rcv_ev->error_flags & HFI_RHF_TFGENERR;
+ int data_err = rcv_ev->error_flags &
+ (HFI_RHF_ICRCERR | HFI_RHF_ECCERR | HFI_RHF_LENERR |
+ HFI_RHF_DCERR | HFI_RHF_DCUNCERR | HFI_RHF_KHDRLENERR);
+ char pktmsg[128];
+
+ *pktmsg = 0;
+ /*
+ * Tid errors on eager pkts mean we get a headerq overflow, perfectly
+ * safe. Tid errors on expected or other packets means trouble.
+ */
+ if (tiderr && rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) {
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+
+ /* Payload dropped - Determine flow for this header and see if
+ * we need to generate a NAK.
+ *
+ * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO
+ * CONGESTION AS THE EAGER BUFFER IS FULL.
+ *
+ * Possible eager packet type:
+ *
+ * Ctrl Message - ignore
+ * MQ message - Can get flow and see if we need to NAK.
+ * AM message - Can get flow and see if we need to NAK.
+ */
+
+ proto->stats.hdr_overflow++;
+ /* Header also corrupted - nothing trustworthy to act on */
+ if (data_err)
+ return 0;
+
+ switch (_get_proto_hfi_opcode(p_hdr)) {
+ case OPCODE_TINY:
+ case OPCODE_SHORT:
+ case OPCODE_EAGER:
+ case OPCODE_LONG_RTS:
+ case OPCODE_LONG_CTS:
+ case OPCODE_LONG_DATA:
+ case OPCODE_AM_REQUEST:
+ case OPCODE_AM_REQUEST_NOREPLY:
+ case OPCODE_AM_REPLY:
+ {
+ ips_epaddr_flow_t flowid =
+ ips_proto_flowid(p_hdr);
+ struct ips_epstate_entry *epstaddr;
+ struct ips_flow *flow;
+ psmi_seqnum_t sequence_num;
+ int16_t diff;
+
+ /* Obtain ipsaddr for packet */
+ epstaddr =
+ ips_epstate_lookup(rcv_ev->recvq->epstate,
+ rcv_ev->p_hdr->connidx);
+ if_pf(epstaddr == NULL
+ || epstaddr->ipsaddr == NULL)
+ return 0; /* Unknown packet - drop */
+
+ rcv_ev->ipsaddr = epstaddr->ipsaddr;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &rcv_ev->ipsaddr->flows[flowid];
+ sequence_num.psn_val =
+ __be32_to_cpu(p_hdr->bth[2]);
+ /* int16_t cast handles PSN wraparound */
+ diff =
+ (int16_t) (sequence_num.psn_num -
+ flow->recv_seq_num.psn_num);
+
+ /* In-order or ahead, and no NAK already queued */
+ if (diff >= 0
+ && !(flow->
+ flags & IPS_FLOW_FLAG_NAK_SEND)) {
+ /* Mark flow as congested and attempt to generate NAK */
+ flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+ proto->epaddr_stats.congestion_pkts++;
+
+ flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+ flow->cca_ooo_pkts = 0;
+ ips_proto_send_nak((struct ips_recvhdrq
+ *)rcv_ev->recvq,
+ flow);
+ }
+
+ /* Safe to process ACKs from header */
+ ips_proto_process_ack(rcv_ev);
+ }
+ break;
+ case OPCODE_EXPTID:
+ /* If RSM is matching packets that are TID&FECN&SH,
+ * it is possible to have a EXPTID packet encounter
+ * the eager full condition and have the payload
+ * dropped (but the header delivered).
+ * Treat this condition as a data error (corruption,etc)
+ * and send a NAK.
+ */
+ ips_protoexp_handle_data_err(rcv_ev);
+ break;
+ default:
+ break;
+ }
+ } else if (tf_generr) /* handle generr, ignore tiderr if any */
+ ips_protoexp_handle_tf_generr(rcv_ev);
+ else if (tf_seqerr)
+ ips_protoexp_handle_tf_seqerr(rcv_ev);
+ else if (tiderr) { /* tid error, but not on an eager pkt */
+ psm2_ep_t ep_err = PSMI_EP_LOGEVENT;
+ uint16_t tid, offset;
+ uint64_t t_now = get_cycles();
+
+ proto->tiderr_cnt++;
+
+ /* Whether and how we will be logging this event:
+ * fatal past tiderr_max, rate-limited warning otherwise,
+ * silent (ep_err == NULL) inside the warn interval. */
+ if (proto->tiderr_max > 0
+ && proto->tiderr_cnt >= proto->tiderr_max)
+ ep_err = PSMI_EP_NORETURN;
+ else if (proto->tiderr_warn_interval != UINT64_MAX &&
+ proto->tiderr_tnext <= t_now)
+ proto->tiderr_tnext =
+ get_cycles() + proto->tiderr_warn_interval;
+ else
+ ep_err = NULL;
+
+ if (ep_err != NULL) {
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+
+ tid = (__le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK;
+ offset = __le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) &
+ HFI_KHDR_OFFSET_MASK;
+
+ psmi_handle_error(ep_err, PSM2_EP_DEVICE_FAILURE,
+ "%s with tid=%d,offset=%d,count=%d: %s %s",
+ "TID Error",
+ tid, offset, proto->tiderr_cnt,
+ pktmsg, ep_err == PSMI_EP_NORETURN ?
+ "(Terminating...)" : "");
+ }
+
+ ips_protoexp_handle_tiderr(rcv_ev);
+ } else if (data_err) {
+#if _HFI_DEBUGGING
+ if (_HFI_DBG_ON) {
+ uint8_t op_code
+ = _get_proto_hfi_opcode(rcv_ev->p_hdr);
+
+ if (!pkt_verbose_err) {
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+ _HFI_DBG_ALWAYS
+ ("Error %s pkt type opcode 0x%x at hd=0x%x %s\n",
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER)
+ ? "eager" : (rcv_ev-> ptype ==
+ RCVHQ_RCV_TYPE_EXPECTED)
+ ? "expected" : (rcv_ev->ptype ==
+ RCVHQ_RCV_TYPE_NON_KD) ? "non-kd" :
+ "<error>", op_code,
+ rcv_ev->recvq->state->hdrq_head, pktmsg);
+ }
+ }
+#endif
+
+ if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED)
+ ips_protoexp_handle_data_err(rcv_ev);
+ } else { /* not a tid or data error -- some other error */
+#if _HFI_DEBUGGING
+ if (_HFI_DBG_ON) {
+ uint8_t op_code =
+ __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF;
+
+ if (!pkt_verbose_err)
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+
+ /* else RHFerr decode printed below */
+ _HFI_DBG_ALWAYS
+ ("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n",
+ rcv_ev->ptype, op_code,
+ rcv_ev->recvq->state->hdrq_head, pktmsg);
+ }
+#endif
+ }
+ if (pkt_verbose_err) {
+ if (!*pktmsg)
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+ ips_proto_show_header(rcv_ev->p_hdr, pktmsg);
+ }
+
+ return 0;
+}
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
new file mode 100644
index 0000000..4b2617f
--- /dev/null
+++ b/ptl_ips/ips_recvhdrq.c
@@ -0,0 +1,869 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+
+/*
+ * Receive header queue initialization.
+ *
+ * Deep-copies the hdrq/egrq parameters into 'recvq', allocates the
+ * eager-buffer index table, configures RHF tail/sequence tracking
+ * according to HFI1_CAP_DMA_RTAIL, resets all progress state and reads
+ * the PSM2_HEAD_UPDATE knob.  Returns PSM2_OK on success or the error
+ * from the eager-table allocation failure.
+ */
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+ const struct ips_epstate *epstate,
+ const struct ips_proto *proto,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ const struct ips_recvhdrq_callbacks *callbacks,
+ uint32_t runtime_flags,
+ uint32_t subcontext,
+ struct ips_recvhdrq *recvq,
+ struct ips_recvhdrq_state *recvq_state)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ psm2_error_t err = PSM2_OK;
+
+ memset(recvq, 0, sizeof(*recvq));
+ recvq->proto = (struct ips_proto *)proto;
+ recvq->state = recvq_state;
+ recvq->context = context;
+ recvq->subcontext = subcontext;
+ /* These runtime flags may be different from the context's runtime flags since
+ * a receive queue may be initialised to represent a "software" receive
+ * queue (shared contexts) or a hardware receive queue */
+ recvq->runtime_flags = runtime_flags;
+ recvq->hdrq = *hdrq_params; /* deep copy */
+ pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED);
+ /* dword offset of the RHF within each header queue entry */
+ recvq->hdrq_rhf_off =
+ (ctxt_info->rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT;
+
+ if (recvq->runtime_flags & HFI1_CAP_DMA_RTAIL) {
+ recvq->hdrq_rhf_notail = 0;
+ recvq->state->hdrq_rhf_seq = 0; /* _seq is ignored */
+ } else {
+ recvq->hdrq_rhf_notail = 1;
+ recvq->state->hdrq_rhf_seq = 1;
+ }
+ recvq->hdrq_elemlast = ((recvq->hdrq.elemcnt - 1) * recvq->hdrq.elemsz);
+
+ recvq->egrq = *egrq_params; /* deep copy */
+ recvq->egrq_buftable =
+ ips_recvq_egrbuf_table_alloc(context->ep, recvq->egrq.base_addr,
+ recvq->egrq.elemcnt,
+ recvq->egrq.elemsz);
+ if (recvq->egrq_buftable == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for eager buffer index table");
+ goto fail;
+ }
+
+ recvq->epstate = epstate;
+ recvq->recvq_callbacks = *callbacks; /* deep copy */
+ SLIST_INIT(&recvq->pending_acks);
+
+ recvq->state->hdrq_head = 0;
+ recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ recvq->state->num_hdrq_done = 0;
+ recvq->state->num_egrq_done = 0;
+ recvq->state->hdr_countdown = 0;
+ recvq->state->hdrq_cachedlastscan = 0;
+
+ {
+ union psmi_envvar_val env_hdr_update;
+ psmi_getenv("PSM2_HEAD_UPDATE",
+ "header queue update interval (0 to update after all entries are processed). Default is 64",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val) 64, &env_hdr_update);
+
+ /* Cap max header update interval to size of header/eager queue */
+ recvq->state->head_update_interval =
+ min(env_hdr_update.e_uint, recvq->hdrq.elemcnt - 1);
+ recvq->state->egrq_update_interval = 1;
+ }
+
+ /* NOTE: the success path also falls through this label (err == PSM2_OK) */
+fail:
+ return err;
+}
+
+/* Tear down a receive header queue: releases the eager buffer index
+ * table allocated by ips_recvhdrq_init().  Always returns PSM2_OK. */
+psm2_error_t ips_recvhdrq_fini(struct ips_recvhdrq *recvq)
+{
+ ips_recvq_egrbuf_table_free(recvq->egrq_buftable);
+ return PSM2_OK;
+}
+
+/* flush the eager buffers, by setting the eager index head to eager index tail
+ if eager buffer queue is full.
+
+ Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR
+ was set in RHF errors), and no good eager packets were received, so
+ that eager head wasn't advanced.
+*/
+/* NOTE: compiled out (#if 0); kept for reference.  As written it only
+ * logs and counts the overflow - it does not actually advance head. */
+#if 0
+static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq)
+{
+ const uint32_t tail = ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t head = ips_recvq_head_get(&recvq->egrq);
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+
+ if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) {
+ _HFI_DBG("eager array full after overflow, flushing "
+ "(head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ recvq->proto->stats.egr_overflow++;
+ }
+ return;
+}
+#endif
+
+/*
+ * Helpers for ips_recvhdrq_progress.
+ */
+
+/* Extract the destination subcontext field from BTH[1]. */
+static __inline__ int
+_get_proto_subcontext(const struct ips_message_header *p_hdr)
+{
+ return ((__be32_to_cpu(p_hdr->bth[1]) >>
+ HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK);
+}
+
+/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_fecn_set(const struct ips_message_header *p_hdr)
+{
+ return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
+}
+
+/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_becn_set(const struct ips_message_header *p_hdr)
+{
+ return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
+}
+
+/* Locate the protocol header inside a header queue entry using the
+ * header offset encoded in the RHF (used when RHF precedes the header). */
+static __inline__ struct ips_message_header *_get_proto_hdr_from_rhf(const
+ uint32_t *
+ rcv_hdr,
+ const
+ __le32 *
+ rhf)
+{
+ return (struct ips_message_header *)(rcv_hdr +
+ hfi_hdrget_hdrq_offset(rhf));
+}
+
+/* Locate the protocol header at its fixed offset (2 dwords) within a
+ * header queue entry, for the layout where hdrq_rhf_off is zero. */
+static __inline__ struct ips_message_header *_get_proto_hdr(const uint32_t *
+ rcv_hdr)
+{
+ return (struct ips_message_header *)&rcv_hdr[2];
+}
+
+/* Read the RHF sequence number from the queue entry's RHF dwords. */
+static __inline__ uint32_t
+_get_rhf_seq(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr)
+{
+ return hfi_hdrget_seq((const __le32 *)rcv_hdr + recvq->hdrq_rhf_off);
+}
+
+/* Read the packet length in bytes from the queue entry's RHF dwords. */
+static __inline__ uint32_t
+_get_rhf_len_in_bytes(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr)
+{
+ return hfi_hdrget_length_in_bytes((const __le32 *)rcv_hdr +
+ recvq->hdrq_rhf_off);
+}
+
+/* Diagnostic dump of an invalid packet: header always shown under
+ * PSM_DEBUG builds, header+payload hex dump under __HFI_PKTDBG. */
+static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
+{
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+#ifdef PSM_DEBUG
+ ips_proto_show_header((struct ips_message_header *)
+ rcv_ev->rcv_hdr, "received invalid pkt");
+#endif
+ if (hfi_debug & __HFI_PKTDBG) {
+ ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE,
+ "header");
+ if (paylen)
+ ips_proto_dump_frame(payload, paylen, "data");
+ }
+
+}
+
+/* Increment one protocol error counter per error bit set in 'err'
+ * (RHF error flags; several bits may be set at once). */
+static __inline__ void
+_update_error_stats(struct ips_proto *proto, uint32_t err)
+{
+ if (err & HFI_RHF_ICRCERR)
+ proto->error_stats.num_icrc_err++;
+ if (err & HFI_RHF_ECCERR)
+ proto->error_stats.num_ecc_err++;
+ if (err & HFI_RHF_LENERR)
+ proto->error_stats.num_len_err++;
+ if (err & HFI_RHF_TIDERR)
+ proto->error_stats.num_tid_err++;
+ if (err & HFI_RHF_DCERR)
+ proto->error_stats.num_dc_err++;
+ if (err & HFI_RHF_DCUNCERR)
+ proto->error_stats.num_dcunc_err++;
+ if (err & HFI_RHF_KHDRLENERR)
+ proto->error_stats.num_khdrlen_err++;
+}
+
+#ifdef PSM_DEBUG
+/* Debug-build sanity checks on a received header queue entry.
+ * Verifies RHF sequence number range, destination context, RHF vs LRH
+ * packet length agreement, and that the DLID falls within our LID+LMC
+ * range.  Returns 0 if the headers look sane, -1 to drop the packet
+ * (some failures terminate the process via PSMI_EP_NORETURN).
+ */
+static int _check_headers(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_proto *proto = rcv_ev->proto;
+ uint32_t *lrh = (uint32_t *) rcv_ev->p_hdr;
+ const uint32_t *rcv_hdr = rcv_ev->rcv_hdr;
+ uint32_t dest_context;
+ const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]);
+ const uint16_t base_dlid =
+ __be16_to_cpu(recvq->proto->epinfo.ep_base_lid);
+
+ /* Check that the receive header queue entry has a sane sequence number */
+ if (_get_rhf_seq(recvq, rcv_hdr) > LAST_RHF_SEQNO) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n",
+ _get_rhf_seq(recvq, rcv_hdr),
+ recvq->state->hdrq_rhf_seq, lrh[0], lrh[1]);
+ return -1;
+ }
+
+ /* Verify that the packet was destined for our context */
+ dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr);
+ if_pf(dest_context != recvq->proto->epinfo.ep_context) {
+
+ struct ips_recvhdrq_state *state = recvq->state;
+
+ /* Packet not targeted at us. Drop packet and continue */
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. Exiting.\n",
+ dest_context, recvq->proto->epinfo.ep_context,
+ state->hdrq_head);
+
+ return -1;
+ }
+
+ /* Verify that rhf packet length matches the length in LRH */
+ if_pf(_get_rhf_len_in_bytes(recvq, rcv_hdr) !=
+ (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << BYTE2DWORD_SHIFT)) {
+ _HFI_EPDBG
+ ("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n",
+ _get_rhf_len_in_bytes(recvq, rcv_hdr) >> 2,
+ __be16_to_cpu(rcv_ev->p_hdr->lrh[2]));
+
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+ return -1;
+ }
+
+ /* Verify that the DLID matches our local LID. */
+ if_pf(!((base_dlid <= pkt_dlid) &&
+ (pkt_dlid <=
+ (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) {
+ _HFI_EPDBG
+ ("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n",
+ rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid);
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
+/* Validate the software checksum appended to a packet when
+ * IPS_PROTO_FLAG_CKSUM is enabled.  The sender appends two identical
+ * 32-bit CRC words after the payload; both must match the CRC computed
+ * over header + padded payload.  Returns 1 if the checksum is good,
+ * 0 on mismatch (after logging a detailed diagnostic and dumping the
+ * packet). */
+static __inline__ int do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev)
+{
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+ uint32_t *ckptr;
+ uint32_t recv_cksum, cksum, dest_subcontext;
+
+ /* With checksum every packet has a payload */
+ psmi_assert_always(payload);
+
+ ckptr = (uint32_t *) (payload + paylen);
+ recv_cksum = ckptr[0];
+
+ /* Calculate checksum hdr + payload (includes any padding words) */
+ cksum = 0xffffffff;
+ cksum = ips_crc_calculate(HFI_MESSAGE_HDR_SIZE,
+ (uint8_t *) rcv_ev->p_hdr, cksum);
+ if (paylen)
+ cksum = ips_crc_calculate(paylen, (uint8_t *) payload, cksum);
+
+ /* ckptr[0] != ckptr[1] means the trailer itself was corrupted */
+ if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) {
+ struct ips_epstate_entry *epstaddr;
+ uint32_t lcontext;
+ uint32_t hd, tl;
+
+ epstaddr =
+ ips_epstate_lookup(rcv_ev->recvq->epstate,
+ rcv_ev->p_hdr->connidx);
+ epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL;
+
+ lcontext = epstaddr ? rcv_ev->proto->epinfo.ep_context : -1;
+
+ hd = rcv_ev->recvq->context->ctrl->__hfi_rcvhdrhead[0];
+ tl = rcv_ev->recvq->context->ctrl->__hfi_rcvhdrhead[-2];
+
+ dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr);
+
+ _HFI_ERROR
+ ("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%x,%x, rhfseq 0x%x\n",
+ (dest_subcontext !=
+ rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext,
+ epstaddr ? __be16_to_cpu(epstaddr->ipsaddr->pathgrp->
+ pg_base_lid) : -1, cksum,
+ ckptr[0], ckptr[1], _get_proto_hfi_opcode(rcv_ev->p_hdr),
+ rcv_ev->error_flags, hd, tl, rcv_ev->rhf[0],
+ rcv_ev->rhf[1],
+ _get_rhf_seq((struct ips_recvhdrq *)rcv_ev->recvq,
+ rcv_ev->rcv_hdr));
+
+ /* Dump packet */
+ _dump_invalid_pkt(rcv_ev);
+ return 0; /* Packet checksum error */
+ }
+
+ return 1;
+}
+
+/* Drain the receive queue's pending-ack list: for each queued flow,
+ * send the deferred ACK or NAK control message carrying the flow's
+ * current receive sequence number, and clear the corresponding
+ * PENDING_ACK/PENDING_NAK flag.  A flow is expected to have exactly
+ * one of the two flags set (asserted below). */
+PSMI_ALWAYS_INLINE(
+void
+process_pending_acks(struct ips_recvhdrq *recvq))
+{
+ ips_scb_t ctrlscb;
+
+ /* If any pending acks, dispatch them now */
+ while (!SLIST_EMPTY(&recvq->pending_acks)) {
+ struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks);
+
+ SLIST_REMOVE_HEAD(&recvq->pending_acks, next);
+ SLIST_NEXT(flow, next) = NULL;
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+
+ /* NOTE(review): ctrlscb.cksum is passed below without being
+ * initialized here - presumably ignored for ACK/NAK control
+ * messages; confirm against ips_proto_send_ctrl_message(). */
+ if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+ psmi_assert_always((flow->
+ flags & IPS_FLOW_FLAG_PENDING_NAK)
+ == 0);
+
+ flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;
+ ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ } else {
+ psmi_assert_always(flow->
+ flags & IPS_FLOW_FLAG_PENDING_NAK);
+
+ flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;
+ ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+ }
+}
+
+/*
+ * Core receive progress function
+ *
+ * recvhdrq_progress is the core function that services the receive header
+ * queue and optionally, the eager queue. At the lowest level, it identifies
+ * packets marked with errors by the chip and also detects and corrects when
+ * eager overflow conditions occur. At the highest level, it queries the
+ * 'epstate' interface to classify packets from "known" and "unknown"
+ * endpoints. In order to support shared contexts, it can also handle packets
+ * destined for other contexts (or "subcontexts").
+ */
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
+{
+ struct ips_recvhdrq_state *state = recvq->state;
+ const __le32 *rhf;
+ PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto =
+ recvq->proto,
+ .recvq = recvq
+ };
+ struct ips_epstate_entry *epstaddr;
+
+ uint32_t num_hdrq_done = 0;
+ const int num_hdrq_todo = recvq->hdrq.elemcnt;
+ const uint32_t hdrq_elemsz = recvq->hdrq.elemsz;
+ uint32_t dest_subcontext;
+
+ int ret = IPS_RECVHDRQ_CONTINUE;
+ int done = 0;
+ int do_hdr_update = 0;
+
+ /* Chip features */
+ const int has_rtail = recvq->runtime_flags & HFI1_CAP_DMA_RTAIL;
+
+ /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */
+#define next_hdrq_is_ready() \
+ (has_rtail ? \
+ state->hdrq_head != ips_recvq_tail_get(&recvq->hdrq) : \
+ recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, rcv_hdr))
+
+ const uint32_t *rcv_hdr =
+ (const uint32_t *)recvq->hdrq.base_addr + state->hdrq_head;
+ uint32_t tmp_hdrq_head;
+
+ PSM2_LOG_MSG("entering");
+ done = !next_hdrq_is_ready();
+
+ while (!done) {
+
+ rhf = (const __le32 *)rcv_hdr + recvq->hdrq_rhf_off;
+ rcv_ev.error_flags = hfi_hdrget_err_flags(rhf);
+ rcv_ev.ptype = hfi_hdrget_rcv_type(rhf);
+ rcv_ev.rhf = rhf;
+ rcv_ev.rcv_hdr = rcv_hdr;
+ rcv_ev.p_hdr =
+ recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(rcv_hdr, rhf)
+ : _get_proto_hdr(rcv_hdr);
+ rcv_ev.has_cksum =
+ ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM));
+
+ _HFI_VDBG
+ ("new packet: rcv_hdr %p, rhf_off %d, rhf %p (%x,%x), p_hdr %p\n",
+ rcv_hdr, recvq->hdrq_rhf_off, rhf, rhf[0], rhf[1],
+ rcv_ev.p_hdr);
+
+ /* If the hdrq_head is before cachedlastscan, that means that we have
+ * already prescanned this for BECNs and FECNs, so we should not check
+ * again
+ */
+ if_pt((recvq->proto->flags & IPS_PROTO_FLAG_CCA) &&
+ (state->hdrq_head >= state->hdrq_cachedlastscan)) {
+ /* IBTA CCA handling:
+ * If FECN bit set handle IBTA CCA protocol. For the
+ * flow that suffered congestion we flag it to generate
+ * a control packet with the BECN bit set - This is
+ * currently an unsolicited ACK.
+ *
+ * For all MQ packets the FECN processing/BECN
+ * generation is done in the is_expected_or_nak
+ * function as each eager packet is inspected there.
+ *
+ * For TIDFLOW/Expected data transfers the FECN
+ * bit/BECN generation is done in protoexp_data. Since
+ * header suppression can result in even FECN packets
+ * being suppressed the expected protocol generated
+ * additional BECN packets if a "large" number of
+ * generations are swapped without progress being made
+ * for receive. "Large" is set empirically to 4.
+ *
+ * FECN packets are ignored for all control messages
+ * (except ACKs and NAKs) since they indicate
+ * congestion on the control path which is not rate
+ * controlled. The CCA specification allows FECN on
+ * ACKs to be disregarded as well.
+ */
+ rcv_ev.is_congested =
+ _is_cca_fecn_set(rcv_ev.
+ p_hdr) & IPS_RECV_EVENT_FECN;
+ rcv_ev.is_congested |=
+ (_is_cca_becn_set(rcv_ev.p_hdr) <<
+ (IPS_RECV_EVENT_BECN - 1));
+ } else
+ rcv_ev.is_congested = 0;
+
+#ifdef PSM_DEBUG
+ if_pf(_check_headers(&rcv_ev))
+ goto skip_packet;
+#endif
+ dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr);
+
+ /* If the destination is not our subcontext, process
+ * message as subcontext message (shared contexts) */
+ if (dest_subcontext != recvq->subcontext) {
+ rcv_ev.ipsaddr = NULL;
+
+ ret = recvq->recvq_callbacks.callback_subcontext
+ (&rcv_ev, dest_subcontext);
+ if (ret == IPS_RECVHDRQ_REVISIT)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ goto skip_packet;
+ }
+
+ if_pf(rcv_ev.error_flags) {
+
+ _update_error_stats(recvq->proto, rcv_ev.error_flags);
+
+ recvq->recvq_callbacks.callback_error(&rcv_ev);
+
+ if ((rcv_ev.ptype != RCVHQ_RCV_TYPE_EAGER) ||
+ (!(rcv_ev.error_flags & HFI_RHF_TIDERR)))
+ goto skip_packet;
+
+ /* no pending eager update, header
+ * is not currently under tracing. */
+ if (state->hdr_countdown == 0 &&
+ state->rcv_egr_index_head == NO_EAGER_UPDATE) {
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+ const uint32_t etail =
+ ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t ehead =
+ ips_recvq_head_get(&recvq->egrq);
+
+ if (ehead == ((etail + 1) % egr_cnt)) {
+ /* eager is full,
+ * trace existing header entries */
+ uint32_t hdr_size =
+ recvq->hdrq_elemlast +
+ hdrq_elemsz;
+ const uint32_t htail =
+ ips_recvq_tail_get
+ (&recvq->hdrq);
+ const uint32_t hhead =
+ state->hdrq_head;
+
+ state->hdr_countdown =
+ (htail > hhead) ?
+ (htail - hhead) :
+ (htail + hdr_size - hhead);
+ }
+ }
+
+ /* Eager packet and tiderr.
+ * Don't consider updating egr head, unless we're in
+ * the congested state. If we're congested, we should
+ * try to keep the eager buffers free. */
+
+ if (!rcv_ev.is_congested)
+ goto skip_packet_no_egr_update;
+ else
+ goto skip_packet;
+ }
+
+ /* If checksum is enabled, verify that it is valid */
+ if_pf(rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev))
+ goto skip_packet;
+
+ _HFI_VDBG("opcode %x, payload %p paylen %d; "
+ "egrhead %lx egrtail %lx; "
+ "useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n",
+ _get_proto_hfi_opcode(rcv_ev.p_hdr),
+ ips_recvhdrq_event_payload(&rcv_ev),
+ ips_recvhdrq_event_paylen(&rcv_ev),
+ ips_recvq_head_get(&recvq->egrq),
+ ips_recvq_tail_get(&recvq->egrq),
+ hfi_hdrget_use_egrbfr(rhf),
+ hfi_hdrget_egrbfr_index(rhf),
+ hfi_hdrget_egrbfr_offset(rhf),
+ state->rcv_egr_index_head);
+
+ /* Classify packet from a known or unknown endpoint */
+ epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+ if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) {
+ rcv_ev.ipsaddr = NULL;
+ recvq->recvq_callbacks.
+ callback_packet_unknown(&rcv_ev);
+ } else {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+ ret = ips_proto_process_packet(&rcv_ev);
+ if (ret == IPS_RECVHDRQ_REVISIT)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+ }
+
+skip_packet:
+ /*
+ * if eager buffer is used, record the index.
+ */
+ if (hfi_hdrget_use_egrbfr(rhf)) {
+ /* set only when a new entry is used */
+ if (hfi_hdrget_egrbfr_offset(rhf) == 0){
+ state->rcv_egr_index_head =
+ hfi_hdrget_egrbfr_index(rhf);
+ state->num_egrq_done++;
+ }
+ /* a header entry is using an eager entry, stop tracing. */
+ state->hdr_countdown = 0;
+ }
+
+skip_packet_no_egr_update:
+ /* Note that state->hdrq_head is sampled speculatively by the code
+ * in ips_ptl_shared_poll() when context sharing, so it is not safe
+ * for this shared variable to temporarily exceed the last element. */
+ tmp_hdrq_head = state->hdrq_head + hdrq_elemsz;
+ _HFI_VDBG
+ ("dma_rtail %d head %d, elemsz %d elemlast %d tmp %d\n",
+ has_rtail, state->hdrq_head, hdrq_elemsz,
+ recvq->hdrq_elemlast, tmp_hdrq_head);
+
+ if_pt(tmp_hdrq_head <= recvq->hdrq_elemlast)
+ state->hdrq_head = tmp_hdrq_head;
+ else
+ state->hdrq_head = 0;
+
+ if_pf(has_rtail == 0
+ && ++recvq->state->hdrq_rhf_seq > LAST_RHF_SEQNO)
+ recvq->state->hdrq_rhf_seq = 1;
+
+ state->num_hdrq_done++;
+ num_hdrq_done++;
+ rcv_hdr =
+ (const uint32_t *)recvq->hdrq.base_addr + state->hdrq_head;
+ done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK)
+ || (num_hdrq_done == num_hdrq_todo));
+
+ do_hdr_update = (state->head_update_interval ?
+ (state->num_hdrq_done ==
+ state->head_update_interval) : done);
+ if (do_hdr_update) {
+ ips_recvq_head_update(&recvq->hdrq, state->hdrq_head);
+ /* Reset header queue entries processed */
+ state->num_hdrq_done = 0;
+ }
+ if (state->num_egrq_done >= state->egrq_update_interval) {
+ /* Lazy update of egrq */
+ if (state->rcv_egr_index_head != NO_EAGER_UPDATE) {
+ ips_recvq_head_update(&recvq->egrq,
+ state->
+ rcv_egr_index_head);
+ state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ state->num_egrq_done = 0;
+ }
+ }
+ if (state->hdr_countdown > 0) {
+ /* a header entry is consumed. */
+ state->hdr_countdown -= hdrq_elemsz;
+ if (state->hdr_countdown == 0) {
+ /* header entry count reaches zero. */
+ const uint32_t tail =
+ ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t head =
+ ips_recvq_head_get(&recvq->egrq);
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+
+ /* Checks eager-full again. This is a real false-egr-full */
+ if (head == ((tail + 1) % egr_cnt)) {
+ ips_recvq_head_update(&recvq->egrq,
+ tail);
+ _HFI_DBG
+ ("eager array full after overflow, flushing "
+ "(head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ recvq->proto->stats.egr_overflow++;
+ } else
+ _HFI_ERROR
+ ("PSM BUG: EgrOverflow: eager queue is not full\n");
+ }
+ }
+ }
+ /* while (hdrq_entries_to_read) */
+
+ /* Process any pending acks before exiting */
+ process_pending_acks(recvq);
+
+ PSM2_LOG_MSG("leaving");
+ return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS;
+}
+
+/* This function is designed to implement RAPID CCA. It iterates
+ through the recvq, checking each element for set FECN or BECN bits.
+ In the case of finding one, the proper response is executed, and the bits
+ are cleared.
+*/
+psm2_error_t ips_recvhdrq_scan_cca (struct ips_recvhdrq *recvq)
+{
+
+/* Looks at hdr and determines if it is the last item in the queue */
+/* NOTE(review): despite the name, this evaluates true while more entries
+ * remain to scan: the DMA_RTAIL arm compares against the tail register,
+ * the no-tail arm checks the RHF generation sequence.  The no-tail arm
+ * ignores the 'hdr' argument and always reads curr_hdr -- confirm that
+ * is intentional. */
+
+#define is_last_hdr(hdr) \
+ (has_rtail ? \
+ (hdr != ips_recvq_tail_get(&recvq->hdrq)) : \
+ (recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, curr_hdr)))
+
+ struct ips_recvhdrq_state *state = recvq->state;
+ const __le32 *rhf;
+ PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto,
+ .recvq = recvq
+ };
+
+ /* Resume where the previous prescan stopped; hdrq_cachedlastscan and
+ * elemsz are in 32-bit words (see ips_recvq_params). */
+ uint32_t num_hdrq_done = state->hdrq_cachedlastscan / recvq->hdrq.elemsz;
+ const int num_hdrq_todo = recvq->hdrq.elemcnt;
+ const uint32_t hdrq_elemsz = recvq->hdrq.elemsz;
+
+ int done;
+
+ /* Chip features */
+ const int has_rtail = recvq->runtime_flags & HFI1_CAP_DMA_RTAIL;
+
+ uint32_t *rcv_hdr =
+ (uint32_t *)recvq->hdrq.base_addr + state->hdrq_cachedlastscan;
+ uint32_t *curr_hdr = rcv_hdr;
+ uint32_t scan_head = state->hdrq_head + state->hdrq_cachedlastscan;
+
+ /* Skip the first element, since we're going to process it soon anyway */
+ if ( state->hdrq_cachedlastscan == 0 )
+ {
+ curr_hdr = curr_hdr + hdrq_elemsz;
+ scan_head += hdrq_elemsz;
+ num_hdrq_done++;
+ }
+
+ PSM2_LOG_MSG("entering");
+ done = !is_last_hdr(scan_head);
+
+ while (!done) {
+ /* Decode the RHF and locate the protocol header for this entry. */
+ rhf = (const __le32 *)curr_hdr + recvq->hdrq_rhf_off;
+ rcv_ev.error_flags = hfi_hdrget_err_flags(rhf);
+ rcv_ev.ptype = hfi_hdrget_rcv_type(rhf);
+ rcv_ev.rhf = rhf;
+ rcv_ev.rcv_hdr = curr_hdr;
+ rcv_ev.p_hdr =
+ recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(curr_hdr, rhf)
+ : _get_proto_hdr(curr_hdr);
+ rcv_ev.has_cksum =
+ ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM));
+
+ _HFI_VDBG
+ ("scanning packet for CCA: curr_hdr %p, rhf_off %d, rhf %p (%x,%x), p_hdr %p\n",
+ curr_hdr, recvq->hdrq_rhf_off, rhf, rhf[0], rhf[1],
+ rcv_ev.p_hdr);
+
+ /* FECN: the fabric marked forward congestion on this packet.
+ * Respond by sending a BECN control message back to the sender
+ * (carrying the flow's out-of-order packet count), then clear
+ * the FECN event bit. */
+ if_pt ( _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN ) {
+ struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+
+ if (epstaddr != NULL && epstaddr->ipsaddr != NULL)
+ {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+
+ /* Send BECN back */
+ ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev.p_hdr;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ ips_scb_t ctrlscb;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0].u32w0 =
+ flow->cca_ooo_pkts;
+
+ rcv_ev.proto->epaddr_stats.congestion_pkts++;
+ /* Clear FECN event */
+ rcv_ev.is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ ips_proto_send_ctrl_message(flow,
+ OPCODE_BECN,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+ }
+ /* BECN: the peer asked us to back off.  Raise the flow path's
+ * CCTI (i.e. slow the send rate) if still under ccti_limit,
+ * then clear the BECN event bit. */
+ else if_pt (0 != (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1))) {
+ struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+
+ if (epstaddr != NULL && epstaddr->ipsaddr != NULL)
+ {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+
+ /* Adjust flow */
+ struct ips_proto *proto = rcv_ev.proto;
+ struct ips_message_header *p_hdr = rcv_ev.p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr;
+ struct ips_flow *flow;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].ccti_increase);
+ /* Clear congestion event */
+ rcv_ev.is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+ }
+ }
+
+ /* Advance to next entry and remember how far we scanned so the
+ * next invocation does not rescan these entries. */
+ curr_hdr = curr_hdr + hdrq_elemsz;
+
+ num_hdrq_done++;
+ scan_head += hdrq_elemsz;
+ state->hdrq_cachedlastscan += hdrq_elemsz;
+
+ done = (num_hdrq_done == num_hdrq_todo && !is_last_hdr(scan_head) );
+
+ }
+ /* while (hdrq_entries_to_read) */
+
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h
new file mode 100644
index 0000000..15761aa
--- /dev/null
+++ b/ptl_ips/ips_recvhdrq.h
@@ -0,0 +1,240 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ips_proto.h"
+#include "ips_proto_header.h"
+#include "ips_proto_params.h"
+#include "ips_recvq.h"
+
+#ifndef _IPS_RECVHDRQ_H
+#define _IPS_RECVHDRQ_H
+
+struct ips_recvhdrq;
+struct ips_recvhdrq_state;
+struct ips_epstate;
+
+/* Return codes of per-packet processing (ips_proto_process_packet et al.),
+ * telling the recvhdrq progress loop what to do next. */
+/* process current packet, continue on next packet */
+#define IPS_RECVHDRQ_CONTINUE 0
+/* process current packet, break and return to caller */
+#define IPS_RECVHDRQ_BREAK 1
+/* keep current packet, revisit the same packet next time */
+#define IPS_RECVHDRQ_REVISIT 2
+
+#define IPS_RECVHDRQ_ELEMSZ_MAX 32 /* 128 bytes */
+/* RHF generation sequence wraps back to 1 after this value
+ * (see the seq handling in ips_recvhdrq_progress). */
+#define LAST_RHF_SEQNO 13
+
+/* CCA related receive events; bit flags stored in
+ * ips_recvhdrq_event.is_congested */
+#define IPS_RECV_EVENT_FECN 0x1
+#define IPS_RECV_EVENT_BECN 0x2
+
+/* Per-packet receive event: decoded view of one header-queue entry,
+ * handed to packet processing and to the recvq callbacks. */
+struct ips_recvhdrq_event {
+ struct ips_proto *proto;
+ const struct ips_recvhdrq *recvq; /* where message received */
+ const uint32_t *rcv_hdr; /* rcv_hdr ptr */
+ const __le32 *rhf; /* receive header flags */
+ struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */
+ struct ips_epaddr *ipsaddr; /* peer ipsaddr, if available */
+ uint32_t error_flags; /* error flags */
+ uint8_t has_cksum; /* payload has cksum */
+ uint8_t is_congested; /* Packet faced congestion; bitmask of
+ * IPS_RECV_EVENT_FECN/BECN */
+ uint16_t ptype; /* packet type */
+};
+
+/* Callbacks the recvhdrq invokes for packets it cannot handle itself:
+ * unknown source endpoint, a different subcontext, or an error. */
+struct ips_recvhdrq_callbacks {
+ int (*callback_packet_unknown) (const struct ips_recvhdrq_event *);
+ int (*callback_subcontext) (const struct ips_recvhdrq_event *,
+ uint32_t subcontext);
+ int (*callback_error) (struct ips_recvhdrq_event *);
+};
+
+/* Initialize a receive header queue object (recvq) and its shared state
+ * (recvq_state) over the given hdrq/egrq hardware queue parameters. */
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+ const struct ips_epstate *epstate,
+ const struct ips_proto *proto,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ const struct ips_recvhdrq_callbacks *callbacks,
+ uint32_t flags,
+ uint32_t subcontext,
+ struct ips_recvhdrq *recvq,
+ struct ips_recvhdrq_state *recvq_state);
+
+/* Drain available header-queue entries; returns PSM2_OK when at least one
+ * entry was processed, PSM2_OK_NO_PROGRESS otherwise. */
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq);
+
+psm2_error_t ips_recvhdrq_fini(struct ips_recvhdrq *recvq);
+
+/*
+ * This function is designed to implement RAPID CCA. It iterates
+ * through the recvq, checking each element for set FECN or BECN bits.
+ * In the case of finding one, the proper response is executed, and the bits
+ * are cleared.
+ */
+psm2_error_t ips_recvhdrq_scan_cca(struct ips_recvhdrq *recvq);
+
+/*
+ * Structure containing state for recvhdrq reading. This is logically
+ * part of ips_recvhdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the context. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+/* Sentinel (all bits set) meaning no eager head update is pending. */
+#define NO_EAGER_UPDATE ~0U
+struct ips_recvhdrq_state {
+ uint32_t hdrq_head; /* software copy of head */
+ uint32_t rcv_egr_index_head; /* software copy of eager index head;
+ * NO_EAGER_UPDATE when none pending */
+ uint32_t hdrq_rhf_seq; /* last seq */
+ uint32_t head_update_interval; /* Header update interval */
+ uint32_t num_hdrq_done; /* Num header queue done */
+ uint32_t egrq_update_interval; /* Eager buffer update interval */
+ uint32_t num_egrq_done; /* num eager buffer done */
+ uint32_t hdr_countdown; /* for false-egr-full tracing */
+ uint32_t hdrq_cachedlastscan; /* last element to be prescanned
+ * by ips_recvhdrq_scan_cca */
+};
+
+/*
+ * Structure to read from recvhdrq
+ */
+/* Receive header queue object: hardware queue parameters, per-context
+ * shared state, eager buffer lookup table, endpoint lookup and callbacks.
+ * See ips_recvhdrq_state for the part placed in shared memory. */
+struct ips_recvhdrq {
+ struct ips_proto *proto;
+ const psmi_context_t *context; /* error handling, epid id, etc. */
+ struct ips_recvhdrq_state *state;
+ uint32_t context_flags; /* derived from base_info.spi_runtime_flags */
+ uint32_t subcontext; /* messages that don't match subcontext call
+ * recv_callback_subcontext */
+
+ /* Header queue handling */
+ pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */
+ uint32_t hdrq_rhf_off; /* rhf offset */
+ int hdrq_rhf_notail; /* rhf notail enabled */
+ uint32_t hdrq_elemlast; /* last element precomputed */
+ struct ips_recvq_params hdrq;
+
+ /* Eager queue handling */
+ void **egrq_buftable; /* table of eager idx-to-ptr */
+ struct ips_recvq_params egrq;
+
+ /* Lookup endpoints epid -> ptladdr (rank)) */
+ const struct ips_epstate *epstate;
+
+ /* Callbacks to handle recvq events */
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+
+ /* List of flows with pending acks for receive queue */
+ SLIST_HEAD(pending_flows, ips_flow) pending_acks;
+
+ uint32_t runtime_flags;
+ volatile __u64 *spi_status;
+};
+
+/* Nonzero when no unprocessed header-queue entry is available.  In no-tail
+ * mode the RHF generation sequence at the software head is compared with
+ * the expected sequence; otherwise the DMA'd tail register is compared
+ * with the software head copy. */
+PSMI_INLINE(int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq))
+{
+ if (recvq->hdrq_rhf_notail) /* use rhf-based reads */
+ return recvq->state->hdrq_rhf_seq !=
+ hfi_hdrget_seq(recvq->hdrq.base_addr +
+ recvq->state->hdrq_head +
+ recvq->hdrq_rhf_off);
+ else
+ return ips_recvq_tail_get(&recvq->hdrq) ==
+ recvq->state->hdrq_head;
+}
+
+/* Return a pointer to the packet's eager payload, or NULL when the RHF
+ * says no eager buffer was used for this packet. */
+PSMI_INLINE(
+void *
+ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
+{
+ /* XXX return NULL if no eager buffer allocated */
+ if (hfi_hdrget_use_egrbfr(rcv_ev->rhf))
+ /* RHF offset is scaled by 64 here, i.e. presumably expressed
+ * in 64-byte units -- TODO confirm against the HFI RHF spec. */
+ return ips_recvq_egr_index_2_ptr(rcv_ev->recvq->egrq_buftable,
+ hfi_hdrget_egrbfr_index
+ (rcv_ev->rhf),
+ hfi_hdrget_egrbfr_offset
+ (rcv_ev->rhf) * 64);
+ else
+ return NULL;
+}
+
+/* Payload length in bytes: the RHF total length minus the protocol header,
+ * the HFI CRC, and the optional software checksum trailer. */
+PSMI_INLINE(
+uint32_t
+ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0;
+
+ return hfi_hdrget_length_in_bytes(rcv_ev->rhf) -
+ (sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES + cksum_len);
+ /* PSM does not use bth0.PadCnt, it figures out real datalen other way */
+}
+
+/* Spinlock wrappers around hdrq_lock.  Note the inverted convention:
+ * pthread_spin_* return 0 on success, so these return 1 on success. */
+PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_trylock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_lock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_unlock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+#endif /* _IPS_RECVHDRQ_H */
diff --git a/ptl_ips/ips_recvq.c b/ptl_ips/ips_recvq.c
new file mode 100644
index 0000000..55b702c
--- /dev/null
+++ b/ptl_ips/ips_recvq.c
@@ -0,0 +1,91 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_recvq.h"
+
+/* We return a table of pointer indexes.
+ *
+ * From the point of view of the returned pointer, index -1 always points to
+ * the address to call psmi_free on (since we force page-alignment).
+ */
+/* Build the eager index -> buffer pointer table: entry i points to
+ * baseptr + i * bufsize.  Returns NULL on allocation failure.  Free with
+ * ips_recvq_egrbuf_table_free(), which recovers the raw allocation from
+ * the back-pointer stashed at index -1. */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr,
+ uint32_t bufnum, uint32_t bufsize)
+{
+ unsigned i;
+ void *ptr_alloc;
+ uintptr_t *buft;
+ uintptr_t base = (uintptr_t) baseptr;
+
+ /* One extra page so the table can be pushed up to a page boundary,
+ * plus one extra slot for the back-pointer at buft[-1]. */
+ ptr_alloc = psmi_malloc(ep, UNDEFINED,
+ PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum +
+ 1));
+ if (ptr_alloc == NULL)
+ return NULL;
+ /* First pointer is to the actual allocated address, so we can free it but
+ * buft[0] is first on the page boundary
+ */
+ /* NOTE(review): 'ptr_alloc + 1' is void* arithmetic (GCC extension,
+ * +1 byte); the page alignment guarantees >= sizeof(uintptr_t) bytes
+ * of slack before buft for the buft[-1] back-pointer. */
+ buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc + 1, PSMI_PAGESIZE);
+ buft[-1] = (uintptr_t) ptr_alloc;
+ for (i = 0; i < bufnum; i++)
+ buft[i] = (uintptr_t) ((char *)base + i * bufsize);
+ return (void **)buft;
+}
+
+/* Free a table returned by ips_recvq_egrbuf_table_alloc() by recovering
+ * the original allocation pointer stored at index -1. */
+void ips_recvq_egrbuf_table_free(void **buftable)
+{
+ uintptr_t *buft = (uintptr_t *) buftable;
+ void *ptr_alloc = (void *)buft[-1];
+ psmi_free(ptr_alloc);
+}
diff --git a/ptl_ips/ips_recvq.h b/ptl_ips/ips_recvq.h
new file mode 100644
index 0000000..3236da6
--- /dev/null
+++ b/ptl_ips/ips_recvq.h
@@ -0,0 +1,124 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_RECVQ_H
+#define _IPS_RECVQ_H
+
+#include "psm_user.h"
+
+/* Hardware receive queue descriptor shared by the header (hdrq) and
+ * eager (egrq) queues. */
+struct ips_recvq_params {
+ volatile __le64 *tail_register; /* location of tail */
+ volatile __le64 *head_register; /* location of head */
+ uint32_t *base_addr; /* base address of q */
+ uint32_t elemsz; /* size of q elements (in words) */
+ uint32_t elemcnt; /* num of q elements; used as the entry
+ * count (e.g. the modulo in the
+ * eager-full checks), not words */
+};
+
+/*
+ * Tables to map eager indexes into their buffer addresses
+ *
+ * If function returns NULL, no memory has been allocated and the error handler
+ * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY.
+ */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep,
+ void *base, uint32_t bufnum,
+ uint32_t bufsize);
+void ips_recvq_egrbuf_table_free(void **buftable);
+
+/*
+ * Accessor inlines for reading and writing to hdrq/egrq registers
+ */
+/* Translate an eager buffer index + byte offset into a payload pointer
+ * using the table built by ips_recvq_egrbuf_table_alloc(). */
+PSMI_ALWAYS_INLINE(
+void *
+ips_recvq_egr_index_2_ptr(void **egrq_buftable, int index, int offset))
+{
+ return (void *)((char *)egrq_buftable[index] + offset);
+}
+
+/* Register accessors: values are little-endian 64-bit; the two "get"
+ * variants issue a read barrier (ips_rmb) after the load, while the
+ * "update" variants are plain stores. */
+PSMI_INLINE(
+void
+ips_recvq_head_update(const struct ips_recvq_params *recvq, uint64_t newhead))
+{
+ *recvq->head_register = __cpu_to_le64(newhead);
+ return;
+}
+
+PSMI_INLINE(
+uint64_t
+ips_recvq_head_get(const struct ips_recvq_params *recvq))
+{
+ uint64_t res = __le64_to_cpu(*recvq->head_register);
+ ips_rmb();
+ return res;
+}
+
+PSMI_INLINE(
+void
+ips_recvq_tail_update(const struct ips_recvq_params *recvq, uint64_t newtail))
+{
+ *recvq->tail_register = __cpu_to_le64(newtail);
+ return;
+}
+
+PSMI_INLINE(
+uint64_t
+ips_recvq_tail_get(const struct ips_recvq_params *recvq))
+{
+ uint64_t res = __le64_to_cpu(*recvq->tail_register);
+ ips_rmb();
+ return res;
+}
+
+#endif /* _IPS_RECVQ_H */
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
new file mode 100644
index 0000000..0dbae1e
--- /dev/null
+++ b/ptl_ips/ips_scb.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ips_proto.h"
+#include "ips_scb.h"
+#include "ips_proto_internal.h"
+
+/* Initialize a send-control-block (scb) pool: numscb scbs, an optional
+ * pool of numbufs send buffers of bufsize bytes, an optional per-scb
+ * immediate buffer of imm_size bytes, and one ack + one send timer per
+ * scb.  scb_avail_callback/scb_avail_context are invoked when scbs become
+ * available again (see ips_scbctrl_free).  Returns PSM2_OK on success or
+ * PSM2_NO_MEMORY.
+ * NOTE(review): on failure, allocations made before the failing one are
+ * not released here -- presumably the caller is expected to run
+ * ips_scbctrl_fini(); confirm. */
+psm2_error_t
+ips_scbctrl_init(const psmi_context_t *context,
+ uint32_t numscb, uint32_t numbufs,
+ uint32_t imm_size, uint32_t bufsize,
+ ips_scbctrl_avail_callback_fn_t scb_avail_callback,
+ void *scb_avail_context, struct ips_scbctrl *scbc)
+{
+ int i;
+ struct ips_scb *scb;
+ size_t scb_size;
+ size_t alloc_sz;
+ uintptr_t base, imm_base;
+ psm2_ep_t ep = context->ep;
+ /* scbc->context = context; */
+ psm2_error_t err = PSM2_OK;
+
+ psmi_assert_always(numscb > 0);
+ scbc->sbuf_num = scbc->sbuf_num_cur = numbufs;
+ SLIST_INIT(&scbc->sbuf_free);
+ scbc->sbuf_buf_size = bufsize;
+ scbc->sbuf_buf_base = NULL;
+ scbc->sbuf_buf_alloc = NULL;
+ scbc->sbuf_buf_last = NULL;
+
+ /* send buffers are not mandatory but when allocating them, make sure they
+ * are on a page boundary */
+ if (numbufs > 0) {
+ struct ips_scbbuf *sbuf;
+ int redzone = PSM_VALGRIND_REDZONE_SZ;
+
+ /* If the allocation requested is a page and we have redzones we have
+ * to allocate 2 pages so we end up using a redzone of 2048 bytes.
+ *
+ * if the allocation is not 4096, we relax that requirement and keep
+ * the redzones PSM_VALGRIND_REDZONE_SZ
+ */
+ if (redzone > 0 && bufsize % PSMI_PAGESIZE == 0)
+ redzone = PSMI_PAGESIZE / 2;
+ bufsize += 2 * redzone;
+ bufsize = PSMI_ALIGNUP(bufsize, 64);
+
+ alloc_sz = numbufs * bufsize + redzone + PSMI_PAGESIZE;
+ scbc->sbuf_buf_alloc =
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->sbuf_buf_alloc == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ base = (uintptr_t) scbc->sbuf_buf_alloc;
+ base = PSMI_ALIGNUP(base + redzone, PSMI_PAGESIZE);
+ scbc->sbuf_buf_base = (void *)base;
+ scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1));
+ _HFI_VDBG
+ ("sendbufs=%d, (redzone=%d|size=%d|redzone=%d),base=[%p..%p)\n",
+ numbufs, redzone, bufsize - 2 * redzone, redzone,
+ (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last);
+
+ /* Carve the allocation into buffers and push each onto the
+ * free list (the list link lives inside the buffer itself). */
+ for (i = 0; i < numbufs; i++) {
+ sbuf = (struct ips_scbbuf *)(base + bufsize * i);
+ SLIST_NEXT(sbuf, next) = NULL;
+ SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next);
+ }
+
+ VALGRIND_CREATE_MEMPOOL(scbc->sbuf_buf_alloc, 0,
+ /* Should be undefined but we stuff a next
+ * pointer in the buffer */
+ PSM_VALGRIND_MEM_DEFINED);
+ }
+
+ /* Optional immediate-payload area: one 64-byte-aligned slot per scb,
+ * used by ips_scbctrl_bufalloc for small payloads. */
+ imm_base = 0;
+ scbc->scb_imm_size = imm_size;
+ if (scbc->scb_imm_size) {
+ scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64);
+ alloc_sz = numscb * scbc->scb_imm_size + 64;
+ scbc->scb_imm_buf =
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->scb_imm_buf == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64);
+ } else
+ scbc->scb_imm_buf = NULL;
+
+ scbc->scb_num = scbc->scb_num_cur = numscb;
+ SLIST_INIT(&scbc->scb_free);
+ scb_size = sizeof(struct ips_scb) + 2 * PSM_VALGRIND_REDZONE_SZ;
+ scb_size = PSMI_ALIGNUP(scb_size, 64);
+ alloc_sz = numscb * scb_size + PSM_VALGRIND_REDZONE_SZ + 64;
+ scbc->scb_base = (void *)
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->scb_base == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ base = (uintptr_t) scbc->scb_base;
+ base = PSMI_ALIGNUP(base + PSM_VALGRIND_REDZONE_SZ, 64);
+
+ /*
+ * Allocate ack/send timer for each scb object.
+ */
+ scbc->timers = (struct psmi_timer *)
+ psmi_calloc(ep, UNDEFINED, 2*numscb,
+ sizeof(struct psmi_timer));
+ if (scbc->timers == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ for (i = 0; i < numscb; i++) {
+ scb = (struct ips_scb *)(base + i * scb_size);
+ scb->scbc = scbc;
+ if (scbc->scb_imm_buf)
+ scb->imm_payload =
+ (void *)(imm_base + (i * scbc->scb_imm_size));
+ else
+ scb->imm_payload = NULL;
+
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+ /*
+ * Initialize timers.
+ * Associate the timers to each scb, the association is
+ * not fixed because later PSM may exchange the timers
+ * between scb, the reason for exchanging is that the
+ * timer is currently using by flow, but the scb is to
+ * be freed. see ack/nak processing in file ips_prot_recv.c
+ */
+ scb->timer_ack = &scbc->timers[2*i];
+ psmi_timer_entry_init(scb->timer_ack,
+ ips_proto_timer_ack_callback, scb);
+
+ scb->timer_send = &scbc->timers[2*i+1];
+ psmi_timer_entry_init(scb->timer_send,
+ ips_proto_timer_send_callback, scb);
+ }
+ scbc->scb_avail_callback = scb_avail_callback;
+ scbc->scb_avail_context = scb_avail_context;
+
+ /* It would be nice to mark the scb as undefined but we pre-initialize the
+ * "next" pointer and valgrind would see this as a violation.
+ */
+ VALGRIND_CREATE_MEMPOOL(scbc, PSM_VALGRIND_REDZONE_SZ,
+ PSM_VALGRIND_MEM_DEFINED);
+
+ /* Fall-through exit: reached on success too, with err == PSM2_OK. */
+fail:
+ return err;
+}
+
+/* Release the scb pool's main allocations.
+ * NOTE(review): scbc->timers and scbc->scb_imm_buf, allocated in
+ * ips_scbctrl_init, are not freed here -- confirm they are released
+ * elsewhere or this leaks.  Also note the free/destroy ordering differs
+ * between the two branches. */
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
+{
+ if (scbc->scb_base != NULL) {
+ psmi_free(scbc->scb_base);
+ VALGRIND_DESTROY_MEMPOOL(scbc);
+ }
+ if (scbc->sbuf_buf_alloc) {
+ VALGRIND_DESTROY_MEMPOOL(scbc->sbuf_buf_alloc);
+ psmi_free(scbc->sbuf_buf_alloc);
+ }
+ return PSM2_OK;
+}
+
+/* Attach a payload buffer to scb for scb->payload_size bytes: the scb's
+ * own immediate buffer when the payload fits, otherwise a buffer from the
+ * shared send-buffer pool.  Returns 1 on success, 0 when the pool is
+ * empty.  May set IPS_SEND_FLAG_ACKREQ under buffer pressure. */
+int ips_scbctrl_bufalloc(ips_scb_t *scb)
+{
+ struct ips_scbctrl *scbc = scb->scbc;
+
+ psmi_assert(scbc->sbuf_num > 0);
+ /* scb must not already hold a pool buffer */
+ psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+ (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)));
+ psmi_assert(scb->payload_size <= scbc->sbuf_buf_size);
+
+ if (scb->payload_size <= scbc->scb_imm_size) {
+ /* Attach immediate buffer */
+ ips_scb_buffer(scb) = scb->imm_payload;
+ return 1;
+ }
+
+ if (SLIST_EMPTY(&scbc->sbuf_free))
+ return 0;
+ else {
+ psmi_assert(scbc->sbuf_num_cur);
+ ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free);
+ scbc->sbuf_num_cur--;
+
+ /* If under memory pressure request ACK for packet to reclaim
+ * credits.
+ */
+ if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ VALGRIND_MEMPOOL_ALLOC(scbc->sbuf_buf_alloc, ips_scb_buffer(scb),
+ scb->payload_size);
+ SLIST_REMOVE_HEAD(&scbc->sbuf_free, next);
+ return 1;
+ }
+}
+
+/* Nonzero when both an scb and a send buffer are available. */
+int ips_scbctrl_avail(struct ips_scbctrl *scbc)
+{
+ return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0);
+}
+
+/* Allocate up to scbnum scbs and return them as a singly-linked list
+ * (most recently allocated first); NULL when none could be allocated.
+ * With IPS_SCB_FLAG_ADD_BUFFER, each scb also gets a payload buffer for
+ * len bytes; allocation stops early if scbs or buffers run out.
+ * NOTE(review): when ips_scbctrl_bufalloc fails, the current scb stays on
+ * the free list but its VALGRIND_MEMPOOL_ALLOC has already run -- confirm
+ * the valgrind bookkeeping tolerates this. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len,
+ uint32_t flags)
+{
+ ips_scb_t *scb, *scb_head = NULL;
+
+ psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? (scbc->sbuf_num > 0) : 1);
+ psmi_assert(scbc->sbuf_buf_size >= len);
+
+ while (scbnum--) {
+ if (SLIST_EMPTY(&scbc->scb_free))
+ break;
+ scb = SLIST_FIRST(&scbc->scb_free);
+ scb->flags = 0; /* Need to set this here as bufalloc may request
+ * an ACK under memory pressure
+ */
+ VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb));
+
+ if (flags & IPS_SCB_FLAG_ADD_BUFFER) {
+ scb->payload_size = len;
+ if (!ips_scbctrl_bufalloc(scb))
+ break;
+ } else {
+ ips_scb_buffer(scb) = NULL;
+ scb->payload_size = 0;
+ }
+
+ scb->tidsendc = NULL;
+ scb->callback = NULL;
+ scb->tidctrl = 0;
+ scb->nfrag = 1;
+ scb->frag_size = 0;
+#ifdef PSM_CUDA
+ scb->mq_req = NULL;
+#endif
+
+ /* Under scb pressure, request an ACK to reclaim credits sooner. */
+ scbc->scb_num_cur--;
+ if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+ SLIST_NEXT(scb, next) = scb_head;
+ scb_head = scb;
+ }
+ return scb_head;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc);
+
+/* Return an scb (and, if it came from the shared pool, its payload
+ * buffer) to the free lists.  Fires scb_avail_callback exactly when the
+ * scb free list transitions from empty to non-empty. */
+void ips_scbctrl_free(ips_scb_t *scb)
+{
+ struct ips_scbctrl *scbc = scb->scbc;
+ /* Only buffers inside the pool range are returned; immediate buffers
+ * belong to the scb itself and are not freed. */
+ if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+ (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) {
+ scbc->sbuf_num_cur++;
+ SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next);
+ VALGRIND_MEMPOOL_FREE(scbc->sbuf_buf_alloc, ips_scb_buffer(scb));
+ }
+
+ ips_scb_buffer(scb) = NULL;
+ scb->tidsendc = NULL;
+ scb->payload_size = 0;
+ scbc->scb_num_cur++;
+ if (SLIST_EMPTY(&scbc->scb_free)) {
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+ if (scbc->scb_avail_callback != NULL)
+ scbc->scb_avail_callback(scbc, scbc->scb_avail_context);
+ } else
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+ VALGRIND_MEMPOOL_FREE(scbc, scb);
+ return;
+}
+
+/* Fast-path allocation of a single scb with no payload buffer attached
+ * (for tiny/control messages).  Returns NULL when the pool is exhausted.
+ * May set IPS_SEND_FLAG_ACKREQ under scb pressure. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc)
+{
+ ips_scb_t *scb;
+ if (SLIST_EMPTY(&scbc->scb_free))
+ return NULL;
+ scb = SLIST_FIRST(&scbc->scb_free);
+
+ VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb));
+ SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+ SLIST_NEXT(scb, next) = NULL;
+
+ ips_scb_buffer(scb) = NULL;
+ scb->payload_size = 0;
+ scb->flags = 0;
+ scb->tidsendc = NULL;
+ scb->callback = NULL;
+ scb->tidctrl = 0;
+ scb->nfrag = 1;
+ scb->frag_size = 0;
+#ifdef PSM_CUDA
+ scb->mq_req = NULL;
+#endif
+
+ scbc->scb_num_cur--;
+ if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ return scb;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny);
diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h
new file mode 100644
index 0000000..62a509b
--- /dev/null
+++ b/ptl_ips/ips_scb.h
@@ -0,0 +1,226 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_SCB_H
+#define _IPS_SCB_H
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ips_proto_header.h"
+
+/* ips_alloc_scb flags */
+#define IPS_SCB_FLAG_NONE 0x0
+#define IPS_SCB_FLAG_ADD_BUFFER 0x1
+
+/* macros to update scb */
+/* These accessors expand to lvalues: callers both read and assign
+ * through them (e.g. "ips_scb_buffer(scb) = NULL").  Note the scb
+ * argument is expanded unparenthesized, so pass a plain pointer
+ * expression, not a compound one. */
+#define ips_scb_hdrdata(scb) scb->ips_lrh.hdr_data
+#define ips_scb_uwords(scb) scb->ips_lrh.data
+#define ips_scb_opcode(scb) scb->opcode
+#define ips_scb_buffer(scb) scb->payload
+#define ips_scb_length(scb) scb->payload_size
+#define ips_scb_flags(scb) scb->flags
+#define ips_scb_dma_cntr(scb) scb->dma_cntr
+#define ips_scb_epaddr(scb) scb->epaddr
+#define ips_scb_cb(scb) scb->callback
+#define ips_scb_cb_param(scb) scb->cb_param
+
+#define ips_scb_copy_tag(dst, src) \
+ (dst)[0] = (src)[0]; \
+ (dst)[1] = (src)[1]; \
+ (dst)[2] = (src)[2];
+
+struct ips_scbbuf;
+struct ips_scb;
+struct ips_scbctrl;
+struct ips_tid_send_desc;
+
+/* Invoked by ips_scbctrl_free() when scbs become available again. */
+typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *,
+ void *context);
+
+STAILQ_HEAD(ips_scb_stailq, ips_scb);
+SLIST_HEAD(ips_scb_slist, ips_scb);
+
+/*
+ * Pool of send control blocks (scbs) plus optional bounce buffers.
+ * One scb is consumed per send via ips_scbctrl_alloc*() and returned
+ * via ips_scbctrl_free(); an optional callback fires when the scb
+ * free list refills after exhaustion.
+ */
+struct ips_scbctrl {
+ /* const psmi_context_t *context; */
+
+ /* Send control blocks for each send */
+ uint32_t scb_num; /* total scbs owned by this pool */
+ uint32_t scb_num_cur; /* scbs currently on the free list */
+ SLIST_HEAD(scb_free, ips_scb) scb_free; /* free scbs */
+ void *scb_base; /* NOTE(review): presumably base of the scb array allocation — confirm in ips_scb.c */
+ ips_scbctrl_avail_callback_fn_t scb_avail_callback; /* fired on free-list refill */
+ void *scb_avail_context; /* opaque arg for scb_avail_callback */
+
+ /* Immediate data for send buffers */
+ uint32_t scb_imm_size; /* NOTE(review): presumably bytes of immediate buffer per scb — confirm */
+ void *scb_imm_buf;
+ psmi_timer *timers; /* ack/send timers */
+
+ /*
+ * Send buffers (or bounce buffers) to keep user data if we need to
+ * retransmit.
+ */
+ uint32_t sbuf_num; /* total bounce buffers (0 = no bounce pool) */
+ uint32_t sbuf_num_cur; /* bounce buffers currently free */
+ SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; /* free bounce buffers */
+ void *sbuf_buf_alloc; /* raw allocation; also the valgrind mempool handle */
+ uint32_t sbuf_buf_size;
+ void *sbuf_buf_base; /* first valid bounce-buffer address (ownership test) */
+ void *sbuf_buf_last; /* last valid bounce-buffer address (ownership test) */
+};
+
+/* A free bounce buffer; the list linkage lives inside the buffer itself. */
+struct ips_scbbuf {
+ SLIST_ENTRY(ips_scbbuf) next;
+};
+
+typedef struct ips_scb ips_scb_t;
+
+/*
+ * Send control block: per-send state for one packet (or a multi-packet
+ * fragment sequence when nfrag > 1), ending with the cache-aligned
+ * PBC + packet header that is written to the hardware.
+ */
+struct ips_scb {
+ union { /* free-list vs queue linkage — one at a time */
+ SLIST_ENTRY(ips_scb) next;
+ STAILQ_ENTRY(ips_scb) nextq;
+ };
+ union { /* payload doubles as the bounce-buffer free node on release */
+ void *payload;
+ struct ips_scbbuf *sbuf;
+ };
+ uint64_t ack_timeout; /* in cycles */
+ uint64_t abs_timeout; /* in cycles */
+
+ psmi_timer *timer_send; /* for sending packets */
+ psmi_timer *timer_ack; /* for acking packets */
+
+ /* Used when composing packet */
+ psmi_seqnum_t seq_num;
+ uint32_t cksum[2];
+ uint32_t flags; /* IPS_SEND_FLAG_* (e.g. ACKREQ) */
+ uint32_t payload_size; /* remaining first packet size */
+ uint32_t chunk_size; /* total buffer size if nfrag > 1 */
+ /* initially chunk_size_remaining = chunk_size. */
+ uint32_t chunk_size_remaining; /* buffer size to re-transmit */
+ uint16_t nfrag; /* total packets in sequence */
+ /* initially nfrag_remaining = nfrag */
+ uint16_t nfrag_remaining; /* number packets to re-transmit */
+ uint16_t dma_complete;
+ uint16_t tidctrl;
+ uint16_t frag_size; /* max packet size in sequence */
+ uint16_t opcode;
+
+ struct ips_flow *flow;
+ struct ips_tid_send_desc *tidsendc;
+ uint32_t *tsess;
+ uint16_t tsess_length;
+
+ struct ips_scbctrl *scbc; /* owning pool; used by ips_scbctrl_free */
+ void *imm_payload;
+
+ union { /* completion notification — callback or AM completion */
+ int (*callback) (void *, uint32_t);
+ psm2_am_completion_fn_t completion_am;
+ };
+ void *cb_param;
+#ifdef PSM_CUDA
+ psm2_mq_req_t mq_req; /* back pointer to original request */
+#endif
+
+ /* sdma header place holder, PSM2 code should access
+ * the sdma_req_info only using the psmi_get_sdma_req_info()
+ * accessor function. */
+ /*
+ * The size of struct sdma_req_info is variable. (10 bytes for
+ * GPU-direct and 8 bytes for non GPU-Direct)
+ * When GPU-Direct feature is used, all 10 bytes of the space is used.
+ * Otherwise, we only use upto 8 bytes. The usage is controlled by
+ * psmi_get_sdma_req_info() in ips_proto.h
+ */
+ union {
+ struct sdma_req_info _DO_NOT_USE_;
+ struct sdma_req_info_v6_3 _PLEASE_DO_NOT_USE_;
+ };
+ /* On-the-wire prefix: PBC immediately followed by the packet header. */
+ struct {
+ struct hfi_pbc pbc;
+ struct ips_message_header ips_lrh;
+ } PSMI_CACHEALIGN;
+};
+
+#ifdef PSM_CUDA
+#define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL)
+/* In case we need to be more precise about scb's locality
+ * we can expand the macro in place, e.g.
+ * #define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL && \
+ * scb->mq_req->is_buf_gpu_mem && \
+ * !scb->mq_req->cuda_hostbuf_used)
+ */
+#endif
+
+/* Release an scb (and any pool-owned bounce buffer) back to its
+ * scbctrl; may fire the pool's availability callback. */
+void ips_scbctrl_free(ips_scb_t *scb);
+/* NOTE(review): presumably attaches a send buffer to an scb allocated
+ * without one — confirm against the definition in ips_scb.c. */
+int ips_scbctrl_bufalloc(ips_scb_t *scb);
+/* NOTE(review): presumably reports whether scbs/buffers are currently
+ * available — confirm against the definition in ips_scb.c. */
+int ips_scbctrl_avail(struct ips_scbctrl *scbc);
+/* Allocate scbs chained via SLIST_NEXT; flags (IPS_SCB_FLAG_*) and
+ * len presumably control attached-buffer allocation.  Mockable. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc,
+ int scbnum, int len, uint32_t flags);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc);
+/* Allocate one scb with no payload buffer; NULL when exhausted. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny);
+
+/* Initialize a pool of numscb scbs and numbufs bounce buffers of
+ * bufsize bytes (imm_size bytes of immediate data per scb); the
+ * callback fires when scbs become available after exhaustion. */
+psm2_error_t ips_scbctrl_init(const psmi_context_t *context,
+ uint32_t numscb, uint32_t numbufs,
+ uint32_t imm_size, uint32_t bufsize,
+ ips_scbctrl_avail_callback_fn_t,
+ void *avail_context, struct ips_scbctrl *);
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *);
+
+/* NOTE(review): presumably writes the scbs on slist to fd — confirm
+ * at the definition. */
+psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd);
+
+#endif /* _IPS_SCB_H */
diff --git a/ptl_ips/ips_spio.c b/ptl_ips/ips_spio.c
new file mode 100644
index 0000000..944ebf5
--- /dev/null
+++ b/ptl_ips/ips_spio.c
@@ -0,0 +1,951 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/* included header files */
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sched.h>
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_spio.h"
+#include "ipserror.h" /* ips error codes */
+#include "ips_proto_params.h"
+
+/* Report PIO stalls every 20 seconds at the least */
+#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9))
+#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */
+/* MAX_CONSECUTIVE_SEND_FAIL has to be a multiple of RESYNC_CONSECUTIVE */
+#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */
+
+static void spio_report_stall(struct ips_spio *ctrl,
+ uint64_t t_cyc_now, uint64_t send_failures);
+
+static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures);
+
+static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl);
+static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl);
+static psm2_error_t spio_credit_return_update(struct ips_spio *ctrl);
+static psm2_error_t spio_credit_return_update_shared(struct ips_spio *ctrl);
+
+/*
+ * Initialize the PIO send engine for one context.
+ *
+ * Maps the driver-exported credit-return word, PIO buffer bases and
+ * event word; clears stall accounting; sets up the (possibly
+ * context-shared) spio control state; and selects the widest PIO
+ * block-copy routine the executing CPU supports.
+ *
+ * Returns PSM2_OK, or PSM2_NO_MEMORY if the private control structure
+ * cannot be allocated.
+ */
+psm2_error_t
+ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
+ struct ips_spio *ctrl)
+{
+ const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ cpuid_t id;
+ int i;
+
+ ctrl->ptl = ptl;
+ ctrl->context = context;
+ /* Copy runtime flags */
+ ctrl->runtime_flags = ptl->runtime_flags;
+ ctrl->unit_id = context->ep->unit_id;
+ ctrl->portnum = context->ep->portnum;
+
+ /* Addresses exported by the driver for this send context. */
+ pthread_spin_init(&ctrl->spio_lock, PTHREAD_PROCESS_PRIVATE);
+ ctrl->spio_credits_addr =
+ (__le64 *) (ptrdiff_t) base_info->sc_credits_addr;
+ ctrl->spio_bufbase_sop =
+ (uint64_t *) (ptrdiff_t) base_info->pio_bufbase_sop;
+ ctrl->spio_bufbase =
+ (uint64_t *) (ptrdiff_t) base_info->pio_bufbase;
+ ctrl->spio_event = (uint64_t *) (ptrdiff_t) base_info->events_bufbase;
+
+ /* Stall accounting starts clean. */
+ ctrl->spio_consecutive_failures = 0;
+ ctrl->spio_num_stall = 0ULL;
+ ctrl->spio_num_stall_total = 0ULL;
+ ctrl->spio_next_stall_warning = 0ULL;
+ ctrl->spio_last_stall_cyc = 0ULL;
+ ctrl->spio_init_cyc = get_cycles();
+
+ ctrl->spio_total_blocks = ctxt_info->credits;
+ ctrl->spio_block_index = 0;
+
+ /* context->spio_ctrl is non-NULL only with context sharing, where it
+ * points at shared state; otherwise allocate a private control struct
+ * and use the non-shared (no-lock) routine variants. */
+ ctrl->spio_ctrl = (struct ips_spio_ctrl *)context->spio_ctrl;
+ if (!ctrl->spio_ctrl) {
+ ctrl->spio_ctrl = (volatile struct ips_spio_ctrl *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_spio_ctrl));
+ if (ctrl->spio_ctrl == NULL) {
+ return PSM2_NO_MEMORY;
+ }
+
+ ctrl->spio_reset_hfi = spio_reset_hfi;
+ ctrl->spio_credit_return_update =
+ spio_credit_return_update;
+ } else {
+ ctrl->spio_reset_hfi = spio_reset_hfi_shared;
+ ctrl->spio_credit_return_update =
+ spio_credit_return_update_shared;
+ }
+
+ /*
+ * Only the master process can initialize.
+ */
+ if (ctxt_info->subctxt == 0) {
+ pthread_spin_init(&ctrl->spio_ctrl->spio_ctrl_lock,
+ PTHREAD_PROCESS_SHARED);
+
+ ctrl->spio_ctrl->spio_write_in_progress = 0;
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ ctrl->spio_ctrl->spio_frozen_count = 0;
+
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks;
+ ctrl->spio_ctrl->spio_block_index = 0;
+ ctrl->spio_ctrl->spio_fill_counter = 0;
+
+ psmi_assert(SPIO_CREDITS_Counter
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+ psmi_assert(SPIO_CREDITS_Status
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+
+ /* Seed the credit-return shadow from the hardware word. */
+ ctrl->spio_ctrl->spio_credits.credit_return =
+ *ctrl->spio_credits_addr;
+ }
+
+ /*
+ * Setup the PIO block copying routine.
+ */
+ /* Candidate table indexed by store width: [0]=8B, [1]=16B (SSE2),
+ * [2]=32B (AVX2), [3]=64B (AVX-512F).  Entries exist only when the
+ * compile target enables them; the CPUID checks below then pick the
+ * widest routine the executing CPU actually supports. */
+ ctrl->spio_blockcpy_selected = NULL;
+ ctrl->spio_blockcpy_routines[0] = hfi_pio_blockcpy_64;
+
+ ctrl->spio_blockcpy_routines[1] = NULL;
+#ifdef __SSE2__
+ ctrl->spio_blockcpy_routines[1] = hfi_pio_blockcpy_128;
+#endif
+ ctrl->spio_blockcpy_routines[2] = NULL;
+#ifdef __AVX2__
+ ctrl->spio_blockcpy_routines[2] = hfi_pio_blockcpy_256;
+#endif
+ ctrl->spio_blockcpy_routines[3] = NULL;
+#ifdef __AVX512F__
+ ctrl->spio_blockcpy_routines[3] = hfi_pio_blockcpy_512;
+#endif
+
+ get_cpuid(0x7, 0, &id);
+ if (id.ebx & (1<<AVX512F_BIT)) {
+ /* avx512f supported */
+ for (i = 3; i>= 0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else if (id.ebx & (1<<AVX2_BIT)) {
+ /* 32B copying supported */
+ for (i = 2; i >=0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else {
+ get_cpuid(0x1, 0, &id);
+ if (id.edx & (1<<SSE2_BIT)) {
+ /* 16B copying supported */
+ for (i = 1; i >=0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else {
+ /* use 8B copying */
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[0];
+ }
+ }
+ psmi_assert(ctrl->spio_blockcpy_selected != NULL);
+
+#ifdef PSM_CUDA
+ /* Host staging buffer: GPU payloads are first copied here, then
+ * PIO-copied to the HFI (see ips_spio_transfer_frame). */
+ if (PSMI_IS_CUDA_ENABLED) {
+ PSMI_CUDA_CALL(cudaHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+ 10240 /* Max MTU */, cudaHostAllocPortable);
+ }
+#endif
+
+ _HFI_PRDBG("ips_spio_init() done\n");
+
+ return PSM2_OK;
+}
+
+/*
+ * Tear down the PIO send engine: release the CUDA staging buffer (if
+ * enabled), log a final stall summary, and free the control struct
+ * only when it was privately allocated (context sharing points it at
+ * shared state that is not freed here).
+ */
+psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ PSMI_CUDA_CALL(cudaFreeHost, (void *) ctrl->cuda_pio_buffer);
+#endif
+ /* send_failures == 0 selects the summary form of the report */
+ spio_report_stall(ctrl, get_cycles(), 0ULL);
+ if (!ctrl->context->spio_ctrl)
+ psmi_free((void *)ctrl->spio_ctrl);
+ return PSM2_OK;
+}
+
+/*
+ * Log PIO-stall diagnostics.  With send_failures > 0 this reports a
+ * detailed in-progress stall (credit-state snapshot plus Tx/Rx port
+ * counters when available); with send_failures == 0 it logs the final
+ * summary (used from ips_spio_fini).  No-op until at least one stall
+ * has been counted.
+ */
+static
+void
+spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now,
+ uint64_t send_failures)
+{
+ size_t off = 0;
+ char buf[1024];
+
+ if (ctrl->spio_num_stall == 0)
+ return;
+
+ if (send_failures > 0) {
+ char bufctr[128];
+ uint64_t tx_stat, rx_stat;
+ int ret;
+
+ off = snprintf(buf, sizeof(buf) - 1,
+ "PIO Send context %d with total blocks %d , available blocks %d, "
+ "fill counter %d, free counter %d ",
+ (int)psm2_epid_context(ctrl->context->epid),
+ ctrl->spio_total_blocks,
+ ctrl->spio_ctrl->spio_available_blocks,
+ ctrl->spio_ctrl->spio_fill_counter,
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->
+ spio_credits.value));
+ /* snprintf returns the would-be length (or a negative
+ * value on error, which wraps huge as size_t); clamp
+ * before indexing so a truncated or failed format cannot
+ * write past the end of buf. */
+ if (off > sizeof(buf) - 1)
+ off = sizeof(buf) - 1;
+ buf[off] = '\0';
+
+ /* In case hfifs isn't running */
+ ret = hfi_get_single_portctr(ctrl->unit_id, ctrl->portnum,
+ "TxPkt", &tx_stat);
+ if (ret != -1) {
+ ret = hfi_get_single_portctr(ctrl->unit_id,
+ ctrl->portnum, "RxPkt",
+ &rx_stat);
+ if (ret != -1) {
+ snprintf(bufctr, sizeof(bufctr) - 1,
+ "(TxPktCnt=%llu,RxPktCnt=%llu)",
+ (unsigned long long)tx_stat,
+ (unsigned long long)rx_stat);
+ bufctr[sizeof(bufctr) - 1] = '\0';
+ } else
+ bufctr[0] = '\0';
+ } else
+ bufctr[0] = '\0';
+
+ _HFI_DBG
+ ("PIO Send Stall after at least %.2fM failed send attempts "
+ "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n",
+ send_failures / 1e6,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc),
+ (unsigned long long)ctrl->spio_num_stall,
+ bufctr[0] != '\0' ? bufctr : "", buf);
+ } else {
+ _HFI_DBG
+ ("PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs",
+ (unsigned long long)ctrl->spio_num_stall,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc));
+ }
+
+ return;
+}
+
+/*
+ * Account for one PIO send stall: bump the stall counters, emit a
+ * rate-limited report (at most every SPIO_STALL_WARNING_INTERVAL),
+ * refresh the credit-return shadow from hardware, and timestamp the
+ * stall.
+ */
+static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures)
+{
+ uint64_t t_cyc_now = get_cycles();
+
+ /* We handle the pio-stall every time but only report something every 20
+ * seconds. We print a summary at the end while closing the device */
+ ctrl->spio_num_stall++;
+ ctrl->spio_num_stall_total++;
+
+ if (ctrl->spio_next_stall_warning <= t_cyc_now) {
+ /* If context status is ok (i.e. no cables pulled or anything) */
+ if (psmi_context_check_status(ctrl->context) == PSM2_OK)
+ spio_report_stall(ctrl, t_cyc_now, send_failures);
+ ctrl->spio_next_stall_warning =
+ get_cycles() + SPIO_STALL_WARNING_INTERVAL;
+ }
+
+ /* re-initialize our shadow from the real registers; by this time,
+ * we know the hardware has to have done the update.
+ * Also, kernel check may have changed things.
+ */
+ ctrl->spio_credit_return_update(ctrl);
+
+ ctrl->spio_last_stall_cyc = t_cyc_now;
+
+ return;
+}
+
+/*
+ * A send context halt is detected in several ways:
+ * 1. during pio for normal credit return update;
+ * 2. during events process when no event;
+ * when a hfi is frozen, we recover hfi by calling this routine.
+ */
+/*
+ * Recover a halted send context: ask the driver to reset it (the call
+ * blocks until the reset completes), then reinitialize the local
+ * credit/block shadow state.  Aborts the process if resets repeat
+ * beyond IPS_CTXT_RESET_MAX.
+ */
+static void spio_reset_context(struct ips_spio *ctrl)
+{
+ /* if there are too many reset, teardown process */
+ ctrl->spio_ctrl->spio_reset_count++;
+ if (ctrl->spio_ctrl->spio_reset_count > IPS_CTXT_RESET_MAX)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Too many send context reset, teardown...\n");
+
+ /*
+ * Because there are many epaddrs and many flows using the
+ * same PIO queue, it is hard to search all the unacked
+ * queue and find the correct retry point. Instead we just
+ * let the upper level flow control to NAK the packets and
+ * do the retry from the right point.
+ */
+
+ /* Call into driver to reset send context, driver will
+ * block this routine until the send context is actually
+ * reset.
+ */
+ ips_wmb();
+ if (hfi_reset_context(ctrl->context->ctrl))
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Send context reset failed: %d.\n", errno);
+
+ /* Reset spio shared control struct. */
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks;
+ ctrl->spio_ctrl->spio_block_index = 0;
+ ctrl->spio_ctrl->spio_fill_counter = 0;
+ /* Get updated credit return again after reset. */
+ ctrl->spio_ctrl->spio_credits.credit_return =
+ *ctrl->spio_credits_addr;
+
+ psmi_assert(SPIO_CREDITS_Counter
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+ psmi_assert(SPIO_CREDITS_Status
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+}
+
+/*
+ * hfi frozen is detected when checking events from driver,
+ * psm calls to check events in the main receive loop
+ * when there is no normal traffic.
+ */
+/*
+ * Do the actual freeze recovery: receive-queue state first, then the
+ * send context, then the sdma completion queue (ordering rationale is
+ * inline).  Callers (spio_reset_hfi / spio_reset_hfi_shared) must
+ * have drained the receive header queue beforehand.
+ */
+static void spio_reset_hfi_internal(struct ips_spio *ctrl)
+{
+ struct ips_recvhdrq *recvq = &ctrl->ptl->recvq;
+ struct ips_proto *proto = (struct ips_proto *)&ctrl->ptl->proto;
+
+ /* Reset receive queue state, this must be done first
+ * because after send context reset, hardware start to
+ * receive new packets.
+ */
+ recvq->state->hdrq_head = 0;
+ recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ recvq->state->num_hdrq_done = 0;
+ recvq->state->hdr_countdown = 0;
+ if (!(recvq->runtime_flags & HFI1_CAP_DMA_RTAIL))
+ recvq->state->hdrq_rhf_seq = 1;
+
+ /* Reset send context */
+ spio_reset_context(ctrl);
+
+ /* Reset sdma completion queue, this should be done last
+ * because when send context is reset, driver will complete
+ * all the sdma requests with error code -2. This error
+ * code is ignored by PSM, but other error codes are
+ * caught inside the routine.
+ */
+ while (proto->sdma_done_index != proto->sdma_fill_index)
+ ips_proto_dma_completion_update(proto);
+}
+
+/* Freeze recovery entry point for the non-shared-context case. */
+static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl)
+{
+ /* Drain receive header queue before reset hfi, we use
+ * the main progression loop to do this so we return from
+ * here.
+ */
+ if (!ips_recvhdrq_isempty(&ctrl->ptl->recvq))
+ return PSM2_OK_NO_PROGRESS;
+
+ /* do the real reset work:
+ * 1. reset receive header queue;
+ * 2. reset send context;
+ * 3. drain sdma completion queue;
+ */
+ spio_reset_hfi_internal(ctrl);
+
+ return PSM2_OK;
+}
+
+/*
+ * There is a shared count and per process count, all initialized to
+ * zero. If a process' local count is equal to shared count, it is
+ * the first process and does the hfi reset, this process also move
+ * both counts up by one. If a process' local count is not equal to
+ * the shared count, it means other process has done the hfi reset,
+ * it just saves the shared count to local count and return. All the
+ * operation are locked by spio_ctrl_lock.
+ */
+static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl)
+{
+ volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
+
+ /* Drain receive header queue before reset hfi, we use
+ * the main progression loop to do this so we return from
+ * here. We don't reset software receive header queue.
+ */
+ if (!ips_recvhdrq_isempty(&ctrl->ptl->recvq))
+ return PSM2_OK_NO_PROGRESS;
+
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+
+ /*
+ * In context sharing mode, if there is a subcontext
+ * process in PIO writing, we need to wait till the PIO
+ * writing is done. So we spin wait here. If other
+ * process comes here and does the hfi reset, it should
+ * be perfectly fine.
+ */
+ while (ctrl->spio_ctrl->spio_write_in_progress) {
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ usleep(1000);
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ /* Local count == shared count: we are the first subcontext to see
+ * this freeze, so perform the reset and advance both counters;
+ * otherwise another process already did it — just catch up. */
+ if (ctrl->spio_frozen_count == ctrl->spio_ctrl->spio_frozen_count) {
+ ctrl->spio_frozen_count++;
+ ctrl->spio_ctrl->spio_frozen_count++;
+
+ spio_reset_hfi_internal(ctrl);
+ } else
+ ctrl->spio_frozen_count = ctrl->spio_ctrl->spio_frozen_count;
+
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+
+ return PSM2_OK;
+}
+
+/*
+ * return value:
+ * PSM2_OK: new credits updated;
+ * PSM2_OK_NO_PROGRESS: no new credits;
+ */
+/*
+ * Non-shared variant (no locking; a single process owns the send
+ * context — see ips_spio_init).  Refreshes the credit-return shadow
+ * from the hardware word and recomputes the available PIO block
+ * count; resets the send context when hardware reports it halted.
+ */
+static psm2_error_t
+spio_credit_return_update(struct ips_spio *ctrl)
+{
+ uint64_t credit_return;
+
+ credit_return = *ctrl->spio_credits_addr;
+ /* Update available blocks based on fill counter and free counter */
+ if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return)
+ return PSM2_OK_NO_PROGRESS;
+
+ ctrl->spio_ctrl->spio_credits.credit_return = credit_return;
+
+ /* If Status is set, then send context is halted */
+ if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
+ spio_reset_context(ctrl);
+ } else {
+ /*
+ * OPA1 has 1M PIO buffer, but each context can have max 64K,
+ * which is 1K 64B blocks, so the distance between fill counter
+ * and credit return counter is no more than 1024; Both fill
+ * counter and credit return counter are 11 bits value,
+ * representing range [0, 2047].
+ */
+ psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF)) <=
+ ctrl->spio_total_blocks);
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks -
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF);
+
+ /* a successful credit update, clear reset count */
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * return value:
+ * PSM2_OK: new credits updated;
+ * PSM2_OK_NO_PROGRESS: no new credits;
+ */
+/*
+ * Context-sharing variant of spio_credit_return_update(): identical
+ * credit logic, but performed under spio_ctrl_lock, and it waits for
+ * any in-progress PIO write before resetting a halted context.
+ */
+static psm2_error_t
+spio_credit_return_update_shared(struct ips_spio *ctrl)
+{
+ uint64_t credit_return;
+
+ pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);
+
+ credit_return = *ctrl->spio_credits_addr;
+ /* Update available blocks based on fill counter and free counter */
+ if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) {
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ ctrl->spio_ctrl->spio_credits.credit_return = credit_return;
+
+ /* If Status is set, then send context is halted */
+ if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
+ /*
+ * In context sharing mode, if there is a subcontext
+ * process in PIO writing, we need to wait till the PIO
+ * writing is done. So we spin wait here. Other processes
+ * won't come here because for them, there is NO new
+ * credit return change (the first 'if' check in this
+ * routine).
+ */
+ while (ctrl->spio_ctrl->spio_write_in_progress) {
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ usleep(1000);
+ pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ }
+
+ spio_reset_context(ctrl);
+ } else {
+ /*
+ * OPA1 has 1M PIO buffer, but each context can have max 64K,
+ * which is 1K 64B blocks, so the distance between fill counter
+ * and credit return counter is no more than 1024; Both fill
+ * counter and credit return counter are 11 bits value,
+ * representing range [0, 2047].
+ */
+ psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF)) <=
+ ctrl->spio_total_blocks);
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks -
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF);
+
+ /* a successful credit update, clear reset count */
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ }
+
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+
+ return PSM2_OK;
+}
+
+/*
+ * Check and process events
+ * return value:
+ * PSM2_OK: normal events processing;
+ * PSM2_OK_NO_PROGRESS: no event is processed;
+ */
+/*
+ * Poll the driver event word for this context and dispatch events:
+ * MMU-notifier invalidation, HFI freeze recovery, link down, and
+ * LID/LMC/SL2VL changes.  With no event pending, fall back to a
+ * credit-return update so a halted send context is still detected.
+ */
+psm2_error_t
+ips_spio_process_events(const struct ptl *ptl)
+{
+ struct ips_spio *ctrl = ptl->proto.spioc;
+ __u64 event_mask;
+
+ /*
+ * If there is no event, try do credit return update
+ * to catch send context halt.
+ */
+ if_pf(*ctrl->spio_event == 0)
+ return ctrl->spio_credit_return_update(ctrl);
+
+ /*
+ * Process mmu invalidation event, this will invalidate
+ * all caching items removed by mmu notifier.
+ */
+ if ((*ctrl->spio_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ /*
+ * driver will clear the event bit before return,
+ * PSM does not need to ack the event.
+ */
+ return ips_tidcache_invalidation(&ptl->proto.protoexp->tidc);
+ }
+
+ /* Get event mask for PSM to process */
+ event_mask = (uint64_t) *ctrl->spio_event;
+
+ /* Check if HFI is frozen */
+ if (event_mask & HFI1_EVENT_FROZEN) {
+ /* if no progress, return and retry */
+ if (ctrl->spio_reset_hfi(ctrl) != PSM2_OK)
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ /* First ack the driver the receipt of the events */
+ _HFI_VDBG("Acking event(s) 0x%" PRIx64 " to qib driver.\n",
+ (uint64_t) event_mask);
+ hfi_event_ack(ctrl->context->ctrl, event_mask);
+
+ if (event_mask & HFI1_EVENT_LINKDOWN) {
+ /* A link down event can clear the LMC and SL2VL
+ * change as those events are implicitly handled
+ * in the link up/down event handler.
+ */
+ event_mask &=
+ ~(HFI1_EVENT_LMC_CHANGE |
+ HFI1_EVENT_SL2VL_CHANGE);
+ ips_ibta_link_updown_event(&ctrl->ptl->proto);
+ _HFI_VDBG("Link down detected.\n");
+ }
+
+ if (event_mask & HFI1_EVENT_LID_CHANGE) {
+ /* Display a warning that LID change has occurred during
+ * the run. This is not supported in the current
+ * implementation and in general is bad for the SM to
+ * re-assign LIDs during a run.
+ */
+ _HFI_INFO
+ ("Warning! LID change detected during run. "
+ "Old LID: %d, New Lid: %d\n",
+ (int)PSMI_EPID_GET_LID(ctrl->context->epid),
+ (int)hfi_get_port_lid(ctrl->unit_id,
+ ctrl->portnum));
+ }
+
+ if (event_mask & HFI1_EVENT_LMC_CHANGE)
+ _HFI_INFO("Fabric LMC changed.\n");
+
+ if (event_mask & HFI1_EVENT_SL2VL_CHANGE) {
+ _HFI_INFO("SL2VL mapping changed for port.\n");
+ ips_ibta_init_sl2sc2vl_table(&ctrl->ptl->proto);
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Called periodically while PIO sends keep failing; escalates to the
+ * full stall handler once per SPIO_MAX_CONSECUTIVE_SEND_FAIL
+ * consecutive failures.  The constant is a power of two, so the
+ * modulo test below is exactly the original mask test.
+ */
+static void
+spio_handle_resync(struct ips_spio *ctrl, uint64_t consecutive_send_failed)
+{
+ /* hfi_force_pio_avail_update(ctrl->context->ctrl); */
+
+ if ((consecutive_send_failed % SPIO_MAX_CONSECUTIVE_SEND_FAIL) == 0)
+ spio_handle_stall(ctrl, consecutive_send_failed);
+}
+
+/*
+ * This function attempts to write a packet to a PIO.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Packet triggered through PIO.
+ * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ */
+psm2_error_t
+ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
+ struct hfi_pbc *pbc, uint32_t *payload,
+ uint32_t length, uint32_t isCtrlMsg,
+ uint32_t cksum_valid, uint32_t cksum
+#ifdef PSM_CUDA
+ , uint32_t is_cuda_payload
+#endif
+ )
+{
+ struct ips_spio *ctrl = proto->spioc;
+ volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
+ volatile uint64_t *pioaddr;
+ uint32_t paylen, nblks;
+ psm2_error_t err = PSM2_OK;
+ int do_lock = (ctrl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+
+ if (do_lock)
+ pthread_spin_lock(&ctrl->spio_lock);
+
+ if_pf(PSMI_FAULTINJ_ENABLED()) {
+ PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1,
+ IPS_FAULTINJ_PIOLOST);
+ PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1,
+ IPS_FAULTINJ_PIOBUSY);
+ if (psmi_faultinj_is_fault(fi_lost)) {
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return PSM2_OK;
+ } else if (psmi_faultinj_is_fault(fi_busy))
+ goto fi_busy;
+ /* else fall through normal processing path, i.e. no faults */
+ }
+
+ psmi_assert((length & 0x3) == 0);
+ paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0);
+ nblks = 1 + ((paylen + 63) >> 6);
+
+ if (spio_ctrl->spio_available_blocks < nblks) {
+ ctrl->spio_credit_return_update(ctrl);
+
+ if_pf(spio_ctrl->spio_available_blocks < nblks) {
+ /* Check unit status */
+fi_busy:
+ if ((err =
+ psmi_context_check_status(ctrl->context)) ==
+ PSM2_OK) {
+ if (0 ==
+ (++ctrl->
+ spio_consecutive_failures &
+ (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL - 1)))
+ spio_handle_resync(ctrl,
+ ctrl->
+ spio_consecutive_failures);
+ err = PSM2_EP_NO_RESOURCES;
+ }
+ /* If cable is pulled, we don't count it as a consecutive failure,
+ * we just make it as though no send pio was available */
+ else if (err == PSM2_OK_NO_PROGRESS)
+ err = PSM2_EP_NO_RESOURCES;
+ /* else something bad happened in check_status */
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return err;
+ }
+ }
+
+ /*
+ * if context->spio_ctrl is set, it is pointing to shared context ureg
+ * page, and we are using context sharing.
+ */
+ if (ctrl->context->spio_ctrl) {
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ if (spio_ctrl->spio_available_blocks < nblks) {
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return PSM2_EP_NO_RESOURCES;
+ }
+ }
+
+ _HFI_VDBG("credits: total %d, avail %d index %d, fill %d "
+ "free %d: %d %d %d %d %d; addr %llx\n",
+ ctrl->spio_total_blocks,
+ spio_ctrl->spio_available_blocks,
+ spio_ctrl->spio_block_index,
+ spio_ctrl->spio_fill_counter,
+ SPIO_CREDITS_Counter(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_Status(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToPbc(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToTheshold(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToErr(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToForce(spio_ctrl->spio_credits.value),
+ *ctrl->spio_credits_addr);
+
+ /*
+ * Save the assigned locally, update the shared for other processes.
+ */
+ ctrl->spio_block_index = spio_ctrl->spio_block_index;
+ spio_ctrl->spio_available_blocks -= nblks;
+ /* fill counter should be 11 bits value, same as credit return counter */
+ spio_ctrl->spio_fill_counter =
+ (spio_ctrl->spio_fill_counter + nblks) & 0x7FF;
+ spio_ctrl->spio_block_index += nblks;
+ if (spio_ctrl->spio_block_index >= ctrl->spio_total_blocks)
+ spio_ctrl->spio_block_index -= ctrl->spio_total_blocks;
+
+ /*
+ * Unlock in context sharing mode, but increase refcount to
+ * indicate I am in progress to write to PIO blocks.
+ */
+ if (ctrl->context->spio_ctrl) {
+ spio_ctrl->spio_write_in_progress++;
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ ctrl->spio_num_stall = 0; /* now able to send, so clear if set */
+ ctrl->spio_consecutive_failures = 0;
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+
+ _HFI_VDBG("PIO write: nblks %d length %d, paylen %d\n", nblks, length,
+ paylen);
+
+ /* Setup PBC for this packet */
+ ips_proto_pbc_update(proto, flow, isCtrlMsg,
+ pbc, sizeof(struct ips_message_header), paylen);
+
+ /* Write to PIO: SOP block */
+ pioaddr = ctrl->spio_bufbase_sop + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ ctrl->spio_blockcpy_selected(pioaddr, (uint64_t *) pbc, 1);
+ _HFI_VDBG("pio qw write sop %p: 8\n", pioaddr);
+
+ /* Write to PIO: other blocks of payload */
+#ifdef PSM_CUDA
+ if (is_cuda_payload) {
+ /* Since the implementation of cudaMemcpy is unknown,
+ and the HFI specifies several conditions for how PIO
+ writes must occur, for safety reasons we should not assume
+ that cudaMemcpy will follow the HFI's requirements.
+ The cudaMemcpy should instead write into a buffer in
+ host memory, and then PSM can copy to the HFI as usual. */
+ PSMI_CUDA_CALL(cudaMemcpy, ctrl->cuda_pio_buffer,
+ payload, paylen, cudaMemcpyDeviceToHost);
+ payload = (uint32_t *) ctrl->cuda_pio_buffer;
+ }
+#endif
+ if (length >= 64) {
+ uint32_t blks2send = length >> 6;
+ uint32_t blks2end =
+ ctrl->spio_total_blocks - ctrl->spio_block_index;
+
+ pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (blks2end >= blks2send) {
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, blks2send);
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, blks2send);
+ ctrl->spio_block_index += blks2send;
+ if (ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+ payload += blks2send*16;
+ } else {
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, blks2end);
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, blks2end);
+ payload += blks2end*16;
+
+ pioaddr = ctrl->spio_bufbase;
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, (blks2send-blks2end));
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, (blks2send-blks2end));
+ ctrl->spio_block_index = blks2send - blks2end;
+ payload += (blks2send-blks2end)*16;
+ }
+
+ length -= blks2send*64;
+ }
+
+ /*
+ * The following code makes sure to write to pioaddr in
+ * qword granularity, this is required by hardware.
+ */
+ paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0);
+ if (paylen > 0) {
+ uint32_t blkbuf[32];
+ uint32_t qws = length >> 3;
+ uint32_t dws = 0;
+
+ pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ /* Write the remaining qwords of payload */
+ if (qws) {
+ hfi_qwordcpy_safe(pioaddr, (uint64_t *) payload, qws);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, qws);
+ payload += qws << 1;
+ length -= qws << 3;
+
+ pioaddr += qws;
+ paylen -= qws << 3;
+ }
+
+ /* if we have last one dword payload */
+ if (length > 0) {
+ blkbuf[dws++] = payload[0];
+ }
+ /* if we have checksum to attach */
+ if (paylen > length) {
+ blkbuf[dws++] = cksum;
+ blkbuf[dws++] = cksum;
+ }
+
+ /* Write the rest of qwords of current block */
+ hfi_qwordcpy_safe(pioaddr, (uint64_t *) blkbuf, 8 - qws);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8 - qws);
+
+ if (paylen > ((8 - qws) << 3)) {
+ /* We need another block */
+ pioaddr =
+ ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ /* Write the last block */
+ hfi_qwordcpy_safe(pioaddr,
+ (uint64_t *) &blkbuf[(8 - qws) << 1],
+ 8);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8);
+ }
+ }
+
+ /*
+ * In context sharing, we need to track who is in progress of
+ * writing to PIO block, this is for halted send context reset.
+ * I am done with PIO blocks writing, decrease the refcount.
+ */
+ if (ctrl->context->spio_ctrl) {
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ spio_ctrl->spio_write_in_progress--;
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ return PSM2_OK;
+} /* ips_spio_transfer_frame() */
diff --git a/ptl_ips/ips_spio.h b/ptl_ips/ips_spio.h
new file mode 100644
index 0000000..2d61cce
--- /dev/null
+++ b/ptl_ips/ips_spio.h
@@ -0,0 +1,189 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef IPS_SPIO_H
+#define IPS_SPIO_H
+
+#include "psm_user.h"
+
+#define IPS_CTXT_RESET_MAX 1000 /* max send context reset */
+struct ips_spio;
+struct ptl;
+
+/* 64B move instruction support */
+#define AVX512F_BIT 16 /* level 07h, ebx */
+/* 32B move instruction support */
+#define AVX2_BIT 5 /* level 07h, ebx */
+/* 16B move instruction support */
+#define SSE2_BIT 26 /* level 01h, edx */
+
+typedef
+void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#ifdef __AVX512F__
+void hfi_pio_blockcpy_512(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+#ifdef __AVX2__
+void hfi_pio_blockcpy_256(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+#ifdef __SSE2__
+void hfi_pio_blockcpy_128(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+void hfi_pio_blockcpy_64(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+
+
+psm2_error_t ips_spio_init(const psmi_context_t *context,
+ struct ptl *ptl, struct ips_spio *ctrl);
+psm2_error_t ips_spio_fini(struct ips_spio *ctrl);
+
+psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow, struct hfi_pbc *pbc,
+ uint32_t *payload, uint32_t length,
+ uint32_t isCtrlMsg, uint32_t cksum_valid,
+ uint32_t cksum
+#ifdef PSM_CUDA
+ , uint32_t is_cuda_payload
+#endif
+);
+
+psm2_error_t ips_spio_process_events(const struct ptl *ptl);
+
+#define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF)
+#define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1)
+#define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1)
+#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1)
+#define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1)
+#define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1)
+/* Hardware credit-return word for a PIO send context.  The low 16 bits
+ * ("value") pack the fields decoded by the SPIO_CREDITS_* macros above;
+ * the union lets the full 8-byte credit-return word be read in one load
+ * (see *ctrl->spio_credits_addr usage in ips_spio.c). */
+struct ips_spio_credits {
+/* don't use bit operation for performance reason,
+ * using above macro instead.
+ uint16_t Counter:11;
+ uint16_t Status:1;
+ uint16_t CreditReturnDueToPbc:1;
+ uint16_t CreditReturnDueToThreshold:1;
+ uint16_t CreditReturnDueToErr:1;
+ uint16_t CreditReturnDueToForce:1;
+*/
+	union {
+		struct {
+			/* packed bits; decode with SPIO_CREDITS_* macros */
+			uint16_t value;
+			uint16_t pad0;
+			uint32_t pad1;
+		};
+		/* whole 64-bit credit return word */
+		uint64_t credit_return;
+	};
+};
+
+/* Send-context credit state.  With context sharing this structure lives in
+ * the shared subcontext ureg page (carved out in ips_subcontext_ureg_get())
+ * and is updated by several processes, hence the spinlock and the volatile
+ * qualifiers; otherwise each process has a private copy. */
+struct ips_spio_ctrl {
+	/* credit return lock for context sharing */
+	pthread_spinlock_t spio_ctrl_lock;
+
+	/* PIO write in progress for context sharing */
+	volatile uint16_t spio_write_in_progress;
+	/* send context reset count */
+	volatile uint16_t spio_reset_count;
+	/* HFI frozen count, shared copy */
+	volatile uint16_t spio_frozen_count;
+
+	/* PIO blocks currently available for sending */
+	volatile uint16_t spio_available_blocks;
+	/* next block index to fill in the send buffer */
+	volatile uint16_t spio_block_index;
+	/* running 11-bit count of blocks handed to hardware,
+	 * kept in sync with the hardware credit return counter */
+	volatile uint16_t spio_fill_counter;
+	/* last credit-return word read from hardware */
+	volatile struct ips_spio_credits spio_credits;
+} __attribute__ ((aligned(64)));
+
+/* Per-process PIO send state for one HFI send context.  Fast-path fields
+ * used by ips_spio_transfer_frame() are grouped after the thread lock. */
+struct ips_spio {
+	const psmi_context_t *context;
+	struct ptl *ptl;
+	uint32_t runtime_flags;
+	uint16_t unit_id;
+	uint16_t portnum;
+
+	pthread_spinlock_t spio_lock;	/* thread lock */
+	/* mapped hardware credit-return location */
+	volatile __le64 *spio_credits_addr __attribute__ ((aligned(64)));
+	/* PIO buffer base for SOP (start-of-packet) blocks */
+	volatile uint64_t *spio_bufbase_sop;
+	/* PIO buffer base for payload blocks */
+	volatile uint64_t *spio_bufbase;
+	volatile uint64_t *spio_event;
+	/* shared credit state; non-NULL context->spio_ctrl implies
+	 * context sharing (see ips_spio_transfer_frame()) */
+	volatile struct ips_spio_ctrl *spio_ctrl;
+
+	uint16_t spio_frozen_count;	/* local copy */
+	uint16_t spio_total_blocks;
+	uint16_t spio_block_index;
+
+	/* consecutive send failures; triggers spio_handle_resync() every
+	 * SPIO_RESYNC_CONSECUTIVE_SEND_FAIL failures */
+	uint32_t spio_consecutive_failures;
+	uint64_t spio_num_stall;
+	uint64_t spio_num_stall_total;
+	uint64_t spio_next_stall_warning;
+	uint64_t spio_last_stall_cyc;
+	uint64_t spio_init_cyc;
+
+	psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl);
+	psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl);
+
+	/* 8B copying, 16B copying, 32B copying, and 64B copying */
+	ips_spio_blockcpy_fn_t spio_blockcpy_routines[4];
+	/* routine chosen from the table above at init time */
+	ips_spio_blockcpy_fn_t spio_blockcpy_selected;
+
+#ifdef PSM_CUDA
+	/* Use an intermediate buffer when writing PIO data from the
+	   GPU to ensure that we follow the HFI's write ordering rules. */
+	unsigned char *cuda_pio_buffer;
+#endif
+};
+
+#endif /* IPS_SPIO_H */
diff --git a/ptl_ips/ips_stats.h b/ptl_ips/ips_stats.h
new file mode 100644
index 0000000..046e0c3
--- /dev/null
+++ b/ptl_ips/ips_stats.h
@@ -0,0 +1,83 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_STATS_H
+#define _IPS_STATS_H
+
+struct psm2_epaddr; /* for non-PSM clients */
+
+/* Old stats */
+/* Legacy per-session statistics counters ("Old stats"), reported through
+ * ips_get_stat() below.  All counters are monotonically increasing. */
+typedef struct {
+	uint64_t err_chk_send;
+	uint64_t err_chk_recv;
+	uint64_t send_failed;
+	uint64_t recv_dropped;
+	union {
+		uint64_t recv_copied;	/* obsolete */
+		uint64_t nak_sent;	/* reuses the obsolete slot */
+	};
+	uint64_t nak_recv;
+	uint64_t total_send_eager;
+	uint64_t total_send_exp;
+	uint64_t acks_sent;
+	uint64_t retransmits;
+	uint64_t recv_matched;
+	uint64_t recv_unmatched;
+	uint64_t scb_alloc_yields;
+} ips_sess_stat;
+
+int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats);
+
+#endif /* _IPS_STATS_H */
diff --git a/ptl_ips/ips_subcontext.c b/ptl_ips/ips_subcontext.c
new file mode 100644
index 0000000..7e3d04b
--- /dev/null
+++ b/ptl_ips/ips_subcontext.c
@@ -0,0 +1,97 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_subcontext.h"
+#include "ips_spio.h"
+#include "ips_tid.h"
+#include "ips_tidflow.h"
+#include "ptl_ips.h"
+
+/*
+ * Carve the per-subcontext ureg pages and the shared control structures
+ * out of the driver-mapped subcontext ureg region.
+ *
+ * ptl            - IPS PTL; its recvshc->hwcontext_ctrl pointer is set here
+ * subcontext_cnt - number of subcontexts actually sharing the hw context
+ * context        - PSM context; spio_ctrl/tid_ctrl/tf_ctrl pointers into
+ *                  the shared region are filled in here
+ * uregp          - out array of HFI1_MAX_SHARED_CTXTS entries; entry i is
+ *                  subcontext i's ureg block, or NULL for i >= subcontext_cnt
+ *
+ * The layout is fixed: HFI1_MAX_SHARED_CTXTS ureg blocks, then
+ * ips_hwcontext_ctrl, ips_spio_ctrl, ips_tid_ctrl, ips_tf_ctrl.  Asserts
+ * that everything fits in one page.  Always returns PSM2_OK.
+ */
+psm2_error_t
+ips_subcontext_ureg_get(ptl_t *ptl, uint32_t subcontext_cnt,
+			psmi_context_t *context,
+			struct ips_subcontext_ureg **uregp)
+{
+	const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+	uintptr_t all_subcontext_uregbase =
+		(uintptr_t) base_info->subctxt_uregbase;
+	int i;
+
+	psmi_assert_always(all_subcontext_uregbase != 0);
+	/* hand out one ureg block per possible subcontext, NULL for unused */
+	for (i = 0; i < HFI1_MAX_SHARED_CTXTS; i++) {
+		struct ips_subcontext_ureg *subcontext_ureg =
+			(struct ips_subcontext_ureg *)all_subcontext_uregbase;
+		*uregp++ = (i < subcontext_cnt) ? subcontext_ureg : NULL;
+		all_subcontext_uregbase += sizeof(struct ips_subcontext_ureg);
+	}
+
+	/* shared structures follow the ureg blocks, in this fixed order */
+	ptl->recvshc->hwcontext_ctrl =
+		(struct ips_hwcontext_ctrl *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_hwcontext_ctrl);
+
+	context->spio_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_spio_ctrl);
+
+	context->tid_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_tid_ctrl);
+
+	context->tf_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_tf_ctrl);
+
+	psmi_assert((all_subcontext_uregbase -
+		     (uintptr_t) base_info->subctxt_uregbase) <= PSMI_PAGESIZE);
+
+	return PSM2_OK;
+}
diff --git a/ptl_ips/ips_subcontext.h b/ptl_ips/ips_subcontext.h
new file mode 100644
index 0000000..a35e080
--- /dev/null
+++ b/ptl_ips/ips_subcontext.h
@@ -0,0 +1,81 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef __IPS_SUBCONTEXT_H
+#define __IPS_SUBCONTEXT_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_writehdrq.h"
+
+/* This data structure is allocated in ureg page of each subcontext process */
+
+/* Per-subcontext slice of the shared ureg region (one per process). */
+struct ips_subcontext_ureg {
+	/* head/eager head/tail register storage, one per cacheline */
+	uint64_t subcontext_uregbase[ur_maxreg * 8];
+	struct ips_writehdrq_state writeq_state;	/* used in all ureg pages */
+} __attribute__ ((aligned(64)));
+
+/* Single shared instance following the per-subcontext ureg blocks
+ * (see ips_subcontext_ureg_get()). */
+struct ips_hwcontext_ctrl {
+	pthread_spinlock_t context_lock;	/* lock shared by all subctxts */
+	struct ips_recvhdrq_state recvq_state;	/* state shared by all subctxts */
+} __attribute__ ((aligned(64)));
+
+psm2_error_t
+ips_subcontext_ureg_get(ptl_t *ptl, uint32_t subcontext_cnt,
+ psmi_context_t *context,
+ struct ips_subcontext_ureg **uregp);
+
+#endif
diff --git a/ptl_ips/ips_tid.c b/ptl_ips/ips_tid.c
new file mode 100644
index 0000000..63f213b
--- /dev/null
+++ b/ptl_ips/ips_tid.c
@@ -0,0 +1,278 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_tid.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * Initialize the expected-TID allocator embedded in @protoexp.
+ *
+ * context    - opened HFI context (driver info, runtime flags)
+ * protoexp   - owning expected-protocol instance; tidc = &protoexp->tidc
+ * cb         - callback invoked when TIDs become available again
+ * cb_context - opaque pointer passed back to cb
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY on allocation failure, or
+ * the error from statistics registration.
+ */
+psm2_error_t
+ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp,
+	     ips_tid_avail_cb_fn_t cb, void *cb_context)
+{
+	const struct hfi1_user_info_dep *user_info = &context->user_info;
+	const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+	const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+	struct ips_tid *tidc = &protoexp->tidc;
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL,
+				NULL, &tidc->tid_num_total),
+	};
+
+	tidc->context = context;
+	tidc->protoexp = protoexp;
+	tidc->tid_num_total = 0;
+	tidc->tid_num_inuse = 0;
+	tidc->tid_avail_cb = cb;
+	tidc->tid_avail_context = cb_context;
+	tidc->tid_array = NULL;
+	tidc->invalidation_event = (uint64_t *)
+		(ptrdiff_t) base_info->events_bufbase;
+
+	/*
+	 * PSM uses tid registration caching only if driver has enabled it.
+	 */
+	if (!(tidc->context->runtime_flags & HFI1_CAP_TID_UNMAP)) {
+		int i;
+		cl_qmap_t *p_map;
+		cl_map_item_t *root, *nil_item;
+
+		tidc->tid_array = (uint32_t *)
+			psmi_calloc(context->ep, UNDEFINED,
+				    context->ctrl->__hfi_tidexpcnt,
+				    sizeof(uint32_t));
+		if (tidc->tid_array == NULL)
+			return PSM2_NO_MEMORY;
+
+		/*
+		 * first is root node, last is terminator node.
+		 */
+		p_map = &tidc->tid_cachemap;
+		root = (cl_map_item_t *)
+			psmi_calloc(context->ep, UNDEFINED,
+				    context->ctrl->__hfi_tidexpcnt + 2,
+				    sizeof(cl_map_item_t));
+		if (root == NULL) {
+			/* Fix: don't leak the tid_array allocated just
+			 * above; release it before failing. */
+			psmi_free(tidc->tid_array);
+			tidc->tid_array = NULL;
+			return PSM2_NO_MEMORY;
+		}
+
+		nil_item = &root
+			[context->ctrl->__hfi_tidexpcnt + 1];
+
+		ips_tidcache_map_init(p_map, root, nil_item);
+
+		/* empty cache: no entries, no idle list */
+		NTID = 0;
+		NIDLE = 0;
+		IPREV(IHEAD) = INEXT(IHEAD) = IHEAD;
+		for (i = 1; i <= context->ctrl->__hfi_tidexpcnt; i++) {
+			INVALIDATE(i) = 1;
+		}
+
+		/*
+		 * if not shared context, all tids are used by the same
+		 * process. Otherwise, subcontext process can only cache
+		 * its own portion. Driver makes the same tid number
+		 * assignment to subcontext processes.
+		 */
+		tidc->tid_cachesize = context->ctrl->__hfi_tidexpcnt;
+		if (user_info->subctxt_cnt > 0) {
+			uint16_t remainder = tidc->tid_cachesize %
+				user_info->subctxt_cnt;
+			tidc->tid_cachesize /= user_info->subctxt_cnt;
+			if (ctxt_info->subctxt < remainder)
+				tidc->tid_cachesize++;
+		}
+	}
+
+	/*
+	 * Setup shared control structure.  A NULL context->tid_ctrl means
+	 * no context sharing, so allocate a private copy.
+	 */
+	tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl;
+	if (!tidc->tid_ctrl) {
+		tidc->tid_ctrl = (struct ips_tid_ctrl *)
+			psmi_calloc(context->ep, UNDEFINED, 1,
+				    sizeof(struct ips_tid_ctrl));
+		if (tidc->tid_ctrl == NULL) {
+			/* NOTE(review): the cache allocations above are not
+			 * released on this path; they are reclaimed only via
+			 * ips_tid_fini() — confirm callers treat init failure
+			 * as fatal for the endpoint. */
+			return PSM2_NO_MEMORY;
+		}
+	}
+
+	/*
+	 * Only the master process can initialize.
+	 */
+	if (ctxt_info->subctxt == 0) {
+		pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock,
+				  PTHREAD_PROCESS_SHARED);
+
+		tidc->tid_ctrl->tid_num_max =
+			context->ctrl->__hfi_tidexpcnt;
+		tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max;
+	}
+
+	return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+					PSMI_STATSTYPE_TIDS,
+					entries,
+					PSMI_STATS_HOWMANY(entries), tidc);
+}
+
+/* Tear down the TID allocator: flush the registration cache (if it was in
+ * use) and free the locally-allocated control structure.  tid_ctrl is only
+ * freed when the context did not supply a shared copy. */
+psm2_error_t ips_tid_fini(struct ips_tid *tidc)
+{
+	/* A non-NULL tid_array means registration caching was active. */
+	if (tidc->tid_array != NULL)
+		ips_tidcache_cleanup(tidc);
+
+	/* Private tid_ctrl (no shared context copy) was psmi_calloc'ed
+	 * by ips_tid_init() and must be released here. */
+	if (tidc->context->tid_ctrl == NULL)
+		psmi_free(tidc->tid_ctrl);
+
+	return PSM2_OK;
+}
+
+/*
+ * Acquire TID entries for a page-aligned buffer via the driver.
+ *
+ * tidc      - TID allocator
+ * buf       - buffer start; must be 4KB aligned (asserted)
+ * length    - in/out: bytes to register (4KB multiple, asserted); may be
+ *             clipped on output to what was actually registered
+ * tid_array - out: driver-filled TID entries
+ * tidcnt    - out: number of TID entries used
+ *
+ * Returns PSM2_OK, PSM2_EP_NO_RESOURCES when no TIDs are free (caller
+ * retries later), or PSM2_EP_DEVICE_FAILURE if the driver update fails.
+ * Takes tid_ctrl_lock only in context-sharing mode.
+ */
+psm2_error_t
+ips_tid_acquire(struct ips_tid *tidc,
+		const void *buf, uint32_t *length,
+		uint32_t *tid_array, uint32_t *tidcnt
+#ifdef PSM_CUDA
+		, uint8_t is_cuda_ptr
+#endif
+	)
+{
+	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
+	psm2_error_t err = PSM2_OK;
+	uint16_t flags = 0;
+	int rc;
+
+	/* driver requires page-aligned buffer and length */
+	psmi_assert(((uintptr_t) buf & 0xFFF) == 0);
+	psmi_assert(((*length) & 0xFFF) == 0);
+
+	/* non-NULL context->tid_ctrl means shared context: lock required */
+	if (tidc->context->tid_ctrl)
+		pthread_spin_lock(&ctrl->tid_ctrl_lock);
+
+	if (!ctrl->tid_num_avail) {
+		err = PSM2_EP_NO_RESOURCES;
+		goto fail;
+	}
+
+	/* Clip length if it exceeds worst case tid allocation,
+	   where each entry in the tid array can accommodate only
+	   1 page. */
+	if (*length > 4096*tidc->tid_ctrl->tid_num_max)
+	{
+		*length = 4096*tidc->tid_ctrl->tid_num_max;
+	}
+
+#ifdef PSM_CUDA
+	if (is_cuda_ptr)
+		flags = HFI1_BUF_GPU_MEM;
+#endif
+
+	/* ask the driver to pin the pages and program the TID entries */
+	rc = hfi_update_tid(tidc->context->ctrl,
+			    (uint64_t) (uintptr_t) buf, length,
+			    (uint64_t) (uintptr_t) tid_array, tidcnt, flags);
+	if (rc < 0) {
+		/* Unable to pin pages? retry later */
+		err = PSM2_EP_DEVICE_FAILURE;
+		goto fail;
+	}
+
+	/* account the entries we just consumed */
+	psmi_assert_always((*tidcnt) > 0);
+	psmi_assert(ctrl->tid_num_avail >= (*tidcnt));
+	ctrl->tid_num_avail -= (*tidcnt);
+	tidc->tid_num_total += (*tidcnt);
+	tidc->tid_num_inuse += (*tidcnt);
+
+fail:
+	if (tidc->context->tid_ctrl)
+		pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+	return err;
+}
+
+/*
+ * Release TID entries previously acquired with ips_tid_acquire().
+ *
+ * tid_array - TID entries to free (handed to the driver)
+ * tidcnt    - number of entries; must be > 0 (asserted)
+ *
+ * Returns PSM2_OK, or PSM2_EP_DEVICE_FAILURE (via psmi_handle_error) if
+ * the driver fails to unpin the pages — treated as fatal.  Takes
+ * tid_ctrl_lock only in context-sharing mode; the lock is dropped before
+ * the error path and before invoking the availability callback.
+ */
+psm2_error_t
+ips_tid_release(struct ips_tid *tidc,
+		uint32_t *tid_array, uint32_t tidcnt)
+{
+	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(tidcnt > 0);
+	if (tidc->context->tid_ctrl)
+		pthread_spin_lock(&ctrl->tid_ctrl_lock);
+
+	if (hfi_free_tid(tidc->context->ctrl,
+			 (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) {
+		/* drop the lock before the error handler */
+		if (tidc->context->tid_ctrl)
+			pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+		/* If failed to unpin pages, it's fatal error */
+		err = psmi_handle_error(tidc->context->ep,
+					PSM2_EP_DEVICE_FAILURE,
+					"Failed to tid free %d tids",
+					tidcnt);
+		goto fail;
+	}
+
+	ctrl->tid_num_avail += tidcnt;
+	if (tidc->context->tid_ctrl)
+		pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+	tidc->tid_num_inuse -= tidcnt;
+	/* If an available callback is registered invoke it */
+	/* (fires only when this release transitions the pool from fully
+	 * exhausted back to having free entries) */
+	if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max)
+	    && tidc->tid_avail_cb)
+		tidc->tid_avail_cb(tidc, tidc->tid_avail_context);
+
+fail:
+	return err;
+}
diff --git a/ptl_ips/ips_tid.h b/ptl_ips/ips_tid.h
new file mode 100644
index 0000000..4fcdaf1
--- /dev/null
+++ b/ptl_ips/ips_tid.h
@@ -0,0 +1,169 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* included header files */
+
+#ifndef _IPS_TID_H
+#define _IPS_TID_H
+
+#include "psm_user.h"
+#include "ips_tidcache.h"
+
+struct ips_tid;
+
+typedef void (*ips_tid_avail_cb_fn_t) (struct ips_tid *, void *context);
+
+/* Max tids a context can support */
+#define IPS_TID_MAX_TIDS 2048
+/* Max tid-session buffer size */
+#define PSM_TIDLIST_BUFSIZE 4096
+/* Max tid-session window size */
+#define PSM_TID_WINSIZE (4*1024*1024)
+/* Max number of packets for a single TID flow, fitting tid-session window.
+ In PSM2 packet integrity is realized by PSN (Packet Sequence Number),
+ which is kept as 11 bits field (for 9B KDETH),
+ giving max value 2048 (0 - 2047) */
+#define PSM_TID_MAX_PKTS 2048
+/* Total number of combined pages from the Tid-pair to be merged */
+#define PSM_MAX_NUM_PAGES_IN_TIDPAIR 512
+
+/* Tid accounting, possibly shared between processes when a HW context
+ is shared; tid_ctrl_lock guards the counters in that case. Aligned
+ to 64 bytes, presumably to keep it on its own cache line. */
+struct ips_tid_ctrl {
+ pthread_spinlock_t tid_ctrl_lock;
+ uint32_t tid_num_max; /* total tids in this context */
+ uint32_t tid_num_avail; /* tids currently free */
+} __attribute__ ((aligned(64)));
+
+/* Per-endpoint tid management state (tid pool + optional tid cache). */
+struct ips_tid {
+ const psmi_context_t *context; /* owning HW context */
+ struct ips_protoexp *protoexp; /* associated expected-protocol state */
+
+ void *tid_avail_context; /* opaque arg passed to tid_avail_cb */
+ struct ips_tid_ctrl *tid_ctrl; /* tid counters (shared-context aware) */
+ /* driver event-flags word; tested against HFI1_EVENT_TID_MMU_NOTIFY
+ to detect pending tid invalidations */
+ volatile uint64_t *invalidation_event;
+
+ ips_tid_avail_cb_fn_t tid_avail_cb; /* invoked when tids free up */
+ uint64_t tid_num_total; /* running total of tids ever acquired */
+ uint32_t tid_num_inuse; /* tids currently held */
+ uint32_t tid_cachesize; /* items can be cached */
+ cl_qmap_t tid_cachemap; /* RB tree implementation */
+ /*
+ * tids storage.
+ * This is used in tid registration caching case for
+ * tid invalidation, acquire, replace and release,
+ * entries should be the assigned tid number.
+ */
+ uint32_t *tid_array;
+};
+
+psm2_error_t ips_tid_init(const psmi_context_t *context,
+ struct ips_protoexp *protoexp,
+ ips_tid_avail_cb_fn_t cb, void *cb_context);
+psm2_error_t ips_tid_fini(struct ips_tid *tidc);
+
+/* Acquiring tids.
+ * Buffer base has to be aligned on page boundary
+ * Buffer length has to be multiple pages
+ */
+psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc,
+ const void *buf, /* input buffer, aligned to page boundary */
+ uint32_t *length, /* buffer length, aligned to page size */
+ uint32_t *tid_array, /* output tidarray, */
+ uint32_t *tidcnt, /* output of tid count */
+ uint32_t *pageoff /* output of offset in first tid */
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ );
+
+psm2_error_t ips_tidcache_release(struct ips_tid *tidc,
+ uint32_t *tid_array, /* input tidarray, */
+ uint32_t tidcnt); /* input of tid count */
+
+psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc);
+psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc);
+
+psm2_error_t ips_tid_acquire(struct ips_tid *tidc,
+ const void *buf, /* input buffer, aligned to page boundary */
+ uint32_t *length, /* buffer length, aligned to page size */
+ uint32_t *tid_array, /* output tidarray, */
+ uint32_t *tidcnt
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ ); /* output of tid count */
+
+psm2_error_t ips_tid_release(struct ips_tid *tidc,
+ uint32_t *tid_array, /* input tidarray, */
+ uint32_t tidcnt); /* input of tid count */
+
+/* Number of tids currently available for acquisition.
+ Returns:
+ -1 - no tids free and this process holds them all (waiting on
+ other processes cannot help);
+ 0 - no tids free, but another process sharing the context holds
+ some, so the caller may retry later;
+ n>0 - number of free tids. */
+PSMI_INLINE(int ips_tid_num_available(struct ips_tid *tidc))
+{
+ if (tidc->tid_ctrl->tid_num_avail == 0) {
+ if (tidc->tid_ctrl->tid_num_max == tidc->tid_num_inuse)
+ return -1;
+ else
+ return 0;
+ }
+
+ return tidc->tid_ctrl->tid_num_avail;
+}
+
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+ */
+uint64_t ips_tidcache_evict(struct ips_tid *tidc, uint64_t length);
+
+#endif /* _IPS_TID_H */
diff --git a/ptl_ips/ips_tidcache.c b/ptl_ips/ips_tidcache.c
new file mode 100644
index 0000000..ecc0bba
--- /dev/null
+++ b/ptl_ips/ips_tidcache.c
@@ -0,0 +1,653 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
+#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length<<12))
+#define RBTREE_ASSERT psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->ntid)
+
+#include "rbtree.c"
+
+/* Thin wrapper over the rbtree.c initializer so callers of the tid
+ cache can set up the RB-tree map without including rbtree.c
+ (which is configured by the RBTREE_* macros above) themselves. */
+void ips_tidcache_map_init(cl_qmap_t *p_map,
+ cl_map_item_t* const root,
+ cl_map_item_t* const nil_item)
+{
+ ips_cl_qmap_init(p_map,root,nil_item);
+}
+
+/*
+ *
+ * Force to remove a tid, check invalidation event afterwards.
+ */
+/*
+ *
+ * Force to remove a tid, check invalidation event afterwards.
+ *
+ * The tids to free must already have been copied by the caller into
+ * tidc->tid_array[0..tidcnt-1]; they must be idle (refcount 0) and
+ * not yet invalidated. Frees them in the driver, marks them
+ * invalidated locally and unlinks them from the idle queue and the
+ * RB tree, then processes any invalidation event the kernel may have
+ * raised meanwhile.
+ */
+static psm2_error_t
+ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t idx;
+ psm2_error_t err;
+
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) {
+ /* If failed to unpin pages, it's fatal error.
+ Report the real count (this path is also taken from
+ ips_tidcache_evict with tidcnt > 1). */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", tidcnt);
+ return err;
+ }
+
+ while (tidcnt) {
+ tidcnt--;
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(INVALIDATE(idx) == 0);
+ psmi_assert(REFCNT(idx) == 0);
+
+ /*
+ * mark the tid invalidated.
+ */
+ INVALIDATE(idx) = 1;
+
+ /*
+ * remove the tid from RB tree.
+ */
+ IDLE_REMOVE(idx);
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+ }
+
+ /*
+ * Because the freed tid is not from invalidation list,
+ * it is possible that kernel just invalidated the tid,
+ * then we need to check and process the invalidation
+ * before we can re-use this tid. The reverse order
+ * will wrongly invalidate this tid again.
+ */
+ if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ err = ips_tidcache_invalidation(tidc);
+ if (err)
+ return err;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Register a new buffer with driver, and cache the tidinfo.
+ */
+/*
+ * Register a new buffer with driver, and cache the tidinfo.
+ *
+ * start/length describe a page-aligned virtual range. On success
+ * *firstidx is the RB-array index of the first (lowest-address) tid
+ * programmed for the range. Returns PSM2_OK_NO_PROGRESS when the
+ * cache is full and no idle entry can be reclaimed, or
+ * PSM2_EP_DEVICE_FAILURE when the driver cannot pin the pages (the
+ * caller retries later).
+ */
+static psm2_error_t
+ips_tidcache_register(struct ips_tid *tidc,
+ unsigned long start, uint32_t length, uint32_t *firstidx
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ )
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t tidoff, tidlen;
+ uint32_t idx, tidcnt;
+ uint16_t flags = 0;
+ psm2_error_t err;
+
+ /*
+ * make sure we have at least one free tid to
+ * register the new buffer.
+ */
+ if (NTID == tidc->tid_cachesize) {
+ /* all tids are in active use, error? */
+ if (NIDLE == 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * free the first tid in idle queue.
+ */
+ idx = IPREV(IHEAD);
+ tidc->tid_array[0] = p_map->root[idx].payload.tidinfo;
+ err = ips_tidcache_remove(tidc, 1);
+ if (err)
+ return err;
+ }
+ psmi_assert(NTID < tidc->tid_cachesize);
+
+ /* Clip length if it exceeds worst case tid allocation,
+ where each entry in the tid array can accommodate only
+ 1 page. */
+ if (length > 4096*tidc->tid_ctrl->tid_num_max)
+ {
+ length = 4096*tidc->tid_ctrl->tid_num_max;
+ }
+ /*
+ * register the new buffer.
+ */
+
+retry:
+ tidcnt = 0;
+
+#ifdef PSM_CUDA
+ if (is_cuda_ptr)
+ flags = HFI1_BUF_GPU_MEM;
+#endif
+
+ if (hfi_update_tid(tidc->context->ctrl,
+ (uint64_t) start, &length,
+ (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt, flags) < 0) {
+ /* if driver reaches lockable memory limit */
+ if ((errno == ENOMEM
+#ifdef PSM_CUDA
+ /* This additional check is in place for just the cuda
+ * version. It is a temporary workaround for a known
+ * issue where nvidia driver returns EINVAL instead of
+ * ENOMEM when there is no BAR1 space left to pin pages.
+ * PSM frees tidcache enteries when the driver sends
+ * EINVAL there by unpinning pages and freeing some
+ * BAR1 space.*/
+ || (PSMI_IS_CUDA_ENABLED && errno == EINVAL)
+#endif
+ ) && NIDLE) {
+ uint64_t lengthEvicted = ips_tidcache_evict(tidc,length);
+
+ if (lengthEvicted >= length)
+ goto retry;
+ }
+
+ /* Unable to pin pages? retry later */
+ return PSM2_EP_DEVICE_FAILURE;
+ }
+ psmi_assert_always(tidcnt > 0);
+ psmi_assert((tidcnt+NTID) <= tidc->tid_cachesize);
+
+ /*
+ * backward processing because we want to return
+ * the first RB index in the array.
+ */
+ idx = 0;
+ tidoff = length;
+ while (tidcnt) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ tidcnt--;
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
+ tidlen = IPS_TIDINFO_GET_LENGTH(tidc->tid_array[tidcnt]);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(INVALIDATE(idx) != 0);
+ psmi_assert(REFCNT(idx) == 0);
+
+ /*
+ * clear the tid invalidated.
+ */
+ INVALIDATE(idx) = 0;
+
+ /*
+ * put the tid into a RB node.
+ */
+ tidoff -= tidlen << 12;
+ START(idx) = start + tidoff;
+ LENGTH(idx) = tidlen;
+ p_map->root[idx].payload.tidinfo = tidc->tid_array[tidcnt];
+
+ /*
+ * put the node into RB tree and idle queue head.
+ */
+ IDLE_INSERT(idx);
+ ips_cl_qmap_insert_item(p_map, &p_map->root[idx]);
+ }
+ psmi_assert(idx != 0);
+ psmi_assert(tidoff == 0);
+ *firstidx = idx;
+
+ return PSM2_OK;
+}
+
+/*
+ * Get mmu notifier invalidation info and update PSM's caching.
+ */
+/*
+ * Get mmu notifier invalidation info and update PSM's caching.
+ *
+ * Fetches the list of kernel-invalidated tids into tidc->tid_array,
+ * marks each one invalidated in the cache, and frees (in the driver)
+ * those that are idle; tids still referenced stay cached until their
+ * refcount drops (ips_tidcache_release frees them then).
+ */
+psm2_error_t
+ips_tidcache_invalidation(struct ips_tid *tidc)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t i, j, idx, tidcnt;
+ psm2_error_t err;
+
+ /*
+ * get a list of invalidated tids from driver,
+ * driver will clear the event bit before return.
+ */
+ tidcnt = 0;
+ if (hfi_get_invalidation(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt) < 0) {
+ /* If failed to get invalidation info, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to get invalidation info");
+ return err;
+ }
+ psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max);
+
+ /* j compacts tid_array in place: entries 0..j-1 collect the idle
+ invalidated tids that must also be freed in the driver. */
+ j = 0;
+ for (i = 0; i < tidcnt; i++) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]);
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[i]);
+ psmi_assert(LENGTH(idx) ==
+ IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i]));
+
+ /*
+ * if the tid is already invalidated, ignore it,
+ * but do sanity check.
+ */
+ if (INVALIDATE(idx) != 0) {
+ psmi_assert(REFCNT(idx) == 0);
+ continue;
+ }
+
+ /*
+ * mark the tid invalidated.
+ */
+ INVALIDATE(idx) = 1;
+
+ /*
+ * if the tid is idle, remove the tid from RB tree
+ * and idle queue, put on free list.
+ */
+ if (REFCNT(idx) == 0) {
+ IDLE_REMOVE(idx);
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+
+ if (i != j)
+ tidc->tid_array[j] = tidc->tid_array[i];
+ j++;
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Acquire tids covering [buf, buf + *length), reusing cached entries
+ * where possible and registering new sub-ranges with the driver for
+ * the gaps.
+ *
+ * In/out:
+ * length - requested byte count on input; on return may be reduced
+ * to the contiguous prefix actually covered by tids.
+ * tidoff - incremented by the byte offset of buf within the first
+ * matched tid.
+ * Out:
+ * tid_array/tidcnt - tids covering the returned range (>= 1 on
+ * success); each acquired tid has its refcount bumped and is
+ * removed from the idle queue while in use.
+ */
+psm2_error_t
+ips_tidcache_acquire(struct ips_tid *tidc,
+ const void *buf, uint32_t *length,
+ uint32_t *tid_array, uint32_t *tidcnt,
+ uint32_t *tidoff
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ )
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ cl_map_item_t *p_item;
+ unsigned long start = (unsigned long)buf;
+ unsigned long end = start + (*length);
+ uint32_t idx, nbytes;
+ psm2_error_t err;
+
+ /*
+ * Before every tid caching search, we need to update the
+ * tid caching if there is invalidation event, otherwise,
+ * the cached address may be invalidated and we might have
+ * wrong matching.
+ */
+ if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ err = ips_tidcache_invalidation(tidc);
+ if (err)
+ return err;
+ }
+
+ /*
+ * Now we can do matching from the caching, because obsolete
+ * address in caching has been removed or identified.
+ */
+retry:
+ p_item = ips_cl_qmap_search(p_map, start, end);
+ idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) +
+ IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo);
+
+ /*
+ * There is tid matching.
+ */
+ if (idx) {
+ /*
+ * if there is a caching match, but the tid has been
+ * invalidated, we can't match this tid, and we also
+ * can't register this address, we need to wait this
+ * tid to be freed.
+ */
+ if (INVALIDATE(idx) != 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * if the page offset within the tid is not less than
+ * 128K, the address offset within the page is not 64B
+ * multiple, PSM can't handle this tid with any offset
+ * mode. We need to free this tid and re-register with
+ * the asked page address.
+ */
+ if (((start - START(idx)) >= 131072) && ((*tidoff) & 63)) {
+ /*
+ * If the tid is currently used, retry later.
+ */
+ if (REFCNT(idx) != 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * free this tid.
+ */
+ tidc->tid_array[0] = p_map->root[idx].payload.tidinfo;
+ err = ips_tidcache_remove(tidc, 1);
+ if (err)
+ return err;
+
+ /* try to match a node again */
+ goto retry;
+ }
+ }
+
+ /*
+ * If there is no match node, or 'start' falls out of node range,
+ * whole or partial buffer from 'start' is not registered yet.
+ */
+ if (!idx || START(idx) > start) {
+ if (!idx)
+ nbytes = end - start;
+ else
+ nbytes = START(idx) - start;
+
+ /*
+ * Because we don't have any match tid yet, if
+ * there is an error, we return from here, PSM
+ * will try later.
+ */
+ err = ips_tidcache_register(tidc, start, nbytes, &idx
+#ifdef PSM_CUDA
+ , is_cuda_ptr
+#endif
+ );
+ if (err)
+ return err;
+ }
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(START(idx) <= start);
+ psmi_assert(INVALIDATE(idx) == 0);
+
+ /* first tid: account for buf's offset inside the tid's pages */
+ *tidoff += start - START(idx);
+ *tidcnt = 1;
+
+ tid_array[0] = p_map->root[idx].payload.tidinfo;
+ REFCNT(idx)++;
+ if (REFCNT(idx) == 1)
+ IDLE_REMOVE(idx);
+ start = END(idx);
+
+ /* walk forward through cache/driver until the request is covered
+ or we must stop at an invalidated or unregisterable range */
+ while (start < end) {
+ p_item = ips_cl_qmap_successor(p_map, &p_map->root[idx]);
+ idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) +
+ IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo);
+ if (!idx || START(idx) != start) {
+ if (!idx)
+ nbytes = end - start;
+ else
+ nbytes = (START(idx) > end) ?
+ (end - start) :
+ (START(idx) - start);
+
+ /*
+ * Because we already have at least one match tid,
+ * if it is error to register new pages, we break
+ * here and return the tids we already have.
+ */
+ err = ips_tidcache_register(tidc, start, nbytes, &idx
+#ifdef PSM_CUDA
+ , is_cuda_ptr
+#endif
+ );
+ if (err)
+ break;
+ } else if (INVALIDATE(idx) != 0) {
+ /*
+ * the tid has been invalidated, it is still in
+ * caching because it is still being used, but
+ * any new usage is not allowed, we ignore it and
+ * return the tids we already have.
+ */
+ psmi_assert(REFCNT(idx) != 0);
+ break;
+ }
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(START(idx) == start);
+ psmi_assert(INVALIDATE(idx) == 0);
+
+ tid_array[(*tidcnt)++] = p_map->root[idx].payload.tidinfo;
+ REFCNT(idx)++;
+ if (REFCNT(idx) == 1)
+ IDLE_REMOVE(idx);
+ start = END(idx);
+ }
+
+ /* report the contiguous prefix we actually covered */
+ if (start < end)
+ *length = start - (unsigned long)buf;
+ /* otherwise, all pages are registered */
+ psmi_assert((*tidcnt) > 0);
+
+ return PSM2_OK;
+}
+
+/*
+ * Drop a reference on each tid in tid_array. A tid whose refcount
+ * reaches zero either returns to the idle queue (still cached), or -
+ * if it was invalidated while in use - is removed from the cache and
+ * freed in the driver.
+ */
+psm2_error_t
+ips_tidcache_release(struct ips_tid *tidc,
+ uint32_t *tid_array, uint32_t tidcnt)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t i, j, idx;
+ psm2_error_t err;
+
+ psmi_assert(tidcnt > 0);
+
+ /* j counts invalidated-and-now-idle tids collected into
+ tidc->tid_array for a single driver free call below */
+ j = 0;
+ for (i = 0; i < tidcnt; i++) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) +
+ IPS_TIDINFO_GET_TIDCTRL(tid_array[i]);
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(REFCNT(idx) != 0);
+
+ REFCNT(idx)--;
+ if (REFCNT(idx) == 0) {
+ if (INVALIDATE(idx) != 0) {
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+
+ tidc->tid_array[j] = tid_array[i];
+ j++;
+ } else {
+ IDLE_INSERT(idx);
+ }
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ *
+ * Call driver to free all cached tids.
+ */
+/*
+ *
+ * Call driver to free all cached tids.
+ *
+ * Expects every tid to be idle (refcount 0). Frees all still-valid
+ * cached tids in one driver call, then releases the cache storage.
+ */
+psm2_error_t
+ips_tidcache_cleanup(struct ips_tid *tidc)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ psm2_error_t err;
+ /* NOTE(review): i is int while tid_num_max is uint32_t -
+ mixed-sign comparison below; harmless for realistic tid counts */
+ int i, j;
+
+ j = 0;
+ for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) {
+ psmi_assert(REFCNT(i) == 0);
+ if (INVALIDATE(i) == 0) {
+ tidc->tid_array[j++] = p_map->root[i].payload.tidinfo;
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ psmi_free(tidc->tid_array);
+ psmi_free(tidc->tid_cachemap.root);
+
+ return PSM2_OK;
+}
+
+
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+ */
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+
+ Walks the idle queue from its tail (oldest idle entries first, since
+ IDLE_INSERT adds at the head) collecting tids until at least `length`
+ bytes are gathered or the idle queue is exhausted, then frees them.
+ Returns the number of bytes freed, or 0 if the driver free failed.
+ */
+uint64_t
+ips_tidcache_evict(struct ips_tid *tidc,uint64_t length)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ /* NOTE(review): tidlen is uint32_t but the return type and `length`
+ are uint64_t - fine while windows stay below 4GB; confirm */
+ uint32_t idx = IHEAD, tidcnt = 0, tidlen = 0;
+ /*
+ * try to free the required
+ * pages from idle queue tids
+ */
+
+ do {
+ idx = IPREV(idx);
+ psmi_assert(idx != 0);
+ tidc->tid_array[tidcnt] =
+ p_map->root[idx].payload.tidinfo;
+ tidcnt++;
+
+ /* tid lengths are in 4KB pages; convert to bytes */
+ tidlen += IPS_TIDINFO_GET_LENGTH
+ (p_map->root[idx].payload.tidinfo)<<12;
+ } while (tidcnt < NIDLE && tidlen < length);
+
+ /*
+ * free the selected tids on successfully finding some.
+ */
+ if (tidcnt > 0 && ips_tidcache_remove(tidc, tidcnt))
+ return 0;
+
+ return tidlen;
+}
diff --git a/ptl_ips/ips_tidcache.h b/ptl_ips/ips_tidcache.h
new file mode 100644
index 0000000..20d45bf
--- /dev/null
+++ b/ptl_ips/ips_tidcache.h
@@ -0,0 +1,158 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _IPS_TIDCACHE_H
+#define _IPS_TIDCACHE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/*
+ * Design notes.
+ *
+ * PSM needs to call into driver to program receiving buffer pages to
+ * HFI gen1 hardware, each tid can be programmed with physically contiguous
+ * power-of-two pages from 1 pages to 512 pages. This procedure takes
+ * time.
+ *
+ * Lots of applications tend to re-use the same receiving buffer, caching
+ * such programmed tids in user space process will save time and improve
+ * application performance.
+ *
+ * This PSM tid registration caching design requires cooperation between
+ * PSM and driver. Here is what happen between PSM and driver.
+ *
+ * 1. PSM call into driver with a chunk of buffer with virtual address
+ * and length.
+ * 2. driver pins the buffer pages, program hardware with the physical
+ * pages, get a list of tids.
+ * 3. driver caches the tids with the corresponding virtual address in
+ * user space for each tid, and return the list of tids back to PSM.
+ * 4. PSM also caches the list of tids with the corresponding virtual
+ * address for each tid, and use the list of tids for transmission.
+ * 5. when process frees a buffer, kernel VM will catch the event and
+ * calls the callback in driver to notify that the virtual address
+ * range is gone in the process.
+ * 6. driver will search its cache system and find the tids with the
+ * removed virtual address, put these tid in an invalidation queue
+ * and notify PSM the event.
+ * 7. PSM will pick the event and remove the tids from its own cache
+ * as well.
+ * 8. PSM must check such invalidation event every time before searching
+ * its caching system to match tids for a 'new' buffer chunk.
+ * 9. when the caching system is full, and a new buffer chunk is asked
+ * to register, PSM picks a victim to remove.
+ */
+
+/* Per-tid payload stored in each RB-tree node of the tid cache. */
+typedef struct
+{
+ unsigned long start; /* start virtual address */
+ uint32_t tidinfo; /* tid encoding */
+ uint16_t length; /* length in pages */
+ uint16_t invalidate; /* invalidate flag */
+ uint16_t refcount; /* usage reference count */
+ uint16_t i_prev; /* idle queue previous */
+ uint16_t i_next; /* idle queue next */
+} rbtree_tidcache_mapitem_pl_t;
+
+/* Map-level payload: cache occupancy counters. */
+typedef struct {
+ uint32_t ntid; /* tids are cached */
+ uint32_t nidle; /* tids are idle */
+} rbtree_tidcache_map_pl_t;
+
+#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t
+
+#include "rbtree.h"
+
+/*
+ * Macro definition for easy programming.
+ *
+ * All of these expect a local `p_map` naming the cache's cl_qmap_t;
+ * `x` is an index into the RB node array (index 0 is reserved).
+ */
+
+#define NTID p_map->payload.ntid
+#define REFCNT(x) p_map->root[x].payload.refcount
+#define INVALIDATE(x) p_map->root[x].payload.invalidate
+
+#define LENGTH(x) p_map->root[x].payload.length
+#define START(x) p_map->root[x].payload.start
+#define END(x) (START(x) + (LENGTH(x)<<12))
+
+/*
+ * Macro for idle tid queue management.
+ * Doubly-linked list threaded through the RB nodes with node 0 as the
+ * sentinel head; new idle entries go in at the head, so IPREV(IHEAD)
+ * is the oldest idle entry (eviction starts there).
+ */
+#define NIDLE p_map->payload.nidle
+#define IHEAD 0
+#define INEXT(x) p_map->root[x].payload.i_next
+#define IPREV(x) p_map->root[x].payload.i_prev
+
+#define IDLE_REMOVE(x) do { \
+ INEXT(IPREV(x)) = INEXT(x); \
+ IPREV(INEXT(x)) = IPREV(x); \
+ NIDLE--; \
+ } while (0)
+
+#define IDLE_INSERT(x) do { \
+ INEXT(x) = INEXT(IHEAD); \
+ IPREV(x) = IHEAD; \
+ IPREV(INEXT(IHEAD)) = x; \
+ INEXT(IHEAD) = x; \
+ NIDLE++; \
+ } while (0)
+
+extern void ips_tidcache_map_init(cl_qmap_t *p_map,
+ cl_map_item_t* const root,
+ cl_map_item_t* const nil_item);
+
+#endif
diff --git a/ptl_ips/ips_tidflow.c b/ptl_ips/ips_tidflow.c
new file mode 100644
index 0000000..06b9c58
--- /dev/null
+++ b/ptl_ips/ips_tidflow.c
@@ -0,0 +1,267 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * Initialize the tidflow container @tfc for endpoint context @context.
+ *
+ * Allocates the tidrecvc descriptor array (one per hardware flow) and,
+ * unless the context supplies a shared-memory control structure for
+ * context sharing, allocates a private ips_tf_ctrl.  Only the master
+ * subcontext (subctxt == 0) initializes the shared control state and
+ * resets the hardware flows.  @cb is invoked later when flows become
+ * available again after exhaustion.
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY on allocation failure.
+ */
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+ const psmi_context_t *context,
+ struct ips_tf *tfc,
+ ips_tf_avail_cb_fn_t cb)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ int tf_idx;
+
+#if TF_ADD
+ struct psmi_stats_entry entries[] = {
+ PSMI_STATS_DECL("tidflow update count",
+ MPSPAWN_STATS_REDUCTION_ALL,
+ NULL, &tfc->tf_num_total),
+ };
+#endif
+
+ tfc->context = context;
+ tfc->tf_num_total = 0;
+ tfc->tf_num_inuse = 0;
+ tfc->tf_avail_cb = cb;
+ tfc->tf_avail_context = (void *)protoexp;
+ /* Generation width depends on whether extended PSNs are enabled. */
+ if ((context->runtime_flags & HFI1_CAP_EXTENDED_PSN)) {
+ tfc->tf_gen_mask = 0xFFFFF;
+ } else {
+ tfc->tf_gen_mask = 0x1FFF;
+ }
+
+ /* Allocate and Initialize tidrecvc array. */
+ tfc->tidrecvc = (struct ips_tid_recv_desc *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS);
+ if (tfc->tidrecvc == NULL)
+ return PSM2_NO_MEMORY;
+
+ for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+ tfc->tidrecvc[tf_idx].context = context;
+ tfc->tidrecvc[tf_idx].protoexp = protoexp;
+ tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx;
+ tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx;
+ tfc->tidrecvc[tf_idx].tidflow.flowid = EP_FLOW_TIDFLOW;
+ tfc->tidrecvc[tf_idx].tidflow.frag_size = protoexp->proto->epinfo.ep_mtu;
+ }
+
+ /* Shared control structure, it will be in shared memory
+ * for context sharing, otherwise calloc() it */
+ tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl;
+ if (!tfc->tf_ctrl) {
+ tfc->tf_ctrl = (struct ips_tf_ctrl *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_tf_ctrl));
+ if (tfc->tf_ctrl == NULL) {
+ /* Don't leak the tidrecvc array allocated above;
+ * NULL it so a later ips_tf_fini() is harmless. */
+ psmi_free(tfc->tidrecvc);
+ tfc->tidrecvc = NULL;
+ return PSM2_NO_MEMORY;
+ }
+ }
+
+ /*
+ * Only the master process can initialize.
+ */
+ if (ctxt_info->subctxt == 0) {
+ pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock,
+ PTHREAD_PROCESS_SHARED);
+ tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS;
+ tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS;
+
+ for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+ /* Update flow state */
+ tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED;
+ tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx;
+ tfc->tf_ctrl->tf[tf_idx].next_gen = 0;
+ tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1;
+
+ hfi_tidflow_reset(context->ctrl, tf_idx,
+ tfc->tf_gen_mask, 0x7FF);
+ }
+ tfc->tf_ctrl->tf_head = 0;
+ }
+
+#if TF_ADD
+ /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */
+ /* Fixed: was 'tidc', an undeclared identifier; the stats context
+ * for this module is 'tfc'. */
+ return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+ PSMI_STATSTYPE_TIDS,
+ entries,
+ PSMI_STATS_HOWMANY(entries), tfc);
+#else
+ return PSM2_OK;
+#endif
+}
+
+/*
+ * Tear down a tidflow container created by ips_tf_init().
+ * tf_ctrl is freed only when it was privately calloc'd by ips_tf_init()
+ * (i.e. the context did NOT supply a shared-memory tf_ctrl); a shared
+ * control structure is owned by the shared-memory segment, not by us.
+ */
+psm2_error_t ips_tf_fini(struct ips_tf *tfc)
+{
+ if (!tfc->context->tf_ctrl)
+ psmi_free(tfc->tf_ctrl);
+ psmi_free(tfc->tidrecvc);
+ return PSM2_OK;
+}
+
+/* Allocate a tidflow */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+ struct ips_tid_recv_desc **tidrecvc)
+{
+ struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+ struct ips_tf_entry *entry;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+ if (!ctrl->tf_num_avail) {
+ psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS);
+ *tidrecvc = NULL;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ entry = &ctrl->tf[ctrl->tf_head];
+ ctrl->tf_head = entry->next_free;
+ ctrl->tf_num_avail--;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ tfc->tf_num_total++;
+ tfc->tf_num_inuse++;
+
+ psmi_assert(entry->state == TF_STATE_DEALLOCATED);
+ entry->state = TF_STATE_ALLOCATED;
+
+ *tidrecvc = &(tfc->tidrecvc[entry->tf_idx]);
+ /* initial tidflow generation */
+ (*tidrecvc)->tidflow_active_gen = entry->next_gen;
+
+ psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx);
+ psmi_assert_always(entry->next_gen < tfc->tf_gen_mask);
+
+ entry->next_gen++;
+ if (entry->next_gen == tfc->tf_gen_mask)
+ entry->next_gen = 0;
+
+ return PSM2_OK;
+}
+
+/* Deallocate a tidflow */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx)
+{
+ struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+ struct ips_tf_entry *entry;
+
+ psmi_assert(tf_idx < HFI_TF_NFLOWS);
+ psmi_assert(tf_idx >= 0);
+
+ entry = &ctrl->tf[tf_idx];
+ psmi_assert(entry->state == TF_STATE_ALLOCATED);
+ entry->state = TF_STATE_DEALLOCATED;
+
+ /*
+ * The wire protocol only uses 16bits tidrecvc generation
+ * count in exptid packet, this should be bigger enough,
+ * u16w3 is the lower 16bits of _desc_genc
+ */
+ tfc->tidrecvc[tf_idx].rdescid.u16w3++;
+
+ /* Mark invalid generation for flow (stale packets will be dropped) */
+ hfi_tidflow_reset(tfc->context->ctrl, tf_idx, tfc->tf_gen_mask, 0x7FF);
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+ entry->next_free = ctrl->tf_head;
+ ctrl->tf_head = tf_idx;
+ ctrl->tf_num_avail++;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ tfc->tf_num_inuse--;
+ /* If an available callback is registered invoke it */
+ if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb)
+ tfc->tf_avail_cb(tfc, tfc->tf_avail_context);
+
+ return PSM2_OK;
+}
+
+/* Allocate a generation for a flow */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+ uint32_t tf_idx, uint32_t *tfgen)
+{
+ struct ips_tf_entry *entry;
+ int ret = PSM2_OK;
+
+ psmi_assert(tf_idx < HFI_TF_NFLOWS);
+ psmi_assert(tf_idx >= 0);
+
+ entry = &tfc->tf_ctrl->tf[tf_idx];
+ psmi_assert(entry->state == TF_STATE_ALLOCATED);
+
+ *tfgen = entry->next_gen;
+
+ entry->next_gen++;
+ if (entry->next_gen == tfc->tf_gen_mask)
+ entry->next_gen = 0;
+
+ psmi_assert_always(*tfgen < tfc->tf_gen_mask);
+
+ return ret;
+}
diff --git a/ptl_ips/ips_tidflow.h b/ptl_ips/ips_tidflow.h
new file mode 100644
index 0000000..5578dc5
--- /dev/null
+++ b/ptl_ips/ips_tidflow.h
@@ -0,0 +1,133 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_TIDFLOW_H
+#define _IPS_TIDFLOW_H
+
+#include "psm_user.h"
+
+struct ips_tf;
+struct ips_protoexp;
+
+/* Callback invoked when tidflows become available again after
+ * exhaustion; 'context' is the pointer registered at init time. */
+typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context);
+/* Lifecycle state of a single tidflow entry. */
+typedef enum {
+ TF_STATE_INVALID = 0,
+ TF_STATE_ALLOCATED = 1,
+ TF_STATE_DEALLOCATED = 2
+} tf_state_t;
+
+/* One hardware tidflow: state, its own index, the next generation to
+ * hand out, and the free-list link (index of the next free entry). */
+struct ips_tf_entry {
+ tf_state_t state;
+ uint32_t tf_idx;
+ uint32_t next_gen;
+ uint32_t next_free;
+};
+
+/* Control state shared by all subcontexts of a context-sharing group
+ * (placed in shared memory in that case); the spinlock protects the
+ * free list (tf_head / tf_num_avail / next_free). */
+struct ips_tf_ctrl {
+ pthread_spinlock_t tf_ctrl_lock;
+ uint32_t tf_num_max; /* total number of flows (HFI_TF_NFLOWS) */
+ uint32_t tf_num_avail; /* flows currently on the free list */
+ uint32_t tf_head; /* free-list head index */
+ struct ips_tf_entry tf[HFI_TF_NFLOWS];
+} __attribute__ ((aligned(64)));
+
+/* Per-process tidflow container. */
+struct ips_tf {
+ const psmi_context_t *context;
+ ips_tf_avail_cb_fn_t tf_avail_cb; /* fires when flows free up */
+ void *tf_avail_context;
+ struct ips_tf_ctrl *tf_ctrl; /* shared or privately calloc'd */
+
+ uint64_t tf_num_total; /* cumulative allocations (stats) */
+ uint32_t tf_num_inuse; /* flows this process holds now */
+ uint32_t tf_gen_mask; /* generation wrap mask (PSN-size dependent) */
+
+#ifdef PSM_CUDA
+ void *host_to_gpu_bounce_buf_pool;
+#endif
+
+ /* Pointer to array of size HFI_TF_NFLOWS */
+ struct ips_tid_recv_desc *tidrecvc;
+};
+
+/*
+ * How many tidflows are available to allocate?
+ * Returns the shared free count when nonzero; when zero, returns -1 if
+ * this process itself holds every flow (nothing will free up from
+ * elsewhere), or 0 if some other subcontext holds flows that may be
+ * released.
+ */
+PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf))
+{
+ /* Fast path: free flows exist right now. */
+ if (tf->tf_ctrl->tf_num_avail != 0)
+ return tf->tf_ctrl->tf_num_avail;
+
+ /* None free: distinguish "we own them all" from "others own some". */
+ return (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse) ? -1 : 0;
+}
+
+/* Initialize/tear down the tidflow container; 'cb' is invoked when
+ * flows become available again after exhaustion. */
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+ const psmi_context_t *context,
+ struct ips_tf *tfc,
+ ips_tf_avail_cb_fn_t cb);
+psm2_error_t ips_tf_fini(struct ips_tf *tfc);
+
+/* Allocate a tidflow; on success *tidrecvc points at its receive
+ * descriptor, else PSM2_EP_NO_RESOURCES with *tidrecvc = NULL. */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+ struct ips_tid_recv_desc **tidrecvc);
+
+/* Deallocate a tidflow previously returned by ips_tf_allocate() */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx);
+
+/* Allocate a generation for a flow; *tfgen receives the generation */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+ uint32_t tf_idx, uint32_t *tfgen);
+
+#endif
diff --git a/ptl_ips/ips_writehdrq.c b/ptl_ips/ips_writehdrq.c
new file mode 100644
index 0000000..1bb8697
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.c
@@ -0,0 +1,110 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_writehdrq.h"
+
+/*
+ * Initialize a write header queue: the queue a process uses to forward
+ * received headers/payloads to a subcontext sharing this port.
+ * 'state' lives in shared memory (see ips_writehdrq_state) and is only
+ * referenced, never copied; the hdrq/egrq params are deep-copied.
+ */
+psm2_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ struct ips_writehdrq *writeq,
+ struct ips_writehdrq_state *state, uint32_t runtime_flags)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+
+ memset(writeq, 0, sizeof(*writeq));
+ writeq->context = context;
+ writeq->state = state;
+ writeq->hdrq = *hdrq_params; /* deep copy */
+ /* offset (in dwords) of the last element in the header queue */
+ writeq->hdrq_elemlast =
+ ((writeq->hdrq.elemcnt - 1) * writeq->hdrq.elemsz);
+ writeq->egrq = *egrq_params; /* deep copy */
+ /* NOTE(review): allocation result is not checked here — presumably
+ * the helper aborts internally on failure; verify. */
+ writeq->egrq_buftable =
+ ips_recvq_egrbuf_table_alloc(context->ep, writeq->egrq.base_addr,
+ writeq->egrq.elemcnt,
+ writeq->egrq.elemsz);
+ writeq->runtime_flags = runtime_flags;
+ /* RHF sits in the last 8 bytes of each rcvhdrq entry */
+ writeq->hdrq_rhf_off =
+ (ctxt_info->rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT;
+
+ if (writeq->runtime_flags & HFI1_CAP_DMA_RTAIL) {
+ /* DMA_RTAIL: copy the full header entry, RHF included */
+ writeq->hdrq_hdr_copysz =
+ writeq->hdrq.elemsz * sizeof(uint32_t);
+ writeq->state->hdrq_rhf_seq = 0; /* _seq is ignored */
+ } else {
+ writeq->state->hdrq_rhf_seq = 1;
+ /*
+ * We don't allow readers to see the RHF until the writer can
+ * atomically write an updated RHF.
+ */
+ writeq->hdrq_hdr_copysz =
+ (writeq->hdrq.elemsz - 2) * sizeof(uint32_t);
+ /*
+ * Ensure 8-byte alignment of the RHF by looking at RHF of the second
+ * header, which is required for atomic RHF updates.
+ */
+ psmi_assert_always(!((uintptr_t) (writeq->hdrq.base_addr +
+ writeq->hdrq.elemsz +
+ writeq->hdrq_rhf_off) & 0x7));
+ }
+ writeq->state->enabled = 1;
+ return PSM2_OK;
+}
+
+/* Release the eager index-to-pointer table; the shared 'state' and the
+ * queue memory itself are owned elsewhere and are not touched here. */
+psm2_error_t ips_writehdrq_fini(struct ips_writehdrq *writeq)
+{
+ ips_recvq_egrbuf_table_free(writeq->egrq_buftable);
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_writehdrq.h b/ptl_ips/ips_writehdrq.h
new file mode 100644
index 0000000..ff95000
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.h
@@ -0,0 +1,269 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_WRITEHDRQ_H
+#define _IPS_WRITEHDRQ_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_recvq.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Structure containing state for writehdrq writing. This is logically
+ * part of ips_writehdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the port. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+struct ips_writehdrq_state {
+ uint32_t hdrq_rhf_seq; /* last seq (unused when DMA_RTAIL) */
+ uint32_t egrq_offset; /* in bytes unit, not 64B */
+ uint32_t enabled; /* enables writing */
+};
+
+/* Per-process view of one subcontext's write header queue.  Unlike the
+ * state above, this may hold pointers since it is private memory. */
+struct ips_writehdrq {
+ const psmi_context_t *context;
+ struct ips_writehdrq_state *state; /* shared-memory state, see above */
+ struct ips_recvq_params hdrq;
+ uint32_t hdrq_elemlast; /* dword offset of last hdrq element */
+ uint32_t hdrq_rhf_off; /* rhf offset (dwords) within an element */
+ uint32_t hdrq_hdr_copysz; /* bytes of header copied per packet */
+ struct ips_recvq_params egrq;
+ void **egrq_buftable; /* table of eager idx-to-ptr */
+ uint32_t runtime_flags;
+};
+
+/* Initialize a write header queue; 'state' must live in shared memory
+ * when contexts are shared.  Returns PSM2_OK. */
+psm2_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ struct ips_writehdrq *writeq,
+ struct ips_writehdrq_state *state, uint32_t runtime_flags);
+
+/* Release resources allocated by ips_writehdrq_init() */
+psm2_error_t ips_writehdrq_fini(struct ips_writehdrq *writeq);
+
+/*
+ * Publish a finished RHF to the reader with a single 8-byte store.
+ * Init asserts 8-byte alignment of the destination, so on x86-64 this
+ * store is not torn and the reader sees either the old or the new RHF
+ * in full.  NOTE(review): the uint32_t* -> uint64_t* casts technically
+ * violate strict aliasing; presumably the build disables that
+ * optimization (-fno-strict-aliasing) — confirm against the Makefile.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_writehdrq_write_rhf_atomic(uint32_t *rhf_dest, uint32_t *rhf_src))
+{
+ /*
+ * In 64-bit mode, we check in init that the rhf will always be 8-byte
+ * aligned
+ */
+ *((uint64_t *) rhf_dest) = *((uint64_t *) rhf_src);
+ return;
+}
+
+/*
+ * Forward one eager packet's payload (and header) to a subcontext's
+ * eager ring.  Payload is packed at 64B-aligned offsets within eager
+ * entries; when the current entry's remainder is too small we advance
+ * the tail to the next entry and retry.  On success the RHF is fixed
+ * up with the subcontext-local eager index/offset and
+ * IPS_RECVHDRQ_CONTINUE is returned; if the eager ring is full the
+ * payload is dropped, the header is marked with a TID error, and
+ * IPS_RECVHDRQ_BREAK is returned.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_write_eager_packet(struct ips_writehdrq *writeq, uint32_t *write_hdr,
+ uint32_t *write_rhf,
+ const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t write_egr_tail = ips_recvq_tail_get(&writeq->egrq);
+ uint32_t next_write_egr_tail = write_egr_tail;
+ /* checksum is trimmed from paylen, we need to add back */
+ uint32_t rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0);
+ psmi_assert(rcv_paylen > 0);
+
+ /* Loop as long as the write eager queue is NOT full */
+ while (1) {
+ /* ring-buffer advance of the candidate tail */
+ next_write_egr_tail++;
+ if (next_write_egr_tail >= writeq->egrq.elemcnt)
+ next_write_egr_tail = 0;
+ if (next_write_egr_tail == ips_recvq_head_get(&writeq->egrq)) {
+ break; /* queue full: fall through to the drop path */
+ }
+
+ /* Move to next eager entry if leftover is not enough */
+ if ((writeq->state->egrq_offset + rcv_paylen) >
+ writeq->egrq.elemsz) {
+ writeq->state->egrq_offset = 0;
+ write_egr_tail = next_write_egr_tail;
+
+ /* Update the eager buffer tail pointer */
+ ips_recvq_tail_update(&writeq->egrq, write_egr_tail);
+ } else {
+ /* There is enough space in this entry! */
+ /* Use pre-calculated address from look-up table */
+ char *write_payload =
+ ips_recvq_egr_index_2_ptr(writeq->egrq_buftable,
+ write_egr_tail,
+ writeq->state->
+ egrq_offset);
+ const char *rcv_payload =
+ ips_recvhdrq_event_payload(rcv_ev);
+
+ psmi_assert(write_payload != NULL);
+ psmi_assert(rcv_payload != NULL);
+ psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen);
+
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr,
+ writeq->hdrq_hdr_copysz);
+
+ /* Fix up the header with the subcontext's eager index/offset */
+ hfi_hdrset_egrbfr_index((uint32_t *) write_rhf,
+ write_egr_tail);
+ /* offset is carried in 64B units, hence the >>6 */
+ hfi_hdrset_egrbfr_offset((uint32_t *) write_rhf,
+ (writeq->state->
+ egrq_offset >> 6));
+
+ /* Update offset to next 64B boundary */
+ writeq->state->egrq_offset =
+ (writeq->state->egrq_offset + rcv_paylen +
+ 63) & (~63);
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ }
+
+ /* At this point, the eager queue is full -- drop the packet. */
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr, writeq->hdrq_hdr_copysz);
+
+ /* Mark header with ETIDERR (eager overflow) */
+ hfi_hdrset_err_flags(write_rhf, HFI_RHF_TIDERR);
+
+ /* Clear UseEgrBfr bit because payload is dropped */
+ hfi_hdrset_use_egrbfr(write_rhf, 0);
+ return IPS_RECVHDRQ_BREAK;
+}
+
+/*
+ * Append one received packet to a subcontext's write header queue
+ * (and, for eager packets, to its eager ring via
+ * ips_write_eager_packet).  Returns IPS_RECVHDRQ_CONTINUE on success
+ * or IPS_RECVHDRQ_BREAK when the packet was dropped (queue disabled
+ * or full).  Without DMA_RTAIL, the RHF is staged in a local copy and
+ * published last with a single atomic 8-byte store, after a write
+ * barrier, so the reader never observes a half-written entry.
+ */
+PSMI_INLINE(
+int
+ips_writehdrq_append(struct ips_writehdrq *writeq,
+ const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t write_hdr_head;
+ uint32_t write_hdr_tail;
+ uint32_t *write_hdr;
+ uint32_t *write_rhf;
+ uint32_t next_write_hdr_tail;
+ /* local staging area for the RHF (non-DMA_RTAIL path) */
+ union {
+ uint32_t u32[2];
+ uint64_t u64;
+ } rhf;
+ int result = IPS_RECVHDRQ_CONTINUE;
+
+ /* Drop packet if write header queue is disabled */
+ if (!writeq->state->enabled) {
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ write_hdr_head = ips_recvq_head_get(&writeq->hdrq);
+ write_hdr_tail = ips_recvq_tail_get(&writeq->hdrq);
+ write_hdr = writeq->hdrq.base_addr + write_hdr_tail;
+ write_rhf = write_hdr + writeq->hdrq_rhf_off;
+
+ /* Drop packet if write header queue is full */
+ next_write_hdr_tail = write_hdr_tail + writeq->hdrq.elemsz;
+ if (next_write_hdr_tail > writeq->hdrq_elemlast) {
+ next_write_hdr_tail = 0;
+ }
+ if (next_write_hdr_tail == write_hdr_head) {
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ /*
+ * If not DMA_RTAIL, don't let consumer see RHF until it's ready.
+ * We copy the source rhf and operate on it until we are ready
+ * to atomically update it for the reader.
+ */
+ if (!(writeq->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ write_rhf = &rhf.u32[0];
+ rhf.u64 = *((uint64_t *) rcv_ev->rhf);
+ }
+
+ if (hfi_hdrget_use_egrbfr(rcv_ev->rhf)) {
+ result = ips_write_eager_packet(writeq,
+ write_hdr, write_rhf, rcv_ev);
+ } else {
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr,
+ writeq->hdrq_hdr_copysz);
+ }
+
+ /* Ensure previous writes are visible before writing rhf seq or tail */
+ ips_wmb();
+
+ if (!(writeq->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ /* We accumulated a few changes to the RHF and now want to make it
+ * atomically visible for the reader.
+ */
+ uint32_t rhf_seq = writeq->state->hdrq_rhf_seq;
+ hfi_hdrset_seq((uint32_t *) write_rhf, rhf_seq);
+ /* seq counts 1..LAST_RHF_SEQNO, then wraps back to 1 */
+ if (rhf_seq >= LAST_RHF_SEQNO)
+ writeq->state->hdrq_rhf_seq = 1;
+ else
+ writeq->state->hdrq_rhf_seq = rhf_seq + 1;
+
+ /* Now write the new rhf */
+ ips_writehdrq_write_rhf_atomic(write_hdr + writeq->hdrq_rhf_off,
+ write_rhf);
+ }
+
+ /* The tail must be updated regardless of HFI1_CAP_DMA_RTAIL
+ * since this tail is also used to keep track of where
+ * ips_writehdrq_append will write to next. For subcontexts there is
+ * no separate shadow copy of the tail. */
+ ips_recvq_tail_update(&writeq->hdrq, next_write_hdr_tail);
+
+ return result;
+}
+
+#endif /* _IPS_WRITEHDRQ_H */
diff --git a/ptl_ips/ipserror.c b/ptl_ips/ipserror.c
new file mode 100644
index 0000000..608b73e
--- /dev/null
+++ b/ptl_ips/ipserror.c
@@ -0,0 +1,200 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* IPS - Interconnect Protocol Stack */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include "ipserror.h"
+
+/*
+ * Map an IPS_RC_* / IPS_RCPERF_* return code to a human-readable string.
+ *
+ * Most codes return a string literal; IPS_RC_SYSERR and unknown codes
+ * are formatted into a static buffer.  NOTE: the static buffer makes
+ * those two paths non-reentrant/non-thread-safe; callers must not hold
+ * the returned pointer across another ips_err_str() call from a
+ * different thread.
+ */
+char *ips_err_str(int ips_error)
+{
+ static char err_str[128];
+
+ switch (ips_error) {
+ case IPS_RC_OK:
+ return "OK!";
+
+ case IPS_RC_ERROR:
+ return "general error";
+
+ case IPS_RC_PENDING:
+ return "request pending";
+
+ case IPS_RC_EXIST:
+ /* fixed message typo: "entry exist" */
+ return "entry exists";
+
+ case IPS_RC_MAX_ENTRIES_EXCEEDED:
+ return "max entries has been exceeded";
+
+ case IPS_RC_NOT_ENOUGH_BUFFERS:
+ return "not enough buffers to complete request";
+
+ case IPS_RC_NO_FREE_MEM:
+ return "no free memory";
+
+ case IPS_RC_NAME_LOOKUP_FAILED:
+ return "name lookup failed";
+
+ case IPS_RC_PARAM_ERROR:
+ return "invalid parameter";
+
+ case IPS_RC_UNKNOWN_DEVICE:
+ return "unknown device";
+
+ case IPS_RC_DEVICE_INIT_FAILED:
+ return "device init failed";
+
+ case IPS_RC_DATA_TRUNCATED:
+ return "data truncated";
+
+ case IPS_RC_INVALID_RANK:
+ return "invalid rank";
+
+ case IPS_RC_INVALID_OPCODE:
+ return "invalid op code";
+
+ case IPS_RC_PEER_NOT_READY:
+ return "peer is not ready";
+
+ case IPS_RC_PEER_CLOSED:
+ return "peer is closed";
+
+ case IPS_RC_DEST_EQUAL_LOCAL_RANK:
+ return "src and dest rank is equal";
+
+ case IPS_RC_DEVICE_ERROR:
+ return
+ "OPA hardware not found, hardware problem, or disabled";
+
+ case IPS_RC_NETWORK_DOWN:
+ return "The link is down";
+
+ case IPS_RC_NOT_ENOUGH_FREE_TIDS:
+ return "Not enough free TIDS to complete request";
+
+ case IPS_RC_NO_RESOURCE_AVAILABLE:
+ return "Internal resources exhausted";
+
+ case IPS_RC_HW_UPDATE_FAILED:
+ /* fixed message typo: "rendevous" */
+ return "Failed TID update for rendezvous, allocation problem";
+
+ case IPS_RC_PARTITION_ERROR:
+ return "One or more nodes is on a different partition";
+
+ case IPS_RC_RUN_ERROR:
+ return "One or more nodes is still running the previous job";
+
+ case IPS_RC_ALREADY_OPEN:
+ return "Open/init has already been called";
+
+ case IPS_RC_WAS_CLOSED:
+ return "Close has already been called";
+
+ case IPS_RC_DEST_EQUAL_LOCAL_LID:
+ return "src and dest LID is equal";
+
+ case IPS_RC_BUFFER_ALIGMENT_ERROR:
+ return "Buffer start address is not 32 bit aligned";
+
+ case IPS_RC_LENGTH_ALIGMENT_ERROR:
+ return "Buffer length is not a whole # of 32 bit words";
+
+ case IPS_RC_INVALID_DATA_LENGTH:
+ return "invalid data length";
+
+ case IPS_RC_BUSY:
+ return "Device is busy";
+
+ case IPS_RC_INIT_TIMEOUT_EXPIRED:
+ return "Could not connect to other nodes";
+
+ case IPS_RC_NO_PORTS_AVAILABLE:
+ return "All OPA ports are in use.";
+
+ /* Performance Counters codes */
+ case IPS_RCPERF_INIT_FAILED:
+ return "Initialization of performance counters failed";
+
+ case IPS_RCPERF_EVENT_SETUP_FAILED:
+ return "Setting performance counter events failed";
+
+ case IPS_RCPERF_REG_DEFAULT_SET:
+ return "Default event set for one of the counters";
+
+ case IPS_RCPERF_UNSUPPORTED_CPU:
+ return "This CPU type is not supported";
+
+ case IPS_RCPERF_REG_GET_FAILED:
+ return "Failed to get register value for event";
+
+ case IPS_RCPERF_SET_EVENT_STR_FAILED:
+ return "Failed to find event description";
+
+ case IPS_RCPERF_INVALID_REGISTER:
+ return "Register index out of range of available counters";
+
+ case IPS_RC_SYSERR: /* we hope errno hasn't changed since this was set... */
+ snprintf(err_str, sizeof(err_str), "System error: %s",
+ strerror(errno));
+ return err_str;
+
+ default:
+ snprintf(err_str, sizeof(err_str),
+ "Error code %i: <no interpretation>", ips_error);
+ return err_str;
+ }
+}
diff --git a/ptl_ips/ipserror.h b/ptl_ips/ipserror.h
new file mode 100644
index 0000000..685f617
--- /dev/null
+++ b/ptl_ips/ipserror.h
@@ -0,0 +1,122 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/*
+ * interface to OPA Interconnect Protocol Stack
+ *
+ * This file contains the function prototypes of the interconnect protocol
+ * stack. It should be included in all the clients of the stack, such as MPI.
+ */
+
+#ifndef ipserror_h
+#define ipserror_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Return codes */
+/* Success / generic / in-progress codes. */
+#define IPS_RC_OK 0
+#define IPS_RC_ERROR (-1)
+#define IPS_RC_PENDING (-2)
+#define IPS_RC_EXIST (-3)
+#define IPS_RC_MAX_ENTRIES_EXCEEDED (-4)
+/* Detailed failure codes (-100 and below); each has a matching string in
+ * ips_err_str(). NOTE(review): "ALIGMENT" (sic) is the historical spelling
+ * of these identifiers and is part of the public API — do not rename. */
+#define IPS_RC_NOT_ENOUGH_BUFFERS (-100)
+#define IPS_RC_NO_FREE_MEM (-101)
+#define IPS_RC_NAME_LOOKUP_FAILED (-102)
+#define IPS_RC_PARAM_ERROR (-103)
+#define IPS_RC_UNKNOWN_DEVICE (-104)
+#define IPS_RC_DEVICE_INIT_FAILED (-105)
+#define IPS_RC_DATA_TRUNCATED (-106)
+#define IPS_RC_INVALID_RANK (-107)
+#define IPS_RC_INVALID_OPCODE (-108)
+#define IPS_RC_PEER_NOT_READY (-109)
+#define IPS_RC_PEER_CLOSED (-110)
+#define IPS_RC_DEST_EQUAL_LOCAL_RANK (-111)
+#define IPS_RC_DEVICE_ERROR (-112)
+#define IPS_RC_NETWORK_DOWN (-113)
+#define IPS_RC_NOT_ENOUGH_FREE_TIDS (-114)
+#define IPS_RC_NO_RESOURCE_AVAILABLE (-115)
+#define IPS_RC_HW_UPDATE_FAILED (-116)
+#define IPS_RC_PARTITION_ERROR (-117)
+#define IPS_RC_RUN_ERROR (-118)
+#define IPS_RC_ALREADY_OPEN (-119)
+#define IPS_RC_WAS_CLOSED (-120)
+#define IPS_RC_DEST_EQUAL_LOCAL_LID (-121)
+#define IPS_RC_BUFFER_ALIGMENT_ERROR (-122)
+#define IPS_RC_LENGTH_ALIGMENT_ERROR (-123)
+#define IPS_RC_INVALID_DATA_LENGTH (-124)
+#define IPS_RC_BUSY (-125)
+#define IPS_RC_INIT_TIMEOUT_EXPIRED (-126)
+#define IPS_RC_NO_PORTS_AVAILABLE (-127)
+#define IPS_RC_TRANSFER_INCOMPLETE (-128)
+#define IPS_RC_SYSERR (-129) /* errno has meaning, if no further errors since this error */
+#define IPS_RC_STARTUP_ERR (-130)
+
+/* Performance Counters Error Codes */
+#define IPS_RCPERF_INIT_FAILED (-200)
+#define IPS_RCPERF_EVENT_SETUP_FAILED (-201)
+#define IPS_RCPERF_REG_DEFAULT_SET (-202)
+#define IPS_RCPERF_UNSUPPORTED_CPU (-203)
+#define IPS_RCPERF_REG_GET_FAILED (-204)
+#define IPS_RCPERF_SET_EVENT_STR_FAILED (-205)
+#define IPS_RCPERF_INVALID_REGISTER (-206)
+
+/* Map an IPS_RC_* / IPS_RCPERF_* code to a human-readable message.
+ * Returns a pointer to static storage; not thread-safe for the
+ * IPS_RC_SYSERR and unknown-code cases (shared static buffer). */
+char *ips_err_str(int);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* ipserror_h */
diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c
new file mode 100644
index 0000000..01a0c3f
--- /dev/null
+++ b/ptl_ips/ptl.c
@@ -0,0 +1,950 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file implements the PSM PTL for ips */
+#include "psm_user.h"
+#include "ptl_ips.h"
+#include "ipserror.h"
+
+int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250
+
+#define HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED 6
+#define HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED 2
+
+/*
+ * No-op subcontext callback: discard the event for the foreign subcontext
+ * and tell the receive-queue loop to keep draining.
+ */
+static int ips_subcontext_ignore(const struct ips_recvhdrq_event *rcv_ev,
+				 uint32_t subcontext)
+{
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Subcontext callback used when context sharing is active: forward a packet
+ * that belongs to another subcontext into that subcontext's write header
+ * queue; drop (and log) packets that are for us or out of range.
+ */
+static
+int
+ips_subcontext_process(const struct ips_recvhdrq_event *rcv_ev,
+ uint32_t subcontext)
+{
+ struct ptl_shared *recvshc = rcv_ev->proto->ptl->recvshc;
+ /* Forward only if the target is a different, valid subcontext. */
+ if_pt(subcontext != recvshc->subcontext &&
+ subcontext < recvshc->subcontext_cnt) {
+ return ips_writehdrq_append(&recvshc->writeq[subcontext],
+ rcv_ev);
+ }
+ else {
+ _HFI_VDBG
+ ("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n",
+ (int)subcontext, (int)recvshc->subcontext_cnt,
+ (int)recvshc->subcontext, (unsigned)rcv_ev->error_flags);
+ return IPS_RECVHDRQ_BREAK;
+ }
+}
+
+/*
+ * Fill in the header-queue (hdrq) and eager-queue (egrq) parameters for a
+ * receive queue, either from the real HW register mappings (is_shared_context
+ * == 0) or from the per-subcontext software mirror pages (is_shared_context
+ * != 0, indexed by 'subcontext').
+ */
+static
+void
+recvhdrq_hw_params(const psmi_context_t *context,
+ struct ips_recvq_params *hdrq,
+ struct ips_recvq_params *egrq,
+ int is_shared_context, int subcontext)
+{
+ const struct hfi1_ctxt_info *cinfo = &context->ctrl->ctxt_info;
+ const struct hfi1_base_info *binfo = &context->ctrl->base_info;
+
+ hdrq->elemcnt = cinfo->rcvhdrq_cnt;
+ /* dwords */
+ hdrq->elemsz = cinfo->rcvhdrq_entsize >> BYTE2DWORD_SHIFT;
+
+ egrq->elemcnt = cinfo->egrtids;
+ /* bytes */
+ egrq->elemsz = cinfo->rcvegr_size;
+
+ if (!is_shared_context) {
+ volatile uint64_t *uregbase = /* HW registers */
+ (volatile uint64_t *)(uintptr_t) binfo->user_regbase;
+
+ hdrq->base_addr =
+ (uint32_t *) (uintptr_t) binfo->rcvhdr_bufbase;
+ hdrq->head_register =
+ (volatile __le64 *)&uregbase[ur_rcvhdrhead];
+ hdrq->tail_register =
+ (volatile __le64 *)(uintptr_t) binfo->rcvhdrtail_base;
+
+ egrq->base_addr = (void *)(uintptr_t) binfo->rcvegr_bufbase;
+ egrq->head_register =
+ (volatile __le64 *)&uregbase[ur_rcvegrindexhead];
+ egrq->tail_register =
+ (volatile __le64 *)&uregbase[ur_rcvegrindextail];
+ } else {
+ /* Subcontexts mimic the HW registers but use different addresses
+ * to avoid cache contention. */
+ volatile uint64_t *subcontext_uregbase;
+ uint32_t *rcv_hdr, *rcv_egr;
+ unsigned hdrsize, egrsize;
+ unsigned pagesize = getpagesize();
+ /* i is the page-alignment mask used to round sizes up below. */
+ unsigned i = pagesize - 1;
+
+ /* Round each region up to a whole number of pages. */
+ hdrsize =
+ (cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize + i) & ~i;
+ egrsize =
+ (cinfo->egrtids * cinfo->rcvegr_size + i) & ~i;
+
+ subcontext_uregbase = (uint64_t *)
+ (((uintptr_t) binfo->subctxt_uregbase) +
+ (sizeof(struct ips_subcontext_ureg) * subcontext));
+ rcv_hdr = (uint32_t *)
+ (((uintptr_t) binfo->subctxt_rcvhdrbuf +
+ (hdrsize * subcontext)));
+ rcv_egr = (uint32_t *)
+ (((uintptr_t) binfo->subctxt_rcvegrbuf +
+ (egrsize * subcontext)));
+
+ hdrq->base_addr = rcv_hdr;
+ /* NOTE(review): the '* 8' stride presumably spaces the mirrored
+ * registers one cacheline apart (8 x uint64_t = 64 bytes) to avoid
+ * false sharing — confirm against struct ips_subcontext_ureg. */
+ hdrq->head_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvhdrhead * 8];
+ hdrq->tail_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvhdrtail * 8];
+
+ egrq->base_addr = rcv_egr;
+ egrq->head_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvegrindexhead *
+ 8];
+ egrq->tail_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvegrindextail *
+ 8];
+ }
+}
+
+static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context);
+static psm2_error_t shrecvq_fini(ptl_t *ptl);
+
+/* Size a caller must allocate for one ips PTL instance (vtable hook). */
+static size_t ips_ptl_sizeof(void)
+{
+	return sizeof(ptl_t);
+}
+
+/* Number of uint64_t counters exported per endpoint address. */
+static int ips_ptl_epaddr_stats_num(void)
+{
+	return (int)(sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t));
+}
+
+/*
+ * Populate the per-epaddr statistics descriptions and reduction flags for
+ * mpspawn. Returns the number of counters.
+ *
+ * NOTE(review): exactly 10 descriptions are hard-coded below; this assumes
+ * struct ips_proto_epaddr_stats holds exactly 10 uint64_t counters — if a
+ * counter is added there, desc[] entries beyond index 9 stay uninitialized.
+ * TODO confirm against the struct definition.
+ */
+static
+int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags)
+{
+ int num_stats =
+ sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+ int i;
+
+ /* All stats are uint64_t */
+ for (i = 0; i < num_stats; i++)
+ flags[i] = MPSPAWN_STATS_REDUCTION_ALL |
+ MPSPAWN_STATS_SKIP_IF_ZERO;
+
+ desc[0] = "errchecks sent";
+ desc[1] = "errchecks recv";
+ desc[2] = "naks sent";
+ desc[3] = "naks recv";
+ desc[4] = "connect reqs sent";
+ desc[5] = "disconnect reqs sent";
+ desc[6] = "tid grants sent";
+ desc[7] = "tid grants recv";
+ desc[8] = "send rexmit";
+ desc[9] = "congestion packets";
+
+ return num_stats;
+}
+
+/*
+ * Snapshot the endpoint's protocol counters into stats_o.
+ * Returns the number of uint64_t counters copied.
+ */
+int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o)
+{
+	const int num_stats =
+	    sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+
+	/* The stats struct is laid out as a flat array of uint64_t. */
+	memcpy(stats_o, &epaddr->proto->epaddr_stats,
+	       num_stats * sizeof(uint64_t));
+
+	return num_stats;
+}
+
+/*
+ * Periodic timer callback: re-check the context/link status, let the PIO
+ * layer process pending events, then re-arm this timer for the next
+ * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS interval.
+ */
+static
+psm2_error_t
+psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current)
+{
+ struct ptl *ptl = (struct ptl *)t->context;
+ const uint64_t current_count = get_cycles();
+ psm2_error_t err;
+
+ err = psmi_context_check_status(ptl->context);
+ /* Only poke the spio event handler when the context itself is healthy. */
+ if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS)
+ err = ips_spio_process_events(ptl);
+
+ /* Unconditionally re-arm; this timer runs for the lifetime of the ptl. */
+ psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+ current_count + ptl->status_cyc_timeout);
+
+ return err;
+}
+
+/*
+ * Check whether a message whose size is not a multiple of a double word may
+ * be passed to the driver for SDMA. Starting with driver version 6.2, PSM
+ * may hand the driver an SDMA message whose size is not a double-word
+ * multiple.
+ */
+ustatic
+void ips_ptl_non_dw_mul_sdma_init(void)
+{
+	const uint16_t major = hfi_get_user_major_version();
+	const uint16_t minor = hfi_get_user_minor_version();
+	int allowed;
+
+	/* Non-double-word-multiple SDMA sizes are supported by driver
+	 * user-interface versions >= 6.2. */
+	allowed = (major > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) ||
+	    ((major == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) &&
+	     (minor >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED));
+
+	ips_proto_mq_set_non_dw_mul_sdma(allowed ? IPS_NON_DW_MUL_ALLOWED
+						 : IPS_NON_DW_MUL_NOT_ALLOWED);
+}
+
+/*
+ * Initialize the ips PTL for endpoint 'ep': fill in the control vtable
+ * 'ctl', start the status timer and timer queue, set up connection state,
+ * PIO send, the ips protocol, the receive queues (hardware, or software
+ * mirrors when context sharing is enabled), and the receive thread.
+ *
+ * Returns PSM2_OK on success, or the first sub-initializer's error.
+ * NOTE(review): on a mid-sequence failure, already-initialized pieces
+ * (timerq, epstate, recvshc allocation, ...) are not torn down here —
+ * presumably the caller finalizes via ips_ptl_fini; confirm.
+ */
+static
+psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t num_of_send_bufs = ep->hfi_num_sendbufs;
+ uint32_t num_of_send_desc = ep->hfi_num_descriptors;
+ uint32_t imm_size = ep->hfi_imm_size;
+ const psmi_context_t *context = &ep->context;
+ const struct hfi1_user_info_dep *user_info = &context->user_info;
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ const int enable_shcontexts = (user_info->subctxt_cnt > 0);
+ const uint64_t current_count = get_cycles();
+
+ /* Preconditions */
+ psmi_assert_always(ep != NULL);
+ psmi_assert_always(ep->epaddr != NULL);
+ psmi_assert_always(ep->epid != 0);
+ psmi_assert_always(ep->hfi_num_sendbufs > 0);
+
+ memset(ptl, 0, sizeof(struct ptl));
+
+ ptl->ep = ep; /* back pointer */
+ ptl->epid = ep->epid; /* cache epid */
+ ptl->epaddr = ep->epaddr; /* cache a copy */
+ ptl->ctl = ctl;
+ ptl->context = context;
+ ptl->runtime_flags = context->runtime_flags;
+
+ memset(ctl, 0, sizeof(*ctl));
+ /* Fill in the control structure */
+ ctl->ep = ep;
+ ctl->ptl = ptl;
+ /* Shared contexts need the poll variant that also drains the software
+ * subcontext queue. */
+ ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll;
+ ctl->ep_connect = ips_ptl_connect;
+ ctl->ep_disconnect = ips_ptl_disconnect;
+ ctl->mq_send = ips_proto_mq_send;
+ ctl->mq_isend = ips_proto_mq_isend;
+
+ ctl->am_get_parameters = ips_am_get_parameters;
+
+ ctl->am_short_request = ips_am_short_request;
+ ctl->am_short_reply = ips_am_short_reply;
+
+ ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num;
+ ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init;
+ ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get;
+
+ /*
+ * Runtime flags in 'ptl' are different from runtime flags in 'context'.
+ * In 'context', runtime flags reflect what the driver is capable of.
+ * In 'ptl', runtime flags reflect the features we can or want to use in
+ * the driver's supported runtime flags.
+ */
+
+ /*
+ * This timer is to be used to check the context's status at every
+ * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when
+ * the link transitions from the DOWN state to the UP state. We can thus
+ * stop aggregating link failure messages once we detect that the link is
+ * up.
+ */
+ psmi_timer_entry_init(&ptl->status_timer,
+ psmi_context_check_status_callback, ptl);
+
+ /* cache the context's status timeout in cycles */
+ ptl->status_cyc_timeout =
+ ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS);
+
+ /*
+ * Retransmissions and pending operations are kept in a timer structure
+ * (queue). The timerq is shared to various internal IPS interfaces so
+ * that they too may schedule events on the timer queue. The timerq is
+ * drained in the progress function.
+ */
+ if ((err = psmi_timer_init(&ptl->timerq)))
+ goto fail;
+
+ /* start the context's status timer */
+ psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+ current_count + ptl->status_cyc_timeout);
+
+ /*
+ * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings
+ * are added/removed by the connect portion of the ips protocol and lookup
+ * is made by the receive queue processing component.
+ */
+ if ((err = ips_epstate_init(&ptl->epstate, context)))
+ goto fail;
+
+ ips_ptl_non_dw_mul_sdma_init();
+ /*
+ * Context sharing, setup subcontext ureg page.
+ */
+ if (enable_shcontexts) {
+ struct ptl_shared *recvshc;
+
+ recvshc = (struct ptl_shared *)
+ psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ptl_shared));
+ if (recvshc == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ ptl->recvshc = recvshc;
+ recvshc->ptl = ptl;
+
+ /* Initialize recvshc fields */
+ recvshc->context = ctxt_info->ctxt;
+ recvshc->subcontext = ctxt_info->subctxt;
+ recvshc->subcontext_cnt = user_info->subctxt_cnt;
+ psmi_assert_always(recvshc->subcontext_cnt <=
+ HFI1_MAX_SHARED_CTXTS);
+ psmi_assert_always(recvshc->subcontext <
+ recvshc->subcontext_cnt);
+
+ /*
+ * Using ep->context to avoid const modifier since this function
+ * will modify the content in ep->context.
+ */
+ if ((err = ips_subcontext_ureg_get(ptl, recvshc->subcontext_cnt,
+ &ep->context,
+ recvshc->subcontext_ureg)))
+ goto fail;
+
+ memset(recvshc->subcontext_ureg[recvshc->subcontext], 0,
+ sizeof(struct ips_subcontext_ureg));
+ recvshc->context_lock = &recvshc->hwcontext_ctrl->context_lock;
+ /* Only the first (master) subcontext initializes the shared,
+ * process-shared spin lock; the others just reference it. */
+ if (recvshc->subcontext == 0) {
+ if (pthread_spin_init(recvshc->context_lock,
+ PTHREAD_PROCESS_SHARED) != 0) {
+ err =
+ psmi_handle_error(ptl->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Couldn't initialize process-shared spin lock");
+ goto fail;
+ }
+ }
+ }
+
+ /*
+ * Hardware send pio used by eager and control messages.
+ */
+ if ((err = ips_spio_init(context, ptl, &ptl->spioc)))
+ goto fail;
+
+ /*
+ * Actual ips protocol handling.
+ */
+ if ((err =
+ ips_proto_init(context, ptl, num_of_send_bufs, num_of_send_desc,
+ imm_size, &ptl->timerq, &ptl->epstate, &ptl->spioc,
+ &ptl->proto)))
+ goto fail;
+
+ /*
+ * Hardware receive hdr/egr queue, services incoming packets and issues
+ * callbacks for protocol handling in proto_recv. It uses the epstate
+ * interface to determine if a packet is known or unknown.
+ */
+ if (!enable_shcontexts) {
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+ struct ips_recvq_params hdrq, egrq;
+ recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0);
+ recvq_callbacks.callback_packet_unknown =
+ ips_proto_process_unknown;
+ recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+ recvq_callbacks.callback_error = ips_proto_process_packet_error;
+ if ((err =
+ ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, 0, &ptl->recvq,
+ &ptl->recvq_state)))
+ goto fail;
+ }
+
+ /*
+ * Software receive hdr/egr queue, used in shared contexts.
+ */
+ else if ((err = shrecvq_init(ptl, context)))
+ goto fail;
+
+ /*
+ * Receive thread, always initialized but does not necessarily create a
+ * pthread.
+ */
+ if ((err = ips_ptl_rcvthread_init(ptl, &ptl->recvq)))
+ goto fail;
+fail:
+ return err;
+}
+
+/*
+ * Tear down the ips PTL in the reverse order of ips_ptl_init: protocol
+ * first, then the receive thread, connection state, PIO, timers, and
+ * finally the receive queue(s). Stops at (and returns) the first error.
+ */
+static psm2_error_t ips_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_in)
+{
+ const struct hfi1_user_info_dep *user_info = &ptl->context->user_info;
+ const int enable_shcontexts = (user_info->subctxt_cnt > 0);
+ psm2_error_t err = PSM2_OK;
+
+ if ((err = ips_proto_fini(&ptl->proto, force, timeout_in)))
+ goto fail;
+
+ /* We have to cancel the thread after terminating the protocol because
+ * connect/disconnect packets use interrupts and the kernel doesn't
+ * like to have no pollers waiting */
+ if ((err = ips_ptl_rcvthread_fini(ptl)))
+ goto fail;
+
+ if ((err = ips_epstate_fini(&ptl->epstate)))
+ goto fail;
+
+ if ((err = ips_spio_fini(&ptl->spioc)))
+ goto fail;
+
+ if ((err = psmi_timer_fini(&ptl->timerq)))
+ goto fail;
+
+ /* Non-shared contexts own ptl->recvq directly; shared contexts tear
+ * down both queues plus the write queues via shrecvq_fini(). */
+ if (!enable_shcontexts && (err = ips_recvhdrq_fini(&ptl->recvq)))
+ goto fail;
+
+ if (enable_shcontexts && (err = shrecvq_fini(ptl)))
+ goto fail;
+
+fail:
+ return err;
+}
+
+/*
+ * Shared implementation behind ips_ptl_setopt/ips_ptl_getopt.
+ *
+ * core_obj - psm2_epaddr_t for PSM2_IB_OPT_EP_SL, psm2_ep_t for
+ *            PSM2_IB_OPT_DF_SL (cast depends on optname).
+ * optname  - PSM2_IB_OPT_* selector.
+ * optval   - in/out option value buffer (a single uint8_t SL).
+ * optlen   - in/out option value length; on a too-short buffer it is
+ *            rewritten with the required length.
+ * get      - non-zero for a get operation, zero for set.
+ *
+ * Returns PSM2_OK or a PSM2_PARAM_ERR raised via psmi_handle_error().
+ */
+static
+psm2_error_t
+ips_ptl_optctl(const void *core_obj, int optname,
+	       void *optval, uint64_t *optlen, int get)
+{
+	psm2_error_t err = PSM2_OK;
+
+	switch (optname) {
+	case PSM2_IB_OPT_EP_SL:
+		{
+			/* Core object is psm2_epaddr */
+			psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+			ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+
+			/* If endpoint does not use IB ignore for set, complain for get */
+			if (epaddr->ptlctl->ep_connect != ips_ptl_connect) {
+				if (get)
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid EP transport");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				/* BUGFIX: report the required length of the
+				 * option value (uint8_t), consistent with the
+				 * DF_SL case below; this previously reported
+				 * sizeof(unsigned). */
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				/* Get returns the SL for the PIO flow */
+				*((uint8_t *) optval) =
+				    (uint8_t) ipsaddr->
+				    flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid SL value %u. %d<= SL <=%d.",
+							      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				/* Set new SL for all flows */
+				ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path->
+				    pr_sl = new_sl;
+				ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].path->
+				    pr_sl = new_sl;
+			}
+		}
+		break;
+	case PSM2_IB_OPT_DF_SL:
+		{
+			/* Set default SL to be used by an endpoint for all communication */
+			/* Core object is psm2_ep */
+			psm2_ep_t ep = (psm2_ep_t) core_obj;
+
+			/* Make sure ep is specified */
+			if (!ep) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Invalid PSM Endpoint");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				*((uint8_t *) optval) =
+				    ep->ptl_ips.ptl->proto.epinfo.ep_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid SL value %u. %d<= SL <=%d.",
+							      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				ep->ptl_ips.ptl->proto.epinfo.ep_sl =
+				    (uint8_t) new_sl;
+			}
+		}
+		break;
+	default:
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Unknown PSM2_IB option %u.", optname);
+	}
+
+exit_fn:
+	return err;
+}
+
+/* Set a PSM2_IB_OPT_* option; thin wrapper over ips_ptl_optctl(get=0). */
+static
+psm2_error_t
+ips_ptl_setopt(const void *component_obj, int optname,
+	       const void *optval, uint64_t optlen)
+{
+	uint64_t len = optlen;
+
+	/* optctl needs a mutable value pointer and length even for a set. */
+	return ips_ptl_optctl(component_obj, optname, (void *)optval, &len, 0);
+}
+
+/* Get a PSM2_IB_OPT_* option; thin wrapper over ips_ptl_optctl(get=1). */
+static
+psm2_error_t
+ips_ptl_getopt(const void *component_obj, int optname,
+	       void *optval, uint64_t *optlen)
+{
+	const int do_get = 1;
+
+	return ips_ptl_optctl(component_obj, optname, optval, optlen, do_get);
+}
+
+/*
+ * Progress function for a non-shared context: drain the hardware receive
+ * queue, then service expired timers. Locking is only needed when the
+ * receive thread may also touch the queue (PSMI_RUNTIME_RCVTHREAD).
+ */
+psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored)
+{
+ const uint64_t current_count = get_cycles();
+ const int do_lock = PSMI_LOCK_DISABLED &&
+ (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+
+ if (!ips_recvhdrq_isempty(&ptl->recvq)) {
+ /* If someone else already holds the queue, report no progress
+ * rather than blocking. */
+ if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq))
+ return err;
+ if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&ptl->recvq);
+ }
+
+ err = ips_recvhdrq_progress(&ptl->recvq);
+ if (do_lock)
+ ips_recvhdrq_unlock(&ptl->recvq);
+ /* Anything worse than "no progress" is an error; bail out. */
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+ err2 =
+ psmi_timer_process_if_expired(&(ptl->timerq),
+ current_count);
+ if (err2 != PSM2_OK_NO_PROGRESS)
+ return err2;
+ else
+ return err;
+ }
+
+ /*
+ * Process timer expirations after servicing receive queues (some packets
+ * may have been acked, some requests-to-send may have been queued).
+ *
+ * It's safe to look at the timer without holding the lock because it's not
+ * incorrect to be wrong some of the time.
+ */
+ if (psmi_timer_is_expired(&(ptl->timerq), current_count)) {
+ if (do_lock)
+ ips_recvhdrq_lock(&ptl->recvq);
+ err = psmi_timer_process_expired(&(ptl->timerq), current_count);
+ if (do_lock)
+ ips_recvhdrq_unlock(&ptl->recvq);
+ }
+
+ return err;
+}
+
+/* Non-blocking acquire of the process-shared context spin lock;
+ * returns 0 on success (pthread_spin_trylock semantics). */
+PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc))
+{
+ return pthread_spin_trylock(recvshc->context_lock);
+}
+
+/* Blocking acquire of the process-shared context spin lock. */
+PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc))
+{
+ pthread_spin_lock(recvshc->context_lock);
+}
+
+/* Release the process-shared context spin lock. */
+PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc))
+{
+ pthread_spin_unlock(recvshc->context_lock);
+}
+
+/*
+ * Progress function for a shared context: drain our software subcontext
+ * queue first, opportunistically drain the shared hardware queue under the
+ * process-shared lock, then service expired timers.
+ */
+psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored)
+{
+ const uint64_t current_count = get_cycles();
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+ struct ptl_shared *recvshc = ptl->recvshc;
+ psmi_assert(recvshc != NULL);
+
+ /* The following header queue checks are speculative (but safe)
+ * until this process has acquired the lock. The idea is to
+ * minimize lock contention due to processes spinning on the
+ * shared context. */
+ if (ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (!ips_recvhdrq_isempty(&ptl->recvq) &&
+ ips_try_lock_shared_context(recvshc) == 0) {
+ /* check that subcontext is empty while under lock to avoid
+ * re-ordering of incoming packets (since packets from
+ * hardware context will be processed immediately). */
+ if_pt(ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&ptl->recvq);
+ }
+
+ err = ips_recvhdrq_progress(&ptl->recvq);
+ }
+ ips_unlock_shared_context(recvshc);
+ }
+ }
+
+ /* Anything worse than "no progress" is an error; bail out. */
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ /* Drain packets that other subcontexts forwarded into our software
+ * queue (no lock needed: only we consume it). */
+ if (!ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (recvshc->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&recvshc->recvq);
+ }
+
+ err2 = ips_recvhdrq_progress(&recvshc->recvq);
+ if (err2 != PSM2_OK_NO_PROGRESS) {
+ err = err2;
+ }
+ }
+
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ /*
+ * Process timer expirations after servicing receive queues (some packets
+ * may have been acked, some requests-to-send may have been queued).
+ */
+ err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count);
+ if (err2 != PSM2_OK_NO_PROGRESS)
+ err = err2;
+
+ return err;
+}
+
+/*
+ * Return non-zero only when every receive queue is empty: the software
+ * subcontext queue (when context sharing is active) and the hardware queue.
+ */
+int ips_ptl_recvq_isempty(const ptl_t *ptl)
+{
+	struct ptl_shared *shc = ptl->recvshc;
+
+	if (shc != NULL && !ips_recvhdrq_isempty(&shc->recvq))
+		return 0;
+
+	return ips_recvhdrq_isempty(&ptl->recvq);
+}
+
+/*
+ * Legacy ips_get_stat entry point, retained for compatibility:
+ * reports all-zero session statistics and always succeeds.
+ */
+int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	return 0;
+}
+
+/*
+ * Set up all receive-side queues for a shared context:
+ *  - the shared hardware queue (ptl->recvq), whose subcontext callback
+ *    forwards foreign packets,
+ *  - our private software queue (recvshc->recvq), and
+ *  - one write header queue per subcontext for forwarding.
+ */
+static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context)
+{
+ struct ptl_shared *recvshc = ptl->recvshc;
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+ struct ips_recvq_params hdrq, egrq;
+ psm2_error_t err = PSM2_OK;
+ int i;
+
+ /* Initialize (shared) hardware context recvq (ptl->recvq) */
+ /* NOTE: uses recvq in ptl structure for shared h/w context */
+ recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0);
+ recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown;
+ recvq_callbacks.callback_subcontext = ips_subcontext_process;
+ recvq_callbacks.callback_error = ips_proto_process_packet_error;
+ if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, recvshc->subcontext,
+ &ptl->recvq,
+ &recvshc->hwcontext_ctrl->recvq_state))) {
+ goto fail;
+ }
+
+ /* Initialize software subcontext (recvshc->recvq). Subcontexts do */
+ /* not require the rcvhdr copy feature. */
+ recvhdrq_hw_params(context, &hdrq, &egrq, 1, recvshc->subcontext);
+ recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+ if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, recvshc->subcontext,
+ &recvshc->recvq, &recvshc->recvq_state))) {
+ goto fail;
+ }
+
+ /* Initialize each recvshc->writeq for shared contexts */
+ for (i = 0; i < recvshc->subcontext_cnt; i++) {
+ recvhdrq_hw_params(context, &hdrq, &egrq, 1, i);
+ if ((err = ips_writehdrq_init(context, &hdrq, &egrq,
+ &recvshc->writeq[i],
+ &recvshc->subcontext_ureg[i]->
+ writeq_state,
+ ptl->runtime_flags))) {
+ goto fail;
+ }
+ }
+
+ if (err == PSM2_OK)
+ _HFI_DBG
+ ("Context sharing in use: lid %d, context %d, sub-context %d\n",
+ (int)psm2_epid_nid(ptl->epid), recvshc->context,
+ recvshc->subcontext);
+fail:
+ return err;
+}
+
+/*
+ * Tear down the shared-context receive queues created by shrecvq_init and
+ * free the ptl_shared block.
+ * NOTE(review): ptl->recvshc is freed but not set to NULL, and an early
+ * error return leaks it — callers must not touch recvshc afterwards.
+ */
+static psm2_error_t shrecvq_fini(ptl_t *ptl)
+{
+ psm2_error_t err = PSM2_OK;
+ int i;
+
+ /* disable my write header queue before deallocation */
+ i = ptl->recvshc->subcontext;
+ ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0;
+
+ if ((err = ips_recvhdrq_fini(&ptl->recvq)))
+ goto fail;
+
+ if ((err = ips_recvhdrq_fini(&ptl->recvshc->recvq)))
+ goto fail;
+
+ for (i = 0; i < ptl->recvshc->subcontext_cnt; i++) {
+ if ((err = ips_writehdrq_fini(&ptl->recvshc->writeq[i]))) {
+ goto fail;
+ }
+ }
+
+ psmi_free(ptl->recvshc);
+
+fail:
+ return err;
+}
+
+/*
+ * Connect 'numep' remote endpoints on this PTL, then replicate the
+ * connections on every additional local multi-context endpoint (the
+ * mctxt ring hanging off ptl->ep).
+ *
+ * array_of_epid/array_of_epid_mask/array_of_errors/array_of_epaddr follow
+ * the usual PSM connect conventions; timeout_in bounds each connect pass.
+ * Returns PSM2_OK, the first protocol-connect error, or PSM2_NO_MEMORY.
+ */
+psm2_error_t
+ips_ptl_connect(ptl_t *ptl, int numep, const psm2_epid_t *array_of_epid,
+		const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+	psm2_error_t err;
+	psm2_ep_t ep;
+	psm2_epid_t *epid_array = NULL;
+	psm2_error_t *error_array = NULL;
+	psm2_epaddr_t *epaddr_array = NULL;
+	ips_epaddr_t *ipsaddr_master, *ipsaddr;
+	int *mask_array = NULL;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+	err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+				array_of_epid_mask, array_of_errors,
+				array_of_epaddr, timeout_in);
+	if (err)
+		return err;
+
+	psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+	/* Single-context endpoint: nothing further to connect. */
+	if (ptl->ep->mctxt_next == ptl->ep)
+		return err;
+
+	/* make the additional multi-context connections. */
+	epid_array = (psm2_epid_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep);
+	mask_array = (int *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep);
+	error_array = (psm2_error_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep);
+	epaddr_array = (psm2_epaddr_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep);
+	if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+		/* BUGFIX: previously fell through with err == PSM2_OK, so an
+		 * allocation failure was silently reported as success. */
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	ep = ptl->ep->mctxt_next;
+	while (ep != ep->mctxt_master) {
+
+		/* Setup the mask array and epid array. */
+		for (i = 0; i < numep; i++) {
+			if (array_of_epid_mask[i]
+			    && array_of_errors[i] == PSM2_OK) {
+				ipsaddr_master =
+				    (ips_epaddr_t *) array_of_epaddr[i];
+				ipsaddr = ipsaddr_master->next;
+				mask_array[i] = 0;
+				/* Walk the ipsaddr ring to find the address
+				 * belonging to this local context 'ep'. */
+				while (ipsaddr != ipsaddr_master) {
+					if (((psm2_epaddr_t) ipsaddr)->proto->
+					    ep == ep) {
+						mask_array[i] = 1;
+						epid_array[i] =
+						    ((psm2_epaddr_t) ipsaddr)->
+						    epid;
+						break;
+					}
+					ipsaddr = ipsaddr->next;
+				}
+			} else {
+				mask_array[i] = 0;
+			}
+		}
+
+		/* Make the real protocol connections. */
+		err =
+		    ips_proto_connect(&ep->ptl_ips.ptl->proto, numep,
+				      epid_array, mask_array, error_array,
+				      epaddr_array, timeout_in);
+		if (err)
+			goto fail;
+
+		ep = ep->mctxt_next;
+	}
+
+fail:
+	/* psmi_free(NULL) is not guaranteed safe, so free only what was
+	 * actually allocated. */
+	if (epid_array)
+		psmi_free(epid_array);
+	if (mask_array)
+		psmi_free(mask_array);
+	if (error_array)
+		psmi_free(error_array);
+	if (epaddr_array)
+		psmi_free(epaddr_array);
+
+	return err;
+}
+
+/*
+ * Disconnect 'numep' remote endpoints on this PTL; a straight pass-through
+ * to the ips protocol layer. Caller must hold the MQ progress lock.
+ */
+psm2_error_t
+ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	return ips_proto_disconnect(&ptl->proto, force, numep,
+				    array_of_epaddr, array_of_epaddr_mask,
+				    array_of_errors, timeout_in);
+}
+
+/* Only symbol we expose out of here */
+/* Entry-point vtable for the ips PTL: sizeof, init, fini, setopt, getopt
+ * (order must match struct ptl_ctl_init). */
+struct ptl_ctl_init
+psmi_ptl_ips = {
+ ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt,
+ ips_ptl_getopt
+};
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
new file mode 100644
index 0000000..d2a903a
--- /dev/null
+++ b/ptl_ips/ptl_fwd.h
@@ -0,0 +1,65 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_IPS_H
+#define _PTL_FWD_IPS_H
+#include "ptl.h"
+
+/* Opaque forward typedefs so users of the ips ptl need not pull in the
+ * full ips protocol headers. */
+typedef struct ips_epaddr ips_epaddr_t;
+typedef struct ips_msgctl ips_msgctl_t;
+
+/* Symbol in ips ptl; defined (with its initializer) in ptl_ips/ptl.c.
+ * Declared extern here: a plain object declaration in a header creates a
+ * tentative definition in every translation unit that includes it, which
+ * fails to link with -fno-common (the default since GCC 10). */
+extern struct ptl_ctl_init psmi_ptl_ips;
+#endif /* _PTL_FWD_IPS_H */
diff --git a/ptl_ips/ptl_ips.h b/ptl_ips/ptl_ips.h
new file mode 100644
index 0000000..56adaf7
--- /dev/null
+++ b/ptl_ips/ptl_ips.h
@@ -0,0 +1,194 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PTL_H
+#define _IPS_PTL_H
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+#include "ips_proto_params.h"
+#include "ips_proto.h"
+#include "ips_spio.h"
+#include "ips_recvhdrq.h"
+#include "ips_writehdrq.h"
+#include "ips_epstate.h"
+#include "ips_stats.h"
+#include "ips_subcontext.h"
+
+struct ptl_shared;
+
+/*
+ * PTL at the ips level (for OPA)
+ *
+ * This PTL structure glues all the ips components together.
+ *
+ * * ips timer, shared by various components, allows each component to
+ * schedule time-based expiration callbacks on the timerq.
+ * * HW receive queue
+ * * send control block to handle eager messages
+ * * instantiation of the ips protocol
+ * * endpoint state, to map endpoint indexes into structures
+ *
+ * Receive-side
+ *
+ * ----[ proto ]
+ * / ^ ^
+ * | | |
+ * | packet packet
+ * | known unknown
+ * add_endpt \ /
+ * | |
+ * `----> [epstate]
+ * ^
+ * |
+ * lookup_endpt
+ * |
+ * [recvq]
+ * |
+ * poll
+ *
+ */
+/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h
+ * (NOTE(review): no PTL_IPS_SIZE is visible in ptl_fwd.h in this version;
+ * confirm whether ips_ptl_sizeof() is the authoritative size instead). */
+/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */
+struct ptl {
+	psm2_ep_t ep;		/* back ptr */
+	psm2_epid_t epid;	/* cached from ep */
+	psm2_epaddr_t epaddr;	/* cached from ep */
+	ips_epaddr_t *ipsaddr;	/* cached from epaddr */
+	ptl_ctl_t *ctl;		/* cached from init */
+	const psmi_context_t *context;	/* cached from init */
+
+	struct ips_spio spioc;	/* PIO send control */
+	struct ips_proto proto;	/* protocol instance: timerq, epstate, spio */
+
+	/* Receive header queue and receive queue processing */
+	uint32_t runtime_flags;
+	struct psmi_timer_ctrl timerq;
+	struct ips_epstate epstate;	/* map incoming packets */
+	struct ips_recvhdrq_state recvq_state;
+	struct ips_recvhdrq recvq;	/* HW recvq: epstate, proto */
+
+	/* timer to check the context's status */
+	struct psmi_timer status_timer;
+
+	/* context's status check timeout in cycles -- cached */
+	uint64_t status_cyc_timeout;
+
+	/* Shared contexts context; NULL unless subcontext sharing is in use */
+	struct ptl_shared *recvshc;
+
+	/* Rcv thread context; allocated by ips_ptl_rcvthread_init */
+	struct ptl_rcvthread *rcvthread;
+}
+#ifndef PACK_STRUCT_STL
+#define PACK_STRUCT_STL /* nothing */
+#endif
+	__attribute__ ((PACK_STRUCT_STL aligned(16)));
+
+/*
+ * Sample implementation of shared contexts context.
+ *
+ * In shared mode, the hardware queue is serviced by more than one process.
+ * Each process also mirrors the hardware queue in software (represented by an
+ * ips_recvhdrq). For packets we service in the hardware queue that are not
+ * destined for us, we write them in other processes' receive queues
+ * (represented by an ips_writehdrq).
+ *
+ */
+struct ptl_shared {
+	ptl_t *ptl;		/* backptr to main ptl */
+	uint32_t context;
+	uint32_t subcontext;
+	uint32_t subcontext_cnt;
+
+	pthread_spinlock_t *context_lock;
+	struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS];
+	struct ips_hwcontext_ctrl *hwcontext_ctrl;
+	struct ips_recvhdrq recvq;	/* subcontext receive queue */
+	struct ips_recvhdrq_state recvq_state;	/* subcontext receive queue state */
+	struct ips_writehdrq writeq[HFI1_MAX_SHARED_CTXTS];	/* peer subcontexts */
+};
+
+/*
+ * Connect/disconnect are wrappers around psm proto's connect/disconnect,
+ * mostly to abstract away PSM-specific stuff from ips internal structures.
+ * Both take parallel arrays of numep entries, consult the mask array to
+ * select which entries to act on, and report per-endpoint status in
+ * array_of_errors.
+ */
+psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep,
+			     const psm2_epid_t *array_of_epid,
+			     const int *array_of_epid_mask,
+			     psm2_error_t *array_of_errors,
+			     psm2_epaddr_t *array_of_epaddr,
+			     uint64_t timeout_in);
+
+psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+				psm2_epaddr_t array_of_epaddr[],
+				const int array_of_epaddr_mask[],
+				psm2_error_t array_of_errors[],
+				uint64_t timeout_in);
+
+/*
+ * Generic Poll function for ips-level ptl
+ */
+psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Support for receive thread (see ptl_rcvthread.c)
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq);
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl);
+
+#endif /* _IPS_PTL_H */
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
new file mode 100644
index 0000000..527e113
--- /dev/null
+++ b/ptl_ips/ptl_rcvthread.c
@@ -0,0 +1,506 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/poll.h>
+
+#include "ptl_ips.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+#include "psm_mq_internal.h"
+#include "psm_user.h"
+
+/* All in milliseconds */
+#define RCVTHREAD_TO_MIN_FREQ 10 /* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ 100 /* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT 1
+
+struct ptl_rcvthread;
+
+static void *ips_ptl_pollintr(void *recvthreadc);
+static psm2_error_t rcvthread_initstats(ptl_t *ptl);
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc);
+
+/* Per-ptl receive-thread state; allocated in ips_ptl_rcvthread_init and
+ * released in ips_ptl_rcvthread_fini. */
+struct ptl_rcvthread {
+	const psmi_context_t *context;
+	const ptl_t *ptl;
+	struct ips_recvhdrq *recvq;
+
+	pthread_t hdrq_threadid;
+	uint64_t t_start_cyc;
+	int pipefd[2];		/* [0] read end (poll thread), [1] write end (fini) */
+
+	/* stats and some for scheduling */
+	uint64_t pollcnt;	/* total poll() wakeups serviced */
+	uint64_t pollcnt_to;	/* wakeups caused by poll() timeout */
+	uint64_t pollcyc;	/* cycles spent in polls that made no progress */
+	uint64_t pollok;	/* polls that made progress */
+
+	/* For scheduling interrupt thread */
+	int timeout_period_min;
+	int timeout_period_max;
+	int timeout_shift;
+	uint64_t pollok_last;	/* pollok snapshot at last timeout adjustment */
+	uint64_t pollcnt_last;	/* pollcnt snapshot at last timeout adjustment */
+	uint32_t last_timeout;	/* current poll timeout in ms; (uint32_t)-1 => block forever */
+};
+
+#ifdef PSM_CUDA
+	/* This is a global cuda context (extern declaration in psm_user.h)
+	 * stored to provide hints during a cuda failure
+	 * due to a null cuda context.
+	 * Captured in ips_ptl_rcvthread_init (cuCtxGetCurrent) and adopted
+	 * by the receive thread in ips_ptl_pollintr (cuCtxSetCurrent). */
+	CUcontext ctxt;
+#endif
+
+/*
+ * The receive thread knows about the ptl interface, so it can muck with it
+ * directly.
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ptl_rcvthread *rcvc;
+
+	/* Allocate per-ptl receive-thread state; zeroed so the stats
+	 * counters start at 0. */
+	ptl->rcvthread =
+	    psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread));
+	if (ptl->rcvthread == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	rcvc = ptl->rcvthread;
+
+	rcvc->recvq = recvq;
+	rcvc->ptl = ptl;
+	rcvc->context = ptl->context;
+	rcvc->t_start_cyc = get_cycles();
+
+#ifdef PSM_CUDA
+	/* Capture the caller's CUDA context so the polling thread can adopt
+	 * it later (see ips_ptl_pollintr). */
+	if (PSMI_IS_CUDA_ENABLED)
+		PSMI_CUDA_DRIVER_API_CALL(cuCtxGetCurrent, &ctxt);
+#endif
+
+	if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) {
+
+		if ((err = rcvthread_initsched(rcvc)))
+			goto fail;
+
+		/* Create a pipe so we can synchronously terminate the thread */
+		if (pipe(rcvc->pipefd) != 0) {
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot create a pipe for receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+
+		if (pthread_create(&rcvc->hdrq_threadid, NULL,
+				   ips_ptl_pollintr, ptl->rcvthread)) {
+			close(rcvc->pipefd[0]);
+			close(rcvc->pipefd[1]);
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot start receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+
+	}
+
+	if ((err = rcvthread_initstats(ptl)))
+		goto fail;
+
+fail:
+	/* NOTE: the success path also flows through this label (err is
+	 * PSM2_OK then). On failure ptl->rcvthread stays allocated --
+	 * presumably reclaimed by ips_ptl_rcvthread_fini; confirm the
+	 * caller invokes fini on init failure. */
+	return err;
+}
+
+/*
+ * Tear down the receive thread: disable driver interrupts, wake the thread
+ * via its shutdown pipe, join it, then free the thread state.
+ * Requires the caller to hold the mq progress lock.
+ */
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	uint64_t t_now;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	if (ptl->rcvthread == NULL)
+		return err;
+
+	if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) {
+		t_now = get_cycles();
+
+		/* Disable interrupts then kill the receive thread */
+		if (psmi_context_interrupt_isenabled
+		    ((psmi_context_t *) ptl->context))
+			if ((err =
+			     psmi_context_interrupt_set((psmi_context_t *) ptl->
+							context, 0)))
+				goto fail;
+
+		/* Close the pipe so we can have the thread synchronously exit.
+		   On Linux just closing the pipe does not wake up the receive
+		   thread, so write a token first to guarantee a POLLIN event.
+		 */
+		if (write(rcvc->pipefd[1], (const void *)&t_now,
+			  sizeof(uint64_t)) == -1 ||
+		    close(rcvc->pipefd[1]) == -1) {
+			_HFI_VDBG
+			    ("unable to close pipe to receive thread cleanly\n");
+		}
+		pthread_join(rcvc->hdrq_threadid, NULL);
+
+		if (_HFI_PRDBG_ON) {
+			_HFI_PRDBG_ALWAYS
+			    ("rcvthread poll success %lld/%lld times, "
+			     "thread cancelled in %.3f us\n",
+			     (long long)rcvc->pollok, (long long)rcvc->pollcnt,
+			     (double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3);
+		}
+	}
+
+	psmi_free(ptl->rcvthread);
+	/* Clear the pointer so a repeated fini call takes the NULL
+	 * early-return above instead of touching freed memory. */
+	ptl->rcvthread = NULL;
+
+fail:
+	return err;
+}
+
+/*
+ * Parse PSM2_RCVTHREAD_FREQ (<min_freq[:max_freq[:shift_freq]]>, polls per
+ * second) and convert it into poll() timeout bounds for the receive thread.
+ * Invalid settings fall back to the build-time defaults; a zero min/max
+ * frequency selects interrupt-only operation (infinite poll timeout).
+ *
+ * Marked static to match the forward declaration above; the original
+ * definition omitted the storage class and relied on linkage inheritance
+ * from the earlier static declaration.
+ */
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc)
+{
+	union psmi_envvar_val env_to;
+	char buf[192];
+	char *rcv_freq = buf;
+	int no_timeout = 0;
+	int tvals[3] = { RCVTHREAD_TO_MIN_FREQ,
+		RCVTHREAD_TO_MAX_FREQ,
+		RCVTHREAD_TO_SHIFT
+	};
+	snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ,
+		 RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT);
+	buf[sizeof(buf) - 1] = '\0';
+
+	if (!psmi_getenv("PSM2_RCVTHREAD_FREQ",
+			 "Thread timeouts (per sec) <min_freq[:max_freq[:shift_freq]]>",
+			 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+			 (union psmi_envvar_val)rcv_freq, &env_to)) {
+		/* not using default values */
+		int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals);
+		int invalid = 0;
+
+		/* A zero min or max frequency means "never time out". */
+		if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) ||
+		    (nparsed > 1 && tvals[1] == 0)) {
+			no_timeout = 1;
+		} else {
+			if (nparsed > 0 && tvals[0] > 1000)
+				invalid = 1;
+			if (nparsed > 1
+			    && (tvals[1] > 1000 || tvals[1] < tvals[0]))
+				invalid = 1;
+			if (nparsed > 2 && tvals[2] > 10)
+				invalid = 1;
+		}
+
+		if (invalid) {
+			_HFI_INFO
+			    ("Overriding invalid request for RcvThread frequency"
+			     " settings of %s to be <%d:%d:%d>\n", env_to.e_str,
+			     RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ,
+			     RCVTHREAD_TO_SHIFT);
+			tvals[0] = RCVTHREAD_TO_MIN_FREQ;
+			tvals[1] = RCVTHREAD_TO_MAX_FREQ;
+			tvals[2] = RCVTHREAD_TO_SHIFT;
+		}
+	}
+
+	if (no_timeout) {
+		/* (uint32_t)-1 becomes -1 when handed to poll(), i.e. block
+		 * until an interrupt or shutdown event arrives. */
+		rcvc->last_timeout = -1;
+		_HFI_PRDBG("PSM2_RCVTHREAD_FREQ set to only interrupt "
+			   "(no timeouts)\n");
+	} else {
+		/* Convert freq (polls/sec) to period in milliseconds, the
+		 * unit poll() expects. (The original comment said
+		 * microseconds; 1000/freq is milliseconds, matching the
+		 * "min=%dms" debug output below.) */
+		rcvc->timeout_period_max = 1000 / tvals[0];
+		rcvc->timeout_period_min = 1000 / tvals[1];
+		rcvc->timeout_shift = tvals[2];
+		/* Start in the middle of min and max */
+		rcvc->last_timeout = (rcvc->timeout_period_min +
+				      rcvc->timeout_period_max) / 2;
+		_HFI_PRDBG("PSM2_RCVTHREAD_FREQ converted to period "
+			   "min=%dms,max=%dms,shift=%d\n",
+			   rcvc->timeout_period_min, rcvc->timeout_period_max,
+			   rcvc->timeout_shift);
+	}
+	return PSM2_OK;
+}
+
+static
+int rcvthread_next_timeout(struct ptl_rcvthread *rcvc)
+{
+	/* Adaptive poll timeout: shrink the timeout (poll more often) while
+	 * recent polls made progress, grow it (poll less often) otherwise.
+	 * pollok only ever increases, so the unsigned difference is the
+	 * number of successful polls since the last adjustment. */
+	uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last;
+
+	if (pollok_diff > 0) {
+		if (rcvc->last_timeout > rcvc->timeout_period_min)
+			/* By default, be less aggressive, but there's a more aggressive
+			 * alternative if need be */
+#if 1
+			rcvc->last_timeout >>= rcvc->timeout_shift;
+#else
+			rcvc->last_timeout = rcvc->timeout_period_min;
+#endif
+	} else {		/* we had less progress */
+		if (rcvc->last_timeout < rcvc->timeout_period_max)
+			rcvc->last_timeout <<= rcvc->timeout_shift;
+	}
+
+	/* Snapshot counters for the next adjustment interval. */
+	rcvc->pollok_last = rcvc->pollok;
+	rcvc->pollcnt_last = rcvc->pollcnt;
+	return (int)rcvc->last_timeout;
+}
+
+extern int ips_in_rcvthread;
+
+/*
+ * Receiver thread support.
+ *
+ * By default, polling in the driver asks the chip to generate an interrupt on
+ * every packet. When the driver supports POLLURG we can switch the poll mode
+ * to one that requests interrupts only for packets that contain an urgent bit
+ * (and optionally enable interrupts for hdrq overflow events). When poll
+ * returns an event, we *try* to make progress on the receive queue but simply
+ * go back to sleep if we notice that the main thread is already making
+ * progress.
+ */
+static
+void *ips_ptl_pollintr(void *rcvthreadc)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc;
+	struct ips_recvhdrq *recvq = rcvc->recvq;
+	psmi_context_t *context = (psmi_context_t *) rcvc->context;
+	int fd_dev = context->fd;	/* device fd: interrupt source */
+	int fd_pipe = rcvc->pipefd[0];	/* read end: shutdown signal from fini */
+	psm2_ep_t ep;
+	struct pollfd pfd[2];
+	int ret;
+	int next_timeout = rcvc->last_timeout;
+	uint64_t t_cyc;
+	psm2_error_t err;
+
+#ifdef PSM_CUDA
+	/* Adopt the CUDA context captured at init time so CUDA calls made
+	 * from this thread see the application's context. */
+	if (PSMI_IS_CUDA_ENABLED && ctxt != NULL)
+		PSMI_CUDA_DRIVER_API_CALL(cuCtxSetCurrent, ctxt);
+#endif
+
+	PSM2_LOG_MSG("entering");
+	/* No reason to have many of these, keep this as a backup in case the
+	 * recvhdrq init function is misused */
+	psmi_assert_always((recvq->runtime_flags & PSMI_RUNTIME_RCVTHREAD));
+
+	/* Switch driver to a mode where it can interrupt on urgent packets */
+	if (psmi_context_interrupt_set((psmi_context_t *)
+				       rcvc->context, 1) == PSM2_EP_NO_RESOURCES) {
+		_HFI_PRDBG
+		    ("hfi_poll_type feature not present in driver, turning "
+		     "off internal progress thread\n");
+		return NULL;
+	}
+
+	_HFI_PRDBG("Enabled communication thread on URG packets\n");
+
+	while (1) {
+		/* Wait on both the device (urgent-packet interrupt) and the
+		 * shutdown pipe, with the adaptive timeout. */
+		pfd[0].fd = fd_dev;
+		pfd[0].events = POLLIN;
+		pfd[0].revents = 0;
+		pfd[1].fd = fd_pipe;
+		pfd[1].events = POLLIN;
+		pfd[1].revents = 0;
+
+		ret = poll(pfd, 2, next_timeout);
+		t_cyc = get_cycles();
+		if_pf(ret < 0) {
+			if (errno == EINTR)
+				_HFI_DBG("got signal, keep polling\n");
+			else
+				psmi_handle_error(PSMI_EP_NORETURN,
+						  PSM2_INTERNAL_ERR,
+						  "Receive thread poll() error: %s",
+						  strerror(errno));
+		} else if (pfd[1].revents) {
+			/* Any type of event on this fd means exit, should be POLLHUP */
+			_HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents);
+			close(fd_pipe);
+			break;
+		} else {
+			rcvc->pollcnt++;
+			/* Skip this pass entirely if endpoint creation or
+			 * teardown currently holds the creation lock. */
+			if (!PSMI_LOCK_TRY(psmi_creation_lock)) {
+
+				if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) {
+					if (PSMI_LOCK_DISABLED) {
+						/* We do this check without acquiring the lock, no sense to
+						 * adding the overhead and it doesn't matter if we're
+						 * wrong. */
+						if (ips_recvhdrq_isempty(recvq))
+							continue;
+						/* NOTE(review): these continues bypass the
+						 * PSMI_UNLOCK(psmi_creation_lock) below; this is
+						 * only safe because locking is disabled in this
+						 * branch (PSMI_LOCK_DISABLED). */
+						if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+							ips_recvhdrq_scan_cca(recvq);
+						}
+						if (!ips_recvhdrq_trylock(recvq))
+							continue;
+						err = ips_recvhdrq_progress(recvq);
+						if (err == PSM2_OK)
+							rcvc->pollok++;
+						else
+							rcvc->pollcyc += get_cycles() - t_cyc;
+						ips_recvhdrq_unlock(recvq);
+					} else {
+
+						ep = psmi_opened_endpoint;
+
+						/* CCA prescan of this ptl's recvq under the
+						 * primary endpoint's progress lock. */
+						if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) {
+							if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN ) {
+								ips_recvhdrq_scan_cca(recvq);
+							}
+							PSMI_UNLOCK(ep->mq->progress_lock);
+						}
+
+						/* Go through all master endpoints. */
+						do{
+							if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) {
+								/* If we time out, we service shm and hfi. If not, we
+								 * assume to have received an hfi interrupt and service
+								 * only hfi.
+								 */
+								err = psmi_poll_internal(ep,
+											 ret ==
+											 0 ? PSMI_TRUE :
+											 PSMI_FALSE);
+
+								if (err == PSM2_OK)
+									rcvc->pollok++;
+								else
+									rcvc->pollcyc += get_cycles() - t_cyc;
+								PSMI_UNLOCK(ep->mq->progress_lock);
+							}
+
+							/* get next endpoint from multi endpoint list */
+							ep = ep->user_ep_next;
+						} while(NULL != ep);
+					}
+				}
+
+				PSMI_UNLOCK(psmi_creation_lock);
+			}
+
+			if (ret == 0) {	/* change timeout only on timed out poll */
+				rcvc->pollcnt_to++;
+				next_timeout = rcvthread_next_timeout(rcvc);
+			}
+
+		}
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return NULL;
+}
+
+static uint64_t rcvthread_stats_pollok(void *context)
+{
+	/* Report the poll success rate as a percentage, bit-copied into the
+	 * u64 stats slot; the consumer re-interprets the bits as a double
+	 * (registered with MPSPAWN_STATS_TYPE_DOUBLE). */
+	struct ptl_rcvthread *rc = (struct ptl_rcvthread *)context;
+	uint64_t out;
+	double pct = (rc->pollcnt > 0)
+	    ? ((double)rc->pollok * 100.0 / rc->pollcnt)
+	    : 0.0;
+
+	memcpy(&out, &pct, sizeof(out));
+	return out;
+}
+
+static uint64_t rcvthread_stats_pollcyc(void *context)
+{
+	/* Report the accumulated wasted poll time, converted from cycles to
+	 * milliseconds for the stats log. */
+	struct ptl_rcvthread *rc = (struct ptl_rcvthread *)context;
+	double ns = (double)cycles_to_nanosecs(rc->pollcyc);
+
+	return (uint64_t) (ns / 1.0e6);
+}
+
+static psm2_error_t rcvthread_initstats(ptl_t *ptl)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	/* Stats exported to the mpspawn layer; entries either point at the
+	 * live counters in rcvc or supply a getter callback. */
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("intrthread schedule count",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_SKIP_IF_ZERO,
+				NULL, &rcvc->pollcnt),
+		PSMI_STATS_DECL("intrthread schedule success (%)",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_TYPE_DOUBLE,
+				rcvthread_stats_pollok, NULL),
+		PSMI_STATS_DECL("intrthread timeout count",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_SKIP_IF_ZERO,
+				NULL, &rcvc->pollcnt_to),
+		PSMI_STATS_DECL("intrthread wasted time (ms)",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				rcvthread_stats_pollcyc, NULL)
+	};
+
+	/* If we don't want a thread, make sure we still initialize the counters
+	 * but set them to NaN instead */
+	if (!(ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD)) {
+		int i;
+		static uint64_t ctr_nan = MPSPAWN_NAN;
+		for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) {
+			entries[i].getfn = NULL;
+			entries[i].u.val = &ctr_nan;
+		}
+	}
+
+	return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+					PSMI_STATSTYPE_RCVTHREAD,
+					entries,
+					PSMI_STATS_HOWMANY(entries), rcvc);
+}
diff --git a/ptl_self/Makefile b/ptl_self/Makefile
new file mode 100644
index 0000000..daeac5b
--- /dev/null
+++ b/ptl_self/Makefile
@@ -0,0 +1,90 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir = $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+# Object list for the self ptl (a single translation unit), remapped into
+# $(OUTDIR) so out-of-tree object directories work.
+${TARGLIB}-objs := ptl.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+# Generate a .d dependency fragment per source so header changes trigger
+# rebuilds on the second pass.
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c
new file mode 100644
index 0000000..da613d9
--- /dev/null
+++ b/ptl_self/ptl.c
@@ -0,0 +1,394 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/*
+ * This file implements the PSM PTL for self (loopback)
+ */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/* Private per-endpoint state of the self (loopback) ptl. */
+struct ptl {
+	psm2_ep_t ep;		/* owning endpoint */
+	psm2_epid_t epid;	/* our own epid -- the only reachable "peer" */
+	psm2_epaddr_t epaddr;	/* loopback epaddr returned by self_connect() */
+	ptl_ctl_t *ctl;		/* dispatch table filled in by self_ptl_init() */
+} __attribute__((aligned(16)));
+
+/* Complete a matched rendezvous: copy the sender's buffer straight into
+ * the receiver's buffer (pure loopback, no wire protocol involved).
+ * recv_req->ptl_req_ptr was cross-linked to the send request by
+ * self_mq_isend(). */
+static
+psm2_error_t
+ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted)
+{
+	psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr;
+
+	if (recv_req->recv_msglen > 0) {
+		PSM_VALGRIND_DEFINE_MQ_RECV(recv_req->buf, recv_req->buf_len,
+					    recv_req->recv_msglen);
+		VALGRIND_MAKE_MEM_DEFINED(send_req->buf, send_req->buf_len);
+		VALGRIND_MAKE_MEM_DEFINED(send_req->buf, recv_req->recv_msglen);
+
+		/* Only recv_msglen bytes are copied: a longer send is
+		 * truncated to the receiver's accepted length. */
+		psmi_mq_mtucpy(recv_req->buf, send_req->buf,
+			       recv_req->recv_msglen);
+	}
+
+	psmi_mq_handle_rts_complete(recv_req);
+
+	/* If the send is already marked complete, that's because it was internally
+	 * buffered. */
+	if (send_req->state == MQ_STATE_COMPLETE) {
+		psmi_mq_stats_rts_account(send_req);
+		/* send_req->buf is then a system buffer allocated by
+		 * self_mq_send_testwait(); return it to the mq pool. */
+		if (send_req->buf != NULL && send_req->send_msglen > 0)
+			psmi_mq_sysbuf_free(send_req->mq, send_req->buf);
+		/* req was left "live" even though the sender was told that the
+		 * send was done */
+		psmi_mq_req_free(send_req);
+	} else
+		psmi_mq_handle_rts_complete(send_req);
+
+	_HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+		  recv_req->buf, send_req, recv_req);
+	return PSM2_OK;
+}
+
+/* Test/wait callback installed on an unmatched self send (see
+ * self_mq_isend()).  Copies the caller's payload into a system buffer
+ * and marks the send complete, so the sender can return even though no
+ * matching receive exists yet. */
+static
+psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq)
+{
+	uint8_t *ubuf;
+	psm2_mq_req_t req = *ireq;
+
+	PSMI_LOCK_ASSERT(req->mq->progress_lock);
+
+	/* We're waiting on a send request, and the matching receive has not been
+	 * posted yet. This is a deadlock condition in MPI but we accommodate it
+	 * here in the "self ptl" by using system-allocated memory.
+	 */
+	req->testwait_callback = NULL;	/* no more calls here */
+
+	/* Swap the user buffer for a sysbuf copy; ptl_handle_rtsmatch()
+	 * frees the sysbuf once the receive finally matches. */
+	ubuf = req->buf;
+	if (ubuf != NULL && req->send_msglen > 0) {
+		req->buf = psmi_mq_sysbuf_alloc(req->mq, req->send_msglen);
+		if (req->buf == NULL)
+			return PSM2_NO_MEMORY;
+		psmi_mq_mtucpy(req->buf, ubuf, req->send_msglen);
+	}
+
+	/* Mark it complete but don't free the req, it's freed when the receiver
+	 * does the match */
+	req->state = MQ_STATE_COMPLETE;
+	*ireq = PSM2_MQ_REQINVALID;
+	return PSM2_OK;
+}
+
+/* Self is different. We do everything as rendezvous. */
+static
+psm2_error_t
+self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	      psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context,
+	      psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t send_req;
+	psm2_mq_req_t recv_req;
+	int rc;
+
+	send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	if_pf(send_req == NULL)
+		return PSM2_NO_MEMORY;
+
+#ifdef PSM_CUDA
+	/* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+	 * when the buffer pointer received into PSM has been allocated
+	 * by the application. This guarantees the all memory operations
+	 * to this region of memory (used by multiple layers of the stack)
+	 * always synchronize
+	 */
+	if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+		int trueflag = 1;
+		PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+			       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+			       (CUdeviceptr)ubuf);
+		send_req->is_buf_gpu_mem = 1;
+	} else
+		send_req->is_buf_gpu_mem = 0;
+#endif
+
+	/* Post the RTS locally.  On MQ_RET_MATCH_OK a receive was already
+	 * posted and ptl_handle_rtsmatch() copies the data right away;
+	 * otherwise the send stays pending and self_mq_send_testwait()
+	 * buffers it if the caller ends up waiting on it. */
+	rc = psmi_mq_handle_rts(mq, epaddr, tag,
+				len, NULL, 0, 1,
+				ptl_handle_rtsmatch, &recv_req);
+	send_req->tag = *tag;
+	send_req->buf = (void *)ubuf;
+	send_req->send_msglen = len;
+	send_req->context = context;
+	/* Cross-link the requests so the match handler can reach the
+	 * sender's buffer and state. */
+	recv_req->ptl_req_ptr = (void *)send_req;
+	recv_req->rts_sbuf = (uintptr_t) ubuf;
+	recv_req->rts_peer = epaddr;
+	if (rc == MQ_RET_MATCH_OK)
+		ptl_handle_rtsmatch(recv_req, 1);
+	else
+		send_req->testwait_callback = self_mq_send_testwait;
+
+	_HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n",
+		  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+		  rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req);
+	*req_o = send_req;
+	return PSM2_OK;
+}
+
+/* Blocking send: issue the nonblocking rendezvous send, then wait in
+ * the progress engine until the request completes. */
+static
+psm2_error_t
+self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	     psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	psm2_mq_req_t sreq;
+	psm2_error_t rv = self_mq_isend(mq, epaddr, flags, tag, ubuf, len,
+					NULL, &sreq);
+	psmi_mq_wait_internal(&sreq);
+	return rv;
+}
+
+/* Report AM capability limits for the self ptl.  Loopback delivery has
+ * no real restrictions, so each limit is advertised as INT_MAX. */
+static psm2_error_t
+self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+	if (parameters == NULL)
+		return PSM2_PARAM_ERR;
+
+	parameters->max_handlers = INT_MAX;
+	parameters->max_nargs = INT_MAX;
+	parameters->max_request_short = INT_MAX;
+	parameters->max_reply_short = INT_MAX;
+	return PSM2_OK;
+}
+
+/* Deliver an AM short request by invoking the handler synchronously;
+ * loopback means there is no wire to cross. */
+static
+psm2_error_t
+self_am_short_request(psm2_epaddr_t epaddr,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      void *src, size_t len, int flags,
+		      psm2_am_completion_fn_t completion_fn,
+		      void *completion_ctxt)
+{
+	struct psmi_am_token tok;
+	psm2_ep_t ep = epaddr->ptlctl->ptl->ep;
+	psm2_am_handler_fn_t fn = psm_am_get_handler_function(ep, handler);
+
+	tok.epaddr_incoming = epaddr;
+	fn(&tok, args, nargs, src, len);
+
+	/* The request is complete as soon as the handler returns. */
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/* Deliver an AM reply by calling the handler directly with the token
+ * that arrived on the request; nothing is queued in loopback. */
+static
+psm2_error_t
+self_am_short_reply(psm2_am_token_t token,
+		    psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		    void *src, size_t len, int flags,
+		    psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+	struct psmi_am_token *tok = token;
+	psm2_ep_t ep = tok->epaddr_incoming->ptlctl->ptl->ep;
+	psm2_am_handler_fn_t fn = psm_am_get_handler_function(ep, handler);
+
+	fn(token, args, nargs, src, len);
+
+	/* The reply is complete once the handler returns. */
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/* "Connect": the only peer reachable through this ptl is our own epid.
+ * Matching entries get the loopback epaddr; all others are reported
+ * PSM2_EPID_UNREACHABLE so another ptl can claim them. */
+static
+psm2_error_t
+self_connect(ptl_t *ptl,
+	     int numep,
+	     const psm2_epid_t array_of_epid[],
+	     const int array_of_epid_mask[],
+	     psm2_error_t array_of_errors[],
+	     psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns)
+{
+	psmi_assert_always(ptl->epaddr != NULL);
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	for (i = 0; i < numep; i++) {
+		if (!array_of_epid_mask[i])
+			continue;	/* caller not interested in this slot */
+
+		if (array_of_epid[i] == ptl->epid) {
+			array_of_epaddr[i] = ptl->epaddr;
+			array_of_epaddr[i]->ptlctl = ptl->ctl;
+			array_of_epaddr[i]->epid = ptl->epid;
+			/* Record our hostname for diagnostics; a nonzero
+			 * return indicates allocation failure. */
+			if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid),
+						   psmi_gethostname(), 0)) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+			psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr);
+			array_of_errors[i] = PSM2_OK;
+		} else {
+			array_of_epaddr[i] = NULL;
+			array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+		}
+	}
+
+fail:
+	return err;
+}
+
+/* Disconnect: drop our own epid from the ep's address table for every
+ * masked-in entry that refers to this ptl's loopback epaddr. */
+static
+psm2_error_t
+self_disconnect(ptl_t *ptl, int force, int numep,
+		psm2_epaddr_t array_of_epaddr[],
+		const int array_of_epaddr_mask[],
+		psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	int idx;
+
+	for (idx = 0; idx < numep; idx++) {
+		if (!array_of_epaddr_mask[idx])
+			continue;
+		if (array_of_epaddr[idx] != ptl->epaddr)
+			continue;
+		psmi_epid_remove(ptl->ep, ptl->epid);
+		array_of_errors[idx] = PSM2_OK;
+	}
+	return PSM2_OK;
+}
+
+/* Report how much private state the core must allocate for this ptl. */
+static
+size_t self_ptl_sizeof(void)
+{
+	return sizeof(ptl_t);
+}
+
+/* Initialize the self ptl: cache the endpoint identity in ptl and fill
+ * in the ptl_ctl_t dispatch table the PSM2 core calls through.
+ * ('ustatic' presumably expands to static except under mock/unit
+ * testing -- see psm2_mock_testing.h; confirm.) */
+ustatic
+psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+	psmi_assert_always(ep != NULL);
+	psmi_assert_always(ep->epaddr != NULL);
+	psmi_assert_always(ep->epid != 0);
+
+	ptl->ep = ep;
+	ptl->epid = ep->epid;
+	ptl->epaddr = ep->epaddr;
+	ptl->ctl = ctl;
+
+	memset(ctl, 0, sizeof(*ctl));
+	/* Fill in the control structure */
+	ctl->ptl = ptl;
+	ctl->ep = ep;
+	ctl->ep_poll = NULL;	/* loopback needs no progress polling */
+	ctl->ep_connect = self_connect;
+	ctl->ep_disconnect = self_disconnect;
+
+	ctl->mq_send = self_mq_send;
+	ctl->mq_isend = self_mq_isend;
+
+	ctl->am_get_parameters = self_am_get_parameters;
+	ctl->am_short_request = self_am_short_request;
+	ctl->am_short_reply = self_am_short_reply;
+
+	/* No stats in self */
+	ctl->epaddr_stats_num = NULL;
+	ctl->epaddr_stats_init = NULL;
+	ctl->epaddr_stats_get = NULL;
+
+	return PSM2_OK;
+}
+
+/* Finalize hook: the self ptl holds no resources, so nothing to release. */
+static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+	return PSM2_OK;	/* nothing to do */
+}
+
+/* Option setter: the self PTL exposes no settable options, so every
+ * request is rejected with PSM2_PARAM_ERR.
+ * Fix: optname is an int, so the diagnostic must use %d -- the old %u
+ * was a printf format/argument type mismatch (-Wformat). */
+static
+psm2_error_t
+self_ptl_setopt(const void *component_obj, int optname,
+		const void *optval, uint64_t optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %d.", optname);
+}
+
+/* Option getter: the self PTL exposes no readable options, so every
+ * request is rejected with PSM2_PARAM_ERR.
+ * Fix: optname is an int, so the diagnostic must use %d -- the old %u
+ * was a printf format/argument type mismatch (-Wformat). */
+static
+psm2_error_t
+self_ptl_getopt(const void *component_obj, int optname,
+		void *optval, uint64_t *optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %d.", optname);
+}
+
+/* Only symbol we expose out of here */
+/* Positional initializer -- field order presumed to be sizeof, init,
+ * fini, setopt, getopt per struct ptl_ctl_init; confirm against ptl.h. */
+struct ptl_ctl_init
+psmi_ptl_self = {
+	self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt,
+	self_ptl_getopt
+};
diff --git a/ptl_self/ptl_fwd.h b/ptl_self/ptl_fwd.h
new file mode 100644
index 0000000..77ee7f9
--- /dev/null
+++ b/ptl_self/ptl_fwd.h
@@ -0,0 +1,62 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_SELF_H
+#define _PTL_FWD_SELF_H
+
+/* Control-structure entry points of the self (loopback) ptl; the single
+ * definition lives in ptl_self/ptl.c.  Declared 'extern' so including
+ * this header does not create a tentative definition in every
+ * translation unit (duplicate-symbol errors under -fno-common). */
+extern struct ptl_ctl_init psmi_ptl_self;
+
+#endif
diff --git a/rpm_release_extension b/rpm_release_extension
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++ b/rpm_release_extension
@@ -0,0 +1 @@
+8
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/libpsm2.git
More information about the Pkg-ofed-commits
mailing list