[Pkg-ofed-commits] [libpsm2] 01/04: New upstream version 10.3-8
Brian Smith
bsmith-guest at moszumanska.debian.org
Tue Nov 21 22:12:10 UTC 2017
This is an automated email from the git hooks/post-receive script.
bsmith-guest pushed a commit to branch master
in repository libpsm2.
commit 490810c2edfbe67631d8c51588716b4a50ddeee8
Author: Brian T. Smith <bsmith at systemfabricworks.com>
Date: Tue Nov 21 13:10:57 2017 -0600
New upstream version 10.3-8
---
40-psm.rules | 52 +
COMMIT | 1 +
COPYING | 376 +++++
Makefile | 511 ++++++
README | 300 ++++
buildflags.mak | 210 +++
compat/40-psm-compat.rules | 52 +
compat/Makefile | 90 ++
compat/buildflags.mak | 103 ++
compat/libpsm2-compat.cmds | 70 +
compat/libpsm2-compat.conf | 52 +
compat/psm-compat.c | 335 ++++
compat/psm2_compat_linker_script.map | 66 +
include/common_defines.h | 176 ++
include/hfi1_deprecated.h | 181 +++
include/linux-i386/bit_ops.h | 98 ++
include/linux-i386/sysdep.h | 171 ++
include/opa_byteorder.h | 264 +++
include/opa_common.h | 62 +
include/opa_debug.h | 108 ++
include/opa_intf.h | 90 ++
include/opa_queue.h | 512 ++++++
include/opa_revision.h | 64 +
include/opa_service.h | 268 +++
include/opa_udebug.h | 194 +++
include/opa_user.h | 973 +++++++++++
include/psm2_mock_testing.h | 176 ++
include/rbtree.c | 692 ++++++++
include/rbtree.h | 90 ++
libpsm2.spec.in | 177 ++
libuuid/Makefile | 92 ++
libuuid/compare.c | 53 +
libuuid/pack.c | 69 +
libuuid/parse.c | 78 +
libuuid/psm_uuid.c | 114 ++
libuuid/psm_uuid.h | 78 +
libuuid/unpack.c | 63 +
libuuid/unparse.c | 75 +
makesdeb.sh | 105 ++
makesrpm.sh | 145 ++
mpspawn/mpspawn_stats.h | 132 ++
opa/Makefile | 113 ++
opa/opa_debug.c | 364 +++++
opa/opa_dwordcpy-generic.c | 298 ++++
opa/opa_dwordcpy-i386.S | 84 +
opa/opa_dwordcpy-x86_64-fast.S | 77 +
opa/opa_dwordcpy-x86_64.c | 298 ++++
opa/opa_i2cflash.c | 87 +
opa/opa_proto.c | 578 +++++++
opa/opa_service.c | 909 +++++++++++
opa/opa_sysfs.c | 854 ++++++++++
opa/opa_syslog.c | 113 ++
opa/opa_time.c | 284 ++++
opa/opa_utils.c | 425 +++++
opa/opa_write_pio-i386.c | 305 ++++
opa/opa_write_pio-x86_64.c | 296 ++++
psm.c | 732 +++++++++
psm2.h | 1517 +++++++++++++++++
psm2_am.h | 411 +++++
psm2_linker_script.map | 93 ++
psm2_linker_script_map.in | 95 ++
psm2_mq.h | 1403 ++++++++++++++++
psm_am.c | 269 ++++
psm_am_internal.h | 93 ++
psm_context.c | 817 ++++++++++
psm_context.h | 102 ++
psm_diags.c | 362 +++++
psm_ep.c | 1527 ++++++++++++++++++
psm_ep.h | 245 +++
psm_ep_connect.c | 620 +++++++
psm_error.c | 348 ++++
psm_error.h | 78 +
psm_help.h | 190 +++
psm_lock.h | 142 ++
psm_log.h | 224 +++
psm_memcpy.c | 67 +
psm_mock.c | 90 ++
psm_mpool.c | 588 +++++++
psm_mpool.h | 107 ++
psm_mq.c | 1433 ++++++++++++++++
psm_mq_internal.h | 639 ++++++++
psm_mq_recv.c | 593 +++++++
psm_mq_utils.c | 273 ++++
psm_perf.c | 246 +++
psm_perf.h | 142 ++
psm_stats.c | 664 ++++++++
psm_stats.h | 120 ++
psm_sysbuf.c | 234 +++
psm_sysbuf.h | 81 +
psm_timer.c | 198 +++
psm_timer.h | 164 ++
psm_user.h | 500 ++++++
psm_utils.c | 2553 +++++++++++++++++++++++++++++
psm_utils.h | 379 +++++
psmi_wrappers.c | 94 ++
psmi_wrappers.h | 98 ++
ptl.h | 211 +++
ptl_am/Makefile | 91 ++
ptl_am/am_cuda_memhandle_cache.c | 316 ++++
ptl_am/am_cuda_memhandle_cache.h | 124 ++
ptl_am/am_reqrep.c | 118 ++
ptl_am/am_reqrep_shmem.c | 2590 +++++++++++++++++++++++++++++
ptl_am/cmarw.h | 73 +
ptl_am/cmarwu.c | 207 +++
ptl_am/psm_am_internal.h | 466 ++++++
ptl_am/ptl.c | 364 +++++
ptl_am/ptl_fwd.h | 64 +
ptl_ips/Makefile | 96 ++
ptl_ips/ips_crc32.c | 91 ++
ptl_ips/ips_epstate.c | 154 ++
ptl_ips/ips_epstate.h | 100 ++
ptl_ips/ips_expected_proto.h | 397 +++++
ptl_ips/ips_opp_path_rec.c | 602 +++++++
ptl_ips/ips_path_rec.c | 791 +++++++++
ptl_ips/ips_path_rec.h | 185 +++
ptl_ips/ips_proto.c | 2348 +++++++++++++++++++++++++++
ptl_ips/ips_proto.h | 687 ++++++++
ptl_ips/ips_proto_am.c | 595 +++++++
ptl_ips/ips_proto_am.h | 93 ++
ptl_ips/ips_proto_connect.c | 1551 ++++++++++++++++++
ptl_ips/ips_proto_dump.c | 255 +++
ptl_ips/ips_proto_expected.c | 2957 ++++++++++++++++++++++++++++++++++
ptl_ips/ips_proto_header.h | 181 +++
ptl_ips/ips_proto_help.h | 705 ++++++++
ptl_ips/ips_proto_internal.h | 96 ++
ptl_ips/ips_proto_mq.c | 1733 ++++++++++++++++++++
ptl_ips/ips_proto_params.h | 264 +++
ptl_ips/ips_proto_recv.c | 1447 +++++++++++++++++
ptl_ips/ips_recvhdrq.c | 869 ++++++++++
ptl_ips/ips_recvhdrq.h | 240 +++
ptl_ips/ips_recvq.c | 91 ++
ptl_ips/ips_recvq.h | 124 ++
ptl_ips/ips_scb.c | 364 +++++
ptl_ips/ips_scb.h | 226 +++
ptl_ips/ips_spio.c | 951 +++++++++++
ptl_ips/ips_spio.h | 189 +++
ptl_ips/ips_stats.h | 83 +
ptl_ips/ips_subcontext.c | 97 ++
ptl_ips/ips_subcontext.h | 81 +
ptl_ips/ips_tid.c | 278 ++++
ptl_ips/ips_tid.h | 169 ++
ptl_ips/ips_tidcache.c | 653 ++++++++
ptl_ips/ips_tidcache.h | 158 ++
ptl_ips/ips_tidflow.c | 267 +++
ptl_ips/ips_tidflow.h | 133 ++
ptl_ips/ips_writehdrq.c | 110 ++
ptl_ips/ips_writehdrq.h | 269 ++++
ptl_ips/ipserror.c | 200 +++
ptl_ips/ipserror.h | 122 ++
ptl_ips/ptl.c | 950 +++++++++++
ptl_ips/ptl_fwd.h | 65 +
ptl_ips/ptl_ips.h | 194 +++
ptl_ips/ptl_rcvthread.c | 506 ++++++
ptl_self/Makefile | 90 ++
ptl_self/ptl.c | 394 +++++
ptl_self/ptl_fwd.h | 62 +
rpm_release_extension | 1 +
157 files changed, 59022 insertions(+)
diff --git a/40-psm.rules b/40-psm.rules
new file mode 100644
index 0000000..ba8d494
--- /dev/null
+++ b/40-psm.rules
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+KERNEL=="hfi1", MODE="0666"
+KERNEL=="hfi1_[0-9]", MODE="0666"
diff --git a/COMMIT b/COMMIT
new file mode 100644
index 0000000..b6b4b33
--- /dev/null
+++ b/COMMIT
@@ -0,0 +1 @@
+6ca1de91a1ee2604096449942bbed93e0ad9311e
\ No newline at end of file
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..ea3d558
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,376 @@
+This software is available to you under a choice of one of two
+licenses. You may choose to be licensed under the terms of the
+BSD license or the GNU General Public License (GPL) Version
+2, both included below.
+
+Copyright(c) 2016 Intel Corporation. All rights reserved.
+
+==================================================================
+ BSD Simplified License
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+==================================================================
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f0a539d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,511 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+
+OPTIONS =
+HISTORY = .outdirs
+HISTORIC_TARGETS = $(patsubst %, %_clean, $(shell cat $(HISTORY) 2> /dev/null))
+
+RPM_NAME := libpsm2
+
+SUBDIRS:= ptl_self ptl_ips ptl_am libuuid opa
+top_srcdir := $(shell readlink -m .)
+
+# Default locations
+OUTDIR := $(top_srcdir)/build_release
+MOCK_OUTDIR := $(top_srcdir)/build_mock
+DEBUG_OUTDIR := $(top_srcdir)/build_debug
+
+# We need a temporary test variable, as the OUTDIR macro
+# can be overriden by the shell and thus not run.
+TESTOUTDIR= $(shell readlink -m $(OUTDIR))
+ifeq ($(top_srcdir), $(TESTOUTDIR))
+$(error OUTDIR cannot be the same as your source folder ${top_srcdir})
+endif
+
+ifeq (/,$(TESTOUTDIR))
+$(error OUTDIR cannot be the / folder)
+endif
+
+# Forces any value to be full path.
+# We don't need to override MOCK_OUTDIR or DEBUG_OUTDIR
+# as they are recursive make invocations and use OUTDIR
+ifneq ($(MAKECMDGOALS), mock)
+ifneq ($(MAKECMDGOALS), debug)
+override OUTDIR := $(shell readlink -m $(OUTDIR))
+endif
+endif
+
+LINKER_SCRIPT_FILE := ${OUTDIR}/psm2_linker_script.map
+
+PSM2_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_LIB_MAJOR := $(shell printf "%d" ${PSM2_VERNO_MAJOR})
+PSM2_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM2_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h`)
+SOURCES_CHKSUM_FILES = Makefile buildflags.mak $(LINKER_SCRIPT_FILE) \
+ `find . -regex '\(.*\.h\|.*\.c\)' -not -path "./test/*" -not -path "./tools/*" -not -path "_revision.c" | sort`
+SOURCES_CHKSUM_VALUE = $(shell cat ${SOURCES_CHKSUM_FILES} | sha1sum | cut -d' ' -f 1)
+
+OPA_LIB_MAJOR := 4
+OPA_LIB_MINOR := 0
+
+export PSM2_VERNO_MAJOR
+export PSM2_LIB_MAJOR
+export PSM2_VERNO_MINOR
+export PSM2_LIB_MINOR
+export OPA_LIB_MAJOR
+export OPA_LIB_MINOR
+export CCARCH ?= gcc
+export FCARCH ?= gfortran
+
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+ifneq (x86_64,$(arch))
+ ifneq (i386,$(arch))
+ $(error Unsupported architecture $(arch))
+ endif
+endif
+
+ifndef LIBDIR
+ ifeq (${arch},x86_64)
+ INSTALL_LIB_TARG=/usr/lib64
+ else
+ INSTALL_LIB_TARG=/usr/lib
+ endif
+else
+ INSTALL_LIB_TARG=${LIBDIR}
+endif
+export DESTDIR
+export INSTALL_LIB_TARG
+
+TARGLIB := libpsm2
+COMPATMAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' \
+ $(top_srcdir)/psm2.h)
+COMPATLIB := libpsm_infinipath
+
+MAJOR := $(PSM2_LIB_MAJOR)
+MINOR := $(PSM2_LIB_MINOR)
+
+nthreads := $(shell echo $$(( `nproc` * 2 )) )
+
+# The following line sets the DISTRO variable to:
+# 'rhel' if the host is running RHEL.
+# 'suse' if the host is running SUSE.
+# 'fedora' if the host is running Fedora.
+# 'ubuntu' if the host is running Ubuntu.
+#
+# The DISTRO variable is used subsequently for variable
+# behaviors of the 3 distros.
+
+DISTRO := $(shell . /etc/os-release; echo $$ID)
+
+# By default the following two variables have the following values:
+LIBPSM2_COMPAT_CONF_DIR := /etc
+LIBPSM2_COMPAT_SYM_CONF_DIR := /etc
+# We can't set SPEC_FILE_RELEASE_DIST to an empty value, a space will result.
+# It then messes up sed operations for PSM_CUDA=1.
+# So leaving the commented out line here as documentation to NOT set it.
+# SPEC_FILE_RELEASE_DIST :=
+UDEV_40_PSM_RULES := %{_udevrulesdir}/40-psm.rules
+
+ifeq (fedora,$(DISTRO))
+ # On Fedora, we change these two variables to these values:
+ LIBPSM2_COMPAT_CONF_DIR := /usr/lib
+ LIBPSM2_COMPAT_SYM_CONF_DIR := %{_prefix}/lib
+ SPEC_FILE_RELEASE_DIST := %{?dist}
+ UDEV_40_PSM_RULES :=#
+else ifeq (rhel,${DISTRO})
+ # Insert code specific to RHEL here.
+else ifeq (sles,${DISTRO})
+ # Insert code specific to SLES here.
+endif
+
+ifdef PSM_CUDA
+#Value needs to be something without spaces or dashes '-'
+SPEC_FILE_RELEASE_DIST += cuda
+endif
+
+export LIBPSM2_COMPAT_CONF_DIR
+
+# The desired version number comes from the most recent tag starting with "v"
+ifeq (true, $(shell git rev-parse --is-inside-work-tree))
+ISGIT := 1 # Cache the result for later
+# Note, we don't define ISGIT if we are not in a git folder
+VERSION := $(shell git describe --tags --abbrev=0 --match='psm-v*' | sed -e 's/^psm-v//' -e 's/-/_/')
+else
+VERSION := version
+endif
+
+# If we have a file called 'rpm_release_extension' (as on github),
+# we take the release extension number from this file
+RELEASE_EXT := $(shell if [ -e rpm_release_extension ] ; then cat rpm_release_extension; fi)
+CURRENTSHA := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --pretty=format:'%h' -n 1; fi)
+RPMEXTHASH := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --pretty=format:'%h' -n 1 rpm_release_extension; fi)
+
+# On github, the last commit for each release should be the one to bump up
+# the release extension number in 'rpm_release_extension'. Further commits
+# are counted here and appended to the final rpm name to distinguish commits
+# present only on github
+NCOMMITS := $(shell if [ $(ISGIT) -a -f rpm_release_extension ] ; then git log --children $(RPMEXTHASH)..$(CURRENTSHA) . --pretty=oneline | wc -l; fi)
+
+# This logic should kick-in only on github
+ifdef RELEASE_EXT
+ifneq ($(CURRENTSHA), $(RPMEXTHASH))
+RELEASE := $(RELEASE_EXT)_$(NCOMMITS)
+endif
+endif
+
+# The desired release number comes the git describe following the version which
+# is the number of commits since the version tag was planted suffixed by the g<commitid>
+ifndef RELEASE
+RELTAG := "psm-v$(VERSION)"
+RELEASE := $(shell if [ -f rpm_release_extension ]; then cat rpm_release_extension;\
+                  elif [ $(ISGIT) ] ; then git rev-list $(RELTAG)..HEAD -- . | wc -l; \
+ else echo "release" ; fi)
+endif
+
+DIST_SHA := ${shell if [ $(ISGIT) ] ; then git log -n1 --pretty=format:%H .; \
+ else echo DIST_SHA ; fi}
+
+# Concatenated version and release
+ifndef VERSION_RELEASE_OVERRIDE
+VERSION_RELEASE := $(VERSION).$(RELEASE)
+else
+VERSION_RELEASE := ${VERSION_RELEASE_OVERRIDE}
+endif
+
+LDLIBS := -lrt -lpthread -ldl -lnuma ${EXTRA_LIBS}
+
+PKG_CONFIG ?= pkg-config
+
+UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
+ifndef UDEVDIR
+ UDEVDIR = /lib/udev
+endif
+
+export UDEVDIR
+
+# The DIST variable is a name kernel corresponding to:
+# 1. The name of the directory containing the source code distribution
+# (see dist: target below).
+# 2. The basename of the filename of the tar file created in the dist:
+# target.
+DIST := ${RPM_NAME}-${VERSION_RELEASE}
+
+# If user has empty RPM NAME BASEEXT (defined or not), then attempt to
+# see if we are running on SLES 12.3 or newer.
+# If we are, then change the base package name, but not the supporting
+# packages to libpsm2-2. Do note this requires support both in the Makefile
+# specfile target rule as well as changes in the libpsm2.spec.in
+# file as well.
+ifeq ($(RPM_NAME_BASEEXT),)
+# Detect current version of the OS
+OS := $(shell grep -m1 NAME /etc/os-release | cut -f 2 -d\")
+OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 -d.)
+OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.)
+
+override RPM_NAME_BASEEXT := $(shell \
+ if [ "$(OS)" = "SLES" ]; then \
+ if [ "$(OSVERSION)" \> "11" ]; then \
+ if [ "$(OSSUBVERSION)" \> "2" ]; then \
+ echo "-2"; \
+ fi \
+ fi \
+ fi)
+endif
+
+all: outdir symlinks
+ @if [ ! -e $(HISTORY) ] || [ -z "`grep -E '^$(OUTDIR)$$' $(HISTORY)`" ]; then \
+ echo $(OUTDIR) >> $(HISTORY); \
+ fi
+ @for subdir in $(SUBDIRS); do \
+ mkdir -p $(OUTDIR)/$$subdir; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir $(OPTIONS); \
+ done
+ $(MAKE) -j $(nthreads) OUTDIR=$(OUTDIR) $(OPTIONS) $(OUTDIR)/${TARGLIB}.so
+ @mkdir -p $(OUTDIR)/compat
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat $(OPTIONS)
+
+%_clean:
+	$(MAKE) OUTDIR=$* clean
+
+clean: linker_script_file_clean cleanlinks
+ rm -rf ${OUTDIR}
+ @if [ -e $(HISTORY) ]; then \
+ grep -v -E "^$(OUTDIR)$$" $(HISTORY) > $(HISTORY)_tmp; \
+ mv $(HISTORY)_tmp $(HISTORY); \
+ if [ "`wc -c $(HISTORY) | cut -d ' ' -f 1`" -eq 0 ]; then \
+ rm -f $(HISTORY); \
+ fi; \
+ fi
+
+mock: OUTDIR := $(MOCK_OUTDIR)
+mock: OPTIONS = PSM2_MOCK_TESTING=1
+mock:
+ $(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
+
+debug: OUTDIR := $(DEBUG_OUTDIR)
+debug: OPTIONS = PSM_DEBUG=1
+debug:
+ $(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
+
+test_clean:
+ if [ -d ./test ]; then \
+ $(MAKE) -C test clean; \
+ fi
+
+specfile_clean:
+ rm -f ${OUTDIR}/${RPM_NAME}.spec
+
+distclean: specfile_clean cleanlinks $(HISTORIC_TARGETS) test_clean
+ rm -rf ${OUTDIR}/${DIST}
+ rm -f ${OUTDIR}/${DIST}.tar.gz
+ rm -fr temp.*
+
+outdir:
+ mkdir -p ${OUTDIR}
+
+symlinks:
+ @test -L $(top_srcdir)/include/linux-x86_64 || \
+ ln -sf linux-i386 $(top_srcdir)/include/linux-x86_64
+
+cleanlinks:
+ rm -rf $(top_srcdir)/include/linux-x86_64
+
+install: all
+ for subdir in $(SUBDIRS) ; do \
+ mkdir -p $(OUTDIR)/$$subdir ; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir install ; \
+ done
+ $(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so OUTDIR=$(OUTDIR)
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat install
+ install -D $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR} \
+ ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ (cd ${DESTDIR}${INSTALL_LIB_TARG} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so)
+ install -m 0644 -D psm2.h ${DESTDIR}/usr/include/psm2.h
+ install -m 0644 -D psm2_mq.h ${DESTDIR}/usr/include/psm2_mq.h
+ install -m 0644 -D psm2_am.h ${DESTDIR}/usr/include/psm2_am.h
+ifneq (fedora,${DISTRO})
+ install -m 0644 -D 40-psm.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm.rules
+endif
+ # The following files and dirs were part of the noship rpm:
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag/linux-x86_64
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag/ptl_ips
+ install -m 0644 -D ptl_ips/ipserror.h ${DESTDIR}/usr/include/hfi1diag/ptl_ips/ipserror.h
+ install -m 0644 -D include/linux-x86_64/bit_ops.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/bit_ops.h
+ install -m 0644 -D include/linux-x86_64/sysdep.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/sysdep.h
+ install -m 0644 -D include/opa_udebug.h ${DESTDIR}/usr/include/hfi1diag/opa_udebug.h
+ install -m 0644 -D include/opa_debug.h ${DESTDIR}/usr/include/hfi1diag/opa_debug.h
+ install -m 0644 -D include/opa_intf.h ${DESTDIR}/usr/include/hfi1diag/opa_intf.h
+ install -m 0644 -D include/opa_user.h ${DESTDIR}/usr/include/hfi1diag/opa_user.h
+ install -m 0644 -D include/opa_service.h ${DESTDIR}/usr/include/hfi1diag/opa_service.h
+ install -m 0644 -D include/opa_common.h ${DESTDIR}/usr/include/hfi1diag/opa_common.h
+ install -m 0644 -D include/opa_byteorder.h ${DESTDIR}/usr/include/hfi1diag/opa_byteorder.h
+ install -m 0644 -D include/psm2_mock_testing.h ${DESTDIR}/usr/include/hfi1diag/psm2_mock_testing.h
+ install -m 0644 -D include/hfi1_deprecated.h ${DESTDIR}/usr/include/hfi1diag/hfi1_deprecated.h
+ install -m 0644 -D include/opa_revision.h ${DESTDIR}/usr/include/hfi1diag/opa_revision.h
+ install -m 0644 -D psmi_wrappers.h ${DESTDIR}/usr/include/hfi1diag/psmi_wrappers.h
+
+specfile: outdir specfile_clean
+ sed -e 's/@VERSION@/'${VERSION_RELEASE}'/g' libpsm2.spec.in | \
+ sed -e 's/@TARGLIB@/'${TARGLIB}'/g' \
+ -e 's/@RPM_NAME@/'${RPM_NAME}'/g' \
+ -e 's/@RPM_NAME_BASEEXT@/'${RPM_NAME_BASEEXT}'/g' \
+ -e 's/@COMPATLIB@/'${COMPATLIB}'/g' \
+ -e 's/@COMPATMAJOR@/'${COMPATMAJOR}'/g' \
+ -e 's;@UDEVDIR@;'${UDEVDIR}';g' \
+ -e 's/@MAJOR@/'${MAJOR}'/g' \
+ -e 's/@MINOR@/'${MINOR}'/g' \
+ -e 's:@LIBPSM2_COMPAT_CONF_DIR@:'${LIBPSM2_COMPAT_CONF_DIR}':g' \
+ -e 's:@LIBPSM2_COMPAT_SYM_CONF_DIR@:'${LIBPSM2_COMPAT_SYM_CONF_DIR}':g' \
+ -e 's;@SPEC_FILE_RELEASE_DIST@;'${SPEC_FILE_RELEASE_DIST}';g' \
+ -e 's/@DIST_SHA@/'${DIST_SHA}'/g' > \
+ ${OUTDIR}/${RPM_NAME}.spec
+ if [ -f /etc/redhat-release ] && [ `grep -o "[0-9.]*" /etc/redhat-release | cut -d"." -f1` -lt 7 ]; then \
+ sed -i 's;@40_PSM_RULES@;'${UDEVDIR}'/rules.d/40-psm.rules;g' ${OUTDIR}/${RPM_NAME}.spec; \
+ else \
+ sed -i 's;@40_PSM_RULES@;'${UDEV_40_PSM_RULES}';g' ${OUTDIR}/${RPM_NAME}.spec; \
+ fi
+
+# We can't totally prevent two make dist calls in a row from packaging
+# the previous make dist, unless we switch to using a dedicated ./src folder
+# That will come in the next major revision of the Makefile for now we can
+# prevent the easy and default cases
+dist: distclean
+ mkdir -p ${OUTDIR}/${DIST}
+ for x in $$(/usr/bin/find . \
+ -name ".git" -prune -o \
+ -name "cscope*" -prune -o \
+ -name "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" -prune -o \
+ -name "*.orig" -prune -o \
+ -name "*~" -prune -o \
+ -name "#*" -prune -o \
+ -name ".gitignore" -prune -o \
+ -name "doc" -prune -o \
+ -name "libcm" -prune -o \
+ -name "psm.supp" -prune -o \
+ -name "test" -prune -o \
+ -name "tools" -prune -o \
+ -name "artifacts" -prune -o \
+ -print); do \
+ dir=$$(dirname $$x); \
+ mkdir -p ${OUTDIR}/${DIST}/$$dir; \
+ [ ! -d $$x ] && cp $$x ${OUTDIR}/${DIST}/$$dir; \
+ done
+ if [ $(ISGIT) ] ; then git log -n1 --pretty=format:%H . > ${OUTDIR}/${DIST}/COMMIT ; fi
+ echo ${RELEASE} > ${OUTDIR}/${DIST}/rpm_release_extension
+ cd ${OUTDIR}; tar czvf ${DIST}.tar.gz ${DIST}
+ @echo "${DIST}.tar.gz is located in ${OUTDIR}/${DIST}.tar.gz"
+
+ofeddist:
+ $(MAKE) -j $(nthreads) dist
+
+# rebuild the cscope database, skipping sccs files, done once for
+# top level
+cscope:
+ find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \
+ -iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i -
+
+sources-checksum:
+ @echo ${SOURCES_CHKSUM_VALUE}
+
+${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \
+ ptl_am/am_reqrep.o \
+ ptl_am/ptl.o \
+ ptl_am/cmarwu.o \
+ ptl_am/am_cuda_memhandle_cache.o \
+ psm_context.o \
+ psm_ep.o \
+ psm_ep_connect.o \
+ psm_error.o \
+ psm_utils.o \
+ psm_sysbuf.o \
+ psm_timer.o \
+ psm_am.o \
+ psm_mq.o \
+ psm_mq_utils.o \
+ psm_mq_recv.o \
+ psm_mpool.o \
+ psm_stats.o \
+ psm_memcpy.o \
+ psm_mock.o \
+ psm.o \
+ psm_perf.o \
+ libuuid/psm_uuid.o \
+ libuuid/parse.o \
+ libuuid/pack.o \
+ libuuid/unpack.o \
+ libuuid/unparse.o \
+ ptl_ips/ptl.o \
+ ptl_ips/ptl_rcvthread.o \
+ ptl_ips/ipserror.o \
+ ptl_ips/ips_scb.o \
+ ptl_ips/ips_epstate.o \
+ ptl_ips/ips_recvq.o \
+ ptl_ips/ips_recvhdrq.o \
+ ptl_ips/ips_spio.o \
+ ptl_ips/ips_proto.o \
+ ptl_ips/ips_proto_recv.o \
+ ptl_ips/ips_proto_connect.o \
+ ptl_ips/ips_proto_expected.o \
+ ptl_ips/ips_tid.o \
+ ptl_ips/ips_tidcache.o \
+ ptl_ips/ips_tidflow.o \
+ ptl_ips/ips_crc32.o \
+ ptl_ips/ips_proto_dump.o \
+ ptl_ips/ips_proto_mq.o \
+ ptl_ips/ips_proto_am.o \
+ ptl_ips/ips_subcontext.o \
+ ptl_ips/ips_path_rec.o \
+ ptl_ips/ips_opp_path_rec.o \
+ ptl_ips/ips_writehdrq.o \
+ ptl_self/ptl.o \
+ opa/*.o \
+ psm_diags.o \
+ psmi_wrappers.o
+
+${TARGLIB}-objs := $(patsubst %.o, ${OUTDIR}/%.o, ${${TARGLIB}-objs})
+
+DEPS:= $(${TARGLIB}-objs:.o=.d)
+-include $(DEPS)
+
+${OUTDIR}/${TARGLIB}.so: ${OUTDIR}/${TARGLIB}.so.${MAJOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+${OUTDIR}/${TARGLIB}.so.${MAJOR}: ${OUTDIR}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+# when we build the shared library, generate a revision and date
+# string in it, for easier id'ing when people may have copied the
+# file around. Generate it such that the ident command can find it
+# and strings -a | grep OPA does a reasonable job as well.
+$(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} $(LINKER_SCRIPT_FILE)
+ echo "char psmi_hfi_IFS_version[]=\"`printenv RELEASE_TAG`\";" > ${OUTDIR}/_revision.c
+ date -u -d@$${SOURCE_DATE_EPOCH:-$$(date +%s)} +'char psmi_hfi_build_timestamp[] ="%F %T%:z";' >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_sources_checksum[] =\"${SOURCES_CHKSUM_VALUE}\";" >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_git_checksum[] =\"`git rev-parse HEAD`\";" >> ${OUTDIR}/_revision.c
+ $(CC) -c $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
+ $(CC) $(LINKER_SCRIPT) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \
+ ${${TARGLIB}-objs} $(OUTDIR)/_revision.o -Lopa $(LDLIBS)
+
+${OUTDIR}/%.o: ${top_srcdir}/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+
+$(LINKER_SCRIPT_FILE): psm2_linker_script_map.in
+ sed "s/_psm2_additional_globals_;/$(PSM2_ADDITIONAL_GLOBALS)/" \
+ psm2_linker_script_map.in > ${OUTDIR}/psm2_linker_script.map
+
+linker_script_file_clean:
+ rm -f $(LINKER_SCRIPT_FILE)
diff --git a/README b/README
new file mode 100644
index 0000000..e74c865
--- /dev/null
+++ b/README
@@ -0,0 +1,300 @@
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ Copyright (c) 2003-2017 Intel Corporation. All rights reserved.
+
+================================================================================
+
+ABSTRACT
+--------
+
+Discusses how to build, install and test the PSM2 library source code.
+
+Contains the following sections:
+
+- INTRODUCTION
+- DEPENDENCIES
+- BUILDING
+ * BUILDING USING MAKEFILE
+ * BUILDING USING RPMBUILD (CREATING SOURCE AND BINARY RPM'S)
+- INSTALLING
+ * INSTALLING USING MAKEFILE
+ * INSTALLING USING EITHER YUM OR DNF
+- RELATED SOFTWARE TO PSM2
+- SUPPORTING DOCUMENTATION
+
+INTRODUCTION
+============
+
+This README file discusses how to build, install and test the PSM2 library
+source code.
+
+The PSM2 library supports a number of fabric media and stacks, and all of
+them run on version 7.X of Red Hat Enterprise Linux (abbreviated: RHEL), and
+SuSE SLES.
+
+Only the x86_64 architecture is supported.
+
+Building PSM2 is possible on RHEL 7.2+ as it ships with hfi1 kernel driver.
+For older RHEL 7.x versions and SuSE SLES, OPA is not natively supported
+in the kernel and therefore, building PSM2 is not possible unless
+you have the correct kernel-devel package or use latest versions of IFS.
+
+There are two mechanisms for building and installing the PSM2 library:
+
+ 1. Use provided Makefiles to build and install or
+ 2. Generate the *.rpm files which you can then install using either
+ yum or dnf command
+
+DEPENDENCIES
+============
+
+The following packages are required to build the PSM2 library source code:
+(all packages are for the x86_64 architecture)
+
+compat-rdma-devel
+gcc-4.8.2
+glibc-devel
+glibc-headers
+kernel-headers
+
+Additional packages for GPU Direct support include:
+NVIDIA CUDA toolkit 8.0 or greater. Older versions are not supported.
+
+In addition to depending on these packages, root privileges are required to
+install the runtime libraries and development header files into standard
+system location.
+
+BUILDING
+========
+
+The instructions below use $BASENAME, $PRODUCT and $RELEASE to refer to
+the base name of the tarball, RPM that will be generated and the product
+and release identifiers of the RPM.
+
+The base name of the RPM changes depending on which version/branch
+of code you derive the tar file from.
+
+Up until v10.2 of PSM2, the base name for the RPM is hfi1-psm.
+From v10.2 onwards, the base name will be libpsm2. The internal
+library remains unchanged and is still libpsm2.so.2.
+
+BUILDING USING MAKEFILES
+------------------------
+
+1. Untar the tarball:
+ $ tar zxvf $BASENAME-$PRODUCT-$RELEASE.tar.gz
+2. Change directory into the untarred location:
+ $ cd $BASENAME-$PRODUCT-$RELEASE
+3. Run make on the command line. This will build the PSM2 library.
+ $ make
+ 3.1. Optionally to build PSM2 library with GPU Direct support, Run make
+ PSM_CUDA=1 instead of make on the command line.
+ $ make PSM_CUDA=1
+
+BUILDING USING RPMBUILD
+-----------------------
+
+1. Run this command from your $PWD to generate rpm, srpm files
+ $ ./makesrpm.sh a
+
+ This command results in the following collection of rpm's and source
+ code rpm's under your $PWD/temp.X/ directory.
+ ("X" is the pid of the bash script that created the srpm and rpm files)
+ (Result shown here for RHEL systems.)
+
+ RPMS/x86_64/libpsm2-compat-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-debuginfo-10.3.7-1x86_64.rpm
+ SRPMS/libpsm2-10.3.7-1.src.rpm
+
+ 1.1. Optionally for GPU Direct support run this command from your $PWD to
+ generate rpm, srpm files
+ $ ./makesrpm.sh a -cuda
+
+ This command results in the following collection of rpm's and source code
+ rpm's under your $PWD/temp.X/ directory. ("X" is the pid of the bash
+ script that created the srpm and rpm files):
+ RPMS/x86_64/libpsm2-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-compat-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1cuda.x86_64.rpm
+ SRPMS/x86_64/libpsm2-10.3.7-1cuda.src.rpm
+
+ On systems with SLES 12.3 or newer, the package name for the base libpsm2
+ RPM will be:
+ libpsm2-2-10.3.7-1.x86_64.rpm
+
+ Other supporting RPM package names will be as listed above.
+
+INSTALLING
+==========
+
+INSTALLING USING MAKEFILE
+-------------------------
+
+Install the libraries and header files on the system (as root):
+ $ make install
+
+The libraries will be installed in /usr/lib64, and the header files will
+be installed in /usr/include.
+
+This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on
+the "make install" command line. "DESTDIR" will add a leading path component
+to the overall install path and "LIBDIR" will change the path where libraries
+will be installed. For example, "make DESTDIR=/tmp/psm-install install" will
+install all files (libraries and headers) into "/tmp/psm-install/usr/...",
+"make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the
+libraries in "/tmp/psm-install/libraries" and the headers in
+"/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will
+install the libraries in "/tmp/libs" and the headers in "/usr/include".
+
+
+INSTALLING USING EITHER YUM OR DNF
+----------------------------------
+
+You can install the rpm's and source rpm's previously built using rpmbuild using
+either the yum or dnf command as the root user. See the appropriate man page for
+details of installing rpm's.
+
+Note: It is also possible to use rpm command to install rpm's, but it is recommended
+that one use yum/dnf as rpm tool has issues with name changes and obsoletes tags.
+yum or dnf should be better able to resolve dependency issues.
+
+RELATED SOFTWARE TO PSM2
+========================
+
+MPI Libraries supported
+-----------------------
+A large number of open source (OpenMPI, MVAPICH2) and Vendor MPI
+implementations support PSM2 for optimized communication on HCAs. Vendor MPI
+implementations (HP-MPI, Intel MPI 4.0 with PMI, Platform/Scali MPI)
+require that the PSM2 runtime libraries be installed and available on
+each node. Usually a configuration file or a command line switch to mpirun
+needs to be specified to utilize the PSM2 transport.
+
+OpenMPI support
+---------------
+It is recommended to use the v1.10.4 or newer version of OpenMPI.
+Prior versions of OpenMPI have an issue with supporting PSM2 network transports
+mixed with standard Verbs transport (BTL openib). This prevents an OpenMPI
+installation with network modules available for PSM2 and Verbs to work
+correctly on nodes with no HFI hardware. This has been fixed in the
+latest development branch allowing a single OpenMPI installation to target
+HFI hardware via PSM2 or Verbs as well as alternate transports seamlessly.
+
+If NVIDIA CUDA support is desired, you can use the OpenMPI build
+(v1.10.4-cuda-hfi) provided by Intel in the IFS installer v10.4.X or newer.
+The changes have also been accepted into v3.0.x branch of upstream OpenMPI
+repository. Therefore subsequent v3.0.x versions of OpenMPI should carry the
+required OpenMPI support for PSM2 GPUDirect feature.
+
+PSM2 header and runtime files need to be installed on a node where the OpenMPI
+build is performed. All compute nodes additionally should have the PSM2 runtime
+libraries available on them. OpenMPI provides a standard configure, make and
+make install mechanism which will detect and build the relevant PSM2 network
+modules for OpenMPI once the header and runtime files are detected.
+
+MVAPICH2 support
+----------------
+MVAPICH2 supports PSM2 transport for optimized communication on HFI hardware.
+OPA IFS supports MVAPICH2 v2.1 (or later). PSM2 header and runtime files
+need to be installed on a node where MVAPICH2 builds are performed. All
+compute nodes should also have the PSM2 runtime libraries available on them.
+
+For building and installing MVAPICH2 with OPA support, refer to MVAPICH2
+user guides here:
+http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.2rc1-userguide.html
+
+(Note: Support for PSM2 is currently on v2.2rc1 of OSU MVAPICH2 code base.
+The above link might change when a stable v2.2 is released.)
+
+OFED Support
+------------
+Intel OPA is not yet included within OFED. But the hfi1 driver is available
+publicly at kernel.org. Please do pull the driver from either kernel.org or
+the github page for opa-hfi1 driver (https://github.com/01org/opa-hfi1)
+
+SUPPORTING DOCUMENTATION
+------------------------
+PSM2 Programmer's Guide is published along with documentation for "Intel® Omni-Path
+Host Fabric Interface PCIe Adapter 100 Series"
+(http://www.intel.com/content/www/us/en/support/network-and-i-o/fabric-products/000016242.html)
+
+Refer to this document for description on APIs and environment variables that
+are available for use. For sample code on writing applications leveraging the
+PSM2 APIs, refer to Section 5.
+
+Link to latest (as of Sep 2017) PSM2 Programmer's Guide:
+https://www.intel.com/content/dam/support/us/en/documents/network-and-i-o/fabric-products/Intel_PSM2_PG_H76473_v7_0.pdf
+
+PSM Compatibility Support
+-------------------------
+
+libpsm2-compat supports applications that use the PSM API instead of
+the PSM2 API, through a compatibility library. This library is an interface
+between PSM applications and the PSM2 API.
+
+If the system has an application that is coded to use PSM and has requirements
+to use PSM2 (i.e. the host has Omni-Path hardware), the compatibility library
+must be used.
+
+Please refer to your operating system's documentation to find how to modify the
+order in which system directories are searched for dynamic libraries. The
+libpsm2-compat version of libpsm_infinipath.so.1 must be earlier on the search
+path than that of libpsm_infinipath. Doing so allows applications coded to PSM
+to transparently use the PSM2 API and devices which require it.
+
+Please note that the installation path for the libpsm2-compat version of
+libpsm_infinipath.so.1 will differ depending on your operating system
+specifics. Common locations include:
+- /usr/lib64/psm2-compat/
+- /usr/lib/psm2-compat/
+
diff --git a/buildflags.mak b/buildflags.mak
new file mode 100644
index 0000000..78efd70
--- /dev/null
+++ b/buildflags.mak
@@ -0,0 +1,210 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2016 Intel Corporation. All rights reserved.
+#
+
+# set top_srcdir and include this file
+
+ifeq (,$(top_srcdir))
+$(error top_srcdir must be set to include makefile fragment)
+endif
+
+export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]')
+export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,')
+
+ifeq (${CCARCH},gcc)
+ export CC := gcc
+else
+ ifeq (${CCARCH},gcc4)
+ export CC := gcc4
+ else
+ ifeq (${CCARCH},icc)
+ export CC := icc
+ else
+ anerr := $(error Unknown C compiler arch: ${CCARCH})
+ endif # ICC
+ endif # gcc4
+endif # gcc
+
+ifeq (${FCARCH},gfortran)
+ export FC := gfortran
+else
+ anerr := $(error Unknown Fortran compiler arch: ${FCARCH})
+endif # gfortran
+
+BASECFLAGS += $(BASE_FLAGS)
+LDFLAGS += $(BASE_FLAGS)
+ASFLAGS += $(BASE_FLAGS)
+
+ifeq ($(PSM2_MOCK_TESTING),1)
+BASECFLAGS += -DPSM2_MOCK_TESTING=1
+# we skip the linker script for testing version, we want all symbols to be
+# reachable from outside the library
+else
+LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
+endif
+
+WERROR := -Werror
+INCLUDES := -I. -I$(top_srcdir)/include -I$(top_srcdir)/mpspawn -I$(top_srcdir)/include/$(os)-$(arch)
+
+#
+# use IFS provided hfi1_user.h if installed.
+#
+IFS_HFI_HEADER_PATH := /usr/include/uapi
+INCLUDES += -I${IFS_HFI_HEADER_PATH}
+
+BASECFLAGS +=-Wall $(WERROR)
+
+#
+# test if compiler supports SSE4.2 (needed for crc32 instruction)
+#
+RET := $(shell echo "int main() {}" | ${CC} -msse4.2 -E -dM -xc - 2>&1 | grep -q SSE4_2 ; echo $$?)
+ifeq (0,${RET})
+ BASECFLAGS += -msse4.2
+else
+ $(error SSE4.2 compiler support required )
+endif
+
+#
+# test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction.
+#
+ifneq (,${PSM_AVX})
+ ifeq (${CC},icc)
+ MAVX2=-march=core-avx2
+ else
+ MAVX2=-mavx2
+ endif
+ RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
+ ifeq (0,${RET})
+ TMPVAR := $(BASECFLAGS)
+ BASECFLAGS := $(filter-out -msse4.2,$(TMPVAR))
+ BASECFLAGS += ${MAVX2}
+ endif
+
+ ifneq (icc,${CC})
+ RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?)
+ ifeq (0,${RET})
+ BASECFLAGS += -mavx512f
+ endif
+ endif
+endif
+
+#
+# feature test macros for drand48_r
+#
+BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE
+
+ifneq (,${HFI_BRAKE_DEBUG})
+ BASECFLAGS += -DHFI_BRAKE_DEBUG
+endif
+ifneq (,${PSM_DEBUG})
+ BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2
+else
+ BASECFLAGS += -O3 -g3
+endif
+ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting
+ BASECFLAGS += -O -fprofile-arcs -ftest-coverage
+ LDFLAGS += -fprofile-arcs
+endif
+ifneq (,${PSM_LOG})
+ BASECFLAGS += -DPSM_LOG
+ifneq (,${PSM_LOG_FAST_IO})
+ BASECFLAGS += -DPSM_LOG_FAST_IO
+ PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message;
+endif
+endif
+ifneq (,${PSM_PERF})
+ BASECFLAGS += -DRDPMC_PERF_FRAMEWORK
+endif
+ifneq (,${PSM_HEAP_DEBUG})
+ BASECFLAGS += -DPSM_HEAP_DEBUG
+endif
+ifneq (,${PSM_PROFILE})
+ BASECFLAGS += -DPSM_PROFILE
+endif
+ifneq (,${PSM_CUDA})
+ BASECFLAGS += -DNVIDIA_GPU_DIRECT -DPSM_CUDA
+ CUDA_HOME ?= /usr/local/cuda
+ INCLUDES += -I$(CUDA_HOME)/include
+endif
+
+BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE
+
+ifeq (${CCARCH},gcc)
+ BASECFLAGS += -funwind-tables
+endif
+
+ifneq (,${PSM_VALGRIND})
+ CFLAGS += -DPSM_VALGRIND
+else
+ CFLAGS += -DNVALGRIND
+endif
+
+ASFLAGS += -g3 -fpic
+
+BASECFLAGS += ${OPA_CFLAGS}
+
+ifeq (${CCARCH},icc)
+ BASECFLAGS += -O3 -g3 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed,
+ CFLAGS += $(BASECFLAGS)
+ LDFLAGS += -static-intel
+else
+ ifeq (${CCARCH},gcc)
+ CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing -Wformat-security
+ else
+ ifeq (${CCARCH},gcc4)
+ CFLAGS += $(BASECFLAGS)
+ else
+ $(error Unknown compiler arch "${CCARCH}")
+ endif # gcc4
+ endif # gcc
+endif # icc
+
diff --git a/compat/40-psm-compat.rules b/compat/40-psm-compat.rules
new file mode 100644
index 0000000..fc7c4b1
--- /dev/null
+++ b/compat/40-psm-compat.rules
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+KERNEL=="hfi1", SYMLINK+="ipath"
+KERNEL=="hfi1_[0-9]", MODE="0666", SYMLINK+="ipath"
diff --git a/compat/Makefile b/compat/Makefile
new file mode 100644
index 0000000..092775f
--- /dev/null
+++ b/compat/Makefile
@@ -0,0 +1,90 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+OUTDIR = .
+
+COMPATLIB := libpsm_infinipath
+COMPAT_LIB_TARG := $(INSTALL_LIB_TARG)/psm2-compat
+compat_build_dir := $(shell readlink -m .)
+
+MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' ../psm2.h)
+
+top_srcdir := $(compat_build_dir)/..
+include $(compat_build_dir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+${COMPATLIB}-objs := psm-compat.o
+${COMPATLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${COMPATLIB}-objs})
+
+DEPS:= $(${COMPATLIB}-objs:.o=.d)
+-include $(DEPS)
+
+all .DEFAULT: ${${COMPATLIB}-objs} $(OUTDIR)/${COMPATLIB}.so.${MAJOR}
+
+install: all
+ install -m 0644 -D 40-psm-compat.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm-compat.rules
+ install -m 0644 -D libpsm2-compat.conf ${DESTDIR}${LIBPSM2_COMPAT_CONF_DIR}/modprobe.d/libpsm2-compat.conf
+ install -m 0755 -D libpsm2-compat.cmds ${DESTDIR}/usr/lib/libpsm2/libpsm2-compat.cmds
+ install -D $(OUTDIR)/${COMPATLIB}.so.${MAJOR} ${DESTDIR}${COMPAT_LIB_TARG}/${COMPATLIB}.so.${MAJOR}
+
+$(OUTDIR)/%.o: $(compat_build_dir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+
+$(OUTDIR)/${COMPATLIB}.so.${MAJOR}: ${${COMPATLIB}-objs}
+ $(CC) $(BASECFLAGS) $(LINKER_SCRIPT) $(LDFLAGS) -Wl,-soname=${COMPATLIB}.so.${MAJOR} -shared \
+ -L$(OUTDIR)/.. ${${COMPATLIB}-objs} -lpsm2 -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno ${COMPATLIB}.*; \
+ cd -; \
+ fi
diff --git a/compat/buildflags.mak b/compat/buildflags.mak
new file mode 100644
index 0000000..c677989
--- /dev/null
+++ b/compat/buildflags.mak
@@ -0,0 +1,103 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+ifeq (,$(top_srcdir))
+$(error top_srcdir must be set to include makefile fragment)
+endif
+
+export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]')
+export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,')
+export CCARCH ?= gcc
+
+ifeq (${CCARCH},gcc)
+ export CC := gcc
+else
+ ifeq (${CCARCH},gcc4)
+ export CC := gcc4
+ else
+ ifeq (${CCARCH},icc)
+ export CC := icc
+ else
+ anerr := $(error Unknown C compiler arch: ${CCARCH})
+ endif # ICC
+ endif # gcc4
+endif # gcc
+
+BASECFLAGS += $(BASE_FLAGS)
+LDFLAGS += $(BASE_FLAGS)
+ASFLAGS += $(BASE_FLAGS)
+
+LINKER_SCRIPT_FILE := psm2_compat_linker_script.map
+LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
+WERROR := -Werror
+INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn
+
+BASECFLAGS +=-Wall $(WERROR)
+
+BASECFLAGS += -fpic -fPIC
+
+ASFLAGS += -g3 -fpic
+
+ifeq (${CCARCH},icc)
+ BASECFLAGS += -O3 -g3
+ CFLAGS += $(BASECFLAGS)
+ LDFLAGS += -static-intel
+else
+ ifeq (${CCARCH},gcc)
+ CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing
+ else
+ ifeq (${CCARCH},gcc4)
+ CFLAGS += $(BASECFLAGS)
+ else
+ $(error Unknown compiler arch "${CCARCH}")
+ endif
+ endif
+endif
diff --git a/compat/libpsm2-compat.cmds b/compat/libpsm2-compat.cmds
new file mode 100755
index 0000000..dcead1e
--- /dev/null
+++ b/compat/libpsm2-compat.cmds
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# This script was created to allow for both an hfi1 and qib adapter
+# to co-exist on the same machine.
+# The symlink from /dev/ipath is removed to allow ib_qib to load
+# correctly and create a proper device file.
+
+case "$1" in
+start)
+ # Remove symlink if hfi1 was loaded first
+ if [ -L "/dev/ipath" ]; then
+ rm /dev/ipath
+ fi
+ ;;
+stop)
+ # Restore symlink if hfi1 is loaded
+ if [ -f "/dev/hfi1" ] && ! [ -L "/dev/ipath" ]; then
+ ln -s /dev/hfi1 /dev/ipath
+ fi
+ ;;
+esac
diff --git a/compat/libpsm2-compat.conf b/compat/libpsm2-compat.conf
new file mode 100644
index 0000000..d71e8f2
--- /dev/null
+++ b/compat/libpsm2-compat.conf
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+install ib_qib /usr/lib/libpsm2/libpsm2-compat.cmds start; modprobe -i ib_qib $CMDLINE_OPTS
+remove ib_qib modprobe -r -i ib_qib && /usr/lib/libpsm2/libpsm2-compat.cmds stop
diff --git a/compat/psm-compat.c b/compat/psm-compat.c
new file mode 100644
index 0000000..4309e02
--- /dev/null
+++ b/compat/psm-compat.c
@@ -0,0 +1,335 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "../psm2.h"
+#include "../psm2_mq.h"
+#include "../psm2_am.h"
+
+/* Functions from TS psm.h */
+psm2_error_t
+psm_init(int *major, int *minor)
+{
+ return psm2_init(major, minor);
+}
+
+psm2_error_t
+psm_finalize(void)
+{
+ return psm2_finalize();
+}
+
+psm2_error_t
+psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames)
+{
+ return psm2_map_nid_hostname(num, nids, hostnames);
+}
+
+void
+psm_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label)
+{
+ return psm2_epaddr_setlabel(epaddr, epaddr_label);
+}
+
+void
+psm_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt)
+{
+ psm2_epaddr_setctxt(epaddr, ctxt);
+}
+
+void *
+psm_epaddr_getctxt(psm2_epaddr_t epaddr)
+{
+ return psm2_epaddr_getctxt(epaddr);
+}
+
+psm2_error_t
+psm_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen)
+{
+ return psm2_setopt(component, component_obj,
+ optname, optval, optlen);
+}
+
+psm2_error_t
+psm_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen)
+{
+ return psm2_getopt(component, component_obj,
+ optname, optval, optlen);
+}
+
+psm2_error_t
+psm_poll(psm2_ep_t ep)
+{
+ return psm2_poll(ep);
+}
+
+void
+psm_uuid_generate(psm2_uuid_t uuid_out)
+{
+ psm2_uuid_generate(uuid_out);
+}
+
+/* Functions from TS psm_am.h */
+psm2_error_t
+psm_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx)
+{
+ return psm2_am_register_handlers(ep, handlers, num_handlers, handlers_idx);
+}
+
+psm2_error_t
+psm_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ return psm2_am_request_short(epaddr, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt);
+}
+
+psm2_error_t
+psm_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ return psm2_am_reply_short(token, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt);
+}
+
+psm2_error_t
+psm_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out)
+{
+ return psm2_am_get_parameters(ep, parameters, sizeof_parameters_in, sizeof_parameters_out);
+}
+
+
+/* Functions from TS psm_error.h */
+
+psm2_error_t
+psm_error_defer(psm2_error_token_t token)
+{
+ return psm2_error_defer(token);
+}
+
+psm2_error_t
+psm_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler)
+{
+ return psm2_error_register_handler(ep, errhandler);
+}
+
+const char *
+psm_error_get_string(psm2_error_t error)
+{
+ return psm2_error_get_string(error);
+}
+
+/* Functions from TS psm_mq.h */
+psm2_error_t
+psm_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, psm2_mq_status_t *status)
+{
+ return psm2_mq_iprobe(mq, tag, tagsel, status);
+}
+
+psm2_error_t
+psm_mq_cancel(psm2_mq_req_t *ireq)
+{
+ return psm2_mq_cancel(ireq);
+}
+
+psm2_error_t
+psm_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ return psm2_mq_wait(ireq, status);
+}
+
+psm2_error_t
+psm_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ return psm2_mq_test(ireq, status);
+}
+
+psm2_error_t
+psm_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req)
+{
+ return psm2_mq_isend(mq, dest, flags, stag, buf, len, context, req);
+}
+
+psm2_error_t
+psm_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len)
+{
+ return psm2_mq_send(mq, dest, flags, stag, buf, len);
+}
+
+psm2_error_t
+psm_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo)
+{
+ return psm2_mq_irecv(mq, tag, tagsel, flags, buf, len, context, reqo);
+}
+
+psm2_error_t
+psm_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
+{
+ return psm2_mq_ipeek(mq, oreq, status);
+}
+
+psm2_error_t
+psm_mq_getopt(psm2_mq_t mq, int key, void *value)
+{
+ return psm2_mq_getopt(mq, key, value);
+}
+
+psm2_error_t
+psm_mq_setopt(psm2_mq_t mq, int key, const void *value)
+{
+ return psm2_mq_setopt(mq, key, value);
+}
+
+psm2_error_t
+psm_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts,
+ int numopts, psm2_mq_t *mqo)
+{
+ return psm2_mq_init(ep, tag_order_mask, opts, numopts, mqo);
+}
+
+psm2_error_t
+psm_mq_finalize(psm2_mq_t mq)
+{
+ return psm2_mq_finalize(mq);
+}
+
+void
+psm_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats)
+{
+ psm2_mq_get_stats(mq, stats);
+}
+
+/* Functions from TS psm_mq.h */
+psm2_error_t
+psm_ep_num_devunits(uint32_t *num_units_o)
+{
+ return psm2_ep_num_devunits(num_units_o);
+}
+
+uint64_t
+psm_epid_nid(psm2_epid_t epid)
+{
+ return psm2_epid_nid(epid);
+}
+
+uint64_t
+psm_epid_context(psm2_epid_t epid)
+{
+ return psm2_epid_context(epid);
+}
+
+uint64_t
+psm_epid_port(psm2_epid_t epid)
+{
+ return psm2_epid_port(epid);
+}
+
+psm2_error_t
+psm_ep_query (int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
+{
+ return psm2_ep_query (num_of_epinfo, array_of_epinfo);
+}
+
+psm2_error_t
+psm_ep_epid_lookup (psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+ return psm2_ep_epid_lookup (epid, epconn);
+}
+
+psm2_error_t
+psm_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+ return psm2_ep_epid_share_memory(ep, epid, result_o);
+}
+
+psm2_error_t
+psm_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+ return psm2_ep_open_opts_get_defaults(opts);
+}
+
+psm2_error_t
+psm_ep_open(psm2_uuid_t const unique_job_key, struct psm2_ep_open_opts const *opts_i,
+ psm2_ep_t *epo, psm2_epid_t *epido)
+{
+ return psm2_ep_open(unique_job_key, opts_i, epo, epido);
+}
+
+psm2_error_t
+psm_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+ return psm2_ep_close(ep, mode, timeout_in);
+}
+
+psm2_error_t
+psm_ep_connect(psm2_ep_t ep, int num_of_epid,
+ psm2_epid_t const *array_of_epid,
+ int const *array_of_epid_mask,
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr,
+ int64_t timeout)
+{
+ return psm2_ep_connect(ep, num_of_epid, array_of_epid, array_of_epid_mask,
+ array_of_errors, array_of_epaddr, timeout);
+}
diff --git a/compat/psm2_compat_linker_script.map b/compat/psm2_compat_linker_script.map
new file mode 100644
index 0000000..0933c68
--- /dev/null
+++ b/compat/psm2_compat_linker_script.map
@@ -0,0 +1,66 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm_*;
+
+ /* Make all other symbols local */
+ local:
+ *;
+};
diff --git a/include/common_defines.h b/include/common_defines.h
new file mode 100644
index 0000000..b244464
--- /dev/null
+++ b/include/common_defines.h
@@ -0,0 +1,176 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef COMMON_DEFINES_H
+#define COMMON_DEFINES_H
+
+/* TESTING being defined flips a couple of switches so that a testable version
+ * of libpsm2.so is built. It'll make properly annotated static functions be
+ * non-static, visible to the outside. Also, all mockable functions will be
+ * replaced with function pointers which will originally point to the actual
+ * implementation. However, those function pointers might be reset by the test
+ * code, thus allowing for mocking selected PSM2 functions for the purpose of
+ * the test.
+ *
+ * So far the following utilities have been introduced for enabling a
+ * conditional compilation of the testable vs. production version of the library:
+ * - ustatic: toggles function visibility
+ * - MOCKABLE(): decorates function name so that it is visible after being mocked
+ * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ * for mocking a function
+ * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ * for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ * printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
+#ifndef TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef TESTING */
+
+/* For the testable version, all _ustatic_ function will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function happens.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function happens.
+ */
+#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_;
+
+#endif /* ndef TESTING */
+
+#endif /* COMMON_DEFINES_H */
+
diff --git a/include/hfi1_deprecated.h b/include/hfi1_deprecated.h
new file mode 100644
index 0000000..36fd31f
--- /dev/null
+++ b/include/hfi1_deprecated.h
@@ -0,0 +1,181 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+
+ hfi1_deprecated.h
+
+ Contains certain features of the hfi1 module that have been deprecated.
+
+ These features may still need to be supported by the psm library for
+ reasons of backwards compatibility.
+ */
+
+#ifndef __HFI1_DEPRECATED_H__
+
+#define __HFI1_DEPRECATED_H__
+
+/* First, include the current hfi1_user.h file: */
+
+#include <rdma/hfi/hfi1_user.h>
+
+/* Determine if we need to define and declare deprecated
+ entities based on the IB_IOCTL_MAGIC macro. */
+
+#if defined( IB_IOCTL_MAGIC )
+
+/* The macro: PSM2_SUPPORT_IW_CMD_API is used to stipulate
+ adding compile-time support of either the ioctl() or write()
+ command interfaces to the driver. Note though that the
+ final decision whether to support this depends on factors
+ only known at runtime. */
+#define PSM2_SUPPORT_IW_CMD_API 1
+/* IOCTL_CMD_API_MODULE_MAJOR defines the first version of the hfi1
+ * module that supports the ioctl() command interface. Prior to this
+ * (IOCTL_CMD_API_MODULE_MAJOR - 1 and smaller), the module used
+ * write() for the command interface. */
+#define IOCTL_CMD_API_MODULE_MAJOR 6
+
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread the job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define HFI1_ALG_ACROSS_DEP 0
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define HFI1_ALG_WITHIN_DEP 1
+
+struct hfi1_cmd_deprecated {
+ __u32 type; /* command type */
+ __u32 len; /* length of struct pointed to by addr */
+ __u64 addr; /* pointer to user structure */
+};
+
+#define hfi1_cmd hfi1_cmd_deprecated
+
+#define HFI1_ALG_ACROSS HFI1_ALG_ACROSS_DEP
+#define HFI1_ALG_WITHIN HFI1_ALG_WITHIN_DEP
+
+#else
+
+#define HFI1_SWMAJOR_SHIFT 16
+
+#endif /* defined( IB_IOCTL_MAGIC )*/
+
+#define HFI1_ALG_ACROSS_ALL_DEP 2
+#define HFI1_ALG_ACROSS_ALL HFI1_ALG_ACROSS_ALL_DEP
+
+/* Note that struct hfi1_user_info_dep declaration is identical to
+ the struct hfi1_user_info declaration from MAJOR version 5 of the
+ hfi1_user.h file. */
+struct hfi1_user_info_dep {
+ /*
+ * version of user software, to detect compatibility issues.
+ * Should be set to HFI1_USER_SWVERSION.
+ */
+ __u32 userversion;
+ __u16 pad;
+ /* HFI selection algorithm, if unit has not selected */
+ __u16 hfi1_alg;
+ /*
+ * If two or more processes wish to share a context, each process
+ * must set the subcontext_cnt and subcontext_id to the same
+ * values. The only restriction on the subcontext_id is that
+ * it be unique for a given node.
+ */
+ __u16 subctxt_cnt;
+ __u16 subctxt_id;
+ /* 128bit UUID passed in by PSM. */
+ __u8 uuid[16];
+};
+
+/*
+ * We assume here that we have the hfi1_user.h file installed in the system path
+ * with the 'flags' field defined in struct sdma_req_info. (At least, when the
+ * user needs to run GPU workloads, this _should_ be the version of hfi1_user.h
+ * file installed by the IFS.)
+ */
+struct sdma_req_info_v6_3 {
+ /*
+ * bits 0-3 - version (currently unused)
+ * bits 4-7 - opcode (enum sdma_req_opcode)
+ * bits 8-15 - io vector count
+ */
+ __u16 ctrl;
+ /*
+ * Number of fragments contained in this request.
+ * User-space has already computed how many
+ * fragment-sized packet the user buffer will be
+ * split into.
+ */
+ __u16 npkts;
+ /*
+ * Size of each fragment the user buffer will be
+ * split into.
+ */
+ __u16 fragsize;
+ /*
+ * Index of the slot in the SDMA completion ring
+ * this request should be using. User-space is
+ * in charge of managing its own ring.
+ */
+ __u16 comp_idx;
+} __attribute__((packed));
+
+#endif /* #ifndef __HFI1_DEPRECATED_H__ */
diff --git a/include/linux-i386/bit_ops.h b/include/linux-i386/bit_ops.h
new file mode 100644
index 0000000..d272e75
--- /dev/null
+++ b/include/linux-i386/bit_ops.h
@@ -0,0 +1,98 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_BIT_OPS_H
+#define _HFI_i386_BIT_OPS_H
+
+static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit),
+ "=m"(*addr) : "dIr"(nr) : "memory");
+ return oldbit;
+}
+
+static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips___test_and_set_bit(int nr,
+ volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit),
+ "=m"(*addr) : "dIr"(nr) : "memory");
+ return oldbit;
+}
+
+#endif /* _HFI_i386_BIT_OPS_H */
diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
new file mode 100644
index 0000000..bfd5746
--- /dev/null
+++ b/include/linux-i386/sysdep.h
@@ -0,0 +1,171 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_SYSDEP_H
+#define _HFI_i386_SYSDEP_H
+
+typedef struct cpuid {
+ unsigned eax, ebx, ecx, edx;
+} cpuid_t;
+
+static __inline__ void
+get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id)
+{
+ unsigned a, b, c, d;
+
+ asm (" \
+ mov %4, %%eax \n\
+ mov %5, %%ecx \n\
+ cpuid \n\
+ mov %%eax, %0 \n\
+ mov %%ebx, %1 \n\
+ mov %%ecx, %2 \n\
+ mov %%edx, %3 \n\
+ " : "=g" (a), "=g" (b), "=g" (c), "=g" (d)
+ : "g" (func), "g" (subfunc)
+ : "%eax", "%ebx", "%ecx", "%edx"
+ );
+
+ id->eax = a;
+ id->ebx = b;
+ id->ecx = c;
+ id->edx = d;
+}
+
+static __inline__ uint64_t get_cycles(void)
+{
+ uint64_t v;
+ uint32_t a, d;
+
+ asm volatile ("rdtsc" : "=a" (a), "=d"(d));
+ v = ((uint64_t) a) | (((uint64_t) d) << 32);
+
+ return v;
+}
+
+#ifndef LOCK_PREFIX
+#define LOCK_PREFIX "lock "
+#endif
+
+static __inline__ void ips_barrier()
+{
+ asm volatile ("" : : : "memory");
+}
+
+static __inline__ void ips_mb()
+{
+ asm volatile ("mfence" : : : "memory");
+}
+
+/* gcc-3.4 has a bug with this function body at -O0 */
+static
+#if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4
+#else
+__inline__
+#endif
+void ips_rmb()
+{
+ asm volatile ("" : : : "memory");
+}
+
+static __inline__ void ips_wmb()
+{
+ asm volatile ("sfence" : : : "memory");
+}
+
+static __inline__ void ips_sync_writes()
+{
+ asm volatile ("sfence" : : : "memory");
+}
+
+static __inline__ void ips_sync_reads()
+{
+ asm volatile ("lfence" : : : "memory");
+}
+
+static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
+ uint32_t old_val, uint32_t new_val)
+{
+ uint32_t prev;
+ struct xchg_dummy {
+ uint32_t a[100];
+ };
+
+ asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev)
+ : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val)
+ : "memory");
+
+ return prev;
+}
+
+typedef struct {
+ volatile int32_t counter;
+} ips_atomic_t;
+
+#define ips_atomic_set(v, i) (((v)->counter) = (i))
+#define ips_atomic_cmpxchg(p, oval, nval) \
+ ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval)
+
+#if 0
+static __inline__ int32_t
+ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value)
+{
+ asm volatile ("lock cmpxchg %2, %0" :
+ "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+ return old_value;
+}
+#endif
+
+#endif /* _HFI_i386_SYSDEP_H */
diff --git a/include/opa_byteorder.h b/include/opa_byteorder.h
new file mode 100644
index 0000000..3139593
--- /dev/null
+++ b/include/opa_byteorder.h
@@ -0,0 +1,264 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_BYTEORDER_H
+#define OPA_BYTEORDER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <endian.h>
+
+#ifndef __BYTE_ORDER
+# error "BYTE_ORDER undefined"
+#endif
+
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+
+static __inline__ __u16 __hfi_fswab16(__u16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __hfi_fswab32(__u32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __hfi_fswab64(__u64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __hfi_fswab16(__u16 x) {
+ return ((x & (__u16) 0x00ffU) << 8)
+ | ((x & (__u16) 0xff00U) >> 8);
+} static __inline__ __u32 __hfi_fswab32(__u32 x) {
+ return ((x & (__u32) 0x000000ffUL) << 24)
+ | ((x & (__u32) 0x0000ff00UL) << 8)
+ | ((x & (__u32) 0x00ff0000UL) >> 8)
+ | ((x & (__u32) 0xff000000UL) >> 24);
+}
+
+static __inline__ __u64 __hfi_fswab64(__u64 x) {
+ return ((x & (__u64) 0x00000000000000ffULL) << 56)
+ | ((x & (__u64) 0x000000000000ff00ULL) << 40)
+ | ((x & (__u64) 0x0000000000ff0000ULL) << 24)
+ | ((x & (__u64) 0x00000000ff000000ULL) << 8)
+ | ((x & (__u64) 0x000000ff00000000ULL) >> 8)
+ | ((x & (__u64) 0x0000ff0000000000ULL) >> 24)
+ | ((x & (__u64) 0x00ff000000000000ULL) >> 40)
+ | ((x & (__u64) 0xff00000000000000ULL) >> 56);
+}
+
+static __inline__ __u16 __cpu_to_le16(__le16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_le32(__le32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_le64(__le64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __le16_to_cpu(__le16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __le32_to_cpu(__le32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __le64_to_cpu(__le64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __cpu_to_be16(__be16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_be32(__be32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_be64(__be64)
+ __attribute__ ((always_inline));
+
+static __inline__ __u16 __be16_to_cpu(__be16)
+ __attribute__ ((always_inline));
+static __inline__ __u32 __be32_to_cpu(__be32)
+ __attribute__ ((always_inline));
+static __inline__ __u64 __be64_to_cpu(__be64)
+ __attribute__ ((always_inline));
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+ return x;
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+ return x;
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+ return x;
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+ return x;
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+ return x;
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+ return x;
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+ return __hfi_fswab64(x);
+}
+
+#elif __BYTE_ORDER == __BIG_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+ return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+ return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+ return __hfi_fswab64(x);
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+ return x;
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+ return x;
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+ return x;
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+ return x;
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+ return x;
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+ return x;
+}
+
+#else
+# error "unsupported BYTE_ORDER: " #BYTE_ORDER
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* OPA_BYTEORDER_H */
diff --git a/include/opa_common.h b/include/opa_common.h
new file mode 100644
index 0000000..1e89b69
--- /dev/null
+++ b/include/opa_common.h
@@ -0,0 +1,62 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* Umbrella header: pulls the kernel-exported hfi1 user-space ABI into
 * the OPA/PSM sources. */
#ifndef OPA_COMMON_H
#define OPA_COMMON_H

#include <rdma/hfi/hfi1_user.h>	/* hfi1 driver user API (kernel uapi header) */
/* NOTE(review): presumably provides fallback definitions for ABI items
 * removed from newer hfi1_user.h revisions — confirm against that header. */
#include "hfi1_deprecated.h"

#endif /* OPA_COMMON_H */
diff --git a/include/opa_debug.h b/include/opa_debug.h
new file mode 100644
index 0000000..d5d8ff2
--- /dev/null
+++ b/include/opa_debug.h
@@ -0,0 +1,108 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* Debug-tracing bitmask definitions for the hfi/OPA user-space code. */
#ifndef OPA_DEBUG_H
#define OPA_DEBUG_H

#ifndef _HFI_DEBUGGING	/* debugging enabled or not; default on */
#define _HFI_DEBUGGING 1
#endif

#if _HFI_DEBUGGING

/*
 * Mask values for debugging. The scheme allows us to compile out any
 * of the debug tracing stuff, and if compiled in, to enable or disable
 * dynamically. This can be set at modprobe time also:
 * modprobe hfi.ko hfi_debug=7
 */

#define __HFI_INFO 0x1		/* generic low verbosity stuff */
#define __HFI_DBG 0x2		/* generic debug */
#define __HFI_TRSAMPLE 0x8	/* generate trace buffer sample entries */
/* leave some low verbosity spots open */
#define __HFI_VERBDBG 0x40	/* very verbose debug */
#define __HFI_PKTDBG 0x80	/* print packet data */
/* print process startup (init)/exit messages and important env vars */
#define __HFI_PROCDBG 0x100
/* print mmap/nopage stuff, not using VDBG any more */
#define __HFI_MMDBG 0x200
/* low-level environment variables */
#define __HFI_ENVDBG 0x400
#define __HFI_EPKTDBG 0x800	/* print error packet data */
#define __HFI_CCADBG 0x1000	/* print CCA related events */
#else /* _HFI_DEBUGGING */

/*
 * define all of these even with debugging off, for the few places that do
 * if(hfi_debug & _HFI_xyzzy), but in a way that will make the
 * compiler eliminate the code
 */

#define __HFI_INFO 0x0		/* generic low verbosity stuff */
#define __HFI_DBG 0x0		/* generic debug */
#define __HFI_TRSAMPLE 0x0	/* generate trace buffer sample entries */
#define __HFI_VERBDBG 0x0	/* very verbose debug */
#define __HFI_PKTDBG 0x0	/* print packet data */
#define __HFI_PROCDBG 0x0	/* print process startup (init)/exit messages */
/* print mmap/nopage stuff, not using VDBG any more */
#define __HFI_MMDBG 0x0
/* FIX: these two were missing from the debugging-off branch, so any
 * "hfi_debug & __HFI_ENVDBG/__HFI_EPKTDBG" test failed to compile when
 * built with _HFI_DEBUGGING 0. Define them as 0 like the others. */
#define __HFI_ENVDBG 0x0	/* low-level environment variables */
#define __HFI_EPKTDBG 0x0	/* print error packet data */
#define __HFI_CCADBG 0x0	/* print CCA related events */

#endif /* _HFI_DEBUGGING */

/* historical alias */
#define __HFI_VERBOSEDBG __HFI_VERBDBG

#endif /* OPA_DEBUG_H */
diff --git a/include/opa_intf.h b/include/opa_intf.h
new file mode 100644
index 0000000..e187d7d
--- /dev/null
+++ b/include/opa_intf.h
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
/* User-space compiler/platform glue: forced inlining, stub "atomics",
 * and branch-prediction hints used throughout the OPA code. */
#ifndef OPA_INTF_H
#define OPA_INTF_H

#include <sys/uio.h>
#include <sys/types.h>
#include <stdint.h>

/* Redefine __inline__ to force inlining (and silence unused-function
 * warnings); done BEFORE including sysdep.h/bit_ops.h below so any
 * __inline__ functions they define pick up the attribute. */
#ifdef __inline__
#undef __inline__
#endif
#define __inline__ inline __attribute__((always_inline, unused))

#include "sysdep.h"
#include "bit_ops.h"

/* these aren't implemented for user mode, which is OK until we multi-thread */
/* NOTE: plain (non-atomic) read-modify-write — NOT thread-safe. */
typedef struct _atomic {
	uint32_t counter;
} atomic_t;	/* no atomic_t type in user-land */
#define atomic_set(a, v) ((a)->counter = (v))
#define atomic_inc_return(a) (++(a)->counter)

#if defined(__GNUC__)
/* Branch-prediction hints: likely()/unlikely() wrap __builtin_expect;
 * if_pt/if_pf spell "if, predicted taken / predicted false". */
#define likely(x) __builtin_expect(!!(x), 1L)
#define unlikely(x) __builtin_expect(!!(x), 0L)
#define if_pt(cond) if (likely(cond))
#define if_pf(cond) if (unlikely(cond))
#define _Pragma_unlikely
#define _Pragma_likely
#else
#error "Unsupported compiler"
#endif

/* NOTE(review): relies on <sched.h> being in scope at the call site —
 * it is not included here; confirm callers provide it. */
#define yield() sched_yield()
#endif /* OPA_INTF_H */
diff --git a/include/opa_queue.h b/include/opa_queue.h
new file mode 100644
index 0000000..f3d9595
--- /dev/null
+++ b/include/opa_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $
+ */
+
+#ifndef OPA_QUEUE_H_
+#define OPA_QUEUE_H_
+
+/*
+ * This file defines five types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists, tail queues, and circular queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * A circle queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or after
+ * an existing element, at the head of the list, or at the end of the list.
+ * A circle queue may be traversed in either direction, but has a more
+ * complex end of list detection.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ CIRCLEQ
+ * _HEAD + + + + +
+ * _HEAD_INITIALIZER + + + + +
+ * _ENTRY + + + + +
+ * _INIT + + + + +
+ * _EMPTY + + + + +
+ * _FIRST + + + + +
+ * _NEXT + + + + +
+ * _PREV - - - + +
+ * _LAST - - + + +
+ * _FOREACH + + + + +
+ * _FOREACH_REVERSE - - - + +
+ * _INSERT_HEAD + + + + +
+ * _INSERT_BEFORE - + - + +
+ * _INSERT_AFTER + + + + +
+ * _INSERT_TAIL - - + + +
+ * _REMOVE_HEAD + - + - -
+ * _REMOVE + + + + +
+ *
+ */
+
/*
 * Singly-linked List declarations.
 * One forward pointer per element; O(1) insert at head/after, O(n)
 * arbitrary removal; forward traversal only.
 */
#define SLIST_HEAD(name, type)						\
struct name {								\
	struct type *slh_first;	/* first element */			\
}

#define SLIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#define SLIST_ENTRY(type)						\
struct {								\
	struct type *sle_next;	/* next element */			\
}

/*
 * Singly-linked List functions.
 */
#define SLIST_EMPTY(head)	((head)->slh_first == NULL)

#define SLIST_FIRST(head)	((head)->slh_first)

/* NOTE: var's next pointer is read AFTER the loop body runs, so the
 * current element must not be freed/unlinked inside the loop. */
#define SLIST_FOREACH(var, head, field)					\
	for ((var) = SLIST_FIRST((head));				\
	    (var);							\
	    (var) = SLIST_NEXT((var), field))

#define SLIST_INIT(head) do {						\
	SLIST_FIRST((head)) = NULL;					\
} while (0)

#define SLIST_INSERT_AFTER(slistelm, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field);	\
	SLIST_NEXT((slistelm), field) = (elm);				\
} while (0)

#define SLIST_INSERT_HEAD(head, elm, field) do {			\
	SLIST_NEXT((elm), field) = SLIST_FIRST((head));			\
	SLIST_FIRST((head)) = (elm);					\
} while (0)

#define SLIST_NEXT(elm, field)	((elm)->field.sle_next)

/* O(n): walks from the head to find the predecessor of elm. */
#define SLIST_REMOVE(head, elm, type, field) do {			\
	if (SLIST_FIRST((head)) == (elm)) {				\
		SLIST_REMOVE_HEAD((head), field);			\
	}								\
	else {								\
		struct type *curelm = SLIST_FIRST((head));		\
		while (SLIST_NEXT(curelm, field) != (elm))		\
			curelm = SLIST_NEXT(curelm, field);		\
		SLIST_NEXT(curelm, field) =				\
		    SLIST_NEXT(SLIST_NEXT(curelm, field), field);	\
	}								\
} while (0)

#define SLIST_REMOVE_HEAD(head, field) do {				\
	SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field);	\
} while (0)
+
/*
 * Singly-linked Tail queue declarations.
 * stqh_last always holds the ADDRESS of the last element's stqe_next
 * pointer (or of stqh_first when the queue is empty), which makes
 * O(1) insertion at the tail possible with singly-linked elements.
 */
#define STAILQ_HEAD(name, type)						\
struct name {								\
	struct type *stqh_first;/* first element */			\
	struct type **stqh_last;/* addr of last next element */		\
}

#define STAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).stqh_first }

#define STAILQ_ENTRY(type)						\
struct {								\
	struct type *stqe_next;	/* next element */			\
}

/*
 * Singly-linked Tail queue functions.
 */
#define STAILQ_EMPTY(head)	((head)->stqh_first == NULL)

#define STAILQ_FIRST(head)	((head)->stqh_first)

#define STAILQ_FOREACH(var, head, field)				\
	for ((var) = STAILQ_FIRST((head));				\
	    (var);							\
	    (var) = STAILQ_NEXT((var), field))

#define STAILQ_INIT(head) do {						\
	STAILQ_FIRST((head)) = NULL;					\
	(head)->stqh_last = &STAILQ_FIRST((head));			\
} while (0)

/* Every insert/remove must keep stqh_last pointing at the tail's
 * next-pointer, hence the == NULL checks below. */
#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do {		\
	if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_NEXT((tqelm), field) = (elm);				\
} while (0)

#define STAILQ_INSERT_HEAD(head, elm, field) do {			\
	if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL)	\
		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
	STAILQ_FIRST((head)) = (elm);					\
} while (0)

#define STAILQ_INSERT_TAIL(head, elm, field) do {			\
	STAILQ_NEXT((elm), field) = NULL;				\
	*(head)->stqh_last = (elm);					\
	(head)->stqh_last = &STAILQ_NEXT((elm), field);			\
} while (0)

/* Recovers the last element from stqh_last by subtracting the entry
 * field's offset; NULL when the queue is empty. */
#define STAILQ_LAST(head, type, field)					\
	(STAILQ_EMPTY(head) ?						\
		NULL :							\
		((struct type *)					\
		((char *)((head)->stqh_last) - offsetof(struct type, field))))

#define STAILQ_NEXT(elm, field)	((elm)->field.stqe_next)

/* O(n): walks from the head to find the predecessor of elm. */
#define STAILQ_REMOVE(head, elm, type, field) do {			\
	if (STAILQ_FIRST((head)) == (elm)) {				\
		STAILQ_REMOVE_HEAD(head, field);			\
	}								\
	else {								\
		struct type *curelm = STAILQ_FIRST((head));		\
		while (STAILQ_NEXT(curelm, field) != (elm))		\
			curelm = STAILQ_NEXT(curelm, field);		\
		if ((STAILQ_NEXT(curelm, field) =			\
		     STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
			(head)->stqh_last = &STAILQ_NEXT((curelm), field);\
	}								\
} while (0)

#define STAILQ_REMOVE_HEAD(head, field) do {				\
	if ((STAILQ_FIRST((head)) =					\
	     STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL)		\
		(head)->stqh_last = &STAILQ_FIRST((head));		\
} while (0)

/* Drops elements from the head up to AND INCLUDING elm. */
#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do {			\
	if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL)	\
		(head)->stqh_last = &STAILQ_FIRST((head));		\
} while (0)
+
/*
 * List declarations.
 * Doubly linked via le_next plus le_prev, where le_prev stores the
 * ADDRESS of the previous element's next pointer — this allows O(1)
 * removal of an arbitrary element without a back pointer to the node.
 */
#define LIST_HEAD(name, type)						\
struct name {								\
	struct type *lh_first;	/* first element */			\
}

#define LIST_HEAD_INITIALIZER(head)					\
	{ NULL }

#define LIST_ENTRY(type)						\
struct {								\
	struct type *le_next;	/* next element */			\
	struct type **le_prev;	/* address of previous next element */	\
}

/*
 * List functions.
 */

#define LIST_EMPTY(head)	((head)->lh_first == NULL)

#define LIST_FIRST(head)	((head)->lh_first)

#define LIST_FOREACH(var, head, field)					\
	for ((var) = LIST_FIRST((head));				\
	    (var);							\
	    (var) = LIST_NEXT((var), field))

#define LIST_INIT(head) do {						\
	LIST_FIRST((head)) = NULL;					\
} while (0)

#define LIST_INSERT_AFTER(listelm, elm, field) do {			\
	if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
		LIST_NEXT((listelm), field)->field.le_prev =		\
		    &LIST_NEXT((elm), field);				\
	LIST_NEXT((listelm), field) = (elm);				\
	(elm)->field.le_prev = &LIST_NEXT((listelm), field);		\
} while (0)

#define LIST_INSERT_BEFORE(listelm, elm, field) do {			\
	(elm)->field.le_prev = (listelm)->field.le_prev;		\
	LIST_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.le_prev = (elm);				\
	(listelm)->field.le_prev = &LIST_NEXT((elm), field);		\
} while (0)

#define LIST_INSERT_HEAD(head, elm, field) do {				\
	if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)	\
		LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
	LIST_FIRST((head)) = (elm);					\
	(elm)->field.le_prev = &LIST_FIRST((head));			\
} while (0)

#define LIST_NEXT(elm, field)	((elm)->field.le_next)

/* O(1) unlink; no head argument needed thanks to le_prev. */
#define LIST_REMOVE(elm, field) do {					\
	if (LIST_NEXT((elm), field) != NULL)				\
		LIST_NEXT((elm), field)->field.le_prev =		\
		    (elm)->field.le_prev;				\
	*(elm)->field.le_prev = LIST_NEXT((elm), field);		\
} while (0)
+
/*
 * Tail queue declarations.
 * Doubly linked with an indirect prev pointer (tqe_prev holds the
 * ADDRESS of the previous next pointer); tqh_last tracks the tail so
 * insertion at either end and arbitrary removal are all O(1).
 */
#define TAILQ_HEAD(name, type)						\
struct name {								\
	struct type *tqh_first;	/* first element */			\
	struct type **tqh_last;	/* addr of last next element */		\
}

#define TAILQ_HEAD_INITIALIZER(head)					\
	{ NULL, &(head).tqh_first }

#define TAILQ_ENTRY(type)						\
struct {								\
	struct type *tqe_next;	/* next element */			\
	struct type **tqe_prev;	/* address of previous next element */	\
}

/*
 * Tail queue functions.
 */
#define TAILQ_EMPTY(head)	((head)->tqh_first == NULL)

#define TAILQ_FIRST(head)	((head)->tqh_first)

#define TAILQ_FOREACH(var, head, field)					\
	for ((var) = TAILQ_FIRST((head));				\
	    (var);							\
	    (var) = TAILQ_NEXT((var), field))

#define TAILQ_FOREACH_REVERSE(var, head, headname, field)		\
	for ((var) = TAILQ_LAST((head), headname);			\
	    (var);							\
	    (var) = TAILQ_PREV((var), headname, field))

#define TAILQ_INIT(head) do {						\
	TAILQ_FIRST((head)) = NULL;					\
	(head)->tqh_last = &TAILQ_FIRST((head));			\
} while (0)

#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
		    &TAILQ_NEXT((elm), field);				\
	else								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
	TAILQ_NEXT((listelm), field) = (elm);				\
	(elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);		\
} while (0)

#define TAILQ_INSERT_BEFORE(listelm, elm, field) do {			\
	(elm)->field.tqe_prev = (listelm)->field.tqe_prev;		\
	TAILQ_NEXT((elm), field) = (listelm);				\
	*(listelm)->field.tqe_prev = (elm);				\
	(listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);		\
} while (0)

#define TAILQ_INSERT_HEAD(head, elm, field) do {			\
	if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)	\
		TAILQ_FIRST((head))->field.tqe_prev =			\
		    &TAILQ_NEXT((elm), field);				\
	else								\
		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
	TAILQ_FIRST((head)) = (elm);					\
	(elm)->field.tqe_prev = &TAILQ_FIRST((head));			\
} while (0)

#define TAILQ_INSERT_TAIL(head, elm, field) do {			\
	TAILQ_NEXT((elm), field) = NULL;				\
	(elm)->field.tqe_prev = (head)->tqh_last;			\
	*(head)->tqh_last = (elm);					\
	(head)->tqh_last = &TAILQ_NEXT((elm), field);			\
} while (0)

/* tqh_last points at the last element's tqe_next; reinterpreting the
 * containing struct as a head lets tqh_last double as a back pointer.
 * (The same trick drives TAILQ_PREV below.) */
#define TAILQ_LAST(head, headname)					\
	(*(((struct headname *)((head)->tqh_last))->tqh_last))

#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)

#define TAILQ_PREV(elm, headname, field)				\
	(*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))

/* O(1) unlink; keeps tqh_last valid when the tail is removed. */
#define TAILQ_REMOVE(head, elm, field) do {				\
	if ((TAILQ_NEXT((elm), field)) != NULL)				\
		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
		    (elm)->field.tqe_prev;				\
	else								\
		(head)->tqh_last = (elm)->field.tqe_prev;		\
	*(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);		\
} while (0)
+
/*
 * Circular queue declarations.
 * Doubly linked ring: the head structure itself acts as the sentinel,
 * so "end of list" is detected by comparing against (void *)(head).
 */
#define CIRCLEQ_HEAD(name, type)					\
struct name {								\
	struct type *cqh_first;		/* first element */		\
	struct type *cqh_last;		/* last element */		\
}

#define CIRCLEQ_HEAD_INITIALIZER(head)					\
	{ (void *)&(head), (void *)&(head) }

#define CIRCLEQ_ENTRY(type)						\
struct {								\
	struct type *cqe_next;		/* next element */		\
	struct type *cqe_prev;		/* previous element */		\
}

/*
 * Circular queue functions.
 */
#define CIRCLEQ_EMPTY(head)	((head)->cqh_first == (void *)(head))

#define CIRCLEQ_FIRST(head)	((head)->cqh_first)

/* The "|| ((var) = NULL)" clause fires when the walk wraps back to the
 * head sentinel: it stops the loop AND leaves var == NULL afterwards. */
#define CIRCLEQ_FOREACH(var, head, field)				\
	for ((var) = CIRCLEQ_FIRST((head));				\
	    (var) != (void *)(head) || ((var) = NULL);			\
	    (var) = CIRCLEQ_NEXT((var), field))

#define CIRCLEQ_FOREACH_REVERSE(var, head, field)			\
	for ((var) = CIRCLEQ_LAST((head));				\
	    (var) != (void *)(head) || ((var) = NULL);			\
	    (var) = CIRCLEQ_PREV((var), field))

#define CIRCLEQ_INIT(head) do {						\
	CIRCLEQ_FIRST((head)) = (void *)(head);				\
	CIRCLEQ_LAST((head)) = (void *)(head);				\
} while (0)

#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field);	\
	CIRCLEQ_PREV((elm), field) = (listelm);				\
	if (CIRCLEQ_NEXT((listelm), field) == (void *)(head))		\
		CIRCLEQ_LAST((head)) = (elm);				\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\
	CIRCLEQ_NEXT((listelm), field) = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {		\
	CIRCLEQ_NEXT((elm), field) = (listelm);				\
	CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field);	\
	if (CIRCLEQ_PREV((listelm), field) == (void *)(head))		\
		CIRCLEQ_FIRST((head)) = (elm);				\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\
	CIRCLEQ_PREV((listelm), field) = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_HEAD(head, elm, field) do {			\
	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head));		\
	CIRCLEQ_PREV((elm), field) = (void *)(head);			\
	if (CIRCLEQ_LAST((head)) == (void *)(head))			\
		CIRCLEQ_LAST((head)) = (elm);				\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm);	\
	CIRCLEQ_FIRST((head)) = (elm);					\
} while (0)

#define CIRCLEQ_INSERT_TAIL(head, elm, field) do {			\
	CIRCLEQ_NEXT((elm), field) = (void *)(head);			\
	CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head));		\
	if (CIRCLEQ_FIRST((head)) == (void *)(head))			\
		CIRCLEQ_FIRST((head)) = (elm);				\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm);	\
	CIRCLEQ_LAST((head)) = (elm);					\
} while (0)

#define CIRCLEQ_LAST(head)	((head)->cqh_last)

#define CIRCLEQ_NEXT(elm, field)	((elm)->field.cqe_next)

#define CIRCLEQ_PREV(elm, field)	((elm)->field.cqe_prev)

/* O(1) unlink; each branch handles the neighbor being the sentinel. */
#define CIRCLEQ_REMOVE(head, elm, field) do {				\
	if (CIRCLEQ_NEXT((elm), field) == (void *)(head))		\
		CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field);	\
	else								\
		CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) =	\
		    CIRCLEQ_PREV((elm), field);				\
	if (CIRCLEQ_PREV((elm), field) == (void *)(head))		\
		CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field);	\
	else								\
		CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) =	\
		    CIRCLEQ_NEXT((elm), field);				\
} while (0)
+
+#endif /* !OPA_QUEUE_H_ */
diff --git a/include/opa_revision.h b/include/opa_revision.h
new file mode 100644
index 0000000..4a28821
--- /dev/null
+++ b/include/opa_revision.h
@@ -0,0 +1,64 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
#ifndef OPA_REVISION_H
#define OPA_REVISION_H

/* Build/version identification strings.
 * Those variables are defined in the _revision.c file, which is
 * dynamically generated during building of the library; only the
 * declarations live here. All are NUL-terminated C strings. */
extern char psmi_hfi_IFS_version[];
extern char psmi_hfi_build_timestamp[];
extern char psmi_hfi_sources_checksum[];
extern char psmi_hfi_git_checksum[];

#endif /* OPA_REVISION_H */
diff --git a/include/opa_service.h b/include/opa_service.h
new file mode 100644
index 0000000..16cf0fd
--- /dev/null
+++ b/include/opa_service.h
@@ -0,0 +1,268 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_SERVICE_H
+#define OPA_SERVICE_H
+
+/* This file contains all the lowest level routines calling into sysfs */
+/* and qib driver. All other calls are based on these routines. */
+
+#include <libgen.h>
+
+#include "opa_intf.h"
+#include "opa_common.h"
+#include "opa_udebug.h"
+#include "opa_byteorder.h"
+
+/* upper and lower bounds for HFI port numbers */
+#define HFI_MIN_PORT 1
+#define HFI_MAX_PORT 1
+#define HFI_NUM_PORTS (HFI_MAX_PORT - HFI_MIN_PORT + 1)
+/* any unit id to match. */
+#define HFI_UNIT_ID_ANY ((long)-1)
+/* any port num to match. */
+#define HFI_PORT_NUM_ANY ((long)0)
+
+/* base name of path (without unit #) for qib driver */
+#define HFI_DEVICE_PATH "/dev/hfi1"
+#define HFI_CLASS_PATH "/sys/class/infiniband/hfi1"
+
+/* Commands used to communicate with driver. */
+enum PSMI_HFI_CMD {
+ PSMI_HFI_CMD_ASSIGN_CTXT = 0, /* allocate HFI and context */
+ PSMI_HFI_CMD_CTXT_INFO, /* find out what resources we got */
+ PSMI_HFI_CMD_USER_INFO, /* set up userspace */
+ PSMI_HFI_CMD_TID_UPDATE, /* update expected TID entries */
+ PSMI_HFI_CMD_TID_FREE, /* free expected TID entries */
+ PSMI_HFI_CMD_CREDIT_UPD, /* force an update of PIO credit */
+ PSMI_HFI_CMD_RECV_CTRL, /* control receipt of packets */
+ PSMI_HFI_CMD_POLL_TYPE, /* set the kind of polling we want */
+ PSMI_HFI_CMD_ACK_EVENT, /* ack & clear user status bits */
+ PSMI_HFI_CMD_SET_PKEY, /* set context's pkey */
+ PSMI_HFI_CMD_CTXT_RESET, /* reset context's HW send context */
+ PSMI_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */
+ PSMI_HFI_CMD_GET_VERS, /* get the version of the user cdev */
+
+#ifdef PSM_CUDA
+ PSMI_HFI_CMD_TID_UPDATE_V2 = 28,
+#endif
+ PSMI_HFI_CMD_LAST,
+};
+
+/* Legacy commands used to communicate with driver using 'write' */
+enum LEGACY_HFI1_CMD {
+ LEGACY_HFI1_CMD_ASSIGN_CTXT = 1, /* allocate HFI and context */
+ LEGACY_HFI1_CMD_CTXT_INFO = 2, /* find out what resources we got */
+ LEGACY_HFI1_CMD_USER_INFO = 3, /* set up userspace */
+ LEGACY_HFI1_CMD_TID_UPDATE = 4, /* update expected TID entries */
+ LEGACY_HFI1_CMD_TID_FREE = 5, /* free expected TID entries */
+ LEGACY_HFI1_CMD_CREDIT_UPD = 6, /* force an update of PIO credit */
+
+ LEGACY_HFI1_CMD_RECV_CTRL = 8, /* control receipt of packets */
+ LEGACY_HFI1_CMD_POLL_TYPE = 9, /* set the kind of polling we want */
+ LEGACY_HFI1_CMD_ACK_EVENT = 10, /* ack & clear user status bits */
+ LEGACY_HFI1_CMD_SET_PKEY = 11, /* set context's pkey */
+ LEGACY_HFI1_CMD_CTXT_RESET = 12, /* reset context's HW send context */
+ LEGACY_HFI1_CMD_TID_INVAL_READ = 13, /* read TID cache invalidations */
+ LEGACY_HFI1_CMD_GET_VERS = 14 /* get the version of the user cdev */
+};
+
+/* Given a unit number and port number, returns 1 if the unit and port are active,
+ 0 if the unit and port are not active, and -1 if an error occurs. */
+int hfi_get_port_active(int, int);
+
+/* Given the unit number and port, return an error, or the corresponding LID */
+/* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port
+ are not active. 0 indicates that the unit is valid, but no LID has been assigned. */
+int hfi_get_port_lid(int, int);
+
+/* Given the unit number and port, return an error, or the corresponding GID */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_gid(int, int, uint64_t *hi, uint64_t *lo);
+
+/* Given the unit number, return an error, or the corresponding LMC value
+ for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_lmc(int unit, int port);
+
+/* Given the unit number, return an error, or the corresponding link rate
+ for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port);
+
+/* Given a unit, port and SL, return an error, or the corresponding SC for the
+ SL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sl2sc(int unit, int port, int sl);
+
+/* Given a unit, port and SC, return an error, or the corresponding VL for the
+ SC as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sc2vl(int unit, int port, int sc);
+
+/* Given a unit, port and VL, return an error, or the corresponding MTU for the
+ VL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_vl2mtu(int unit, int port, int vl);
+
+/* Given a unit, port and index, return an error, or the corresponding pkey for
+ the index as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_index2pkey(int unit, int port, int index);
+
+/* Get the number of units supported by the driver. Does not guarantee
+ that a working chip has been found for each possible unit #. */
+/* Returns -1 with errno set, or number of units >=0 (0 means none found). */
+int hfi_get_num_units(void);
+
+/* Given a unit number, returns 1 if any port on the unit is active,
+ 0 if no port on the unit is active,
+ and -1 if an error occurs. */
+int hfi_get_unit_active(int unit);
+
+/* get the number of contexts from the unit id. */
+/* Returns 0 if no unit or no match. */
+int hfi_get_num_contexts(int unit);
+
+/* Open hfi device file, return -1 on error. */
+int hfi_context_open(int unit, int port, uint64_t open_timeout);
+int hfi_context_open_ex(int unit, int port, uint64_t open_timeout,
+ char *dev_name,size_t dev_name_len);
+void hfi_context_close(int fd);
+
+/* hfi_get_user_major_version() returns the major version of the driver
+ that should be used for this session of psm. Valid only after
+ hfi_context_open has been called. */
+uint16_t hfi_get_user_major_version(void);
+
+/* hfi_get_user_minor_version() returns the minor version of the driver */
+uint16_t hfi_get_user_minor_version(void);
+
+void hfi_set_user_version(uint32_t version);
+void hfi_set_user_major_version(uint16_t major_version);
+
+int hfi_cmd_write(int fd, struct hfi1_cmd *, size_t count);
+int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt);
+
+int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf);
+int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp);
+
+/* We use mmap64() because we compile in both 32 and 64 bit mode,
+ and we have to map physical addresses that are > 32 bits long.
+ While linux implements mmap64, it doesn't have a man page,
+ and isn't declared in any header file, so we declare it here ourselves. */
+
+/* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+ redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+ it doesn't work when the address being mapped is > 32 bits. It chops
+ off bits 32 and above. So we stay with mmap64. */
+extern void *mmap64(void *, size_t, int, int, int, __off64_t);
+void *hfi_mmap64(void *, size_t, int, int, int, __off64_t);
+
+/* Statistics maintained by the driver */
+int hfi_get_stats(uint64_t *, int);
+int hfi_get_stats_names(char **namep);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit(int unitno, uint64_t *, int);
+int hfi_get_ctrs_unit_names(int unitno, char **namep);
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int hfi_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* Calls stat() for the given attribute; the return value is passed
+ through unchanged from stat(), and sbuf is populated by stat() too. */
+int hfi_sysfs_stat(const char *attr,struct stat *sbuf);
+
+/* read a signed 64-bit quantity, in some arbitrary base */
+int hfi_sysfs_read_s64(const char *attr, int64_t *valp, int base);
+
+/* read a string value */
+int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+ char **datap);
+
+/* open attribute in unit's sysfs directory via open(2) */
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags);
+/* print to attribute in {unit,port} sysfs directory */
+int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
+ const char *fmt, ...)
+ __attribute__((format(printf, 4, 5)));
+int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+
+int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data,
+ size_t len);
+/* read up to one page of malloc'ed data (caller must free), returning
+ number of bytes read or -1 */
+int hfi_hfifs_read(const char *attr, char **datap);
+int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data);
+/* read a signed 64-bit quantity, in some arbitrary base */
+int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+ int64_t *valp, int base);
+int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+ int64_t *valp, int base);
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit);
+/* these read directly into supplied buffer and take a count */
+int hfi_hfifs_rd(const char *, void *, int);
+int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int);
+
+int hfi_hfifs_open(const char *relname, int flags);
+
+/* wait for device special file to show up. timeout is in
+ * milliseconds, 0 is "callee knows best", < 0 is infinite. */
+int hfi_wait_for_device(const char *path, long timeout);
+
+int hfi_cmd_wait_for_packet(int fd);
+
+#endif /* OPA_SERVICE_H */
diff --git a/include/opa_udebug.h b/include/opa_udebug.h
new file mode 100644
index 0000000..9fd59cb
--- /dev/null
+++ b/include/opa_udebug.h
@@ -0,0 +1,194 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_UDEBUG_H
+#define OPA_UDEBUG_H
+
+#include <stdio.h>
+#include "opa_debug.h"
+
+extern unsigned hfi_debug;
+const char *hfi_get_unit_name(int unit);
+extern char *__progname;
+
+static const char hfi_ident_tag[] = "PSM2_IDENTIFY";
+char *hfi_get_mylabel();
+
+#if _HFI_DEBUGGING
+
+extern char *__hfi_mylabel;
+void hfi_set_mylabel(char *);
+extern FILE *__hfi_dbgout;
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_INFO(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&__HFI_INFO)) \
+ printf("%s%s: " fmt, __hfi_mylabel, __func__, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG)
+
+#define __HFI_DBG_WHICH(which, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&(which))) \
+ fprintf(__hfi_dbgout, "%s%s: " fmt, __hfi_mylabel, __func__, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ if (unlikely(hfi_debug&(which))) \
+ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__)
+#define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_EPDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__)
+#define _HFI_ENVDBG(lev, fmt, ...) \
+ __HFI_DBG_WHICH_NOFUNC( \
+ (lev == 0) ? __HFI_INFO : \
+ (lev > 1 ? __HFI_ENVDBG : (__HFI_PROCDBG|__HFI_ENVDBG)),\
+ "env " fmt, ##__VA_ARGS__)
+#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__)
+#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__)
+
+/*
+ * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together
+ * for a scope of code preparing debug info for printing; e.g.
+ * if (_HFI_DBG_ON) {
+ * // put your code here
+ * _HFI_DBG_ALWAYS(print your results here);
+ * }
+ */
+#define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG)
+#define _HFI_DBG_ALWAYS(fmt, ...) \
+ do { \
+ _Pragma_unlikely \
+ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG)
+#define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG)
+#define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG)
+#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO)
+#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#else /* ! _HFI_DEBUGGING */
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+ do { \
+ printf("%s" fmt, "", ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+ do { \
+ printf("%s" fmt, "", ##__VA_ARGS__); \
+ } while (0)
+
+#define _HFI_INFO(fmt, ...)
+
+#define __HFI_PKTDBG_ON 0
+
+#define _HFI_DBG(fmt, ...)
+#define _HFI_PDBG(fmt, ...)
+#define _HFI_EPDBG(fmt, ...)
+#define _HFI_PRDBG(fmt, ...)
+#define _HFI_ENVDBG(lev, fmt, ...)
+#define _HFI_VDBG(fmt, ...)
+#define _HFI_MMDBG(fmt, ...)
+#define _HFI_CCADBG(fmt, ...)
+
+#define _HFI_DBG_ON 0
+#define _HFI_DBG_ALWAYS(fmt, ...)
+#define _HFI_VDBG_ON 0
+#define _HFI_VDBG_ALWAYS(fmt, ...)
+#define _HFI_PRDBG_ON 0
+#define _HFI_PRDBG_ALWAYS(fmt, ...)
+#define _HFI_CCADBG_ON 0
+#define _HFI_CCADBG_ALWAYS(fmt, ...)
+#define _HFI_INFO_ON 0
+#define _HFI_INFO_ALWAYS(fmt, ...)
+
+#endif /* _HFI_DEBUGGING */
+
+#endif /* OPA_UDEBUG_H */
diff --git a/include/opa_user.h b/include/opa_user.h
new file mode 100644
index 0000000..274d674
--- /dev/null
+++ b/include/opa_user.h
@@ -0,0 +1,973 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_USER_H
+#define OPA_USER_H
+
+/* This file contains all of the data structures and routines that are
+ publicly visible and usable (to low level infrastructure code; it is
+ not expected that any application, or even normal application-level library,
+ will ever need to use any of this).
+
+ Additional entry points and data structures that are used by these routines
+ may be referenced in this file, but they should not be generally available;
+ they are visible here only to allow use in inlined functions. Any variable,
+ data structure, or function that starts with a leading "_" is in this
+ category.
+*/
+
+/* Include header files we need that are unlikely to otherwise be needed by */
+/* programs. */
+#include <stddef.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <syslog.h>
+#include "opa_intf.h"
+#include "opa_common.h"
+#include "opa_byteorder.h"
+#include "opa_udebug.h"
+#include "opa_service.h"
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * hfi_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define HFI_RHF_LENGTH_MASK 0xFFF
+#define HFI_RHF_LENGTH_SHIFT 0
+#define HFI_RHF_RCVTYPE_MASK 0x7
+#define HFI_RHF_RCVTYPE_SHIFT 12
+#define HFI_RHF_USE_EGRBFR_MASK 0x1
+#define HFI_RHF_USE_EGRBFR_SHIFT 15
+#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF
+#define HFI_RHF_EGRBFR_INDEX_SHIFT 16
+#define HFI_RHF_SEQ_MASK 0xF
+#define HFI_RHF_SEQ_SHIFT 28
+
+#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF
+#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0
+#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF
+#define HFI_RHF_HDRQ_OFFSET_SHIFT 12
+
+#define HFI_RHF_ICRCERR 0x80000000
+#define HFI_RHF_ECCERR 0x20000000
+#define HFI_RHF_LENERR 0x10000000
+#define HFI_RHF_TIDERR 0x08000000
+
+#define HFI_RHF_TFGENERR 0x04000000
+#define HFI_RHF_TFSEQERR 0x02000000
+#define HFI_RHF_RCVTYPEERR 0x07000000
+
+#define HFI_RHF_DCERR 0x00800000
+#define HFI_RHF_DCUNCERR 0x00400000
+#define HFI_RHF_KHDRLENERR 0x00200000
+/* Change from 0xFFE00000 to 0xFDE00000, so that we don't commit to the
+ * error path on a SeqErr too soon - with RSM, the HFI may report a
+ * false SeqErr condition */
+#define HFI_RHF_ERR_MASK 0xFDE00000
+
+/* TidFlow related bits */
+#define HFI_TF_SEQNUM_SHIFT 0
+#define HFI_TF_SEQNUM_MASK 0x7ff
+#define HFI_TF_GENVAL_SHIFT 11
+#define HFI_TF_GENVAL_MASK 0xfffff
+
+#define HFI_TF_FLOWVALID_SHIFT 32
+#define HFI_TF_FLOWVALID_MASK 0x1
+#define HFI_TF_HDRSUPP_ENABLED_SHIFT 33
+#define HFI_TF_HDRSUPP_ENABLED_MASK 0x1
+
+#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34
+#define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1
+#define HFI_TF_KEEP_ON_GENERR_SHIFT 35
+#define HFI_TF_KEEP_ON_GENERR_MASK 0x1
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1
+#define HFI_TF_STATUS_SHIFT 37
+#define HFI_TF_STATUS_MASK 0x3
+#define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37
+#define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1
+#define HFI_TF_STATUS_GENMISMATCH_SHIFT 38
+#define HFI_TF_STATUS_GENMISMATCH_MASK 0x1
+
+#define HFI_TF_INVALID (~0U)
+#define HFI_TF_INVALID_GENERATION (~0U)
+#define HFI_TF_NFLOWS 32
+
+/* PBC bits */
+#define HFI_PBC_STATICRCC_SHIFT 0
+#define HFI_PBC_STATICRCC_MASK 0xffff
+
+#define HFI_PBC_SC4_SHIFT 4
+#define HFI_PBC_SC4_MASK 0x1
+
+#define HFI_PBC_INTR_SHIFT 31
+#define HFI_PBC_DCINFO_SHIFT 30
+#define HFI_PBC_TESTEBP_SHIFT 29
+#define HFI_PBC_PACKETBYPASS_SHIFT 28
+#define HFI_PBC_INSERTHCRC_SHIFT 26
+#define HFI_PBC_INSERTHCRC_MASK 0x3
+#define HFI_PBC_CREDITRETURN_SHIFT 25
+#define HFI_PBC_INSERTBYPASSICRC_SHIFT 24
+#define HFI_PBC_TESTBADICRC_SHIFT 23
+#define HFI_PBC_FECN_SHIFT 22
+#define HFI_PBC_VL_SHIFT 12
+#define HFI_PBC_VL_MASK 0xf
+#define HFI_PBC_LENGTHDWS_SHIFT 0
+#define HFI_PBC_LENGTHDWS_MASK 0xfff
+
+/* IB - LRH header consts */
+#define HFI_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define HFI_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+#define HFI_LRH_SC_SHIFT 12
+#define HFI_LRH_SC_MASK 0xf
+#define HFI_LRH_LVER_SHIFT 8
+#define HFI_LRH_LVER_MASK 0xf
+#define HFI_LRH_SL_SHIFT 4
+#define HFI_LRH_SL_MASK 0xf
+#define HFI_LRH_PKTLEN_MASK 0xfff
+
+/* IB - BTH header consts */
+#define HFI_BTH_OPCODE_SHIFT 24
+#define HFI_BTH_OPCODE_MASK 0xff
+#define HFI_BTH_SE_SHIFT 23
+#define HFI_BTH_MIGREQ_SHIFT 22
+#define HFI_BTH_EXTRA_BYTE_SHIFT 20
+#define HFI_BTH_EXTRA_BYTE_MASK 3
+#define HFI_BTH_TVER_SHIFT 16
+#define HFI_BTH_TVER_MASK 0xF
+
+#define HFI_BTH_BECN_SHIFT 30
+#define HFI_BTH_FECN_SHIFT 31
+#define HFI_BTH_QP_SHIFT 16
+#define HFI_BTH_QP_MASK 0xff
+#define HFI_BTH_FLOWID_SHIFT 11
+#define HFI_BTH_FLOWID_MASK 0x1f
+#define HFI_BTH_SUBCTXT_SHIFT 8
+#define HFI_BTH_SUBCTXT_MASK 0x7
+
+#define HFI_BTH_SEQ_SHIFT 0
+#define HFI_BTH_SEQ_MASK 0x7ff
+#define HFI_BTH_GEN_SHIFT 11
+#define HFI_BTH_GEN_MASK 0xfffff
+#define HFI_BTH_ACK_SHIFT 31
+
+/* KDETH header consts */
+#define HFI_KHDR_OFFSET_MASK 0x7fff
+#define HFI_KHDR_OM_SHIFT 15
+#define HFI_KHDR_TID_SHIFT 16
+#define HFI_KHDR_TID_MASK 0x3ff
+#define HFI_KHDR_TIDCTRL_SHIFT 26
+#define HFI_KHDR_TIDCTRL_MASK 0x3
+#define HFI_KHDR_INTR_SHIFT 28
+#define HFI_KHDR_SH_SHIFT 29
+#define HFI_KHDR_KVER_SHIFT 30
+#define HFI_KHDR_KVER_MASK 0x3
+
+#define HFI_KHDR_MSGSEQ_MASK 0xffff
+#define HFI_KHDR_TINYLEN_MASK 0xf
+#define HFI_KHDR_TINYLEN_SHIFT 16
+#define HFI_KHDR_EGRFLAGS_SHIFT 20
+#define HFI_KHDR_EGRFLAGS_MASK 0x3f
+
+#define GET_HFI_KHDR_TIDCTRL(val) \
+ (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \
+ HFI_KHDR_TIDCTRL_MASK)
+
+#ifdef PSM_CUDA
+extern int is_driver_gpudirect_enabled;
+
+static __inline__ int _psmi_is_driver_gpudirect_enabled() __attribute__((always_inline));
+
+static __inline__ int
+_psmi_is_driver_gpudirect_enabled()
+{
+ return is_driver_gpudirect_enabled;
+}
+#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED _psmi_is_driver_gpudirect_enabled()
+#endif
+
+/* this portion only defines what we currently use */
+struct hfi_pbc {
+ __u32 pbc0;
+ __u16 PbcStaticRateControlCnt;
+ __u16 fill1;
+};
+
+/* hfi kdeth header format */
+struct hfi_kdeth {
+ __u32 kdeth0;
+
+ union {
+ struct {
+ __u16 job_key;
+ __u16 hcrc;
+ };
+ __u32 kdeth1;
+ };
+};
+
+/* misc. */
+#define HFI_CRC_SIZE_IN_BYTES 4
+#define HFI_PCB_SIZE_IN_BYTES 8
+
+#define HFI_EAGER_TIDCTRL 0x0
+
+#define HFI_DEFAULT_SERVICE_ID 0x1000117500000000ULL
+#define HFI_DEFAULT_P_KEY 0x8001 /* fabric default pkey for app traffic */
+
+#if 0
+#define HFI_PERMISSIVE_LID 0xFFFF
+#define HFI_AETH_CREDIT_SHIFT 24
+#define HFI_AETH_CREDIT_MASK 0x1F
+#define HFI_AETH_CREDIT_INVAL 0x1F
+#define HFI_PSN_MASK 0xFFFFFF
+#define HFI_MSN_MASK 0xFFFFFF
+#define HFI_QPN_MASK 0xFFFFFF
+#define HFI_MULTICAST_LID_BASE 0xC000
+#define HFI_MULTICAST_QPN 0xFFFFFF
+#endif
+
+/* Receive Header Queue: receive type (from hfi) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+/* OPA PSM assumes that the message header is always 56 bytes. */
+#define HFI_MESSAGE_HDR_SIZE 56
+/* Usable bytes in header (hdrsize - lrh - bth) */
+#define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20)
+/* SPIO includes 8B PBC and message header */
+#define HFI_SPIO_HDR_SIZE (8+56)
+/*
+ * SDMA includes 8B sdma hdr, 8B PBC, and message header.
+ * If we are using GPU workloads, we need to set a new
+ * "flags" member which takes another 2 bytes in the
+ * sdma hdr. We let the driver know of this 2 extra bytes
+ * at runtime when we set the length for the iovecs.
+ */
+#define HFI_SDMA_HDR_SIZE (8+8+56)
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 hfi_hdrget_err_flags(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[1]) & HFI_RHF_ERR_MASK;
+}
+
+static inline __u32 hfi_hdrget_rcv_type(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_RCVTYPE_SHIFT)
+ & HFI_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 hfi_hdrget_length_in_bytes(const __le32 *rbuf)
+{
+ return ((__le32_to_cpu(rbuf[0]) >> HFI_RHF_LENGTH_SHIFT)
+ & HFI_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 hfi_hdrget_egrbfr_index(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_EGRBFR_INDEX_SHIFT)
+ & HFI_RHF_EGRBFR_INDEX_MASK;
+}
+
+static inline __u32 hfi_hdrget_seq(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT)
+ & HFI_RHF_SEQ_MASK;
+}
+
+static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT)
+ & HFI_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 hfi_hdrget_egrbfr_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_EGRBFR_OFFSET_SHIFT)
+ & HFI_RHF_EGRBFR_OFFSET_MASK;
+}
+
+static inline __u32 hfi_hdrget_use_egrbfr(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_USE_EGRBFR_SHIFT)
+ & HFI_RHF_USE_EGRBFR_MASK;
+}
+
+/* interval timing routines */
+/* Convert a count of cycles to elapsed nanoseconds */
+/* this is only accurate for reasonably large numbers of cycles (at least tens)
+*/
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+ __attribute__ ((always_inline));
+/* convert elapsed nanoseconds to elapsed cycles */
+/* this is only accurate for reasonably large numbers of nsecs (at least tens)
+*/
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+ __attribute__ ((always_inline));
+/* get current count of nanoseconds from unspecified base value (only useful
+ for intervals) */
+static __inline__ uint64_t get_nanoseconds() __attribute__ ((always_inline));
+
+struct _hfi_ctrl {
+ int32_t fd; /* device file descriptor */
+ /* tidflow valid */
+ uint32_t __hfi_tfvalid;
+ /* unit id */
+ uint32_t __hfi_unit;
+ /* port id */
+ uint32_t __hfi_port;
+
+ /* number of eager tid entries */
+ uint32_t __hfi_tidegrcnt;
+ /* number of expected tid entries */
+ uint32_t __hfi_tidexpcnt;
+
+ /* effective mtu size, should be <= base_info.mtu */
+ uint32_t __hfi_mtusize;
+ /* max PIO size, should be <= effective mtu size */
+ uint32_t __hfi_piosize;
+
+ /* two struct output from driver. */
+ struct hfi1_ctxt_info ctxt_info;
+ struct hfi1_base_info base_info;
+
+ /* some local storages in some condition: */
+ /* as storage of __hfi_rcvtidflow in hfi_userinit(). */
+ __le64 regs[HFI_TF_NFLOWS];
+
+ /* location to which OPA writes the rcvhdrtail register whenever
+ it changes, so that no chip registers are read in the performance
+ path. */
+ volatile __le64 *__hfi_rcvtail;
+
+ /* address where ur_rcvhdrtail is written */
+ volatile __le64 *__hfi_rcvhdrtail;
+ /* address where ur_rcvhdrhead is written */
+ volatile __le64 *__hfi_rcvhdrhead;
+ /* address where ur_rcvegrindextail is read */
+ volatile __le64 *__hfi_rcvegrtail;
+ /* address where ur_rcvegrindexhead is written */
+ volatile __le64 *__hfi_rcvegrhead;
+ /* address where ur_rcvegroffsettail is read */
+ volatile __le64 *__hfi_rcvofftail;
+ /* address where ur_rcvtidflow is written */
+ volatile __le64 *__hfi_rcvtidflow;
+};
+
+/* After the device is opened, hfi_userinit() is called to give the driver the
+ parameters the user code wants to use, and to get the implementation values,
+ etc. back. 0 is returned on success, a positive value is a standard errno,
+ and a negative value is reserved for future use. The first argument is
+ the filedescriptor returned by the device open.
+
+ It is allowed to have multiple devices (and of different types)
+ simultaneously opened and initialized, although this won't be fully
+ implemented initially. This routine is used by the low level
+ hfi protocol code (and any other code that has similar low level
+ functionality).
+ This is the only routine that takes a file descriptor, rather than a
+ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything
+ else is returned by this routine.
+*/
+
+struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *);
+
+/* don't inline these; it's all init code, and not inlining makes the */
+/* overall code shorter and easier to debug */
+void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline));
+
+/* set the BTH pkey to check for this process. */
+/* This is for receive checks, not for sends. It isn't necessary
+ to set the default key, that's always allowed by the hardware.
+ If too many pkeys are in use for the hardware to support, this
+ will return EAGAIN, and the caller should then fail and exit
+ or use the default key and check the pkey in the received packet
+ checking. */
+int32_t hfi_set_pkey(struct _hfi_ctrl *, uint16_t);
+
+/* flush the eager buffers, by setting the
+ eager index head register == eager index tail, if queue is full */
+void hfi_flush_egr_bufs(struct _hfi_ctrl *ctrl);
+
+int hfi_wait_for_packet(struct _hfi_ctrl *);
+
+/* stop_start == 0 disables receive on the context, for use in queue overflow
+ conditions. stop_start==1 re-enables, and returns value of tail register,
+ to be used to re-init the software copy of the head register */
+int hfi_manage_rcvq(struct _hfi_ctrl *ctrl, uint32_t stop_start);
+
+/* ctxt_bp == 0 disables fabric back pressure on the context. */
+/* ctxt_bp == 1 enables fabric back pressure on the context. */
+int hfi_manage_bp(struct _hfi_ctrl *ctrl, uint8_t ctxt_bp);
+
+/* enable == 1 enables armlaunch (normal), 0 disables (only used */
+/* hfi_pkt_test -B at the moment, needed for linda). */
+int hfi_armlaunch_ctrl(struct _hfi_ctrl *ctrl, uint32_t enable);
+
+/* force an update of the PIOAvail register to memory */
+int hfi_force_pio_avail_update(struct _hfi_ctrl *ctrl);
+
+/* Disarm any send buffers which need disarming. */
+int hfi_disarm_bufs(struct _hfi_ctrl *ctrl);
+
+/* New user event mechanism, using spi_sendbuf_status HFI_EVENT_* bits
+ obsoletes hfi_disarm_bufs(), and extends it, although old mechanism
+ remains for binary compatibility. */
+int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits);
+
+/* Return send dma's current "in flight counter " */
+int hfi_sdma_inflight(struct _hfi_ctrl *ctrl, uint32_t *counter);
+
+/* Return send dma's current "completion counter" */
+int hfi_sdma_complete(struct _hfi_ctrl *ctrl, uint32_t *counter);
+
+/* set whether we want an interrupt on all packets, or just urgent ones */
+int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type);
+
+/* set send context pkey to verify, error if driver is not configured with */
+/* this pkey in its pkey table. */
+int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey);
+
+/* reset halted send context, error if context is not halted. */
+int hfi_reset_context(struct _hfi_ctrl *ctrl);
+
+/* Statistics maintained by the driver */
+const char *hfi_get_next_name(char **names);
+uint64_t hfi_get_single_stat(const char *attr, uint64_t *s);
+int hfi_get_stats_names_count(void);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit_names_count(int unitno);
+int hfi_get_ctrs_port_names_count(int unitno);
+
+uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c);
+void hfi_release_names(char *namep);
+
+/* Syslog wrapper
+
+ level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+ LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+
+ prefix should be a short string to describe which part of the software stack
+ is using syslog, i.e. "PSM", "mpi", "mpirun".
+*/
+void hfi_syslog(const char *prefix, int to_console, int level,
+ const char *format, ...)
+ __attribute__((format(printf, 4, 5)));
+
+void hfi_vsyslog(const char *prefix, int to_console, int level,
+ const char *format, va_list ap);
+
+/* parameters for PBC for pio write routines, to avoid passing lots
+ * of args; we instead pass the structure pointer.
+ * NOTE(review): field semantics below inferred from names and the pio write
+ * routine signatures above -- confirm against the implementations. */
+struct hfi_pio_params {
+ uint16_t length;
+ uint8_t vl;
+ uint8_t port;
+ uint32_t cksum_is_valid; /* presumably nonzero => cksum is used */
+ uint32_t cksum;
+ uint32_t rate;
+};
+
+/* write pio buffers. The hfi_write_pio_force_order() version assumes
+ that the processor does not write store buffers to i/o devices in the
+ order in which they are written, and that when flushing partially
+ filled store buffers, the words are not ordered either. The hfi_write_pio()
+ form is used when the processor writes store buffers to i/o in the order
+ in which they are filled, and writes partially filled buffers in increasing
+ address order (assuming they are filled that way).
+ The arguments are pio buffer address, payload length, header, and payload
+*/
+void hfi_write_pio(volatile uint32_t *, const struct hfi_pio_params *,
+ void *, void *);
+void hfi_write_pio_force_order(volatile uint32_t *,
+ const struct hfi_pio_params *, void *, void *);
+
+#define HFI_SPECIAL_TRIGGER_MAGIC 0xaebecede
+/* IBA7220 can use a "Special" trigger. We write to the last dword
+ in the mapped SendBuf to trigger the launch. */
+void hfi_write_pio_special_trigger2k(volatile uint32_t *,
+ const struct hfi_pio_params *, void *,
+ void *);
+void hfi_write_pio_special_trigger4k(volatile uint32_t *,
+ const struct hfi_pio_params *, void *,
+ void *);
+
+/*
+ * Copy routine that may copy a byte multiple times but optimized for throughput
+ * This is not safe to use for PIO routines where we want a guarantee that a
+ * byte is only copied/moved across the bus once.
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src,
+ uint32_t ndwords);
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src,
+ uint32_t nqwords);
+
+/*
+* Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once.
+*/
+#if defined(__x86_64__)
+void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src,
+ uint32_t ndwords);
+void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src,
+ uint32_t nqwords);
+#else
+#define hfi_dwordcpy_safe hfi_dwordcpy
+#define hfi_qwordcpy_safe hfi_qwordcpy
+#endif
+
+/* From here to the end of the file are implementation details that should not
+ be used outside this file (other than to call the function), except in the
+ one infrastructure file in which they are defined.
+
+ NOTE: doing paired 32 bit writes to the chip to store 64 bit values (as
+ from 32 bit programs) will not work correctly, because there is no sub-qword
+ address decode. Therefore 32 bit programs use only a single 32 bit store;
+ the head register values are all less than 32 bits, anyway. Given that, we
+ use only 32 bits even for 64 bit programs, for simplicity. These functions
+ must not be called until after hfi_userinit() is called. The ctrl argument
+ is currently unused, but remains useful for adding debug code.
+*/
+
+/* Advance the receive eager index head: store val, as a little-endian
+   64-bit value, to the mapped ur_rcvegrindexhead location.  See the note
+   above about single 32-bit-safe stores; ctrl is currently unused beyond
+   carrying the mapped address. */
+static __inline__ void hfi_put_rcvegrindexhead(struct _hfi_ctrl *ctrl,
+ uint64_t val)
+{
+ *ctrl->__hfi_rcvegrhead = __cpu_to_le64(val);
+}
+
+/* Advance the receive header queue head: store val, as a little-endian
+   64-bit value, to the mapped ur_rcvhdrhead location. */
+static __inline__ void hfi_put_rcvhdrhead(struct _hfi_ctrl *ctrl, uint64_t val)
+{
+ *ctrl->__hfi_rcvhdrhead = __cpu_to_le64(val);
+}
+
+/* Read the receive header tail from the memory location the device updates
+   (__hfi_rcvtail), converting from little-endian.  The read barrier orders
+   this load before any subsequent reads of newly arrived header entries. */
+static __inline__ uint64_t hfi_get_rcvhdrtail(struct _hfi_ctrl *ctrl)
+{
+ uint64_t res = __le64_to_cpu(*ctrl->__hfi_rcvtail);
+ ips_rmb();
+ return res;
+}
+
+/* Program tidflow table entry 'flowid' with the given generation value and
+   sequence number, marking the flow valid (per __hfi_tfvalid) and enabling
+   header suppression. */
+static __inline__ void hfi_tidflow_set_entry(struct _hfi_ctrl *ctrl,
+ uint32_t flowid, uint32_t genval,
+ uint32_t seqnum)
+{
+/* For proper behavior with RSM interception of FECN packets for CCA,
+ * the tidflow entry needs the KeepAfterSequenceError bit set.
+ * A packet that is converted from expected to eager by RSM will not
+ * trigger an update in the tidflow state. This will cause the tidflow
+ * to incorrectly report a sequence error on any non-FECN packets that
+ * arrive after the RSM intercepted packets. If the KeepAfterSequenceError
+ * bit is set, PSM can properly detect this "false SeqErr" condition,
+ * and recover without dropping packets.
+ * Note that if CCA/RSM are not important, this change will slightly
+ * increase the CPU load when packets are dropped. If this is significant,
+ * consider hiding this change behind a CCA/RSM environment variable.
+ */
+
+ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64(
+ ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) |
+ ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) |
+ ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) |
+ (1ULL << HFI_TF_HDRSUPP_ENABLED_SHIFT) |
+ /* KeepAfterSequenceError = 1 -- previously was 0 */
+ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) |
+ (1ULL << HFI_TF_KEEP_ON_GENERR_SHIFT) |
+ /* KeepPayloadOnGenErr = 0 */
+ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) |
+ (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT));
+}
+
+/* Reset tidflow table entry 'flowid' to the "invalid" state described below,
+   using the caller-supplied special genval/seqnum values. */
+static __inline__ void hfi_tidflow_reset(struct _hfi_ctrl *ctrl,
+ uint32_t flowid, uint32_t genval,
+ uint32_t seqnum)
+{
+/*
+ * If a tidflow table entry is set to "Invalid", we want to drop the
+ * header if the payload is dropped, and we want to get a header if the
+ * payload is delivered.
+ *
+ * We set a tidflow table entry "Invalid" by setting FlowValid=1 and
+ * GenVal=0x1FFF/0xFFFFF; this is a special generation number and no
+ * packet will use this value. We don't care about SeqNum, but we set it
+ * to 0x7FF. So if GenVal does not match, the payload is dropped because
+ * KeepPayloadOnGenErr=0; for the packet header, KeepOnGenErr=0 makes sure
+ * the header is not generated. But if a packet happens to have the special
+ * generation number, the payload is delivered; HdrSuppEnabled=0 makes
+ * sure a header is generated if SeqNum matches, and if SeqNum does not
+ * match, KeepAfterSeqErr=1 makes sure the header is generated.
+ */
+ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64(
+ /* genval = 0x1FFF or 0xFFFFF */
+ ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) |
+ /* seqnum = 0x7FF */
+ ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) |
+ ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) |
+ /* HdrSuppEnabled = 0 */
+ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) |
+ /* KeepOnGenErr = 0 */
+ /* KeepPayloadOnGenErr = 0 */
+ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) |
+ (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT));
+}
+
+/*
+ * Read back the raw 64-bit tidflow entry for 'flowid'.
+ * This should only be used for debugging.
+ * Normally, we shouldn't read the chip.
+ */
+static __inline__ uint64_t hfi_tidflow_get(struct _hfi_ctrl *ctrl,
+ uint32_t flowid)
+{
+ return __le64_to_cpu(ctrl->__hfi_rcvtidflow[flowid]);
+}
+
+/* Field extractors for a raw tidflow register value as returned by
+   hfi_tidflow_get(): each shifts and masks out one bitfield. */
+static __inline__ uint32_t hfi_tidflow_get_seqnum(uint64_t val)
+{
+ return (val >> HFI_TF_SEQNUM_SHIFT) & HFI_TF_SEQNUM_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_genval(uint64_t val)
+{
+ return (val >> HFI_TF_GENVAL_SHIFT) & HFI_TF_GENVAL_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_flowvalid(uint64_t val)
+{
+ return (val >> HFI_TF_FLOWVALID_SHIFT) & HFI_TF_FLOWVALID_MASK;
+}
+
+/* header suppression enabled bit */
+static __inline__ uint32_t hfi_tidflow_get_enabled(uint64_t val)
+{
+ return (val >> HFI_TF_HDRSUPP_ENABLED_SHIFT) &
+ HFI_TF_HDRSUPP_ENABLED_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_after_seqerr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_AFTER_SEQERR_SHIFT) &
+ HFI_TF_KEEP_AFTER_SEQERR_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_on_generr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_ON_GENERR_SHIFT) &
+ HFI_TF_KEEP_ON_GENERR_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_keep_payload_on_generr(uint64_t val)
+{
+ return (val >> HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT) &
+ HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK;
+}
+
+/* status bits: sequence / generation mismatch indicators */
+static __inline__ uint32_t hfi_tidflow_get_seqmismatch(uint64_t val)
+{
+ return (val >> HFI_TF_STATUS_SEQMISMATCH_SHIFT) &
+ HFI_TF_STATUS_SEQMISMATCH_MASK;
+}
+
+static __inline__ uint32_t hfi_tidflow_get_genmismatch(uint64_t val)
+{
+ return (val >> HFI_TF_STATUS_GENMISMATCH_SHIFT) &
+ HFI_TF_STATUS_GENMISMATCH_MASK;
+}
+
+/*
+ * This should only be used by a process to write the eager index into
+ * a subcontext's eager header entry.  Each helper below clears one RHF
+ * bitfield in the receive-header buffer word and ORs in the new value
+ * (stored little-endian; val is masked to the field width first).
+ */
+static __inline__ void hfi_hdrset_use_egrbfr(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_USE_EGRBFR_MASK <<
+ HFI_RHF_USE_EGRBFR_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_USE_EGRBFR_MASK) <<
+ HFI_RHF_USE_EGRBFR_SHIFT);
+}
+
+static __inline__ void hfi_hdrset_egrbfr_index(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_EGRBFR_INDEX_MASK <<
+ HFI_RHF_EGRBFR_INDEX_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_EGRBFR_INDEX_MASK) <<
+ HFI_RHF_EGRBFR_INDEX_SHIFT);
+}
+
+/* offset field lives in the second RHF dword (rbuf[1]) */
+static __inline__ void hfi_hdrset_egrbfr_offset(__le32 *rbuf, uint32_t val)
+{
+ rbuf[1] =
+ (rbuf[1] &
+ __cpu_to_le32(~(HFI_RHF_EGRBFR_OFFSET_MASK <<
+ HFI_RHF_EGRBFR_OFFSET_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_EGRBFR_OFFSET_MASK) <<
+ HFI_RHF_EGRBFR_OFFSET_SHIFT);
+}
+
+/*
+ * This should only be used by a process to update the receive header
+ * error flags.  Unlike the field setters above, this ORs val into
+ * rbuf[1] without clearing any existing bits.
+ */
+static __inline__ void hfi_hdrset_err_flags(__le32 *rbuf, uint32_t val)
+{
+ rbuf[1] |= __cpu_to_le32(val);
+}
+
+/*
+ * This should only be used by a process to write the rhf seq number into
+ * a subcontext's eager header entry.  Clears the SEQ field of rbuf[0]
+ * and ORs in the masked new value (little-endian storage).
+ */
+static __inline__ void hfi_hdrset_seq(__le32 *rbuf, uint32_t val)
+{
+ rbuf[0] =
+ (rbuf[0] &
+ __cpu_to_le32(~(HFI_RHF_SEQ_MASK <<
+ HFI_RHF_SEQ_SHIFT))) |
+ __cpu_to_le32((val & HFI_RHF_SEQ_MASK) << HFI_RHF_SEQ_SHIFT);
+}
+
+/* Manage TID entries. It is possible that not all entries
+ requested may be allocated. A matching hfi_free_tid() must be
+ done for each hfi_update_tid(), because currently no caching or
+ reuse of expected tid entries is allowed, to work around malloc/free
+ and mmap/munmap issues. The driver decides which TID entries to allocate.
+ If hfi_free_tid is called to free entries in use by a different
+ send by the same process, data corruption will probably occur,
+ but only within that process, not for other processes.
+*/
+
+/* update tidcnt expected TID entries from the array pointed to by tidinfo. */
+/* Returns the hfi_cmd_write() result: -1 on failure (errno set), else the */
+/* driver's (non-negative) result; on success *length and *tidcnt are */
+/* updated with the values the driver actually mapped. */
+static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl,
+ uint64_t vaddr, uint32_t *length,
+ uint64_t tidlist, uint32_t *tidcnt, uint16_t flags)
+{
+ struct hfi1_cmd cmd;
+#ifdef PSM_CUDA
+ struct hfi1_tid_info_v2 tidinfo;
+#else
+ struct hfi1_tid_info tidinfo;
+#endif
+ int err;
+
+ tidinfo.vaddr = vaddr; /* base address for this send to map */
+ tidinfo.length = *length; /* length of vaddr */
+
+ tidinfo.tidlist = tidlist; /* driver copies tids back directly */
+ tidinfo.tidcnt = 0; /* clear to zero */
+
+ /* default command; overridden below for the CUDA (v2) build */
+ cmd.type = PSMI_HFI_CMD_TID_UPDATE;
+#ifdef PSM_CUDA
+ cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2;
+
+ /* 'flags' is only honored for GPUDirect-enabled drivers; otherwise it
+  * is ignored (and unused entirely in non-CUDA builds). */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ tidinfo.flags = flags;
+ else
+ tidinfo.flags = 0;
+#endif
+
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ if (err != -1) {
+ *length = tidinfo.length;
+ *tidcnt = tidinfo.tidcnt;
+ }
+
+ return err;
+}
+
+/* Free tidcnt expected TID entries previously allocated via
+   hfi_update_tid().  Returns the hfi_cmd_write() result: -1 on failure
+   (errno set), else non-negative. */
+static __inline__ int32_t hfi_free_tid(struct _hfi_ctrl *ctrl,
+ uint64_t tidlist, uint32_t tidcnt)
+{
+ struct hfi1_cmd cmd;
+ struct hfi1_tid_info tidinfo;
+ int err;
+
+ /* NOTE(review): tidinfo.vaddr/length are left uninitialized here;
+  * presumably TID_FREE only reads tidlist/tidcnt -- confirm against
+  * the hfi1 driver. */
+ tidinfo.tidlist = tidlist; /* input to driver */
+ tidinfo.tidcnt = tidcnt;
+
+ cmd.type = PSMI_HFI_CMD_TID_FREE;
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ return err;
+}
+
+/* Read the list of invalidated TIDs from the driver into the caller's
+   buffer at 'tidlist'; on success *tidcnt is set to the number returned.
+   Returns the hfi_cmd_write() result: -1 on failure (errno set). */
+static __inline__ int32_t hfi_get_invalidation(struct _hfi_ctrl *ctrl,
+ uint64_t tidlist, uint32_t *tidcnt)
+{
+ struct hfi1_cmd cmd;
+ struct hfi1_tid_info tidinfo;
+ int err;
+
+ /* NOTE(review): tidinfo.vaddr/length are left uninitialized here;
+  * presumably TID_INVAL_READ ignores them -- confirm against driver. */
+ tidinfo.tidlist = tidlist; /* driver copies tids back directly */
+ tidinfo.tidcnt = 0; /* clear to zero */
+
+ cmd.type = PSMI_HFI_CMD_TID_INVAL_READ;
+ cmd.len = sizeof(tidinfo);
+ cmd.addr = (__u64) &tidinfo;
+
+ err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd));
+
+ if (err != -1)
+ *tidcnt = tidinfo.tidcnt;
+
+ return err;
+}
+
+extern uint32_t __hfi_pico_per_cycle; /* only for use in these functions */
+
+/* Convert CPU cycles to nanoseconds via the calibrated picoseconds-per-cycle
+   value.  This is only accurate for reasonably large numbers of cycles
+   (at least tens), since the division truncates. */
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs)
+{
+ return (__hfi_pico_per_cycle * cycs) / 1000ULL;
+}
+
+/* Inverse of cycles_to_nanosecs().  This is only accurate for reasonably
+   large numbers of nsecs (at least tens), since the division truncates. */
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns)
+{
+ return (ns * 1000ULL) / __hfi_pico_per_cycle;
+}
+
+/* Current time in nanoseconds from an unspecified base; only meaningful
+   for computing intervals (see declaration above).  (void) gives a proper
+   prototype -- an empty list declares unchecked arguments in C. */
+static __inline__ uint64_t get_nanoseconds(void)
+{
+ return cycles_to_nanosecs(get_cycles());
+}
+
+/* open the diags device, if supported by driver. Returns 0 on */
+/* success, errno on failure. Also tells driver that diags */
+/* is active, which changes some driver behavior */
+int hfi_diag_open(unsigned); /* unit */
+int hfi_diag_close(void);
+
+/* diags chip read and write routines */
+
+int hfid_read32(uint64_t reg_offset, uint32_t *read_valp);
+int hfid_write32(uint64_t reg_offset, uint32_t write_val);
+
+int hfid_readmult(uint64_t, unsigned, uint64_t *); /* chip: offset, cnt, ptr */
+int hfid_write(uint64_t, uint64_t); /* chip: offset, value */
+
+#define HFI_READ_EEPROM 31337
+#define HFI_WRITE_EEPROM 101
+
+/* Request block for the diags EEPROM/i2c access routines
+   (see hfid_read_i2c() below). */
+struct hfi_eeprom_req {
+ void *addr; /* caller's data buffer */
+ uint16_t len; /* number of bytes to transfer */
+ uint16_t offset; /* starting offset within the flash */
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define HFI_FLASH_VERSION 2
+struct hfi_flash {
+ /* flash layout version (HFI_FLASH_VERSION) */
+ __u8 if_fversion;
+ /* checksum protecting if_length bytes */
+ __u8 if_csum;
+ /*
+ * valid length (in use, protected by if_csum), including
+ * if_fversion and if_csum themselves
+ */
+ __u8 if_length;
+ /* the GUID, in network order */
+ __u8 if_guid[8];
+ /* number of GUIDs to use, starting from if_guid */
+ __u8 if_numguid;
+ /* the (last 10 characters of) board serial number, in ASCII */
+ char if_serial[12];
+ /* board mfg date (YYYYMMDD ASCII) */
+ char if_mfgdate[8];
+ /* last board rework/test date (YYYYMMDD ASCII) */
+ char if_testdate[8];
+ /* logging of error counts, TBD */
+ __u8 if_errcntp[4];
+ /* powered on hours, updated at driver unload */
+ __u8 if_powerhour[2];
+ /* ASCII free-form comment field */
+ char if_comment[32];
+ /* Backwards compatible prefix for longer QLogic Serial Numbers */
+ char if_sprefix[4];
+ /* 82 bytes used, min flash size is 128 bytes */
+ __u8 if_future[46];
+};
+
+int hfid_send_pkt(const void *, unsigned); /* send a packet for diags */
+int hfid_read_i2c(struct hfi_eeprom_req *); /* diags read i2c flash */
+
+__u8 hfi_flash_csum(struct hfi_flash *, int);
+
+int hfid_reset_hardware(uint32_t);
+
+#endif /* OPA_USER_H */
diff --git a/include/psm2_mock_testing.h b/include/psm2_mock_testing.h
new file mode 100644
index 0000000..d1e9bff
--- /dev/null
+++ b/include/psm2_mock_testing.h
@@ -0,0 +1,176 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MOCK_TESTING_H
+#define PSM2_MOCK_TESTING_H
+
+/* PSM2_MOCK_TESTING being defined flips a couple of switches so that a
+ * testable version of libpsm2.so is built. It'll make properly annotated
+ * static functions be non-static, visible to the outside. Also, all mockable
+ * functions will be replaced with function pointers which will originally
+ * point to the actual implementation. However, those function pointers might
+ * be reset by the test code, thus allowing for mocking selected PSM2 functions
+ * for the purpose of the test.
+ *
+ * So far the following utilities have been introduced for enabling a
+ * conditional compilation of the testable vs. production version of the library:
+ * - ustatic: toggles function visibility
+ * - MOCKABLE(): decorates function name so that it is visible after being mocked
+ * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ * for mocking a function
+ * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ * for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ * printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ * printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ * printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
+#ifndef PSM2_MOCK_TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef PSM2_MOCK_TESTING */
+
+/* For the testable version, all _ustatic_ functions will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function happens.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function happens.
+ */
+#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_;
+
+#endif /* ndef PSM2_MOCK_TESTING */
+
+#endif /* PSM2_MOCK_TESTING_H */
+
diff --git a/include/rbtree.c b/include/rbtree.c
new file mode 100644
index 0000000..9d6930d
--- /dev/null
+++ b/include/rbtree.c
@@ -0,0 +1,692 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+/*
+ * Abstract:
+ * Implementation of quick map, a binary tree where the caller always provides
+ * all necessary storage.
+ *
+ * Environment:
+ * All
+ *
+ * $Revision$
+ */
+
+
+/*****************************************************************************
+*
+* Map
+*
+* Map is an associative array. By providing a key, the caller can retrieve
+* an object from the map. All objects in the map have an associated key,
+* as specified by the caller when the object was inserted into the map.
+* In addition to random access, the caller can traverse the map much like
+* a linked list, either forwards from the first object or backwards from
+* the last object. The objects in the map are always traversed in
+* order since the nodes are stored sorted.
+*
+* This implementation of Map uses a red black tree verified against
+* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth
+* printing, 1994.
+*
+*****************************************************************************/
+
+#include <string.h> /* for memset declaration */
+
+#if !defined ( RBTREE_GET_LEFTMOST ) || \
+ ! defined ( RBTREE_GET_RIGHTMOST ) || \
+ ! defined ( RBTREE_MAP_COUNT ) || \
+ ! defined ( RBTREE_ASSERT )
+#error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \
+ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c"
+#endif
+
+#define IN /* nothing */
+
+/******************************************************************************
+*******************************************************************************
+************** ************
+************** IMPLEMENTATION OF QUICK MAP ************
+************** ************
+*******************************************************************************
+******************************************************************************/
+
+/* Forward declarations: */
+static void ips_cl_qmap_init(
+ IN cl_qmap_t *p_map,
+ IN cl_map_item_t* const root,
+ IN cl_map_item_t* const nil);
+static void ips_cl_qmap_insert_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item);
+static void ips_cl_qmap_remove_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item);
+static cl_map_item_t* ips_cl_qmap_successor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item);
+static cl_map_item_t* ips_cl_qmap_predecessor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item);
+static cl_map_item_t* ips_cl_qmap_search(
+ IN cl_qmap_t* const p_map,
+ IN unsigned long start,
+ IN unsigned long end);
+
+/*
+ * Get the topmost item of the tree.
+ *
+ * p_map->root is a sentinel anchor node, not a tree item: the actual
+ * root of the red-black tree is stored in the sentinel's p_left pointer
+ * (see ips_cl_qmap_init). Returns p_map->nil_item for an empty map.
+ */
+static inline cl_map_item_t*
+__cl_map_root(
+ IN const cl_qmap_t* const p_map )
+{
+ RBTREE_ASSERT( p_map );
+ return( p_map->root->p_left );
+}
+
+
+/*
+ * Returns whether a given item is on the left of its parent.
+ *
+ * Returns non-zero if p_item is its parent's left child, 0 otherwise.
+ * The item must be linked into a tree (asserted: p_up valid and not self).
+ */
+static int
+__cl_map_is_left_child(
+ IN const cl_map_item_t* const p_item )
+{
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_up );
+ RBTREE_ASSERT( p_item->p_up != p_item );
+
+ return( p_item->p_up->p_left == p_item );
+}
+
+
+/*
+ * Retrieve the pointer to the parent's pointer to an item.
+ *
+ * Lets the rotation and removal code relink an item's position without
+ * first determining which side of the parent the item hangs from.
+ */
+static cl_map_item_t**
+__cl_map_get_parent_ptr_to_item(
+ IN cl_map_item_t* const p_item )
+{
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_up );
+ RBTREE_ASSERT( p_item->p_up != p_item );
+
+ if( __cl_map_is_left_child( p_item ) )
+ return( &p_item->p_up->p_left );
+
+ RBTREE_ASSERT( p_item->p_up->p_right == p_item );
+ return( &p_item->p_up->p_right );
+}
+
+
+/*
+ * Rotate a node to the left. This rotation affects the least number of links
+ * between nodes and brings the level of C up by one while increasing the depth
+ * of A one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ * Precondition (asserted): p_item's right child is not the nil sentinel.
+ *
+ * R R
+ * | |
+ * A C
+ * / \ / \
+ * W C A Z
+ * / \ / \
+ * B Z W B
+ * / \ / \
+ * X Y X Y
+ */
+static void
+__cl_map_rot_left(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t **pp_root;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_right != p_map->nil_item );
+
+ /* Must be fetched before any links are rewritten below. */
+ pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+
+ /* Point R to C instead of A. */
+ *pp_root = p_item->p_right;
+ /* Set C's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set A's right to B */
+ p_item->p_right = (*pp_root)->p_left;
+ /*
+ * Set B's parent to A. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if( (*pp_root)->p_left != p_map->nil_item )
+ (*pp_root)->p_left->p_up = p_item;
+
+ /* Set C's left to A. */
+ (*pp_root)->p_left = p_item;
+ /* Set A's parent to C. */
+ p_item->p_up = *pp_root;
+}
+
+
+/*
+ * Rotate a node to the right. This rotation affects the least number of links
+ * between nodes and brings the level of A up by one while increasing the depth
+ * of C one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ * Mirror image of __cl_map_rot_left.
+ * Precondition (asserted): p_item's left child is not the nil sentinel.
+ *
+ * R R
+ * | |
+ * C A
+ * / \ / \
+ * A Z W C
+ * / \ / \
+ * W B B Z
+ * / \ / \
+ * X Y X Y
+ */
+static void
+__cl_map_rot_right(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t **pp_root;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item->p_left != p_map->nil_item );
+
+ /* Point R to A instead of C. */
+ pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+ (*pp_root) = p_item->p_left;
+ /* Set A's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set C's left to B */
+ p_item->p_left = (*pp_root)->p_right;
+ /*
+ * Set B's parent to C. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if( (*pp_root)->p_right != p_map->nil_item )
+ (*pp_root)->p_right->p_up = p_item;
+
+ /* Set A's right to C. */
+ (*pp_root)->p_right = p_item;
+ /* Set C's parent to A. */
+ p_item->p_up = *pp_root;
+}
+
+/*
+ * Balance a tree starting at a given item back to the root.
+ *
+ * Standard red-black insert fixup (per the Cormen-Leiserson-Rivest text
+ * cited in the file header): p_item enters colored red; while its parent
+ * is also red, recolor and/or rotate to restore the red-black invariants.
+ * The two branches below are mirror images of each other (parent is a
+ * left child vs. a right child).
+ */
+static void
+__cl_map_ins_bal(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* p_item )
+{
+ cl_map_item_t* p_grand_uncle;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_item != p_map->root );
+
+ while( p_item->p_up->color == CL_MAP_RED )
+ {
+ if( __cl_map_is_left_child( p_item->p_up ) )
+ {
+ p_grand_uncle = p_item->p_up->p_up->p_right;
+ RBTREE_ASSERT( p_grand_uncle );
+ if( p_grand_uncle->color == CL_MAP_RED )
+ {
+ /* Case 1: red uncle - recolor and continue from grandparent. */
+ p_grand_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ p_item = p_item->p_up->p_up;
+ continue;
+ }
+
+ if( !__cl_map_is_left_child( p_item ) )
+ {
+ /* Case 2: black uncle, item is a right child - rotate to case 3. */
+ p_item = p_item->p_up;
+ __cl_map_rot_left( p_map, p_item );
+ }
+ /* Case 3: black uncle, item is a left child - recolor and rotate. */
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_item->p_up->p_up );
+ }
+ else
+ {
+ /* Mirror image of the branch above. */
+ p_grand_uncle = p_item->p_up->p_up->p_left;
+ RBTREE_ASSERT( p_grand_uncle );
+ if( p_grand_uncle->color == CL_MAP_RED )
+ {
+ p_grand_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ p_item = p_item->p_up->p_up;
+ continue;
+ }
+
+ if( __cl_map_is_left_child( p_item ) )
+ {
+ p_item = p_item->p_up;
+ __cl_map_rot_right( p_map, p_item );
+ }
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_item->p_up->p_up->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_item->p_up->p_up );
+ }
+ }
+}
+
+/*
+ * Initialize an empty map.
+ *
+ * The caller supplies all storage: 'root' becomes the sentinel anchor
+ * node (its p_left holds the actual tree root) and 'nil_item' becomes
+ * the shared NIL leaf sentinel. Both sentinels are colored black and
+ * made self-referencing so traversal code never dereferences NULL.
+ */
+static void ips_cl_qmap_init(
+ IN cl_qmap_t *p_map,
+ IN cl_map_item_t* const root,
+ IN cl_map_item_t* const nil_item)
+{
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( root );
+ RBTREE_ASSERT( nil_item );
+
+ memset(p_map,0,sizeof(cl_qmap_t));
+
+ p_map->root = root;
+
+ /* setup the RB tree map */
+ p_map->nil_item = nil_item;
+
+ p_map->root->p_up = p_map->root;
+ p_map->root->p_left = p_map->nil_item;
+ p_map->root->p_right = p_map->nil_item;
+ p_map->root->color = CL_MAP_BLACK;
+
+ p_map->nil_item->p_up = p_map->nil_item;
+ p_map->nil_item->p_left = p_map->nil_item;
+ p_map->nil_item->p_right = p_map->nil_item;
+ p_map->nil_item->color = CL_MAP_BLACK;
+}
+
+/*
+ * Insert a caller-allocated item into the map.
+ *
+ * Ordering key is taken from the item's payload via the user-supplied
+ * RBTREE_GET_LEFTMOST macro. Items whose key compares equal to an
+ * existing key fall into the 'else' branch below and are placed in the
+ * right subtree, so duplicates are permitted.
+ */
+static void
+ips_cl_qmap_insert_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t *p_insert_at, *p_comp_item;
+ int compare_res = 0;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+ RBTREE_ASSERT( p_map->root->p_up == p_map->root );
+ RBTREE_ASSERT( p_map->root->color != CL_MAP_RED );
+ RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+
+ /* Find the insertion location. */
+ p_insert_at = p_map->root;
+ p_comp_item = __cl_map_root( p_map );
+
+ while( p_comp_item != p_map->nil_item )
+ {
+ p_insert_at = p_comp_item;
+
+ /* Traverse the tree until the correct insertion point is found. */
+ if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) )
+ {
+ p_comp_item = p_insert_at->p_left;
+ compare_res = 1;
+ } else {
+ p_comp_item = p_insert_at->p_right;
+ compare_res = -1;
+ }
+ }
+
+ RBTREE_ASSERT( p_insert_at != p_map->nil_item );
+ RBTREE_ASSERT( p_comp_item == p_map->nil_item );
+
+ /* Insert the item. New items always enter as red leaves. */
+ p_item->p_left = p_map->nil_item;
+ p_item->p_right = p_map->nil_item;
+ p_item->color = CL_MAP_RED;
+ if( p_insert_at == p_map->root )
+ {
+ /* Tree was empty: new item becomes the root (sentinel's left child). */
+ p_insert_at->p_left = p_item;
+ }
+ else if( compare_res > 0 ) /* key < p_insert_at->key */
+ {
+ p_insert_at->p_left = p_item;
+ }
+ else
+ {
+ p_insert_at->p_right = p_item;
+ }
+ /* Increase the count. */
+ RBTREE_MAP_COUNT(&p_map->payload)++;
+
+ p_item->p_up = p_insert_at;
+
+ /*
+ * We have added depth to this section of the tree.
+ * Rebalance as necessary as we retrace our path through the tree
+ * and update colors.
+ */
+ __cl_map_ins_bal( p_map, p_item );
+
+ __cl_map_root( p_map )->color = CL_MAP_BLACK;
+
+ /*
+ * Note that it is not necessary to re-color the nil node black because all
+ * red color assignments are made via the p_up pointer, and nil is never
+ * set as the value of a p_up pointer.
+ */
+}
+
+/*
+ * Restore the red-black invariants after removing a black node.
+ *
+ * Standard red-black delete fixup (per the Cormen-Leiserson-Rivest text
+ * cited in the file header): p_item is the child that replaced the
+ * removed node and carries an extra "black". The loop pushes that
+ * extra black up the tree via recoloring and rotations; the two outer
+ * branches are mirror images (p_item is a left vs. right child).
+ * "p_uncle" here is the sibling of p_item.
+ */
+static void
+__cl_map_del_bal(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_uncle;
+
+ while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) )
+ {
+ if( __cl_map_is_left_child( p_item ) )
+ {
+ p_uncle = p_item->p_up->p_right;
+
+ if( p_uncle->color == CL_MAP_RED )
+ {
+ /* Red sibling: rotate so the sibling becomes black. */
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_item->p_up );
+ p_uncle = p_item->p_up->p_right;
+ }
+
+ if( p_uncle->p_right->color != CL_MAP_RED )
+ {
+ if( p_uncle->p_left->color != CL_MAP_RED )
+ {
+ /* Sibling and both its children black: recolor, move up. */
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ /* Sibling black, its left child red: rotate to final case. */
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_uncle );
+ p_uncle = p_item->p_up->p_right;
+ }
+ /* Sibling black with red right child: final rotation, done. */
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ __cl_map_rot_left( p_map, p_item->p_up );
+ break;
+ }
+ else
+ {
+ /* Mirror image of the branch above. */
+ p_uncle = p_item->p_up->p_left;
+
+ if( p_uncle->color == CL_MAP_RED )
+ {
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_right( p_map, p_item->p_up );
+ p_uncle = p_item->p_up->p_left;
+ }
+
+ if( p_uncle->p_left->color != CL_MAP_RED )
+ {
+ if( p_uncle->p_right->color != CL_MAP_RED )
+ {
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_left( p_map, p_uncle );
+ p_uncle = p_item->p_up->p_left;
+ }
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ __cl_map_rot_right( p_map, p_item->p_up );
+ break;
+ }
+ }
+ p_item->color = CL_MAP_BLACK;
+}
+
+/*
+ * Remove an item from the map.
+ *
+ * Removing the nil sentinel is a no-op. For a node with two children,
+ * the in-order successor is spliced out in its place and then exchanged
+ * with the removed node, so callers' pointers to other items stay valid.
+ */
+static void
+ips_cl_qmap_remove_item(
+ IN cl_qmap_t* const p_map,
+ IN cl_map_item_t* const p_item )
+{
+ cl_map_item_t *p_child, *p_del_item;
+
+ RBTREE_ASSERT( p_map );
+ RBTREE_ASSERT( p_item );
+
+ if( p_item == p_map->nil_item )
+ return;
+
+ if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) )
+ {
+ /* The item being removed has children on at most one side. */
+ p_del_item = p_item;
+ }
+ else
+ {
+ /*
+ * The item being removed has children on both sides.
+ * We select the item that will replace it. After removing
+ * the substitute item and rebalancing, the tree will have the
+ * correct topology. Exchanging the substitute for the item
+ * will finalize the removal.
+ */
+ p_del_item = ips_cl_qmap_successor(p_map, p_item);
+ RBTREE_ASSERT( p_del_item != p_map->nil_item );
+ }
+
+ RBTREE_MAP_COUNT(&p_map->payload)--;
+
+ /* Get the pointer to the new root's child, if any. */
+ if( p_del_item->p_left != p_map->nil_item )
+ p_child = p_del_item->p_left;
+ else
+ p_child = p_del_item->p_right;
+
+ /*
+ * This assignment may modify the parent pointer of the nil node.
+ * This is inconsequential.
+ */
+ p_child->p_up = p_del_item->p_up;
+ (*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child;
+
+ /* Removing a black node may violate the black-height invariant. */
+ if( p_del_item->color != CL_MAP_RED )
+ __cl_map_del_bal( p_map, p_child );
+
+ /*
+ * Note that the splicing done below does not need to occur before
+ * the tree is balanced, since the actual topology changes are made by the
+ * preceding code. The topology is preserved by the color assignment made
+ * below (reader should be reminded that p_del_item == p_item in some cases).
+ */
+ if( p_del_item != p_item )
+ {
+ /*
+ * Finalize the removal of the specified item by exchanging it with
+ * the substitute which we removed above.
+ */
+ p_del_item->p_up = p_item->p_up;
+ p_del_item->p_left = p_item->p_left;
+ p_del_item->p_right = p_item->p_right;
+ (*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item;
+ p_item->p_right->p_up = p_del_item;
+ p_item->p_left->p_up = p_del_item;
+ p_del_item->color = p_item->color;
+ }
+
+ RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+}
+
+/*
+ * Return the in-order successor of p_item, or p_map->nil_item when
+ * p_item is the maximum. Either descends to the leftmost node of the
+ * right subtree, or climbs until leaving a left subtree; hitting the
+ * root sentinel means no successor exists.
+ */
+static cl_map_item_t *
+ips_cl_qmap_successor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_tmp;
+
+ p_tmp = p_item->p_right;
+ if (p_tmp != p_map->nil_item) {
+ while (p_tmp->p_left != p_map->nil_item)
+ p_tmp = p_tmp->p_left;
+ return p_tmp;
+ } else {
+ p_tmp = p_item->p_up;
+ while (p_tmp->p_right == p_item) {
+ p_item = p_tmp;
+ p_tmp = p_tmp->p_up;
+ }
+ if (p_tmp == p_map->root)
+ return p_map->nil_item;
+ return p_tmp;
+ }
+}
+
+/*
+ * Return the in-order predecessor of p_item, or p_map->nil_item when
+ * p_item is the minimum. Mirror image of ips_cl_qmap_successor.
+ */
+static cl_map_item_t *
+ips_cl_qmap_predecessor(
+ IN cl_qmap_t* const p_map,
+ IN const cl_map_item_t* p_item )
+{
+ cl_map_item_t *p_tmp;
+
+ p_tmp = p_item->p_left;
+ if (p_tmp != p_map->nil_item) {
+ while (p_tmp->p_right != p_map->nil_item)
+ p_tmp = p_tmp->p_right;
+ return p_tmp;
+ } else {
+ p_tmp = p_item->p_up;
+ while (p_tmp->p_left == p_item) {
+ p_item = p_tmp;
+ p_tmp = p_tmp->p_up;
+ }
+ if (p_tmp == p_map->root)
+ return p_map->nil_item;
+ return p_tmp;
+ }
+}
+
+/*
+ * Return the first (lowest-keyed) node whose range
+ * [RBTREE_GET_LEFTMOST, RBTREE_GET_RIGHTMOST) overlaps [start, end),
+ * or p_map->nil_item if no node overlaps.
+ * NOTE(review): ranges appear half-open given the >=/<= comparisons
+ * below - confirm against the RBTREE_GET_RIGHTMOST definition.
+ */
+static cl_map_item_t *
+ips_cl_qmap_search(cl_qmap_t * const p_map,
+ unsigned long start, unsigned long end)
+{
+ cl_map_item_t *p_item, *p_tmp;
+
+ RBTREE_ASSERT( p_map );
+ p_item = __cl_map_root(p_map);
+
+ while (p_item != p_map->nil_item) {
+ if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) {
+ p_tmp = p_item->p_right;
+ if (p_tmp != p_map->nil_item) {
+ p_item = p_tmp;
+ continue;
+ }
+
+ /*
+ * p_item is on immediate left side of 'start'.
+ */
+ if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) {
+ /*
+ * p_item is on immediate right
+ * side of 'start'.
+ */
+ p_item = ips_cl_qmap_successor(p_map, p_item);
+ if (p_item != p_map->nil_item &&
+ end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+ p_item = p_map->nil_item;
+ }
+ } else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) {
+ p_tmp = p_item->p_left;
+ if (p_tmp != p_map->nil_item) {
+ p_item = p_tmp;
+ continue;
+ }
+
+ /*
+ * p_item is the closest node on the right of 'start';
+ * its predecessor (computed next) is the closest node
+ * on the left of 'start'.
+ */
+ p_tmp = ips_cl_qmap_predecessor(p_map, p_item);
+ if (p_tmp == p_map->nil_item ||
+ (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) {
+ /*
+ * p_item is on immediate right
+ * side of 'start'.
+ */
+ if (end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+ p_item = p_map->nil_item;
+ } else
+ p_item = p_tmp;
+ }
+
+ /* start == leftmost(p_item): p_item itself overlaps; fall out. */
+ break;
+ }
+
+
+ return p_item;
+}
diff --git a/include/rbtree.h b/include/rbtree.h
new file mode 100644
index 0000000..13245b0
--- /dev/null
+++ b/include/rbtree.h
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __RBTREE_H__
+
+#define __RBTREE_H__
+
+#include <stdint.h>
+
+#ifndef RBTREE_MAP_PL
+#error "You must define RBTREE_MAP_PL before including rbtree.h"
+#endif
+
+#ifndef RBTREE_MI_PL
+#error "You must define RBTREE_MI_PL before including rbtree.h"
+#endif
+
+/*
+ * Red-Black tid cache definition.
+ *
+ * RBTREE_MI_PL and RBTREE_MAP_PL are macros the including file must
+ * define to the per-item and per-map payload types (enforced by the
+ * #error checks above). rbtree.c compares and counts items only
+ * through the RBTREE_* accessor macros applied to these payloads.
+ */
+typedef struct _cl_map_item {
+ struct _cl_map_item *p_left; /* left pointer */
+ struct _cl_map_item *p_right; /* right pointer */
+ struct _cl_map_item *p_up; /* up pointer */
+ uint16_t color; /* red-black color */
+
+ RBTREE_MI_PL payload;
+} cl_map_item_t;
+
+typedef struct _cl_qmap {
+ cl_map_item_t *root; /* root node pointer */
+ cl_map_item_t *nil_item; /* terminator node pointer */
+
+ RBTREE_MAP_PL payload;
+} cl_qmap_t;
+
+/* Values stored in cl_map_item_t.color. */
+#define CL_MAP_RED 0
+#define CL_MAP_BLACK 1
+
+#endif
diff --git a/libpsm2.spec.in b/libpsm2.spec.in
new file mode 100644
index 0000000..c5ddf62
--- /dev/null
+++ b/libpsm2.spec.in
@@ -0,0 +1,177 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+Summary: Intel PSM2 Libraries
+Name: @RPM_NAME@
+Version: @VERSION@
+Release: 1 at SPEC_FILE_RELEASE_DIST@
+License: BSD or GPLv2
+URL: https://github.com/01org/opa-psm2/
+
+# The tarball can be created by:
+# git clone https://github.com/01org/opa-psm2
+# cd opa-psm2
+# git checkout @DIST_SHA@
+# make dist
+Source0: @RPM_NAME at -%{version}.tar.gz
+
+# The OPA product is supported on x86_64 only:
+ExclusiveArch: x86_64
+
+BuildRequires: gcc
+Provides: hfi1-psm
+Obsoletes: hfi1-psm < 1.0.0
+
+%if "@RPM_NAME_BASEEXT@"
+%package -n @RPM_NAME@@RPM_NAME_BASEEXT@
+%endif
+Summary: Intel PSM2 Libraries
+Provides: @RPM_NAME@
+%if 0%{?suse_version}
+BuildRequires: libnuma-devel
+Requires: libnuma1
+%else
+%if 0%{?rhel}==0 || 0%{?rhel} > 6
+BuildRequires: systemd
+BuildRequires: numactl-devel
+Requires: numactl-libs
+%endif
+%endif
+
+%package -n @RPM_NAME at -devel
+Summary: Development files for Intel PSM2
+Requires: %{name}%{?_isa} = %{version}-%{release}
+Provides: hfi1-psm-devel
+Obsoletes: hfi1-psm-devel < 1.0.0
+
+%package -n @RPM_NAME at -compat
+Summary: Compat library for Intel PSM2
+Requires: %{name}%{?_isa} = %{version}-%{release}
+%if 0%{?fedora}
+Requires: systemd-udev
+%endif
+Provides: hfi1-psm-compat
+Obsoletes: hfi1-psm-compat < 1.0.0
+
+# If an alternate basename is defined, like in SLES >=12.3
+# Then we generate a different base src.rpm, so use this
+# description instead.
+%if "@RPM_NAME_BASEEXT@"
+%description
+The source code for the PSM2 messaging API, libpsm2.
+A low-level user-level communications interface for the Intel(R) OPA
+family of products. PSM2 users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+%endif
+
+# In distro's other than SLES >=12.3 we use a single description
+# for both the .src.rpm and the base binary rpm. As the
+# RPM_NAME_BASEEXT defaults to empty contents.
+%description -n @RPM_NAME@@RPM_NAME_BASEEXT@
+PSM2 Messaging API, or PSM2 API, is the low-level
+user-level communications interface for the Intel(R) OPA
+family of products. PSM2 users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+
+%description -n @RPM_NAME at -devel
+Intel(R) PSM2, psm2*.h, headers and libpsm2.so files necessary
+for developing software using libpsm2.
+
+%description -n @RPM_NAME at -compat
+Support for MPIs linked with PSM versions < 2. This will allow
+software compiled to use Intel(R) Truescale PSM, libinfinipath, to run
+with Intel(R) OPA PSM2, libpsm2.
+
+%prep
+%setup -q -n @RPM_NAME at -%{version}
+
+%build
+export CFLAGS="%{optflags}"
+make %{?_smp_mflags}
+
+%install
+%make_install
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%files -n @RPM_NAME@@RPM_NAME_BASEEXT@
+%if 0%{?rhel} && 0%{?rhel} < 7
+%{!?_licensedir:%global license %doc}
+%endif
+%license COPYING
+%{_libdir}/@TARGLIB at .so.@MAJOR at .@MINOR@
+%{_libdir}/@TARGLIB at .so.@MAJOR@
+ at 40_PSM_RULES@
+
+%files -n @RPM_NAME at -devel
+%{_libdir}/@TARGLIB at .so
+%{_includedir}/psm2.h
+%{_includedir}/psm2_mq.h
+%{_includedir}/psm2_am.h
+%{_includedir}/hfi1diag
+
+%files -n @RPM_NAME at -compat
+%{_libdir}/psm2-compat
+%if 0%{?rhel} && 0%{?rhel} < 7
+ at UDEVDIR@/rules.d/40-psm-compat.rules
+%else
+%{_udevrulesdir}/40-psm-compat.rules
+%endif
+ at LIBPSM2_COMPAT_SYM_CONF_DIR@/modprobe.d/libpsm2-compat.conf
+%{_prefix}/lib/libpsm2
+
+%changelog
+* Wed Aug 30 2017 Rusell McGuire <russell.w.mcguire at intel.com>
+- Adjust RPM names to match SLES 12.3 distro names
+* Tue Apr 05 2016 Paul Reger <paul.j.reger at intel.com>
+- Upstream PSM2 source code for Fedora.
diff --git a/libuuid/Makefile b/libuuid/Makefile
new file mode 100644
index 0000000..aa3f5ac
--- /dev/null
+++ b/libuuid/Makefile
@@ -0,0 +1,92 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+CFLAGS += -DPSM_UUID=1 -Wno-unused-function
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := psm_uuid.o parse.o pack.o unpack.o unparse.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno; \
+ cd -; \
+ fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+ @echo "Nothing to do for install."
diff --git a/libuuid/compare.c b/libuuid/compare.c
new file mode 100644
index 0000000..44f275b
--- /dev/null
+++ b/libuuid/compare.c
@@ -0,0 +1,53 @@
+/*
+ * compare.c --- compare whether or not two UUID's are the same
+ *
+ * Returns memcmp(3)-style ordering: <0, 0, or >0 as uu1 is less than,
+ * equal to, or greater than uu2 (0 means the two UUIDs are the same).
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "psm_uuid.h"
+#include <string.h>
+
+#define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? -1 : 1);
+
+int uuid_compare(const uuid_t uu1, const uuid_t uu2)
+{
+ struct uuid uuid1, uuid2;
+
+ uuid_unpack(uu1, &uuid1);
+ uuid_unpack(uu2, &uuid2);
+
+ UUCMP(uuid1.time_low, uuid2.time_low);
+ UUCMP(uuid1.time_mid, uuid2.time_mid);
+ UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version);
+ UUCMP(uuid1.clock_seq, uuid2.clock_seq);
+ return memcmp(uuid1.node, uuid2.node, 6);
+}
+
diff --git a/libuuid/pack.c b/libuuid/pack.c
new file mode 100644
index 0000000..801b891
--- /dev/null
+++ b/libuuid/pack.c
@@ -0,0 +1,69 @@
+/*
+ * Internal routine for packing UUID's
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_pack(const struct uuid *uu, uuid_t ptr)
+{
+ uint32_t tmp;
+ unsigned char *out = ptr;
+
+ tmp = uu->time_low;
+ out[3] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[2] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[1] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[0] = (unsigned char) tmp;
+
+ tmp = uu->time_mid;
+ out[5] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[4] = (unsigned char) tmp;
+
+ tmp = uu->time_hi_and_version;
+ out[7] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[6] = (unsigned char) tmp;
+
+ tmp = uu->clock_seq;
+ out[9] = (unsigned char) tmp;
+ tmp >>= 8;
+ out[8] = (unsigned char) tmp;
+
+ memcpy(out+10, uu->node, 6);
+}
+
diff --git a/libuuid/parse.c b/libuuid/parse.c
new file mode 100644
index 0000000..dd8c258
--- /dev/null
+++ b/libuuid/parse.c
@@ -0,0 +1,78 @@
+/*
+ * parse.c --- UUID parsing
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+int uuid_parse(const char *in, uuid_t uu)
+{
+ struct uuid uuid;
+ int i;
+ const char *cp;
+ char buf[3];
+
+ if (strlen(in) != 36)
+ return -1;
+ for (i=0, cp = in; i <= 36; i++,cp++) {
+ if ((i == 8) || (i == 13) || (i == 18) ||
+ (i == 23)) {
+ if (*cp == '-')
+ continue;
+ else
+ return -1;
+ }
+ if (i== 36)
+ if (*cp == 0)
+ continue;
+ if (!isxdigit(*cp))
+ return -1;
+ }
+ uuid.time_low = strtoul(in, NULL, 16);
+ uuid.time_mid = strtoul(in+9, NULL, 16);
+ uuid.time_hi_and_version = strtoul(in+14, NULL, 16);
+ uuid.clock_seq = strtoul(in+19, NULL, 16);
+ cp = in+24;
+ buf[2] = 0;
+ for (i=0; i < 6; i++) {
+ buf[0] = *cp++;
+ buf[1] = *cp++;
+ uuid.node[i] = strtoul(buf, NULL, 16);
+ }
+
+ uuid_pack(&uuid, uu);
+ return 0;
+}
diff --git a/libuuid/psm_uuid.c b/libuuid/psm_uuid.c
new file mode 100644
index 0000000..4db29a6
--- /dev/null
+++ b/libuuid/psm_uuid.c
@@ -0,0 +1,114 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/stat.h>
+#include <limits.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
/*
 * Fallback UUID source: fill uuid_out's 16 bytes from a private (reentrant)
 * drand48 state seeded from the cycle counter and pid.  Used only when
 * /dev/urandom is unavailable or the read from it fails.
 */
static void psmi_make_drand_uuid(psm2_uuid_t uuid_out)
{
	struct drand48_data drand48_data;
	int i;
	long int rnum;
	srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data);
	for(i=0; i < 16; i++) {
		lrand48_r(&drand48_data, &rnum);
		/* NOTE(review): % UCHAR_MAX is mod 255, which slightly biases
		 * the distribution and can never produce 0xFF; '& 0xFF' was
		 * presumably intended -- confirm before changing. */
		uuid_out[i] = rnum % UCHAR_MAX;
	}
}
+
+/* Since libuuid can call srand, we will generate our own uuids */
+void
+__psm2_uuid_generate(psm2_uuid_t uuid_out)
+{
+ PSM2_LOG_MSG("entering");
+ /* Prefer using urandom, fallback to drand48_r */
+ struct stat urandom_stat;
+ size_t nbytes;
+ int fd;
+ if(stat("/dev/urandom", &urandom_stat) != 0) {
+ psmi_make_drand_uuid(uuid_out);
+ return;
+ }
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if(fd == -1) {
+ psmi_make_drand_uuid(uuid_out);
+ } else {
+ nbytes = read(fd, (char *) uuid_out, 16);
+ if(nbytes != 16) {
+ psmi_make_drand_uuid(uuid_out);
+ }
+ close(fd);
+ }
+ PSM2_LOG_MSG("leaving");
+ return;
+}
+PSMI_API_DECL(psm2_uuid_generate)
+
/* Render uu as its canonical lowercase text form into out (>= 37 bytes). */
void
psmi_uuid_unparse(const uuid_t uu, char *out)
{
	uuid_unparse_lower(uu, out);
}
+
/* Parse canonical UUID text into uu; returns 0 on success, -1 on error. */
int
psmi_uuid_parse(const char *in, uuid_t uu)
{
	return uuid_parse(in, uu);
}
+
diff --git a/libuuid/psm_uuid.h b/libuuid/psm_uuid.h
new file mode 100644
index 0000000..09df044
--- /dev/null
+++ b/libuuid/psm_uuid.h
@@ -0,0 +1,78 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
#ifndef _PSM_UUID_H
#define _PSM_UUID_H
/*
 * Broken-out field view of a 16-byte UUID; uuid_pack()/uuid_unpack()
 * convert between this and the big-endian packed byte-array form.
 * NOTE(review): this header uses uint32_t/uint8_t and psm2_uuid_t without
 * including their definitions -- the .c files include psm_user.h first;
 * confirm all includers do the same.
 */
struct uuid {
	uint32_t time_low;
	uint16_t time_mid;
	uint16_t time_hi_and_version;
	uint16_t clock_seq;
	uint8_t node[6];
};

/* Packed (wire) representation: 16 raw bytes. */
typedef unsigned char uuid_t[16];

/* PSM-facing wrappers around the local libuuid-style helpers below. */
int psmi_uuid_parse(const char *in, psm2_uuid_t uu);
void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out);
int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB);
/* memcmp-style: <0, 0, >0 as uu1 sorts before, equal to, after uu2 */
int uuid_compare(const uuid_t uu1, const uuid_t uu2);
void uuid_pack(const struct uuid *uu, uuid_t ptr);
/* the unparse variants write 36 chars + NUL: out must hold >= 37 bytes */
void uuid_unparse(const uuid_t uu, char *out);
void uuid_unparse_upper(const uuid_t uu, char *out);
void uuid_unparse_lower(const uuid_t uu, char *out);
void uuid_unpack(const uuid_t in, struct uuid *uu);
/* returns 0 on success, -1 on malformed input */
int uuid_parse(const char *in, uuid_t uu);
#endif
diff --git a/libuuid/unpack.c b/libuuid/unpack.c
new file mode 100644
index 0000000..26e4394
--- /dev/null
+++ b/libuuid/unpack.c
@@ -0,0 +1,63 @@
+/*
+ * Internal routine for unpacking UUID
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_unpack(const uuid_t in, struct uuid *uu)
+{
+ const uint8_t *ptr = in;
+ uint32_t tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_low = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_mid = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->time_hi_and_version = tmp;
+
+ tmp = *ptr++;
+ tmp = (tmp << 8) | *ptr++;
+ uu->clock_seq = tmp;
+
+ memcpy(uu->node, ptr, 6);
+}
+
diff --git a/libuuid/unparse.c b/libuuid/unparse.c
new file mode 100644
index 0000000..d859379
--- /dev/null
+++ b/libuuid/unparse.c
@@ -0,0 +1,75 @@
+/*
+ * unparse.c -- convert a UUID to string
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdio.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
/* printf formats producing the canonical 8-4-4-4-12 hex text form */
static const char *fmt_lower =
	"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x";

static const char *fmt_upper =
	"%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X";

#ifdef UUID_UNPARSE_DEFAULT_UPPER
#define FMT_DEFAULT fmt_upper
#else
#define FMT_DEFAULT fmt_lower
#endif

/*
 * Format uu into out using fmt (one of the two formats above).
 * sprintf() is unbounded, but the format yields a fixed 36 characters;
 * out must therefore have room for at least 37 bytes (36 + NUL).
 */
static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt)
{
	struct uuid uuid;

	uuid_unpack(uu, &uuid);
	sprintf(out, fmt,
		uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
		uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
		uuid.node[0], uuid.node[1], uuid.node[2],
		uuid.node[3], uuid.node[4], uuid.node[5]);
}
+
/* Write uu to out as 36 lowercase hex chars + NUL (out >= 37 bytes). */
void uuid_unparse_lower(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, fmt_lower);
}
+
/* Write uu to out as 36 uppercase hex chars + NUL (out >= 37 bytes). */
void uuid_unparse_upper(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, fmt_upper);
}
+
/* Write uu in the compile-time default case (lower unless
 * UUID_UNPARSE_DEFAULT_UPPER is defined); out >= 37 bytes. */
void uuid_unparse(const uuid_t uu, char *out)
{
	uuid_unparse_x(uu, out, FMT_DEFAULT);
}
diff --git a/makesdeb.sh b/makesdeb.sh
new file mode 100755
index 0000000..072f741
--- /dev/null
+++ b/makesdeb.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2016 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
# Stop on error
set -e

# Accepted single-letter debuild build modes; default is -F.
BUILD_OPTS="gFGbBAS"
BUILD_OPT=F

# literate STRING SEP -- insert SEP between adjacent characters of STRING
# (sed \B matches between word characters), e.g. literate gFG '|' -> g|F|G.
# Used only to render the usage text.
function literate()
{
	echo $(sed "s/\B/&$2/g" <<< "$1")
}

# usage [STATUS] -- print the usage line and exit with STATUS
# (with no argument, bare 'exit' propagates the echo's status: 0).
function usage()
{
	echo "Usage: ${0##*/} [-h] [debuild -($(literate $BUILD_OPTS '|'))]"
	exit $1
}

# -h prints help; any letter from BUILD_OPTS selects the debuild mode;
# anything else is an error (exit 1).
while getopts "h$BUILD_OPTS" OPT; do
	case $OPT in
		h)
			usage
			;;
		\?)
			usage 1
			;;
		*)
			BUILD_OPT=$OPT
			;;
	esac
done

# Remove parsed options
shift $((OPTIND-1))

# Check if we have any non-option parameters
test ! $# -eq 0 && usage

# Annotate changelog
cat debian/changelog.in > debian/changelog

# Derive a dotted package version from the most recent v* tag reported by
# 'git describe --long', then record it in the changelog via debchange.
GIT_TAG_PREFIX=v
GIT_TAG_RELEASE=$(git describe --tags --long --match="$GIT_TAG_PREFIX*")
VERSION=$(sed -e "s/^$GIT_TAG_PREFIX\(.\+\)-\(.\+\)-.\+/\1_\2/" -e 's/_/./g' <<< "$GIT_TAG_RELEASE")

debchange --newversion=$VERSION "Bump up version to $VERSION"

debchange --release ""

# Build package
debuild -$BUILD_OPT -tc

echo "The deb package(s) is (are) in parent directory"
diff --git a/makesrpm.sh b/makesrpm.sh
new file mode 100755
index 0000000..e673b35
--- /dev/null
+++ b/makesrpm.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#It makes no sense to have both CUDA and non-CUDA in the same invocation
+#as they require different versions of the hfi1_user.h at this point in time.
+#Limiting this script to only build CUDA if requested
+
#default BUILDARG to build source RPM only
BUILDARG=s
RPM_NAME=libpsm2

# Print help text and exit with status 1.
function usage()
{
	echo "Usage: $0 [OPTION] [OPTION] [OPTION]"
	echo " "
	echo "Creates tar ball of source and source rpms by default."
	echo "Optionally generates binary rpm(s) "
	echo " "
	echo " s,a,b,p,c,i,l"
	echo "   Optional, default is s (sourcerpm)"
	echo "   Set single extension letter for rpmbuild -b argument"
	echo " -r <name>, -rpmname <name>"
	echo "   Optional, set the output rpm name"
	echo " -e <basename ext>, -baseext <basename ext>"
	echo "   Optional, set a base name extension"
	echo "   This only appends an extra string onto the base RPM name"
	echo "   Does not affect supporting RPMs"
	echo " -c, -cuda"
	echo "   Optional, default is unset"
	echo "   Sets PSM_CUDA=1, creating -cuda based spec and rpms"
	echo " -d <path>, -dir <path>"
	echo "   Optionally sets output folder for rpmbuild to use"
	echo " Examples:"
	echo "  $0 b"
	echo "  $0 s -cuda"
	echo "  $0 -cuda"
	echo "  $0 -d ./temp"
	echo "  $0 b -cuda -dir output"
	exit 1
}

err=0

# OUTDIR is where the Makefile places its meta-data
OUTDIR=build_release

# Set TEMPDIR first, so user control can override the value
# This is where rpmbuild places rpm(s) and uses its build meta-data.
# It can be set the same as OUTDIR, and work just fine if desired.
TEMPDIR=temp.$$

while [ "$1" != "" ]; do
	case $1 in
	-d | -dir) shift
		if [ -z "$1" ]; then
			usage
		fi
		TEMPDIR=$1
		;;
	-c | -cuda) export PSM_CUDA=1
		RPM_EXT="-cuda"
		;;
	-e | -baseext) shift
		if [ -z "$1" ]; then
			usage
		fi
		# BUGFIX: was '$RPM_NAME_BASEEXT="$1"' -- the leading '$'
		# expands the (empty) variable and makes the shell try to
		# execute the result as a command instead of assigning.
		# The export below performs the assignment.
		export RPM_NAME_BASEEXT="$1"
		;;
	-r | -rpmname) shift
		if [ -z "$1" ]; then
			usage
		fi
		# BUGFIX: was '$RPM_NAME="$1"', same '$'-prefix error as above.
		export RPM_NAME="$1"
		;;
	s|a|b|p|c|i|l) BUILDARG=$1
		;;
	* ) err=1
		usage
		;;
	esac
	shift
done

# Generic cleanup, build, and tmp folder creation
make distclean OUTDIR=$OUTDIR
make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT dist OUTDIR=$OUTDIR
mkdir -p ./$TEMPDIR/{BUILD,RPMS,SOURCES,SPECS,SRPMS,BUILDROOT}
# Different paths based on RPM_EXT
cp ${OUTDIR}/$RPM_NAME-*.tar.gz $TEMPDIR/SOURCES
make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT specfile OUTDIR=$OUTDIR
cp ${OUTDIR}/$RPM_NAME.spec $TEMPDIR/SPECS
rpmbuild -b$BUILDARG --define "_topdir $PWD/$TEMPDIR" --nodeps $TEMPDIR/SPECS/$RPM_NAME.spec

echo "The SRPM(s) are in $TEMPDIR/SRPMS/`ls $TEMPDIR/SRPMS`"
diff --git a/mpspawn/mpspawn_stats.h b/mpspawn/mpspawn_stats.h
new file mode 100644
index 0000000..4382587
--- /dev/null
+++ b/mpspawn/mpspawn_stats.h
@@ -0,0 +1,132 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
#ifndef _MPSPAWN_STATS_H
#define _MPSPAWN_STATS_H

#include <math.h>

/* Version number of this stats-registration interface. */
#define MPSPAWN_STATS_VERSION 1

/*
 * Per-statistic flag bits: a type in the low bits, reduction selectors in
 * the 0x1000-0x4000 range, plus a skip-if-zero modifier.  Each enumerator
 * is mirrored by an identical #define -- presumably so the values are also
 * visible to the preprocessor (#ifdef); confirm before removing either set.
 */
typedef enum {
	MPSPAWN_STATS_TYPE_DOUBLE = 0x1,
#define MPSPAWN_STATS_TYPE_DOUBLE 0x1
	MPSPAWN_STATS_TYPE_HEADER = 0x2,
#define MPSPAWN_STATS_TYPE_HEADER 0x2
	MPSPAWN_STATS_REDUCTION_MAX = 0x1000,
#define MPSPAWN_STATS_REDUCTION_MAX 0x1000
	MPSPAWN_STATS_REDUCTION_MIN = 0x2000,
#define MPSPAWN_STATS_REDUCTION_MIN 0x2000
	MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000,
#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000
	MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000
#define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000
} mpspawn_stats_flags;

/* Convenience mask selecting all three reductions. */
#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \
	MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN)

/* Reinterpret a double's bits as uint64_t via a pointer cast.
 * NOTE(review): this is a strict-aliasing type pun -- memcpy would be the
 * UB-free form; confirm compiler flags before relying on it. */
#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg)))
/* All-ones sentinel marking "no value" in u64-carried stats. */
#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL)
#define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64)

#define MPSPAWN_NAN ((uint64_t) ~0ULL) /* NAN) */
#define MPSPAWN_ISNAN(x) (isnan(x))

struct mpspawn_stats_add_args; /* client->mpspawn stats registration */
struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */
struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */

/* Clients implement this function to fill in mpspawn request for stats */
typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *);
/* mpspawn implements this function to allow clients to register new stats */
typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *);
/* mpspawn implements this function to map rank indexes into epaddr structs */
struct psm2_epaddr;
typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank);

/* Arguments the client's req_fn fills when mpspawn requests stat values. */
typedef struct mpspawn_stats_req_args {
	int version;		/* interface version */
	int num;		/* entries in stats[] and flags[] */
	uint64_t *stats;
	uint16_t *flags;	/* mpspawn_stats_flags bits per entry */
	void *context;		/* opaque client context from registration */
} mpspawn_stats_req_args_t;

/* Arguments a client passes to mpspawn when registering new statistics. */
typedef
struct mpspawn_stats_add_args {
	int version;
	int num;
	char *header;		/* group heading for these stats */
	char **desc;		/* one description string per stat */
	uint16_t *flags;
	mpspawn_stats_req_fn req_fn;	/* callback to fetch the values */
	void *context;		/* handed back verbatim in req args */
} mpspawn_stats_add_args_t;

/* Arguments mpspawn passes down to the client at stats initialization. */
typedef
struct mpspawn_stats_init_args {
	int version;
	psm2_mq_t mq;		/* initialized mq endpoint */
	int num_epaddr;		/* number of endpoints in job */
	mpspawn_stats_add_fn add_fn;	/* function for client to add stats */
	mpspawn_map_epaddr_fn epaddr_map_fn;
	const char *stats_types;	/* stats type string mpirun -M */
} mpspawn_stats_init_args_t;

/* Function in psm exposed to register stats */
void *psmi_stats_register(struct mpspawn_stats_init_args *args);

#endif
diff --git a/opa/Makefile b/opa/Makefile
new file mode 100644
index 0000000..d065429
--- /dev/null
+++ b/opa/Makefile
@@ -0,0 +1,113 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+# Builds the libopa object files (debug, time, proto, service, sysfs,
+# syslog, i2cflash and the arch-specific dword-copy routines); there is
+# no link step here, the objects are consumed by the top-level Makefile.
+OUTDIR = .
+
+TARGLIB := libopa
+MAJOR := $(OPA_LIB_MAJOR)
+MINOR := $(OPA_LIB_MINOR)
+
+# Resolve an absolute source dir so the pattern rules below work when
+# OUTDIR points outside the source tree.
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+BASECFLAGS += -D_GNU_SOURCE
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
+
+# x86_64 additionally builds the hand-written fast copy routine.
+ifeq (${arch},x86_64)
+ PLATFORM_OBJ=opa_dwordcpy-x86_64-fast.o
+else
+ PLATFORM_OBJ=
+endif
+
+${TARGLIB}-objs := opa_debug.o opa_time.o opa_proto.o \
+ opa_service.o opa_utils.o \
+ opa_dwordcpy-$(arch).o opa_i2cflash.o opa_sysfs.o opa_syslog.o \
+ $(PLATFORM_OBJ)
+
+# Prefix every object with OUTDIR; derive the auto-dependency files.
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+install: all
+ @echo "Nothing to do for install."
+
+# Generate .d dependency files with the compiler (-MM skips system hdrs,
+# -MQ names the corresponding object as the rule target).
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.S
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+# Objects order-only depend on the .d files so deps exist before -include.
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.S | ${DEPS}
+ $(CC) $(ASFLAGS) -c $< -o $@
+
+clean:
+ @rm -f $(OUTDIR)/_revision.c
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno ${TARGLIB}.*; \
+ cd -; \
+ fi
+
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
diff --git a/opa/opa_debug.c b/opa/opa_debug.c
new file mode 100644
index 0000000..71b0003
--- /dev/null
+++ b/opa/opa_debug.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <execinfo.h>
+#include <fcntl.h>
+#include <ucontext.h>
+#include "opa_user.h"
+#include "../psm_log.h"
+
+unsigned hfi_debug = 1;
+char *__hfi_mylabel = NULL;
+FILE *__hfi_dbgout;
+static void init_hfi_mylabel(void) __attribute__ ((constructor));
+static void init_hfi_backtrace(void) __attribute__ ((constructor));
+static void init_hfi_dbgfile(void) __attribute__ ((constructor));
+static void fini_hfi_backtrace(void) __attribute__ ((destructor));
+static struct sigaction SIGSEGV_old_act;
+static struct sigaction SIGBUS_old_act;
+static struct sigaction SIGILL_old_act;
+static struct sigaction SIGABRT_old_act;
+static struct sigaction SIGINT_old_act;
+static struct sigaction SIGTERM_old_act;
+#ifdef HFI_BRAKE_DEBUG
+static void hfi_brake_debug(void) __attribute__ ((constructor));
+
+/*
+ How to use hfi_brake_debug code:
+
+ 1. Build psm with HFI_BRAKE_DEBUG set in the environment.
+ 2. Create a script for your test case (e.g. mpistress?). In the script
+ make sure to choose a HFI brake file that corresponds to a network
+ file system that is common to all hosts where you will run your code.
+ Also, in the script, make sure to propagate the "HFI_BRAKE_FILE_NAME"
+ env var to all hosts.
+ 3. Bring up 3 putty sessions to one of the hosts that your script uses.
+ 4. In putty session number 1, touch the HFI_BRAKE_FILE and sync.
+ 5. In putty session number 1, start the script. You should see messages
+ of the form:
+-bash-4.2$ ./mpistress.0304.sc
+<hostname>:5716 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:5717 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3456 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3457 remove the file: "/nfs/user/HFI_BRAKE" to continue
+
+ Note that the hostname and process id are shown for all of the processes that are started
+ by your script.
+ 6. In putty session 2, bring up gdb, and debug the program that is referenced in your script.
+ For example: /usr/mpi/gcc/openmpi-1.10.2-hfi/tests/intel/mpi_stress
+ 7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1.
+ 8. Note, at this point, you have only one gdb session. I leave it as an exercise to the reader to
+ determine how to bring up multiple gdb sessions.
+ 9. In putty session 3, rm the HFI_BRAKE_FILE.
+ 10. You are now debugging a live session of psm.
+ */
+
+/* Constructor-time debug barrier: blocks this process until the "brake"
+ * file (HFI_BRAKE_FILE_NAME, default /tmp/HFI_BRAKE_FILE) is removed,
+ * giving the user time to attach gdb (see the recipe above).
+ * NOTE(review): uses stat()/struct stat; assumes <sys/stat.h> comes in
+ * indirectly (e.g. via opa_user.h) -- confirm. */
+static void hfi_brake_debug(void)
+{
+ struct stat buff;
+ char hostname[80];
+ const char *hfi_brake_file_name = getenv("HFI_BRAKE_FILE_NAME");
+ gethostname(hostname, 80);
+ hostname[sizeof(hostname) - 1] = '\0'; /* gethostname need not terminate */
+
+ if (!hfi_brake_file_name)
+ hfi_brake_file_name = "/tmp/HFI_BRAKE_FILE";
+ printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+ while (0 == stat(hfi_brake_file_name, &buff))
+ {
+ printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+ sleep(10); /* re-poll every 10s until the file disappears */
+ }
+ printf("%s:%d continuing.\n",hostname,getpid());
+}
+#endif
+
+/* Constructor: build the default debug label "<hostname>.<rank>" using
+ * the first non-empty MPI rank env var found, or "<hostname>.<pid>" as a
+ * fallback.  The result is strdup()'d into __hfi_mylabel and may later
+ * be replaced via hfi_set_mylabel(). */
+static void init_hfi_mylabel(void)
+{
+ char lbl[1024];
+ char hostname[80];
+ char *e;
+ /* By default, try to come up with a decent default label, it will be
+ * overridden later. Try getting rank, if that's not available revert to
+ * pid. */
+ gethostname(hostname, 80);
+ lbl[0] = '\0';
+ hostname[sizeof(hostname) - 1] = '\0'; /* gethostname need not terminate */
+ if ((((e = getenv("PSC_MPI_RANK")) && *e)) ||
+ (((e = getenv("MPI_RANKID")) && *e)) ||
+ (((e = getenv("MPIRUN_RANK")) && *e))) {
+ char *ep;
+ unsigned long val;
+ val = strtoul(e, &ep, 10);
+ if (ep != e) /* valid conversion */
+ snprintf(lbl, 1024, "%s.%lu", hostname, val);
+ }
+ /* no rank env var (or it was not numeric): fall back to the pid */
+ if (lbl[0] == '\0')
+ snprintf(lbl, 1024, "%s.%u", hostname, getpid());
+ __hfi_mylabel = strdup(lbl);
+}
+
+/* FIXME: This signal handler does not conform to the posix standards described
+ in 'man 7 signal' due to it calling unsafe functions.
+
+ See 'CALLS UNSAFE FUNCTION' notes below for examples.
+ */
+/* Fatal-signal handler: prints "<prog>:<pid> terminated with signal N"
+ * (plus PC/SP when a ucontext is available) and a backtrace to stderr,
+ * mirrors the same output to a "<prog>-<pid>,<host>.btr" file, then
+ * chains to the previously saved handler for the signal and exits.
+ * See the FIXME above: this deliberately calls async-signal-unsafe
+ * functions, tolerated only because the process is already dying. */
+static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv)
+{
+ /* we make these static to try and avoid issues caused
+ by stack overflow that might have gotten us here. */
+ static void *backaddr[128]; /* avoid stack usage */
+ static char buf[150], hname[64], fname[128];
+ static int i, j, fd, id;
+ extern char *__progname;
+ PSM_LOG_DECLARE_BT_BUFFER();
+
+ /* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */
+ PSM_LOG_BT(100,__FUNCTION__);
+ /* If this is a SIGINT do not display backtrace. Just invoke exit
+ handlers */
+ if ((sig == SIGINT) || (sig == SIGTERM))
+ /* CALLS UNSAFE FUNCTION (exit) */
+ exit(1);
+
+ /* CALLS UNSAFE FUNCTION (snprintf) */
+ id = snprintf(buf, sizeof(buf),
+ "\n%.60s:%u terminated with signal %d", __progname,
+ getpid(), sig);
+ if (ucv) {
+ static ucontext_t *uc;
+ uc = (ucontext_t *) ucv;
+ id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx",
+#if defined(__x86_64__)
+ (unsigned long)uc->uc_mcontext.gregs[REG_RIP],
+ (unsigned long)uc->uc_mcontext.gregs[REG_RSP]);
+#elif defined(__i386__)
+ (unsigned long)uc->uc_mcontext.gregs[REG_EIP],
+ (unsigned long)uc->uc_mcontext.gregs[REG_ESP]);
+#else
+ 0ul, 0ul);
+#warning No stack pointer or instruction pointer for this arch
+#endif
+ }
+ id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n");
+ /* CALLS UNSAFE FUNCTION (fprintf) */
+ fprintf(stderr, "%.*s", id, buf);
+
+ i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0]));
+ if (i > 2) /* skip ourselves and backtrace */
+ j = 2, i -= j;
+ else
+ j = 0;
+
+ backtrace_symbols_fd(backaddr + j, i, 2);
+ (void)fsync(2);
+
+ /* Try to write it to a file as well, in case the rest doesn't make it
+ out. Do it second, in case we get a second failure (more likely).
+ We might eventually want to print some more of the registers to the
+ btr file, to aid debugging, but not for now. Truncate the program
+ name if overly long, so we always get pid and (at least part of)
+ hostname. */
+ /* CALLS UNSAFE FUNCTION (gethostname) */
+ (void)gethostname(hname, sizeof(hname));
+ hname[sizeof(hname) - 1] = '\0';
+ /* BUGFIX: was "%s.80s-%u,..." which printed ".80s" literally; "%.80s"
+ actually truncates __progname to 80 chars as the comment intends. */
+ snprintf(fname, sizeof(fname), "%.80s-%u,%.32s.btr", __progname,
+ getpid(), hname);
+ if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) {
+ /* CALLS UNSAFE FUNCTION (fdopen) */
+ FILE *fp = fdopen(fd, "w");
+ if (fp)
+ fprintf(fp, "%.*s", id, buf);
+ backtrace_symbols_fd(backaddr + j, i, fd);
+ if (fp)
+ /* CALLS UNSAFE FUNCTION (fclose) */
+ fclose(fp);
+ }
+ /* Chain to the handler saved in init_hfi_backtrace().
+ NOTE(review): this assumes the old disposition used sa_sigaction;
+ if it was installed without SA_SIGINFO, sa_handler shares storage
+ with sa_sigaction in a union -- confirm this is safe here. */
+ switch (sig){
+ case SIGSEGV:
+ (*SIGSEGV_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGBUS:
+ (*SIGBUS_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGILL:
+ (*SIGILL_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ case SIGABRT:
+ (*SIGABRT_old_act.sa_sigaction)(sig,p1,ucv);
+ break;
+ default:
+ break;
+ }
+ exit(1); /* not _exit(), want atexit handlers to get run */
+}
+
+/* We do this as a constructor so any user program that sets signal handlers
+ for these will override our settings, but we still get backtraces if they
+ don't.
+*/
+static void init_hfi_backtrace(void)
+{
+ /* we need to track memory corruption */
+ static struct sigaction act; /* easier than memset */
+ act.sa_sigaction = hfi_sighdlr;
+ act.sa_flags = SA_SIGINFO;
+
+ /* Backtrace handlers are opt-in: they are installed only when
+ HFI_BACKTRACE is set in the environment.  The previous
+ dispositions are saved so hfi_sighdlr() can chain to them and
+ fini_hfi_backtrace() can restore them. */
+ if (getenv("HFI_BACKTRACE")) {
+ (void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act);
+ (void)sigaction(SIGBUS, &act, &SIGBUS_old_act);
+ (void)sigaction(SIGILL, &act, &SIGILL_old_act);
+ (void)sigaction(SIGABRT, &act, &SIGABRT_old_act);
+ (void)sigaction(SIGINT, &act, &SIGINT_old_act);
+ (void)sigaction(SIGTERM, &act, &SIGTERM_old_act);
+ }
+}
+
+/* if HFI_DEBUG_FILENAME is set in the environment, then all the
+ debug prints (not info and error) will go to that file.
+ %h is expanded to the hostname, and %p to the pid, if present. */
+/* Constructor: open the debug output stream.  If HFI_DEBUG_FILENAME is
+ * unset, debug output goes to stdout.  Otherwise the value is used as a
+ * filename after expanding "%h" to the hostname and "%p" to the pid;
+ * the file is opened in append mode and made line-buffered.  On open
+ * failure we warn and fall back to stdout. */
+static void init_hfi_dbgfile(void)
+{
+ char *fname = getenv("HFI_DEBUG_FILENAME");
+ char *exph, *expp, tbuf[1024];
+ FILE *newf;
+
+ if (!fname) {
+ __hfi_dbgout = stdout;
+ return;
+ }
+ exph = strstr(fname, "%h"); /* hostname */
+ expp = strstr(fname, "%p"); /* pid */
+ if (exph || expp) {
+ int baselen;
+ char hname[256], pid[12];
+ if (exph) {
+ *hname = hname[sizeof(hname) - 1] = 0;
+ gethostname(hname, sizeof(hname) - 1);
+ if (!*hname)
+ strcpy(hname, "[unknown]");
+ }
+ if (expp)
+ snprintf(pid, sizeof(pid), "%d", getpid());
+ /* Splice the expansions into tbuf.  The four branches cover:
+ both tokens with %h first, both with %p first, only %h,
+ only %p.  Each token consumes 2 chars ("%h"/"%p"). */
+ if (exph && expp) {
+ if (exph < expp) {
+ baselen = exph - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+ baselen, fname, hname,
+ (int)(expp - (exph + 2)), exph + 2,
+ pid, expp + 2);
+ } else {
+ baselen = expp - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+ baselen, fname, pid,
+ (int)(exph - (expp + 2)), expp + 2,
+ hname, exph + 2);
+ }
+ } else if (exph) {
+ baselen = exph - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+ baselen, fname, hname, exph + 2);
+ } else {
+ baselen = expp - fname;
+ snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+ baselen, fname, pid, expp + 2);
+ }
+ fname = tbuf;
+ }
+ newf = fopen(fname, "a");
+ if (!newf) {
+ _HFI_ERROR
+ ("Unable to open \"%s\" for debug output, using stdout: %s\n",
+ fname, strerror(errno));
+ __hfi_dbgout = stdout;
+ } else {
+ __hfi_dbgout = newf;
+ setlinebuf(__hfi_dbgout); /* line-buffered so messages appear promptly */
+ }
+}
+
+/* Replace the debug-output label set up by init_hfi_mylabel().
+ * NOTE(review): the previous strdup()'d label is not freed, and the
+ * caller's 'label' pointer is stored directly (caller must keep it
+ * alive) -- confirm intended ownership. */
+void hfi_set_mylabel(char *label)
+{
+ __hfi_mylabel = label;
+}
+
+/* Return the current debug-output label ("<host>.<rank|pid>" by default,
+ * or whatever hfi_set_mylabel() installed). */
+char *hfi_get_mylabel()
+{
+ return __hfi_mylabel;
+}
+
+/* Destructor: restore the signal dispositions saved by
+ * init_hfi_backtrace(), guarded by the same HFI_BACKTRACE check so the
+ * *_old_act structs are only used when they were actually filled in. */
+static void fini_hfi_backtrace(void)
+{
+ if (getenv("HFI_BACKTRACE")) {
+ (void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL);
+ (void)sigaction(SIGBUS, &SIGBUS_old_act, NULL);
+ (void)sigaction(SIGILL, &SIGILL_old_act, NULL);
+ (void)sigaction(SIGABRT, &SIGABRT_old_act, NULL);
+ (void)sigaction(SIGINT, &SIGINT_old_act, NULL);
+ (void)sigaction(SIGTERM, &SIGTERM_old_act, NULL);
+ }
+}
diff --git a/opa/opa_dwordcpy-generic.c b/opa/opa_dwordcpy-generic.c
new file mode 100644
index 0000000..929202d
--- /dev/null
+++ b/opa/opa_dwordcpy-generic.c
@@ -0,0 +1,298 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+/* Copy 'ndwords' 32-bit words from src to dest using 64-bit accesses,
+ * four qwords (8 dwords) per iteration, then a Duff-style switch for
+ * the 0-7 dword tail.  The [4] pointer arrays coax the compiler into
+ * issuing independent loads/stores.
+ * NOTE(review): the uint32_t* -> uint64_t* casts assume src/dest are
+ * 8-byte aligned when ndwords >= 8 -- confirm callers guarantee this. */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+ uint_fast32_t ndw = ndwords;
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = (const uint64_t *) src;
+ dst64[0] = (volatile uint64_t *) dest;
+
+ while (ndw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ ndw -= 8;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ }
+ if (ndw) {
+ src = (const uint32_t *) src64[0];
+ dest = (volatile uint32_t *) dst64[0];
+
+ /* tail: deliberate fallthrough copies exactly ndw dwords */
+ switch (ndw) {
+ case 7:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 6:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 5:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 3:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 2:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 1:
+ *dest++ = *src++;
+ }
+
+ }
+}
+
+/* Copy 'nqwords' 64-bit words from src to dest, unrolled to two groups
+ * of four qwords (8 per loop pass), with a Duff-style switch for the
+ * 0-7 qword tail.  Same independent-pointer trick as hfi_dwordcpy. */
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+ uint_fast32_t nqw = nqwords;
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ while (nqw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second group of four qwords in the same pass */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ nqw -= 8;
+ }
+ if (nqw) {
+ /* tail: deliberate fallthrough copies exactly nqw qwords */
+ switch (nqw) {
+ case 7:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 6:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 5:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 4:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 3:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 2:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fallthrough */
+ case 1:
+ *(dst64[0])++ = *(src64[0])++;
+ }
+ }
+}
+
+#ifdef __AVX512F__
+/* Copy 'nblock' 64-byte blocks with AVX-512 (one 512-bit load/store per
+ * block).  dest must be 64-byte aligned (asserted); src may be
+ * unaligned, selecting the loadu path.  nblock must be >= 1 (do/while).
+ * The (++dp) && (++sp) in the condition just advances the pointers;
+ * the incremented values are never NULL so the &&s are always true. */
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m512i *dp = (volatile __m512i *) dest;
+ const __m512i *sp = (const __m512i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+ /* source and destination are both 64 byte aligned */
+ do {
+ __m512i tmp0 = _mm512_load_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ } else {
+ /* only destination is 64 byte aligned - use unaligned loads */
+ do {
+ __m512i tmp0 = _mm512_loadu_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ }
+}
+#endif
+
+#ifdef __AVX2__
+/* Copy 'nblock' 64-byte blocks with AVX2 (two 256-bit load/store pairs
+ * per block).  dest must be 64-byte aligned (asserted); src alignment
+ * (32-byte) selects aligned vs unaligned loads.  nblock must be >= 1. */
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m256i *dp = (volatile __m256i *) dest;
+ const __m256i *sp = (const __m256i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+ /* source and destination are both 32 byte aligned */
+ do {
+ __m256i tmp0 = _mm256_load_si256(sp);
+ __m256i tmp1 = _mm256_load_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ } else {
+ /* only destination is 32 byte aligned - use unaligned loads */
+ do {
+ __m256i tmp0 = _mm256_loadu_si256(sp);
+ __m256i tmp1 = _mm256_loadu_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ }
+}
+#endif
+
+#ifdef __SSE2__
+/* Copy 'nblock' 64-byte blocks with SSE2 (four 128-bit load/store pairs
+ * per block).  dest must be 64-byte aligned (asserted); src alignment
+ * (16-byte) selects aligned vs unaligned loads.  nblock must be >= 1. */
+void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m128i *dp = (volatile __m128i *) dest;
+ const __m128i *sp = (const __m128i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0xf) == 0x0) {
+ /* source and destination are both 16 byte aligned */
+ do {
+ __m128i tmp0 = _mm_load_si128(sp);
+ __m128i tmp1 = _mm_load_si128(sp + 1);
+ __m128i tmp2 = _mm_load_si128(sp + 2);
+ __m128i tmp3 = _mm_load_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ } else {
+ /* only destination is 16 byte aligned - use unaligned loads */
+ do {
+ __m128i tmp0 = _mm_loadu_si128(sp);
+ __m128i tmp1 = _mm_loadu_si128(sp + 1);
+ __m128i tmp2 = _mm_loadu_si128(sp + 2);
+ __m128i tmp3 = _mm_loadu_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ }
+}
+#endif
+
+/* Scalar fallback: copy 'nblock' 64-byte blocks as eight 64-bit stores
+ * (two unrolled groups of four), using the same independent-pointer
+ * trick as hfi_qwordcpy.  dest must be 64-byte aligned (asserted);
+ * nblock must be >= 1 (do/while). */
+void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ psmi_assert((dst64[0] != NULL) && (src64[0] != NULL));
+ psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0);
+
+ do {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second half of the 64-byte block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ } while (--nblock);
+}
diff --git a/opa/opa_dwordcpy-i386.S b/opa/opa_dwordcpy-i386.S
new file mode 100644
index 0000000..f3d898d
--- /dev/null
+++ b/opa/opa_dwordcpy-i386.S
@@ -0,0 +1,84 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+ // i386 hfi_dwordcpy(dest, src, ndwords): copies ndwords 32-bit words
+ // via "rep movsd".  Stack args (cdecl): 4(%esp)=dest, 8(%esp)=src,
+ // 12(%esp)=count.  NOTE(review): .file says "opa_dword32cpy.S" but the
+ // file is opa_dwordcpy-i386.S -- likely a stale name, confirm.
+ .globl hfi_dwordcpy
+ .file "opa_dword32cpy.S"
+ .text
+ .p2align 4,,15
+hfi_dwordcpy:
+ // standard C calling convention, args on stack
+ // does not return any value
+ .type hfi_dwordcpy, @function
+ // %edi/%esi are callee-saved in the i386 ABI: preserve them in the
+ // scratch (caller-saved) regs %eax/%edx for the duration of the copy
+ mov %edi,%eax
+ mov %esi,%edx
+
+ // setup regs: %ecx=count, %edi=dest, %esi=src for rep movsd
+ mov 0xc(%esp,1),%ecx
+ mov 0x4(%esp,1),%edi
+ mov 0x8(%esp,1),%esi
+ // and do it (cld: copy forward)
+ cld
+ rep
+ movsd
+
+ // restore callee-saved regs
+ mov %eax,%edi
+ mov %edx,%esi
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/opa/opa_dwordcpy-x86_64-fast.S b/opa/opa_dwordcpy-x86_64-fast.S
new file mode 100644
index 0000000..fe07ebf
--- /dev/null
+++ b/opa/opa_dwordcpy-x86_64-fast.S
@@ -0,0 +1,77 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+ // x86_64 hfi_dwordcpy(dest=%rdi, src=%rsi, ndwords=%edx): copies
+ // count/2 qwords with "rep movsq", then the odd trailing dword (if
+ // any) with "rep movsd".  %rdi/%rsi/%rcx are scratch in the SysV
+ // x86_64 ABI, so nothing needs saving.
+ .globl hfi_dwordcpy
+ .file "opa_dwordcpy-x86_64-fast.S"
+ .text
+ .p2align 4,,15
+ // standard C calling convention, rdi is dest, rsi is source, rdx is count
+ // does not return any value
+hfi_dwordcpy:
+ .type hfi_dwordcpy, @function
+ movl %edx,%ecx
+ shrl $1,%ecx // %ecx = count / 2 (qwords to copy)
+ andl $1,%edx // %edx = count & 1 (leftover dword)
+ cld // copy forward
+ rep
+ movsq
+ movl %edx,%ecx // copy the odd dword, if any
+ rep
+ movsd
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/opa/opa_dwordcpy-x86_64.c b/opa/opa_dwordcpy-x86_64.c
new file mode 100644
index 0000000..929202d
--- /dev/null
+++ b/opa/opa_dwordcpy-x86_64.c
@@ -0,0 +1,298 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+/*
+ * Copy 'ndwords' 32-bit words from src to dest.
+ *
+ * The bulk is moved 8 dwords per iteration as four 64-bit stores; the
+ * remaining 0-7 dwords are finished by a fall-through switch.  dest is
+ * volatile -- presumably device (PIO) memory, so each store must be
+ * issued individually and in order (NOTE(review): confirm intent).
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+ uint_fast32_t ndw = ndwords;
+ /* [0] holds the running cursor; [1]..[3] are per-iteration addresses
+ for the second through fourth qword of each 32-byte group. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = (const uint64_t *) src;
+ dst64[0] = (volatile uint64_t *) dest;
+
+ while (ndw >= 8) {
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ ndw -= 8;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ }
+ if (ndw) {
+ /* 1-7 dwords left: fall back to 32-bit copies from the
+ current cursor position. */
+ src = (const uint32_t *) src64[0];
+ dest = (volatile uint32_t *) dst64[0];
+
+ switch (ndw) {
+ case 7:
+ *dest++ = *src++;
+ /* fall through */
+ case 6:
+ *dest++ = *src++;
+ /* fall through */
+ case 5:
+ *dest++ = *src++;
+ /* fall through */
+ case 4:
+ *dest++ = *src++;
+ /* fall through */
+ case 3:
+ *dest++ = *src++;
+ /* fall through */
+ case 2:
+ *dest++ = *src++;
+ /* fall through */
+ case 1:
+ *dest++ = *src++;
+ }
+
+ }
+}
+
+/*
+ * Copy 'nqwords' 64-bit words from src to dest.
+ *
+ * The bulk is moved 8 qwords per iteration as two unrolled groups of
+ * four 64-bit stores; the remaining 0-7 qwords are finished by a
+ * fall-through switch.  dest is volatile -- presumably device (PIO)
+ * memory, so store order matters (NOTE(review): confirm intent).
+ */
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+ uint_fast32_t nqw = nqwords;
+ /* [0] holds the running cursor; [1]..[3] are per-group addresses. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ while (nqw >= 8) {
+ /* first group of 4 qwords */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second group of 4 qwords */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ nqw -= 8;
+ }
+ if (nqw) {
+ /* 1-7 qwords left */
+ switch (nqw) {
+ case 7:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 6:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 5:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 4:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 3:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 2:
+ *(dst64[0])++ = *(src64[0])++;
+ /* fall through */
+ case 1:
+ *(dst64[0])++ = *(src64[0])++;
+ }
+ }
+}
+
+#ifdef __AVX512F__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 512-bit vector
+ * moves (one store per block).  dest must be 64-byte aligned
+ * (asserted); src may be unaligned, in which case unaligned loads are
+ * used.  nblock must be >= 1: the do/while body runs before the count
+ * is tested.
+ */
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m512i *dp = (volatile __m512i *) dest;
+ const __m512i *sp = (const __m512i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+ /* source and destination are both 64 byte aligned */
+ do {
+ __m512i tmp0 = _mm512_load_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ /* pointer bumps ride in the condition; both are non-NULL,
+ so the && chain only stops when nblock reaches 0 */
+ } while ((--nblock) && (++dp) && (++sp));
+ } else {
+ /* only destination is 64 byte aligned - use unaligned loads */
+ do {
+ __m512i tmp0 = _mm512_loadu_si512(sp);
+ _mm512_store_si512((__m512i *)dp, tmp0);
+ } while ((--nblock) && (++dp) && (++sp));
+ }
+}
+#endif
+
+#ifdef __AVX2__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 256-bit vector
+ * moves (two stores per block).  dest is asserted 64-byte aligned --
+ * stricter than the 32 bytes the stores themselves require.  src may
+ * be unaligned (unaligned loads are used then).  nblock must be >= 1:
+ * the do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m256i *dp = (volatile __m256i *) dest;
+ const __m256i *sp = (const __m256i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+ /* source and destination are both 32 byte aligned */
+ do {
+ __m256i tmp0 = _mm256_load_si256(sp);
+ __m256i tmp1 = _mm256_load_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ /* pointer bumps (assignments, non-NULL results) ride in
+ the condition; only nblock can end the loop */
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ } else {
+ /* only destination is 32 byte aligned - use unaligned loads */
+ do {
+ __m256i tmp0 = _mm256_loadu_si256(sp);
+ __m256i tmp1 = _mm256_loadu_si256(sp + 1);
+ _mm256_store_si256((__m256i *)dp, tmp0);
+ _mm256_store_si256((__m256i *)(dp + 1), tmp1);
+ } while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+ }
+}
+#endif
+
+#ifdef __SSE2__
+/*
+ * Copy 'nblock' 64-byte blocks from src to dest using 128-bit vector
+ * moves (four stores per block).  dest is asserted 64-byte aligned --
+ * stricter than the 16 bytes the stores themselves require.  src may
+ * be unaligned (unaligned loads are used then).  nblock must be >= 1:
+ * the do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ volatile __m128i *dp = (volatile __m128i *) dest;
+ const __m128i *sp = (const __m128i *) src;
+
+ psmi_assert((dp != NULL) && (sp != NULL));
+ psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+ if ((((uintptr_t) sp) & 0xf) == 0x0) {
+ /* source and destination are both 16 byte aligned */
+ do {
+ __m128i tmp0 = _mm_load_si128(sp);
+ __m128i tmp1 = _mm_load_si128(sp + 1);
+ __m128i tmp2 = _mm_load_si128(sp + 2);
+ __m128i tmp3 = _mm_load_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ /* pointer bumps ride in the condition; only nblock can
+ end the loop */
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ } else {
+ /* only destination is 16 byte aligned - use unaligned loads */
+ do {
+ __m128i tmp0 = _mm_loadu_si128(sp);
+ __m128i tmp1 = _mm_loadu_si128(sp + 1);
+ __m128i tmp2 = _mm_loadu_si128(sp + 2);
+ __m128i tmp3 = _mm_loadu_si128(sp + 3);
+ _mm_store_si128((__m128i *)dp, tmp0);
+ _mm_store_si128((__m128i *)(dp + 1), tmp1);
+ _mm_store_si128((__m128i *)(dp + 2), tmp2);
+ _mm_store_si128((__m128i *)(dp + 3), tmp3);
+ } while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+ }
+}
+#endif
+
+/*
+ * Scalar fallback: copy 'nblock' 64-byte blocks from src to dest as
+ * eight 64-bit stores per block (two unrolled groups of four).  dest
+ * must be 64-byte aligned (asserted).  nblock must be >= 1: the
+ * do/while body runs before the count is tested.
+ */
+void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+ /* [0] holds the running cursor; [1]..[3] are per-group addresses. */
+ const uint64_t *src64[4];
+ volatile uint64_t *dst64[4];
+ src64[0] = src;
+ dst64[0] = dest;
+
+ psmi_assert((dst64[0] != NULL) && (src64[0] != NULL));
+ psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0);
+
+ do {
+ /* first 32 bytes of the block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+
+ /* second 32 bytes of the block */
+ *dst64[0] = *src64[0];
+ src64[1] = src64[0] + 1;
+ src64[2] = src64[0] + 2;
+ src64[3] = src64[0] + 3;
+ dst64[1] = dst64[0] + 1;
+ dst64[2] = dst64[0] + 2;
+ dst64[3] = dst64[0] + 3;
+ *dst64[1] = *src64[1];
+ *dst64[2] = *src64[2];
+ *dst64[3] = *src64[3];
+ src64[0] += 4;
+ dst64[0] += 4;
+ } while (--nblock);
+}
diff --git a/opa/opa_i2cflash.c b/opa/opa_i2cflash.c
new file mode 100644
index 0000000..5b54bc2
--- /dev/null
+++ b/opa/opa_i2cflash.c
@@ -0,0 +1,87 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "opa_user.h"
+
+/*
+ * Compute the 8-bit additive checksum of a flash block.
+ *
+ * Sums ifp->if_length bytes of *ifp (capped at sizeof(struct
+ * hfi_flash)), subtracts the stored if_csum byte so the stored value
+ * does not affect the result, and returns the one's complement of the
+ * sum.  If 'adjust' is non-zero, the computed checksum is also written
+ * back into ifp->if_csum.
+ */
+uint8_t hfi_flash_csum(struct hfi_flash *ifp, int adjust)
+{
+ uint8_t *ip = (uint8_t *) ifp;
+ uint8_t csum = 0, len;
+
+ /*
+ * Limit length checksummed to max length of actual data.
+ * Checksum of erased eeprom will still be bad, but we avoid
+ * reading past the end of the buffer we were passed.
+ */
+ len = ifp->if_length;
+ if (len > sizeof(struct hfi_flash))
+ len = sizeof(struct hfi_flash);
+ while (len--)
+ csum += *ip++;
+ csum -= ifp->if_csum; /* back out the stored checksum byte */
+ csum = ~csum;
+ if (adjust)
+ ifp->if_csum = csum;
+ return csum;
+}
diff --git a/opa/opa_proto.c b/opa/opa_proto.c
new file mode 100644
index 0000000..c9eb9f4
--- /dev/null
+++ b/opa/opa_proto.c
@@ -0,0 +1,578 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "ipserror.h"
+#include "opa_user.h"
+#include "opa_udebug.h"
+
+#include <sched.h>
+
+#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+
+/* It is allowed to have multiple devices (and of different types)
+ simultaneously opened and initialized, although this is (still! Oct 07)
+ not fully implemented. This routine is used by the low level hfi
+ protocol code (and any other code that has similar low level
+ functionality).
+ This is the only routine that takes a file descriptor, rather than a
+ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything
+ else is returned as part of hfi1_base_info.
+*/
+/*
+ * Initialize an hfi1 context on an already-opened driver fd.
+ *
+ * fd    - file descriptor for the hfi1 device, opened by the caller
+ * uinfo - in/out context-assignment parameters; the driver's answers
+ *         (userversion, subctxt_cnt/id, ...) are copied back into it
+ *
+ * Sequence: (1) ASSIGN_CTXT, (2) CTXT_INFO + sanity checks,
+ * (3) optional CPU affinity, (4) USER_INFO, then mmap of all the
+ * context resources (credits, PIO buffers, rcvhdrq, eager buffers,
+ * sdma completion queue, CSRs, event/status pages, subcontext areas).
+ *
+ * Returns a freshly calloc'ed struct _hfi_ctrl on success, or NULL on
+ * failure (errno set by the failing driver command or mmap).
+ */
+struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo)
+{
+ struct _hfi_ctrl *spctrl = NULL;
+ struct hfi1_ctxt_info *cinfo;
+ struct hfi1_base_info *binfo;
+ void *tmp;
+ uint64_t *tmp64;
+ struct hfi1_cmd c;
+ uintptr_t pg_mask;
+ int __hfi_pg_sz;
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ /* for major version 6 of driver, we will use uinfo_new. See below for details. */
+ struct hfi1_user_info uinfo_new = {0};
+#endif
+
+ /* First get the page size */
+ /* NOTE(review): sysconf() failure (-1) is not checked here; pg_mask
+ would be bogus in that case -- presumably _SC_PAGESIZE never fails
+ on supported platforms; confirm. */
+ __hfi_pg_sz = sysconf(_SC_PAGESIZE);
+ pg_mask = ~(intptr_t) (__hfi_pg_sz - 1);
+
+ if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) {
+ _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n",
+ strerror(errno));
+ goto err;
+ }
+ cinfo = &spctrl->ctxt_info;
+ binfo = &spctrl->base_info;
+
+ _HFI_VDBG("uinfo: ver %x, alg %d, subc_cnt %d, subc_id %d\n",
+ uinfo->userversion, uinfo->hfi1_alg,
+ uinfo->subctxt_cnt, uinfo->subctxt_id);
+
+ /* 1. ask driver to assign context to current process */
+ memset(&c, 0, sizeof(struct hfi1_cmd));
+ c.type = PSMI_HFI_CMD_ASSIGN_CTXT;
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ /* If psm is communicating with a MAJOR version 6 driver, we need
+ to pass in an actual struct hfi1_user_info not a hfi1_user_info_dep.
+ Else if psm is communicating with a MAJOR version 5 driver, we can
+ just continue to pass a hfi1_user_info_dep as struct hfi1_user_info_dep
+ is identical to the MAJOR version 5 struct hfi1_user_info. */
+ if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR)
+ {
+ /* If psm is communicating with a MAJOR version 6 driver,
+ we copy uinfo into uinfo_new and pass uinfo_new to the driver. */
+ c.len = sizeof(uinfo_new);
+ c.addr = (__u64) (&uinfo_new);
+
+ uinfo_new.userversion = uinfo->userversion;
+ uinfo_new.pad = uinfo->pad;
+ uinfo_new.subctxt_cnt = uinfo->subctxt_cnt;
+ uinfo_new.subctxt_id = uinfo->subctxt_id;
+ memcpy(uinfo_new.uuid,uinfo->uuid,sizeof(uinfo_new.uuid));
+ }
+ else
+ {
+ /* If psm is working with an old driver, we continue to use
+ the struct hfi1_user_info_dep version of the struct: */
+ c.len = sizeof(*uinfo);
+ c.addr = (__u64) uinfo;
+ }
+#else
+ c.len = sizeof(*uinfo);
+ c.addr = (__u64) uinfo;
+#endif
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ if (errno == ENODEV) {
+ _HFI_INFO("PSM2 and driver version mismatch\n");
+ /* Overwrite errno. One would wish that the driver
+ * didn't return ENODEV for a version mismatch */
+ errno = EPROTONOSUPPORT;
+ } else {
+ _HFI_INFO("assign_context command failed: %s\n",
+ strerror(errno));
+ }
+ goto err;
+ }
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR)
+ {
+ /* for the new driver, we copy the results of the call back to uinfo from
+ uinfo_new. */
+ uinfo->userversion = uinfo_new.userversion;
+ uinfo->pad = uinfo_new.pad;
+ uinfo->subctxt_cnt = uinfo_new.subctxt_cnt;
+ uinfo->subctxt_id = uinfo_new.subctxt_id;
+ memcpy(uinfo->uuid,uinfo_new.uuid,sizeof(uinfo_new.uuid));
+ }
+#endif
+
+ /* 2. get context info from driver */
+ c.type = PSMI_HFI_CMD_CTXT_INFO;
+ c.len = sizeof(*cinfo);
+ c.addr = (__u64) cinfo;
+
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+ goto err;
+ }
+
+ /* sanity checking... */
+ if (cinfo->rcvtids%8) {
+ _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
+ goto err;
+ }
+ if (cinfo->egrtids%8) {
+ _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids);
+ goto err;
+ }
+ if (cinfo->rcvtids < cinfo->egrtids) {
+ _HFI_INFO("rcvtids(%d) < egrtids(%d)\n",
+ cinfo->rcvtids, cinfo->egrtids);
+ goto err;
+ }
+ if (cinfo->rcvhdrq_cnt%32) {
+ _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n",
+ cinfo->rcvhdrq_cnt);
+ goto err;
+ }
+ if (cinfo->rcvhdrq_entsize%64) {
+ _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n",
+ cinfo->rcvhdrq_entsize);
+ goto err;
+ }
+ if (cinfo->rcvegr_size%__hfi_pg_sz) {
+ _HFI_INFO("rcvegr_size not page multiple: %d\n",
+ cinfo->rcvegr_size);
+ goto err;
+ }
+
+ _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n",
+ cinfo->runtime_flags, cinfo->rcvegr_size);
+ _HFI_VDBG("ctxtinfo: active %d, unit %d, ctxt %d, subctxt %d\n",
+ cinfo->num_active, cinfo->unit, cinfo->ctxt, cinfo->subctxt);
+ _HFI_VDBG("ctxtinfo: rcvtids %d, credits %d\n",
+ cinfo->rcvtids, cinfo->credits);
+ _HFI_VDBG("ctxtinfo: numa %d, cpu %x, send_ctxt %d\n",
+ cinfo->numa_node, cinfo->rec_cpu, cinfo->send_ctxt);
+ _HFI_VDBG("ctxtinfo: rcvhdrq_cnt %d, rcvhdrq_entsize %d\n",
+ cinfo->rcvhdrq_cnt, cinfo->rcvhdrq_entsize);
+ _HFI_VDBG("ctxtinfo: egrtids %d, sdma_ring_size %d\n",
+ cinfo->egrtids, cinfo->sdma_ring_size);
+
+ /* 3. (unnumbered in original) optionally bind this process to the
+ driver-recommended CPU; a failure here is logged but not fatal */
+ /* if affinity has not been setup, set it */
+ if ((!getenv("HFI_NO_CPUAFFINITY") && cinfo->rec_cpu != (__u16) -1) ||
+ getenv("HFI_FORCE_CPUAFFINITY")) {
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cinfo->rec_cpu, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
+ _HFI_INFO("Couldn't set runon processor %u "
+ "(unit:context %u:%u) (%u active chips): %s\n",
+ cinfo->rec_cpu, cinfo->unit, cinfo->ctxt,
+ cinfo->num_active, strerror(errno));
+ }
+ }
+
+
+ /* 4. Get user base info from driver */
+ c.type = PSMI_HFI_CMD_USER_INFO;
+ c.len = sizeof(*binfo);
+ c.addr = (__u64) binfo;
+
+ if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+ _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno));
+ goto err;
+ }
+
+ hfi_set_user_version(binfo->sw_version);
+
+ _HFI_VDBG("baseinfo: hwver %x, swver %x, jkey %d, qp %d\n",
+ binfo->hw_version, binfo->sw_version,
+ binfo->jkey, binfo->bthqp);
+ _HFI_VDBG("baseinfo: credit_addr %llx, sop %llx, pio %llx\n",
+ binfo->sc_credits_addr, binfo->pio_bufbase_sop,
+ binfo->pio_bufbase);
+ _HFI_VDBG("baseinfo: hdrbase %llx, egrbase %llx, sdmabase %llx\n",
+ binfo->rcvhdr_bufbase, binfo->rcvegr_bufbase,
+ binfo->sdma_comp_bufbase);
+ _HFI_VDBG("baseinfo: ureg %llx, eventbase %llx, "
+ "statusbase %llx, tailaddr %llx\n", binfo->user_regbase,
+ binfo->events_bufbase, binfo->status_bufbase,
+ binfo->rcvhdrtail_base);
+
+ if (getenv("PSM2_IDENTIFY")) {
+ printf("%s %s run-time driver interface v%d.%d\n",
+ hfi_get_mylabel(), hfi_ident_tag, hfi_get_user_major_version(), hfi_get_user_minor_version());
+ }
+
+ /*
+ * Check if driver version matches PSM version,
+ * this is different from PSM API version.
+ */
+ if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) {
+ _HFI_INFO
+ ("User major version 0x%x not same as driver major 0x%x\n",
+ hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT);
+ if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version())
+ goto err; /* else assume driver knows how to be compatible */
+ } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) {
+ /* minor mismatch is debug-logged only, never fatal */
+ _HFI_PRDBG
+ ("User minor version 0x%x not same as driver minor 0x%x\n",
+ HFI1_USER_SWMINOR, binfo->sw_version & 0xffff);
+ }
+
+ /* Map the PIO credits address */
+ /* Each mmap below maps the page-aligned base; the sub-page offset
+ (addr & ~pg_mask) is re-applied where the address is used at a
+ finer granularity than a page. */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->sc_credits_addr &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO("mmap of sc_credits_addr (%llx) failed: %s\n",
+ (unsigned long long)binfo->sc_credits_addr,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, __hfi_pg_sz);
+ binfo->sc_credits_addr = (uint64_t) (uintptr_t) tmp |
+ (binfo->sc_credits_addr & ~pg_mask);
+ _HFI_VDBG("sc_credits_addr %llx\n",
+ binfo->sc_credits_addr);
+ }
+
+ /* Map the PIO buffer SOP address */
+ if ((tmp = hfi_mmap64(0, cinfo->credits * 64,
+ PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->pio_bufbase_sop & pg_mask))
+ == MAP_FAILED) {
+ _HFI_INFO("mmap of pio buffer sop at %llx failed: %s\n",
+ (unsigned long long)binfo->pio_bufbase_sop,
+ strerror(errno));
+ goto err;
+ } else {
+ /* Do not try to read the PIO buffers; they are mapped write */
+ /* only. We'll fault them in as we write to them. */
+ binfo->pio_bufbase_sop = (uintptr_t) tmp;
+ _HFI_VDBG("pio_bufbase_sop %llx\n",
+ binfo->pio_bufbase_sop);
+ }
+
+ /* Map the PIO buffer address */
+ if ((tmp = hfi_mmap64(0, cinfo->credits * 64,
+ PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->pio_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of pio buffer at %llx failed: %s\n",
+ (unsigned long long)binfo->pio_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* Do not try to read the PIO buffers; they are mapped write */
+ /* only. We'll fault them in as we write to them. */
+ binfo->pio_bufbase = (uintptr_t) tmp;
+ _HFI_VDBG("sendpio_bufbase %llx\n", binfo->pio_bufbase);
+ }
+
+ /* Map the receive header queue */
+ if ((tmp =
+ hfi_mmap64(0, cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvhdr_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of rcvhdrq at %llx failed: %s\n",
+ (unsigned long long)binfo->rcvhdr_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* for use in protocol code */
+ hfi_touch_mmap(tmp,
+ cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize);
+ binfo->rcvhdr_bufbase = (uintptr_t) tmp; /* set to mapped address */
+ _HFI_VDBG("rcvhdr_bufbase %llx\n", binfo->rcvhdr_bufbase);
+ }
+
+ /* Map the receive eager buffer */
+ if ((tmp =
+ hfi_mmap64(0, cinfo->egrtids * cinfo->rcvegr_size,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvegr_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of rcvegrq bufs from %llx failed: %s\n",
+ (unsigned long long)binfo->rcvegr_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, cinfo->egrtids * cinfo->rcvegr_size);
+ binfo->rcvegr_bufbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("rcvegr_bufbase %llx\n", binfo->rcvegr_bufbase);
+ }
+
+ /* Map the sdma completion queue */
+ if (!(cinfo->runtime_flags & HFI1_CAP_SDMA)) {
+ /* SDMA not enabled for this context; nothing to map */
+ binfo->sdma_comp_bufbase = 0;
+ } else
+ if ((tmp =
+ hfi_mmap64(0, cinfo->sdma_ring_size *
+ sizeof(struct hfi1_sdma_comp_entry),
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->sdma_comp_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of sdma completion queue from %llx failed: %s\n",
+ (unsigned long long)binfo->sdma_comp_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->sdma_comp_bufbase = (uint64_t) (uintptr_t) tmp;
+ }
+ _HFI_VDBG("sdma_comp_bufbase %llx\n", binfo->sdma_comp_bufbase);
+
+ /* Map RXE per-context CSRs */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_WRITE | PROT_READ, MAP_SHARED | MAP_LOCKED,
+ fd,
+ (__off64_t) binfo->user_regbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of user registers at %llx failed: %s\n",
+ (unsigned long long)binfo->user_regbase,
+ strerror(errno));
+ goto err;
+ } else {
+ /* we don't try to fault these in, no need */
+ binfo->user_regbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("user_regbase %llx\n", binfo->user_regbase);
+ }
+
+ /*
+ * Set up addresses for optimized register writeback routines.
+ * This is for the real onchip registers, shared context or not
+ */
+ tmp64 = (uint64_t *) tmp; /* tmp still holds the user_regbase mapping */
+ spctrl->__hfi_rcvhdrtail = (volatile __le64 *)&tmp64[ur_rcvhdrtail];
+ spctrl->__hfi_rcvhdrhead = (volatile __le64 *)&tmp64[ur_rcvhdrhead];
+ spctrl->__hfi_rcvegrtail =
+ (volatile __le64 *)&tmp64[ur_rcvegrindextail];
+ spctrl->__hfi_rcvegrhead =
+ (volatile __le64 *)&tmp64[ur_rcvegrindexhead];
+ spctrl->__hfi_rcvofftail =
+ (volatile __le64 *)&tmp64[ur_rcvegroffsettail];
+
+ if (!(cinfo->runtime_flags & HFI1_CAP_HDRSUPP)) {
+ /* no header suppression: point tidflow at scratch regs */
+ spctrl->__hfi_rcvtidflow = spctrl->regs;
+ spctrl->__hfi_tfvalid = 0;
+ } else {
+ spctrl->__hfi_rcvtidflow =
+ (volatile __le64 *)&tmp64[ur_rcvtidflowtable];
+ spctrl->__hfi_tfvalid = 1;
+ }
+
+ /* Map the rcvhdrq tail register address */
+ if (!(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ /*
+ * We don't use receive header queue tail register to detect
+ * new packets, but here we save the address for
+ * false-eager-full recovery.
+ */
+ binfo->rcvhdrtail_base =
+ (uint64_t) (uintptr_t) spctrl->__hfi_rcvhdrtail;
+ spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base;
+ } else
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->rcvhdrtail_base &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO("mmap of rcvhdrq tail addr %llx failed: %s\n",
+ (unsigned long long)binfo->rcvhdrtail_base,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, __hfi_pg_sz);
+ binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) tmp;
+ spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base;
+ }
+ _HFI_VDBG("rcvhdr_tail_addr %llx\n", binfo->rcvhdrtail_base);
+
+ /* Map the event page */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->events_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of status page at %llx failed: %s\n",
+ (unsigned long long)binfo->events_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->events_bufbase = (uint64_t) (uintptr_t) tmp |
+ (binfo->events_bufbase & ~pg_mask);
+ _HFI_VDBG("events_bufbase %llx\n", binfo->events_bufbase);
+ }
+
+ /* Map the status page */
+ if ((tmp = hfi_mmap64(0, __hfi_pg_sz,
+ PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->status_bufbase & pg_mask)) ==
+ MAP_FAILED) {
+ _HFI_INFO("mmap of status page (%llx) failed: %s\n",
+ (unsigned long long)binfo->status_bufbase,
+ strerror(errno));
+ goto err;
+ } else {
+ binfo->status_bufbase = (uintptr_t) tmp;
+ _HFI_VDBG("status_bufbase %llx\n", binfo->status_bufbase);
+ }
+
+ /* If subcontext is used, map the buffers */
+ if (uinfo->subctxt_cnt) {
+ unsigned num_subcontexts = uinfo->subctxt_cnt;
+ size_t size;
+
+ size = __hfi_pg_sz;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_uregbase &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext uregbase array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_uregbase,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_uregbase = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_uregbase %llx\n",
+ binfo->subctxt_uregbase);
+ }
+
+ size = ALIGN(cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize,
+ __hfi_pg_sz) * num_subcontexts;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_rcvhdrbuf &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_rcvhdrbuf,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_rcvhdrbuf = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_rcvhdrbuf %llx\n",
+ binfo->subctxt_rcvhdrbuf);
+ }
+
+ size = ALIGN(cinfo->egrtids * cinfo->rcvegr_size,
+ __hfi_pg_sz) * num_subcontexts;
+ if ((tmp = hfi_mmap64(0, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED, fd,
+ (__off64_t) binfo->subctxt_rcvegrbuf &
+ pg_mask)) == MAP_FAILED) {
+ _HFI_INFO
+ ("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n",
+ (unsigned long long)binfo->subctxt_rcvegrbuf,
+ strerror(errno));
+ goto err;
+ } else {
+ hfi_touch_mmap(tmp, size);
+ binfo->subctxt_rcvegrbuf = (uint64_t) (uintptr_t) tmp;
+ _HFI_VDBG("subctxt_rcvegrbuf %llx\n",
+ binfo->subctxt_rcvegrbuf);
+ }
+ }
+
+ /* Save some info. */
+ spctrl->fd = fd;
+ spctrl->__hfi_unit = cinfo->unit;
+ /*
+ * driver should provide the port where the context is opened for, But
+ * OPA driver does not have port interface to psm because there is only
+ * one port. So we hardcode the port to 1 here. When we work on the
+ * version of PSM for the successor to OPA, we should have port returned
+ * from driver and will be set accordingly.
+ */
+ /* spctrl->__hfi_port = cinfo->port; */
+ spctrl->__hfi_port = 1;
+ spctrl->__hfi_tidegrcnt = cinfo->egrtids;
+ spctrl->__hfi_tidexpcnt = cinfo->rcvtids - cinfo->egrtids;
+
+ return spctrl;
+
+err:
+ /* NOTE(review): mappings established before a failure are not
+ unmapped here -- presumably reclaimed at process teardown or by
+ the caller closing fd; confirm. Only the control struct is freed. */
+ if (spctrl)
+ free(spctrl);
+ return NULL;
+}
diff --git a/opa/opa_service.c b/opa/opa_service.c
new file mode 100644
index 0000000..38e6518
--- /dev/null
+++ b/opa/opa_service.c
@@ -0,0 +1,909 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low
+ level hfi protocol code. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <poll.h>
+#include "opa_service.h"
+#include "psmi_wrappers.h"
+
+/* Software-interface version, viewable either as separate 16-bit
+ * minor/major fields or as a single 32-bit word.
+ * NOTE(review): with this field order, 'minor' maps to the low half of
+ * 'version' only on little-endian hosts -- confirm if portability to
+ * big-endian targets matters. */
+typedef union
+{
+	struct
+	{
+		uint16_t minor;
+		uint16_t major;
+	};
+	uint32_t version;
+} sw_version_t;
+
+/* Compiled-in user software version; may be overridden at runtime via
+ * hfi_set_user_version() / hfi_set_user_major_version() below. */
+static sw_version_t sw_version =
+{
+	{
+	.major = HFI1_USER_SWMAJOR,
+	.minor = HFI1_USER_SWMINOR
+	}
+};
+
+/*
+ * This function is necessary in a udev-based world. There can be an
+ * arbitrarily long (but typically less than one second) delay between
+ * a driver getting loaded and any dynamic special files turning up.
+ *
+ * The timeout is in milliseconds. A value of zero means "callee
+ * decides timeout". Negative is infinite.
+ *
+ * Returns 0 on success, -1 on error or timeout. Check errno to see
+ * whether there was a timeout (ETIMEDOUT) or an error (any other
+ * non-zero value).
+ */
+int hfi_wait_for_device(const char *path, long timeout)
+{
+	int saved_errno;
+	struct stat st;
+	long elapsed;
+	int ret;
+
+	/* timeout == 0 means "callee decides": default to 15 seconds. */
+	if (timeout == 0)
+		timeout = 15000;
+
+	elapsed = 0;
+
+	while (1) {
+		static const long default_ms = 250;
+		struct timespec req = { 0 };
+		long ms;
+
+		ret = stat(path, &st);
+		saved_errno = errno;
+
+		/* Done if the file exists, or on any error other than
+		 * "not there yet" (ENOENT). */
+		if (ret == 0 || (ret == -1 && errno != ENOENT))
+			break;
+
+		if ((timeout > 0) && ((timeout - elapsed) <= 0)) {
+			saved_errno = ETIMEDOUT;
+			break;
+		}
+
+		if (elapsed == 0) {
+			if (timeout < 0)
+				_HFI_DBG
+				    ("Device file %s not present on first check; "
+				     "waiting indefinitely...\n", path);
+			else
+				_HFI_DBG
+				    ("Device file %s not present on first check; "
+				     "waiting up to %.1f seconds...\n", path,
+				     timeout / 1e3);
+		}
+
+		/* Sleep in short slices so the deadline is never overshot.
+		 * Bug fix: the final slice is the *remaining* time
+		 * (timeout - elapsed); the old code slept the full
+		 * 'timeout' again, oversleeping past the deadline. */
+		if (timeout < 0 || timeout - elapsed >= default_ms)
+			ms = default_ms;
+		else
+			ms = timeout - elapsed;
+
+		elapsed += ms;
+		req.tv_nsec = ms * 1000000;
+
+		ret = nanosleep(&req, NULL);
+		saved_errno = errno;
+
+		if (ret == -1)
+			break;
+	}
+
+	if (ret == 0)
+		_HFI_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3);
+	else
+		_HFI_INFO
+		    ("The %s device failed to appear after %.1f seconds: %s\n",
+		     path, elapsed / 1e3, strerror(saved_errno));
+
+	errno = saved_errno;
+	return ret;
+}
+
+/* fwd declaration */
+ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count);
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+
+/* fwd declaration */
+ustatic int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count);
+/* Function pointer.  Starts on the ioctl() command transport and is
+ * switched to _hfi_cmd_write in hfi_context_open_ex() if the driver
+ * turns out to be an older, write()-based one. */
+static int (*_hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_ioctl;
+
+#else
+/* Function pointer.  Only the legacy write() transport is compiled in,
+ * so the pointer can be const. */
+static int (*const _hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_write;
+#endif
+
+/* Report the major half of the cached user software-interface version. */
+uint16_t hfi_get_user_major_version(void)
+{
+	uint16_t maj = sw_version.major;
+	return maj;
+}
+
+/* Override the major half of the cached user software-interface version. */
+void hfi_set_user_major_version(uint16_t major_version)
+{
+	sw_version.major = major_version;
+}
+
+/* Report the minor half of the cached user software-interface version. */
+uint16_t hfi_get_user_minor_version(void)
+{
+	uint16_t min = sw_version.minor;
+	return min;
+}
+
+/* Replace the whole cached version word (both major and minor) at once. */
+void hfi_set_user_version(uint32_t version)
+{
+	sw_version.version = version;
+}
+
+/* Open a context without reporting the resolved device name back to the
+ * caller.  Thin convenience wrapper around hfi_context_open_ex(). */
+int hfi_context_open(int unit, int port, uint64_t open_timeout)
+{
+	char scratch_name[256];
+
+	return hfi_context_open_ex(unit, port, open_timeout,
+				   scratch_name, sizeof(scratch_name));
+}
+
+/* Open the unit's device special file, waiting up to open_timeout ms for
+ * it to appear.  The resolved device path is written into dev_name
+ * (dev_name_len bytes).  Returns the open fd, or -1 on failure. */
+int hfi_context_open_ex(int unit, int port, uint64_t open_timeout,
+		     char *dev_name,size_t dev_name_len)
+{
+	int fd;
+
+	/* A specific unit maps to <HFI_DEVICE_PATH>_<unit>; "any" opens
+	   unit 0. */
+	if (unit != HFI_UNIT_ID_ANY && unit >= 0)
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH,
+			 unit);
+	else
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH,
+			 0);
+
+	if (hfi_wait_for_device(dev_name, (long)open_timeout) == -1) {
+		_HFI_DBG("Could not find an HFI Unit on device "
+			 "%s (%lds elapsed)", dev_name,
+			 (long)open_timeout / 1000);
+		return -1;
+	}
+
+	if ((fd = open(dev_name, O_RDWR)) == -1) {
+		_HFI_DBG("(host:Can't open %s for reading and writing",
+			 dev_name);
+		return -1;
+	}
+
+	/* Best effort only; failing to set CLOEXEC is logged, not fatal. */
+	if (fcntl(fd, F_SETFD, FD_CLOEXEC))
+		_HFI_INFO("Failed to set close on exec for device: %s\n",
+			  strerror(errno));
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+	{
+		/* if hfi1DriverMajor == -1, then we are potentially talking to a new driver.
+		   Let's confirm by issuing an ioctl version request: */
+		struct hfi1_cmd c;
+
+		memset(&c, 0, sizeof(struct hfi1_cmd));
+		c.type = PSMI_HFI_CMD_GET_VERS;
+		c.len = 0;
+		c.addr = 0;
+
+		if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+			/* Let's assume that the driver is the old driver */
+			hfi_set_user_major_version(IOCTL_CMD_API_MODULE_MAJOR - 1);
+			/* the old driver uses write() for its command interface: */
+			_hfi_cmd_send = _hfi_cmd_write;
+		}
+		else
+		{
+			/* The ioctl succeeded and returned the driver's
+			   version in c.addr. */
+			int major = c.addr >> HFI1_SWMAJOR_SHIFT;
+			if (major != hfi_get_user_major_version()) {
+				/* If there is a skew between the major version of the driver
+				   that is executing and the major version which was used during
+				   compilation of PSM, we treat that is a fatal error. */
+				_HFI_INFO("PSM2 and driver version mismatch: (%d != %d)\n",
+					  major, hfi_get_user_major_version());
+				close(fd);
+				return -1;
+			}
+		}
+	}
+
+#endif
+	return fd;
+}
+
+void hfi_context_close(int fd)
+{
+ (void)close(fd);
+}
+
+/* Forward a vectored command straight to the driver fd via writev(2). */
+int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt)
+{
+	int nwritten = writev(fd, iov, iovcnt);
+	return nwritten;
+}
+
+/* Dispatch a driver command through whichever transport is currently
+ * selected by the _hfi_cmd_send pointer (ioctl or legacy write). */
+int hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	return _hfi_cmd_send(fd, cmd, count);
+}
+
+/* Legacy transport: translate the PSM command enum into the old
+ * driver's write()-interface command numbers and send via psmi_write. */
+ustatic
+int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	/* Enum -> legacy command-number translation table; any enum value
+	 * not listed maps to 0. */
+	const static unsigned int cmdTypeToWriteNum[PSMI_HFI_CMD_LAST] = {
+        [PSMI_HFI_CMD_ASSIGN_CTXT]      = LEGACY_HFI1_CMD_ASSIGN_CTXT,
+        [PSMI_HFI_CMD_CTXT_INFO]        = LEGACY_HFI1_CMD_CTXT_INFO,
+        [PSMI_HFI_CMD_USER_INFO]        = LEGACY_HFI1_CMD_USER_INFO,
+        [PSMI_HFI_CMD_TID_UPDATE]       = LEGACY_HFI1_CMD_TID_UPDATE,
+        [PSMI_HFI_CMD_TID_FREE]         = LEGACY_HFI1_CMD_TID_FREE,
+        [PSMI_HFI_CMD_CREDIT_UPD]       = LEGACY_HFI1_CMD_CREDIT_UPD,
+        [PSMI_HFI_CMD_RECV_CTRL]        = LEGACY_HFI1_CMD_RECV_CTRL,
+        [PSMI_HFI_CMD_POLL_TYPE]        = LEGACY_HFI1_CMD_POLL_TYPE,
+        [PSMI_HFI_CMD_ACK_EVENT]        = LEGACY_HFI1_CMD_ACK_EVENT,
+        [PSMI_HFI_CMD_SET_PKEY]         = LEGACY_HFI1_CMD_SET_PKEY,
+        [PSMI_HFI_CMD_CTXT_RESET]       = LEGACY_HFI1_CMD_CTXT_RESET,
+        [PSMI_HFI_CMD_TID_INVAL_READ]   = LEGACY_HFI1_CMD_TID_INVAL_READ,
+        [PSMI_HFI_CMD_GET_VERS]         = LEGACY_HFI1_CMD_GET_VERS,
+	};
+
+	if (cmd->type < PSMI_HFI_CMD_LAST) {
+		/* NOTE: cmd->type is rewritten in place before sending. */
+		cmd->type = cmdTypeToWriteNum[cmd->type];
+
+		return psmi_write(fd, cmd, count);
+	} else {
+		errno = EINVAL;
+		return -1;
+	}
+}
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+/* New transport: map the PSM command enum onto the driver's ioctl
+ * numbers and issue the ioctl. */
+ustatic
+int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	/* Index 0: pass cmd->addr itself (it already refers to the command's
+	 * argument).  Index 1: pass the address of cmd->addr so the driver
+	 * can write a result back into it (e.g. GET_VERS returns the
+	 * version there; see hfi_context_open_ex). */
+	uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr };
+	/* Per-command ioctl number plus which addrOrLiteral slot to use. */
+	const static struct
+	{
+		unsigned int ioctlCmd;
+		unsigned int addrOrLiteralIdx;
+	} cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = {
+        [PSMI_HFI_CMD_ASSIGN_CTXT]      = {HFI1_IOCTL_ASSIGN_CTXT   , 0},
+        [PSMI_HFI_CMD_CTXT_INFO]        = {HFI1_IOCTL_CTXT_INFO     , 0},
+        [PSMI_HFI_CMD_USER_INFO]        = {HFI1_IOCTL_USER_INFO     , 0},
+        [PSMI_HFI_CMD_TID_UPDATE]       = {HFI1_IOCTL_TID_UPDATE    , 0},
+        [PSMI_HFI_CMD_TID_FREE]         = {HFI1_IOCTL_TID_FREE      , 0},
+        [PSMI_HFI_CMD_CREDIT_UPD]       = {HFI1_IOCTL_CREDIT_UPD    , 1},
+        [PSMI_HFI_CMD_RECV_CTRL]        = {HFI1_IOCTL_RECV_CTRL     , 1},
+        [PSMI_HFI_CMD_POLL_TYPE]        = {HFI1_IOCTL_POLL_TYPE     , 1},
+        [PSMI_HFI_CMD_ACK_EVENT]        = {HFI1_IOCTL_ACK_EVENT     , 1},
+        [PSMI_HFI_CMD_SET_PKEY]         = {HFI1_IOCTL_SET_PKEY      , 1},
+        [PSMI_HFI_CMD_CTXT_RESET]       = {HFI1_IOCTL_CTXT_RESET    , 1},
+        [PSMI_HFI_CMD_TID_INVAL_READ]   = {HFI1_IOCTL_TID_INVAL_READ, 0},
+        [PSMI_HFI_CMD_GET_VERS]         = {HFI1_IOCTL_GET_VERS      , 1},
+#ifdef PSM_CUDA
+	[PSMI_HFI_CMD_TID_UPDATE_V2]	= {HFI1_IOCTL_TID_UPDATE_V2 , 0},
+#endif
+	};
+
+	if (cmd->type < PSMI_HFI_CMD_LAST)
+		return psmi_ioctl(fd,
+				  cmdTypeToIoctlNum[cmd->type].ioctlCmd,
+				  addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]);
+	else
+	{
+		errno = EINVAL;
+		return -1;
+	}
+}
+#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */
+
+/* we use mmap64() because we compile in both 32 and 64 bit mode,
+ and we have to map physical addresses that are > 32 bits long.
+ While linux implements mmap64, it doesn't have a man page,
+ and isn't declared in any header file, so we declare it here ourselves.
+
+ We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+ redirects mmap to mmap64 for us, but at least through suse10 and fc4,
+ it doesn't work when the address being mapped is > 32 bits. It chips
+ off bits 32 and above. So we stay with mmap64. */
+/* Thin wrapper so callers get a declared mmap64 entry point (see the
+ * rationale in the comment block above). */
+void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd,
+		 __off64_t offset)
+{
+	return mmap64(addr, length, prot, flags, fd, offset);
+}
+
+/* Count consecutive device special files <HFI_DEVICE_PATH>_0, _1, ...
+ * until one is missing; returns how many were found (>= 0, 0 = none).
+ * Finding a device file does not guarantee a working chip behind it.
+ * Only the first unit is waited for (udev delay); later units are
+ * probed with a plain stat() to save time. */
+int hfi_get_num_units(void)
+{
+	int unit;
+
+	for (unit = 0;; unit++) {
+		char devpath[PATH_MAX];
+		struct stat sbuf;
+		int missing;
+
+		snprintf(devpath, sizeof(devpath), HFI_DEVICE_PATH "_%d",
+			 unit);
+		if (unit == 0)
+			missing = hfi_wait_for_device(devpath, 0);
+		else
+			missing = stat(devpath, &sbuf);
+		if (missing)
+			break;
+	}
+
+	return unit;
+}
+
+/* Given a unit number, returns 1 if any port on the unit is active.
+   returns 0 if no port on the unit is active.
+   returns -1 when an error occurred. */
+int hfi_get_unit_active(int unit)
+{
+	/* Bug fix: rv was uninitialized; if the port loop ran zero times
+	 * (HFI_MIN_PORT > HFI_MAX_PORT) the return value was garbage. */
+	int p, rv = -1;
+
+	for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++)
+		if ((rv = hfi_get_port_lid(unit, p)) > 0)
+			break;
+
+	/* Loop left early => some port had a positive LID => active. */
+	if (p <= HFI_MAX_PORT)
+	{
+		return 1;
+	}
+
+	/* Otherwise propagate the last hfi_get_port_lid() result. */
+	return rv;
+}
+
+/* get the number of contexts from the unit id. */
+/* Returns 0 if no unit or no match. */
+int hfi_get_num_contexts(int unit_id)
+{
+	int n = 0;
+	int units;
+	int64_t val;
+	uint32_t p = HFI_MIN_PORT;
+
+	units = hfi_get_num_units();
+
+	if_pf(units <= 0)
+		return 0;
+
+	if (unit_id == HFI_UNIT_ID_ANY) {
+		uint32_t u;
+
+		/* Sum "nctxts" over every unit that has at least one port
+		 * with a positive LID. */
+		for (u = 0; u < units; u++) {
+			for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++)
+				if (hfi_get_port_lid(u, p) > 0)
+					break;
+
+			/* p <= HFI_MAX_PORT means the loop broke early,
+			 * i.e. an active port was found. */
+			if (p <= HFI_MAX_PORT &&
+			    !hfi_sysfs_unit_read_s64(u, "nctxts", &val, 0))
+				n += (uint32_t) val;
+		}
+	} else {
+		/* Single unit: same check, just for unit_id. */
+		for (; p <= HFI_MAX_PORT; p++)
+			if (hfi_get_port_lid(unit_id, p) > 0)
+				break;
+
+		if (p <= HFI_MAX_PORT &&
+		    !hfi_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0))
+			n += (uint32_t) val;
+	}
+
+	return n;
+}
+
+/* Given a unit number and port number, returns 1 if the unit and port are active.
+   returns 0 if the unit and port are not active.
+   returns -1 when an error occurred. */
+int hfi_get_port_active(int unit, int port)
+{
+	int ret;
+	char *state;
+
+	ret = hfi_sysfs_port_read(unit, port, "phys_state", &state);
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		else
+			_HFI_DBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		return -1;
+	} else {
+		/* sysfs reports the physical state as a string; only the
+		 * exact prefix "5: LinkUp" counts as active. */
+		if (strncmp(state, "5: LinkUp", 9)) {
+			_HFI_DBG("Link is not Up for unit %u:%u\n", unit, port);
+			free(state);
+			return 0;
+		}
+		free(state);
+		return 1;
+	}
+}
+
+/* Given the unit number, return an error, or the corresponding LID
+   For now, it's used only so the MPI code can determine it's own
+   LID, and which other LIDs (if any) are also assigned to this node
+   Returns an int, so -1 indicates an error.  0 may indicate that
+   the unit is valid, but no LID has been assigned.
+   Returns -2 when the port is not active.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+int hfi_get_port_lid(int unit, int port)
+{
+	int ret;
+	int64_t val;
+
+	if (hfi_get_port_active(unit,port) != 1)
+		return -2;
+	ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0);
+	_HFI_VDBG("hfi_get_port_lid: ret %d, unit %d port %d\n", ret, unit,
+		  port);
+
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG("Failed to get LID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get LID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		ret = val;
+
+/* disable this feature since we don't have a way to provide
+   file descriptor in multiple context case. */
+#if 0
+		if (getenv("HFI_DIAG_LID_LOOP")) {
+			/* provides diagnostic ability to run MPI, etc. even */
+			/* on loopback, by claiming a different LID for each context */
+			struct hfi1_ctxt_info info;
+			struct hfi1_cmd cmd;
+			cmd.type = PSMI_HFI_CMD_CTXT_INFO;
+			cmd.cmd.ctxt_info = (uintptr_t) &info;
+			if (__hfi_lastfd == -1)
+				_HFI_INFO
+				    ("Can't run CONTEXT_INFO for lid_loop, fd not set\n");
+			else if (write(__hfi_lastfd, &cmd, sizeof(cmd)) == -1)
+				_HFI_INFO("CONTEXT_INFO command failed: %s\n",
+					  strerror(errno));
+			else if (!info.context)
+				_HFI_INFO("CONTEXT_INFO returned context 0!\n");
+			else {
+				_HFI_PRDBG
+				    ("Using lid 0x%x, base %x, context %x\n",
+				     ret + info.context, ret, info.context);
+				ret += info.context;
+			}
+		}
+#endif
+	}
+
+	return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding GID
+   For now, it's used only so the MPI code can determine its fabric ID.
+   Returns an int, so -1 indicates an error.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+int hfi_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo)
+{
+	int ret;
+	char *gid_str = NULL;
+
+	ret = hfi_sysfs_port_read(unit, port, "gids/0", &gid_str);
+
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single
+			 * port chips */
+			_HFI_VDBG("Failed to get GID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get GID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		/* The GID is formatted as eight colon-separated 16-bit hex
+		 * groups; pack groups 0-3 into *hi and 4-7 into *lo. */
+		int gid[8];
+		if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x",
+			   &gid[0], &gid[1], &gid[2], &gid[3],
+			   &gid[4], &gid[5], &gid[6], &gid[7]) != 8) {
+			_HFI_DBG("Failed to parse GID for unit %u:%u: %s\n",
+				 unit, port, gid_str);
+			ret = -1;
+		} else {
+			*hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1])
+							     << 32) |
+			    (((uint64_t)
+			      gid[2]) << 16) | (((uint64_t) gid[3]) << 0);
+			*lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5])
+							     << 32) |
+			    (((uint64_t)
+			      gid[6]) << 16) | (((uint64_t) gid[7]) << 0);
+		}
+		free(gid_str);
+	}
+
+	return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding LMC value
+ for the port */
+/* Returns an int, so -1 indicates an error. 0 */
+int hfi_get_port_lmc(int unit, int port)
+{
+ int ret;
+ int64_t val;
+
+ ret = hfi_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0);
+
+ if (ret == -1) {
+ _HFI_INFO("Failed to get LMC for unit %u:%u: %s\n",
+ unit, port, strerror(errno));
+ } else
+ ret = val;
+
+ return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding link rate
+   for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port)
+{
+	int ret;
+	double rate;
+	char *data_rate = NULL, *newptr;
+
+	ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate);
+	if (ret == -1)
+		goto get_port_rate_error;
+
+	rate = strtod(data_rate, &newptr);
+	if ((rate == 0) && (data_rate == newptr)) {
+		/* Bug fix: on a parse failure the old code leaked
+		 * 'data_rate' and returned the (non-negative) read result
+		 * instead of an error. */
+		ret = -1;
+		goto get_port_rate_error;
+	}
+
+	free(data_rate);
+	/* Truncate to a whole number of Gb/s (drops any .5 fraction). */
+	return ((int)(rate * 2) >> 1);
+
+get_port_rate_error:
+	free(data_rate);	/* free(NULL) is a harmless no-op */
+	_HFI_INFO("Failed to get link rate for unit %u:%u: %s\n",
+		  unit, port, strerror(errno));
+
+	return ret;
+}
+
+/* Look up the SM-programmed SC for the given SL on unit:port.
+ * Returns the SC value, or -1 on error. */
+int hfi_get_port_sl2sc(int unit, int port, int sl)
+{
+	char attr[16];
+	int64_t sc_val;
+	int rc;
+
+	snprintf(attr, sizeof(attr), "sl2sc/%d", sl);
+	rc = hfi_sysfs_port_read_s64(unit, port, attr, &sc_val, 0);
+	if (rc != -1)
+		return (int)sc_val;
+
+	_HFI_DBG
+	    ("Failed to get SL2SC mapping for SL %d unit %u:%u: %s\n",
+	     sl, unit, port, strerror(errno));
+	return rc;
+}
+
+/* Given a unit, port and SC, return an error, or the corresponding VL for the
+   SC as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_sc2vl(int unit, int port, int sc)
+{
+	int ret;
+	int64_t val;
+	/* NOTE(review): 16 bytes holds "sc2vl/" plus up to 9 digits;
+	 * larger sc values would be truncated -- confirm table bounds. */
+	char sc2vlpath[16];
+
+	snprintf(sc2vlpath, sizeof(sc2vlpath), "sc2vl/%d", sc);
+	ret = hfi_sysfs_port_read_s64(unit, port, sc2vlpath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get SC2VL mapping for SC %d unit %u:%u: %s\n",
+		     sc, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* Given a unit, port and VL, return an error, or the corresponding MTU for the
+   VL as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_vl2mtu(int unit, int port, int vl)
+{
+	int ret;
+	int64_t val;
+	/* Attribute name buffer, e.g. "vl2mtu/3". */
+	char vl2mtupath[16];
+
+	snprintf(vl2mtupath, sizeof(vl2mtupath), "vl2mtu/%d", vl);
+	ret = hfi_sysfs_port_read_s64(unit, port, vl2mtupath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get VL2MTU mapping for VL %d unit %u:%u: %s\n",
+		     vl, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* Given a unit, port and index, return an error, or the corresponding pkey
+   value for the index as programmed by the SM */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_index2pkey(int unit, int port, int index)
+{
+	int ret;
+	int64_t val;
+	/* Attribute name buffer, e.g. "pkeys/0". */
+	char index2pkeypath[16];
+
+	snprintf(index2pkeypath, sizeof(index2pkeypath), "pkeys/%d", index);
+	ret = hfi_sysfs_port_read_s64(unit, port, index2pkeypath, &val, 0);
+
+	if (ret == -1) {
+		_HFI_DBG
+		    ("Failed to get index2pkey mapping for index %d unit %u:%u: %s\n",
+		     index, unit, port, strerror(errno));
+	} else
+		ret = val;
+
+	return ret;
+}
+
+/* These have been fixed to read the values, but they are not
+ * compatible with the hfi driver, they return new info with
+ * the qib driver
+ */
+/* Count newline-terminated names in the NUL-terminated buffer; a
+ * trailing fragment without '\n' is not counted. */
+static int hfi_count_names(const char *namep)
+{
+	int count = 0;
+	const char *p;
+
+	for (p = namep; *p != '\0'; p++)
+		if (*p == '\n')
+			count++;
+	return count;
+}
+
+/* Fetch the driver statistics name list into *namep (caller frees) and
+ * return the number of names, or -1 on read failure. */
+int hfi_get_stats_names(char **namep)
+{
+	int rc = hfi_hfifs_read("driver_stats_names", namep);
+
+	if (rc < 0)
+		return -1;
+	return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' 64-bit driver statistics into 's'; returns how
+ * many counters were actually read, or -1 on failure. */
+int hfi_get_stats(uint64_t *s, int nelem)
+{
+	int nbytes = hfi_hfifs_rd("driver_stats", s, nelem * sizeof(*s));
+
+	if (nbytes < 0)
+		return -1;
+	return nbytes / sizeof(*s);
+}
+
+/* Fetch the per-unit counter name list into *namep (caller frees);
+ * returns the number of names, or -1 on read failure. */
+int hfi_get_ctrs_unit_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "counter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' per-unit 64-bit counters into 'c'; returns how
+ * many were actually read, or -1 on failure. */
+int hfi_get_ctrs_unit(int unitno, uint64_t *c, int nelem)
+{
+	int i;
+	i = hfi_hfifs_unit_rd(unitno, "counters", c, nelem * sizeof(*c));
+	if (i < 0)
+		return -1;
+	else
+		return i / sizeof(*c);
+}
+
+/* Fetch the per-port counter name list into *namep (caller frees);
+ * returns the number of names, or -1 on read failure. */
+int hfi_get_ctrs_port_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "portcounter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+/* Read up to 'nelem' 64-bit counters for one port of a unit into 'c';
+ * returns how many counters were actually read, or -1 on failure. */
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem)
+{
+	char attr[32];
+	int nbytes;
+
+	/* The per-port counter attribute is named "port<N>counters". */
+	snprintf(attr, sizeof(attr), "port%dcounters", port);
+	nbytes = hfi_hfifs_unit_rd(unitno, attr, c, nelem * sizeof(*c));
+	if (nbytes < 0)
+		return -1;
+	return nbytes / sizeof(*c);
+}
+
+/* Read the binary congestion-control settings for unit:port into ccabuf.
+ * Returns 1 on success, 0 when the attribute is absent or short (caller
+ * falls back to static CCA settings). */
+int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf)
+{
+	int fd;
+	size_t count;
+/*
+ * Check qib driver CCA setting, and try to use it if available.
+ * Fall to self CCA setting if errors.
+ */
+	/* NOTE(review): the attribute path is sprintf'ed into the caller's
+	 * ccabuf with no bound, and 198 payload bytes are then read over
+	 * it -- callers must size ccabuf for both; confirm at call sites. */
+	sprintf(ccabuf, HFI_CLASS_PATH "_%d/ports/%d/CCMgtA/cc_settings_bin",
+		unit, port);
+	fd = open(ccabuf, O_RDONLY);
+	if (fd < 0) {
+		return 0;
+	}
+	/*
+	 * 4 bytes for 'control map'
+	 * 2 bytes 'port control'
+	 * 32 (#SLs) * 6 bytes 'congestion setting' (per-SL)
+	 */
+	count = 4 + 2 + (32 * 6);
+	if (read(fd, ccabuf, count) != count) {
+		_HFI_CCADBG("Read cc_settings_bin failed. using static CCA\n");
+		close(fd);
+		return 0;
+	}
+
+	close(fd);
+
+	return 1;
+}
+
+/* Read the binary congestion-control table for unit:port into a
+ * malloc'ed array returned via *cctp (caller frees on success).
+ * Returns ccti_limit (the table's top index, >= 63) on success,
+ * 0 to fall back to static CCA, or -1 on allocation failure. */
+int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp)
+{
+	int i;
+	unsigned short ccti_limit;
+	uint16_t *cct;
+	int fd;
+	char pathname[256];
+
+	*cctp = NULL;
+	sprintf(pathname, HFI_CLASS_PATH "_%d/ports/%d/CCMgtA/cc_table_bin",
+		unit, port);
+	fd = open(pathname, O_RDONLY);
+	if (fd < 0) {
+		_HFI_CCADBG("Open cc_table_bin failed. using static CCA\n");
+		return 0;
+	}
+	/* The file starts with the 16-bit top index of the table. */
+	if (read(fd, &ccti_limit, sizeof(ccti_limit)) != sizeof(ccti_limit)) {
+		_HFI_CCADBG("Read ccti_limit failed. using static CCA\n");
+		close(fd);
+		return 0;
+	}
+
+	_HFI_CCADBG("ccti_limit = %d\n", ccti_limit);
+
+	if (ccti_limit < 63) {
+		_HFI_CCADBG("Read ccti_limit %d not in range [63, 65535], "
+			    "using static CCA.\n", ccti_limit);
+		close(fd);
+		return 0;
+	}
+
+	/* ccti_limit is the last valid index, so the table holds
+	 * ccti_limit + 1 16-bit entries. */
+	i = (ccti_limit + 1) * sizeof(uint16_t);
+	cct = malloc(i);
+	if (!cct) {
+		close(fd);
+		return -1;
+	}
+	if (read(fd, cct, i) != i) {
+		_HFI_CCADBG("Read ccti_entry_list, using static CCA\n");
+		free(cct);
+		close(fd);
+		return 0;
+	}
+
+	close(fd);
+
+	_HFI_CCADBG("cct[0] = 0x%04x\n", cct[0]);
+
+	*cctp = cct;
+	return ccti_limit;
+}
+
+/*
+ * This is for diag function hfi_wait_for_packet() only
+ */
+int hfi_cmd_wait_for_packet(int fd)
+{
+	/* Block up to half a second waiting for inbound data on the fd. */
+	struct pollfd pollset = { .fd = fd, .events = POLLIN };
+
+	return poll(&pollset, 1, 500 /* ms */);
+}
diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c
new file mode 100644
index 0000000..f0cec91
--- /dev/null
+++ b/opa/opa_sysfs.c
@@ -0,0 +1,854 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains a simple sysfs interface used by the low level
+ hfi protocol code. It also implements the interface to hfifs. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "opa_service.h"
+
+/* Cached sysfs/hfifs locations, resolved once in sysfs_init(). */
+static char *sysfs_path;
+static size_t sysfs_path_len;
+static char *hfifs_path;
+/* System page size; sysfs attributes are read/written a page at a time. */
+static long sysfs_page_size;
+
+/* Runs before main(): resolve the sysfs and hfifs base paths
+ * (overridable via HFI_SYSFS_PATH / HFI_HFIFS_PATH environment
+ * variables) and cache the system page size. */
+static void __attribute__ ((constructor)) sysfs_init(void)
+{
+	struct stat s;
+	if (sysfs_path == NULL)
+		sysfs_path = getenv("HFI_SYSFS_PATH");
+	if (sysfs_path == NULL) {
+		/* Default: the class directory of unit 0. */
+		static char syspath[64];
+		snprintf(syspath, sizeof(syspath), "%s_%d", HFI_CLASS_PATH, 0);
+		sysfs_path = syspath;
+	}
+	/* A missing directory is only logged; later opens will fail. */
+	if (stat(sysfs_path, &s) || !S_ISDIR(s.st_mode))
+		_HFI_DBG("Did not find sysfs directory %s, using anyway\n",
+			 sysfs_path);
+	sysfs_path_len = strlen(sysfs_path);
+
+	if (hfifs_path == NULL)
+		hfifs_path = getenv("HFI_HFIFS_PATH");
+	if (hfifs_path == NULL)
+		hfifs_path = "/hfifs";
+
+	if (!sysfs_page_size)
+		sysfs_page_size = sysconf(_SC_PAGESIZE);
+}
+
+/* Accessor for the sysfs base directory chosen by sysfs_init(). */
+const char *hfi_sysfs_path(void)
+{
+	return sysfs_path;
+}
+
+/* Length of the cached sysfs base path (computed once in sysfs_init()). */
+size_t hfi_sysfs_path_len(void)
+{
+	return sysfs_path_len;
+}
+
+/* Accessor for the hfifs base directory chosen by sysfs_init(). */
+const char *hfi_hfifs_path(void)
+{
+	return hfifs_path;
+}
+
+/* stat() the named attribute under the sysfs base path; the return
+ * value and *sbuf come straight from stat(). */
+int hfi_sysfs_stat(const char *attr,struct stat *sbuf)
+{
+	char path[1024];
+
+	snprintf(path, sizeof(path), "%s/%s", hfi_sysfs_path(), attr);
+	return stat(path,sbuf);
+}
+
+/* Open the named attribute under the sysfs base path with 'flags'.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_open(const char *attr, int flags)
+{
+	char buf[1024];
+	int saved_errno;
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%s", hfi_sysfs_path(), attr);
+	fd = open(buf, flags);
+	/* Save errno across the debug logging below. */
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open the named attribute under the hfifs base path with 'flags'.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_hfifs_open(const char *attr, int flags)
+{
+	char buf[1024];
+	int saved_errno;
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%s", hfi_hfifs_path(), attr);
+	fd = open(buf, flags);
+	/* Save errno across the debug logging below. */
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Format 'fmt' into a page-sized stack buffer and write it to 'fd' in a
+ * single write(2).  Returns the byte count written, or -1 with errno set
+ * (EINVAL on truncation/encoding error, EAGAIN on a short write). */
+static int sysfs_vprintf(int fd, const char *fmt, va_list ap)
+{
+	char *buf;
+	int len, ret;
+	int saved_errno;
+
+	buf = alloca(sysfs_page_size);
+	len = vsnprintf(buf, sysfs_page_size, fmt, ap);
+
+	/* Bug fix: vsnprintf returns the length the output *would* have
+	 * had (excluding the NUL), so len == sysfs_page_size also means
+	 * truncation -- the old '>' test missed that case and then wrote
+	 * the truncated buffer.  A negative len signals an encoding
+	 * error and must not be passed to write(). */
+	if (len < 0 || len >= sysfs_page_size) {
+		_HFI_DBG("Attempt to write more (%d) than %ld bytes\n", len,
+			 sysfs_page_size);
+		saved_errno = EINVAL;
+		ret = -1;
+		goto bail;
+	}
+
+	ret = write(fd, buf, len);
+	saved_errno = errno;
+
+	if (ret != -1 && ret < len) {
+		_HFI_DBG("Write ran short (%d < %d)\n", ret, len);
+		saved_errno = EAGAIN;
+		ret = -1;
+	}
+
+bail:
+	errno = saved_errno;
+	return ret;
+}
+
+/* printf-style write to a sysfs driver attribute.  Returns the byte
+ * count written, or -1 with errno preserved from the failing call. */
+int hfi_sysfs_printf(const char *attr, const char *fmt, ...)
+{
+	int fd = -1;
+	va_list ap;
+	int ret = -1;
+	int saved_errno;
+
+	fd = hfi_sysfs_open(attr, O_WRONLY);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		goto bail;
+	}
+
+	va_start(ap, fmt);
+	ret = sysfs_vprintf(fd, fmt, ap);
+	saved_errno = errno;
+	va_end(ap);
+
+	if (ret == -1) {
+		_HFI_DBG("Failed to write to driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+	}
+
+bail:
+	if (fd != -1)
+		close(fd);
+
+	errno = saved_errno;
+	return ret;
+}
+
+/* Open attribute 'attr' of a specific unit.  The cached sysfs path ends
+ * in a unit number (e.g. ".../hfi1_0"); the trailing digits are stripped
+ * and the requested unit number spliced in instead.
+ * Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	int len, l;
+
+	snprintf(buf, sizeof(buf), "%s", hfi_sysfs_path());
+	/* Walk back over the trailing unit digits of the base path. */
+	len = l = strlen(buf) - 1;
+	while (l > 0 && isdigit(buf[l]))
+		l--;
+	if (l)
+		buf[++l] = 0;
+	else
+		l = len;	/* assume they know what they are doing */
+	snprintf(buf + l, sizeof(buf) - l, "%u/%s", unit, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open the numa_node attribute of 'unit', located at
+ * <parent-of-sysfs-path>/hfi1_<unit>/device/numa_node.
+ * Returns the fd, or -1 with errno preserved. */
+static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	/* dirname() may modify its argument, so work on a private copy. */
+	char *path_copy = strdup(hfi_sysfs_path());
+
+	/* Bug fix: guard against strdup() failure -- dirname(NULL) would
+	 * crash. */
+	if (path_copy == NULL) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, strerror(errno));
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "%s/hfi1_%u/device/numa_node",
+		 dirname(path_copy), unit);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	/* Bug fix: the strdup'ed copy was previously leaked on every call. */
+	free(path_copy);
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open attribute 'attr' of a specific unit and port: the trailing unit
+ * digits of the cached sysfs path are replaced, then "ports/<port>/" is
+ * appended.  Returns the fd, or -1 with errno preserved from open(). */
+int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr,
+			int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	int len, l;
+
+	snprintf(buf, sizeof(buf), "%s", hfi_sysfs_path());
+	/* Walk back over the trailing unit digits of the base path. */
+	len = l = strlen(buf) - 1;
+	while (l > 0 && isdigit(buf[l]))
+		l--;
+	if (l)
+		buf[++l] = 0;
+	else
+		l = len;	/* assume they know what they are doing */
+	snprintf(buf + l, sizeof(buf) - l, "%u/ports/%u/%s", unit, port, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n",
+			 attr, unit, port, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+/* Open attribute 'attr' of 'unit' under the hfifs base path
+ * (<hfifs>/<unit>/<attr>).  Returns the fd, or -1 with errno preserved. */
+int hfi_hfifs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%u/%s", hfi_hfifs_path(), unit, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
/*
 * printf-style write to sysfs port attribute 'attr' of unit:port.
 * Returns the sysfs_vprintf() result, or -1 if the attribute could not
 * be opened or written; errno reflects the failing call.
 */
int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
			  const char *fmt, ...)
{
	int result = -1;
	int err;
	int fd;
	va_list args;

	fd = hfi_sysfs_port_open(unit, port, attr, O_WRONLY);
	err = errno;
	if (fd != -1) {
		va_start(args, fmt);
		result = sysfs_vprintf(fd, fmt, args);
		err = errno;
		va_end(args);

		if (result == -1)
			_HFI_DBG("Failed to write to attribute '%s' of unit %d: %s\n",
				 attr, unit, strerror(errno));

		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * printf-style write to sysfs attribute 'attr' of unit 'unit'.
 * Returns the sysfs_vprintf() result, or -1 if the attribute could not
 * be opened or written; errno reflects the failing call.
 */
int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...)
{
	int result = -1;
	int err;
	int fd;
	va_list args;

	fd = hfi_sysfs_unit_open(unit, attr, O_WRONLY);
	err = errno;
	if (fd != -1) {
		va_start(args, fmt);
		result = sysfs_vprintf(fd, fmt, args);
		err = errno;
		va_end(args);

		if (result == -1)
			_HFI_DBG("Failed to write to attribute '%s' of unit %d: %s\n",
				 attr, unit, strerror(errno));

		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Read up to one sysfs page from 'fd' into a freshly malloc'ed buffer.
 * On success (return >= 0) *datap owns the buffer and the caller must
 * free it; on failure the buffer is released and *datap is left
 * untouched.  Returns the read() byte count, or -1 with errno set.
 *
 * NOTE(review): the buffer is NOT NUL-terminated here.  Callers such as
 * the *_read_s64 helpers run strtoll() over it, which relies on every
 * attribute being shorter than sysfs_page_size — TODO confirm that all
 * attributes read through this path guarantee that.
 */
static int read_page(int fd, char **datap)
{
	char *data = NULL;
	int saved_errno;
	int ret = -1;

	data = malloc(sysfs_page_size);
	saved_errno = errno;	/* only meaningful on the failure path */

	if (!data) {
		_HFI_DBG("Could not allocate memory: %s\n", strerror(errno));
		goto bail;
	}

	ret = read(fd, data, sysfs_page_size);
	saved_errno = errno;

	if (ret == -1) {
		_HFI_DBG("Read of attribute failed: %s\n", strerror(errno));
		goto bail;
	}

bail:
	if (ret == -1) {
		free(data);	/* free(NULL) is a no-op on the malloc-failure path */
	} else {
		*datap = data;
	}

	errno = saved_errno;
	return ret;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' into a freshly allocated page-sized buffer.
 * On success the caller owns and must free *datap; on failure *datap is
 * NULL and -1 is returned, with errno from the failing call.
 */
int hfi_sysfs_read(const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' of unit 'unit' into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read sysfs attribute 'attr' of unit:port into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
			char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+int hfi_sysfs_unit_write(uint32_t unit, const char *attr, const void *data,
+ size_t len)
+{
+ int fd = -1, ret = -1;
+ int saved_errno;
+
+ if (len > sysfs_page_size) {
+ _HFI_DBG("Attempt to write more (%ld) than %ld bytes\n",
+ (long)len, sysfs_page_size);
+ saved_errno = EINVAL;
+ goto bail;
+ }
+
+ fd = hfi_sysfs_unit_open(unit, attr, O_WRONLY);
+ saved_errno = errno;
+
+ if (fd == -1)
+ goto bail;
+
+ ret = write(fd, data, len);
+ saved_errno = errno;
+
+ if (ret == -1) {
+ _HFI_DBG("Attempt to write %ld bytes failed: %s\n",
+ (long)len, strerror(errno));
+ goto bail;
+ }
+
+ if (ret < len) {
+ /* sysfs routines can routine count including null byte
+ so don't return an error if it's > len */
+ _HFI_DBG
+ ("Attempt to write %ld bytes came up short (%ld bytes)\n",
+ (long)len, (long)ret);
+ saved_errno = EAGAIN;
+ ret = -1;
+ }
+
+bail:
+ if (fd != -1) {
+ close(fd);
+ }
+
+ errno = saved_errno;
+ return ret;
+}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read hfifs attribute 'attr' into a freshly allocated page-sized buffer.
 * On success the caller owns and must free *datap; on failure *datap is
 * NULL and -1 is returned with errno set by the failing call.
 */
int hfi_hfifs_read(const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
+/*
+ * On return, caller must free *datap.
+ */
/*
 * Read hfifs attribute 'attr' of unit 'unit' into a freshly allocated
 * page-sized buffer.  On success the caller owns and must free *datap;
 * on failure *datap is NULL and -1 is returned with errno set.
 */
int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **datap)
{
	int rc = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		rc = read_page(fd, datap);
		err = errno;
		close(fd);
	}
	if (rc == -1)
		*datap = NULL;

	errno = err;
	return rc;
}
+
/*
 * The _rd routines read directly into a supplied buffer,
 * unlike the _read routines.
 */
/*
 * Read up to n bytes of hfifs attribute 'attr' directly into the caller's
 * buffer (no allocation).  Returns the read() byte count or -1; errno
 * reflects the failing call.
 */
int hfi_hfifs_rd(const char *attr, void *buf, int n)
{
	int result = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_open(attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		result = read(fd, buf, n);
		err = errno;
		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Read up to n bytes of hfifs attribute 'attr' of unit 'unit' directly
 * into the caller's buffer.  Returns the read() byte count or -1; errno
 * reflects the failing call.
 */
int hfi_hfifs_unit_rd(uint32_t unit, const char *attr, void *buf, int n)
{
	int result = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY);
	err = errno;
	if (fd != -1) {
		result = read(fd, buf, n);
		err = errno;
		close(fd);
	}

	errno = err;
	return result;
}
+
/*
 * Write 'len' bytes of 'data' to hfifs attribute 'attr' of unit 'unit'.
 * Returns the byte count written, or -1 with errno set (EAGAIN for a
 * short or over-long write report).
 */
int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data,
			 size_t len)
{
	int nwritten = -1;
	int err = errno;
	int fd;

	fd = hfi_hfifs_unit_open(unit, attr, O_WRONLY);
	err = errno;
	if (fd == -1)
		goto done;

	nwritten = write(fd, data, len);
	err = errno;

	if (nwritten == -1) {
		_HFI_DBG("Attempt to write %ld bytes failed: %s\n",
			 (long)len, strerror(errno));
	} else if (nwritten != len) {
		_HFI_DBG
		    ("Attempt to write %ld bytes came up short (%ld bytes)\n",
		     (long)len, (long)nwritten);
		err = EAGAIN;
		nwritten = -1;
	}

done:
	if (fd != -1)
		close(fd);

	errno = err;
	return nwritten;
}
+
/*
 * Read sysfs attribute 'attr' and parse it as a signed 64-bit integer in
 * the given 'base' (0 = auto-detect).  Stores the value in *valp and
 * returns 0 on success, -1 on read or parse failure.
 */
int hfi_sysfs_read_s64(const char *attr, int64_t *valp, int base)
{
	char *data, *end;
	int ret;
	int saved_errno;
	long long val;

	ret = hfi_sysfs_read(attr, &data);
	saved_errno = errno;

	if (ret == -1) {
		goto bail;
	}

	/* strtoll only sets errno on failure; clear it first so a stale
	   value cannot be mistaken for ERANGE */
	errno = 0;
	val = strtoll(data, &end, base);
	saved_errno = errno;

	/* reject empty input, trailing garbage, and out-of-range values
	   (the original never checked ERANGE, silently accepting clamped
	   LLONG_MIN/LLONG_MAX results) */
	if (!*data || !(*end == '\0' || isspace(*end)) || saved_errno == ERANGE) {
		ret = -1;
		goto bail;
	}

	*valp = val;
	ret = 0;

bail:
	free(data);
	errno = saved_errno;
	return ret;
}
+
/*
 * Read sysfs attribute 'attr' of unit 'unit' and parse it as a signed
 * 64-bit integer in 'base'.  Stores the value in *valp and returns 0 on
 * success, -1 on read or parse failure.
 */
int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
			    int64_t *valp, int base)
{
	char *text = NULL;
	char *stop;
	long long parsed;
	int err;
	int rc;

	rc = hfi_sysfs_unit_read(unit, attr, &text);
	err = errno;
	if (rc == -1)
		goto out;

	parsed = strtoll(text, &stop, base);
	err = errno;

	/* accept only a non-empty number followed by NUL or whitespace */
	if (*text && (*stop == '\0' || isspace(*stop))) {
		*valp = parsed;
		rc = 0;
	} else {
		rc = -1;
	}

out:
	free(text);
	errno = err;
	return rc;
}
+
/*
 * Read the numa_node attribute of unit 'unit' into a freshly allocated
 * buffer.  On success the caller owns and must free *datap; on failure
 * *datap is NULL and -1 is returned with errno set.
 */
static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap)
{
	int result = -1;
	int pending_errno;
	int fd;

	fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY);
	pending_errno = errno;

	if (fd != -1) {
		result = read_page(fd, datap);
		if (result == -1)
			*datap = NULL;
		pending_errno = errno;
		close(fd);
	}

	errno = pending_errno;
	return result;
}
+
/*
 * Return the NUMA node of unit 'unit' as parsed from sysfs, or -1 on
 * read/parse failure.  On a read failure the caller sees the pre-call
 * errno, matching the historical behavior.
 */
int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit)
{
	int64_t node_id = -1;
	char *text = NULL;
	char *stop;
	long long parsed;
	int restore_errno = errno;

	if (hfi_sysfs_unit_read_node(unit, &text) != -1) {
		parsed = strtoll(text, &stop, 0);
		restore_errno = errno;
		/* accept only a non-empty number ending in NUL/whitespace */
		if (*text && (*stop == '\0' || isspace(*stop)))
			node_id = (int64_t) parsed;
	}

	free(text);
	errno = restore_errno;
	return node_id;
}
+
/*
 * Read sysfs attribute 'attr' of unit:port and parse it as a signed
 * 64-bit integer in 'base'.  Stores the value in *valp and returns 0 on
 * success, -1 on read or parse failure.
 */
int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
			    int64_t *valp, int base)
{
	char *data, *end;
	int saved_errno;
	long long val;
	int ret;

	ret = hfi_sysfs_port_read(unit, port, attr, &data);
	saved_errno = errno;

	if (ret == -1) {
		goto bail;
	}

	/* strtoll only sets errno on failure; clear it first so a stale
	   value cannot be mistaken for ERANGE */
	errno = 0;
	val = strtoll(data, &end, base);
	saved_errno = errno;

	/* reject empty input, trailing garbage, and out-of-range values
	   (the original never checked ERANGE) */
	if (!*data || !(*end == '\0' || isspace(*end)) || saved_errno == ERANGE) {
		ret = -1;
		goto bail;
	}

	*valp = val;
	ret = 0;

bail:
	free(data);
	errno = saved_errno;
	return ret;
}
diff --git a/opa/opa_syslog.c b/opa/opa_syslog.c
new file mode 100644
index 0000000..ccd39c5
--- /dev/null
+++ b/opa/opa_syslog.c
@@ -0,0 +1,113 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "opa_user.h"
+
+#define SYSLOG_MAXLEN 512
+
+extern char *__hfi_mylabel;
+
+void
+hfi_vsyslog(const char *prefix, int to_console, int level,
+ const char *format, va_list ap)
+{
+ char logprefix[SYSLOG_MAXLEN];
+ size_t len;
+
+ if (to_console) {
+ char hostname[80];
+ va_list ap_cons;
+ va_copy(ap_cons, ap);
+ len = strlen(format);
+ gethostname(hostname, sizeof(hostname));
+ hostname[sizeof(hostname) - 1] = '\0';
+
+ if (__hfi_mylabel)
+ fprintf(stderr, "%s", __hfi_mylabel);
+ else
+ fprintf(stderr, "%s: ", hostname);
+
+ vfprintf(stderr, format, ap_cons);
+ if (format[len] != '\n')
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ va_end(ap_cons);
+ }
+
+ len = snprintf(logprefix, sizeof(logprefix),
+ "(hfi/%s)[%d]: %s", prefix ? prefix : "hfi",
+ (int)getpid(), format);
+
+ vsyslog(level | LOG_USER, logprefix, ap);
+
+ return;
+}
+
/*
 * printf-style front end for hfi_vsyslog(); see that function for the
 * prefix/console/level semantics.
 */
void
hfi_syslog(const char *prefix, int to_console, int level,
	   const char *format, ...)
{
	va_list args;

	va_start(args, format);
	hfi_vsyslog(prefix, to_console, level, format, args);
	va_end(args);
}
diff --git a/opa/opa_time.c b/opa/opa_time.c
new file mode 100644
index 0000000..1b636ed
--- /dev/null
+++ b/opa/opa_time.c
@@ -0,0 +1,284 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "opa_user.h"
+
+/* init the cycle counter to picosecs/cycle conversion automatically */
+/* at program startup, if it's using timing functions. */
+static void init_picos_per_cycle(void) __attribute__ ((constructor));
+static int hfi_timebase_isvalid(uint32_t pico_per_cycle);
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle);
+
+/* in case two of our mechanisms fail */
+#define SAFEDEFAULT_PICOS_PER_CYCLE 500
+
+uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+
+/* This isn't perfect, but it's close enough for rough timing. We want this
+ to work on systems where the cycle counter isn't the same as the clock
+ frequency.
+ __hfi_pico_per_cycle isn't going to lead to completely accurate
+ conversions from timestamps to nanoseconds, but it's close enough for
+ our purposes, which is mainly to allow people to show events with nsecs
+ or usecs if desired, rather than cycles. We use it in some performance
+ analysis, but it has to be done with care, since cpuspeed can change,
+ different cpu's can have different speeds, etc.
+
+ Some architectures don't have their TSC-equivalent running at anything
+ related to the processor speed (e.g. G5 Power systems use a fixed
+ 33 MHz frequency).
+*/
+
+#define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */
+
+static int timebase_debug; /* off by default */
+
+#define timebase_warn_always(fmt, ...) \
+ hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__)
+#define timebase_warn(fmt, ...) if (timebase_debug) \
+ timebase_warn_always(fmt, ##__VA_ARGS__)
+
/*
 * Sanity-check a measured picoseconds-per-cycle value.  On x86 we accept
 * roughly 1 GHz .. 5 GHz clocks (198..1005 ps/cycle, with slop for HPET
 * jitter); on other architectures no value is considered valid.
 */
static int hfi_timebase_isvalid(uint32_t pico_per_cycle)
{
#if defined(__x86_64__) || defined(__i386__)
	/* If pico-per-cycle is less than 200, the clock speed would be greater
	 * than 5 GHz.  Similarly, we minimally support a 1GHz clock.
	 * Allow some slop, because newer kernels with HPET can be a few
	 * units off, and we don't want to spend the startup time needlessly */
	if (pico_per_cycle >= 198 && pico_per_cycle <= 1005)
		return 1;
#endif
	/* The original placed an 'else' after the #endif, leaving a dangling
	   'else' that fails to compile on non-x86 targets. */
	return 0;
}
+
+/*
+ * Method #1:
+ *
+ * Derive the pico-per-cycle by trying to correlate the difference between two
+ * reads of the tsc counter to gettimeofday.
+ */
static void init_picos_per_cycle()
{
	struct timeval tvs, tve;
	int64_t usec = 0;	/* despite the name, accumulates picoseconds */
	uint64_t ts, te;
	int64_t delta;
	uint32_t picos = 0;
	int trials = 0;
	int retry = 0;
	cpu_set_t cpuset, cpuset_saved;
	int have_cpuset = 1;

	/*
	 * Make sure we try to calculate the cycle time without being migrated.
	 */
	CPU_ZERO(&cpuset_saved);
	if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved))
		have_cpuset = 0;
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset))
		have_cpuset = 0;

	/*
	 * If we set affinity correctly, give the scheduler another chance to put
	 * us on processor 0
	 */
	if (have_cpuset)
		sched_yield();

retry_pico_test:
	/* after 10 failed calibration attempts, fall back to /proc/cpuinfo */
	if (++retry == 10) {
		__hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos);
		goto reset_cpu_mask;	/* Reset CPU mask before exiting */
	}

	usec = 0;
	gettimeofday(&tvs, NULL);
	ts = get_cycles();
	while (usec < MIN_TEST_TIME_IN_PICOS) {	/* wait for at least 100 millisecs */
		trials++;
		usleep(125);
		gettimeofday(&tve, NULL);
		/* scale both components to picoseconds to compare against
		   MIN_TEST_TIME_IN_PICOS */
		usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) +
		    1000000000000LL * (tve.tv_sec - tvs.tv_sec);
		if (usec < 0) {
			/* wall clock stepped backwards; restart the trial */
			timebase_warn
			    ("RTC timebase, gettimeofday is negative (!) %lld\n",
			     (long long)usec);
			goto retry_pico_test;
		}
	}
	te = get_cycles();
	delta = te - ts;	/* elapsed cycles over the same interval */
	picos = (uint32_t) (usec / delta);

	if (!hfi_timebase_isvalid(picos)) {
		/* diagnose whether we were migrated off CPU 0 mid-test */
		cpu_set_t cpuget;
		int affinity_valid =
		    !sched_getaffinity(0, sizeof(cpuget), &cpuget);
		if (affinity_valid && !CPU_ISSET(0, &cpuget))
			affinity_valid = 0;
		timebase_warn
		    ("Failed to get valid RTC timebase, gettimeofday delta=%lld, "
		     "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n",
		     (long long)usec, (long long)delta, picos,
		     affinity_valid ? "YES" : "NO", retry);
		goto retry_pico_test;
	}

	/* If we've had to retry even once, let that be known */
	if (retry > 1)
		timebase_warn("Clock is %d picos/cycle found in %d trials and "
			      "%.3f seconds (retry=%d)\n", picos, trials,
			      (double)usec / 1.0e12, retry);

	__hfi_pico_per_cycle = picos;

reset_cpu_mask:
	/* Restore affinity */
	if (have_cpuset) {
		sched_setaffinity(0, sizeof(cpuset), &cpuset_saved);
		/*
		 * Give a chance to other processes that also set affinity to 0 for
		 * doing this test.
		 */
		sched_yield();
	}
}
+
+/*
+ * Method #2:
+ *
+ * Derive the pico-per-cycle from /proc instead of using sleep trick
+ * that relies on scheduler.
+ */
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle)
+{
+ /* we only validate once */
+ uint32_t new_pico_per_cycle = old_pico_per_cycle;
+
+ char hostname[80];
+ gethostname(hostname, 80);
+ hostname[sizeof(hostname) - 1] = '\0';
+
+ if (getenv("HFI_DEBUG_TIMEBASE"))
+ timebase_debug = 1;
+
+ /* If the old one is valid, don't bother with this mechanism */
+ if (hfi_timebase_isvalid(old_pico_per_cycle))
+ return old_pico_per_cycle;
+
+#if defined(__x86_64__) || defined(__i386__)
+ {
+ FILE *fp = fopen("/proc/cpuinfo", "r");
+ char input[255];
+ char *p = NULL;
+
+ if (!fp)
+ goto fail;
+
+ while (!feof(fp) && fgets(input, 255, fp)) {
+ if (strstr(input, "cpu MHz")) {
+ p = strchr(input, ':');
+ double MHz = 0.0;
+ if (p)
+ MHz = atof(p + 1);
+ new_pico_per_cycle =
+ (uint32_t) (1000000. / MHz);
+ break;
+ }
+ }
+ fclose(fp);
+ if (!p)
+ goto fail;
+ }
+#endif
+
+ /* If there's no change (within a small range), just return the old one */
+ if (abs(new_pico_per_cycle - old_pico_per_cycle) < 5)
+ return old_pico_per_cycle;
+
+ if (hfi_timebase_isvalid(new_pico_per_cycle)) {
+ timebase_warn_always
+ ("RTC timebase, using %d picos/cycle from /proc "
+ "instead of the detected %d picos/cycle\n",
+ new_pico_per_cycle, old_pico_per_cycle);
+ return new_pico_per_cycle;
+ }
+
+fail:
+ new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+ timebase_warn_always
+ ("Problem obtaining CPU time base, detected to be %d "
+ "pico/cycle, adjusted to safe default %d picos/cycle",
+ old_pico_per_cycle, new_pico_per_cycle);
+ return new_pico_per_cycle;
+}
diff --git a/opa/opa_utils.c b/opa/opa_utils.c
new file mode 100644
index 0000000..2b66b77
--- /dev/null
+++ b/opa/opa_utils.c
@@ -0,0 +1,425 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low */
+/* level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <time.h>
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "ipserror.h"
+#include "opa_user.h"
+
+/* keep track whether we disabled mmap in malloc */
+int __hfi_malloc_no_mmap = 0;
+
+/* touch the pages, with a 32 bit read */
/* touch the pages, with a 32 bit read, to fault them into memory */
void hfi_touch_mmap(void *m, size_t bytes)
{
	volatile uint32_t *b = (volatile uint32_t *)m, c;
	size_t i;		/* m is always page aligned, so pgcnt exact */
	long __hfi_pg_sz;

	/* First get the page size; fall back to 4 KiB if sysconf() fails so
	   the divisions below never see zero or a negative value */
	__hfi_pg_sz = sysconf(_SC_PAGESIZE);
	if (__hfi_pg_sz <= 0)
		__hfi_pg_sz = 4096;

	_HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n",
		  (unsigned long)bytes / __hfi_pg_sz, m);
	bytes /= sizeof(c);
	/* one 32-bit read per page suffices to fault the page in */
	for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c))
		c = b[i];
	(void)c;		/* value intentionally unused; the read is the point */
}
+
+/* flush the eager buffers, by setting the eager index head to eager index tail
+ if eager buffer queue is full.
+
+ Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR
+ was set in RHF errors), and no good eager packets were received, so
+ that eager head wasn't advanced. */
+
+void hfi_flush_egr_bufs(struct _hfi_ctrl *ctrl)
+{
+ uint64_t head = __le64_to_cpu(*ctrl->__hfi_rcvegrhead);
+ uint64_t tail = __le64_to_cpu(*ctrl->__hfi_rcvegrtail);
+
+ if ((head % ctrl->__hfi_tidegrcnt) ==
+ ((tail + 1) % ctrl->__hfi_tidegrcnt)) {
+ _HFI_DBG
+ ("eager array full after overflow, flushing (head %llx, tail %llx\n",
+ (long long)head, (long long)tail);
+ *ctrl->__hfi_rcvegrhead = __cpu_to_le64(tail);
+ }
+}
+
+/* stop_start == 0 disables receive on the context, for use in queue
+ overflow conditions. stop_start==1 re-enables, to be used to
+ re-init the software copy of the head register */
+int hfi_manage_rcvq(struct _hfi_ctrl *ctrl, uint32_t stop_start)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_RECV_CTRL;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) stop_start;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver */
+ _HFI_INFO("manage rcvq failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* ack event bits, and clear them. Usage is check *spi_sendbuf_status,
+ pass bits you are prepared to handle to hfi_event_ack(), perform the
+ appropriate actions for bits that were set, and then (if appropriate)
+ check the bits again. */
+int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_ACK_EVENT;
+ cmd.len = 0;
+ cmd.addr = ackbits;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver. */
+ _HFI_DBG("event ack failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* Tell the driver to change the way packets can generate interrupts.
+
+ HFI1_POLL_TYPE_URGENT: Generate interrupt only when packet sets
+ HFI_KPF_INTR
+ HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on).
+
+ PSM: Uses TYPE_URGENT in ips protocol
+*/
+int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_POLL_TYPE;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) poll_type;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL) /* not implemented in driver */
+ _HFI_INFO("poll type failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/* set the send context pkey to check BTH pkey in each packet.
+ driver should check its pkey table to see if it can find
+ this pkey, if not, driver should return error. */
+int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey)
+{
+ struct hfi1_cmd cmd;
+
+ cmd.type = PSMI_HFI_CMD_SET_PKEY;
+ cmd.len = 0;
+ cmd.addr = (uint64_t) pkey;
+
+ if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL)
+ _HFI_INFO("set pkey failed: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
/* Tell the driver to reset the send context.  If the send context
   is halted, reset it; if not, return an error to the caller.
   After a context reset, the credit return should be reset to
   zero by a hardware credit-return DMA.
   The driver will return ENOLCK if the reset times out; in that
   case PSM needs to call again. */
int hfi_reset_context(struct _hfi_ctrl *ctrl)
{
	struct hfi1_cmd cmd;

	cmd.type = PSMI_HFI_CMD_CTXT_RESET;
	cmd.len = 0;
	cmd.addr = 0;

retry:
	if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) {
		/* ENOLCK means the reset timed out in the driver; retry.
		   NOTE(review): this retry is unbounded — a driver that keeps
		   returning ENOLCK would spin here forever.  Confirm whether
		   a retry cap or backoff is wanted. */
		if (errno == ENOLCK)
			goto retry;

		if (errno != EINVAL)
			_HFI_INFO("reset ctxt failed: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}
+
/* wait for a received packet for our context
   This allows us to not busy wait, if nothing has happened for a
   while, which allows better measurements of cpu utilization, and
   in some cases, slightly better performance.  Called where we would
   otherwise call sched_yield().  It is not guaranteed that a packet
   has arrived, so the normal checking loop(s) should be done.

   PSM: not used as is, PSM has its own use of polling for interrupt-only
   packets (sets hfi_poll_type to TYPE_URGENT) */
+int hfi_wait_for_packet(struct _hfi_ctrl *ctrl)
+{
+ return hfi_cmd_wait_for_packet(ctrl->fd);
+}
+
+/* These have been fixed to read the values, but they are not
+ * compatible with the hfi driver, they return new info with
+ * the qib driver
+ */
/* Count entries in a newline-separated name list (one '\n' per name). */
static int hfi_count_names(const char *namep)
{
	const char *cursor = namep;
	int count = 0;

	while ((cursor = strchr(cursor, '\n')) != NULL) {
		count++;
		cursor++;	/* resume scanning after this newline */
	}
	return count;
}
+
/*
 * Pop the next name off a newline-separated list.  Replaces the '\n'
 * with a NUL, advances *names past it, and returns the name; returns
 * NULL when no newline-terminated entry remains (mutates the list).
 */
const char *hfi_get_next_name(char **names)
{
	char *entry = *names;
	char *nl = strchr(entry, '\n');

	if (nl == NULL)
		return NULL;

	*nl = '\0';		/* terminate this entry in place */
	*names = nl + 1;	/* caller resumes at the following entry */
	return entry;
}
+
/*
 * Placeholder for releasing a name list from the stats/counter queries.
 * Names are allocated once at init time and kept for the life of the
 * process, so this intentionally does nothing; an explicit
 * "stats_type_unregister"-style hook would be the place to really free
 * them and release resources.
 */
void hfi_release_names(char *namep)
{
	(void)namep;	/* deliberately retained; see note above */
}
+
/*
 * Return the number of driver statistics names (or the callee's error
 * return); the name buffer is freed before returning.
 */
int hfi_get_stats_names_count()
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_stats_names(&namep);
	free(namep);
	return c;
}
+
/*
 * Return the number of unit counter names for 'unitno' (or the callee's
 * error return); the name buffer is freed before returning.
 */
int hfi_get_ctrs_unit_names_count(int unitno)
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_ctrs_unit_names(unitno, &namep);
	free(namep);
	return c;
}
+
/*
 * Return the number of port counter names for 'unitno' (or the callee's
 * error return); the name buffer is freed before returning.
 */
int hfi_get_ctrs_port_names_count(int unitno)
{
	/* initialize to NULL: if the callee fails without setting it,
	   free() of an indeterminate pointer is undefined behavior */
	char *namep = NULL;
	int c;

	c = hfi_get_ctrs_port_names(unitno, &namep);
	free(namep);
	return c;
}
+
/*
 * Case-insensitive exact-match lookup of 'attr' in the newline-separated
 * name list 'namep'.  Stores the matching value from 'stats' into *s and
 * returns its index (last match wins), or -1 when not found.  Consumes
 * the name list via hfi_get_next_name(), which mutates it in place.
 */
int hfi_lookup_stat(const char *attr, char *namep, uint64_t *stats,
		    uint64_t *s)
{
	const char *name;
	int found = -1;
	int attr_len = strlen(attr);
	int total = hfi_count_names(namep);
	int idx;

	for (idx = 0; idx < total; idx++) {
		name = hfi_get_next_name(&namep);
		if (name == NULL)
			break;
		/* attr_len + 1 covers the NUL, forcing a full-string match */
		if (strncasecmp(name, attr, attr_len + 1) == 0) {
			found = idx;
			*s = stats[idx];
		}
	}
	return found;
}
+
/*
 * Fetch the single driver statistic named 'attr' into *s.  Returns the
 * stat's index on success, (uint64_t)-1 on any failure.
 */
uint64_t hfi_get_single_stat(const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_stats_names(&names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_stats(values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
/*
 * Fetch the single unit counter named 'attr' for 'unit' into *s.
 * Returns the counter's index on success, (uint64_t)-1 on any failure.
 */
uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_ctrs_unit_names(unit, &names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_ctrs_unit(unit, values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
/*
 * Fetch the single port counter named 'attr' for unit:port into *s.
 * Returns the counter's index on success, -1 on any failure.
 */
int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *s)
{
	char *names = NULL;
	uint64_t *values = NULL;
	int count;
	int result = -1;

	count = hfi_get_ctrs_port_names(unit, &names);
	if (count == -1 || names == NULL)
		goto done;
	values = calloc(count, sizeof(uint64_t));
	if (values == NULL)
		goto done;
	if (hfi_get_ctrs_port(unit, port, values, count) != count)
		goto done;
	result = hfi_lookup_stat(attr, names, values, s);
done:
	/* free(NULL) is a no-op, so no guards needed */
	free(names);
	free(values);
	return result;
}
+
+/*
+ * Add a constructor function to disable mmap if asked to do so by the user
+ */
+static void init_mallopt_disable_mmap(void) __attribute__ ((constructor));
+
+static void init_mallopt_disable_mmap(void)
+{
+ char *env = getenv("HFI_DISABLE_MMAP_MALLOC");
+
+ if (env && *env) {
+ if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) {
+ __hfi_malloc_no_mmap = 1;
+ }
+ }
+
+ return;
+}
diff --git a/opa/opa_write_pio-i386.c b/opa/opa_write_pio-i386.c
new file mode 100644
index 0000000..359fdbc
--- /dev/null
+++ b/opa/opa_write_pio-i386.c
@@ -0,0 +1,305 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#include "ipserror.h"
+#include "hfi_user.h"
+
+/*
+ * These pio copy routines are here so they can be used by test code, as well
+ * as by MPI, and can change independently of MPI
+*/
+
+/*
+ * for processors that may not write store buffers in the order filled,
+ * and when the store buffer is not completely filled (partial at end, or
+ * interrupted and flushed) may write the partial buffer in
+ * "random" order. requires additional serialization
+*/
+void hfi_write_pio_force_order(volatile uint32_t *piob, /* copy PBC + header + payload into a PIO buffer with full write fencing */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata)
+{
+ union hfi_pbc buf = {.qword = 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs require fence after first 32 bits of pbc write */
+ /* Can't do as uint64_t store, or compiler could reorder */
+ ips_wmb();
+ *piob++ = buf.pbcflags;
+
+ if (!pioparm->length) { /* header-only packet: fence, then write last header word to trigger send */
+ uint32_t *dhdr, dcpywords;
+ dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1;
+ hfi_dwordcpy_safe(piob, hdr, dcpywords);
+ ips_wmb();
+ dhdr = hdr;
+ piob += dcpywords;
+ dhdr += dcpywords;
+ *piob++ = *dhdr;
+ } else {
+ uint32_t *pay2 = bdata, j;
+ uint32_t len = pioparm->length;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ len >>= 2; /* byte length -> dword count */
+ if (len > 16) { /* bulk-copy all but the last (up to 16-dword) chunk */
+ uint32_t pay_words = 16 * ((len - 1) / 16);
+ hfi_dwordcpy_safe(piob, pay2, pay_words);
+ piob += pay_words;
+ pay2 += pay_words;
+ len -= pay_words;
+ }
+ /* now write the final chunk a word at a time, fence before trigger */
+ for (j = 0; j < (len - 1); j++)
+ *piob++ = *pay2++;
+ ips_wmb(); /* flush the buffer out now, so the final word is written last */
+ *piob++ = *pay2;
+ }
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ ips_wmb(); /* fence before the last CRC word */
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+void hfi_write_pio(volatile uint32_t *piob, /* PIO copy for in-order write-combining CPUs: compiler barriers only, one final fence */
+ const struct hfi_pio_params *pioparm, void *hdr, void *bdata)
+{
+ union hfi_pbc buf = { 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs needs compiler fence to prevent compiler reordering
+ the two 32 bit stores in a uint64_t, but on inorder wc systems, does
+ not need a memory fence. */
+ asm volatile ("" : : : "memory");
+ *piob++ = buf.pbcflags;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory"); /* compiler barrier only; hardware keeps store order */
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2; /* advance past the payload just copied */
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+static void hfi_write_pio_special_trigger(volatile uint32_t *piob, /* like hfi_write_pio, then pokes a magic word at 'offset' to launch the packet */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata,
+ unsigned offset)
+ __attribute__ ((always_inline));
+
+static void hfi_write_pio_special_trigger(volatile uint32_t *piob,
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata,
+ unsigned offset)
+{
+ union hfi_pbc buf = { 0 };
+ volatile uint32_t *piobs = piob; /* remember buffer start for the trigger write */
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *piob++ = buf.dword;
+ /* 32 bit programs needs compiler fence to prevent compiler reordering
+ the two 32 bit stores in a uint64_t, but on inorder wc systems, does
+ not need a memory fence. */
+ asm volatile ("" : : : "memory");
+ *piob++ = buf.pbcflags;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory");
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb(); /* everything visible before the trigger word */
+ *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC;
+ ips_wmb(); /* push the trigger itself */
+}
+
+void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, /* 2K-buffer variant: hardware-defined trigger offset 1023 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023);
+}
+
+void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, /* 4K-buffer variant: hardware-defined trigger offset 2047 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047);
+}
diff --git a/opa/opa_write_pio-x86_64.c b/opa/opa_write_pio-x86_64.c
new file mode 100644
index 0000000..1140705
--- /dev/null
+++ b/opa/opa_write_pio-x86_64.c
@@ -0,0 +1,296 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains the initialization functions used by the low
+ level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#include "ipserror.h"
+#include "opa_user.h"
+
+/*
+ * These pio copy routines are here so they can be used by test code, as well
+ * as by MPI, and can change independently of MPI
+*/
+
+/*
+ * for processors that may not write store buffers in the order filled,
+ * and when the store buffer is not completely filled (partial at end, or
+ * interrupted and flushed) may write the partial buffer in
+ * "random" order. requires additional serialization
+*/
+void hfi_write_pio_force_order(volatile uint32_t *piob, /* x86_64 variant: PBC goes out as one 64-bit store, then fence */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata)
+{
+ union hfi_pbc buf = {.qword = 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword;
+ ips_wmb(); /* pbc must be forced to be first write to chip buffer */
+ piob += 2; /* past the two dwords of PBC */
+
+ if (!pioparm->length) { /* header-only packet: fence, then write last header word to trigger send */
+ uint32_t *dhdr, dcpywords;
+ dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1;
+ hfi_dwordcpy_safe(piob, hdr, dcpywords);
+ ips_wmb();
+ dhdr = hdr;
+ piob += dcpywords;
+ dhdr += dcpywords;
+ *piob++ = *dhdr;
+ } else {
+ uint32_t *pay2 = bdata, j;
+ uint32_t len = pioparm->length;
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ len >>= 2; /* byte length -> dword count */
+ if (len > 16) { /* bulk-copy all but the last (up to 16-dword) chunk */
+ uint32_t pay_words = 16 * ((len - 1) / 16);
+ hfi_dwordcpy_safe(piob, pay2, pay_words);
+ piob += pay_words;
+ pay2 += pay_words;
+ len -= pay_words;
+ }
+ /* now write the final chunk a word at a time, fence before trigger */
+ for (j = 0; j < (len - 1); j++)
+ *piob++ = *pay2++;
+ ips_wmb(); /* flush the buffer out now, so the final word is written last */
+ *piob++ = *pay2;
+ }
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ ips_wmb(); /* fence before the last CRC word */
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * for processors that always write store buffers in the order filled,
+ * and if store buffer not completely filled (partial at end, or
+ * interrupted and flushed) always write the partial buffer in
+ * address order. Avoids serializing and flush instructions
+ * where possible.
+ */
+void hfi_write_pio(volatile uint32_t *piob, /* x86_64 in-order WC variant: compiler barriers only, one final fence */
+ const struct hfi_pio_params *pioparm, void *hdr, void *bdata)
+{
+ union hfi_pbc buf = { 0 };
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword; /* whole PBC in one 64-bit store */
+ piob += 2;
+ asm volatile ("" : : : "memory"); /* compiler barrier only; hardware keeps store order */
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+
+ asm volatile ("" : : : "memory");
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2; /* advance past the payload just copied */
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /* send it on its way, now, rather than waiting for processor to
+ * get around to flushing it */
+ ips_wmb();
+}
+
+/*
+ * here we trigger on a "special" address, so just bang it out
+ * as fast as possible...
+ */
+static void
+hfi_write_pio_special_trigger(volatile uint32_t *piob, /* like hfi_write_pio, then pokes a magic word at 'offset' to launch the packet */
+ const struct hfi_pio_params *pioparm, void *hdr,
+ void *bdata, unsigned offset)
+ __attribute__ ((always_inline));
+
+static void
+hfi_write_pio_special_trigger(volatile uint32_t *piob,
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata, unsigned offset)
+{
+ union hfi_pbc buf = { 0 };
+ volatile uint32_t *piobs = piob; /* remember buffer start for the trigger write */
+ uint32_t cksum_len = pioparm->cksum_is_valid ?
+ HFI_CRC_SIZE_IN_BYTES : 0;
+
+ buf.length = /* packet length in dwords, +1 for the PBC word itself */
+ __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len +
+ pioparm->length) >> 2) + 1);
+ if (pioparm->port > 1)
+ buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) |
+ __PBC_IBPORT | pioparm->rate);
+ else
+ buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT |
+ pioparm->rate);
+
+ *(volatile uint64_t *)piob = buf.qword; /* whole PBC in one 64-bit store */
+ piob += 2;
+ asm volatile ("" : : : "memory");
+
+ hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2);
+ piob += HFI_MESSAGE_HDR_SIZE >> 2;
+ asm volatile ("" : : : "memory");
+
+ if (pioparm->length)
+ hfi_dwordcpy_safe(piob, (uint32_t *) bdata,
+ pioparm->length >> 2);
+
+ /* If checksum is enabled insert CRC at end of packet */
+ if_pf(pioparm->cksum_is_valid) {
+ int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2;
+ int nCRC = 0;
+
+ piob += pioparm->length >> 2;
+
+ while (nCRC < (nCRCopies - 1)) {
+ *piob = pioparm->cksum;
+ piob++;
+ nCRC++;
+ }
+
+ asm volatile ("" : : : "memory");
+ *piob = pioparm->cksum;
+ }
+
+ /*
+ * flush then write "special" then flush...
+ */
+ ips_wmb(); /* everything visible before the trigger word */
+ *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC;
+ ips_wmb(); /* push the trigger itself */
+}
+
+void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, /* 2K-buffer variant: hardware-defined trigger offset 1023 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023);
+}
+
+void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, /* 4K-buffer variant: hardware-defined trigger offset 2047 dwords */
+ const struct hfi_pio_params *pioparm,
+ void *hdr, void *bdata)
+{
+ hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047);
+}
diff --git a/psm.c b/psm.c
new file mode 100644
index 0000000..16a2ceb
--- /dev/null
+++ b/psm.c
@@ -0,0 +1,732 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <dlfcn.h>
+#include "psm_user.h"
+#include "opa_revision.h"
+#include "opa_udebug.h"
+#include "psm_mq_internal.h"
+
+static int psmi_verno_major = PSM2_VERNO_MAJOR;
+static int psmi_verno_minor = PSM2_VERNO_MINOR;
+static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+static int psmi_verno_client_val;
+int psmi_epid_ver;
+
+#define PSMI_NOT_INITIALIZED 0
+#define PSMI_INITIALIZED 1
+#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init
+ * once psm_finalize has been called. */
+static int psmi_isinit = PSMI_NOT_INITIALIZED;
+
+/* Global lock used for endpoint creation and destroy
+ * (in functions psm2_ep_open and psm2_ep_close) and also
+ * for synchronization with recv_thread (so that recv_thread
+ * will not work on an endpoint which is in a middle of closing). */
+psmi_lock_t psmi_creation_lock;
+
+#ifdef PSM_CUDA
+int is_cuda_enabled;
+int device_support_gpudirect;
+int cuda_runtime_version;
+int is_driver_gpudirect_enabled;
+#endif
+
+/*
+ * Bit field that contains capability set.
+ * Each bit represents different capability.
+ * It is supposed to be filled with logical OR
+ * on conditional compilation basis
+ * along with future features/capabilities.
+ * At the very beginning we start with Multi EPs.
+ */
+uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP;
+
+int psmi_verno_client() /* version number the client claimed at psm2_init, clamped in __psm2_init */
+{
+ return psmi_verno_client_val;
+}
+
+/* This function is used to determine whether the current library build can
+ * successfully communicate with another library that claims to be version
+ * 'verno'.
+ *
+ * PSM 2.x is always ABI compatible, but this checks to see if two different
+ * versions of the library can coexist.
+ */
+int psmi_verno_isinteroperable(uint16_t verno) /* 1 iff a peer library at 'verno' can interoperate with this build */
+{
+ if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR) /* only a major-version mismatch is incompatible */
+ return 0;
+
+ return 1;
+}
+
+int MOCKABLE(psmi_isinitialized)() /* 1 only in the INITIALIZED state (not NOT_INITIALIZED or FINALIZED) */
+{
+ return (psmi_isinit == PSMI_INITIALIZED);
+}
+MOCK_DEF_EPILOGUE(psmi_isinitialized);
+
+#ifdef PSM_CUDA
+int psmi_cuda_initialize() /* dlopen CUDA libraries, resolve symbols, verify UVA on all devices; PSM2_OK or fatal error */
+{
+ psm2_error_t err = PSM2_OK;
+ int num_devices, dev;
+ struct cudaDeviceProp dev_prop;
+ char *dlerr;
+
+ PSM2_LOG_MSG("entering");
+ _HFI_VDBG("Enabling CUDA support.\n");
+
+ psmi_cuda_lib = dlopen("libcuda.so", RTLD_LAZY); /* driver API */
+ psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); /* runtime API */
+ if (!psmi_cuda_lib || !psmi_cudart_lib) {
+ dlerr = dlerror();
+ _HFI_ERROR("Unable to open libcuda.so and libcudart.so. Error %s\n",
+ dlerr ? dlerr : "no dlerror()");
+ goto fail;
+ }
+
+ psmi_cudaRuntimeGetVersion = dlsym(psmi_cudart_lib, "cudaRuntimeGetVersion"); /* resolved first so the version gate below can run */
+
+ if (!psmi_cudaRuntimeGetVersion) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+
+ PSMI_CUDA_CALL(cudaRuntimeGetVersion, &cuda_runtime_version);
+ if (cuda_runtime_version < 4010) { /* version encoded as major*1000 + minor*10, so 4010 == 4.1 */
+ _HFI_ERROR("Please update CUDA runtime, required minimum version is 4.1 \n");
+ goto fail;
+ }
+
+
+ psmi_cuCtxGetCurrent = dlsym(psmi_cuda_lib, "cuCtxGetCurrent");
+ psmi_cuCtxSetCurrent = dlsym(psmi_cuda_lib, "cuCtxSetCurrent");
+ psmi_cuPointerGetAttribute = dlsym(psmi_cuda_lib, "cuPointerGetAttribute");
+ psmi_cuPointerSetAttribute = dlsym(psmi_cuda_lib, "cuPointerSetAttribute");
+
+ psmi_cudaGetDeviceCount = dlsym(psmi_cudart_lib, "cudaGetDeviceCount");
+ psmi_cudaGetDeviceProperties = dlsym(psmi_cudart_lib, "cudaGetDeviceProperties");
+ psmi_cudaGetDevice = dlsym(psmi_cudart_lib, "cudaGetDevice");
+ psmi_cudaSetDevice = dlsym(psmi_cudart_lib, "cudaSetDevice");
+ psmi_cudaStreamCreate = dlsym(psmi_cudart_lib, "cudaStreamCreate");
+ psmi_cudaDeviceSynchronize = dlsym(psmi_cudart_lib, "cudaDeviceSynchronize");
+ psmi_cudaStreamSynchronize = dlsym(psmi_cudart_lib, "cudaStreamSynchronize");
+ psmi_cudaEventCreate = dlsym(psmi_cudart_lib, "cudaEventCreate");
+ psmi_cudaEventDestroy = dlsym(psmi_cudart_lib, "cudaEventDestroy");
+ psmi_cudaEventQuery = dlsym(psmi_cudart_lib, "cudaEventQuery");
+ psmi_cudaEventRecord = dlsym(psmi_cudart_lib, "cudaEventRecord");
+ psmi_cudaEventSynchronize = dlsym(psmi_cudart_lib, "cudaEventSynchronize");
+ psmi_cudaMalloc = dlsym(psmi_cudart_lib, "cudaMalloc");
+ psmi_cudaHostAlloc = dlsym(psmi_cudart_lib, "cudaHostAlloc");
+ psmi_cudaFreeHost = dlsym(psmi_cudart_lib, "cudaFreeHost");
+ psmi_cudaMemcpy = dlsym(psmi_cudart_lib, "cudaMemcpy");
+ psmi_cudaMemcpyAsync = dlsym(psmi_cudart_lib, "cudaMemcpyAsync");
+
+ psmi_cudaIpcGetMemHandle = dlsym(psmi_cudart_lib, "cudaIpcGetMemHandle");
+ psmi_cudaIpcOpenMemHandle = dlsym(psmi_cudart_lib, "cudaIpcOpenMemHandle");
+ psmi_cudaIpcCloseMemHandle = dlsym(psmi_cudart_lib, "cudaIpcCloseMemHandle");
+
+ if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent || /* all symbols above are mandatory */
+ !psmi_cuPointerGetAttribute || !psmi_cuPointerSetAttribute ||
+ !psmi_cudaGetDeviceCount || !psmi_cudaGetDeviceProperties ||
+ !psmi_cudaGetDevice || !psmi_cudaSetDevice ||
+ !psmi_cudaStreamCreate ||
+ !psmi_cudaDeviceSynchronize || !psmi_cudaStreamSynchronize ||
+ !psmi_cudaEventCreate || !psmi_cudaEventDestroy ||
+ !psmi_cudaEventQuery || !psmi_cudaEventRecord ||
+ !psmi_cudaEventSynchronize ||
+ !psmi_cudaMalloc || !psmi_cudaHostAlloc || !psmi_cudaFreeHost ||
+ !psmi_cudaMemcpy || !psmi_cudaMemcpyAsync || !psmi_cudaIpcGetMemHandle ||
+ !psmi_cudaIpcOpenMemHandle || !psmi_cudaIpcCloseMemHandle) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+
+ if (cuda_runtime_version > 7000) { /* cudaStreamCreateWithFlags only required on newer runtimes */
+ psmi_cudaStreamCreateWithFlags = dlsym(psmi_cudart_lib,
+ "cudaStreamCreateWithFlags");
+ if (!psmi_cudaStreamCreateWithFlags) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in CUDA libraries.\n");
+ goto fail;
+ }
+ }
+
+ /* Check if all devices support Unified Virtual Addressing. */
+ PSMI_CUDA_CALL(cudaGetDeviceCount, &num_devices);
+ for (dev = 0; dev < num_devices; dev++) {
+ PSMI_CUDA_CALL(cudaGetDeviceProperties, &dev_prop, dev);
+ if (dev_prop.unifiedAddressing != 1) { /* UVA is mandatory; any non-UVA device is fatal */
+ _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
+ goto fail;
+ }
+ /* Only devices based on Kepler and
+ * above can support GPU Direct.
+ */
+ if (dev_prop.major >= 3 && cuda_runtime_version >= 5000)
+ device_support_gpudirect = 1;
+ else {
+ device_support_gpudirect = 0; /* NOTE(review): last device wins; a mixed system disables gpudirect if the last device lacks it */
+ _HFI_INFO("Device %d does not GPUDirect RDMA (Non-fatal error) \n", dev);
+ }
+ }
+ PSM2_LOG_MSG("leaving");
+ return err;
+fail:
+ err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM2 CUDA support.\n");
+ return err;
+}
+#endif
+
+psm2_error_t __psm2_init(int *major, int *minor) /* library-wide init: version handshake, env processing, CPU detect, subsystem init */
+{
+ psm2_error_t err = PSM2_OK;
+ union psmi_envvar_val env_tmask;
+
+ psmi_log_initialize();
+
+ PSM2_LOG_MSG("entering");
+#ifdef RDPMC_PERF_FRAMEWORK
+ psmi_rdpmc_perf_framework_init();
+#endif /* RDPMC_PERF_FRAMEWORK */
+
+ GENERIC_PERF_INIT();
+
+ if (psmi_isinit == PSMI_INITIALIZED) /* idempotent: second call just reports version */
+ goto update;
+
+ if (psmi_isinit == PSMI_FINALIZED) { /* re-init after psm2_finalize is not allowed */
+ err = PSM2_IS_FINALIZED;
+ goto fail;
+ }
+
+ if (major == NULL || minor == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ psmi_init_lock(&psmi_creation_lock); /* guards ep open/close vs. recv_thread */
+
+#ifdef PSM_DEBUG
+ if (!getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n");
+#endif
+
+#ifdef PSM_PROFILE
+ if (!getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n");
+#endif
+
+ /* Make sure we complain if fault injection is enabled */
+ if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN"))
+ fprintf(stderr,
+ "!!! WARNING !!! You are running with fault injection enabled!\n");
+
+ /* Make sure, as an internal check, that this version knows how to detect
+ * compatibility with other library versions it may communicate with */
+ if (psmi_verno_isinteroperable(psmi_verno) != 1) {
+ err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "psmi_verno_isinteroperable() not updated for current version!");
+ goto fail;
+ }
+
+ /* The only way to not support a client is if the major number doesn't
+ * match */
+ if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) {
+ err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION,
+ "This library does not implement version %d.%d",
+ *major, *minor);
+ goto fail;
+ }
+
+ /* Make sure we don't keep track of a client that claims a higher version
+ * number than we are */
+ psmi_verno_client_val =
+ min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno);
+
+ /* Check to see if we need to set Architecture flags to something
+ * besides big core Xeons */
+ cpuid_t id;
+ psmi_cpu_model = CPUID_MODEL_UNDEFINED;
+
+ /* First check to ensure Genuine Intel */
+ get_cpuid(0x0, 0, &id); /* CPUID leaf 0: vendor string in ebx/ecx/edx */
+ if(id.ebx == CPUID_GENUINE_INTEL_EBX
+ && id.ecx == CPUID_GENUINE_INTEL_ECX
+ && id.edx == CPUID_GENUINE_INTEL_EDX)
+ {
+ /* Use cpuid with EAX=1 to get processor info */
+ get_cpuid(0x1, 0, &id);
+ psmi_cpu_model = CPUID_GENUINE_INTEL;
+ }
+
+ if( (psmi_cpu_model == CPUID_GENUINE_INTEL) && /* combine model + extended-model fields per CPUID convention */
+ (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON)
+ {
+ psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) |
+ ((id.eax & CPUID_EXMODEL_MASK) >> 12);
+ }
+
+ psmi_isinit = PSMI_INITIALIZED;
+ /* hfi_debug lives in libhfi.so */
+ psmi_getenv("PSM2_TRACEMASK",
+ "Mask flags for tracing",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG_FLAGS,
+ (union psmi_envvar_val)hfi_debug, &env_tmask);
+ hfi_debug = (long)env_tmask.e_ulong;
+
+ /* The "real thing" is done in hfi_proto.c as a constructor function, but
+ * we getenv it here to report what we're doing with the setting */
+ {
+ extern int __hfi_malloc_no_mmap;
+ union psmi_envvar_val env_mmap;
+ char *env = getenv("HFI_DISABLE_MMAP_MALLOC");
+ int broken = (env && *env && !__hfi_malloc_no_mmap); /* user asked but the constructor's mallopt() failed */
+ psmi_getenv("HFI_DISABLE_MMAP_MALLOC",
+ broken ? "Skipping mmap disable for malloc()" :
+ "Disable mmap for malloc()",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_YESNO,
+ (union psmi_envvar_val)0, &env_mmap);
+ if (broken)
+ _HFI_ERROR
+ ("Couldn't successfully disable mmap in mallocs "
+ "with mallopt()\n");
+ }
+
+ {
+ union psmi_envvar_val env_epid_ver; /* EPID version must lie in [MIN, MAX] supported range */
+ psmi_getenv("PSM2_ADDR_FMT",
+ "Used to force PSM2 to use a particular version of EPID",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver);
+ psmi_epid_ver = env_epid_ver.e_int;
+ if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " The max epid version supported in this version of PSM2 is %d \n"
+ "Please upgrade PSM2 \n",
+ PSMI_MAX_EPID_VERNO_SUPPORTED);
+ goto fail;
+ } else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Invalid value provided through PSM2_ADDR_FMT \n");
+ goto fail;
+ }
+ }
+
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_enable_cuda;
+ psmi_getenv("PSM2_CUDA",
+ "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)0, &env_enable_cuda);
+ is_cuda_enabled = env_enable_cuda.e_int;
+#endif
+
+ if (getenv("PSM2_IDENTIFY")) { /* print build identification banner on request */
+ Dl_info info_psm;
+ char ofed_delta[100] = "";
+ strcat(strcat(ofed_delta," built for OFED DELTA "),psmi_hfi_IFS_version);
+ printf("%s %s PSM2 v%d.%d%s\n"
+ "%s %s location %s\n"
+ "%s %s build date %s\n"
+ "%s %s src checksum %s\n"
+ "%s %s git checksum %s\n"
+ "%s %s built against driver interface v%d.%d\n",
+ hfi_get_mylabel(), hfi_ident_tag,
+ PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR,
+ (strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta
+#ifdef PSM_CUDA
+ : "-cuda",
+#else
+ : "",
+#endif
+ hfi_get_mylabel(), hfi_ident_tag, dladdr(psm2_init, &info_psm) ?
+ info_psm.dli_fname : "libpsm2 not available",
+ hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp,
+ hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum,
+ hfi_get_mylabel(), hfi_ident_tag,
+ (strcmp(psmi_hfi_git_checksum,"") != 0) ?
+ psmi_hfi_git_checksum : "<not available>",
+ hfi_get_mylabel(), hfi_ident_tag, HFI1_USER_SWMAJOR, HFI1_USER_SWMINOR);
+ }
+
+ if (getenv("PSM2_DIAGS")) {
+ _HFI_INFO("Running diags...\n");
+ psmi_diags();
+ }
+
+ psmi_multi_ep_init();
+
+ psmi_faultinj_init();
+
+ psmi_epid_init();
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED) {
+ err = psmi_cuda_initialize();
+ if (err != PSM2_OK)
+ goto fail;
+ }
+#endif
+
+update:
+ *major = (int)psmi_verno_major;
+ *minor = (int)psmi_verno_minor;
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_init)
+
+
+/* Report which of the requested capability bits (e.g. PSM2_MULTI_EP_CAP)
+ * this library instance provides: a pure bitwise AND of the caller's mask
+ * against the global capability bitset. No side effects. */
+uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask)
+{
+ return (psm2_capabilities_bitset & req_cap_mask);
+}
+PSMI_API_DECL(psm2_get_capability_mask)
+
+
+/* Finalize the PSM2 library: dump generic perf counters, gracefully close
+ * every endpoint the user left open (2x the minimum close timeout each),
+ * release epid and fault-injection state plus cached per-epid hostname
+ * strings, and mark the library finalized. Always returns PSM2_OK. */
+psm2_error_t __psm2_finalize(void)
+{
+ struct psmi_eptab_iterator itor;
+ char *hostname;
+ psm2_ep_t ep;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ GENERIC_PERF_DUMP(stderr);
+ /* Walk the list of still-open endpoints; psm2_ep_close unlinks each one
+ * so we re-read the list head every iteration. */
+ ep = psmi_opened_endpoint;
+ while (ep != NULL) {
+ psmi_opened_endpoint = ep->user_ep_next;
+ psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL,
+ 2 * PSMI_MIN_EP_CLOSE_TIMEOUT);
+ ep = psmi_opened_endpoint;
+ }
+
+ psmi_epid_fini();
+
+ psmi_faultinj_fini();
+
+ /* De-allocate memory for any allocated space to store hostnames */
+ psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME);
+ while ((hostname = psmi_epid_itor_next(&itor)))
+ psmi_free(hostname);
+ psmi_epid_itor_fini(&itor);
+
+ psmi_isinit = PSMI_FINALIZED;
+ PSM2_LOG_MSG("leaving");
+ psmi_log_fini();
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_finalize)
+
+/*
+ * Function exposed in >= 1.05
+ *
+ * Register user-supplied hostnames for a list of endpoint network ids
+ * (nids) so later diagnostics can name peers. Both arrays must have at
+ * least 'num' entries. Stops at, and returns, the first failure from
+ * psmi_epid_set_hostname; NULL arrays yield PSM2_PARAM_ERR. The final
+ * argument (1) presumably permits overwriting an existing entry —
+ * confirm against psmi_epid_set_hostname's definition.
+ */
+psm2_error_t
+__psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames)
+{
+ int i;
+ psm2_error_t err = PSM2_OK;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ if (nids == NULL || hostnames == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ for (i = 0; i < num; i++) {
+ if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1)))
+ break;
+ }
+
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_map_nid_hostname)
+
+/* Deliberate no-op retained for API/ABI compatibility: endpoint-address
+ * labels are not stored and both arguments are ignored. */
+void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label)
+{
+ PSM2_LOG_MSG("entering");
+ PSM2_LOG_MSG("leaving");
+ return; /* ignore this function */
+}
+PSMI_API_DECL(psm2_epaddr_setlabel)
+
+/* Attach an opaque user context pointer to an endpoint address by
+ * forwarding to psm2_setopt(PSM2_CORE_OPT_EP_CTXT). Note that any error
+ * returned by psm2_setopt is silently discarded here. */
+void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt)
+{
+
+ /* Eventually deprecate this API to use set/get opt as this is unsafe. */
+ PSM2_LOG_MSG("entering");
+ psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+ PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *));
+ PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_epaddr_setctxt)
+
+/* Retrieve the opaque user context pointer previously stored with
+ * __psm2_epaddr_setctxt, via psm2_getopt(PSM2_CORE_OPT_EP_CTXT).
+ * Returns NULL when the lookup fails (or when NULL was stored). */
+void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr)
+{
+ psm2_error_t err;
+ uint64_t optlen = sizeof(void *);
+ void *result = NULL;
+
+ PSM2_LOG_MSG("entering");
+ /* Eventually deprecate this API to use set/get opt as this is unsafe. */
+ err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+ PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen);
+
+ PSM2_LOG_MSG("leaving");
+
+ if (err == PSM2_OK)
+ return result;
+ else
+ return NULL;
+}
+PSMI_API_DECL(psm2_epaddr_getctxt)
+
+/* Dispatch a set-option request to the owning component (CORE, MQ, AM or
+ * IB) and return that component's status. An unrecognized component
+ * value is reported through psmi_handle_error as PSM2_PARAM_ERR. The MQ
+ * path uses the deprecated psm2_mq_setopt, which ignores optlen. */
+psm2_error_t
+__psm2_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen)
+{
+ psm2_error_t rv;
+ PSM2_LOG_MSG("entering");
+ switch (component) {
+ case PSM2_COMPONENT_CORE:
+ rv = psmi_core_setopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break; /* NOTE(review): unreachable after return, kept as-is */
+ case PSM2_COMPONENT_MQ:
+ /* Use the deprecated MQ set/get opt for now which does not use optlen */
+ rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_AM:
+ /* Hand off to active messages */
+ rv = psmi_am_setopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_IB:
+ /* Hand off to IPS ptl to set option */
+ rv = psmi_ptl_ips.setopt(component_obj, optname, optval,
+ optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ }
+
+ /* Unrecognized/unknown component */
+ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+ component);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_setopt);
+
+/* Mirror of __psm2_setopt for reading options: dispatch a get-option
+ * request to the owning component (CORE, MQ, AM or IB) and return its
+ * status; unknown components are reported as PSM2_PARAM_ERR. The MQ path
+ * uses the deprecated psm2_mq_getopt, which ignores optlen. */
+psm2_error_t
+__psm2_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen)
+{
+ psm2_error_t rv;
+
+ PSM2_LOG_MSG("entering");
+ switch (component) {
+ case PSM2_COMPONENT_CORE:
+ rv = psmi_core_getopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break; /* NOTE(review): unreachable after return, kept as-is */
+ case PSM2_COMPONENT_MQ:
+ /* Use the deprecated MQ set/get opt for now which does not use optlen */
+ rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_AM:
+ /* Hand off to active messages */
+ rv = psmi_am_getopt(component_obj, optname, optval, optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ case PSM2_COMPONENT_IB:
+ /* Hand off to IPS ptl to get option */
+ rv = psmi_ptl_ips.getopt(component_obj, optname, optval,
+ optlen);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+ break;
+ }
+
+ /* Unrecognized/unknown component */
+ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+ component);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_getopt);
+
+/* Placeholder poll handler for a ptl that can never make progress:
+ * always reports PSM2_OK_NO_PROGRESS; both arguments are unused. */
+psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly)
+{
+ PSM2_LOG_MSG("entering");
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+}
+PSMI_API_DECL(psmi_poll_noop)
+
+/* Make communication progress on every endpoint in ep's multi-context
+ * ring (linked via mctxt_next): poll each member's shared-memory (amsh)
+ * ptl, then its OPA (ips) ptl, under the MQ progress lock. Any status
+ * worse than PSM2_OK_NO_PROGRESS aborts the loop and is returned to the
+ * caller immediately. */
+psm2_error_t __psm2_poll(psm2_ep_t ep)
+{
+ psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK;
+ psm2_ep_t tmp;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ tmp = ep; /* remember the starting point of the circular list */
+ do {
+ err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */
+ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return err1;
+ }
+
+ err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */
+ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return err2;
+ }
+ ep = ep->mctxt_next;
+ } while (ep != tmp);
+
+ /* This is valid because..
+ * PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK
+ * PSM2_OK & PSM2_OK => PSM2_OK
+ * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK
+ * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */
+ PSMI_UNLOCK(ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return (err1 & err2);
+}
+PSMI_API_DECL(psm2_poll)
+
+/* Same progress loop as __psm2_poll, but for internal callers that
+ * already hold ep->mq->progress_lock (asserted below). Shared-memory
+ * (amsh) polling is performed only when poll_amsh is non-zero; the ips
+ * ptl is always polled. Any status worse than PSM2_OK_NO_PROGRESS is
+ * returned immediately. */
+psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh)
+{
+ psm2_error_t err1 = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+ psm2_ep_t tmp;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_LOCK_ASSERT(ep->mq->progress_lock);
+
+ tmp = ep; /* remember the starting point of the circular list */
+ do {
+ if (poll_amsh) {
+ err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */
+ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSM2_LOG_MSG("leaving");
+ return err1;
+ }
+ }
+
+ err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */
+ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+ PSM2_LOG_MSG("leaving");
+ return err2;
+ }
+
+ ep = ep->mctxt_next;
+ } while (ep != tmp);
+ PSM2_LOG_MSG("leaving");
+ return (err1 & err2);
+}
+PSMI_API_DECL(psmi_poll_internal)
+#ifdef PSM_PROFILE
+/* These functions each have weak symbols */
+/* Empty interposition points: an external profiler can override these
+ * weak symbols to be notified when PSM blocks, unblocks, or re-blocks
+ * while waiting for progress. */
+void psmi_profile_block()
+{
+ ; /* empty for profiler */
+}
+
+void psmi_profile_unblock()
+{
+ ; /* empty for profiler */
+}
+
+void psmi_profile_reblock(int did_no_progress)
+{
+ ; /* empty for profiler */
+}
+#endif
diff --git a/psm2.h b/psm2.h
new file mode 100644
index 0000000..3da78e3
--- /dev/null
+++ b/psm2.h
@@ -0,0 +1,1517 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_H
+#define PSM2_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2.h
+ * @page psm2_main PSM2 API
+ *
+ * @brief PSM2 OPA Messaging Library
+ *
+ * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level
+ * user-level communications interface for the OPA family of products.
+ * PSM2 users are enabled with mechanisms necessary to implement higher level
+ * communications interfaces in parallel environments.
+ *
+ * Since PSM2 targets clusters of multicore processors, it internally implements
+ * two levels of communication: intra-node shared memory communication and
+ * inter-node OPA communication. Both of these levels are encapsulated
+ * below the interface and the user is free to assume that intra-node and
+ * inter-node communication is transparently handled within PSM.
+ *
+ * @section compat Compatibility
+ *
+ * PSM2 can coexist with other QLogic/Pathscale software distributions, such as
+ * OpenIB/OpenFabrics, which allows applications to simultaneously target
+ * PSM-based and non PSM-based applications on a single node without changing
+ * any system-level configuration. However, PSM2 does not support running
+ * PSM-based and non PSM-based communication within the same user process.
+ *
+ * Except where noted, PSM2 does not assume an SPMD (single program, multiple
+ * data) parallel model and extends to MPMD (multiple program, multiple data)
+ * environments in specific areas. However, PSM2 assumes the runtime environment
+ * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness
+ * (little or big) and will fail at startup if any of these assumptions do not
+ * hold. For homogeneous systems PSM2 can run either in 32-bit or 64-bit
+ * environments. Even though both environments should expect similar
+ * performance from the API, PSM2 has chosen to favor 64-bit environments in
+ * some minor areas.
+ *
+ * @section ep_model Endpoint Communication Model
+ *
+ * PSM2 follows an endpoint communication model where an endpoint is defined as
+ * an object (or handle) instantiated to support sending and receiving messages
+ * to other endpoints. In order to prevent PSM2 from being tied to a particular
+ * parallel model (such as SPMD), control over the parallel layout of endpoints
+ * is retained by the user. Opening endpoints (@ref psm2_ep_open) and
+ * connecting endpoints to enable communication (@ref psm2_ep_connect) are two
+ * decoupled mechanisms. Users that do not dynamically change the number of
+ * endpoints beyond parallel startup will probably lump both mechanisms
+ * together at startup. Users that wish to manipulate the location and number
+ * of endpoints at runtime can do so by explicitly connecting sets or subsets
+ * of endpoints.
+ *
+ * As a side effect, this greater flexibility forces the user to cope with a
+ * two-stage initialization process. In the first stage of opening an endpoint
+ * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a
+ * globally distributable endpoint identifier (@ref psm2_epid_t). Prior to the
+ * second stage of connecting endpoints (@ref psm2_ep_connect), a user must
+ * distribute all relevant endpoint identifiers through an out-of-band
+ * mechanism. Once the endpoint identifiers are successfully distributed to
+ * all processes that wish to communicate, the user
+ * connects all endpoint identifiers to the locally opened endpoint
+ * (@ref psm2_ep_connect). In connecting the endpoints, the user obtains an
+ * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM
+ * communication primitives.
+ *
+ *
+ * @section components PSM2 Components
+ *
+ * PSM2 exposes a single endpoint initialization model, but enables various
+ * levels of communication functionality and semantics through @e components.
+ * The first major component available in PSM2 is PSM2 Matched Queues
+ * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am).
+ *
+ * Matched Queues (MQ) present a queue-based communication model with the
+ * distinction that queue consumers use a 3-tuple of metadata to match incoming
+ * messages against a list of preposted receive buffers. The MQ semantics are
+ * sufficiently akin to MPI to cover the entire MPI-1.2 standard.
+ *
+ * The Active Message (AM) component presents a request/reply model where
+ * the arrival of a message triggers the execution of consumer-provided
+ * handler code. This can be used to implement many one-sided and two-sided
+ * communications paradigms.
+ *
+ * With future releases of the PSM2 interface, more components will
+ * be exposed to accommodate users that implement parallel communication
+ * models that deviate from the Matched Queue semantics. For example, PSM
+ * plans to expose a connection management component to make it easier to
+ * handle endpoint management for clients without their own connection
+ * managers.
+ *
+ *
+ * @section progress PSM2 Communication Progress Guarantees
+ *
+ * PSM2 internally ensures progress of both intra-node and inter-node messages,
+ * but not autonomously. This means that while performance does not depend
+ * greatly on how the user decides to schedule communication progress,
+ * explicit progress calls are required for correctness. The @ref psm2_poll
+ * function is available to make progress over all PSM2 components in a generic
+ * manner. For more information on making progress over many communication
+ * operations in the MQ component, see the @ref mq_progress documentation.
+ *
+ *
+ * @section completion PSM2 Completion semantics
+ *
+ * PSM2 implements the MQ component, which documents its own
+ * message completion semantics (@ref mq_completion).
+ *
+ *
+ * @section error_handling PSM2 Error handling
+ *
+ * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error.
+ * While most errors are fatal in that the user is not expected to be able to
+ * recover from them, PSM2 still allows some level of control. By
+ * default, PSM2 returns all errors to the user but as a convenience, allows
+ * users to either defer errors internally to PSM2 or to have PSM2 return all
+ * errors to the user (callers to PSM2 functions). PSM2 attempts to deallocate
+ * its resources as a best effort, but exits are always non-collective with
+ * respect to endpoints opened in other processes. The user is expected to be
+ * able to handle non-collective exits from any endpoint and in turn cleanly
+ * and independently terminate the parallel environment. Local error handling
+ * can be handled in three modes:
+ *
+ * Errors and error handling can be individually registered either globally or
+ * per-endpoint:
+ * @li @b Per-endpoint error handling captures errors for functions where the
+ * error scoping is determined to be over an endpoint. This includes all
+ * communication functions that include an EP or MQ handle as the first
+ * parameter.
+ *
+ * @li @b Global error handling captures errors for functions where a
+ * particular endpoint cannot be identified or for @ref psm2_ep_open, where
+ * errors (if any) occur before the endpoint is opened.
+ *
+ * Error handling is controlled by registering error handlers (@ref
+ * psm2_error_register_handler). The global error handler can
+ * be set at any time (even before @ref psm2_init), whereas a per-endpoint error
+ * handler can be set as soon as a new endpoint is successfully created. If a
+ * per-endpoint handle is not registered, the per-endpoint handler inherits
+ * from the global error handler at time of open.
+ *
+ * PSM2 predefines two different mechanisms for handling errors:
+ *
+ * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER)
+ * @li No-op PSM2 error handler where errors are returned
+ * (@ref PSM2_ERRHANDLER_NO_HANDLER)
+ *
+ * The default PSM-internal error handler effectively frees the user from
+ * explicitly handling the return values of every PSM2 function but may not
+ * return to the user in a function determined to have caused a fatal error.
+ *
+ * The No-op PSM2 error handler bypasses all error handling functionality and
+ * always returns the error to the user. The user can then use @ref
+ * psm2_error_get_string to obtain a generic string from an error code (compared
+ * to a more detailed error message available through registering of error
+ * handlers).
+ *
+ * For even more control, users can register their own error handlers to have
+ * access to more precise error strings and selectively control when and when
+ * not to return to callers of PSM2 functions. All error handlers shown defer
+ * error handling to PSM2 for errors that are not recognized using @ref
+ * psm2_error_defer. Deferring an error from a custom error handler is
+ * equivalent to relying on the default error handler.
+ *
+ * @section env_var Environment variables
+ *
+ * Some PSM2 behaviour can be controlled via environment variables.
+ *
+ * @li @b PSM2_DEVICES. PSM2 implements three devices for communication which
+ * are, in order, @c self, @c shm and @c hfi. For PSM2 jobs that do not
+ * require shared-memory communications, @b PSM2_DEVICES can be specified as @c
+ * self, @c hfi. Similarly, for shared-memory only jobs, the @c hfi device
+ * can be disabled. It is up to the user to ensure that the endpoint ids
+ * passed in @ref psm2_ep_connect do not require a device that has been
+ * explicitly disabled by the user. In some instances, enabling only the
+ * devices that are required may improve performance.
+ *
+ * @li @b PSM2_TRACEMASK. Depending on the value of the tracemask, various parts
+ * of PSM2 will output debugging information. With a default value of @c 0x1,
+ * informative messages will be printed (this value should be considered a
+ * minimum). At @c 0x101, startup and finalization messages are added to the
+ * output. At @c 0x1c3, every communication event is logged and should hence
+ * be used for extreme debugging only.
+ *
+ * @li @b PSM2_MULTI_EP. By default, only one PSM2 endpoint may be opened in
+ * a process. With the correct setting of this environment variable, a process
+ * may open more than one PSM2 endpoint. In order to enable multiple endpoint
+ * per process support, the value of this environment variable should be set
+ * to "1" or "yes".
+ *
+ * @section thr_sfty Thread safety and reentrancy
+ * Unless specifically noted otherwise, all PSM2 functions should not be considered
+ * to be thread safe or reentrant.
+ */
+
+/** @brief Local endpoint handle (opaque)
+ * @ingroup ep
+ *
+ * Handle returned to the user when a new local endpoint is created. The
+ * handle is a local handle to be used in all communication functions and is
+ * not intended to globally identify the opened endpoint in any way.
+ *
+ * All open endpoint handles can be globally identified using the endpoint id
+ * integral type (@ref psm2_epid_t) and all communication must use an endpoint
+ * address (@ref psm2_epaddr_t) that can be obtained by connecting a local
+ * endpoint to one or more endpoint identifiers.
+ *
+ * @remark The local endpoint handle is opaque to the user. */
+typedef struct psm2_ep *psm2_ep_t;
+
+/** @brief MQ handle (opaque)
+ * @ingroup mq
+ *
+ * Handle returned to the user when a new Matched queue is created (@ref
+ * psm2_mq_init). */
+typedef struct psm2_mq *psm2_mq_t;
+
+/*! @defgroup init PSM2 Initialization and Maintenance
+ * @{
+ */
+#define PSM2_VERNO 0x0201 /*!< Header-defined Version number */
+#define PSM2_VERNO_MAJOR 0x02 /*!< Header-defined Major Version Number */
+#define PSM2_VERNO_MINOR 0x01 /*!< Header-defined Minor Version Number */
+#define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!<Minimum PSM1 Major Version Number for Compatibility */
+
+/*! @brief PSM2 Error type
+ *
+ * Numeric gaps between groups are deliberate, leaving room for new
+ * errors within each category (init, endpoint, epid, MQ, AM).
+ */
+enum psm2_error {
+ /*! Interface-wide "ok", guaranteed to be 0. */
+ PSM2_OK = 0,
+ /*! No events progressed on @ref psm2_poll (not fatal) */
+ PSM2_OK_NO_PROGRESS = 1,
+ /*! Error in a function parameter */
+ PSM2_PARAM_ERR = 3,
+ /*! PSM2 ran out of memory */
+ PSM2_NO_MEMORY = 4,
+ /*! PSM2 has not been initialized by @ref psm2_init */
+ PSM2_INIT_NOT_INIT = 5,
+ /*! API version passed in @ref psm2_init is incompatible */
+ PSM2_INIT_BAD_API_VERSION = 6,
+ /*! PSM2 Could not set affinity */
+ PSM2_NO_AFFINITY = 7,
+ /*! PSM2 Unresolved internal error */
+ PSM2_INTERNAL_ERR = 8,
+ /*! PSM2 could not set up shared memory segment */
+ PSM2_SHMEM_SEGMENT_ERR = 9,
+ /*! PSM2 option is a read-only option */
+ PSM2_OPT_READONLY = 10,
+ /*! PSM2 operation timed out */
+ PSM2_TIMEOUT = 11,
+ /*! Too many endpoints */
+ PSM2_TOO_MANY_ENDPOINTS = 12,
+
+ /*! PSM2 is finalized */
+ PSM2_IS_FINALIZED = 13,
+
+ /*! Endpoint was closed */
+ PSM2_EP_WAS_CLOSED = 20,
+ /*! PSM2 Could not find an OPA Unit */
+ PSM2_EP_NO_DEVICE = 21,
+ /*! User passed a bad unit or port number */
+ PSM2_EP_UNIT_NOT_FOUND = 22,
+ /*! Failure in initializing endpoint */
+ PSM2_EP_DEVICE_FAILURE = 23,
+ /*! Error closing the endpoint */
+ PSM2_EP_CLOSE_TIMEOUT = 24,
+ /*! No free ports could be obtained */
+ PSM2_EP_NO_PORTS_AVAIL = 25,
+ /*! Could not detect network connectivity */
+ PSM2_EP_NO_NETWORK = 26,
+ /*! Invalid Unique job-wide UUID Key */
+ PSM2_EP_INVALID_UUID_KEY = 27,
+ /*! Internal out of resources */
+ PSM2_EP_NO_RESOURCES = 28,
+
+ /*! Endpoint connect status unknown (because of other failures or if
+ * connect attempt timed out) */
+ PSM2_EPID_UNKNOWN = 40,
+ /*! Endpoint could not be reached by any PSM2 component */
+ PSM2_EPID_UNREACHABLE = 41,
+ /*! At least one of the connecting nodes was incompatible in endianness */
+ PSM2_EPID_INVALID_NODE = 43,
+ /*! At least one of the connecting nodes provided an invalid MTU */
+ PSM2_EPID_INVALID_MTU = 44,
+ /*! At least one of the connecting nodes provided a bad key */
+ PSM2_EPID_INVALID_UUID_KEY = 45,
+ /*! At least one of the connecting nodes is running an incompatible
+ * PSM2 protocol version */
+ PSM2_EPID_INVALID_VERSION = 46,
+ /*! At least one node provided garbled information */
+ PSM2_EPID_INVALID_CONNECT = 47,
+ /*! EPID was already connected */
+ PSM2_EPID_ALREADY_CONNECTED = 48,
+ /*! EPID is duplicated, network connectivity problem */
+ PSM2_EPID_NETWORK_ERROR = 49,
+ /*! EPID incompatible partition keys */
+ PSM2_EPID_INVALID_PKEY = 50,
+ /*! Unable to resolve path for endpoint */
+ PSM2_EPID_PATH_RESOLUTION = 51,
+
+ /*! MQ Non-blocking request is incomplete */
+ PSM2_MQ_NO_COMPLETIONS = 60,
+ /*! MQ Message has been truncated at the receiver */
+ PSM2_MQ_TRUNCATION = 61,
+
+ /*! AM reply error */
+ PSM2_AM_INVALID_REPLY = 70,
+
+ /*! Reserved Value to indicate highest ENUM value */
+ PSM2_ERROR_LAST = 80
+};
+
+/*! Backwards header compatibility for a confusing error return name */
+#define PSM2_MQ_INCOMPLETE PSM2_MQ_NO_COMPLETIONS
+
+/*! @see psm2_error */
+typedef enum psm2_error psm2_error_t;
+
+/*! @brief PSM2 Component type
+ */
+/* Component identifiers used to route @ref psm2_setopt / @ref psm2_getopt
+ * requests to the owning subsystem. */
+enum psm2_component {
+ /*! PSM2 core library */
+ PSM2_COMPONENT_CORE = 0,
+ /*! MQ component */
+ PSM2_COMPONENT_MQ = 1,
+ /*! AM component */
+ PSM2_COMPONENT_AM = 2,
+ /*! IB component */
+ PSM2_COMPONENT_IB = 3
+};
+
+/*! @see psm2_component */
+typedef enum psm2_component psm2_component_t;
+
+/*! @brief PSM2 Path resolution mechanism
+ */
+enum psm2_path_res {
+ /*! PSM2 no path resolution */
+ PSM2_PATH_RES_NONE = 0,
+ /*! Use OFED Plus for path resolution */
+ PSM2_PATH_RES_OPP = 1,
+ /*! Use OFED UMAD for path resolution */
+ PSM2_PATH_RES_UMAD = 2
+};
+
+/*! @see psm2_path_res */
+typedef enum psm2_path_res psm2_path_res_t;
+
+/** @brief Initialize PSM2 interface
+ *
+ * Call to initialize the PSM2 library for a desired API revision number.
+ *
+ * @param[in,out] api_verno_major As input a pointer to an integer that holds
+ * @ref PSM2_VERNO_MAJOR. As output, the pointer
+ * is updated with the major revision number of
+ * the loaded library.
+ * @param[in,out] api_verno_minor As input, a pointer to an integer that holds
+ * @ref PSM2_VERNO_MINOR. As output, the pointer
+ * is updated with the minor revision number of
+ * the loaded library.
+ *
+ * @pre The user has not called any other PSM2 library call except @ref
+ * psm2_error_register_handler to register a global error handler.
+ *
+ * @post Depending on the environment variable @ref PSM2_MULTI_EP being set and
+ * its contents, support for opening multiple endpoints is either enabled
+ * or disabled.
+ *
+ * @warning PSM2 initialization is a precondition for all functions used in the
+ * PSM2 library.
+ *
+ * @returns PSM2_OK The PSM2 interface could be opened and the desired API
+ * revision can be provided.
+ * @returns PSM2_INIT_BAD_API_VERSION The PSM2 library cannot provide compatibility for
+ * the desired API version.
+ *
+ * @code{.c}
+ // In this example, we want to handle our own errors before doing init,
+ // since we don't want a fatal error if OPA is not found.
+ // Note that @ref psm2_error_register_handler
+ // (and @ref psm2_uuid_generate and @ref psm2_get_capability_mask)
+ // are the only function that can be called before @ref psm2_init
+
+ int try_to_initialize_psm() {
+ int verno_major = PSM2_VERNO_MAJOR;
+ int verno_minor = PSM2_VERNO_MINOR;
+
+ int err = psm2_error_register_handler(NULL, // Global handler
+ PSM2_ERRHANDLER_NO_HANDLER); // return errors
+ if (err) {
+ fprintf(stderr, "Couldn't register global handler: %s\n",
+ psm2_error_get_string(err));
+ return -1;
+ }
+
+ err = psm2_init(&verno_major, &verno_minor);
+ if (err || verno_major > PSM2_VERNO_MAJOR) {
+ if (err)
+ fprintf(stderr, "PSM2 initialization failure: %s\n",
+ psm2_error_get_string(err));
+ else
+ fprintf(stderr, "PSM2 loaded an unexpected/unsupported "
+ "version (%d.%d)\n", verno_major, verno_minor);
+ return -1;
+ }
+
+ // We were able to initialize PSM2 but will defer all further error
+ // handling since most of the errors beyond this point will be fatal.
+ int err = psm2_error_register_handler(NULL, // Global handler
+ PSM2_ERRHANDLER_PSM_HANDLER);
+ if (err) {
+ fprintf(stderr, "Couldn't register global errhandler: %s\n",
+ psm2_error_get_string(err));
+ return -1;
+ }
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor);
+
+/*! @brief PSM2 capabilities definitions
+ *
+ * Each capability is defined as a separate bit,
+ * i.e. next capabilities must be defined as
+ * consecutive bits : 0x2, 0x4 ... and so on.
+ */
+#define PSM2_MULTI_EP_CAP 0x1 /* Multiple Endpoints capability */
+
+/** @brief PSM2 capabilities provider
+ *
+ * @param[in] req_cap_mask Requested capabilities are given as bit field.
+ *
+ * @returns internal capabilities bit field ANDed with a requested bit mask */
+uint64_t psm2_get_capability_mask(uint64_t req_cap_mask);
+
+/** @brief Finalize PSM2 interface
+ *
+ * Single call to finalize PSM2 and close all unclosed endpoints
+ *
+ * @post The user guarantees not to make any further PSM2 calls, including @ref
+ * psm2_init.
+ *
+ * @returns PSM2_OK Always returns @c PSM2_OK */
+psm2_error_t psm2_finalize(void);
+
+/** @brief Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM. */
+typedef struct psm2_error_token *psm2_error_token_t;
+
+/** @brief Error handling function
+ *
+ * Users can handle errors explicitly instead of relying on PSM's own error
+ * handler. There is one global error handler and error handlers that can be
+ * individually set for each opened endpoint. By default, endpoints will
+ * inherit the global handler registered at the time of open.
+ *
+ * @param[in] ep Handle associated to the endpoint over which the error occurred
+ * or @c NULL if the error is being handled by the global error
+ * handler.
+ * @param[in] error PSM2 error identifier
+ * @param[in] error_string A descriptive error string of maximum length @ref
+ * PSM2_ERRSTRING_MAXLEN.
+ * @param[in] token Opaque PSM2 token associated with the particular event that
+ * generated the error. The token can be used to extract the
+ * error string and can be passed to @ref psm2_error_defer to
+ * defer any remaining or unhandled error handling to PSM.
+ *
+ * @post If the error handler returns, the error returned is propagated to the
+ * caller. */
+typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep,
+ const psm2_error_t error,
+ const char *error_string,
+ psm2_error_token_t token);
+
+#define PSM2_ERRHANDLER_DEFAULT ((psm2_ep_errhandler_t)-1)
+/**< Obsolete names, only here for backwards compatibility */
+#define PSM2_ERRHANDLER_NOP ((psm2_ep_errhandler_t)-2)
+/**< Obsolete names, only here for backwards compatibility */
+
+#define PSM2_ERRHANDLER_PSM_HANDLER ((psm2_ep_errhandler_t)-1)
+/**< PSM2 error handler as explained in @ref error_handling */
+
+#define PSM2_ERRHANDLER_NO_HANDLER ((psm2_ep_errhandler_t)-2)
+/**< Bypasses the default PSM2 error handler and returns all errors to the user
+ * (this is the default) */
+
+#define PSM2_ERRSTRING_MAXLEN 512 /**< Maximum error string length. */
+
+/** @brief PSM2 error handler registration
+ *
+ * Function to register error handlers on a global basis and on a per-endpoint
+ * basis. PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special
+ * pre-defined handlers to respectively enable use of the default PSM-internal
+ * handler or the no-handler that disables registered error handling and
+ * returns all errors to the caller (both are documented in @ref
+ * error_handling).
+ *
+ * @param[in] ep Handle of the endpoint over which the error handler should be
+ * registered. With ep set to @c NULL, the behavior of the
+ * global error handler can be controlled.
+ * @param[in] errhandler Handler to register. Can be a user-specific error
+ * handling function or PSM2_ERRHANDLER_PSM_HANDLER or
+ * PSM2_ERRHANDLER_NO_HANDLER.
+ *
+ * @remark When ep is set to @c NULL, this is the only function that can be
+ * called before @ref psm2_init
+ */
+psm2_error_t
+psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler);
+
+/** @brief PSM2 deferred error handler
+ *
+ * Function to handle fatal PSM2 errors if no error handler is installed or if
+ * the user wishes to defer further error handling to PSM. Depending on the
+ * type of error, PSM2 may or may not return from the function call.
+ *
+ * @param[in] err_token Error token initially passed to error handler
+ *
+ * @pre The user is calling into the function because it has decided that PSM
+ * should handle an error case.
+ *
+ * @post The function may or may not return depending on the error
+ */
+psm2_error_t psm2_error_defer(psm2_error_token_t err_token);
+
+/** @brief Get generic error string from error
+ *
+ * Function to return the default error string associated to a PSM2 error.
+ *
+ * While a more detailed and precise error string is usually available within
+ * error handlers, this function is available to obtain an error string out of
+ * an error handler context or when a no-op error handler is registered.
+ *
+ * @param[in] error PSM2 error
+ */
+const char *psm2_error_get_string(psm2_error_t error);
+
+/** @brief Option key/pair structure
+ *
+ * Currently only used in MQ.
+ */
+struct psm2_optkey {
+	uint32_t key;	/**< Option key (e.g. a PSM2_MQ_OPT_* identifier) */
+	void *value;	/**< Pointer to storage holding the option's value */
+};
+
+/*! @} */
+
+/*! @defgroup ep PSM2 Device Endpoint Management
+ * @{
+ */
+
+/** @brief Endpoint ID
+ *
+ * Integral type of size 8 bytes that can be used by the user to globally
+ * identify a successfully opened endpoint. Although the contents of the
+ * endpoint id integral type remains opaque to the user, unique network id and
+ * OPA context number can be extracted using @ref psm2_epid_nid and @ref
+ * psm2_epid_context.
+ */
+typedef uint64_t psm2_epid_t;
+
+/** @brief Endpoint Address (opaque)
+ *
+ * Remote endpoint addresses are created when the user binds an endpoint ID
+ * to a particular endpoint handle using @ref psm2_ep_connect. A given endpoint
+ * address is only guaranteed to be valid over a single endpoint.
+ */
+typedef struct psm2_epaddr *psm2_epaddr_t;
+
+/** @brief PSM2 Unique UID
+ *
+ * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an
+ * endpoint within a particular job. Since PSM2 does not participate in job
+ * allocation and management, users are expected to generate a unique ID to
+ * associate endpoints to a particular parallel or collective job.
+ * @see psm2_uuid_generate
+ */
+typedef uint8_t psm2_uuid_t[16];
+
+/** @brief Get Endpoint identifier's Unique Network ID */
+uint64_t psm2_epid_nid(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA context number */
+uint64_t psm2_epid_context(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA port (deprecated, use
+ * @ref psm2_epid_context instead) */
+uint64_t psm2_epid_port(psm2_epid_t epid);
+
+/** @brief List the number of available OPA units
+ *
+ * Function used to determine the number of locally available OPA units.
+ * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1.
+ *
+ * @returns PSM2_OK unless the user has not called @ref psm2_init
+ */
+psm2_error_t psm2_ep_num_devunits(uint32_t *num_units);
+
+/** @brief Utility to generate UUIDs for @ref psm2_ep_open
+ *
+ * This function is available as a utility for generating unique job-wide ids.
+ * See discussion in @ref psm2_ep_open for further information.
+ *
+ * @remark This function does not require PSM2 to be initialized.
+ */
+void psm2_uuid_generate(psm2_uuid_t uuid_out);
+
+/* Affinity modes for the affinity member of struct psm2_ep_open_opts */
+#define PSM2_EP_OPEN_AFFINITY_SKIP 0 /**< Disable setting affinity */
+#define PSM2_EP_OPEN_AFFINITY_SET 1 /**< Enable setting affinity unless
+ already set */
+#define PSM2_EP_OPEN_AFFINITY_FORCE 2 /**< Enable setting affinity regardless
+ of current affinity setting */
+
+/* Default values for some constants */
+#define PSM2_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL
+ /**< Default protection key */
+
+/** @brief Endpoint Open Options
+ *
+ * These options are available for opening a PSM2 endpoint. Each is
+ * individually documented and setting each option to -1 or passing NULL as the
+ * options parameter in @ref psm2_ep_open instructs PSM2 to use
+ * implementation-defined defaults.
+ *
+ * Each option is documented in @ref psm2_ep_open
+ */
+struct psm2_ep_open_opts {
+	int64_t timeout;	/**< timeout in nanoseconds to open device */
+	int unit;		/**< OPA Unit ID to open on */
+	int affinity;		/**< How PSM2 should set affinity */
+	int shm_mbytes;	/**< Megabytes used for intra-node, deprecated */
+	int sendbufs_num;	/**< Preallocated send buffers */
+	uint64_t network_pkey;	/**< Network Protection Key (v1.01) */
+	int port;		/**< IB port to use (1 to N) */
+	int outsl;		/**< IB SL to use when sending pkts */
+	uint64_t service_id;	/**< IB Service ID to use for endpoint */
+	psm2_path_res_t path_res_type;	/**< Path resolution type */
+	int senddesc_num;	/**< Preallocated send descriptors */
+	int imm_size;		/**< Immediate data size for endpoint */
+};
+
+/** @brief OPA endpoint creation
+ *
+ * Function used to create a new local communication endpoint on an OPA
+ * adapter. The returned endpoint handle is required in all PSM2 communication
+ * operations, as PSM2 can manage communication over multiple endpoints. An
+ * opened endpoint has no global context until the user connects the endpoint
+ * to other global endpoints by way of @ref psm2_ep_connect. All local endpoint
+ * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are
+ * also returned when an endpoint is opened. It is assumed that the user can
+ * provide an out-of-band mechanism to distribute the endpoint IDs in order to
+ * establish connections between endpoints (@ref psm2_ep_connect for more
+ * information).
+ *
+ * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in
+ * a parallel job. It is up to the user to ensure
+ * that the key is globally unique over a period long
+ * enough to prevent duplicate keys over the same set
+ * of endpoints (see comments below).
+ *
+ * @param[in] opts Open options of type @ref psm2_ep_open_opts
+ * (see @ref psm2_ep_open_opts_get_defaults).
+ *
+ * @param[out] ep User-supplied storage to return a pointer to the newly
+ * created endpoint. The returned pointer of type @ref psm2_ep_t
+ * is a local handle and cannot be used to globally identify the
+ * endpoint.
+ * @param[out] epid User-supplied storage to return the endpoint ID associated
+ * to the newly created local endpoint returned in the @c ep
+ * handle. The endpoint ID is an integral type suitable for
+ * uniquely identifying the local endpoint.
+ *
+ * PSM2 does not internally verify the consistency of the uuid, it is up to the
+ * user to ensure that the uuid is unique enough not to collide with other
+ * currently-running jobs. Users can employ three mechanisms to obtain a uuid.
+ *
+ * 1. Use the supplied @ref psm2_uuid_generate utility
+ *
+ * 2. Use an OS or library-specific uuid generation utility, that complies with
+ * OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on
+ * FreeBSD.
+ * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 3. Manually pack a 16-byte string using a utility such as /dev/random or
+ * other source with enough entropy and proper seeding to prevent two nodes
+ * from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ * @li @c timeout establishes the number of nanoseconds to wait before
+ * failing to open a port (with -1, defaults to 15 secs).
+ * @li @c unit sets the OPA unit number to use to open a port (with
+ * -1, PSM2 determines the best unit to open the port). If @c
+ * HFI_UNIT is set in the environment, this setting is ignored.
+ * @li @c affinity enables or disables PSM2 setting processor affinity. The
+ * option can be controlled to either disable (@ref
+ * PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ * only if it is already unset (@ref
+ * PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ * set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE).
+ * If @c HFI_NO_CPUAFFINITY is set in the environment, this
+ * setting is ignored.
+ * @li @c shm_mbytes sets a maximum number of megabytes that can be allocated
+ * to each local endpoint ID connected through this
+ * endpoint (with -1, defaults to 10 MB).
+ * @li @c sendbufs_num sets the number of send buffers that can be
+ * pre-allocated for communication (with -1, defaults to
+ * 512 buffers of MTU size).
+ * @li @c network_pkey sets the protection key to employ for point-to-point
+ * PSM2 communication. Unless a specific value is used,
+ * this parameter should be set to
+ * PSM2_EP_OPEN_PKEY_DEFAULT.
+ *
+ * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only
+ * once per process and subsequent calls will fail. In order to enable creation
+ * of multiple endpoints per process, one must properly set the environment variable
+ * @ref PSM2_MULTI_EP before calling @ref psm2_init.
+ *
+ * @code{.c}
+ // In order to open an endpoint and participate in a job, each endpoint has
+ // to be distributed a unique 16-byte UUID key from an out-of-band source.
+ // Presumably this can come from the parallel spawning utility either
+ // indirectly through an implementors own spawning interface or as in this
+ // example, the UUID is set as a string in an environment variable
+ // propagated to all endpoints in the job.
+
+ int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle
+ psm2_epid_t *epid, // output endpoint identifier
+ int unit) // unit of our choice
+ {
+ psm2_ep_open_opts epopts;
+ psm2_uuid_t job_uuid;
+ char *c;
+
+ // Let PSM2 assign its default values to the endpoint options.
+ psm2_ep_open_opts_get_defaults(&epopts);
+
+ // We want a stricter timeout and a specific unit
+ epopts.timeout = 15*1e9; // 15 second timeout
+ epopts.unit = unit; // We want a specific unit, -1 would let PSM
+ // choose the unit for us.
+ epopts.port = port; // We want a specific port, <= 0 would let PSM
+ // choose the port for us.
+ // We've already set affinity, don't let PSM2 do so if it wants to.
+ if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
+ epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;
+
+ // ENDPOINT_UUID is set to the same value in the environment of all the
+ // processes that wish to communicate over PSM2 and was generated by
+ // the process spawning utility
+ c = getenv("ENDPOINT_UUID");
+ if (c && *c)
+ implementor_string_to_16byte_packing(c, job_uuid);
+ else {
+ fprintf(stderr, "Can't find UUID for endpoint\n");
+ return -1;
+ }
+
+ // Assume we don't want to handle errors here.
+ psm2_ep_open(job_uuid, &epopts, ep, epid);
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_ep_open(const psm2_uuid_t unique_job_key,
+ const struct psm2_ep_open_opts *opts, psm2_ep_t *ep,
+ psm2_epid_t *epid);
+
+/** @brief Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in @ref psm2_ep_open.
+ *
+ * @param[out] opts Endpoint Open options.
+ *
+ * @warning For portable operation, users should always call this function
+ * prior to calling @ref psm2_ep_open.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm2_error_t
+psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts);
+
+/** @brief Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * @param[in] ep Endpoint handle
+ * @param[in] epid Endpoint ID
+ *
+ * @param[out] result Result is non-zero if the remote endpoint shares memory with the local
+ * endpoint @c ep, or zero otherwise.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_EPID_UNKNOWN If the epid is not recognized
+ */
+psm2_error_t
+psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result);
+
+/** @brief Close endpoint
+ * @param[in] ep PSM2 endpoint handle
+ * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE
+ * @param[in] timeout How long to wait in nanoseconds if mode is
+ * PSM2_EP_CLOSE_GRACEFUL, 0 waits forever. If @c mode is
+ * @ref PSM2_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * @return PSM2_OK Endpoint was successfully closed without force or
+ * successfully closed with force within the supplied timeout.
+ * @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ * within timeout.
+ */
+psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout);
+
+#define PSM2_EP_CLOSE_GRACEFUL 0 /**< Graceful mode in @ref psm2_ep_close */
+#define PSM2_EP_CLOSE_FORCE 1 /**< Forceful mode in @ref psm2_ep_close */
+
+/** @brief Provide mappings for network id to hostname
+ *
+ * Since PSM2 does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings. The @ref psm2_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM2 to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * @param[in] num Number of elements in @c nids and @c hostnames arrays
+ * @param[in] nids User-provided array of network ids (i.e. OPA LIDs),
+ * should be obtained by calling @ref psm2_epid_nid on each
+ * epid.
+ * @param[in] hostnames User-provided array of hostnames (array of
+ * NUL-terminated strings) where each hostname index
+ * maps to the provided nid hostname.
+ *
+ * @warning Duplicate nids may be provided in the input @c nids array, only
+ * the first corresponding hostname will be remembered.
+ *
+ * @pre The user may or may not have already provided hostname mappings.
+ * @post The user may free any dynamically allocated memory passed to the
+ * function.
+ *
+ */
+psm2_error_t
+psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
+
+/** @brief Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses. Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * @param[in] ep PSM2 endpoint handle
+ *
+ * @param[in] num_of_epid The number of endpoints to connect to, which
+ * also establishes the number of elements contained in
+ * all of the function's array-based parameters.
+ *
+ * @param[in] array_of_epid User-allocated array that contains @c num_of_epid
+ * valid endpoint identifiers. Each endpoint id (or
+ * epid) has been obtained through an out-of-band
+ * mechanism and each endpoint must have been opened
+ * with the same uuid key.
+ *
+ * @param[in] array_of_epid_mask User-allocated array that contains
+ * @c num_of_epid integers. This array of masks
+ * allows users to select which of the epids in @c
+ * array_of_epid should be connected. If the integer
+ * at index i is zero, psm does not attempt to connect
+ * to the epid at index i in @c array_of_epid. If
+ * this parameter is NULL, psm will try to connect to
+ * each epid.
+ *
+ * @param[out] array_of_errors User-allocated array of at least @c num_of_epid
+ * elements. If the function does not return
+ * PSM2_OK, this array can be consulted for each
+ * endpoint not masked off by @c array_of_epid_mask
+ * to know why the endpoint could not be connected.
+ * Endpoints that could not be connected because of
+ * an unrelated failure will be marked as @ref
+ * PSM2_EPID_UNKNOWN. If the function returns
+ * PSM2_OK, the errors for all endpoints will also
+ * contain PSM2_OK.
+ *
+ * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid
+ * elements of type psm2_epaddr_t. Each
+ * successfully connected endpoint is updated with
+ * an endpoint address handle that corresponds to
+ * the endpoint id at the same index in @c
+ * array_of_epid. Handles are only updated if the
+ * endpoint could be connected and if its error in
+ * array_of_errors is PSM2_OK.
+ *
+ * @param[in] timeout Timeout in nanoseconds after which connection attempts
+ * will be abandoned. Setting this value to 0 disables
+ * timeout and waits until all endpoints have been
+ * successfully connected or until an error is detected.
+ *
+ * @pre The user has opened a local endpoint and obtained a list of endpoint
+ * IDs to connect to a given endpoint handle using an out-of-band
+ * mechanism not provided by PSM.
+ *
+ * @post If the connect is successful, @c array_of_epaddr is updated with valid
+ * endpoint addresses.
+ *
+ * @post If unsuccessful, the user can query the return status of each
+ * individual remote endpoint in @c array_of_errors.
+ *
+ * @post The user can call into @ref psm2_ep_connect many times with the same
+ * endpoint ID and the function is guaranteed to return the same output
+ * parameters.
+ *
+ * @post PSM2 does not keep any reference to the arrays passed into the
+ * function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed. Users should
+ * always refer to individual errors in @c array_of_errors whenever the
+ * function cannot return PSM2_OK.
+ *
+ * @returns PSM2_OK The entire set of endpoint IDs were successfully connected
+ * and endpoint addresses are available for all endpoint IDs.
+ *
+ * @code{.c}
+ int connect_endpoints(psm2_ep_t ep, int numep,
+ const psm2_epid_t *array_of_epid,
+ psm2_epaddr_t **array_of_epaddr_out)
+ {
+ psm2_error_t *errors = (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+
+ psm2_epaddr_t *all_epaddrs =
+ (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
+
+ if (all_epaddrs == NULL)
+ return -1;
+
+ psm2_ep_connect(ep, numep, array_of_epid,
+ NULL, // We want to connect all epids, no mask needed
+ errors,
+ all_epaddrs,
+ 30e9); // 30 second timeout, 0 waits forever
+ *array_of_epaddr_out = all_epaddrs;
+ free(errors);
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* This function is equivalent to calling psm2_ep_disconnect2() with mode ==
+* PSM2_EP_DISCONNECT_GRACEFUL.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+* which also indicates the number of elements contained
+* in all of the function’s array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+* valid endpoint addresses. Each endpoint address (or
+* epaddr) has been obtained through a previous
+* psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+* num_of_epaddr integers. This array of masks
+* allows users to select which of the
+* epaddresses in array_of_epaddr should be
+* disconnected. If the integer at index i is
+* zero, PSM2 does not attempt to disconnect to
+* the epaddr at index i in array_of_epaddr. If
+* this parameter is NULL, PSM2 tries to
+* disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+* elements. If the function does not return PSM2_OK,
+* this array can be consulted for each endpoint
+* address not masked off by array_of_epaddr_mask to
+* know why the endpoint could not be disconnected.
+* Any endpoint address that could not be
+* disconnected because of an unrelated failure is
+* marked as PSM2_EPID_UNKNOWN. If the function
+* returns PSM2_OK, the errors for all endpoint
+* addresses also contain PSM2_OK.
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+* are abandoned. Setting this value to 0 disables timeout and
+* waits until all endpoints have been successfully
+* disconnected or until an error is detected.
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+* array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+* remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+* and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+* if some portion of the communication failed. Refer to individual errors
+* in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+* and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+ const psm2_epaddr_t *array_of_epaddr)
+{
+ psm2_error_t *errors =
+ (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+ psm2_ep_disconnect(
+ ep, num_epaddr, array_of_epaddr,
+ NULL, // We want to disconnect all epaddrs, no mask needed,
+ errors,
+ 30e9); // 30 second timeout, 0 waits forever
+ free(errors);
+ return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+* which also indicates the number of elements contained
+* in all of the function’s array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+* valid endpoint addresses. Each endpoint address (or
+* epaddr) has been obtained through a previous
+* psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+* num_of_epaddr integers. This array of masks
+* allows users to select which of the
+* epaddresses in array_of_epaddr should be
+* disconnected. If the integer at index i is
+* zero, PSM2 does not attempt to disconnect to
+* the epaddr at index i in array_of_epaddr. If
+* this parameter is NULL, PSM2 tries to
+* disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+* elements. If the function does not return PSM2_OK,
+* this array can be consulted for each endpoint
+* address not masked off by array_of_epaddr_mask to
+* know why the endpoint could not be disconnected.
+* Any endpoint address that could not be
+* disconnected because of an unrelated failure is
+* marked as PSM2_EPID_UNKNOWN. If the function
+* returns PSM2_OK, the errors for all endpoint
+* addresses also contain PSM2_OK.
+*
+* @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+* are abandoned. Setting this value to 0 disables timeout and
+* waits until all endpoints have been successfully
+* disconnected or until an error is detected. Supplying a
+* negative value here sets the disconnection mode to "force".
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+* array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+* remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+* and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+* if some portion of the communication failed. Refer to individual errors
+* in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+* and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+ const psm2_epaddr_t *array_of_epaddr)
+{
+ psm2_error_t *errors =
+ (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+ if (errors == NULL)
+ return -1;
+ psm2_ep_disconnect2(
+ ep, num_epaddr, array_of_epaddr,
+ NULL, // We want to disconnect all epaddrs, no mask needed,
+ errors,
+ PSM2_EP_DISCONNECT_GRACEFUL,
+ 30e9); // 30 second timeout, 0 ns is forever
+ free(errors);
+ return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int mode, int64_t timeout);
+
+#define PSM2_EP_DISCONNECT_GRACEFUL PSM2_EP_CLOSE_GRACEFUL /**< Graceful mode in @ref psm2_ep_disconnect2 */
+#define PSM2_EP_DISCONNECT_FORCE PSM2_EP_CLOSE_FORCE /**< Forceful mode in @ref psm2_ep_disconnect2 */
+
+/** @brief Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM2 components instantiated on an
+ * endpoint (currently, this only includes the MQ component). The function
+ * never blocks and is typically required in two cases:
+ *
+ * @li Allowing all PSM2 components instantiated over a given endpoint to make
+ * communication progress. Refer to @ref mq_progress for a detailed
+ * discussion on MQ-level progress issues.
+ *
+ * @li Cases where users write their own synchronization primitives that
+ * depend on remote communication (such as spinning on a memory location
+ * whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
+ *
+ * @returns PSM2_OK Some communication events were progressed
+ * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress
+ *
+ */
+psm2_error_t psm2_poll(psm2_ep_t ep);
+
+/** @brief Set a user-determined ep address label.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] epaddr_label_string User-allocated string to print when
+ * identifying endpoint in error handling or other verbose
+ * printing. The NULL-terminated string must be allocated by
+ * the user since PSM2 only keeps a pointer to the label. If
+ * users do not explicitly set a label for each endpoint,
+ * endpoints will identify themselves as hostname:port.
+ */
+void psm2_epaddr_setlabel(psm2_epaddr_t epaddr,
+ const char *epaddr_label_string);
+
+/** @brief Set a user-determined ep address context.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] ctxt Opaque user defined state to associate with an endpoint
+ * address. This state can be retrieved via
+ * @ref psm2_epaddr_getctxt.
+ */
+void
+psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt);
+
+/** @brief Get the user-determined ep address context. Users can associate an
+ * opaque context with each endpoint via @ref psm2_epaddr_setctxt.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect.
+ */
+void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr);
+
+/* Below are all component specific options. The component object for each of
+ * the options is also specified.
+ */
+
+/* PSM2_COMPONENT_CORE options */
+/* PSM2 debug level */
+#define PSM2_CORE_OPT_DEBUG 0x101
+ /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set
+ * before initializing the PSM2 library.
+ *
+ * component object: (null)
+ * option value: PSM2 Debug mask to set or currently active debug level.
+ */
+
+/* PSM2 endpoint address context */
+#define PSM2_CORE_OPT_EP_CTXT 0x102
+ /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint
+ * address (psm2_epaddr_t).
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+ * option value: Context associated with PSM2 endpoint address.
+ */
+
+/* PSM2_COMPONENT_IB options */
+/* Default service level to use to communicate with remote endpoints */
+#define PSM2_IB_OPT_DF_SL 0x201
+ /**< [@b uint32_t ] Default OPA SL to use for all remote communication.
+ * If unset defaults to Service Level 0.
+ *
+ * component object: Opened PSM2 endpoint id (@ref psm2_ep_t).
+ * option value: Default IB SL to use for endpoint. (0 <= SL < 15)
+ */
+
+/* Set IB service level to use for communication to an endpoint */
+#define PSM2_IB_OPT_EP_SL 0x202
+ /**< [@b uint32_t ] OPA SL to use for communication to specified
+ * remote endpoint.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+ * option value: SL used to communicate with remote endpoint. (0 <= SL < 15)
+ */
+
+/* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */
+/* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */
+#define PSM2_MQ_OPT_RNDV_IB_SZ 0x301
+ /**< [@b uint32_t ] Size at which to start enabling rendezvous
+ * messaging for OPA messages (if unset, defaults to values
+ * between 56000 and 72000 depending on the system configuration)
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+#define PSM2_MQ_RNDV_HFI_SZ PSM2_MQ_OPT_RNDV_IB_SZ
+#define PSM2_MQ_RNDV_IPATH_SZ PSM2_MQ_OPT_RNDV_IB_SZ
+
+#define PSM2_MQ_OPT_RNDV_SHM_SZ 0x302
+#define PSM2_MQ_RNDV_SHM_SZ PSM2_MQ_OPT_RNDV_SHM_SZ
+ /**< [@b uint32_t ] Size at which to start enabling
+ * rendezvous messaging for shared memory (intra-node) messages (If
+ * unset, defaults to 64000 bytes).
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+
+#define PSM2_MQ_OPT_SYSBUF_MYBYTES 0x303
+#define PSM2_MQ_MAX_SYSBUF_MBYTES PSM2_MQ_OPT_SYSBUF_MYBYTES
+ /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected
+ * messages.
+ *
+ * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+ * option value: Deprecated; this option has no effect.
+ */
+
+/* PSM2_COMPONENT_AM options */
+#define PSM2_AM_OPT_FRAG_SZ 0x401
+#define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ
+/*!< [@b uint32_t ] Maximum active message fragment size that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest fragment size across all
+ * active endpoints.
+ * option value: Maximum active message fragment size in bytes.
+ */
+
+#define PSM2_AM_OPT_NARGS 0x402
+#define PSM2_AM_MAX_NARGS PSM2_AM_OPT_NARGS
+
+/*!< [@b uint32_t ] Maximum number of message arguments that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest number of arguments across all
+ * active endpoints.
+ * option value: Maximum number of active message arguments.
+ */
+
+#define PSM2_AM_OPT_HANDLERS 0x403
+#define PSM2_AM_MAX_HANDLERS PSM2_AM_OPT_HANDLERS
+/*!< [@b uint32_t ] Maximum number of message handlers that can be registered
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ * option value is the smallest number of handlers across all
+ * active endpoints.
+ * option value: Maximum number of active message handlers.
+ */
+
+/** @brief Set an option for a PSM2 component
+ *
+ * Function to set the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to set the option
+ * @param[in] component_obj Opaque component-specific object to apply the set
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * @param[in] optname Name of component option to set. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * @param[in] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size and format.
+ * @param[in] optlen Size of the memory region pointed to by optval.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm2_error_t
+psm2_setopt(psm2_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen);
+
+/** @brief Get an option for a PSM2 component
+ *
+ * Function to get the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to get the option
+ * @param[in] component_obj Opaque component-specific object to apply the get
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * @param[in] optname Name of component option to get. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * @param[out] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a valid memory region.
+ * @param[in,out] optlen This is a value result parameter initially containing
+ * the size of the memory region pointed to by optval and
+ * modified to return the actual size of optval.
+ *
+ * @returns PSM2_OK if option value could be retrieved successfully.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size.
+ * optlen contains the required memory region size for
+ * optname value.
+ *
+ */
+psm2_error_t
+psm2_getopt(psm2_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen);
+
+/** @brief Datatype for end-point information */
+typedef struct psm2_epinfo {
+ psm2_ep_t ep; /**< The ep for this end-point*/
+ psm2_epid_t epid; /**< The epid for this end-point */
+ psm2_uuid_t uuid; /**< The UUID for this end-point */
+ uint16_t jkey; /**< The job key for this end-point */
+ char uuid_str[64]; /**< String representation of the UUID for this end-point */
+} psm2_epinfo_t;
+
+/** @brief Datatype for end-point connection */
+typedef struct psm2_epconn {
+ psm2_epaddr_t addr; /**< The epaddr for this connection */
+ psm2_ep_t ep; /**< The ep for this connection */
+ psm2_mq_t mq; /**< The mq for this connection */
+} psm2_epconn_t;
+
+/** @brief Query PSM2 for end-point information.
+ *
+ * Function to query PSM2 for end-point information. This allows retrieval of
+ * end-point information in cases where the caller does not have access to the
+ * results of psm2_ep_open(). In the default single-rail mode PSM2 will use
+ * a single endpoint. If either multi-rail mode or multi-endpoint mode is
+ * enabled, PSM2 will use multiple endpoints.
+ *
+ * @param[in,out] num_of_epinfo On input, sizes the available number of entries
+ * in array_of_epinfo. On output, specifies the
+ * returned number of entries in array_of_epinfo.
+ * @param[out] array_of_epinfo Returns end-point information structures.
+ *
+ * @pre PSM2 is initialized and the end-point has been opened.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ */
+psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo);
+
+/** @brief Query PSM2 for end-point connections.
+ *
+ * Function to query PSM2 for end-point connections. This allows retrieval of
+ * end-point connections in cases where the caller does not have access to the
+ * results of psm2_ep_connect(). The epid values can be found using
+ * psm2_ep_query() so that each PSM2 process can determine its own epid. These
+ * values can then be distributed across the PSM2 process so that each PSM
+ * process knows the epid for all other PSM2 processes.
+ *
+ * @param[in] epid The epid of a PSM2 process.
+ * @param[out] epconn The connection information for that PSM2 process.
+ *
+ * @pre PSM2 is initialized and the end-point has been connected to this epid.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM.
+ */
+psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Query given PSM2 end-point for its connections.
+ *
+ * The need for this function comes with 'multi-ep' feature.
+ * Function is similar to (@ref psm2_ep_epid_lookup).
+ * It differs in that an extra parameter which identifies
+ * the end-point [ep] must be provided which limits the lookup to that single ep.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM.
+ * @returns PSM2_PARAM_ERR if output [epconn] is NULL.
+ */
+psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Get PSM2 epid for given epaddr.
+ *
+ * @param[in] epaddr The endpoint address.
+ * @param[out] epid The epid of a PSM2 process.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL.
+ */
+psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid);
+
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm2_am.h b/psm2_am.h
new file mode 100644
index 0000000..1383fbb
--- /dev/null
+++ b/psm2_am.h
@@ -0,0 +1,411 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef PSM2_AM_H
+#define PSM2_AM_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_am.h
+ * @brief PSM2 Active Message.
+ *
+ * @page psm2_am Active Message Interface
+ *
+ * PSM2 implements an Active Message (AM) component that lives alongside the
+ * Matched Queues (MQ) component. The active message interface essentially
+ * provides a remote procedure call mechanism. A PSM2 process can generate a
+ * request to run an active message handler on a remote PSM2 process
+ * identified by its end-point address (epaddr). End-point address values
+ * are returned by PSM2 when connecting end-points using the psm2_ep_connect()
+ * function.
+ *
+ * An AM handler may make local state updates, and may generate at most
+ * one reply to be returned to the original requestor. This reply will cause
+ * a handler to be run on that requestor. The requestor handler may make
+ * local state updates but is not allowed to reply nor request in that handler
+ * context. A request or reply can convey a small number of in-line arguments
+ * plus a short amount of data. A tight bound is placed on the number of
+ * in-line arguments to allow them to be packed into a header. A bound is
+ * placed on the size of the data payload so that the request or reply can
+ * be sent as a single packet within the MTU of the underlying communication
+ * transport. Longer payloads must be synthesized on top of the provided
+ * short request/reply mechanism by fragmentation and reassembly, or
+ * transported by some other means.
+ *
+ * Handlers are run in the process context of the targeted PSM2 process,
+ * either in its main thread of execution or in a progress thread. A handler
+ * may therefore be executed concurrently with the main thread of execution
+ * of the PSM2 process. PSM2 ensures that its own state is protected against this
+ * concurrent execution. However, a handler must make its own arrangements to
+ * protect its own state. Alternatively, the PSM2 progress thread can be
+ * disabled using the PSM2_RCVTHREAD environment variable if this is too
+ * onerous for the handler.
+ *
+ * PSM2 has an active progress model and requires that the PSM2 library is
+ * called in order to make progress. This can be achieved using the psm2_poll()
+ * function. A PSM2 implementation may provide passive progress through some
+ * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume
+ * this and must arrange to make active progress through calls into the PSM
+ * library. Note that the PSM2 AM interface is not MTsafe, same as the other PSM
+ * interfaces, and that MTsafety must be provided by the consumer if required.
+ *
+ * The order in which AM requests are issued by an initiator to a particular
+ * target defines the order in which those AM requests will be executed on
+ * that target. Therefore the AM implementation will maintain the order
+ * of handler executions on a flow, and this also applies when progress
+ * threads are used. For multiple initiators issuing requests to a particular
+ * target, the handler executions will be interleaved in some sequentially
+ * consistent ordering.
+ */
+
+/*! @defgroup am PSM2 Active Message
+ *
+ * @{
+ */
+
+/** @brief Datatype for an index representing an active message handler */
+typedef uint32_t psm2_handler_t;
+
+/** @brief Datatype for a token for an active message handler.*/
+typedef void *psm2_am_token_t;
+
+/* PSM2 AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM2_AM_FLAG_NONE 0 /**< No other PSM2 AM flags are needed. */
+#define PSM2_AM_FLAG_ASYNC 1 /**< No need to copy source data. */
+#define PSM2_AM_FLAG_NOREPLY 2 /**< The handler for this AM request is
+ guaranteed not to generate a reply. */
+
+/** @brief The psm2_amarg type represents the type of an AM argument. This is
+ * a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ * fields or one 64-bit field for the convenience of code using the PSM2 AM
+ * interface.
+ */
+typedef
+struct psm2_amarg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ };
+} psm2_amarg_t;
+
+/** @brief The AM handler function type
+ *
+ * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ * A request handler may send at most one reply back to the
+ * original requestor, and must pass this value as the token
+ * parameter to the psm2_am_reply_short() function. A reply
+ * handler is also passed a token value, but must not attempt
+ * to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_fn_t) (psm2_am_token_t token,
+ psm2_amarg_t *args, int nargs,
+ void *src, uint32_t len);
+
+/** @brief Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The
+ * call-back has a context pointer which is provided along with the call-back
+ * function pointer when the initiator generates the request or reply. This
+ * approach will typically give higher performance than using an AM request or
+ * reply to achieve the same effect, though note that no additional information
+ * can be passed from the target side back to the initiator side with the
+ * completion handler approach.
+ *
+ * @param[in] context A context pointer.
+ * @returns void This handler has no return result.
+ */
+typedef
+void (*psm2_am_completion_fn_t) (void *context);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in (0, num_handlers]. These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ * handlers_idx arrays)
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *
+ handlers, int num_handlers,
+ int *handlers_idx);
+
+/** @brief Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_request_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ * PSM2_AM_FLAG_NONE - No flags
+ * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ * PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to
+ * generate a reply
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the request cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC
+ * is provided then a copy will not be taken and the PSM2 AM implementation
+ * retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler
+ * call-back, or through an AM handler associated with an AM reply.
+ *
+ * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that
+ * a reply will not be generated. Use of this flag is optional, but it may
+ * enable a performance optimization in this case by indicating that reply
+ * state is not required.
+ *
+ * @param[in] epaddr End-point address to run handler on
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ * bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ * remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ * completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/** @brief Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM
+ * request. If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM
+ * handler must not call this function. Otherwise, the AM request handler may
+ * call psm2_am_reply_short() at most once, and must pass in the token value
+ * that it received in its own handler call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_reply_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ * PSM2_AM_FLAG_NONE - No flags
+ * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the reply cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC is
+ * provided then a copy will not be taken and the PSM2 AM implementation retains
+ * ownership of the payload src memory until the reply is locally complete.
+ * Local completion can be determined using the completion handler call-back.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ * the reply.
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ * bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ * remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ * completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/** @brief Return the source end-point address for a token.
+ *
+ * This function is used to obtain the epaddr object representing the message
+ * initiator from a token passed by PSM2 to a message handler.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ * the reply.
+ * @param[out] epaddr_out Pointer to the where the epaddr should be returned.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL.
+ */
+psm2_error_t psm2_am_get_source(psm2_am_token_t token,
+ psm2_epaddr_t *epaddr_out);
+
+/** @brief AM parameters
+ *
+ * This structure is used to return PSM2 AM implementation-specific parameter
+ * values back to the caller of the psm2_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ * max_handlers >= 64,
+ * max_nargs >= 2,
+ * max_request_short >= 256 and
+ * max_reply_short >= 256.
+ */
+struct psm2_am_parameters {
+ /** Maximum number of handlers that can be registered. */
+ uint32_t max_handlers;
+ /** Maximum number of arguments to an AM handler. */
+ uint32_t max_nargs;
+ /** Maximum number of bytes in a request payload. */
+ uint32_t max_request_short;
+ /** Maximum number of bytes in a reply payload. */
+ uint32_t max_reply_short;
+};
+
+/** @brief Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * @param[in] ep The end-point value returned by psm2_ep_open().
+ * @param[out] parameters Pointer to the struct where the parameters will be
+ * returned.
+ * @param[in] sizeof_parameters_in The size in bytes of the struct provided by
+ * the caller.
+ * @param[out] sizeof_parameters_out The size in bytes of the struct returned
+ * by PSM.
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_get_parameters(psm2_ep_t ep,
+ struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out);
+
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm2_linker_script.map b/psm2_linker_script.map
new file mode 100644
index 0000000..f1db50e
--- /dev/null
+++ b/psm2_linker_script.map
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM2_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm2_*;
+
+ /* Below symbols are used for hfidiags hfi1_pkt_test */
+ /* opa_udebug.h - global */
+ hfi_debug;
+ hfi_get_unit_name;
+ __progname;
+
+ /* opa_udebug.h - _HFI_DEBUGGING */
+ __hfi_mylabel;
+ hfi_set_mylabel;
+ hfi_get_mylabel;
+ __hfi_dbgout;
+
+ /* opa_service.h */
+ hfi_context_open;
+ hfi_get_port_vl2mtu;
+ hfi_get_port_lid;
+ hfi_context_close;
+ hfi_cmd_write;
+ hfi_mmap64;
+
+ /* opa_user.h */
+ hfi_userinit;
+ hfi_poll_type;
+ hfi_wait_for_packet;
+ __hfi_pico_per_cycle;
+
+ /* Make all other symbols local */
+ local:
+ *;
+};
+
diff --git a/psm2_linker_script_map.in b/psm2_linker_script_map.in
new file mode 100644
index 0000000..efa87c5
--- /dev/null
+++ b/psm2_linker_script_map.in
@@ -0,0 +1,95 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info.
+ C++ // Comments don't work in this file. */
+
+PSM2_1.0
+{
+ /* Expose only those symbols we choose to. This way we do not
+ pollute users namespace more than absolutely necessary. */
+ global:
+ psm2_*;
+
+ /* Below symbols are used for hfidiags hfi1_pkt_test */
+ /* opa_udebug.h - global */
+ hfi_debug;
+ hfi_get_unit_name;
+ __progname;
+
+ /* opa_udebug.h - _HFI_DEBUGGING */
+ __hfi_mylabel;
+ hfi_set_mylabel;
+ hfi_get_mylabel;
+ __hfi_dbgout;
+
+ /* opa_service.h */
+ hfi_context_open;
+ hfi_get_port_vl2mtu;
+ hfi_get_port_lid;
+ hfi_context_close;
+ hfi_cmd_write;
+ hfi_mmap64;
+
+ /* opa_user.h */
+ hfi_userinit;
+ hfi_poll_type;
+ hfi_wait_for_packet;
+ __hfi_pico_per_cycle;
+
+ /* Additional globals */
+ _psm2_additional_globals_;
+ /* Make all other symbols local */
+ local:
+ *;
+};
+
diff --git a/psm2_mq.h b/psm2_mq.h
new file mode 100644
index 0000000..6c23b10
--- /dev/null
+++ b/psm2_mq.h
@@ -0,0 +1,1403 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MQ_H
+#define PSM2_MQ_H
+
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_mq.h
+ * @brief PSM2 Matched Queues
+ *
+ * @page psm2_mq Matched Queues interface
+ *
+ * The Matched Queues (MQ) interface implements a queue-based communication
+ * model with the distinction that queue message consumers use a 3-tuple of
+ * metadata to match incoming messages against a list of preposted receive
+ * buffers. These semantics are consistent with those presented by MPI-1.2
+ * and all the features and side-effects of Message-Passing find their way into
+ * Matched Queues. There is currently a single MQ context;
+ * if need be, MQs may expose a function to allocate more than
+ * one MQ context in the future. Since an MQ is implicitly bound to a locally
+ * opened endpoint, all MQ functions use an MQ handle instead of an EP
+ * handle as a communication context.
+ *
+ * @section tagmatch MQ Tag Matching
+ *
+ * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t)
+ * and a 3-tuple of tag objects. Two of the tag objects are provided by the
+ * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is
+ * provided by the sender as part of every message sent (@ref psm2_mq_send and
+ * @ref psm2_mq_isend). Since MQ is a receiver-directed communication model,
+ * the tag matching done at the receiver involves matching the sent message's
+ * origin and send tag (@c stag) with the source endpoint address, tag (@c
+ * rtag), and tag selector (@c rtagsel) attached to every preposted receive
+ * buffer. The incoming @c stag is compared to the posted @c rtag but only for
+ * significant bits set to @c 1 in the @c rtagsel. The @c rtagsel can be used
+ * to mask off parts (or even all) of the bitwise comparison between sender and
+ * receiver tags. A successful match causes the message to be received into
+ * the buffer with which the tag is matched. If the incoming message is too
+ * large, it is truncated to the size of the posted receive buffer. The
+ * bitwise operation corresponding to a successful match and receipt of an
+ * expected message amounts to the following expression evaluating as true:
+ *
+ * @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim
+ *
+ * It is up to the user to encode (pack) tags into the 64-bit unsigned
+ * integers, including employing the @c rtagsel tag selector as a method to
+ * wildcard part or all of the bits significant in the tag matching operation.
+ * For example, MPI uses a triple based on context (MPI communicator), source
+ * rank, send tag. The following code example shows how the triple can be
+ * packed into 64 bits:
+ *
+ * @code{.c}
+ //
+ // 64-bit send tag formed by packing the triple:
+ //
+ // ( context_id_16bits | source_rank_16bits | send_tag_32bits )
+ //
+ stag = ( (((context_id)&0xffffULL)<<48)| \
+ (((source_rank)&0xffffULL)<<32)| \
+ (((send_tag)&0xffffffffULL)) );
+ @endcode
+ *
+ * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel
+ * masking bits against a list of send tags and returns the first successful
+ * match. Zero bits in the @c tagsel can be used to indicate wildcarded bits
+ * in the 64-bit tag which can be useful for implementing MPI's
+ * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG. Following the example bit splicing in
+ * the above @c stag example:
+ *
+ * @code{.c}
+ // Example MPI implementation where MPI_COMM_WORLD implemented as 0x3333
+
+ // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD
+ rtag = 0x3333000000000007;
+ rtagsel = 0xffff0000ffffffff;
+
+ // MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+ rtag = 0x3333000300000000;
+ rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag
+
+ // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+ rtag = 0x3333000300000000;
+ rtagsel = 0xffff000080000000; // can't ignore sign bit in tag
+ @endcode
+ *
+ *
+ * Applications that do not follow tag matching semantics can simply always
+ * pass a value of @c 0 for @c rtagsel, which will always yield a successful
+ * match to the first preposted buffer. If a message cannot be matched to any
+ * of the preposted buffers, the message is delivered as an unexpected
+ * message.
+ *
+ * @section mq_receive MQ Message Reception
+ *
+ * MQ messages are either received as @e expected or @e unexpected: @li The
+ * received message is @e expected if the incoming message tag matches the
+ * combination of tag and tag selector of at least one of the user-provided
+ * receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * @li The received message is @e unexpected if the incoming message tag @b
+ * doesn't match any combination of tag and tag selector from all the
+ * user-provided receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * Unexpected messages are messages that the MQ library buffers until the
+ * user provides a receive buffer that can match the unexpected message.
+ * With Matched Queues and MPI alike, unexpected messages can occur as a
+ * side-effect of the programming model, whereby the arrival of messages can be
+ * slightly out of step with the ordering in which the user
+ * provides receive buffers. Unexpected messages can also be triggered by the
+ * difference between the rate at which a sender produces messages and the rate
+ * at which a paired receiver can post buffers and hence consume the messages.
+ *
+ * In all cases, too many @e unexpected messages will negatively affect
+ * performance. Users can employ some of the following mechanisms to reduce
+ * the effect of added memory allocations and copies that result from
+ * unexpected messages:
+ * @li If and when possible, receive buffers should be posted as early as
+ * possible and ideally before calling into the progress engine.
+ * @li Use of rendezvous messaging that can be controlled with
+ * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These
+ * options default to values determined to make effective use of
+ * bandwidth and are hence not advisable for all communication message
+ * sizes, but rendezvous messages inherently prevent unexpected
+ * messages by synchronizing the sender with the receiver beforehand.
+ * @li The amount of memory that is allocated to handle unexpected messages
+ * can be bounded by adjusting the Global @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ * option.
+ * @li MQ statistics, such as the amount of received unexpected messages and
+ * the aggregate amount of unexpected bytes are available in the @ref
+ * psm2_mq_stats structure.
+ *
+ * Whenever a match occurs, whether the message is expected or unexpected, it
+ * is generally up to the user to ensure that the message is not truncated.
+ * Message truncation occurs when the size of the preposted buffer is less than
+ * the size of the incoming matched message. MQ will correctly handle
+ * message truncation by always copying the appropriate amount of bytes as to
+ * not overwrite any data. While it is valid to send less data than the amount
+ * of data that has been preposted, messages that are truncated will be marked
+ * @ref PSM2_MQ_TRUNCATION as part of the error code in the message status
+ * structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t).
+ *
+ * @section mq_completion MQ Completion Semantics
+ *
+ * Message completion in Matched Queues follows local completion semantics.
+ * When sending an MQ message, it is deemed complete when MQ guarantees that
+ * the source data has been sent and that the entire input source data memory
+ * location can be safely overwritten. As with standard Message-Passing,
+ * MQ does not make any remote completion guarantees for sends. MQ does
+ * however, allow a sender to synchronize with a receiver to send a synchronous
+ * message which sends a message only after a matching receive buffer has been
+ * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
+ *
+ * A receive is deemed complete after it has matched its associated receive
+ * buffer with an incoming send and that the data from the send has been
+ * completely delivered to the receive buffer.
+ *
+ * @section mq_progress MQ Progress Requirements
+ *
+ * Progress on MQs must be @e explicitly ensured by the user for correctness.
+ * The progress requirement holds even if certain areas of the MQ
+ * implementation require less network attention than others, or if progress
+ * may internally be guaranteed through interrupts. The main polling function,
+ * @ref psm2_poll, is the most general form of ensuring progress on a given
+ * endpoint. Calling @ref psm2_poll ensures that progress is made over all the
+ * MQs and other components instantiated over the endpoint passed to @ref
+ * psm2_poll.
+ *
+ * While @ref psm2_poll is the only way to directly ensure progress, other MQ
+ * functions will conditionally ensure progress depending on how they are used:
+ *
+ * @li @ref psm2_mq_wait employs polling and waits until the request is
+ * completed. For blocking communication operations where the caller is
+ * waiting on a single send or receive to complete, psm2_mq_wait usually
+ * provides the best responsiveness in terms of latency.
+ *
+ * @li @ref psm2_mq_test can test a particular request for completion, but @b
+ * never directly or indirectly ensures progress as it only tests the
+ * completion status of a request, nothing more. See functional documentation
+ * in @ref psm2_mq_test for a detailed discussion.
+ *
+ * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion
+ * queue is empty and will not ensure progress as long as the completion queue
+ * is non-empty. Users that always aggressively process all elements of the MQ
+ * completion queue as part of their own progress engine will indirectly always
+ * ensure MQ progress. The ipeek mechanism is the preferred way for
+ * ensuring progress when many non-blocking requests are in flight since ipeek
+ * returns requests in the order in which they complete. Depending on how the
+ * user initiates and completes communication, this may be preferable to
+ * calling other progress functions on individual requests.
+ */
+
+/*! @defgroup mq PSM Matched Queues
+ *
+ * @{
+ */
+
+/** @brief Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * @param[in] ep Endpoint over which to initialize Matched Queue
+ * @param[in] tag_order_mask Order mask hint to let MQ know what bits of the
+ * send tag are required to maintain MQ message
+ * order. In MPI parlance, this mask sets the bits
+ * that store the context (or communicator ID). The
+ * user can choose to pass PSM2_MQ_ORDERMASK_NONE or
+ * PSM2_MQ_ORDERMASK_ALL to tell MQ to respectively
+ * provide no ordering guarantees or to provide
+ * ordering over all messages by ignoring the
+ * contexts of the send tags.
+ * @param[in] opts Set of options for Matched Queue
+ * @param[in] numopts Number of options passed
+ * @param[out] mq User-supplied storage to return the Matched Queue handle
+ * associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ * associated to an endpoint, but options are only considered the first
+ * time the function is called.
+ *
+ * @post The user obtains a handle to an instantiated Match Queue.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A new Matched Queue has been instantiated across all the
+ * members of the group.
+ *
+ * @code{.c}
+ int try_open_endpoint_and_initialize_mq(
+ psm2_ep_t *ep, // endpoint handle
+ psm2_epid_t *epid, // unique endpoint ID
+ psm2_uuid_t job_uuid, // unique job uuid, for ep_open
+ psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+ uint64_t communicator_bits) // Where we store our communicator or
+ // context bits in the 64-bit tag.
+ {
+ // Simplified open, see psm2_ep_open documentation for more info
+ psm2_ep_open(job_uuid,
+ NULL, // no options
+ ep, epid);
+
+ // We initialize a matched queue by telling PSM the bits that are
+ // order-significant in the tag. Point-to-point ordering will not be
+ // maintained between senders where the communicator bits are not the
+ // same.
+ psm2_mq_init(ep,
+ communicator_bits,
+ NULL, // no other MQ options
+ 0, // 0 options passed
+ mq); // newly initialized matched Queue
+
+ return 1;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq);
+
+#define PSM2_MQ_ORDERMASK_NONE 0ULL
+ /**< Used to initialize MQ and disable all MQ message ordering
+ * guarantees (this mask may prevent the use of MQ to maintain matched
+ * message envelope delivery required in MPI). */
+
+#define PSM2_MQ_ORDERMASK_ALL 0xffffffffffffffffULL
+ /**< Used to initialize MQ with no message ordering hints, which forces
+ * MQ to maintain order over all messages */
+
+/** @brief Finalize (close) an MQ handle
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A given Matched Queue has been freed and any future
+ * use of the handle produces undefined results.
+ */
+psm2_error_t
+psm2_mq_finalize(psm2_mq_t mq);
+
+#define PSM2_MQ_TAG_ELEMENTS 3
+ /**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t
+ * type. */
+
+/** @struct psm2_mq_tag
+ ** @brief MQ Message tag
+ *
+ * Extended message tag type introduced in PSM 2.0. The previous 64 bit tag
+ * values are replaced by a struct containing three 32 bit tag values for a
+ * total of 96 bits. Matching semantics are unchanged from the previous 64-bit
+ * matching scheme; the only difference is that 96 bits are matched instead of
+ * 64. For interoperability with existing PSM routines, 64 bit tags are
+ * extended to a 96 bit tag by setting the upper 32 bits (tag[2] or tag2) to
+ * zero. Other than this caveat, all of the existing routines using 64-bit
+ * tags are interchangeable with PSM2 routines using this psm2_mq_tag_t type.
+ * For example, a message sent using @ref psm2_mq_send can be received using
+ * @ref psm2_mq_irecv2, provided the tags match as described above.
+ */
+typedef
+//struct psm2_mq_tag {
+union psm2_mq_tag {
+// union {
+ uint32_t tag[PSM2_MQ_TAG_ELEMENTS] __attribute__ ((aligned(16)));
+ /**< 3 x 32bit array representation of @ref psm2_mq_tag */
+ struct {
+ uint32_t tag0; /**< 1 of 3 uint32_t tag values */
+ uint32_t tag1; /**< 2 of 3 uint32_t tag values */
+ uint32_t tag2; /**< 3 of 3 uint32_t tag values */
+ };
+// };
+} psm2_mq_tag_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm2_mq_status {
+ /** Sender's original message tag (receive reqs only) */
+ uint64_t msg_tag;
+ /** Sender's original message length (receive reqs only) */
+ uint32_t msg_length;
+ /** Actual number of bytes transferred (receive reqs only) */
+ uint32_t nbytes;
+ /** MQ error code for communication operation */
+ psm2_error_t error_code;
+ /** User-associated context for send or receive */
+ void *context;
+} psm2_mq_status_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations. For
+ * wait and test functions, MQ fills in the structure upon completion. Upon
+ * completion, requests fill in every field of the status structure with the
+ * exception of the nbytes field, which is only valid for receives. Version 2
+ * of the status type contains an @ref psm2_mq_tag_t type to represent the tag
+ * instead of a 64-bit integer value and is for use with PSM v2 routines.
+ */
+
+typedef
+struct psm2_mq_status2 {
+ /** Remote peer's epaddr */
+ psm2_epaddr_t msg_peer;
+ /** Sender's original message tag */
+ psm2_mq_tag_t msg_tag;
+ /** Sender's original message length */
+ uint32_t msg_length;
+ /** Actual number of bytes transferred (receiver only) */
+ uint32_t nbytes;
+ /** MQ error code for communication operation */
+ psm2_error_t error_code;
+ /** User-associated context for send or receive */
+ void *context;
+} psm2_mq_status2_t;
+
+/** @brief PSM2 Communication handle (opaque) */
+typedef struct psm2_mq_req *psm2_mq_req_t;
+
+/*! @} */
+/*! @ingroup mq
+ * @defgroup mq_options PSM Matched Queue Options
+ * @{
+ *
+ * MQ options can be modified at any point at runtime, unless otherwise noted.
+ * The following example shows how to retrieve the current message size at
+ * which messages are sent as synchronous.
+ *
+ * @code{.c}
+ uint32_t get_hfirv_size(psm2_mq_t mq)
+ {
+ uint32_t rvsize;
+ psm2_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize);
+ return rvsize;
+ }
+ @endcode
+ */
+
+/** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve. Possible values are:
+ * @li @ref PSM2_MQ_RNDV_HFI_SZ
+ * @li @ref PSM2_MQ_RNDV_SHM_SZ
+ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that can be used to store the value of
+ * the option to be set. It is up to the user to ensure that the
+ * pointer points to a memory location large enough to accommodate
+ * the value associated to the type. Each option documents the size
+ * associated to its value.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ */
+psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value);
+
+/** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve. Possible values are:
+ * @li @ref PSM2_MQ_RNDV_HFI_SZ
+ * @li @ref PSM2_MQ_RNDV_SHM_SZ
+ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that contains the value to be updated
+ * for the supplied option number. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option
+ * (currently no MQ options are read-only).
+ */
+psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
+
+/*! @} */
+/*! @ingroup mq
+ * @{
+ */
+
+#define PSM2_MQ_FLAG_SENDSYNC 0x01
+ /**< MQ Send Force synchronous send */
+
+#define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL))
+ /**< MQ request completion value */
+
+#define PSM2_MQ_ANY_ADDR ((psm2_epaddr_t)NULL)
+ /**< MQ receive from any source epaddr */
+
+/** @brief Post a receive to a Matched Queue with tag selection criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c tag and @c
+ * tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ * upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ * be used for explicitly controlling message receive
+ * completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ * messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ * match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with source and tag selection
+ * criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c src, @c tag
+ * and @c tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status2_t
+ * upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ * be used for explicitly controlling message receive
+ * completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ * messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ * match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with matched request
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. The provided request should already be matched using the @ref
+ * psm2_mq_improbe or @ref psm2_mq_improbe2 routines. It is an error to pass a
+ * request that has not already been matched by one of those routines.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ * upon completion
+ * @param[inout] reqo PSM MQ Request handle matched previously by a matched
+ * probe routine (@ref psm2_mq_improbe or @ref
+ * psm2_mq_improbe2), also to be used for explicitly
+ * controlling message receive completion.
+ *
+ * @post The supplied receive buffer is given to MQ to deliver the matched
+ * message.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *reqo);
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len);
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ * psm2_mq_status_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+ psm2_mq_req_t
+ non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+ const void *buf, uint32_t len,
+ int context_id, int send_tag, const my_request_t *req)
+ {
+ psm2_mq_req_t req_mq;
+ // Set up our send tag, assume that "my_rank" is global and represents
+ // the rank of this process in the job
+ uint64_t tag = ( ((context_id & 0xffff) << 48) |
+ ((my_rank & 0xffff) << 32) |
+ ((send_tag & 0xffffffff)) );
+
+ psm2_mq_isend(mq, dest_ep,
+ 0, // no flags
+ tag,
+ buf,
+ len,
+ req, // this req is available in psm2_mq_status_t when one
+ // of the synchronization functions is called.
+ &req_mq);
+ return req_mq;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * @param[in] stag Message Send Tag, array of three 32-bit values.
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ * psm2_mq_status2_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+ psm2_mq_req_t
+ non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+ const void *buf, uint32_t len,
+ int context_id, int send_tag, const my_request_t *req)
+ {
+ psm2_mq_req_t req_mq;
+ // Set up our send tag, assume that "my_rank" is global and represents
+ // the rank of this process in the job
+ psm2_mq_tag_t tag;
+ tag.tag[0] = send_tag;
+ tag.tag[1] = my_rank;
+ tag.tag[2] = context_id;
+
+ psm2_mq_isend(mq, dest_ep,
+ 0, // no flags
+ &tag,
+ buf,
+ len,
+ req, // this req is available in psm2_mq_status2_t when one
+ // of the synchronization functions is called.
+ &req_mq);
+ return req_mq;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context,
+ psm2_mq_req_t *req);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received. The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ * unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+ psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received. The message is not fully matched until the
+ * user provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv. Probing for messages may be useful if the size
+ * of the message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ * unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag
+ * selectors has been received. If a match is successful, the message is
+ * removed from the matching queue and returned as a request object. The
+ * message can be received using @ref psm2_mq_imrecv. It is erroneous to use
+ * the request object returned by @ref psm2_mq_improbe for any purpose other
+ * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if
+ * the size of the message to be received is unknown, in which case its size
+ * will be available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ * message.
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req,
+ psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag selectors
+ * has been received. If a match is successful, the message is removed from
+ * the matching queue and returned as a request object. The message can be
+ * received using @ref psm2_mq_imrecv. It is erroneous to use the request
+ * object returned by @ref psm2_mq_improbe for any purpose other than passing to
+ * @ref psm2_mq_imrecv. Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be available
+ * in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched
+ * message.
+ * @param[out] status Upon return, @c status is filled with information
+ * regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+ psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo,
+ psm2_mq_status2_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ * PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ * ready for completion. If @c status is non-NULL, it is also
+ * updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ * are no further requests ready for completion.
+ * The contents of @c req and @c status remain
+ * unchanged.
+ * @code{.c}
+ // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll
+ // We return the amount of non-blocking requests that we've completed
+ int main_progress_loop(psm2_mq_t mq)
+ {
+ int num_completed = 0;
+ psm2_mq_req_t req;
+ psm2_mq_status_t status;
+ psm2_error_t err;
+ my_request_t *myreq;
+
+ do {
+ err = psm2_mq_ipeek(mq, &req,
+ NULL); // No need for status in ipeek here
+ if (err == PSM2_MQ_NO_COMPLETIONS)
+ return num_completed;
+ else if (err != PSM2_OK)
+ goto errh;
+ num_completed++;
+
+ // We obtained 'req' at the head of the completion queue. We can
+ // now free the request with PSM and obtain our original request
+ // from the status' context
+ err = psm2_mq_test(&req, // will be marked as invalid
+ &status); // we need the status
+ myreq = (my_request_t *) status.context;
+
+ // handle the completion for myreq whether myreq is a posted receive
+ // or a non-blocking send.
+ }
+ while (1);
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ * PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ * ready for completion. If @c status is non-NULL, it is also
+ * updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ * are no further requests ready for completion.
+ * The contents of @c req and @c status remain
+ * unchanged.
+ * @code{.c}
+ // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll
+ // We return the amount of non-blocking requests that we've completed
+ int main_progress_loop(psm2_mq_t mq)
+ {
+ int num_completed = 0;
+ psm2_mq_req_t req;
+ psm2_mq_status2_t status;
+ psm2_error_t err;
+ my_request_t *myreq;
+
+ do {
+ err = psm2_mq_ipeek2(mq, &req,
+ NULL); // No need for status in ipeek here
+ if (err == PSM2_MQ_NO_COMPLETIONS)
+ return num_completed;
+ else if (err != PSM2_OK)
+ goto errh;
+ num_completed++;
+
+ // We obtained 'req' at the head of the completion queue. We can
+ // now free the request with PSM and obtain our original request
+ // from the status' context
+ err = psm2_mq_test2(&req, // will be marked as invalid
+ &status); // we need the status
+ myreq = (my_request_t *) status.context;
+
+ // handle the completion for myreq whether myreq is a posted receive
+ // or a non-blocking send.
+ }
+ while (1);
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status);
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ * completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ * @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ * completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ * @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensure progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ * PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ * unchanged.
+ *
+ * @code{.c}
+ // Function that returns the first completed request in an array
+ // of requests.
+ void *
+ user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+ {
+ int i;
+ void *context = NULL;
+
+ // Ensure progress only once
+ psm2_poll(ep);
+
+ // Test for at least one completion and return its context
+ psm2_mq_status_t stat;
+ for (i = 0; i < nreqs; i++) {
+ if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) {
+ context = stat.context;
+ break;
+ }
+ }
+ return context;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensure progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ * or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ * the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ * ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ * PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ * psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ * unchanged.
+ *
+ * @code{.c}
+ // Function that returns the first completed request in an array
+ // of requests.
+ void *
+ user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+ {
+ int i;
+ void *context = NULL;
+
+ // Ensure progress only once
+ psm2_poll(ep);
+
+ // Test for at least one completion and return its context
+ psm2_mq_status2_t stat;
+ for (i = 0; i < nreqs; i++) {
+ if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) {
+ context = stat.context;
+ break;
+ }
+ }
+ return context;
+ }
+ @endcode
+ */
+psm2_error_t
+psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm2_mq_irecv. It is currently illegal to cancel a send request initiated
+ * with @ref psm2_mq_isend.
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_irecv.
+ *
+ * @post Whether the cancel is successful or not, the user returns the
+ * request to the library by way of @ref psm2_mq_test or @ref
+ * psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * as long as the requests that are used in each of the calls are
+ * associated with different MQs.
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (@ref psm2_error_register_handler):
+ *
+ * @retval PSM2_OK The request could be successfully cancelled such that the
+ * preposted receive buffer could be removed from the preposted
+ * receive queue before a match occurred. The associated @c
+ * request remains unchanged and the user must still return
+ * the storage to the MQ library.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully cancelled
+ * since the preposted receive buffer has already
+ * matched an incoming message. The @c request
+ * remains unchanged.
+ *
+ */
+psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req);
+
+/*! @brief MQ statistics structure */
+struct psm2_mq_stats {
+ /** Bytes received into a matched user buffer */
+ uint64_t rx_user_bytes;
+ /** Messages received into a matched user buffer */
+ uint64_t rx_user_num;
+ /** Bytes received into an unmatched system buffer */
+ uint64_t rx_sys_bytes;
+ /** Messages received into an unmatched system buffer */
+ uint64_t rx_sys_num;
+
+ /** Total Messages transmitted (shm and hfi) */
+ uint64_t tx_num;
+ /** Messages transmitted eagerly */
+ uint64_t tx_eager_num;
+ /** Bytes transmitted eagerly */
+ uint64_t tx_eager_bytes;
+ /** Messages transmitted using expected TID mechanism */
+ uint64_t tx_rndv_num;
+ /** Bytes transmitted using expected TID mechanism */
+ uint64_t tx_rndv_bytes;
+ /** Messages transmitted (shm only) */
+ uint64_t tx_shm_num;
+ /** Messages received through shm */
+ uint64_t rx_shm_num;
+
+ /** Number of system buffers allocated */
+ uint64_t rx_sysbuf_num;
+ /** Bytes allocated for system buffers */
+ uint64_t rx_sysbuf_bytes;
+
+ /** Internally reserved for future use */
+ uint64_t _reserved[16];
+};
+
+#define PSM2_MQ_NUM_STATS 13 /**< How many stats are currently used in @ref psm2_mq_stats */
+
+/*! @see psm2_mq_stats */
+ typedef struct psm2_mq_stats psm2_mq_stats_t;
+
+/** @brief Retrieve statistics from an instantiated MQ */
+ void
+ psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats);
+
+/*! @} */
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm_am.c b/psm_am.c
new file mode 100644
index 0000000..df193da
--- /dev/null
+++ b/psm_am.c
@@ -0,0 +1,269 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+ and copied out in __psm2_am_get_parameters. When debugging is enabled,
+ various assertions reference these parameters for sanity checking. */
+struct psm2_am_parameters psmi_am_parameters = { 0 };
+
+static int _ignore_handler(PSMI_AM_ARGS_DEFAULT)
+{
+ return 0;
+}
+
+int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT)
+{
+ abort();
+ return 0;
+}
+
+static void psmi_am_min_parameters(struct psm2_am_parameters *dest,
+ struct psm2_am_parameters *src)
+{
+ dest->max_handlers = min(dest->max_handlers, src->max_handlers);
+ dest->max_nargs = min(dest->max_nargs, src->max_nargs);
+ dest->max_request_short =
+ min(dest->max_request_short, src->max_request_short);
+ dest->max_reply_short =
+ min(dest->max_reply_short, src->max_reply_short);
+}
+
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep)
+{
+ int i;
+ psm2_am_handler_fn_t *am_htable;
+ struct psm2_am_parameters params;
+
+ psmi_am_parameters.max_handlers = INT_MAX;
+ psmi_am_parameters.max_nargs = INT_MAX;
+ psmi_am_parameters.max_request_short = INT_MAX;
+ psmi_am_parameters.max_reply_short = INT_MAX;
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ ep->ptl_self.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ ep->ptl_ips.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ ep->ptl_amsh.am_get_parameters(ep, &params);
+ psmi_am_min_parameters(&psmi_am_parameters, &params);
+ }
+
+ ep->am_htable =
+ psmi_malloc(ep, UNDEFINED,
+ sizeof(psm2_am_handler_fn_t) * PSMI_AM_NUM_HANDLERS);
+ if (ep->am_htable == NULL)
+ return PSM2_NO_MEMORY;
+
+ am_htable = (psm2_am_handler_fn_t *) ep->am_htable;
+ for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++)
+ am_htable[i] = _ignore_handler;
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+__psm2_am_register_handlers(psm2_ep_t ep,
+ const psm2_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx)
+{
+ int i, j;
+
+ PSM2_LOG_MSG("entering");
+ /* For now just assign any free one */
+ for (i = 0, j = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
+ if (ep->am_htable[i] == _ignore_handler) {
+ ep->am_htable[i] = handlers[j];
+ handlers_idx[j] = i;
+ if (++j == num_handlers) /* all registered */
+ break;
+ }
+ }
+
+ if (j < num_handlers) {
+ /* Not enough free handlers, restore unused handlers */
+ for (i = 0; i < j; i++)
+ ep->am_htable[handlers_idx[i]] = _ignore_handler;
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
+ "Insufficient "
+ "available AM handlers: registered %d of %d requested handlers",
+ j, num_handlers);
+ }
+ else {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+}
+PSMI_API_DECL(psm2_am_register_handlers)
+
+psm2_error_t
+__psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ psm2_error_t err;
+ ptl_ctl_t *ptlc = epaddr->ptlctl;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(epaddr != NULL);
+ psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+ psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+ psmi_assert(nargs > 0 ? args != NULL : 1);
+ psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short);
+ psmi_assert(len > 0 ? src != NULL : 1);
+
+ PSMI_LOCK(ptlc->ep->mq->progress_lock);
+
+ err = ptlc->am_short_request(epaddr, handler, args,
+ nargs, src, len, flags, completion_fn,
+ completion_ctxt);
+ PSMI_UNLOCK(ptlc->ep->mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_am_request_short)
+
+psm2_error_t
+__psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ psm2_error_t err;
+ struct psmi_am_token *tok;
+ psm2_epaddr_t epaddr;
+ ptl_ctl_t *ptlc;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_always(token != NULL);
+ psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+ psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+ psmi_assert(nargs > 0 ? args != NULL : 1);
+ psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short);
+ psmi_assert(len > 0 ? src != NULL : 1);
+
+ tok = (struct psmi_am_token *)token;
+ epaddr = tok->epaddr_incoming;
+ ptlc = epaddr->ptlctl;
+
+ /* No locking here since we are already within handler context and already
+ * locked */
+
+ err = ptlc->am_short_reply(token, handler, args,
+ nargs, src, len, flags, completion_fn,
+ completion_ctxt);
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_am_reply_short)
+
+psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out)
+{
+ struct psmi_am_token *tok;
+
+ PSM2_LOG_MSG("entering");
+ if (token == NULL || epaddr_out == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid %s parameters", __FUNCTION__);
+ }
+
+ tok = (struct psmi_am_token *)token;
+ *epaddr_out = tok->epaddr_incoming;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_source)
+
+psm2_error_t
+__psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters,
+ size_t sizeof_parameters_in,
+ size_t *sizeof_parameters_out)
+{
+ size_t s;
+
+ PSM2_LOG_MSG("entering");
+ if (parameters == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid %s parameters", __FUNCTION__);
+ }
+
+ memset(parameters, 0, sizeof_parameters_in);
+ s = min(sizeof(psmi_am_parameters), sizeof_parameters_in);
+ memcpy(parameters, &psmi_am_parameters, s);
+ *sizeof_parameters_out = s;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_parameters)
diff --git a/psm_am_internal.h b/psm_am_internal.h
new file mode 100644
index 0000000..29edfb8
--- /dev/null
+++ b/psm_am_internal.h
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSM2_AM_INTERNAL_H
+#define _PSM2_AM_INTERNAL_H
+
+#define PSMI_AM_MAX_ARGS 10
+#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */
+
+#define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token, \
+ psm2_amarg_t *args, int nargs, \
+ void *src, uint32_t len
+
/* Per-message token handed to active-message handlers.  PTLs embed this
 * at the start of their own larger token structures (see trailing note),
 * so a psm2_am_token_t can always be cast to struct psmi_am_token *. */
struct psmi_am_token {
	psm2_epaddr_t epaddr_incoming;	/* endpoint address of the sender */
	uint32_t flags;
	/* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */
	uint32_t can_reply;

	/* PTLs may add other stuff here */
};
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+ and copied out in __psm2_am_get_parameters. When debugging is enabled,
+ various assertions reference these parameters for sanity checking. */
+extern struct psm2_am_parameters psmi_am_parameters;
+
/* Look up the handler function registered at handler_idx on endpoint ep.
 * The index is masked to the table size (PSMI_AM_NUM_HANDLERS is a power
 * of two), so out-of-range indices wrap instead of overrunning the table. */
PSMI_ALWAYS_INLINE(psm2_am_handler_fn_t
		   psm_am_get_handler_function(psm2_ep_t ep,
					       psm2_handler_t handler_idx))
{
	int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1);
	psm2_am_handler_fn_t fn = (psm2_am_handler_fn_t) ep->am_htable[hidx];
	psmi_assert_always(fn != NULL);	/* slots are expected to be populated */
	return fn;
}
+
+/* PSM internal initialization */
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep);
+
+#endif
diff --git a/psm_context.c b/psm_context.c
new file mode 100644
index 0000000..21bc893
--- /dev/null
+++ b/psm_context.c
@@ -0,0 +1,817 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "psm_user.h"
+
+#define HFI_USERINIT_RETRY_MAX 3
+#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1
+ustatic int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid);
+MOCK_DCL_EPILOGUE(psmi_sharedcontext_params);
+static int psmi_get_hfi_selection_algorithm(void);
+ustatic psm2_error_t psmi_init_userinfo_params(psm2_ep_t ep,
+ int unit_id,
+ psm2_uuid_t const unique_job_key,
+ struct hfi1_user_info_dep *user_info);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable)
+{
+ int poll_type;
+ int ret;
+
+ if ((enable && (context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)) ||
+ (!enable && !(context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)))
+ return PSM2_OK;
+
+ if (enable)
+ poll_type = HFI1_POLL_TYPE_URGENT;
+ else
+ poll_type = 0;
+
+ ret = hfi_poll_type(context->ctrl, poll_type);
+
+ if (ret != 0)
+ return PSM2_EP_NO_RESOURCES;
+ else {
+ if (enable)
+ context->runtime_flags |= PSMI_RUNTIME_INTR_ENABLED;
+ else
+ context->runtime_flags &= ~PSMI_RUNTIME_INTR_ENABLED;
+
+ return PSM2_OK;
+ }
+}
+
+int psmi_context_interrupt_isenabled(psmi_context_t *context)
+{
+ return context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED;
+}
+
+/* Returns 1 when all of the active units have their free contexts
+ * equal the number of contexts. This is an indication that no
+ * jobs are currently running.
+ *
 * Note that this code is clearly racy: two or more processes may run it
 * concurrently, and this observation happens earlier in time than the
 * decision about which context to assign, which in turn happens earlier
 * than the moment the context is actually assigned. Once a context is
 * finally assigned, the "nfreectxts" value observed below changes.)
+ */
static int psmi_all_active_units_have_max_freecontexts(int nunits)
{
	int unit;

	for (unit = 0; unit < nunits; unit++) {
		int64_t free_ctxts = 0, total_ctxts = 0;

		/* Only active units are considered. */
		if (hfi_get_unit_active(unit) <= 0)
			continue;

		/* When both sysfs counters read successfully and disagree,
		 * at least one context on this unit is already in use. */
		if (!hfi_sysfs_unit_read_s64(unit, "nctxts", &total_ctxts, 0) &&
		    !hfi_sysfs_unit_read_s64(unit, "nfreectxts", &free_ctxts, 0) &&
		    free_ctxts != total_ctxts)
			return 0;
	}
	return 1;
}
+
/* Returns the integer value of an environment variable, or 0 if the
 * variable is unset, empty, non-numeric, negative, or out of int range. */
static int psmi_get_envvar(const char *env)
{
	const char *env_val = getenv(env);
	char *end;
	long r;

	if (env_val == NULL || *env_val == '\0')
		return 0;

	/* Use strtol instead of atoi so overflow and garbage input are
	 * detected instead of yielding undefined/implementation-defined
	 * results.  Like atoi, trailing non-digits are tolerated. */
	errno = 0;
	r = strtol(env_val, &end, 10);
	if (end == env_val || errno == ERANGE || r < 0 || r > INT_MAX)
		return 0;
	return (int)r;
}
+
+/* returns the 8-bit hash value of an uuid. */
+static inline
+uint8_t
+psmi_get_uuid_hash(psm2_uuid_t const uuid)
+{
+ int i;
+ uint8_t hashed_uuid = 0;
+
+ for (i=0; i < sizeof(psm2_uuid_t); ++i)
+ hashed_uuid ^= *((uint8_t const *)uuid + i);
+
+ return hashed_uuid;
+}
+
/* Returns the NUMA node id of the CPU the calling thread is currently
 * running on, or -EINVAL if either the CPU or its node cannot be
 * determined. */
int psmi_get_current_proc_location()
{
	int core_id, node_id;

	/* CPU this thread last ran on (may change after the call returns). */
	core_id = sched_getcpu();
	if (core_id < 0)
		return -EINVAL;

	/* Map the CPU to its NUMA node via libnuma. */
	node_id = numa_node_of_cpu(core_id);
	if (node_id < 0)
		return -EINVAL;

	return node_id;
}
+
/* Compute the inclusive [*unit_start, *unit_end] search window used when
 * spreading HFI selection across units.  With a single local rank and all
 * active units fully free, the whole unit range is searched in order;
 * otherwise the start unit is derived from the local rank id plus a hash
 * of the job key (mod nunits), and unit_end wraps to the unit just before
 * the start so the caller's wrap-around loop covers every unit. */
static void
psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start,
			  long *unit_end, int nunits)
{
	/* if the number of ranks on the host is 1 and ... */
	if ((psmi_get_envvar("MPI_LOCALNRANKS") == 1) &&
	    /*
	     * All of the active units have free contexts equal the
	     * number of contexts.
	     */
	    psmi_all_active_units_have_max_freecontexts(nunits)) {
		/* we start looking at unit 0, and end at nunits-1: */
		*unit_start = 0;
		*unit_end = nunits - 1;
	} else {
		/* else, we are going to look at:
		   (a hash of the job key plus the local rank id) mod nunits. */

		*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
			       psmi_get_uuid_hash(job_key)) % nunits;
		if (*unit_start > 0)
			*unit_end = *unit_start - 1;
		else
			*unit_end = nunits-1;
	}
}
+
/* Determine the range of unit ids [*unit_start, *unit_end] that
 * psmi_context_open() will probe (with wrap-around) when opening a
 * context, and record the selection algorithm in context->user_info.
 *
 * If the user pinned a unit via HFI_UNIT (unit_param >= 0) the range is
 * that single unit.  Otherwise, for round-robin with multiple active
 * units we first try to restrict selection to active units on the
 * caller's own NUMA node, spreading ranks across them; failing that (or
 * for round-robin-all) psmi_spread_hfi_selection() picks the window.
 *
 * Returns PSM2_OK, or PSM2_EP_DEVICE_FAILURE for a negative, non-ANY
 * unit_param. */
static
psm2_error_t
psmi_compute_start_and_end_unit(psmi_context_t *context,long unit_param,
				int nunitsactive,int nunits,psm2_uuid_t const job_key,
				long *unit_start,long *unit_end)
{
	int node_id, unit_id, found = 0;
	int saved_hfis[nunits];	/* VLA: active units on our NUMA node */
	context->user_info.hfi1_alg = HFI1_ALG_ACROSS;
	/* if the user did not set HFI_UNIT then ... */
	if (unit_param == HFI_UNIT_ID_ANY)
	{
		/* Get the actual selection algorithm from the environment: */
		context->user_info.hfi1_alg = psmi_get_hfi_selection_algorithm();
		/* If round-robin is selection algorithm and ... */
		if ((context->user_info.hfi1_alg == HFI1_ALG_ACROSS) &&
		    /* there are more than 1 active units then ... */
		    (nunitsactive > 1))
		{
			/*
			 * Pick first HFI we find on same root complex
			 * as current task. If none found, fall back to
			 * load-balancing algorithm.
			 */
			node_id = psmi_get_current_proc_location();
			if (node_id >= 0) {
				for (unit_id = 0; unit_id < nunits; unit_id++) {
					if (hfi_get_unit_active(unit_id) <= 0)
						continue;

					if (hfi_sysfs_unit_read_node_s64(unit_id) == node_id) {
						saved_hfis[found] = unit_id;
						found++;
						_HFI_VDBG("Picking unit: %d for current task"
							  " which is on node:%d\n", unit_id, node_id);
					}
				}

				/*
				 * Spread HFI selection between units if
				 * we find more than one within a socket.
				 */
				if (found > 1) {
					/* index into saved_hfis by rank/job hash... */
					*unit_start = (psmi_get_envvar("MPI_LOCALRANKID") +
						       psmi_get_uuid_hash(job_key)) % found;

					/* ...then collapse the window to that one unit. */
					*unit_start = *unit_end = saved_hfis[*unit_start];
				} else if (found == 1) {
					*unit_start = *unit_end = saved_hfis[0];
				}
			}

			/* No NUMA info or no local unit: spread over all units. */
			if (node_id < 0 || !found) {
				psmi_spread_hfi_selection(job_key, unit_start,
							  unit_end, nunits);
			}
		} else if ((context->user_info.hfi1_alg == HFI1_ALG_ACROSS_ALL) &&
			   (nunitsactive > 1)) {
			psmi_spread_hfi_selection(job_key, unit_start,
						  unit_end, nunits);
		}
		else {
			*unit_start = 0;
			*unit_end = nunits - 1;
		}
	} else if (unit_param >= 0) {
		/* the user specified HFI_UNIT, we use it. */
		*unit_start = *unit_end = unit_param;
	} else {
		psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				  "PSM2 can't open unit: %ld for reading and writing",
				  unit_param);
		return PSM2_EP_DEVICE_FAILURE;
	}

	return PSM2_OK;
}
+
/*
 * Open a driver context for endpoint 'ep'.
 *
 * Chooses a unit search window via psmi_compute_start_and_end_unit(),
 * then walks that window (wrapping modulo nunits) trying to open each
 * active unit and assign a context with hfi_userinit(), retrying the
 * userinit up to HFI_USERINIT_RETRY_MAX times per unit.  On success it
 * fills in 'context', records the endpoint's LID/GID/MTU identity, and
 * constructs the endpoint epid for the configured epid version.
 *
 * unit_param: HFI_UNIT_ID_ANY or an explicit unit id (from HFI_UNIT).
 * timeout_ns: open timeout, converted to milliseconds for the driver.
 *
 * Returns PSM2_OK or a PSM2_EP_* error code.
 */
psm2_error_t
psmi_context_open(const psm2_ep_t ep, long unit_param, long port,
		  psm2_uuid_t const job_key, int64_t timeout_ns,
		  psmi_context_t *context)
{
	long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev;
	int lid, sc, vl;
	uint64_t gid_hi, gid_lo;
	char dev_name[MAXPATHLEN];
	psm2_error_t err = PSM2_OK;
	uint32_t hfi_type;
	int nunits = hfi_get_num_units(), nunitsactive=0;

	/*
	 * If shared contexts are enabled, try our best to schedule processes
	 * across one or many devices
	 */

	/* if no units, then no joy. */
	if (nunits <= 0)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 no hfi units are available");
		goto ret;
	}

	/* Calculate the number of active units: */
	for (unit_id=0;unit_id < nunits;unit_id++)
	{
		if (hfi_get_unit_active(unit_id) > 0)
			nunitsactive++;
	}
	/* if no active units, then no joy. */
	if (nunitsactive == 0)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 no hfi units are active");
		goto ret;
	}
	if (timeout_ns > 0)
		open_timeout = (long)(timeout_ns / MSEC_ULL);


	unit_start = 0; unit_end = nunits - 1;
	err = psmi_compute_start_and_end_unit(context, unit_param,
					      nunitsactive, nunits,
					      job_key,
					      &unit_start, &unit_end);
	if (err != PSM2_OK)
		return err;

	/* this is the start of a loop that starts at unit_start and goes to unit_end.
	   but note that the way the loop computes the loop control variable is by
	   an expression involving the mod operator. */
	context->fd = -1;
	context->ctrl = NULL;
	unit_id_prev = unit_id = unit_start;
	do
	{
		/* close previous opened unit fd before attempting open of current unit. */
		if (context->fd > 0)
		{
			hfi_context_close(context->fd);
			context->fd = -1;
		}

		/* if the unit_id is not active, go to next one. */
		if (hfi_get_unit_active(unit_id) <= 0) {
			unit_id_prev = unit_id;
			unit_id = (unit_id + 1) % nunits;
			continue;
		}

		/* open this unit. */
		context->fd = hfi_context_open_ex(unit_id, port, open_timeout,
						  dev_name, sizeof(dev_name));

		/* go to next unit if failed to open. */
		if (context->fd == -1) {
			unit_id_prev = unit_id;
			unit_id = (unit_id + 1) % nunits;
			continue;
		}

		/* collect the userinfo params. */
		if ((err = psmi_init_userinfo_params(ep,
						     (int)unit_id, job_key,
						     &context->user_info)))
			goto bail;

		/* attempt to assign the context via hfi_userinit() */
		int retry = 0;
		do {
			if (retry > 0)
				_HFI_INFO("hfi_userinit: failed, trying again (%d/%d)\n",
					  retry, HFI_USERINIT_RETRY_MAX);
			context->ctrl = hfi_userinit(context->fd, &context->user_info);
		} while (context->ctrl == NULL && ++retry <= HFI_USERINIT_RETRY_MAX);
		unit_id_prev = unit_id;
		unit_id = (unit_id + 1) % nunits;
	} while (unit_id_prev != unit_end && context->ctrl == NULL);

	/* no unit in the window yielded a context: give up. */
	if (context->ctrl == NULL)
	{
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 can't open hfi unit: %ld",unit_param);
		goto ret;
	}
	_HFI_VDBG("hfi_userinit() passed.\n");

	/* Record the fabric identity (LID + GID) of the chosen port. */
	if ((lid = hfi_get_port_lid(context->ctrl->__hfi_unit,
				    context->ctrl->__hfi_port)) <= 0) {
		err = psmi_handle_error(NULL,
					PSM2_EP_DEVICE_FAILURE,
					"Can't get HFI LID in psm2_ep_open: is SMA running?");
		goto bail;
	}
	if (hfi_get_port_gid(context->ctrl->__hfi_unit,
			     context->ctrl->__hfi_port, &gid_hi,
			     &gid_lo) == -1) {
		err =
		    psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				      "Can't get HFI GID in psm2_ep_open: is SMA running?");
		goto bail;
	}
	ep->unit_id = context->ctrl->__hfi_unit;
	ep->portnum = context->ctrl->__hfi_port;
	ep->gid_hi = gid_hi;
	ep->gid_lo = gid_lo;

	context->ep = (psm2_ep_t) ep;
	context->runtime_flags = context->ctrl->ctxt_info.runtime_flags;

#ifdef PSM_CUDA
	/* Check backward compatibility bits here and save the info */
	if (context->ctrl->ctxt_info.runtime_flags & HFI1_CAP_GPUDIRECT_OT)
		is_driver_gpudirect_enabled = 1;
#endif

	/* Get type of hfi assigned to context */
	hfi_type = psmi_get_hfi_type(context);

	/* Endpoint out_sl contains the default SL to use for this endpoint. */
	/* Get the MTU for this SL. */
	if ((sc = hfi_get_port_sl2sc(ep->unit_id,
				     context->ctrl->__hfi_port,
				     ep->out_sl)) < 0) {
		sc = PSMI_SC_DEFAULT;
	}
	if ((vl = hfi_get_port_sc2vl(ep->unit_id,
				     context->ctrl->__hfi_port, sc)) < 0) {
		vl = PSMI_VL_DEFAULT;
	}
	if (sc == PSMI_SC_ADMIN || vl == PSMI_VL_ADMIN) {
		err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
					"Invalid sl: %d, please specify correct sl via HFI_SL",
					ep->out_sl);
		goto bail;
	}

	if ((ep->mtu = hfi_get_port_vl2mtu(ep->unit_id,
					   context->ctrl->__hfi_port,
					   vl)) < 0) {
		err =
		    psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
				      "Can't get MTU for VL %d", vl);
		goto bail;
	}

	/* Construct epid for this Endpoint */
	switch (PSMI_EPID_VERSION) {
	case PSMI_EPID_V1:
		context->epid = PSMI_EPID_PACK_V1(lid, context->ctrl->ctxt_info.ctxt,
						  context->ctrl->ctxt_info.subctxt,
						  context->ctrl->__hfi_unit,
						  PSMI_EPID_VERSION, 0x3ffffff);
		break;
	case PSMI_EPID_V2:
		context->epid = PSMI_EPID_PACK_V2(lid, context->ctrl->ctxt_info.ctxt,
						  context->ctrl->ctxt_info.subctxt,
						  PSMI_EPID_IPS_SHM, /* Not an only-shm epid */
						  PSMI_EPID_VERSION, ep->gid_hi);
		break;
	default:
		/* Epid version is greater than max supported version. */
		psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
		break;
	}


	_HFI_VDBG
	    ("construct epid: lid %d ctxt %d subctxt %d hcatype %d mtu %d\n",
	     lid, context->ctrl->ctxt_info.ctxt,
	     context->ctrl->ctxt_info.subctxt, hfi_type, ep->mtu);

	goto ret;

bail:
	_HFI_PRDBG("%s open failed: %d (%s)\n", dev_name, err, strerror(errno));
	if (context->fd != -1) {
		hfi_context_close(context->fd);
		context->fd = -1;
	}
ret:

	_HFI_VDBG("psmi_context_open() return %d\n", err);
	return err;
}
+
+psm2_error_t psmi_context_close(psmi_context_t *context)
+{
+ if (context->fd >= 0) {
+ hfi_context_close(context->fd);
+ context->fd = -1;
+ }
+ return PSM2_OK;
+}
+
+/*
+ * This function works whether a context is initialized or not in a psm2_ep.
+ *
+ * Returns one of
+ *
+ * PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM2_OK_NO_PROGRESS: Cable pulled
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status
+ * As of 7322-ready driver, need to check port-specific qword for IB
+ * as well as older unit-only. For now, we don't have the port interface
+ * defined, so just check port 0 qword for spi_status
+ */
psm2_error_t psmi_context_check_status(const psmi_context_t *contexti)
{
	psm2_error_t err = PSM2_OK;
	/* cast away const: we update the error-latch fields below */
	psmi_context_t *context = (psmi_context_t *) contexti;
	struct hfi1_status *status =
	    (struct hfi1_status *)context->ctrl->base_info.status_bufbase;
	char *errmsg = NULL;

	/* Fatal chip-related errors */
	if (!(status->dev & HFI1_STATUS_CHIP_PRESENT) ||
	    !(status->dev & HFI1_STATUS_INITTED) ||
	    (status->dev & HFI1_STATUS_HWERROR)) {

		err = PSM2_EP_DEVICE_FAILURE;
		if (err != context->status_lasterr) {	/* report once */
			/* driver-provided freeze message, preferred when set */
			volatile char *errmsg_sp =
			    (volatile char *)status->freezemsg;
			if (*errmsg_sp)
				psmi_handle_error(context->ep, err,
						  "Hardware problem: %s",
						  errmsg_sp);
			else {
				if (status->dev & HFI1_STATUS_HWERROR)
					errmsg = "Hardware error";
				else
					errmsg = "Hardware not found";

				psmi_handle_error(context->ep, err,
						  "%s", errmsg);
			}
		}
	}
	/* Fatal network-related errors with timeout: */
	else if (!(status->port & HFI1_STATUS_IB_CONF) ||
		 !(status->port & HFI1_STATUS_IB_READY)) {
		err = PSM2_EP_NO_NETWORK;
		if (err != context->status_lasterr) {	/* report once */
			/* first detection: only timestamp the outage */
			context->networkLostTime = time(NULL);
		}
		else
		{
			/* outage already seen: report only after it has
			 * persisted longer than a full link-bringup window */
			time_t now = time(NULL);
			static const double seventySeconds = 70.0;

			/* The linkup time duration for a system should allow the time needed
			   to complete 3 LNI passes which is:
			   50 seconds for a passive copper channel
			   65 seconds for optical channel.
			   (we add 5 seconds of margin.) */
			if (difftime(now,context->networkLostTime) > seventySeconds)
			{
				volatile char *errmsg_sp =
				    (volatile char *)status->freezemsg;

				psmi_handle_error(context->ep, err, "%s",
						  *errmsg_sp ? errmsg_sp :
						  "Network down");
			}
		}
	}

	if (err == PSM2_OK && context->status_lasterr != PSM2_OK)
		context->status_lasterr = PSM2_OK;	/* clear error */
	else if (err != PSM2_OK)
		context->status_lasterr = err;	/* record error */

	return err;
}
+
/*
 * Prepare user_info params for driver open; used only in psmi_context_open.
 *
 * Fills in userversion and the job uuid unconditionally.  When context
 * sharing is enabled, also computes subctxt_id / subctxt_cnt from the
 * local rank hints, the available context count, and the PSM2_* env
 * overrides.  Rank/enable state is cached in function-statics so all
 * rails of a process agree; subcontext_id_start advances per rail so a
 * new rail never reuses the previous rail's subcontext id.
 */
ustatic
psm2_error_t
psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
			  psm2_uuid_t const unique_job_key,
			  struct hfi1_user_info_dep *user_info)
{
	/* static variables, shared among rails */
	static int shcontexts_enabled = -1, rankid, nranks;

	int avail_contexts = 0, max_contexts, ask_contexts;
	int ranks_per_context = 0;
	psm2_error_t err = PSM2_OK;
	union psmi_envvar_val env_maxctxt, env_ranks_per_context;
	static int subcontext_id_start;

	memset(user_info, 0, sizeof(*user_info));
	user_info->userversion = HFI1_USER_SWMINOR|(hfi_get_user_major_version()<<HFI1_SWMAJOR_SHIFT);

	user_info->subctxt_id = 0;
	user_info->subctxt_cnt = 0;
	memcpy(user_info->uuid, unique_job_key, sizeof(user_info->uuid));

	/* probe the launcher environment exactly once per process */
	if (shcontexts_enabled == -1) {
		shcontexts_enabled =
		    psmi_sharedcontext_params(&nranks, &rankid);
	}
	if (!shcontexts_enabled)
		return err;

	avail_contexts = hfi_get_num_contexts(unit_id);

	if (avail_contexts == 0) {
		err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
					"PSM2 found 0 available contexts on opa device(s).");
		goto fail;
	}

	/* See if the user wants finer control over context assignments */
	if (!psmi_getenv("PSM2_MAX_CONTEXTS_PER_JOB",
			 "Maximum number of contexts for this PSM2 job",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
	} else if (!psmi_getenv("PSM2_SHAREDCONTEXTS_MAX",
				"Maximum number of contexts for this PSM2 job",
				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
				(union psmi_envvar_val)avail_contexts, &env_maxctxt)) {

		_HFI_INFO
		    ("This env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in future.\n");

		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
	} else
		ask_contexts = max_contexts = avail_contexts;

	if (!psmi_getenv("PSM2_RANKS_PER_CONTEXT",
			 "Number of ranks per context",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)1, &env_ranks_per_context)) {
		ranks_per_context = max(env_ranks_per_context.e_int, 1);
		ranks_per_context = min(ranks_per_context, HFI1_MAX_SHARED_CTXTS);
	}

	/*
	 * See if we could get a valid ppn. If not, approximate it to be the
	 * number of cores.
	 */
	if (nranks == -1) {
		long nproc = sysconf(_SC_NPROCESSORS_ONLN);
		if (nproc < 1)
			nranks = 1;
		else
			nranks = nproc;
	}

	/*
	 * Make sure that our guesses are good educated guesses
	 */
	if (rankid >= nranks) {
		_HFI_PRDBG
		    ("PSM2_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n",
		     rankid, nranks);
		/* NOTE: err is still PSM2_OK here, so sharing is silently
		 * disabled rather than treated as a failure. */
		goto fail;
	}

	if (ranks_per_context) {
		int contexts =
		    (nranks + ranks_per_context - 1) / ranks_per_context;
		if (contexts > ask_contexts) {
			err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
						"Incompatible settings for "
						"(PSM2_SHAREDCONTEXTS_MAX / PSM2_MAX_CONTEXTS_PER_JOB) and PSM2_RANKS_PER_CONTEXT");
			goto fail;
		}
		ask_contexts = contexts;
	}

	/* group id based on total groups and local rank id */
	user_info->subctxt_id = subcontext_id_start + rankid % ask_contexts;
	/* this is for multi-rail, when we setup a new rail,
	 * we can not use the same subcontext ID as the previous
	 * rail, otherwise, the driver will match previous rail
	 * and fail.
	 */
	subcontext_id_start += ask_contexts;

	/* Need to compute with how many *other* peers we will be sharing the
	 * context */
	if (nranks > ask_contexts) {
		user_info->subctxt_cnt = nranks / ask_contexts;
		/* If ppn != multiple of contexts, some contexts get an uneven
		 * number of subcontexts */
		if (nranks % ask_contexts > rankid % ask_contexts)
			user_info->subctxt_cnt++;
		/* The case of 1 process "sharing" a context (giving 1 subcontext)
		 * is supported by the driver and PSM. However, there is no
		 * need to share in this case so disable context sharing. */
		if (user_info->subctxt_cnt == 1)
			user_info->subctxt_cnt = 0;
		if (user_info->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) {
			err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
						"Calculation of subcontext count exceeded maximum supported");
			goto fail;
		}
	}
	/* else subcontext_cnt remains 0 and context sharing is disabled. */

	_HFI_PRDBG("PSM2_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d,"
		   "max_contexts=%d,ask_contexts=%d,"
		   "ranks_per_context=%d,id=%u,cnt=%u\n",
		   rankid, nranks, avail_contexts, max_contexts,
		   ask_contexts, ranks_per_context,
		   user_info->subctxt_id, user_info->subctxt_cnt);
fail:
	return err;
}
+
/* Determine whether context sharing is enabled and, if so, pick up the
 * local rank id and local rank count hinted by the launcher environment
 * (MPI_LOCALRANKID/MPI_LOCALNRANKS preferred, legacy PSC_MPI_* accepted).
 *
 * Returns 1 when sharing is enabled, with *nranks/*rankid filled in
 * (left at -1 when no hints are present); returns 0 when sharing is
 * disabled (multi-EP mode, PSM2_SHAREDCONTEXTS=no, or no usable hints). */
ustatic
int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid)
{
	union psmi_envvar_val enable_shcontexts;
	char *ppn_env = NULL, *lrank_env = NULL, *c;

	*rankid = -1;
	*nranks = -1;

#if 0
	/* DEBUG: Used to selectively test possible shared context and shm-only
	 * settings */
	unsetenv("PSC_MPI_NODE_RANK");
	unsetenv("PSC_MPI_PPN");
	unsetenv("MPI_LOCALRANKID");
	unsetenv("MPI_LOCALRANKS");
#endif

	/* We do not support context sharing for multiple endpoints */
	if (psmi_multi_ep_enabled) {
		return 0;
	}

	/* New name in 2.0.1, keep observing old name */
	psmi_getenv("PSM2_SHAREDCONTEXTS", "Enable shared contexts",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
		    (union psmi_envvar_val)
		    PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
		    &enable_shcontexts);
	if (!enable_shcontexts.e_int)
		return 0;

	/* We support two types of syntaxes to let users give us a hint what
	 * our local rankid is. Moving towards MPI_, but still support PSC_ */
	if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') {
		lrank_env = "MPI_LOCALRANKID";
		ppn_env = "MPI_LOCALNRANKS";
	} else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') {
		ppn_env = "PSC_MPI_PPN";
		lrank_env = "PSC_MPI_NODE_RANK";
	}

	if (ppn_env != NULL && lrank_env != NULL) {
		union psmi_envvar_val env_rankid, env_nranks;

		psmi_getenv(lrank_env, "Shared context rankid",
			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
			    (union psmi_envvar_val)-1, &env_rankid);

		psmi_getenv(ppn_env, "Shared context numranks",
			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
			    (union psmi_envvar_val)-1, &env_nranks);

		*rankid = env_rankid.e_int;
		*nranks = env_nranks.e_int;

		return 1;
	} else
		return 0;
}
MOCK_DEF_EPILOGUE(psmi_sharedcontext_params);
+
+static
+int psmi_get_hfi_selection_algorithm(void)
+{
+ union psmi_envvar_val env_hfi1_alg;
+ int hfi1_alg = HFI1_ALG_ACROSS;
+
+ /* If a specific unit is set in the environment, use that one. */
+ psmi_getenv("HFI_SELECTION_ALG",
+ "HFI Device Selection Algorithm to use. Round Robin (Default) "
+ ", Packed or Round Robin All.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"Round Robin", &env_hfi1_alg);
+
+ if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin"))
+ hfi1_alg = HFI1_ALG_ACROSS;
+ else if (!strcasecmp(env_hfi1_alg.e_str, "Packed"))
+ hfi1_alg = HFI1_ALG_WITHIN;
+ else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All"))
+ hfi1_alg = HFI1_ALG_ACROSS_ALL;
+ else {
+ _HFI_ERROR
+ ("Unknown HFI selection algorithm %s. Defaulting to Round Robin "
+ "allocation of HFIs.\n", env_hfi1_alg.e_str);
+ hfi1_alg = HFI1_ALG_ACROSS;
+ }
+
+ return hfi1_alg;
+}
diff --git a/psm_context.h b/psm_context.h
new file mode 100644
index 0000000..fe2aec7
--- /dev/null
+++ b/psm_context.h
@@ -0,0 +1,102 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_context.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_CONTEXT_H
+#define _PSM_CONTEXT_H
+
+typedef
+struct psmi_context {
+ struct _hfi_ctrl *ctrl; /* driver opaque hfi_proto */
+ void *spio_ctrl;
+ void *tid_ctrl;
+ void *tf_ctrl;
+
+ int fd; /* driver fd */
+ psm2_ep_t ep; /* psm ep handle */
+ psm2_epid_t epid; /* psm integral ep id */
+ struct hfi1_user_info_dep user_info;
+ uint32_t runtime_flags;
+ uint32_t rcvthread_flags;
+ psm2_error_t status_lasterr;
+ time_t networkLostTime;
+} psmi_context_t;
+
+psm2_error_t
+psmi_context_open(const psm2_ep_t ep, long unit_id, long port,
+ psm2_uuid_t const job_key,
+ int64_t timeout_ns, psmi_context_t *context);
+
+psm2_error_t psmi_context_close(psmi_context_t *context);
+
+/* Check status of context */
+psm2_error_t psmi_context_check_status(const psmi_context_t *context);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable);
+int psmi_context_interrupt_isenabled(psmi_context_t *context);
+
+/* Runtime flags describe what features are enabled in hw/sw and which
+ * corresponding PSM features are being used.
+ *
+ * Hi 16 bits are PSM options
+ * Lo 16 bits are HFI_RUNTIME options copied from (hfi_common.h)
+ */
+#define PSMI_RUNTIME_RCVTHREAD 0x80000000
+#define PSMI_RUNTIME_INTR_ENABLED 0x40000000
+
+#endif /* PSM_CONTEXT_H */
diff --git a/psm_diags.c b/psm_diags.c
new file mode 100644
index 0000000..2a43c22
--- /dev/null
+++ b/psm_diags.c
@@ -0,0 +1,362 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/* Signature shared by every memcpy-style routine under test. */
+typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n);
+static int psmi_test_memcpy(memcpy_fn_t, const char *name);
+static int psmi_test_epid_table(int numelems);
+
+int psmi_diags(void);
+
+/* Log a failed check and jump to the enclosing function's local
+ * "fail" label (every caller must provide one). */
+#define diags_assert(x)	do {					\
+	    if (!(x))  {					\
+		_HFI_ERROR("Diags assertion failure: %s\n",	\
+		    #x);					\
+		goto fail;					\
+	    }							\
+	} while (0)
+
+/* Report a verdict and return 0 (pass) / 1 (fail) from the caller. */
+#define DIAGS_RETURN_PASS(str)						\
+	do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; }	\
+	while (0)
+#define DIAGS_RETURN_FAIL(str)						\
+	do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; }	\
+	while (0)
+
+/*
+ * Top-level diagnostics driver: run every self-test and report a
+ * single verdict.  Returns 0 when all tests pass, 1 otherwise.
+ */
+int psmi_diags(void)
+{
+	int failures = 0;
+
+	failures |= psmi_test_epid_table(2048);
+	failures |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo");
+	/* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */
+
+	if (!failures)
+		DIAGS_RETURN_PASS("");
+	DIAGS_RETURN_FAIL("");
+}
+
+/*
+ * Hash table test
+ *
+ * Builds numelems fake endpoint addresses, inserts them into the
+ * global psmi_epid_table, then exercises lookup/remove in randomized
+ * order and checks the table ends up empty and within its load factor.
+ * Returns 0 on pass, 1 on fail.
+ */
+/* NOTE(review): NALLOC appears unused in this function -- confirm. */
+#define NALLOC 1024
+static int psmi_test_epid_table(int numelems)
+{
+	ptl_ctl_t ctl;
+	psm2_epaddr_t *ep_array, epaddr, ep_alloc;
+	psm2_epid_t *epid_array, epid_tmp;
+	/* Sentinel ep value; never dereferenced as a real endpoint. */
+	psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00;
+	struct psmi_epid_table *tab;
+	int i, j;
+	struct drand48_data drand48_data;
+
+	ep_alloc =
+	    (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					sizeof(struct psm2_epaddr));
+	ep_array =
+	    (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					  sizeof(struct psm2_epaddr *));
+	epid_array =
+	    (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					sizeof(psm2_epid_t));
+	diags_assert(ep_alloc != NULL);
+	diags_assert(ep_array != NULL);
+	diags_assert(epid_array != NULL);
+
+	/* Fixed seed keeps the shuffle deterministic across runs. */
+	srand48_r(12345678, &drand48_data);
+
+	psmi_epid_init();
+	tab = &psmi_epid_table;
+	ctl.ep = ep;
+
+	for (i = 0; i < numelems; i++) {
+		epid_array[i] = i;
+		ep_alloc[i].ptlctl = &ctl;
+		ep_alloc[i].epid = epid_array[i];
+		ep_array[i] = &ep_alloc[i];
+	}
+	for (i = 0; i < numelems; i++) {
+		psmi_epid_add(ep, epid_array[i], ep_array[i]);
+	}
+
+	/* Randomize epid_array */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Lookup. */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+
+	/* Randomize epid_array again */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Delete half */
+	for (i = 0; i < numelems / 2; i++) {
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+	/* Lookup other half -- expect non-NULL, then delete */
+	for (i = numelems / 2; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	/* Lookup whole thing, expect done */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	/* Every slot must be empty or tombstoned after full removal. */
+	for (i = 0; i < tab->tabsize; i++) {
+		diags_assert(tab->table[i].entry == NULL ||
+			     tab->table[i].entry == EPADDR_DELETED);
+	}
+
+	/* Make sure we're not leaking memory somewhere... */
+	diags_assert(tab->tabsize > tab->tabsize_used &&
+		     tab->tabsize * PSMI_EPID_TABLOAD_FACTOR >
+		     tab->tabsize_used);
+
+	/* Only free on success */
+	psmi_epid_fini();
+	psmi_free(epid_array);
+	psmi_free(ep_array);
+	psmi_free(ep_alloc);
+	DIAGS_RETURN_PASS("");
+
+fail:
+	/* Klocwork scan report memory leak. */
+	psmi_epid_fini();
+	if (epid_array)
+		psmi_free(epid_array);
+	if (ep_array)
+		psmi_free(ep_array);
+	if (ep_alloc)
+		psmi_free(ep_alloc);
+	DIAGS_RETURN_FAIL("");
+}
+
+/*
+ * Memcpy correctness test
+ */
+static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n);
+static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n);
+
+/*
+ * Sweep copy sizes (powers of two from lo to hi, plus optional corner
+ * sizes around each) through memcpy_check_size and report a PASS/FAIL
+ * verdict for the given memcpy implementation.
+ * Returns 0 on pass, 1 on fail.
+ */
+static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name)
+{
+	const int CORNERS = 0;	/* 0: test exact powers of two only */
+	const long long lo = 1;
+	const long long hi = 16 * 1024 * 1024;
+	const long long below = 32;
+	const long long above = 32;
+	long long n, m;
+	char buf[128];
+	int ret = 0;
+	int memcpy_passed;
+	int memcpy_failed;
+
+	memcpy_passed = 0;
+	memcpy_failed = 0;
+
+	/* Zero-length copy must also be handled. */
+	ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0);
+	if (ret < 0)
+		DIAGS_RETURN_FAIL("no heap space");
+
+	for (n = lo; n <= hi; n <<= 1) {
+		_HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n);
+		for (m = n - below; m <= n + above; m++) {
+			if (m == n) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed, n);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			} else if (CORNERS && m >= lo && m <= hi && m > (n >> 1)
+				   && m < max(n, ((n << 1) - below))) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed,
+						      (size_t) m);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			}
+		}
+	}
+
+	int total = memcpy_passed + memcpy_failed;
+	if (total > 0) {
+		_HFI_INFO("%d memcpy tests with %d passed (%.2f%%) "
+			  "and %d failed (%.2f%%)\n",
+			  total, memcpy_passed, (100.0 * memcpy_passed) / total,
+			  memcpy_failed, (100.0 * memcpy_failed) / total);
+	}
+	if (memcpy_failed) {
+		snprintf(buf, sizeof(buf), "%s %.2f%% of tests memcpy_failed",
+			 memcpy_name, (100.0 * memcpy_failed) / total);
+		DIAGS_RETURN_FAIL(buf);
+	} else {
+		DIAGS_RETURN_PASS(memcpy_name);
+	}
+}
+
+/*
+ * Run one memcpy check: fill src with a deterministic pseudo-random
+ * byte pattern, copy it to dst with fn, clear src, then regenerate the
+ * same pattern and compare it byte-for-byte against dst.
+ *
+ * Returns dst when the copy verified cleanly, NULL on any mismatch.
+ */
+void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n)
+{
+	int ok = 1;
+	/* Seed derived from the pointers and length, so each call gets a
+	 * distinct but reproducible sequence. */
+	unsigned int seed = (unsigned int)
+	    ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n);
+	size_t i;
+	struct drand48_data drand48_data;
+
+	if (!n)
+		return dst;
+
+	memset(src, 0x55, n);
+	memset(dst, 0xaa, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+	}
+
+	fn(dst, src, n);
+	memset(src, 0, n);
+	/* Re-seed and replay the identical sequence to verify dst. */
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		/* BUGFIX: the expected byte must be rebuilt with the exact
+		 * expression used to fill src above.  The previous code used
+		 * "rand_result % INT_MAX", which disagrees with
+		 * "rand_result & INT_MAX" when lrand48 returns INT_MAX
+		 * itself, producing rare spurious verification failures. */
+		int value = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+		int v = (int)((uint8_t *) dst)[i];
+		if (v != value) {
+			_HFI_ERROR
+			    ("Error on index %llu : got %d instead of %d\n",
+			     (unsigned long long)i, v, value);
+			ok = 0;
+		}
+	}
+	return ok ? dst : NULL;
+}
+
+/*
+ * Exercise fn at logical size n over all 16x16 combinations of source
+ * and destination alignment offsets.  Pass/fail counts are accumulated
+ * through *p and *f.  Returns 0 on completion, -1 when the scratch
+ * buffers cannot be allocated.
+ */
+int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n)
+{
+#define num_aligns 16
+#define USE_MALLOC 0
+#define DEBUG 0
+	uint8_t *src;
+	uint8_t *dst;
+	/* Room for the largest offset copy: n bytes at offset <16. */
+	size_t size = n * 2 + num_aligns;
+	if (USE_MALLOC) {
+		src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		/* BUGFIX: this error path was missing braces, so the
+		 * "free dst / return -1" statements ran unconditionally and
+		 * the function always bailed out in this mode. */
+		if (src == NULL || dst == NULL) {
+			if (src)
+				psmi_free(src);
+			if (dst)
+				psmi_free(dst);
+			return -1;
+		}
+	} else {
+		void *src_p = NULL, *dst_p = NULL;
+		if (posix_memalign(&src_p, 64, size) != 0)
+			return -1;
+		if (posix_memalign(&dst_p, 64, size) != 0) {
+			/* BUGFIX: don't leak src on partial failure. */
+			free(src_p);
+			return -1;
+		}
+		src = (uint8_t *) src_p;
+		dst = (uint8_t *) dst_p;
+	}
+	int src_align, dst_align;
+	for (src_align = 0; src_align < num_aligns; src_align++) {
+		for (dst_align = 0; dst_align < num_aligns; dst_align++) {
+			uint8_t *d = ((uint8_t *) dst) + dst_align;
+			uint8_t *s = ((uint8_t *) src) + src_align;
+			int ok = (memcpy_check_one(fn, d, s, n) != NULL);
+			if (DEBUG || !ok) {
+				_HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s,
+					  (unsigned long long)n,
+					  ok ? "passed" : "failed");
+			}
+			if (ok) {
+				(*p)++;
+			} else {
+				(*f)++;
+			}
+		}
+	}
+	/* BUGFIX: match release to allocator -- posix_memalign memory must
+	 * be returned with free(), not psmi_free(). */
+	if (USE_MALLOC) {
+		psmi_free(src);
+		psmi_free(dst);
+	} else {
+		free(src);
+		free(dst);
+	}
+	return 0;
+}
diff --git a/psm_ep.c b/psm_ep.c
new file mode 100644
index 0000000..d01c9aa
--- /dev/null
+++ b/psm_ep.c
@@ -0,0 +1,1527 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h> /* cpu_set */
+#include <ctype.h> /* isalpha */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Endpoint management
+ */
+/* Head and length of the process-wide list of opened endpoints,
+ * linked through ep->user_ep_next. */
+psm2_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+
+static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
+					const struct psm2_ep_open_opts *opts,
+					const psm2_uuid_t unique_job_key,
+					struct psmi_context *context,
+					psm2_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * hfi.
+ *
+ * By default, PSMI_DEVICES_DEFAULT establishes the bind order a component is
+ * tested for reachability to each peer. First self, then shm and finally
+ * hfi. The order should really only affect endpoints that happen to be on
+ * the same node. PSM will correctly detect that two endpoints are on the same
+ * node even though they may be using different host interfaces.
+ */
+
+#define PSMI_DEVICES_DEFAULT "self,shm,hfi"
+static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+				       const char *devstr);
+static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+/*
+ * Report the number of HFI units in the system through *num_units_o
+ * (0 when none are detectable).  Always returns PSM2_OK once PSM is
+ * initialized.
+ *
+ * NOTE(review): the count is cached in a function-local static on
+ * first call and never refreshed; first-call races are unguarded --
+ * confirm callers serialize initialization.
+ */
+psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o)
+{
+	static int num_units = -1;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (num_units == -1) {
+		num_units = hfi_get_num_units();
+		if (num_units == -1)
+			num_units = 0;
+	}
+
+	*num_units_o = (uint32_t) num_units;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_num_devunits)
+
+/* qsort comparator for gidh rows: ascending order on the leading
+ * uint64_t (the gid_hi key) of each 3-element row. */
+static int cmpfunc(const void *p1, const void *p2)
+{
+	uint64_t a = ((uint64_t *) p1)[0];
+	uint64_t b = ((uint64_t *) p2)[0];
+	if (a < b)
+		return -1;
+	if (a == b)
+		return 0;
+	return 1;
+}
+
+/*
+ * Select the set of (unit, port) rails to use when PSM2_MULTIRAIL is
+ * enabled.  On success *num_rails holds the rail count (0 when
+ * multirail is disabled) and unit[]/port[] -- caller arrays written up
+ * to HFI_MAX_RAILS entries -- hold the selection.  In the
+ * auto-discovery path the rails are sorted by gid_hi ascending so all
+ * processes pick the same master fabric.
+ */
+static psm2_error_t
+psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
+{
+	uint32_t num_units;
+	uint64_t gid_hi, gid_lo;
+	int i, j, ret, count = 0;
+	char *env;
+	psm2_error_t err = PSM2_OK;
+	uint64_t gidh[HFI_MAX_RAILS][3];
+	union psmi_envvar_val env_multirail;
+	int multirail_within_socket_used = 0;
+	int node_id = -1, found = 0;
+
+	psmi_getenv("PSM2_MULTIRAIL",
+		    "Use all available HFIs in the system for communication.\n"
+		    "0: Disabled (default),\n"
+		    "1: Enable multirail across all available HFIs,\n"
+		    "2: Enable multirail within socket.\n"
+		    "\t For multirail within a socket, we try to find at\n"
+		    "\t least one HFI on the same socket as current task.\n"
+		    "\t If none found, we continue to use other HFIs within\n"
+		    "\t the system.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)0,
+		    &env_multirail);
+	if (!env_multirail.e_int) {
+		/* Multirail disabled: report zero rails, not an error. */
+		*num_rails = 0;
+		return err;
+	}
+
+	if (env_multirail.e_int == 2)
+		multirail_within_socket_used = 1;
+
+/*
+ * map is in format: unit:port,unit:port,...
+ */
+	if ((env = getenv("PSM2_MULTIRAIL_MAP"))) {
+		if (sscanf(env, "%d:%d", &i, &j) == 2) {
+			char *comma = strchr(env, ',');
+			unit[count] = i;
+			port[count] = j;
+			count++;
+			while (comma) {
+				if (sscanf(comma, ",%d:%d", &i, &j) != 2) {
+					break;
+				}
+				unit[count] = i;
+				port[count] = j;
+				count++;
+				if (count == HFI_MAX_RAILS)
+					break;
+				comma = strchr(comma + 1, ',');
+			}
+		}
+		*num_rails = count;
+
+/*
+ * Check if any of the port is not usable.
+ */
+		for (i = 0; i < count; i++) {
+			ret = hfi_get_port_active(unit[i], port[i]);
+			if (ret <= 0) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Unit/port: %d:%d is not active.",
+						      unit[i], port[i]);
+				return err;
+			}
+			ret = hfi_get_port_lid(unit[i], port[i]);
+			if (ret <= 0) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Couldn't get lid for unit %d:%d",
+						      unit[i], port[i]);
+				return err;
+			}
+			ret =
+			    hfi_get_port_gid(unit[i], port[i], &gid_hi,
+					     &gid_lo);
+			if (ret == -1) {
+				err =
+				    psmi_handle_error(NULL,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Couldn't get gid for unit %d:%d",
+						      unit[i], port[i]);
+				return err;
+			}
+		}
+
+		/* Explicit map validated: use it as-is, unsorted. */
+		return err;
+	}
+
+	if ((err = psm2_ep_num_devunits(&num_units))) {
+		return err;
+	}
+	if (num_units > HFI_MAX_RAILS) {
+		_HFI_INFO
+		    ("Found %d units, max %d units are supported, use %d\n",
+		     num_units, HFI_MAX_RAILS, HFI_MAX_RAILS);
+		num_units = HFI_MAX_RAILS;
+	}
+
+	/*
+	 * PSM2_MULTIRAIL=2 functionality-
+	 *   - Try to find at least find one HFI in the same root
+	 *     complex. If none found, continue to run and
+	 *     use remaining HFIs in the system.
+	 *   - If we do find at least one HFI in same root complex, we
+	 *     go ahead and add to list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psmi_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (hfi_get_unit_active(i) <= 0)
+				continue;
+
+			if (hfi_sysfs_unit_read_node_s64(i) == node_id) {
+				found = 1;
+				break;
+			}
+		}
+	}
+/*
+ * Get all the ports with a valid lid and gid, one per unit.
+ */
+	for (i = 0; i < num_units; i++) {
+		/* Socket mode with a local HFI found: skip remote units. */
+		if (multirail_within_socket_used &&
+		    found && (hfi_sysfs_unit_read_node_s64(i) != node_id))
+			continue;
+
+		for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+			ret = hfi_get_port_lid(i, j);
+			if (ret <= 0)
+				continue;
+			ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
+			if (ret == -1)
+				continue;
+
+			/* Row layout: { gid_hi sort key, unit, port }. */
+			gidh[count][0] = gid_hi;
+			gidh[count][1] = i;
+			gidh[count][2] = j;
+			count++;
+			break;
+		}
+	}
+
+/*
+ * Sort all the ports with gidh from small to big.
+ * This is for multiple fabrics, and we use fabric with the
+ * smallest gid to make the master connection.
+ */
+	qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc);
+
+	for (i = 0; i < count; i++) {
+		unit[i] = (uint32_t) gidh[i][1];
+		port[i] = (uint16_t) (uint32_t) gidh[i][2];
+	}
+	*num_rails = count;
+	return err;
+}
+
+/*
+ * Collect the LIDs of all local ports whose gid_hi matches the
+ * caller's (my_gid_hi), i.e. ports on the same fabric.  Results are
+ * returned through *lids / *num_lids_o.
+ *
+ * NOTE(review): the LID array and count live in function-local statics
+ * and are computed once per process; subsequent calls return the
+ * cached result regardless of the gid arguments -- confirm callers
+ * always pass the same endpoint gid.
+ */
+static psm2_error_t
+psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o,
+		uint64_t my_gid_hi, uint64_t my_gid_lo)
+{
+	static uint16_t *hfi_lids;
+	static uint32_t nlids;
+	uint32_t num_units;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (hfi_lids == NULL) {
+		if ((err = psm2_ep_num_devunits(&num_units)))
+			goto fail;
+		hfi_lids = (uint16_t *)
+		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
+				num_units * HFI_NUM_PORTS, sizeof(uint16_t));
+		if (hfi_lids == NULL) {
+			err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+						"Couldn't allocate memory for dev_lids structure");
+			goto fail;
+		}
+
+		for (i = 0; i < num_units; i++) {
+			int j;
+			for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+				int lid = hfi_get_port_lid(i, j);
+				int ret;
+				uint64_t gid_hi = 0, gid_lo = 0;
+
+				/* Skip ports without a valid LID or gid. */
+				if (lid <= 0)
+					continue;
+				ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
+				if (ret == -1)
+					continue;
+				else if (my_gid_hi != gid_hi) {
+					_HFI_VDBG("LID %d, unit %d, port %d, "
+						  "mismatched GID %llx:%llx and "
+						  "%llx:%llx\n",
+						  lid, i, j,
+						  (unsigned long long)gid_hi,
+						  (unsigned long long)gid_lo,
+						  (unsigned long long)my_gid_hi,
+						  (unsigned long long)
+						  my_gid_lo);
+					continue;
+				}
+				_HFI_VDBG("LID %d, unit %d, port %d, "
+					  "matching GID %llx:%llx and "
+					  "%llx:%llx\n", lid, i, j,
+					  (unsigned long long)gid_hi,
+					  (unsigned long long)gid_lo,
+					  (unsigned long long)my_gid_hi,
+					  (unsigned long long)my_gid_lo);
+
+				hfi_lids[nlids++] = (uint16_t) lid;
+			}
+		}
+		if (nlids == 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Couldn't get lid&gid from any unit/port");
+			goto fail;
+		}
+	}
+	*lids = hfi_lids;
+	*num_lids_o = nlids;
+
+fail:
+	/* Success also exits through this label with err == PSM2_OK. */
+	return err;
+}
+
+/*
+ * Verify that pkey appears in the port's pkey table (16 entries are
+ * probed).  On success stores the accepted pkey through *opkey and
+ * returns PSM2_OK; otherwise returns the error from psmi_handle_error.
+ */
+static psm2_error_t
+psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey)
+{
+	int i, ret;
+	psm2_error_t err;
+
+	for (i = 0; i < 16; i++) {
+		ret = hfi_get_port_index2pkey(ep->unit_id, ep->portnum, i);
+		if (ret < 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Can't get a valid pkey value from pkey table\n");
+			return err;
+		} else if (ret == 0x7fff || ret == 0xffff) {
+			continue;	/* management pkey, not for app traffic. */
+		}
+
+		if (pkey == (uint16_t) ret)
+			break;
+	}
+
+	/* if pkey does not match */
+	if (i == 16) {
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"Wrong pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n",
+					pkey);
+		return err;
+	}
+
+	/* return the final pkey */
+	*opkey = pkey;
+
+	return PSM2_OK;
+}
+
+/* Public accessor: extract the node id (LID bits) from an epid. */
+uint64_t __psm2_epid_nid(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_LID(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_nid)
+
+/* Currently not exposed to users, we don't acknowledge the existence of
+ * subcontexts */
+uint64_t psmi_epid_subcontext(psm2_epid_t epid)
+{
+	return (uint64_t) PSMI_EPID_GET_SUBCONTEXT(epid);
+}
+
+/* Currently not exposed to users, we don't acknowledge the existence of
+ * service levels encoding within epids. This may require
+ * changing to expose SLs
+ */
+uint64_t psmi_epid_version(psm2_epid_t epid)
+{
+	return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid);
+}
+
+/* Public accessor: extract the context bits from an epid. */
+uint64_t __psm2_epid_context(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_context)
+
+/* Public accessor: the "port" of an epid.  Deliberately an alias for
+ * the context bits (see __psm2_epid_context). */
+uint64_t __psm2_epid_port(psm2_epid_t epid)
+{
+	uint64_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = __psm2_epid_context(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_port)
+
+/*
+ * Fill array_of_epinfo with details of up to *num_of_epinfo opened
+ * endpoints, walking the process-wide endpoint list.  On return
+ * *num_of_epinfo is the number of entries actually filled.
+ */
+psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
+{
+	psm2_error_t err = PSM2_OK;
+	int i;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (*num_of_epinfo <= 0) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid psm2_ep_query parameters");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	for (i = 0; i < *num_of_epinfo; i++) {
+		if (ep == NULL)
+			break;
+		array_of_epinfo[i].ep = ep;
+		array_of_epinfo[i].epid = ep->epid;
+		array_of_epinfo[i].jkey = ep->jkey;
+		memcpy(array_of_epinfo[i].uuid,
+		       (void *)ep->uuid, sizeof(psm2_uuid_t));
+		psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str);
+		ep = ep->user_ep_next;
+	}
+	*num_of_epinfo = i;
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_query)
+
+/*
+ * Resolve epid to a connection by searching every opened endpoint.
+ * On success fills *epconn (addr, ep, mq) and returns PSM2_OK;
+ * returns PSM2_EPID_UNKNOWN when no endpoint knows the epid.
+ */
+psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_epaddr_t epaddr;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		epaddr = psmi_epid_lookup(ep, epid);
+		if (!epaddr) {
+			ep = ep->user_ep_next;
+			continue;
+		}
+
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup);
+
+/*
+ * Like __psm2_ep_epid_lookup, but restricted to one given endpoint.
+ * On success fills *epconn (addr, ep, mq) and returns PSM2_OK;
+ * returns PSM2_EPID_UNKNOWN when ep has no connection for epid.
+ */
+psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (ep == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (epconn == NULL) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid output parameter");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid);
+	if (epaddr) {
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup2);
+
+/*
+ * Translate an endpoint address handle into its integral endpoint id.
+ * Stores the id through *epid and returns PSM2_OK, or returns
+ * PSM2_PARAM_ERR when either pointer is NULL.
+ */
+psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
+{
+	psm2_error_t result;
+
+	PSM2_LOG_MSG("entering");
+	if (epaddr == NULL || epid == NULL) {
+		result = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					   "Invalid input epaddr or output epid parameter");
+	} else {
+		*epid = epaddr->epid;
+		result = PSM2_OK;
+	}
+	PSM2_LOG_MSG("leaving");
+	return result;
+}
+PSMI_API_DECL(psm2_epaddr_to_epid);
+
+/*
+ * Decide whether the peer identified by epid can be reached through
+ * shared memory.  Sets *result_o to 1 when the peer's LID matches one
+ * of our local fabric LIDs (or when LID comparison is impossible and
+ * we optimistically assume same-node), 0 otherwise.
+ */
+psm2_error_t
+__psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+	uint32_t num_lids = 0;
+	uint16_t *lids = NULL;
+	int i;
+	uint16_t epid_lid;
+	int result = 0;
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert_always(ep != NULL);
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ||
+	    (psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) {
+		/* If we are in the no hfi-mode, or the other process is,
+		 * the epid doesn't help us - so assume both we're on the same
+		 * machine and try to connect.
+		 */
+		result = 1;
+	} else {
+		epid_lid = (uint16_t) psm2_epid_nid(epid);
+		err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo);
+		if (err) {
+			PSM2_LOG_MSG("leaving");
+			return err;
+		}
+		for (i = 0; i < num_lids; i++) {
+			if (epid_lid == lids[i]) {
+				/* we share memory if the lid is the same. */
+				result = 1;
+				break;
+			}
+		}
+	}
+	*result_o = result;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_epid_share_memory)
+
+/*
+ * Initialize *opts with the library defaults used by psm2_ep_open.
+ * Returns PSM2_PARAM_ERR when opts is NULL, PSM2_OK otherwise.
+ */
+psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (!opts)
+		return PSM2_PARAM_ERR;
+
+	/* Set in order in the structure. */
+	opts->timeout = 30000000000LL;	/* 30 sec */
+	opts->unit = HFI_UNIT_ID_ANY;
+	opts->affinity = PSM2_EP_OPEN_AFFINITY_SET;
+	opts->shm_mbytes = 0;	/* deprecated in psm2.h */
+	opts->sendbufs_num = 1024;
+	opts->network_pkey = HFI_DEFAULT_P_KEY;
+	opts->port = HFI_PORT_NUM_ANY;
+	opts->outsl = PSMI_SL_DEFAULT;
+	opts->service_id = HFI_DEFAULT_SERVICE_ID;
+	opts->path_res_type = PSM2_PATH_RES_NONE;
+	opts->senddesc_num = 4096;
+	opts->imm_size = 128;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_open_opts_get_defaults)
+
+psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly);
+
+psm2_error_t
+__psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled,
+ struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq,
+ psm2_ep_t *epo, psm2_epid_t *epido)
+{
+ psm2_ep_t ep = NULL;
+ uint32_t num_units;
+ size_t len;
+ psm2_error_t err;
+ psm2_epaddr_t epaddr = NULL;
+ char buf[128], *p, *e;
+ union psmi_envvar_val envvar_val;
+ size_t ptl_sizes;
+ struct psm2_ep_open_opts opts;
+ ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
+ int i;
+
+ /* First get the set of default options, we overwrite with the user's
+ * desired values afterwards */
+ if ((err = psm2_ep_open_opts_get_defaults(&opts)))
+ goto fail;
+
+ if (opts_i != NULL) {
+ if (opts_i->timeout != -1)
+ opts.timeout = opts_i->timeout;
+ if (opts_i->unit != -1)
+ opts.unit = opts_i->unit;
+ if (opts_i->affinity != -1)
+ opts.affinity = opts_i->affinity;
+
+ if (opts_i->sendbufs_num != -1)
+ opts.sendbufs_num = opts_i->sendbufs_num;
+
+ if (opts_i->network_pkey != HFI_DEFAULT_P_KEY)
+ opts.network_pkey = opts_i->network_pkey;
+
+ if (opts_i->port != 0)
+ opts.port = opts_i->port;
+
+ if (opts_i->outsl != -1)
+ opts.outsl = opts_i->outsl;
+
+ if (opts_i->service_id)
+ opts.service_id = (uint64_t) opts_i->service_id;
+ if (opts_i->path_res_type != PSM2_PATH_RES_NONE)
+ opts.path_res_type = opts_i->path_res_type;
+
+ if (opts_i->senddesc_num)
+ opts.senddesc_num = opts_i->senddesc_num;
+ if (opts_i->imm_size)
+ opts.imm_size = opts_i->imm_size;
+ }
+
+ /* Get Service ID from environment */
+ if (!psmi_getenv("PSM2_IB_SERVICE_ID",
+ "HFI Service ID for path resolution",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG_ULONG,
+ (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID,
+ &envvar_val)) {
+ opts.service_id = (uint64_t) envvar_val.e_ulonglong;
+ }
+
+ /* Get Path resolution type from environment Possible choices are:
+ *
+ * NONE : Default same as previous instances. Utilizes static data.
+ * OPP : Use OFED Plus Plus library to do path record queries.
+ * UMAD : Use raw libibumad interface to form and process path records.
+ */
+ if (!psmi_getenv("PSM2_PATH_REC",
+ "Mechanism to query HFI path record (default is no path query)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"none", &envvar_val)) {
+ if (!strcasecmp(envvar_val.e_str, "none"))
+ opts.path_res_type = PSM2_PATH_RES_NONE;
+ else if (!strcasecmp(envvar_val.e_str, "opp"))
+ opts.path_res_type = PSM2_PATH_RES_OPP;
+ else if (!strcasecmp(envvar_val.e_str, "umad"))
+ opts.path_res_type = PSM2_PATH_RES_UMAD;
+ else {
+ _HFI_ERROR("Unknown path resolution type %s. "
+ "Disabling use of path record query.\n",
+ envvar_val.e_str);
+ opts.path_res_type = PSM2_PATH_RES_NONE;
+ }
+ }
+
+ /* If a specific unit is set in the environment, use that one. */
+ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) {
+ opts.unit = envvar_val.e_long;
+ }
+
+ /* Get user specified port number to use. */
+ if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_PORT_NUM_ANY,
+ &envvar_val)) {
+ opts.port = envvar_val.e_long;
+ }
+
+ /* Get service level from environment, path-query overrides it */
+ if (!psmi_getenv
+ ("HFI_SL", "HFI outging ServiceLevel number (default 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
+ opts.outsl = envvar_val.e_long;
+ }
+
+ /* Get network key from environment. MVAPICH and other vendor MPIs do not
+ * specify it on ep open and we may require it for vFabrics.
+ * path-query will override it.
+ */
+ if (!psmi_getenv("PSM2_PKEY",
+ "HFI PKey to use for endpoint",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG,
+ (union psmi_envvar_val)HFI_DEFAULT_P_KEY,
+ &envvar_val)) {
+ opts.network_pkey = (uint64_t) envvar_val.e_ulong;
+ }
+
+ /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of
+ 0x7FFF. That's no longer a valid default, so override it if the
+ client was compiled against PSM v1 */
+ if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
+ opts.network_pkey == 0x7FFF) {
+ opts.network_pkey = HFI_DEFAULT_P_KEY;
+ }
+
+ /* Get number of default send buffers from environment */
+ if (!psmi_getenv("PSM2_NUM_SEND_BUFFERS",
+ "Number of send buffers to allocate [1024]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)1024, &envvar_val)) {
+ opts.sendbufs_num = envvar_val.e_uint;
+ }
+
+ /* Get immediate data size - transfers less than immediate data size do
+ * not consume a send buffer and require just a send descriptor.
+ */
+ if (!psmi_getenv("PSM2_SEND_IMMEDIATE_SIZE",
+ "Immediate data send size not requiring a buffer [128]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)128, &envvar_val)) {
+ opts.imm_size = envvar_val.e_uint;
+ }
+
+ /* Get number of send descriptors - by default this is 4 times the number
+ * of send buffers - mainly used for short/inlined messages.
+ */
+ if (!psmi_getenv("PSM2_NUM_SEND_DESCRIPTORS",
+ "Number of send descriptors to allocate [4096]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)4096, &envvar_val)) {
+ opts.senddesc_num = envvar_val.e_uint;
+ }
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK)
+ goto fail;
+ } else
+ num_units = 0;
+
+ /* do some error checking */
+ if (opts.timeout < -1) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid timeout value %lld",
+ (long long)opts.timeout);
+ goto fail;
+ } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Device Unit ID %d (%d units found)",
+ opts.unit, num_units);
+ goto fail;
+ } else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) &&
+ opts.port != HFI_PORT_NUM_ANY) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Device port number %d",
+ opts.port);
+ goto fail;
+ } else if (opts.affinity < 0
+ || opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) {
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid Affinity option: %d",
+ opts.affinity);
+ goto fail;
+ } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Invalid SL number: %lld",
+ (unsigned long long)opts.outsl);
+ goto fail;
+ }
+
+ /* Set environment variable if PSM is not allowed to set affinity */
+ if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP)
+ setenv("HFI_NO_CPUAFFINITY", "1", 1);
+
+ /* Allocate end point structure storage */
+ ptl_sizes =
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
+ psmi_ptl_self.sizeof_ptl() : 0) +
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
+ psmi_ptl_ips.sizeof_ptl() : 0) +
+ (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
+ psmi_ptl_amsh.sizeof_ptl() : 0);
+ if (ptl_sizes == 0)
+ return PSM2_EP_NO_DEVICE;
+
+ ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64,
+ sizeof(struct psm2_ep) + ptl_sizes);
+ epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ 1, sizeof(struct psm2_epaddr));
+ if (ep == NULL || epaddr == NULL) {
+ err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for %s structure",
+ ep == NULL ? "psm2_ep" : "psm2_epaddr");
+ goto fail;
+ }
+ memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes);
+
+ /* Copy PTL enabled status */
+ for (i = 0; i < PTL_MAX_INIT; i++)
+ ep->devid_enabled[i] = devid_enabled[i];
+
+ /* Matched Queue initialization. We do this early because we have to
+ * make sure ep->mq exists and is valid before calling ips_do_work.
+ */
+ ep->mq = mq;
+
+ /* Get ready for PTL initialization */
+ memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t));
+ ep->epaddr = epaddr;
+ ep->memmode = mq->memmode;
+ ep->hfi_num_sendbufs = opts.sendbufs_num;
+ ep->service_id = opts.service_id;
+ ep->path_res_type = opts.path_res_type;
+ ep->hfi_num_descriptors = opts.senddesc_num;
+ ep->hfi_imm_size = opts.imm_size;
+ ep->errh = psmi_errhandler_global; /* by default use the global one */
+ ep->ptl_amsh.ep_poll = psmi_poll_noop;
+ ep->ptl_ips.ep_poll = psmi_poll_noop;
+ ep->connections = 0;
+
+ /* See how many iterations we want to spin before yielding */
+ psmi_getenv("PSM2_YIELD_SPIN_COUNT",
+ "Spin poll iterations before yield",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD,
+ &envvar_val);
+ ep->yield_spin_cnt = envvar_val.e_uint;
+
+ ptl_sizes = 0;
+ amsh_ptl = ips_ptl = self_ptl = NULL;
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_amsh.sizeof_ptl();
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_ips.sizeof_ptl();
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+ ptl_sizes += psmi_ptl_self.sizeof_ptl();
+ }
+
+ if ((err = psmi_ep_open_device(ep, &opts, unique_job_key,
+ &(ep->context), &ep->epid)))
+ goto fail;
+
+ psmi_assert_always(ep->epid != 0);
+ ep->epaddr->epid = ep->epid;
+
+ _HFI_VDBG("psmi_ep_open_device() passed\n");
+
+ /* Set our new label as soon as we know what it is */
+ strncpy(buf, psmi_gethostname(), sizeof(buf) - 1);
+ buf[sizeof(buf) - 1] = '\0';
+
+ p = buf + strlen(buf);
+
+ /* If our rank is set, use it. If not, use context.subcontext notation */
+ if (((e = getenv("MPI_RANKID")) != NULL && *e) ||
+ ((e = getenv("PSC_MPI_RANK")) != NULL && *e))
+ len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e));
+ else
+ len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.",
+ (uint32_t) psm2_epid_context(ep->epid),
+ (uint32_t) psmi_epid_subcontext(ep->epid));
+ *(p + len) = '\0';
+ ep->context_mylabel = psmi_strdup(ep, buf);
+ if (ep->context_mylabel == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /* hfi_set_mylabel(ep->context_mylabel); */
+
+ if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0)))
+ goto fail;
+
+ _HFI_VDBG("start ptl device init...\n");
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+ if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self)))
+ goto fail;
+ }
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips)))
+ goto fail;
+ }
+ /* If we're shm-only, this device is enabled above */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh)))
+ goto fail;
+ } else {
+ /* We may have pre-attached as part of getting our rank for enabling
+ * shared contexts. */
+ }
+
+ _HFI_VDBG("finish ptl device init...\n");
+
+ /*
+ * Keep only IPS since only IPS support multi-rail, other devices
+ * are only setup once. IPS device can come to this function again.
+ */
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (devid_enabled[i] != PTL_DEVID_IPS) {
+ devid_enabled[i] = -1;
+ }
+ }
+
+ *epido = ep->epid;
+ *epo = ep;
+
+ return PSM2_OK;
+
+fail:
+ if (ep != NULL) {
+ if (ep->context.fd != -1)
+ close(ep->context.fd);
+ psmi_free(ep);
+ }
+ if (epaddr != NULL)
+ psmi_free(epaddr);
+ return err;
+}
+
+/*
+ * __psm2_ep_open - public entry point (psm2_ep_open) for opening an
+ * endpoint.
+ *
+ * unique_job_key: job-wide UUID shared by all participating processes.
+ * opts_i:         caller-supplied open options, forwarded unchanged to
+ *                 __psm2_ep_open_internal.
+ * epo, epido:     out parameters receiving the endpoint handle and its
+ *                 endpoint id; both must be non-NULL.
+ *
+ * Allocates the matched queue, parses PSM2_DEVICES, opens the master
+ * endpoint and - when the ips device is enabled and psmi_ep_multirail
+ * reports extra rails - opens one slave endpoint per extra rail and
+ * links it behind the master.
+ */
+psm2_error_t
+__psm2_ep_open(psm2_uuid_t const unique_job_key,
+ struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo,
+ psm2_epid_t *epido)
+{
+ psm2_error_t err;
+ psm2_mq_t mq;
+ psm2_epid_t epid;
+ psm2_ep_t ep, tmp;
+ uint32_t units[HFI_MAX_RAILS];
+ uint16_t ports[HFI_MAX_RAILS];
+ int i, num_rails = 0;
+ char *uname = "HFI_UNIT";
+ char *pname = "HFI_PORT";
+ char uvalue[6], pvalue[6];
+ int devid_enabled[PTL_MAX_INIT];
+ union psmi_envvar_val devs;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+ if (!epo || !epido)
+ return PSM2_PARAM_ERR;
+
+ /* Allowing only one EP (unless explicitly enabled). */
+ if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_TOO_MANY_ENDPOINTS;
+ }
+
+ /* Matched Queue initialization. We do this early because we have to
+ * make sure ep->mq exists and is valid before calling ips_do_work.
+ */
+ err = psmi_mq_malloc(&mq);
+ /* The creation lock is held from here until the fail: label below,
+ * on both the success and the error paths. */
+ PSMI_LOCK(psmi_creation_lock);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* Set some of the MQ thresholds from the environment.
+ Do this before ptl initialization - the ptl may have other
+ constraints that will limit the MQ's settings. */
+ err = psmi_mq_initialize_defaults(mq);
+ if (err != PSM2_OK)
+ goto fail;
+
+ psmi_init_lock(&(mq->progress_lock));
+
+ /* See which ptl devices we want to use for this ep to be opened */
+ psmi_getenv("PSM2_DEVICES",
+ "Ordered list of PSM-level devices",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);
+
+ if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
+ goto fail;
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ err = psmi_ep_multirail(&num_rails, units, ports);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* If multi-rail is used, set the first ep unit/port */
+ if (num_rails > 0) {
+ snprintf(uvalue, 6, "%1d", units[0]);
+ snprintf(pvalue, 6, "%1d", ports[0]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+ }
+ }
+
+ err = __psm2_ep_open_internal(unique_job_key,
+ devid_enabled, opts_i, mq, &ep, &epid);
+ if (err != PSM2_OK)
+ goto fail;
+
+ /* Append the new master endpoint to the global user-visible list. */
+ if (psmi_opened_endpoint == NULL) {
+ psmi_opened_endpoint = ep;
+ } else {
+ tmp = psmi_opened_endpoint;
+ while (tmp->user_ep_next)
+ tmp = tmp->user_ep_next;
+ tmp->user_ep_next = ep;
+ }
+ psmi_opened_endpoint_count++;
+ /* Master starts out as a singleton in its own multi-rail ring. */
+ ep->mctxt_prev = ep->mctxt_next = ep;
+ ep->mctxt_master = ep;
+ mq->ep = ep;
+
+ /* Active Message initialization */
+ err = psmi_am_init_internal(ep);
+ if (err != PSM2_OK)
+ goto fail;
+
+ *epo = ep;
+ *epido = epid;
+
+ if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ /* One slave endpoint per additional rail; each shares the
+ * master's AM handler table and joins the master's ring. */
+ for (i = 1; i < num_rails; i++) {
+ snprintf(uvalue, 6, "%1d", units[i]);
+ snprintf(pvalue, 6, "%1d", ports[i]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+
+ /* Create slave EP */
+ err = __psm2_ep_open_internal(unique_job_key,
+ devid_enabled, opts_i, mq,
+ &tmp, &epid);
+ if (err)
+ goto fail;
+
+ /* Point back to shared resources on the master EP */
+ tmp->am_htable = ep->am_htable;
+
+ /* Link slave EP after master EP. */
+ PSM_MCTXT_APPEND(ep, tmp);
+ }
+ }
+
+ _HFI_VDBG("psm2_ep_open() OK....\n");
+
+ /* Success also flows through fail:, which only drops the lock.
+ * NOTE(review): on the error paths the mq allocated above is not
+ * freed here - confirm whether it is reclaimed elsewhere. */
+fail:
+ PSMI_UNLOCK(psmi_creation_lock);
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_open)
+
+/*
+ * __psm2_ep_close - public entry point (psm2_ep_close) for closing an
+ * endpoint and, with multi-rail, all of its slave endpoints.
+ *
+ * ep:         master endpoint to close (must be its own mctxt_master).
+ * mode:       PSM2_EP_CLOSE_FORCE or graceful close.
+ * timeout_in: minimum close timeout (units of SEC_ULL); <= 0 selects
+ *             the maximum allowed timeout.
+ *
+ * Returns PSM2_OK, PSM2_TIMEOUT, PSM2_EP_WAS_CLOSED when ep is not on
+ * the opened-endpoint list, or an error from a ptl finalizer.
+ */
+psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+ psm2_error_t err = PSM2_OK;
+#if _HFI_DEBUGGING
+ uint64_t t_start = 0;
+ if (_HFI_PRDBG_ON) {
+ t_start = get_cycles();
+ }
+#endif
+ union psmi_envvar_val timeout_intval;
+ psm2_ep_t tmp;
+ psm2_mq_t mmq;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+ psmi_assert_always(ep->mctxt_master == ep);
+
+ PSMI_LOCK(psmi_creation_lock);
+
+ if (psmi_opened_endpoint == NULL) {
+ err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+ "PSM Endpoint is closed or does not exist");
+ PSM2_LOG_MSG("leaving");
+ PSMI_UNLOCK(psmi_creation_lock);
+ return err;
+ }
+
+ /* Make sure ep is actually on the opened-endpoint list. */
+ tmp = psmi_opened_endpoint;
+ while (tmp && tmp != ep) {
+ tmp = tmp->user_ep_next;
+ }
+ if (!tmp) {
+ err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+ "PSM Endpoint is closed or does not exist");
+ PSM2_LOG_MSG("leaving");
+ PSMI_UNLOCK(psmi_creation_lock);
+ return err;
+ }
+
+ psmi_getenv("PSM2_CLOSE_TIMEOUT",
+ "End-point close timeout over-ride.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_CLOSE_TIMEOUT")) {
+ timeout_in = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout_in > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
+ }
+
+ if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
+ timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+ /* Infinite and excessive close time-out are limited here to a max.
+ * The "rationale" is that there is no point waiting around forever for
+ * graceful termination. Normal (or forced) process termination should clean
+ * up the context state correctly even if termination is not graceful. */
+ if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+ timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+ _HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+ "%d connections\n",
+ ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO",
+ (double)timeout_in / 1e9, (int)ep->connections);
+
+ /* XXX We currently cheat in the sense that we leave each PTL the allowed
+ * timeout. There's no good way to do this until we change the PTL
+ * interface to allow asynchronous finalization
+ */
+
+ /*
+ * Before freeing the master ep itself,
+ * remove it from the global linklist.
+ * We do it here to let atexit handler in ptl_am directory
+ * to search the global linklist and free the shared memory file.
+ */
+ if (psmi_opened_endpoint == ep) {
+ /* Removing ep from global endpoint list. */
+ psmi_opened_endpoint = ep->user_ep_next;
+ } else {
+ tmp = psmi_opened_endpoint;
+ while (tmp->user_ep_next != ep) {
+ tmp = tmp->user_ep_next;
+ }
+ /* Removing ep from global endpoint list. */
+ tmp->user_ep_next = ep->user_ep_next;
+ }
+ psmi_opened_endpoint_count--;
+
+ /*
+ * This do/while loop is used to close and free memory of endpoints.
+ *
+ * If MULTIRAIL feature is disable this loop will be passed only once
+ * and only endpoint passed in psm2_ep_close will be closed/removed.
+ *
+ * If MULTIRAIL feature is enabled then this loop will be passed
+ * multiple times (depending on number of rails). The order in which
+ * endpoints will be closed is shown below:
+ *
+ * |--this is master endpoint in case of multirail
+ * | this endpoint is passed to psm2_ep_close and
+ * V this is only endpoint known to user.
+ * +<-Ep0<-Ep1<-Ep2<-Ep3
+ * |__________________| Ep3->mctxt_prev points to Ep2
+ * (3) (2) (1) (4) Ep2->mctxt_prev points to Ep1
+ * ^ Ep1->mctxt_prev points to Ep0
+ * | Ep0->mctxt_prev points to Ep3 (master ep)
+ * |
+ * |---- order in which endpoints will be closed.
+ *
+ * Closing MULTIRAILs starts by closing slaves (Ep2, Ep1, Ep0)
+ * If MULTIRAIL is enabled then Ep3->mctxt_prev will point to Ep2, if
+ * feature is disabled then Ep3->mctxt_prev will point to Ep3 and
+ * do/while loop will have one pass.
+ *
+ * In case of MULTIRAIL enabled Ep3 which is master endpoint will be
+ * closed as the last one.
+ */
+ mmq = ep->mq;
+ tmp = ep->mctxt_prev;
+ do {
+ ep = tmp;
+ tmp = ep->mctxt_prev;
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ PSM_MCTXT_REMOVE(ep);
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+ err =
+ psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
+ timeout_in);
+
+ if ((err == PSM2_OK || err == PSM2_TIMEOUT) &&
+ psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ err =
+ psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
+ timeout_in);
+
+ /* If there's timeouts in the disconnect requests,
+ * still make sure that we still get to close the
+ *endpoint and mark it closed */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ psmi_context_close(&ep->context);
+
+ psmi_free(ep->epaddr);
+ psmi_free(ep->context_mylabel);
+
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+ ep->mq = NULL;
+ psmi_free(ep);
+
+ } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep);
+
+ if (mmq)
+ err = psmi_mq_free(mmq);
+
+ PSMI_UNLOCK(psmi_creation_lock);
+
+#if _HFI_DEBUGGING
+ /* BUGFIX: t_start is only declared under _HFI_DEBUGGING (top of
+ * function), so this timing report must be guarded the same way or
+ * non-debug builds reference an undeclared identifier. */
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
+ (double)cycles_to_nanosecs(get_cycles() -
+ t_start) / SEC_ULL);
+ }
+#endif
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_close)
+
+/*
+ * psmi_ep_open_device - open the device backing this endpoint and
+ * produce its epid.
+ *
+ * Three mutually exclusive paths, keyed on which ptl devices the ep
+ * has enabled: ips (opens a real HFI context and takes its epid),
+ * amsh (shm-only; epid derived from the local rank or pid), or
+ * self-only (fixed epid).  Returns PSM2_OK or the error from context
+ * open / pkey verification.
+ */
+static
+psm2_error_t
+psmi_ep_open_device(const psm2_ep_t ep,
+ const struct psm2_ep_open_opts *opts,
+ const psm2_uuid_t unique_job_key,
+ struct psmi_context *context, psm2_epid_t *epid)
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* Skip affinity. No affinity if:
+ * 1. User explicitly sets no-affinity=YES in environment.
+ * 2. User doesn't set affinity in environment and PSM is opened with
+ * option affinity skip.
+ */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ uint32_t rcvthread_flags;
+ union psmi_envvar_val env_rcvthread;
+ static int norcvthread; /* only for first rail */
+
+ ep->out_sl = opts->outsl;
+
+ if ((err =
+ psmi_context_open(ep, opts->unit, opts->port,
+ unique_job_key, opts->timeout,
+ context)) != PSM2_OK)
+ goto fail;
+
+ /* NOTE(review): the port is logged as the literal 1 rather
+ * than opts->port - looks like a stale debug message;
+ * confirm intended. */
+ _HFI_DBG("[%d]use unit %d port %d\n", getpid(),
+ context->ctrl->__hfi_unit, 1);
+
+ /* At this point, we have the unit id and port number, so
+ * check if pkey is not 0x0/0x7fff/0xffff, and match one
+ * of the pkey in table.
+ */
+ if ((err =
+ psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
+ &ep->network_pkey)) != PSM2_OK)
+ goto fail;
+
+ /* See if we want to activate support for receive thread */
+ psmi_getenv("PSM2_RCVTHREAD",
+ "Recv thread flags (0 disables thread)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)(norcvthread++ ? 0 :
+ PSMI_RCVTHREAD_FLAGS),
+ &env_rcvthread);
+ rcvthread_flags = env_rcvthread.e_uint;
+
+ /* If enabled, use the pollurg capability to implement a receive
+ * interrupt thread that can handle urg packets */
+ if (rcvthread_flags) {
+ context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
+#ifdef PSMI_PLOCK_IS_NOLOCK
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+ "with RCVTHREAD on");
+#endif
+ }
+ context->rcvthread_flags = rcvthread_flags;
+
+ *epid = context->epid;
+ } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ int rank;
+
+ /* In shm-only mode, we need to derive a valid epid
+ * based on our rank. We try to get it from the
+ * environment if its available, or resort to using
+ * our PID as the rank.
+ */
+ union psmi_envvar_val env_rankid;
+
+ /* Fallback chain: MPI_LOCALRANKID, then PSC_MPI_NODE_RANK,
+ * then getpid().  A non-zero psmi_getenv return appears to
+ * mean "variable not set" (cf. the !psmi_getenv uses earlier
+ * in this file). */
+ if (psmi_getenv
+ ("MPI_LOCALRANKID", "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ if (psmi_getenv
+ ("PSC_MPI_NODE_RANK",
+ "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ rank = getpid();
+ } else
+ rank = env_rankid.e_int;
+ } else
+ rank = env_rankid.e_int;
+
+ /*
+ * We use a LID of 0 for non-HFI communication.
+ * Since a jobkey is not available from IPS, pull the
+ * first 16 bits from the UUID.
+ */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(((uint16_t *) unique_job_key)[0],
+ (rank >> 3), rank, 0,
+ PSMI_EPID_VERSION_SHM, rank);
+ break;
+ case PSMI_EPID_V2:
+ /* Construct epid for this Endpoint */
+ *epid = PSMI_EPID_PACK_V2_SHM(getpid(),
+ PSMI_EPID_SHM_ONLY, /*is a only-shm epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supportd version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ } else {
+ /* Self-only, meaning only 1 proc max */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(
+ 0, 0, 0, 0, PSMI_EPID_VERSION_SHM, 0x3ffffff);
+ break;
+ case PSMI_EPID_V2:
+ *epid = PSMI_EPID_PACK_V2_SHM(0,
+ PSMI_EPID_SHM_ONLY, /*is a only-shm epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supportd version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ }
+
+fail:
+ return err;
+}
+
+/* Get a list of PTLs we want to use. The order is important, it affects
+ * whether node-local processes use shm or ips.
+ *
+ * devices[] receives up to PTL_MAX_INIT device ids in the order they
+ * appear in devstring; unused slots are set to -1.  Recognized
+ * aliases: "self"; "shm"/"shmem"/"amsh"; "hfi"/"ipath"/"ips".
+ *
+ * Returns PSM2_OK on success, PSM2_PARAM_ERR for an unrecognized
+ * device name, PSM2_NO_MEMORY if the scratch buffer cannot be
+ * allocated. */
+static
+psm2_error_t
+psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
+{
+ char *devstr = NULL;
+ char *b_new, *e, *ee, *b;
+ psm2_error_t err = PSM2_OK;
+ int len;
+ int i = 0;
+
+ psmi_assert_always(devstring != NULL);
+ len = strlen(devstring) + 1;
+
+ for (i = 0; i < PTL_MAX_INIT; i++)
+ devices[i] = -1;
+
+ /* One zeroed buffer, two halves: the second half holds a writable
+ * copy of devstring for tokenizing, the first half accumulates the
+ * canonical device list for the debug message below. */
+ devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len);
+ if (devstr == NULL) {
+ /* BUGFIX: previously fell through with err == PSM2_OK, so an
+ * allocation failure was reported as success with every
+ * device silently disabled. */
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ b_new = (char *)devstr;
+ e = b_new + len;
+ strncpy(e, devstring, len - 1);
+ e[len - 1] = '\0';
+ ee = e + len;
+ i = 0;
+ while (e < ee && *e && i < PTL_MAX_INIT) {
+ /* Skip separators, then collect one alphabetic token.  The
+ * unsigned char casts keep isalpha() defined for high-bit
+ * chars on platforms where plain char is signed. */
+ while (*e && !isalpha((unsigned char)*e))
+ e++;
+ b = e;
+ while (*e && isalpha((unsigned char)*e))
+ e++;
+ *e = '\0';
+ if (*b) {
+ if (!strcasecmp(b, "self")) {
+ devices[i++] = PTL_DEVID_SELF;
+ b_new = strcpy(b_new, "self,");
+ b_new += 5;
+ } else if (!strcasecmp(b, "shm") ||
+ !strcasecmp(b, "shmem") ||
+ !strcasecmp(b, "amsh")) {
+ devices[i++] = PTL_DEVID_AMSH;
+ strcpy(b_new, "amsh,");
+ b_new += 5;
+ } else if (!strcasecmp(b, "hfi") ||
+ !strcasecmp(b, "ipath") ||
+ !strcasecmp(b, "ips")) {
+ devices[i++] = PTL_DEVID_IPS;
+ strcpy(b_new, "ips,");
+ b_new += 4;
+ } else {
+ /* BUGFIX: the message named PSM_PTL_DEVICES, but
+ * the variable actually parsed is PSM2_DEVICES
+ * (see the psmi_getenv call in __psm2_ep_open). */
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "%s set in environment variable PSM2_DEVICES=\"%s\" "
+ "is not one of the recognized PTL devices (%s)",
+ b, devstring,
+ PSMI_DEVICES_DEFAULT);
+ goto fail;
+ }
+ e++;
+ }
+ }
+ if (b_new != devstr) /* we parsed something, remove trailing comma */
+ *(b_new - 1) = '\0';
+
+ _HFI_PRDBG("PSM Device allocation order: %s\n", devstr);
+fail:
+ if (devstr != NULL)
+ psmi_free(devstr);
+ return err;
+}
+
+/* Return 1 if devid appears anywhere in the fixed-size enabled-device
+ * table, 0 otherwise. */
+static
+int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
+{
+ int slot = 0;
+
+ while (slot < PTL_MAX_INIT) {
+ if (devid_enabled[slot] == devid)
+ return 1;
+ slot++;
+ }
+ return 0;
+}
+
+/* Return 1 if devid is enabled for this endpoint; an endpoint's device
+ * set is just its devid_enabled table. */
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid)
+{
+ const int *enabled = ep->devid_enabled;
+
+ return psmi_device_is_enabled(enabled, devid);
+}
diff --git a/psm_ep.h b/psm_ep.h
new file mode 100644
index 0000000..78b12f1
--- /dev/null
+++ b/psm_ep.h
@@ -0,0 +1,245 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm2_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_EP_H
+#define _PSMI_EP_H
+
+/*
+ * EPIDs encode the following information:
+ *
+ * LID:16 bits - LID for endpoint
+ * CONTEXT:8 bits - Context used for bits (upto 256 contexts)
+ * SUBCONTEXT:3 bits - Subcontext used for endpoint
+ * HFIUNIT: 2 bits - HFI unit number
+ * HFITYPE: 3 bits - OPA1, OPA2, ...
+ * RANK: 26 bits - process rank
+ * reserved: 6 bit - for future usage
+ */
+
+/* HFI hardware generations encoded in the HFITYPE epid field. */
+#define PSMI_HFI_TYPE_UNKNOWN 0
+#define PSMI_HFI_TYPE_OPA1 1
+#define PSMI_HFI_TYPE_OPA2 2
+
+/* Default and bound values for service level (SL), service channel
+ * (SC) and virtual lane (VL). */
+#define PSMI_SL_DEFAULT 0
+#define PSMI_SC_DEFAULT 0
+#define PSMI_VL_DEFAULT 0
+#define PSMI_SL_MIN 0
+#define PSMI_SL_MAX 31
+#define PSMI_SC_ADMIN 15
+#define PSMI_VL_ADMIN 15
+
+/* Pack an epid using the V1 layout documented above. */
+#define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \
+ (((((uint64_t)lid)&0xffff)<<16) | \
+ ((((uint64_t)context)&0xff)<<8) | \
+ ((((uint64_t)subcontext)&0x7)<<5) | \
+ ((((uint64_t)hfiunit)&0x3)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0) | \
+ ((((uint64_t)rank)&0x3ffffff)<<32))
+
+/* V2 layout: 24-bit LID, shm flag at bit 3, 16-bit subnet id in the
+ * top bits. */
+#define PSMI_EPID_PACK_V2(lid, context, subcontext, shmbool, epid_version, subnet_id) \
+ (((((uint64_t)lid)&0xffffff)<<16) | \
+ ((((uint64_t)context)&0xff)<<8) | \
+ ((((uint64_t)subcontext)&0x7)<<5) | \
+ ((((uint64_t)shmbool)&0x1)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0) | \
+ ((((uint64_t)subnet_id)&0xffff)<<48))
+
+/* V2 shm-only epid: the process id stands in for the network fields. */
+#define PSMI_EPID_PACK_V2_SHM(process_id, shmbool, epid_version) \
+ (((((uint64_t)process_id)&0xffffffff)<<32) | \
+ ((((uint64_t)shmbool)&0x1)<<3) | \
+ ((((uint64_t)epid_version)&0x7)<<0))
+
+/* Field extractors.  V1/V2-specific extractors carry a suffix; the
+ * unsuffixed ones read fields whose position is common to both
+ * layouts. */
+#define PSMI_EPID_GET_LID_V1(epid) (((epid)>>16)&0xffff)
+#define PSMI_EPID_GET_LID_V2(epid) (((epid)>>16)&0xffffff)
+#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0xff)
+#define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>5)&0x7)
+#define PSMI_EPID_GET_HFIUNIT(epid) (((epid)>>3)&0x3)
+#define PSMI_EPID_GET_EPID_VERSION(epid) (((epid)>>0)&0x7)
+#define PSMI_EPID_GET_RANK(epid) (((epid)>>32)&0x3ffffff)
+#define PSMI_EPID_GET_SHMBOOL(epid) (((epid)>>3)&0x1)
+#define PSMI_EPID_GET_SUBNET_ID(epid) (((epid)>>48)&0xffff)
+#define PSMI_EPID_GET_PROCESS_ID(epid) (((epid)>>32)&0xffffffff)
+
+/* Endpoint connect/close time-out bounds, in units of SEC_ULL. */
+#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL)
+
+#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL)
+
+#define PSM_MCTXT_APPEND(head, node) \
+ node->mctxt_prev = head->mctxt_prev; \
+ node->mctxt_next = head; \
+ head->mctxt_prev->mctxt_next = node; \
+ head->mctxt_prev = node; \
+ node->mctxt_master = head
+#define PSM_MCTXT_REMOVE(node) \
+ node->mctxt_prev->mctxt_next = node->mctxt_next; \
+ node->mctxt_next->mctxt_prev = node->mctxt_prev; \
+ node->mctxt_next = node->mctxt_prev = node; \
+ node->mctxt_master = NULL
+
+#define HFI_MAX_RAILS 4
+
+/*
+ * One opened endpoint (one rail).  Slave endpoints for extra rails are
+ * linked to the master through the circular mctxt_prev/mctxt_next
+ * ring; all user-visible endpoints are chained through user_ep_next.
+ */
+struct psm2_ep {
+ psm2_epid_t epid; /**> This endpoint's Endpoint ID */
+ psm2_epaddr_t epaddr; /**> This ep's ep address */
+ psm2_mq_t mq; /**> only 1 MQ */
+ int unit_id;
+ uint16_t portnum;
+ uint16_t out_sl;
+ uint16_t mtu; /* out_sl-->vl-->mtu in sysfs */
+ uint16_t network_pkey; /**> OPA Pkey */
+ int did_syslog;
+ psm2_uuid_t uuid;
+ uint16_t jkey;
+ uint64_t service_id; /* OPA service ID */
+ psm2_path_res_t path_res_type; /* Path resolution for endpoint */
+ psm2_ep_errhandler_t errh;
+ int devid_enabled[PTL_MAX_INIT];
+ int memmode; /**> min, normal, large memory mode */
+
+ uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */
+ uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/
+ uint32_t hfi_imm_size; /** Immediate data size */
+ uint32_t connections; /**> Number of connections */
+
+ psmi_context_t context;
+ char *context_mylabel;
+ /* Spin-poll iterations before yielding (see PSMI_BLOCKUNTIL). */
+ uint32_t yield_spin_cnt;
+
+ /* EP link-lists */
+ struct psm2_ep *user_ep_next;
+
+ /* EP link-lists for multi-context. */
+ struct psm2_ep *mctxt_prev;
+ struct psm2_ep *mctxt_next;
+ struct psm2_ep *mctxt_master;
+
+ /* Active Message handler table */
+ void **am_htable;
+
+ uint64_t gid_hi;
+ uint64_t gid_lo;
+
+ ptl_ctl_t ptl_amsh;
+ ptl_ctl_t ptl_ips;
+ ptl_ctl_t ptl_self;
+
+ /* All ptl data is allocated inline below */
+ uint8_t ptl_base_data[0] __attribute__ ((aligned(64)));
+};
+
+/* Head/tail of an MQ request list. */
+struct mqq {
+ psm2_mq_req_t first;
+ psm2_mq_req_t last;
+};
+
+/*
+ * Sequence number, viewable three ways: an 11-bit sequence with a
+ * 20-bit generation (11 + 20 = 31 bits), a single 31-bit number, or
+ * the raw 32-bit value.
+ */
+typedef
+union psmi_seqnum {
+ struct {
+ uint32_t psn_seq:11;
+ uint32_t psn_gen:20;
+ };
+ struct {
+ uint32_t psn_num:31;
+ };
+ uint32_t psn_val;
+} psmi_seqnum_t;
+
+/*
+ * PSM end point address. One per connection and per rail.
+ */
+struct psm2_epaddr {
+ psm2_epid_t epid; /* peer's epid */
+ ptl_ctl_t *ptlctl; /* The control structure for the ptl */
+ struct ips_proto *proto; /* only for ips protocol */
+ void *usr_ep_ctxt; /* User context associated with endpoint */
+};
+
+/* Default spin-poll iterations before yielding (overridable via the
+ * PSM2_YIELD_SPIN_COUNT environment variable, see psm_ep.c). */
+#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD
+# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250
+#endif
+
+/*
+ * Spin-poll until cond becomes true, yielding the progress lock after
+ * every yield_spin_cnt consecutive no-progress polls; the loop exits
+ * early on any error other than PSM2_OK_NO_PROGRESS.
+ * Users of BLOCKUNTIL should check the value of err upon return.
+ * NOTE(review): the yield test is ++spin_cnt == yield_spin_cnt, so a
+ * yield_spin_cnt of 0 would in effect never yield until the counter
+ * wraps - confirm callers never configure 0.
+ */
+#define PSMI_BLOCKUNTIL(ep, err, cond) do { \
+ int spin_cnt = 0; \
+ PSMI_PROFILE_BLOCK(); \
+ while (!(cond)) { \
+ err = psmi_poll_internal(ep, 1); \
+ if (err == PSM2_OK_NO_PROGRESS) { \
+ PSMI_PROFILE_REBLOCK(1); \
+ if (++spin_cnt == (ep)->yield_spin_cnt) { \
+ spin_cnt = 0; \
+ PSMI_YIELD((ep)->mq->progress_lock); \
+ } \
+ } \
+ else if (err == PSM2_OK) { \
+ PSMI_PROFILE_REBLOCK(0); \
+ spin_cnt = 0; \
+ } \
+ else \
+ break; \
+ } \
+ PSMI_PROFILE_UNBLOCK(); \
+} while (0)
+
+#endif /* _PSMI_EP_H */
diff --git a/psm_ep_connect.c b/psm_ep_connect.c
new file mode 100644
index 0000000..9657209
--- /dev/null
+++ b/psm_ep_connect.c
@@ -0,0 +1,620 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+#if _HFI_DEBUGGING
+PSMI_ALWAYS_INLINE(
+char *psmi_getdevice(int type))
+{
+ switch (type) {
+ case PTL_DEVID_IPS:
+ return "ips";
+ case PTL_DEVID_AMSH:
+ return "amsh";
+ case PTL_DEVID_SELF:
+ return "self";
+ default:
+ return "ips";
+ }
+}
+#endif
+
+psm2_error_t
+__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid,
+ int const *array_of_epid_mask, /* can be NULL */
+ psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr,
+ int64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ptl_ctl_t *ptlctl;
+ ptl_t *ptl;
+ int i, j, dup_idx;
+ int num_toconnect = 0;
+ int *epid_mask = NULL;
+ int *epid_mask_isdupof = NULL;
+ uint64_t t_start = get_cycles();
+ uint64_t t_left;
+ union psmi_envvar_val timeout_intval;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+ /*
+ * Normally we would lock here, but instead each implemented ptl component
+ * does its own locking. This is mostly because the ptl components are
+ * ahead of the PSM2 interface in that they can disconnect their peers.
+ */
+ if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
+ num_of_epid < 1) {
+ err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+ "Invalid psm2_ep_connect parameters");
+ goto fail_nolock;
+ }
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ /* We need two of these masks to detect duplicates */
+ err = PSM2_NO_MEMORY;
+ epid_mask =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+ if (epid_mask == NULL)
+ goto fail;
+ epid_mask_isdupof =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+ if (epid_mask_isdupof == NULL)
+ goto fail;
+ err = PSM2_OK;
+
+ /* Eventually handle timeouts across all connects. */
+ for (j = 0; j < num_of_epid; j++) {
+ if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
+ epid_mask[j] = 0;
+ else {
+ epid_mask[j] = 1;
+ array_of_errors[j] = PSM2_EPID_UNKNOWN;
+ array_of_epaddr[j] = NULL;
+ if (psmi_epid_version(array_of_epid[j]) >
+ PSMI_EPID_VERSION) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Unkown version of EPID - %"PRIu64" \n"
+ "Please upgrade PSM2 or set PSM2_ADDR_FMT=1 in the environment to force EPID version 1 \n",
+ psmi_epid_version(array_of_epid[j]));
+ }
+ num_toconnect++;
+ }
+ epid_mask_isdupof[j] = -1;
+ }
+
+ psmi_getenv("PSM2_CONNECT_TIMEOUT",
+ "End-point connection timeout over-ride. 0 for no time-out.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_CONNECT_TIMEOUT")) {
+ timeout = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
+ }
+
+ if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+ timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+ _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
+ num_toconnect, (double)timeout / 1e9);
+
+ /* Look for duplicates in input array */
+ for (i = 0; i < num_of_epid; i++) {
+ for (j = i + 1; j < num_of_epid; j++) {
+ if (array_of_epid[i] == array_of_epid[j] &&
+ epid_mask[i] && epid_mask[j]) {
+ epid_mask[j] = 0; /* don't connect more than once */
+ epid_mask_isdupof[j] = i;
+ }
+ }
+ }
+
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (ep->devid_enabled[i] == -1)
+ continue;
+ /* Set up the right connect ptrs */
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ ptlctl = &ep->ptl_ips;
+ ptl = ep->ptl_ips.ptl;
+ break;
+ case PTL_DEVID_AMSH:
+ ptlctl = &ep->ptl_amsh;
+ ptl = ep->ptl_amsh.ptl;
+ break;
+ case PTL_DEVID_SELF:
+ ptlctl = &ep->ptl_self;
+ ptl = ep->ptl_self.ptl;
+ break;
+ default:
+ ptlctl = &ep->ptl_ips; /*no-unused */
+ ptl = ep->ptl_ips.ptl; /*no-unused */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled PTL id %d\n",
+ ep->devid_enabled[i]);
+ break;
+ }
+ t_left = psmi_cycles_left(t_start, timeout);
+
+ if (_HFI_VDBG_ON) {
+ _HFI_VDBG_ALWAYS
+ ("Trying to connect with device %s\n",
+ psmi_getdevice(ep->devid_enabled[i]));
+ }
+ if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
+ epid_mask, array_of_errors,
+ array_of_epaddr,
+ cycles_to_nanosecs(t_left)))) {
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS
+ ("Connect failure in device %s err=%d\n",
+ psmi_getdevice(ep->devid_enabled[i]), err);
+ }
+ goto connect_fail;
+ }
+
+ /* Now process what's been connected */
+ for (j = 0; j < num_of_epid; j++) {
+ dup_idx = epid_mask_isdupof[j];
+ if (!epid_mask[j] && dup_idx == -1)
+ continue;
+
+ if (dup_idx != -1) { /* dup */
+ array_of_epaddr[j] = array_of_epaddr[dup_idx];
+ array_of_errors[j] = array_of_errors[dup_idx];
+ epid_mask_isdupof[j] = -1;
+ }
+
+ if (array_of_errors[j] == PSM2_OK) {
+ epid_mask[j] = 0; /* don't try on next ptl */
+ ep->connections++;
+ }
+ }
+ }
+
+ for (i = 0; i < num_of_epid; i++) {
+ ptl_ctl_t *c = NULL;
+ if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
+ continue;
+ /* If we see unreachable here, that means some PTLs were not enabled */
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+ err = PSM2_EPID_UNREACHABLE;
+ break;
+ }
+
+ psmi_assert_always(array_of_epaddr[i] != NULL);
+ c = array_of_epaddr[i]->ptlctl;
+ psmi_assert_always(c != NULL);
+ _HFI_VDBG("%-20s DEVICE %s (%p)\n",
+ psmi_epaddr_get_name(array_of_epid[i]),
+ c == &ep->ptl_ips ? "hfi" :
+ (c == &ep->ptl_amsh ? "amsh" : "self"),
+ (void *)array_of_epaddr[i]->ptlctl->ptl);
+ }
+
+ if (err == PSM2_OK)
+ for (i=0; i<num_of_epid; i++)
+ array_of_errors[i] = PSM2_OK;
+
+connect_fail:
+ /* If the error is a timeout (at worse) and the client is OPA MPI,
+ * just return timeout to let OPA MPI handle the hostnames that
+ * timed out */
+ if (err != PSM2_OK) {
+ char errbuf[PSM2_ERRSTRING_MAXLEN];
+ size_t len;
+ int j = 0;
+
+ if (err == PSM2_EPID_UNREACHABLE) {
+ char *deverr = "of an incorrect setting";
+ char *eperr = "";
+ char *devname = NULL;
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ deverr =
+ "there is no shared memory PSM2 device (shm)";
+ eperr = " shared memory";
+ } else
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ deverr =
+ "there is no OPA PSM2 device (hfi)";
+ eperr = " OPA";
+ }
+
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "Some%s endpoints could not be connected because %s "
+ "in the currently enabled PSM2_DEVICES (",
+ eperr, deverr);
+ for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1;
+ i++) {
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ devname = "hfi";
+ break;
+ case PTL_DEVID_AMSH:
+ devname = "shm";
+ break;
+ case PTL_DEVID_SELF:
+ default:
+ devname = "self";
+ break;
+ }
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%s,",
+ devname);
+ }
+ if (len < sizeof(errbuf) - 1 && devname != NULL)
+ /* parsed something, remove trailing comma */
+ errbuf[len - 1] = ')';
+ } else
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "%s", err == PSM2_TIMEOUT ?
+ "Detected connection timeout" :
+ psm2_error_get_string(err));
+
+ /* first pass, look for all nodes with the error */
+ for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
+ if (array_of_epid_mask != NULL
+ && !array_of_epid_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_OK)
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+ err != PSM2_EPID_UNREACHABLE)
+ continue;
+ if (err == array_of_errors[i]) {
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%c %s",
+ j == 0 ? ':' : ',',
+ psmi_epaddr_get_hostname
+ (array_of_epid[i]));
+ j++;
+ }
+ }
+ errbuf[sizeof(errbuf) - 1] = '\0';
+ err = psmi_handle_error(ep, err, "%s", errbuf);
+ }
+
+fail:
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+ if (epid_mask != NULL)
+ psmi_free(epid_mask);
+ if (epid_mask_isdupof != NULL)
+ psmi_free(epid_mask_isdupof);
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_connect)
+
+psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int64_t timeout)
+{
+ return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr,
+ array_of_epaddr_mask, array_of_errors,
+ PSM2_EP_DISCONNECT_GRACEFUL, timeout);
+}
+PSMI_API_DECL(psm2_ep_disconnect)
+
+psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int mode, int64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ptl_ctl_t *ptlctl;
+ ptl_t *ptl;
+ int i, j, dup_idx;
+ int num_todisconnect = 0;
+ int *epaddr_mask = NULL;
+ int *epaddr_mask_isdupof = NULL;
+ uint64_t t_start = get_cycles();
+ uint64_t t_left;
+ union psmi_envvar_val timeout_intval;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+
+ /*
+ * Normally we would lock here, but instead each implemented ptl component
+ * does its own locking. This is mostly because the ptl components are
+ * ahead of the PSM2 interface in that they can disconnect their peers.
+ */
+ if (ep == NULL || array_of_epaddr == NULL ||
+ num_of_epaddr < 1) {
+ err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+ "Invalid psm2_ep_disconnect parameters");
+ goto fail_nolock;
+ }
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ /* We need two of these masks to detect duplicates */
+ err = PSM2_NO_MEMORY;
+ epaddr_mask =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+ if (epaddr_mask == NULL)
+ goto fail;
+ epaddr_mask_isdupof =
+ (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+ if (epaddr_mask_isdupof == NULL)
+ goto fail;
+ err = PSM2_OK;
+
+ /* Eventually handle timeouts across all connects. */
+ for (j = 0; j < num_of_epaddr; j++) {
+ if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j])
+ epaddr_mask[j] = 0;
+ else {
+ epaddr_mask[j] = 1;
+ array_of_errors[j] = PSM2_EPID_UNKNOWN;
+ num_todisconnect++;
+ }
+ epaddr_mask_isdupof[j] = -1;
+ }
+
+ psmi_getenv("PSM2_DISCONNECT_TIMEOUT",
+ "End-point disconnection timeout over-ride. 0 for no time-out.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &timeout_intval);
+
+ if (getenv("PSM2_DISCONNECT_TIMEOUT")) {
+ timeout = timeout_intval.e_uint * SEC_ULL;
+ } else if (timeout > 0) {
+ /* The timeout parameter provides the minimum timeout. A heuristic
+ * is used to scale up the timeout linearly with the number of
+ * endpoints, and we allow one second per 100 endpoints. */
+ timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100);
+ }
+
+ if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+ timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+ _HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n",
+ num_todisconnect, (double)timeout / 1e9);
+
+ /* Look for duplicates in input array */
+ for (i = 0; i < num_of_epaddr; i++) {
+ for (j = i + 1; j < num_of_epaddr; j++) {
+ if (array_of_epaddr[i] == array_of_epaddr[j] &&
+ epaddr_mask[i] && epaddr_mask[j]) {
+ epaddr_mask[j] = 0; /* don't disconnect more than once */
+ epaddr_mask_isdupof[j] = i;
+ }
+ }
+ }
+
+ for (i = 0; i < PTL_MAX_INIT; i++) {
+ if (ep->devid_enabled[i] == -1)
+ continue;
+ /* Set up the right connect ptrs */
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ ptlctl = &ep->ptl_ips;
+ ptl = ep->ptl_ips.ptl;
+ break;
+ case PTL_DEVID_AMSH:
+ ptlctl = &ep->ptl_amsh;
+ ptl = ep->ptl_amsh.ptl;
+ break;
+ case PTL_DEVID_SELF:
+ ptlctl = &ep->ptl_self;
+ ptl = ep->ptl_self.ptl;
+ break;
+ default:
+ ptlctl = &ep->ptl_ips; /*no-unused */
+ ptl = ep->ptl_ips.ptl; /*no-unused */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled PTL id %d\n",
+ ep->devid_enabled[i]);
+ break;
+ }
+ t_left = psmi_cycles_left(t_start, timeout);
+
+ if (_HFI_VDBG_ON) {
+ _HFI_VDBG_ALWAYS
+ ("Trying to disconnect with device %s\n",
+ psmi_getdevice(ep->devid_enabled[i]));
+ }
+ if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE),
+ num_of_epaddr, array_of_epaddr,
+ epaddr_mask, array_of_errors,
+ cycles_to_nanosecs(t_left)))) {
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS
+ ("Disconnect failure in device %s err=%d\n",
+ psmi_getdevice(ep->devid_enabled[i]), err);
+ }
+ goto disconnect_fail;
+ }
+
+ /* Now process what's been disconnected */
+ for (j = 0; j < num_of_epaddr; j++) {
+ dup_idx = epaddr_mask_isdupof[j];
+ if (!epaddr_mask[j] && dup_idx == -1)
+ continue;
+
+ if (dup_idx != -1) { /* dup */
+ array_of_errors[j] = array_of_errors[dup_idx];
+ epaddr_mask_isdupof[j] = -1;
+ }
+
+ if (array_of_errors[j] == PSM2_OK) {
+ epaddr_mask[j] = 0; /* don't try on next ptl */
+ array_of_epaddr[j] = NULL;
+ ep->connections--;
+ }
+ }
+ }
+
+ for (i = 0; i < num_of_epaddr; i++) {
+ if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i])
+ continue;
+ /* If we see unreachable here, that means some PTLs were not enabled */
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+ err = PSM2_EPID_UNREACHABLE;
+ break;
+ }
+ }
+
+disconnect_fail:
+ /* If the error is a timeout (at worse) and the client is OPA MPI,
+ * just return timeout to let OPA MPI handle the hostnames that
+ * timed out */
+ if (err != PSM2_OK) {
+ char errbuf[PSM2_ERRSTRING_MAXLEN];
+ size_t len;
+ int j = 0;
+
+ if (err == PSM2_EPID_UNREACHABLE) {
+ char *deverr = "of an incorrect setting";
+ char *eperr = "";
+ char *devname = NULL;
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ deverr =
+ "there is no shared memory PSM2 device (shm)";
+ eperr = " shared memory";
+ } else
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ deverr =
+ "there is no OPA PSM2 device (hfi)";
+ eperr = " OPA";
+ }
+
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "Some%s endpoints could not be disconnected because %s "
+ "in the currently enabled PSM2_DEVICES (",
+ eperr, deverr);
+ for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) {
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ devname = "hfi";
+ break;
+ case PTL_DEVID_AMSH:
+ devname = "shm";
+ break;
+ case PTL_DEVID_SELF:
+ default:
+ devname = "self";
+ break;
+ }
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%s,",
+ devname);
+ }
+ if (len < sizeof(errbuf) - 1 && devname != NULL)
+ /* parsed something, remove trailing comma */
+ errbuf[len - 1] = ')';
+ } else
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "%s", err == PSM2_TIMEOUT ?
+ "Detected disconnect timeout" :
+ psm2_error_get_string(err));
+
+ /* first pass, look for all nodes with the error */
+ for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) {
+ if (array_of_epaddr_mask != NULL
+ && !array_of_epaddr_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_OK)
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+ err != PSM2_EPID_UNREACHABLE)
+ continue;
+ if (err == array_of_errors[i]) {
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%c %s",
+ j == 0 ? ':' : ',',
+ psmi_epaddr_get_hostname
+ (array_of_epaddr[i]->epid));
+ j++;
+ }
+ }
+ errbuf[sizeof(errbuf) - 1] = '\0';
+ err = psmi_handle_error(ep, err, "%s", errbuf);
+ }
+
+fail:
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+ if (epaddr_mask != NULL)
+ psmi_free(epaddr_mask);
+ if (epaddr_mask_isdupof != NULL)
+ psmi_free(epaddr_mask_isdupof);
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_disconnect2)
diff --git a/psm_error.c b/psm_error.c
new file mode 100644
index 0000000..99bb94f
--- /dev/null
+++ b/psm_error.c
@@ -0,0 +1,348 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG -1
+
+struct psm2_error_token {
+ psm2_ep_t ep;
+ psm2_error_t error;
+ char err_string[PSM2_ERRSTRING_MAXLEN];
+};
+
+static
+psm2_error_t
+psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err,
+ const char *error_string, psm2_error_token_t token)
+{
+ return err;
+}
+
+static
+psm2_error_t
+psmi_errhandler_psm(psm2_ep_t ep,
+ const psm2_error_t err,
+ const char *error_string, psm2_error_token_t token)
+{
+ /* we want the error to be seen through ssh, etc., so we flush and then
+ * sleep a bit. Not perfect, but not doing so means it almost never
+ * gets seen. */
+ fprintf(stderr, "%s%s\n", hfi_get_mylabel(), token->err_string);
+ fflush(stdout);
+ fflush(stderr);
+
+ /* XXX Eventually, this will hook up to a connection manager, and we'll
+ * issue an upcall into the connection manager at shutdown time */
+ sleep(3);
+
+ /* We use this "special" ep internally to handle internal errors that are
+ * triggered from within code that is not expected to return to the user.
+ * Errors of this sort on not expected to be handled by users and always
+ * mean we have an internal PSM bug. */
+ if (err == PSM2_INTERNAL_ERR)
+ abort();
+ else
+ exit(-1);
+}
+
+psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop;
+
+psm2_error_t __psm2_error_defer(psm2_error_token_t token)
+{
+ psm2_error_t rv;
+ PSM2_LOG_MSG("entering");
+ rv = psmi_errhandler_psm(token->ep, token->error, token->err_string,
+ token);
+ PSM2_LOG_MSG("leaving");
+ return rv;
+}
+PSMI_API_DECL(psm2_error_defer)
+
+psm2_error_t
+__psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler)
+{
+ psm2_ep_errhandler_t *errh;
+
+ PSM2_LOG_MSG("entering");
+
+ if (ep == NULL)
+ errh = &psmi_errhandler_global;
+ else
+ errh = &ep->errh;
+
+ if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER)
+ *errh = psmi_errhandler_psm;
+ else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER)
+ *errh = psmi_errhandler_noop;
+ else
+ *errh = errhandler;
+
+ PSM2_LOG_MSG("leaving");
+
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_error_register_handler)
+
+psm2_error_t
+MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...)
+{
+ va_list argptr;
+ int syslog_level;
+ int console_print = 0;
+ psm2_error_t newerr;
+ struct psm2_error_token token;
+ char *c, fullmsg[PSM2_ERRSTRING_MAXLEN];
+ token.error = error;
+ snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf);
+ fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+ va_start(argptr, buf);
+ vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr);
+ va_end(argptr);
+ token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+
+ /* Unless the user has set PSM2_NO_VERBOSE_ERRORS, always print errors to
+ * console */
+ c = getenv("PSM2_NO_VERBOSE_ERRORS");
+ console_print = 0;
+ if (ep == PSMI_EP_LOGEVENT)
+ console_print = 1;
+ else if (!c || *c == '\0') { /* no desire to prevent verbose errors */
+ /* Remove the console print if we're internally handling the error */
+ if (ep == PSMI_EP_NORETURN)
+ console_print = 0;
+ else if (ep == NULL
+ && psmi_errhandler_global != psmi_errhandler_psm)
+ console_print = 1;
+ else if (ep != NULL && ep->errh != psmi_errhandler_psm)
+ console_print = 1;
+ }
+
+ /* Before we let the user even handle the error, send to syslog */
+ syslog_level = psmi_error_syslog_level(error);
+ if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT)
+ psmi_syslog(ep, console_print,
+ ep == PSMI_EP_LOGEVENT ? LOG_NOTICE : syslog_level,
+ "%s (err=%d)", token.err_string, error);
+
+ if (ep == PSMI_EP_LOGEVENT) /* we're just logging */
+ newerr = PSM2_OK;
+ else if (ep == PSMI_EP_NORETURN)
+ newerr =
+ psmi_errhandler_psm(NULL, error, token.err_string, &token);
+ else if (ep == NULL)
+ newerr =
+ psmi_errhandler_global(NULL, error, token.err_string,
+ &token);
+ else
+ newerr = ep->errh(ep, error, token.err_string, &token);
+
+ return newerr;
+}
+MOCK_DEF_EPILOGUE(psmi_handle_error);
+
+/* Returns the "worst" error out of errA and errB */
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+ /* Bad runtime or before initialization */
+ _PSMI_ERR_IS(PSM2_NO_MEMORY);
+ _PSMI_ERR_IS(PSM2_INTERNAL_ERR);
+ _PSMI_ERR_IS(PSM2_INIT_NOT_INIT);
+ _PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION);
+
+ /* Before we cget an endpoint */
+ _PSMI_ERR_IS(PSM2_EP_NO_DEVICE);
+ _PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND);
+ _PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE);
+ _PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL);
+ _PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS);
+
+ /* As we open/close the endpoint */
+ _PSMI_ERR_IS(PSM2_EP_NO_NETWORK);
+ _PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR);
+ _PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT);
+ _PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY);
+ _PSMI_ERR_IS(PSM2_EP_NO_RESOURCES);
+
+ /* In connect phase */
+ _PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_NODE);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY);
+ _PSMI_ERR_IS(PSM2_EPID_INVALID_MTU);
+
+ /* Timeout if nothing else */
+ _PSMI_ERR_IS(PSM2_TIMEOUT);
+
+ /* Last resort */
+ return max(errA, errB);
+}
+
+struct psmi_error_item {
+ int syslog_level;
+ const char *error_string;
+};
+
+static
+struct psmi_error_item psmi_error_items[] = {
+ {PSMI_NOLOG, "Success"}, /* PSM2_OK = 0, */
+ {PSMI_NOLOG, "No events were progressed in psm_poll"}, /* PSM2_OK_NO_PROGRESS = 1 */
+ {PSMI_NOLOG, "unknown 2"},
+ {PSMI_NOLOG, "Error in a function parameter"}, /* PSM2_PARAM_ERR = 3 */
+ {LOG_CRIT, "Ran out of memory"}, /* PSM2_NO_MEMORY = 4 */
+ {PSMI_NOLOG, "PSM has not been initialized by psm2_init"}, /* PSM2_INIT_NOT_INIT = 5 */
+ {LOG_INFO, "API version passed in psm2_init is incompatible"}, /* PSM2_INIT_BAD_API_VERSION = 6 */
+ {PSMI_NOLOG, "PSM Could not set affinity"}, /* PSM2_NO_AFFINITY = 7 */
+ {LOG_ALERT, "PSM Unresolved internal error"}, /* PSM2_INTERNAL_ERR = 8 */
+ {LOG_CRIT, "PSM could not set up shared memory segment"}, /* PSM2_SHMEM_SEGMENT_ERR = 9 */
+ {PSMI_NOLOG, "PSM option is a read-only option"}, /* PSM2_OPT_READONLY = 10 */
+ {PSMI_NOLOG, "Operation timed out"}, /* PSM2_TIMEOUT = 11 */
+ {LOG_INFO, "Exceeded supported amount of endpoints"},
+ /* PSM2_TOO_MANY_ENDPOINTS = 12 */
+ {PSMI_NOLOG, "PSM is in the finalized state"}, /* PSM2_IS_FINALIZED = 13 */
+ {PSMI_NOLOG, "unknown 14"},
+ {PSMI_NOLOG, "unknown 15"},
+ {PSMI_NOLOG, "unknown 16"},
+ {PSMI_NOLOG, "unknown 17"},
+ {PSMI_NOLOG, "unknown 18"},
+ {PSMI_NOLOG, "unknown 19"},
+ {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */
+ {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */
+ {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */
+ {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */
+ {PSMI_NOLOG, "Error closing the endpoing error"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */
+ {PSMI_NOLOG, "No free contexts could be obtained"}, /* PSM2_EP_NO_PORTS_AVAIL = 25 */
+ {LOG_ALERT, "Could not detect network connectivity"}, /* PSM2_EP_NO_NETWORK = 26 */
+ {LOG_INFO, "Invalid Unique job-wide UUID Key"}, /* PSM2_EP_INVALID_UUID_KEY = 27 */
+ {LOG_INFO, "Out of endpoint resources"}, /* PSM2_EP_NO_RESOURCES = 28 */
+ {PSMI_NOLOG, "unknown 29"},
+ {PSMI_NOLOG, "unknown 30"},
+ {PSMI_NOLOG, "unknown 31"},
+ {PSMI_NOLOG, "unknown 32"},
+ {PSMI_NOLOG, "unknown 33"},
+ {PSMI_NOLOG, "unknown 34"},
+ {PSMI_NOLOG, "unknown 35"},
+ {PSMI_NOLOG, "unknown 36"},
+ {PSMI_NOLOG, "unknown 37"},
+ {PSMI_NOLOG, "unknown 38"},
+ {PSMI_NOLOG, "unknown 39"},
+ {PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"}, /* PSM2_EPID_UNKNOWN = 40 */
+ {PSMI_NOLOG, "Endpoint could not be reached"}, /* PSM2_EPID_UNREACHABLE = 41 */
+ {PSMI_NOLOG, "unknown 42"},
+ {LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"}, /* PSM2_EPID_INVALID_NODE = 43 */
+ {LOG_CRIT, "Invalid MTU"}, /* PSM2_EPID_INVALID_MTU = 44 */
+ {PSMI_NOLOG, "UUID key mismatch"}, /* PSM2_EPID_INVALID_UUID_KEY = 45 */
+ {LOG_ERR, "Incompatible PSM version"}, /* PSM2_EPID_INVALID_VERSION = 46 */
+ {LOG_CRIT, "Connect received garbled connection information"}, /* PSM2_EPID_INVALID_CONNECT = 47 */
+ {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */
+ {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */
+ {LOG_CRIT, "Endpoint provided incompatible Partition Key"},
+ {LOG_CRIT, "Unable to resolve network path. Is the SM running?"},
+ {PSMI_NOLOG, "unknown 52"},
+ {PSMI_NOLOG, "unknown 53"},
+ {PSMI_NOLOG, "unknown 54"},
+ {PSMI_NOLOG, "unknown 55"},
+ {PSMI_NOLOG, "unknown 56"},
+ {PSMI_NOLOG, "unknown 57"},
+ {PSMI_NOLOG, "unknown 58"},
+ {PSMI_NOLOG, "unknown 59"},
+ {PSMI_NOLOG, "MQ Non-blocking request is incomplete"}, /* PSM2_MQ_NO_COMPLETIONS = 60 */
+ {PSMI_NOLOG, "MQ Message has been truncated at the receiver"}, /* PSM2_MQ_TRUNCATION = 61 */
+ {PSMI_NOLOG, "unknown 62"},
+ {PSMI_NOLOG, "unknown 63"},
+ {PSMI_NOLOG, "unknown 64"},
+ {PSMI_NOLOG, "unknown 65"},
+ {PSMI_NOLOG, "unknown 66"},
+ {PSMI_NOLOG, "unknown 67"},
+ {PSMI_NOLOG, "unknown 68"},
+ {PSMI_NOLOG, "unknown 69"},
+ {PSMI_NOLOG, "Invalid AM reply"},
+ {PSMI_NOLOG, "unknown 71"},
+ {PSMI_NOLOG, "unknown 72"},
+ {PSMI_NOLOG, "unknown 73"},
+ {PSMI_NOLOG, "unknown 74"},
+ {PSMI_NOLOG, "unknown 75"},
+ {PSMI_NOLOG, "unknown 76"},
+ {PSMI_NOLOG, "unknown 77"},
+ {PSMI_NOLOG, "unknown 78"},
+ {PSMI_NOLOG, "unknown 79"},
+ {PSMI_NOLOG, "unknown 80"},
+};
+
+const char *__psm2_error_get_string(psm2_error_t error)
+{
+ PSM2_LOG_MSG("entering");
+ if (error >= PSM2_ERROR_LAST) {
+ PSM2_LOG_MSG("leaving");
+ return "unknown";
+ }
+ else {
+ PSM2_LOG_MSG("leaving");
+ return psmi_error_items[error].error_string;
+ }
+}
+PSMI_API_DECL(psm2_error_get_string)
+
+int psmi_error_syslog_level(psm2_error_t error)
+{
+ if (error >= PSM2_ERROR_LAST)
+ return PSMI_NOLOG;
+ else
+ return psmi_error_items[error].syslog_level;
+}
diff --git a/psm_error.h b/psm_error.h
new file mode 100644
index 0000000..f335382
--- /dev/null
+++ b/psm_error.h
@@ -0,0 +1,78 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+#include "psm2_mock_testing.h"
+
+#ifndef _PSMI_IN_USER_H
+#error psm_error.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_ERROR_H
+#define _PSMI_ERROR_H
+
+/* Sentinel values used in place of a real psm2_ep_t when calling
+ * psmi_handle_error() (e.g. psmi_assert_always passes PSMI_EP_NORETURN). */
+#define PSMI_EP_NONE (NULL)
+#define PSMI_EP_NORETURN ((psm2_ep_t) -2)
+#define PSMI_EP_LOGEVENT ((psm2_ep_t) -3)
+
+/* NOTE(review): this is a tentative definition in a header -- every
+ * translation unit including psm_user.h emits it as a common symbol.
+ * Consider 'extern' here with a single definition in one .c file. */
+psm2_ep_errhandler_t psmi_errhandler_global;
+
+/* Central printf-style error reporting entry point; mockable so unit
+ * tests can intercept error paths. */
+psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error,
+			 const char *buf, ...)
+			 __attribute__((format(printf, 3, 4)));
+MOCK_DCL_EPILOGUE(psmi_handle_error);
+
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB);
+int psmi_error_syslog_level(psm2_error_t error);
+
+#endif /* _PSMI_ERROR_H */
diff --git a/psm_help.h b/psm_help.h
new file mode 100644
index 0000000..12ebe5b
--- /dev/null
+++ b/psm_help.h
@@ -0,0 +1,190 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_HELP_H
+#define _PSMI_HELP_H
+#include "psm_log.h"
+
+/* XXX gcc only */
+/* Inlining helpers: wrap a full function definition FN.
+ * PSMI_ALWAYS_INLINE forces inlining; PSMI_NEVER_INLINE forbids it. */
+#define PSMI_INLINE(FN) \
+	static inline FN
+
+#define PSMI_ALWAYS_INLINE(FN) \
+	static __inline__ FN __attribute__((always_inline)); \
+	static __inline__ FN
+
+#define PSMI_NEVER_INLINE(FN) \
+	static FN __attribute__((noinline)); \
+	static FN
+
+#define _PPragma(x) _Pragma(x)
+
+/* STRINGIFY expands macro arguments before stringizing (two-step idiom);
+ * PSMI_CURLOC is a "file:line" literal for diagnostics. */
+#define STRINGIFY(s) _STRINGIFY(s)
+#define _STRINGIFY(s) #s
+#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__)
+/* Unconditional assertion: reports through psmi_handle_error with
+ * PSMI_EP_NORETURN (fatal); curloc identifies the call site. */
+#define psmi_assert_always_loc(x, curloc) \
+	do { \
+	if_pf(!(x)) { \
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				  "Assertion failure at %s: %s", curloc, \
+				  STRINGIFY(x)); \
+	} } while (0)
+
+#define psmi_assert_always(x) psmi_assert_always_loc(x, PSMI_CURLOC)
+
+/* psmi_assert() is active only in PSM_DEBUG builds; otherwise it
+ * compiles to nothing. */
+#ifdef PSM_DEBUG
+#  define psmi_assert(x)	psmi_assert_always(x)
+#  define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized())
+#else
+#  define psmi_assert(x)
+#  define PSMI_ASSERT_INITIALIZED()
+#endif
+
+/* PSMI_API_DECL(FN): export public symbol FN as a weak alias of the
+ * internal __FN implementation (GCC weak/alias attributes). */
+#define _PSMI_API_NAME(FN) __ ## FN
+#define _PSMI_API_STR(FN)  _STRINGIFY(__ ## FN)
+#define PSMI_API_DECL(FN) \
+	typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN))));
+
+/* Early-return guard: callers use this to fail with PSM2_INIT_NOT_INIT
+ * before the library is initialized. */
+#define PSMI_ERR_UNLESS_INITIALIZED(ep) \
+	do { \
+		if (!psmi_isinitialized()) { \
+			PSM2_LOG_MSG("leaving"); \
+			return psmi_handle_error(ep, PSM2_INIT_NOT_INIT, \
+				"PSM2 has not been initialized"); \
+		} \
+	} while (0)
+
+/* Allocation-check helper: sets err and jumps to the caller's 'fail'
+ * label when mem is NULL (goto-cleanup pattern). */
+#define PSMI_CHECKMEM(err, mem) \
+	do { \
+		if ((mem) == NULL) { \
+			(err) = PSM2_NO_MEMORY; \
+			goto fail; \
+		} \
+	} while (0)
+
+#define PSMI_CACHEALIGN __attribute__((aligned(64)))
+
+/* Easy way to ignore the OK_NO_PROGRESS case */
+PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err))
+{
+	/* Codes at or below PSM2_OK_NO_PROGRESS are progress hints, not
+	 * real errors: collapse them to PSM2_OK. */
+	if (err > PSM2_OK_NO_PROGRESS)
+		return err;
+	else
+		return PSM2_OK;
+}
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+#define SEC_ULL 1000000000ULL
+#define MSEC_ULL 1000000ULL
+#define USEC_ULL 1000ULL
+#define NSEC_ULL 1ULL
+
+#define PSMI_TRUE 1
+#define PSMI_FALSE 0
+
+#define PSMI_CYCLES_TO_SECSF(cycles) \
+ ((double) cycles_to_nanosecs(cycles) / 1.0e9)
+
+#define PSMI_PAGESIZE psmi_getpagesize()
+#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0)
+#define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1)))
+#define PSMI_ALIGNUP(p, P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P)))
+
+#define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff))
+
+#ifdef PSM_DEBUG
+
+/* The intent of the following two macros is to emit an internal error if a size of a
+ 'member' is not as expected, violating an assumption in the code. There are some
+ problems with the implementation of this code:
+
+ The first macro creates a static const variable with ABSOLUTELY NO references
+ to them. For example there are ABSOLUTELY NO uses of the second macro in the
+ PSM code. This is not completely pure. GCC version 5, for example, emits a
+ warning for defining a static const when it is not referenced.
+
+ A better implementation of the intent of this code is to use static_assert()
+ so that at compile time the violations can be caught and corrected - not at
+ run time. */
+
+#define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz
+/* Verify at run time that 'member' has the size declared above; on a
+ * mismatch, emit the diagnostic and abort.  Fixes two defects in the
+ * previous version: the snprintf bound was a hard-coded 32 for a
+ * 64-byte buffer (use sizeof), and errmsg was built but never printed
+ * before exit(-1).  fprintf needs no new header: snprintf already
+ * requires <stdio.h> wherever this macro is expanded. */
+#define PSMI_STRICT_SIZE_VERIFY(member, sz) \
+	do { \
+		if (__psm2_ss_ ## member != (sz)) { \
+			char errmsg[64]; \
+			snprintf(errmsg, sizeof(errmsg), "Internal error: %s " \
+					"size doesn't match expected %d bytes", \
+					STRINGIFY(member), (int) __psm2_ss_ ## member); \
+			fprintf(stderr, "%s\n", errmsg); \
+			exit(-1); \
+		} \
+	} while (0)
+
+#else
+
+#define PSMI_STRICT_SIZE_DECL(member, sz) /* nothing */
+#define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */
+
+#endif /* PSM_DEBUG */
+
+#endif /* _PSMI_HELP_H */
diff --git a/psm_lock.h b/psm_lock.h
new file mode 100644
index 0000000..56e82a8
--- /dev/null
+++ b/psm_lock.h
@@ -0,0 +1,142 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_lock.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_LOCK_H
+#define _PSMI_LOCK_H
+
+#ifndef PSMI_USE_PTHREAD_SPINLOCKS
+#define PSMI_USE_PTHREAD_SPINLOCKS 0
+#endif
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+typedef pthread_spinlock_t psmi_spinlock_t;
+
+#define psmi_spin_init(lock) pthread_spin_init(lock, \
+ PTHREAD_PROCESS_PRIVATE)
+#define psmi_spin_lock(lock) pthread_spin_lock(lock)
+#define psmi_spin_trylock(lock) pthread_spin_trylock(lock)
+#define psmi_spin_unlock(lock) pthread_spin_unlock(lock)
+#else
+typedef ips_atomic_t psmi_spinlock_t;
+#define PSMI_SPIN_LOCKED 1
+#define PSMI_SPIN_UNLOCKED 0
+#endif
+
+/* psmi_lock_t structure */
+typedef struct {
+
+#ifdef PSMI_LOCK_IS_SPINLOCK
+ psmi_spinlock_t lock;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+ pthread_mutex_t lock;
+ pthread_t lock_owner;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+ pthread_mutex_t lock;
+#endif
+} psmi_lock_t;
+
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+#else
+/* Initialize the atomic-based spinlock to the unlocked state.
+ * Returns 0, matching the pthread_spin_init convention. */
+PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+
+/* Attempt to take the lock without blocking via compare-and-swap.
+ * Returns 0 on success, EBUSY if already held (pthread_spin_trylock
+ * convention). */
+PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock))
+{
+	if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED)
+			== PSMI_SPIN_UNLOCKED)
+		return 0;
+	else
+		return EBUSY;
+}
+
+/* Busy-wait (spin) until the lock is acquired; always returns 0. */
+PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock))
+{
+	while (psmi_spin_trylock(lock) == EBUSY) {
+	}
+	return 0;
+}
+
+/* Release the lock by storing the unlocked value.
+ * Fixed: the previous version called bare atomic_set(), inconsistent
+ * with psmi_spin_init() above and with the lock's ips_atomic_t type;
+ * ips_atomic_set is the store primitive used by this codebase. */
+PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+#endif /* PSMI_USE_PTHREAD_SPINLOCKS */
+
+/* Initialize a psmi_lock_t according to the compile-time lock flavor:
+ * spinlock, plain mutex, or error-checking mutex with owner tracking. */
+PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock))
+{
+#ifdef PSMI_LOCK_IS_SPINLOCK
+	psmi_spin_init(&(lock->lock));
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+	pthread_mutex_init(&(lock->lock), NULL);
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+	pthread_mutexattr_t attr;
+	pthread_mutexattr_init(&attr);
+	/* ERRORCHECK mutexes detect relock / unlock-by-non-owner at runtime */
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP);
+	pthread_mutex_init(&(lock->lock), &attr);
+	pthread_mutexattr_destroy(&attr);
+	lock->lock_owner = PSMI_LOCK_NO_OWNER;
+#endif
+}
+
+#endif /* _PSMI_LOCK_H */
diff --git a/psm_log.h b/psm_log.h
new file mode 100644
index 0000000..c808c5c
--- /dev/null
+++ b/psm_log.h
@@ -0,0 +1,224 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_LOG_H
+#define _PSMI_LOG_H
+
+/*
+
+ A note about PSM_LOG and PSM_LOG_FAST_IO:
+
+ By default, the PSM_LOG facility is safe, slow, and is complete. That is, if the
+  test case you are debugging has an abnormal termination, no problem.  The logs are
+ saved up to the point of the abnormal termination. Abnormal termination can be
+ a seg fault, the test case issues a fatal error, or exit()'s or abort()'s.
+
+  However, when debugging timing-sensitive problems, the usual SLOW PSM_LOG
+  facility is inadequate, as the timing overhead that it introduces dominates, and the
+ symptoms of your problem may change or go away.
+
+ For this case, you can use BOTH: PSM_LOG and PSM_LOG_FAST_IO. To use
+ PSM_LOG_FAST_IO though, caution: for abnormal program termination, you will get
+ no log file.
+
+ To workaround this problem, and allow you to get a log file even after an abnormal
+  program termination, we expose psmi_log_fini to the outside world (via the linker
+ script file), and so in your client test code, you can call psmi_log_fini() on a
+ fatal error (e.g. in a signal handler).
+
+ --------------------------------------------------------------------------------
+
+ This file (psm_log.h) defines macros for logging messages to assist investigations
+ into the psm library.
+
+ By default, these macros are not defined when building psm. When not defined, the
+ macros become no-ops in the PSM code.
+
+ When enabled (by defining the PSM_LOG symbol), the macros present information to
+ the psmi_log_message() facility for processing. See below for more information on the
+ psmi_log_message() facility.
+
+ To enable the macros, build PSM with the PSM_LOG environment variable exported, ala:
+
+ PSM_LOG=1 make ...
+
+ The macros are described in the following:
+
+ PSM2_LOG_MSG(FORMAT,...) Spills a printf-style message to the log.
+ PSM_LOG_DECLARE_BT_BUFFER() Declares a local back trace buffer for use with the
+ PSM_LOG_BT() macro.
+ PSM_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs from the
+ previous backtrace spilled to the log.
+
+ The psmi_log_message() facility is the backend for these messages when PSM_LOG is enabled.
+ The psmi_log_message() facility spills messages to unique log files based on the process id
+ and the thread id. So every unique process id, and thread id will spill to unique log files.
+ The psmi_log_message prefixes each message in the log files with a high resolution timer
+ message so that messages from multiple threads and log files can be reconciled to one timeline.
+ It is left as an exercise to the reader to reconcile log messages from different hosts to one
+ timeline.
+
+ The backtrace capability in the PSM_LOG functionality needs some explanation: often a bug
+ happens only when the code is tickled from a specific call-chain. The PSM_LOG_BT() macro
+ supports identifying the unique call-chain when a problem occurs. The model is as follows:
+
+ A unique declaration is made for a backtrace to spill the backtrace information to. This
+ declaration should be made in the same basic block as the use of the PSM_LOG_BT() macro.
+ To make the declaration, use PSM_LOG_DECLARE_BT_BUFFER().
+
+ When the PSM_LOG is enabled, at the statement for the macro: PSM_LOG_BT(NFRAMES,FORMAT,...),
+ the psmi_log_message() facility generates the current backtrace, and compares the first
+ NFRAMES of the current backtrace against the previous backtrace stored in the backtrace
+ buffer declared with the declaration. If the two backtraces differ, the psmi_log_message()
+ code saves the current backtrace into the declared buffer, and then spills the backtrace to the
+ log file.
+
+ At runtime, setting environment variables can squelch the log file from getting too big:
+
+ PSM2_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated FNL) (see below),
+  that will INClude the FNL's into the collection of functions to spill log data for.
+
+ PSM2_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below), that will EXClude the FNL's from the
+ collection of functions to spill log data for.
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number of a range of line numbers:
+ LINE1 :: lineNumber |
+ lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+ LINES :: LINE1 |
+ LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+ FN :: functionName |
+ functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+ FNL :: FN |
+ FN ':' FNL
+
+ # Examples:
+ foo:bar the two functions foo and bar
+ foo;1-10 lines 1 to 10 of function foo.
+ bar;1,3,5 lines 1, 3 and 5 of function bar
+
+ PSM2_LOG_SRCH_FORMAT_STRING If set, overrides the PSM2_LOG_INC_FUNCTION_NAMES
+ and PSM2_LOG_EXC_FUNCTION_NAMES settings. Causes the psmi_log_message() facility
+ to only emit the log messages that match (using fnmatch()) the message in FORMAT.
+
+ */
+
+#define PSM_LOG_EPM_TX ((int)1)
+#define PSM_LOG_EPM_RX ((int)0)
+
+
+#ifdef PSM_LOG
+
+extern void psmi_log_initialize(void);
+
+/* defined in psm_utils.c */
+extern void psmi_log_message(const char *fileName,
+ const char *functionName,
+ int lineNumber,
+ const char *format, ...);
+
+#ifdef PSM_LOG_FAST_IO
+extern void psmi_log_fini(void);
+#else
+#define psmi_log_fini() /* nothing */
+#endif
+
+#define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__)
+
+#define PSM_LOG_BT_BUFFER_SIZE 100
+
+#define PSM_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM_LOG_BT_BUFFER_SIZE]
+
+#define PSM_LOG_BT_MAGIC ((const char *)-1)
+
+#define PSM_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__)
+
+#define PSM_LOG_EPM_MAGIC ((const char *)-2)
+
+/* EPM is short for Emit Protocol Message to the log file.
+OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h
+TXRX is an int, and should be one of the above two consts (PSM_LOG_EPM_TX, or PSM_LOG_EPM_RX).
+FROMEPID and TOEPID are uint64_t's and the fromepid should be the epid (end point id) of the sender of the message
+ and the toepid should be the epid (end point id) of the receiver of the message
+ */
+#define PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+/* Just adds a condition to the PSM_LOG_EPM() macro. */
+#define PSM_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) if (COND) PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+#else
+
+#define psmi_log_initialize() /* nothing */
+
+#define PSM2_LOG_MSG(FORMAT , ...) /* nothing */
+
+#define psmi_log_fini() /* nothing */
+
+#define PSM_LOG_DECLARE_BT_BUFFER() /* nothing */
+
+#define PSM_LOG_BT(NFRAMES,FORMAT , ...) /* nothing */
+
+#define PSM_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#define PSM_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#endif /* #ifdef PSM_LOG */
+
+#endif /* #ifndef _PSMI_LOG_H */
diff --git a/psm_memcpy.c b/psm_memcpy.c
new file mode 100644
index 0000000..d3c2b11
--- /dev/null
+++ b/psm_memcpy.c
@@ -0,0 +1,67 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "psm_mq_internal.h"
+
+/* memcpy-compatible wrapper over psmi_mq_mtucpy(); returns dst so it
+ * can substitute for memcpy in callers. */
+void *psmi_memcpyo(void *dst, const void *src, size_t n)
+{
+	psmi_mq_mtucpy(dst, src, n);
+	return dst;
+}
diff --git a/psm_mock.c b/psm_mock.c
new file mode 100644
index 0000000..bdcfd41
--- /dev/null
+++ b/psm_mock.c
@@ -0,0 +1,90 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm2_mock_testing.h"
+
+#ifdef PSM2_MOCK_TESTING
+/* Mockable function wrappers around the _PSMI_LOCK* macros.  Compiled
+ * only for PSM2_MOCK_TESTING builds, where unit tests may substitute
+ * these functions to observe or stub out locking. */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_INIT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_init);
+/* Returns the result of the non-blocking lock attempt. */
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl)
+{
+	int ret = _PSMI_LOCK_TRY(*pl);
+	return ret;
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_try);
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock);
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock);
+/* Assertion helpers: check the lock is (un)held in debug builds. */
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert);
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert);
+#endif
diff --git a/psm_mpool.c b/psm_mpool.c
new file mode 100644
index 0000000..99f6748
--- /dev/null
+++ b/psm_mpool.c
@@ -0,0 +1,588 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
#define PSMI_MPOOL_ALIGNMENT 64

/*
 * Per-object header placed immediately before each user block.
 * While the object is free it sits on the pool free list (me_next);
 * while it is handed out the same union slot holds a backpointer to
 * the owning pool (set in psmi_mpool_get, read in psmi_mpool_put).
 */
struct mpool_element {
	union {
		SLIST_ENTRY(mpool_element) me_next;	/* free-list link (object free) */
		mpool_t me_mpool;	/* owning pool (object in use) */
	};

	uint32_t me_gen_count;	/* incremented on every psmi_mpool_put() */
	uint32_t me_index;	/* global object index across all chunks */
#ifdef PSM_DEBUG
	uint32_t me_isused;	/* 1 while handed out, 0 while on free list */
#endif
} __attribute__ ((aligned(8)));

#ifdef PSM_DEBUG
# define me_mark_used(me) ((me)->me_isused = 1)
# define me_mark_unused(me) ((me)->me_isused = 0)
#else
# define me_mark_used(me)
# define me_mark_unused(me)
#endif

/*
 * Pool descriptor.  Objects are carved out of chunks of
 * mp_num_obj_per_chunk elements; chunk base pointers live in
 * mp_elm_vector so psmi_mpool_find_obj_by_index() can translate a
 * global index back to an address.
 */
struct mpool {
	int mp_type;
	int mp_flags;		/* PSMI_MPOOL_* flags (see psm_mpool.h) */
	int mp_vector_shift;	/* log2(mp_num_obj_per_chunk) */

	uint32_t mp_elm_vector_size;	/* max number of chunks */
	uint32_t mp_elm_offset;	/* alignment pad before each element header */
	uint32_t mp_num_obj;	/* objects allocated so far (all chunks) */
	uint32_t mp_num_obj_inuse;	/* objects currently handed out */
	uint32_t mp_elm_size;	/* header + pad + rounded object size */
	uint32_t mp_obj_size;	/* rounded-up user object size */
	uint32_t mp_num_obj_per_chunk;
	uint32_t mp_num_obj_max_total;
	psmi_memtype_t mp_memtype;	/* memory-stats accounting category */

	SLIST_HEAD(, mpool_element) mp_head;	/* free list */
	struct mpool_element **mp_elm_vector;	/* one base pointer per chunk */
	struct mpool_element **mp_elm_vector_free;	/* next unused chunk slot */
	non_empty_callback_fn_t mp_non_empty_cb;	/* fired when pool refills */
	void *mp_non_empty_cb_context;

#ifdef PSM_CUDA
	/* Per-object hook invoked when a chunk is allocated or freed
	 * (CUDA builds only). */
	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
	void *mp_alloc_dealloc_cb_context;
#endif
};

static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Create a memory pool and allocates <num_obj_per_chunk> objects of size
+ * <obj_size>. If more memory is needed to accommodate mpool_get()
+ * requests, the memory pool will allocate another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of objects
+ * it can allocate.
+ *
+ * <obj_size> size of each individual object
+ * <num_obj_per_chunk> number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total> total number of objects that may be allocated
+ * at any given time. Must be a power of two greater than
+ * <num_obj_per_chunk>.
+ *
+ * <flags> flags to be applied on the memory pool (ie. memory
+ * alignment)
+ *
+ * <cb> callback to be called when the memory pool has some
+ * free objects available again (after running out of them).
+ * <context> context pointer for the callback
+ *
+ * Return the mpool on success, NULL on failure.
+ */
+mpool_t
+psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags,
+ psmi_memtype_t statstype,
+ non_empty_callback_fn_t cb, void *context)
+{
+ mpool_t mp;
+ int s;
+ size_t hdr_size;
+
+#ifdef PSM_VALGRIND
+ /* For Valgrind we wish to define a "redzone" before and after the
+ * allocation block, so we also allocate a blank mpool_element
+ * at the end of the user's block */
+#endif
+
+ if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+ !PSMI_POWEROFTWO(num_obj_max_total) ||
+ num_obj_max_total < num_obj_per_chunk) {
+ return NULL;
+ }
+
+ mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+ if (mp == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool: %s\n",
+ strerror(errno));
+ return NULL;
+ }
+
+ for (s = 1; s < num_obj_per_chunk; s <<= 1)
+ mp->mp_vector_shift++;
+
+ mp->mp_flags = flags;
+ mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+ mp->mp_num_obj_max_total = num_obj_max_total;
+ mp->mp_non_empty_cb = cb;
+ mp->mp_non_empty_cb_context = context;
+
+ mp->mp_memtype = statstype;
+
+ SLIST_INIT(&mp->mp_head);
+ mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+ mp->mp_elm_vector =
+ psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+ sizeof(struct mpool_element *));
+ if (mp->mp_elm_vector == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool vector: "
+ "%s\n", strerror(errno));
+ psmi_free(mp);
+ return NULL;
+ }
+
+ mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+ if (flags & PSMI_MPOOL_ALIGN) {
+ /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+ * boundary. */
+ hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+ PSMI_MPOOL_ALIGNMENT);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+
+ mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+ } else {
+ hdr_size = sizeof(struct mpool_element);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+ mp->mp_elm_offset = 0;
+ }
+
+ return mp;
+}
+
+mpool_t
+MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags,
+ psmi_memtype_t statstype, non_empty_callback_fn_t cb,
+ void *context)
+{
+ mpool_t mp;
+
+ mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+ num_obj_max_total, flags, statstype,
+ cb, context);
+
+ if (mp == NULL)
+ return NULL;
+
+ if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+ psmi_mpool_destroy(mp);
+ return NULL;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */ ,
+ PSM_VALGRIND_MEM_UNDEFINED);
+
+ return mp;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_create);
+
#ifdef PSM_CUDA
/*
 * Like psmi_mpool_create(), but additionally registers a per-object
 * alloc/dealloc callback <ad_cb> (invoked with <ad_context>) that fires
 * for every element when a chunk is allocated and again when the pool
 * is destroyed.  The callback fields must be set before the first
 * psmi_mpool_allocate_chunk() call so the chunk path can invoke it.
 */
mpool_t
psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
			   uint32_t num_obj_max_total, int flags,
			   psmi_memtype_t statstype,
			   non_empty_callback_fn_t cb, void *context,
			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
{
	mpool_t mp;

	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
				     num_obj_max_total, flags, statstype,
				     cb, context);

	if (mp == NULL)
		return NULL;

	/* Must be recorded before the chunk allocation below. */
	mp->mp_alloc_dealloc_cb = ad_cb;
	mp->mp_alloc_dealloc_cb_context = ad_context;

	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
		psmi_mpool_destroy(mp);
		return NULL;
	}

	VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */ ,
				PSM_VALGRIND_MEM_UNDEFINED);

	return mp;
}
#endif
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp> memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *psmi_mpool_get(mpool_t mp)
+{
+ struct mpool_element *me;
+ void *obj;
+
+ if (SLIST_EMPTY(&mp->mp_head)) {
+ if (psmi_mpool_allocate_chunk(mp) != PSM2_OK)
+ return NULL;
+ }
+
+ me = SLIST_FIRST(&mp->mp_head);
+ SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+ psmi_assert(!me->me_isused);
+ me_mark_used(me);
+
+ /* store a backpointer to the memory pool */
+ me->me_mpool = mp;
+ mp->mp_num_obj_inuse++;
+ psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+ obj = (void *)((uintptr_t) me + sizeof(struct mpool_element));
+ VALGRIND_MEMPOOL_ALLOC(mp, obj, mp->mp_obj_size);
+ return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj> object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem. This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
+ */
+void psmi_mpool_put(void *obj)
+{
+ struct mpool_element *me;
+ int was_empty;
+ mpool_t mp;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+ me->me_gen_count++;
+
+ mp = me->me_mpool;
+
+ psmi_assert(mp != NULL);
+ psmi_assert(mp->mp_num_obj_inuse >= 0);
+ psmi_assert(me->me_isused);
+ me_mark_unused(me);
+
+ was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+ SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+ mp->mp_num_obj_inuse--;
+
+ VALGRIND_MEMPOOL_FREE(mp, obj);
+
+ /* tell the user that memory is available */
+ if (mp->mp_non_empty_cb && was_empty)
+ mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
+
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int psmi_mpool_get_obj_index(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t psmi_mpool_get_obj_gen_count(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+ uint32_t *gen_count)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ *index = me->me_index;
+ *gen_count = me->me_gen_count;
+ return 0;
+}
+
/**
 * psmi_mpool_find_obj_by_index()
 *
 * <mp> memory pool
 * <index> index of the object
 *
 * Returns the object located at <index> in the memory pool or NULL if the
 * <index> is invalid.
 */
void *psmi_mpool_find_obj_by_index(mpool_t mp, int index)
{
	struct mpool_element *me;

	if_pf(index < 0 || index >= mp->mp_num_obj)
		return NULL;

	/* Chunk = index / num_obj_per_chunk (via mp_vector_shift, the
	 * log2 of the per-chunk count); position within the chunk is
	 * index % num_obj_per_chunk (mask works because the count is a
	 * power of two).  mp_elm_offset skips the alignment pad before
	 * the element header. */
	me = (struct mpool_element *)
	    ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
	     (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
	     mp->mp_elm_offset);

	/* If this mpool doesn't require generation counts, it's illegal to find a
	 * freed object */
#ifdef PSM_DEBUG
	if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
		psmi_assert(!me->me_isused);
#endif

	/* Return the user block that follows the element header. */
	return (void *)((uintptr_t) me + sizeof(struct mpool_element));
}
+
#ifdef PSM_CUDA
/**
 * psmi_mpool_chunk_dealloc()
 * <mp> memory pool
 * <idx> chunk index in mp_elm_vector
 *
 * Calls the registered dealloc callback on each element in the chunk,
 * passing the address of the element's user block (header skipped).
 */
void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
{
	uint32_t j;

	/* Address math via uintptr_t: arithmetic on a void * (as the old
	 * code did) is a GCC extension, not standard C. */
	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
					mp->mp_alloc_dealloc_cb_context,
					(void *)((uintptr_t)
						 mp->mp_elm_vector[idx] +
						 j * mp->mp_elm_size +
						 sizeof(struct mpool_element)));
}
#endif
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp> memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory. The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
+ */
+void psmi_mpool_destroy(mpool_t mp)
+{
+ int i = 0;
+ size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i]) {
+#ifdef PSM_CUDA
+ if (mp->mp_alloc_dealloc_cb)
+ psmi_mpool_chunk_dealloc(mp, i);
+#endif
+ psmi_free(mp->mp_elm_vector[i]);
+ }
+ }
+ psmi_free(mp->mp_elm_vector);
+ nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+ VALGRIND_DESTROY_MEMPOOL(mp);
+ psmi_free(mp);
+ nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_max_obj()
+ *
+ * <mp> memory pool
+ *
+ * Returns the num-obj-per-chunk
+ * Returns the num-obj-max-total
+ */
+void
+MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
+ uint32_t *num_obj_max_total)
+{
+ *num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+ *num_obj_max_total = mp->mp_num_obj_max_total;
+ return;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info);
+
/*
 * Grow the pool by one chunk of mp_num_obj_per_chunk elements, unless
 * that would exceed mp_num_obj_max_total.  Each new element is pushed
 * on the free list and the chunk base pointer is recorded in the next
 * free slot of mp_elm_vector.  Returns PSM2_OK or PSM2_NO_MEMORY.
 */
static int psmi_mpool_allocate_chunk(mpool_t mp)
{
	struct mpool_element *elm;
	void *chunk;
	uint32_t i = 0, num_to_allocate;

	/* Allocate a full chunk, or nothing at all once the cap is hit. */
	num_to_allocate =
	    mp->mp_num_obj + mp->mp_num_obj_per_chunk >
	    mp->mp_num_obj_max_total ? 0 : mp->mp_num_obj_per_chunk;

	psmi_assert(mp->mp_num_obj + num_to_allocate <=
		    mp->mp_num_obj_max_total);

	if (num_to_allocate == 0)
		return PSM2_NO_MEMORY;

#ifdef PSM_CUDA
	/* Zeroed allocation when a per-object hook is registered --
	 * presumably so the hook sees initialized memory; confirm
	 * against the callback implementations. */
	if (mp->mp_alloc_dealloc_cb)
		chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype,
				    num_to_allocate, mp->mp_elm_size);
	else
		chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
				    num_to_allocate * mp->mp_elm_size);
#else
	chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
			    num_to_allocate * mp->mp_elm_size);
#endif
	if (chunk == NULL) {
		fprintf(stderr,
			"Failed to allocate memory for memory pool chunk: %s\n",
			strerror(errno));
		return PSM2_NO_MEMORY;
	}

	for (i = 0; i < num_to_allocate; i++) {
#ifdef PSM_CUDA
		/* Hook gets the user block address, i.e. just past the
		 * element header. */
		if (mp->mp_alloc_dealloc_cb)
			mp->mp_alloc_dealloc_cb(1 /* is alloc */,
						mp->mp_alloc_dealloc_cb_context,
						chunk + i * mp->mp_elm_size +
						sizeof(struct mpool_element));
#endif
		/* The element header lives mp_elm_offset bytes into its
		 * slot (alignment pad), directly before the user block. */
		elm = (struct mpool_element *)((uintptr_t) chunk +
					       i * mp->mp_elm_size +
					       mp->mp_elm_offset);
		elm->me_gen_count = 0;
		elm->me_index = mp->mp_num_obj + i;
#ifdef PSM_DEBUG
		elm->me_isused = 0;
#endif
		SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next);
#if 0
		fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n",
			(long)(mp->mp_elm_vector_free - mp->mp_elm_vector),
			(int)i, elm,
			(void *)((uintptr_t) elm +
				 sizeof(struct mpool_element)), SLIST_NEXT(elm,
									   me_next));
#endif
	}

	/* Record the chunk base in the next free vector slot; the slot
	 * must still be within the vector. */
	psmi_assert((uintptr_t) mp->mp_elm_vector_free
		    < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size
		    * sizeof(struct mpool_element *));

	mp->mp_elm_vector_free[0] = chunk;
	mp->mp_elm_vector_free++;
	mp->mp_num_obj += num_to_allocate;

	return PSM2_OK;
}
+
#if 0
/* Debug-only dump of every element in every chunk; compiled out.
 * NOTE(review): it walks from each chunk base without adding
 * mp_elm_offset, so it appears to assume a pool created without
 * PSMI_MPOOL_ALIGN -- verify before re-enabling. */
void psmi_mpool_dump(mpool_t mp)
{
	int i, j;
	struct mpool_element *me;

	fprintf(stderr, "Memory pool %p has %d elements per chunk.\n",
		mp, mp->mp_num_obj_per_chunk);
	for (i = 0; i < mp->mp_elm_vector_size; i++) {
		if (mp->mp_elm_vector[i] != NULL) {
			fprintf(stderr, "===========================\n");
			fprintf(stderr, "mpool chunk #%d\n", i);

			for (j = 0, me = mp->mp_elm_vector[i];
			     j < mp->mp_num_obj_per_chunk;
			     j++, me = (struct mpool_element *)
				 ((uintptr_t) me + mp->mp_elm_size)) {
				fprintf(stderr,
					"obj=%p index=%d gen_count=%d\n",
					(void *)((uintptr_t) me +
						 sizeof(struct mpool_element)),
					me->me_index, me->me_gen_count);
			}
			fprintf(stderr, "===========================\n");
		}
	}
}
#endif
diff --git a/psm_mpool.h b/psm_mpool.h
new file mode 100644
index 0000000..8098f60
--- /dev/null
+++ b/psm_mpool.h
@@ -0,0 +1,107 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
#ifndef _PSMI_IN_USER_H
#error psm_mpool.h not meant to be included directly, include psm_user.h instead
#endif

#ifndef PSM_MPOOL_H
#define PSM_MPOOL_H

/* mpool flags */
#define PSMI_MPOOL_ALIGN_CACHE 0x1	/* align user blocks to a cache line */
#define PSMI_MPOOL_ALIGN_PAGE 0x2	/* not referenced by psm_mpool.c */
#define PSMI_MPOOL_NOGENERATION 0x4	/* debug builds assert a looked-up
					 * object is not on the free list */

/* Backwards compatibility */
#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE

typedef struct mpool *mpool_t;
/* Invoked when a previously exhausted pool has free objects again. */
typedef void (*non_empty_callback_fn_t) (void *context);
/* Per-object hook: is_alloc=1 when a chunk is created, 0 when the pool
 * is destroyed; <chunk> points at the object's user block. */
typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
					     void *chunk);

mpool_t
MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
			    uint32_t num_obj_max_total, int flags,
			    psmi_memtype_t statstype,
			    non_empty_callback_fn_t cb, void *context);
MOCK_DCL_EPILOGUE(psmi_mpool_create);

mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
				   uint32_t num_obj_max_total, int flags,
				   psmi_memtype_t statstype,
				   non_empty_callback_fn_t cb, void *context,
				   alloc_dealloc_callback_fn_t ad_cb,
				   void *ad_context);

void psmi_mpool_destroy(mpool_t mp);

void
MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
				  uint32_t *num_obj_max_total);
MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info);

void *psmi_mpool_get(mpool_t mp);
void psmi_mpool_put(void *obj);

int psmi_mpool_get_obj_index(void *obj);
uint32_t psmi_mpool_get_obj_gen_count(void *obj);
int psmi_mpool_get_obj_index_gen_count(void *obj,
				       uint32_t *index, uint32_t *gen_count);

void *psmi_mpool_find_obj_by_index(mpool_t mp, int index);

#endif
diff --git a/psm_mq.c b/psm_mq.c
new file mode 100644
index 0000000..44b602a
--- /dev/null
+++ b/psm_mq.c
@@ -0,0 +1,1433 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sched.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Functions to manipulate the expected queue in mq_ep.
+ */
+
/*
 * Once the linked lists cross the size limit, this function will enable tag
 * hashing and disable the non-hashing fastpath. We need to go back and insert
 * reqs into the hash tables where the hashing searches will look for them.
 */
void
psmi_mq_fastpath_disable(psm2_mq_t mq)
{
	psm2_mq_req_t *curp, cur;
	struct mqq *qp;
	unsigned hashvals[NUM_HASH_CONFIGS];
	/* NOTE: 't' does double duty.  The outer loop below advances via
	 * next[t], which requires t == PSM2_ANYTAG_ANYSRC at that point;
	 * the inner for-loop always terminates with t back at
	 * PSM2_ANYTAG_ANYSRC, preserving the invariant. */
	int t = PSM2_ANYTAG_ANYSRC;

	mq->nohash_fastpath = 0;
	/* Everything in the unexpected_q needs to be duplicated into
	   each of the (three) unexpected hash tables. */
	qp = &mq->unexpected_q;
	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) {
		mq->unexpected_hash_len++;
		/* tag words 0..1 hashed together for the fully-specified
		 * table, and individually for the partial-wildcard ones. */
		hashvals[PSM2_TAG_SRC] =
		    hash_64(*(uint64_t *) cur->tag.tag) % NUM_HASH_BUCKETS;
		hashvals[PSM2_TAG_ANYSRC] =
		    hash_32(cur->tag.tag[0]) % NUM_HASH_BUCKETS;
		hashvals[PSM2_ANYTAG_SRC] =
		    hash_32(cur->tag.tag[1]) % NUM_HASH_BUCKETS;
		for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++)
			mq_qq_append_which(mq->unexpected_htab,
					   t, hashvals[t], cur);
	}

	/* Everything in the expected_q needs to be moved into the
	   (single) correct expected hash table. */
	qp = &mq->expected_q;
	for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) {
		/* must read next ptr before remove */
		curp = &cur->next[PSM2_ANYTAG_ANYSRC];
		if ((cur->tagsel.tag[0] == 0xFFFFFFFF) &&
		    (cur->tagsel.tag[1] == 0xFFFFFFFF)) {
			/* hash tag0 and tag1 */
			t = PSM2_TAG_SRC;
			hashvals[t] = hash_64(*(uint64_t *) cur->tag.tag) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else if (cur->tagsel.tag[0] == 0xFFFFFFFF) {
			t = PSM2_TAG_ANYSRC;
			hashvals[t] = hash_32(cur->tag.tag[0]) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else if (cur->tagsel.tag[1] == 0xFFFFFFFF) {
			t = PSM2_ANYTAG_SRC;
			hashvals[t] = hash_32(cur->tag.tag[1]) % NUM_HASH_BUCKETS;
			mq_qq_append_which(mq->expected_htab,
					   t, hashvals[t], cur);
		} else
			continue;	/* else, req must stay in ANY ANY */

		mq->expected_list_len--;
		mq->expected_hash_len++;
		mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC);
	}
}
+
+/* easy threshold to re-enable: if |hash| == 0 && |list| < X
+ aggressive threshold: if |hash| + |list| < X
+ even easier: if |hash| + |list| == 0
+ might be better approach to avoid constant bouncing between modes */
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq)
+{
+ if_pf(mq->nohash_fastpath == 0 &&
+ mq->unexpected_hash_len == 0 &&
+ mq->expected_hash_len == 0 &&
+ mq->unexpected_list_len == 0 &&
+ mq->expected_list_len == 0){
+ mq->nohash_fastpath = 1;
+ }
+}
+
+/*
+ * ! @brief PSM exposed version to allow PTLs to match
+ */
+
+/*! @brief Try to match against the MQ using a tag and tagsel
+ *
+ * @param[in] mq Message Queue
+ * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR.
+ * @param[in] tag Input Tag
+ * @param[in] tagsel Input Tag Selector
+ * @param[in] remove Non-zero to remove the req from the queue
+ *
+ * @returns NULL if no match or an mq request if there is a match
+ */
static
psm2_mq_req_t
mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src,
			 psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove)
{
	psm2_mq_req_t *curp;
	psm2_mq_req_t cur;
	unsigned hashval;
	/* i selects the sublist to search; j is the first sublist a
	 * matched request must be unlinked from: 0 in hashing mode (the
	 * request was duplicated into every table), or PSM2_ANYTAG_ANYSRC
	 * in fastpath mode where only that list is populated. */
	int i, j = 0;
	struct mqq *qp;

	if_pt (mq->nohash_fastpath) {
		i = j = PSM2_ANYTAG_ANYSRC;
		qp = &mq->unexpected_q;
	} else if ((tagsel->tag[0] == 0xFFFFFFFF) &&
		   (tagsel->tag[1] == 0xFFFFFFFF)) {
		/* tag[0..1] fully specified: hash both words as one key */
		i = PSM2_TAG_SRC;
		hashval = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else if (tagsel->tag[0] == 0xFFFFFFFF) {
		i = PSM2_TAG_ANYSRC;
		hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else if (tagsel->tag[1] == 0xFFFFFFFF) {
		i = PSM2_ANYTAG_SRC;
		hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
		qp = &mq->unexpected_htab[i][hashval];
	} else {
		/* unhashable tag */
		i = PSM2_ANYTAG_ANYSRC;
		qp = &mq->unexpected_q;
	}

	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) {
		psmi_assert(cur->peer != PSM2_MQ_ANY_ADDR);
		/* Match when every tag word agrees wherever tagsel has
		 * bits set (XOR then mask). */
		if ((src == PSM2_MQ_ANY_ADDR || src == cur->peer) &&
		    !((tag->tag[0] ^ cur->tag.tag[0]) & tagsel->tag[0]) &&
		    !((tag->tag[1] ^ cur->tag.tag[1]) & tagsel->tag[1]) &&
		    !((tag->tag[2] ^ cur->tag.tag[2]) & tagsel->tag[2])) {
			/* match! */
			if (remove) {
				if_pt (i == PSM2_ANYTAG_ANYSRC)
					mq->unexpected_list_len--;
				else
					mq->unexpected_hash_len--;
				/* unlink from every sublist from j up */
				for (; j < NUM_MQ_SUBLISTS; j++)
					mq_qq_remove_which(cur, j);
				psmi_mq_fastpath_try_reenable(mq);
			}
			return cur;
		}
	}
	return NULL;
}
+
/* Post a receive request on the expected queue appropriate for its
 * tagsel: the linear ANY/ANY list in fastpath mode (or when the tagsel
 * is unhashable), otherwise the one hash table matching its wildcard
 * pattern. */
static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
{
	unsigned hashval;
	int i;

	/* Record MQ-wide posting order. */
	req->timestamp = mq->timestamp++;
	if_pt (mq->nohash_fastpath) {
		mq_qq_append(&mq->expected_q, req);
		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
		mq->expected_list_len++;
		/* Linear list grew too long: switch to hashed matching. */
		if_pf (mq->expected_list_len >= HASH_THRESHOLD)
			psmi_mq_fastpath_disable(mq);
	} else if ((req->tagsel.tag[0] == 0xFFFFFFFF) &&
		   (req->tagsel.tag[1] == 0xFFFFFFFF)) {
		i = PSM2_TAG_SRC;
		hashval = hash_64(*(uint64_t *) req->tag.tag) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else if (req->tagsel.tag[0] == 0xFFFFFFFF) {
		i = PSM2_TAG_ANYSRC;
		hashval = hash_32(req->tag.tag[0]) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else if (req->tagsel.tag[1] == 0xFFFFFFFF) {
		i = PSM2_ANYTAG_SRC;
		hashval = hash_32(req->tag.tag[1]) % NUM_HASH_BUCKETS;
		mq_qq_append_which(mq->expected_htab, i, hashval, req);
		mq->expected_hash_len++;
	} else {
		/* unhashable tagsel: stays on the linear ANY/ANY list */
		mq_qq_append(&mq->expected_q, req);
		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
		mq->expected_list_len++;
	}
}
+
+/*! @brief Try to remove the req in the MQ
+ *
+ * @param[in] mq Message Queue
+ * @param[in] req MQ request
+ *
+ * @returns 1 if successfully removed, or 0 if req cannot be found.
+ */
+static
+int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req)
+{
+ int i;
+
+ /* item should only exist in one expected queue at a time */
+ psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1);
+
+ for (i = 0; i < NUM_MQ_SUBLISTS; i++)
+ if (req->q[i]) /* found */
+ break;
+ switch (i) {
+ case PSM2_ANYTAG_ANYSRC:
+ mq->expected_list_len--;
+ break;
+ case PSM2_TAG_SRC:
+ case PSM2_TAG_ANYSRC:
+ case PSM2_ANYTAG_SRC:
+ mq->expected_hash_len--;
+ break;
+ default:
+ return 0;
+ }
+
+ mq_qq_remove_which(req, i);
+ psmi_mq_fastpath_try_reenable(mq);
+ return 1;
+}
+
/* Copy nchars bytes from vsrc to vdest.  On CUDA builds, dispatches to
 * cudaMemcpy when either pointer is GPU memory; otherwise copies whole
 * 32-bit words first and then the 0-3 remaining bytes one at a time. */
void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *dest = (unsigned char *)vdest;
	const unsigned char *src = (const unsigned char *)vsrc;

#ifdef PSM_CUDA
	if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
		PSMI_CUDA_CALL(cudaMemcpy,
			       vdest, vsrc, nchars, cudaMemcpyDefault);
		return;
	}
#endif

	/* nchars/4 whole words via the dword copier... */
	if (nchars >> 2)
		hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
	dest += (nchars >> 2) << 2;
	src += (nchars >> 2) << 2;
	/* ...then the 0-3 byte tail; each case deliberately falls through. */
	switch (nchars & 0x03) {
	case 3:
		*dest++ = *src++;
		/* fallthrough */
	case 2:
		*dest++ = *src++;
		/* fallthrough */
	case 1:
		*dest++ = *src++;
	}
}
MOCK_DEF_EPILOGUE(psmi_mq_mtucpy);
+
/* Host-to-host copy of nchars bytes from vsrc to vdest; the buffers
 * must not overlap.  (Unlike psmi_mq_mtucpy, this variant never has to
 * consider GPU memory.) */
void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *out = (unsigned char *)vdest;
	const unsigned char *in = (const unsigned char *)vsrc;
	uint32_t i;

	for (i = 0; i < nchars; i++)
		out[i] = in[i];
}
+
#if 0 /* defined(__x86_64__) No consumers of mtucpy safe */
/* Disabled variant that would use hfi_dwordcpy_safe for the word copy;
 * kept for reference, compiled out because nothing calls it. */
void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars)
{
	unsigned char *dest = (unsigned char *)vdest;
	const unsigned char *src = (const unsigned char *)vsrc;
	if (nchars >> 2)
		hfi_dwordcpy_safe((uint32_t *) dest, (uint32_t *) src,
				  nchars >> 2);
	dest += (nchars >> 2) << 2;
	src += (nchars >> 2) << 2;
	/* 0-3 byte tail; cases deliberately fall through */
	switch (nchars & 0x03) {
	case 3:
		*dest++ = *src++;
		/* fallthrough */
	case 2:
		*dest++ = *src++;
		/* fallthrough */
	case 1:
		*dest++ = *src++;
	}
}
#endif
+
/*
 * Common iprobe/improbe path: search the unexpected queue for a match
 * and, failing that, advance progress once and retry.  remove_req != 0
 * dequeues a matched request (improbe semantics).  Acquires and
 * releases mq->progress_lock around the search.
 */
PSMI_ALWAYS_INLINE(
psm2_mq_req_t
psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src,
		     psm2_mq_tag_t *tag,
		     psm2_mq_tag_t *tagsel, int remove_req))
{
	psm2_mq_req_t req;

	PSMI_LOCK(mq->progress_lock);
	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);

	if (req != NULL) {
		PSMI_UNLOCK(mq->progress_lock);
		return req;
	}

	/* Nothing queued yet: poll the endpoint once to pull in any
	 * pending arrivals... */
	psmi_poll_internal(mq->ep, 1);
	/* try again */
	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);

	PSMI_UNLOCK(mq->progress_lock);
	return req;
}
+
+psm2_error_t
+__psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ psm2_mq_status2_t *status)
+{
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0);
+ psmi_assert_req_not_internal(req);
+
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status2_copy(req, status);
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_iprobe2)
+
/* Legacy 64-bit-tag probe: widens tag/tagsel into the 3-word tag format
 * and probes from any source without consuming the match. */
psm2_error_t
__psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
		 psm2_mq_status_t *status)
{
	psm2_mq_tag_t rtag;
	psm2_mq_tag_t rtagsel;
	psm2_mq_req_t req;

	PSM2_LOG_MSG("entering");
	PSMI_ASSERT_INITIALIZED();

	/* The caller's 64-bit tag fills tag[0..1]. */
	*(uint64_t *) rtag.tag = tag;
#ifdef PSM_DEBUG
	/* tag[2] only needs a defined value for debug tooling; since
	 * rtagsel.tag[2] is zeroed below, its value cannot affect
	 * matching. */
	rtag.tag[2] = 0;
#endif
	*(uint64_t *) rtagsel.tag = tagsel;
	rtagsel.tag[2] = 0;

	req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0);
	psmi_assert_req_not_internal(req);

	if (req != NULL) {
		if (status != NULL) {
			mq_status_copy(req, status);
		}
		PSM2_LOG_MSG("leaving");
		return PSM2_OK;
	}

	PSM2_LOG_MSG("leaving");

	return PSM2_MQ_NO_COMPLETIONS;
}
PSMI_API_DECL(psm2_mq_iprobe)
+
+/* Matched probe, psm2_mq_tag_t flavor: like iprobe2 but on a match the
+ * request is dequeued (remove_req=1) and handed back in *reqo so the
+ * caller can receive it later with psm2_mq_imrecv.  *reqo is NULL when
+ * nothing matched. */
+psm2_error_t
+__psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status2_t *status)
+{
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+
+ req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status2_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe2)
+
+/* Matched probe, legacy 64-bit tag interface.  Widens the tag pair and
+ * probes any source with improbe (dequeue) semantics; a match is
+ * returned in *reqo for a later psm2_mq_imrecv. */
+psm2_error_t
+__psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status_t *status)
+{
+ psm2_mq_tag_t rtag;
+ psm2_mq_tag_t rtagsel;
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ /* Word 2 is masked out via rtagsel; zeroed only for debug builds. */
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+
+ req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe)
+
+/* Attempt to cancel an outstanding request.  Only receives that are
+ * still in the POSTED state (not yet matched) can be cancelled; a
+ * cancelled receive is marked complete and appended to the completed
+ * queue so the caller can still reap it with wait/test.  Cancelling a
+ * send is not supported and fails with PSM2_PARAM_ERR. */
+psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq)
+{
+ psm2_mq_req_t req = *ireq;
+ psm2_mq_t mq;
+ psm2_error_t err = PSM2_OK;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+
+ mq = req->mq;
+ PSMI_LOCK(mq->progress_lock);
+
+ if (MQE_TYPE_IS_RECV(req->type)) {
+ if (req->state == MQ_STATE_POSTED) {
+ int rc;
+
+ /* Unlink from the expected queue/hash; this must
+ * succeed for a POSTED request. */
+ rc = mq_req_remove_single(mq, req);
+ psmi_assert_always(rc);
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ err = PSM2_OK;
+ } else
+ /* Already matched or completed: too late to cancel. */
+ err = PSM2_MQ_NO_COMPLETIONS;
+ } else {
+ err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+ "Cannot cancel send requests (req=%p)",
+ req);
+ }
+
+ PSMI_UNLOCK(mq->progress_lock);
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_cancel)
+
+/* This is the only PSM function that blocks.
+ * We handle it in a special manner since we don't know what the user's
+ * execution environment is (threads, oversubscribing processes, etc).
+ *
+ * The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * do_lock non-zero means this routine owns taking/releasing the
+ * progress lock; zero means the caller already manages it.  On normal
+ * completion the request is freed and *ireq reset to PSM2_MQ_REQINVALID.
+ *
+ * Fix: the unlock epilogue previously read req->mq after
+ * psmi_mq_req_free(req) — a use-after-free.  The owning MQ is now
+ * cached before the request can be freed. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status,
+ psmi_mq_status_copy_t status_copy,
+ int do_lock))
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_t mq;
+
+ psm2_mq_req_t req = *ireq;
+ if (req == PSM2_MQ_REQINVALID) {
+ return PSM2_OK;
+ }
+
+ /* Cache the owning MQ now: req may be freed below, and the unlock
+ * epilogue must not dereference a freed request. */
+ mq = req->mq;
+
+ if (do_lock)
+ PSMI_LOCK(mq->progress_lock);
+
+ if (req->state != MQ_STATE_COMPLETE) {
+ /* We'll be waiting on this req, mark it as so */
+ req->type |= MQE_TYPE_WAITING;
+
+ _HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n",
+ req, req->buf, req->buf_len);
+
+ if (req->testwait_callback) {
+ err = req->testwait_callback(ireq);
+ if (do_lock)
+ PSMI_UNLOCK(mq->progress_lock);
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+ return err;
+ }
+
+ PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE);
+
+ if (err > PSM2_OK_NO_PROGRESS)
+ goto fail_with_lock;
+ else
+ err = PSM2_OK;
+ }
+
+ if(!psmi_is_req_internal(req))
+ mq_qq_remove(&mq->completed_q, req);
+
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+
+ _HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n",
+ req, req->buf, req->buf_len, req->error_code);
+
+ psmi_mq_req_free(req);
+ *ireq = PSM2_MQ_REQINVALID;
+
+fail_with_lock:
+ if (do_lock)
+ PSMI_UNLOCK(mq->progress_lock);
+ return err;
+}
+
+/* Block until *ireq completes, copy its completion information into
+ * *status (psm2_mq_status2_t flavor), and release the request. */
+psm2_error_t
+__psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_req_not_internal(*ireq);
+
+ ret = psmi_mq_wait_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_wait2)
+
+/* Block until *ireq completes, copy its completion information into
+ * *status (legacy psm2_mq_status_t flavor), and release the request. */
+psm2_error_t
+__psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert_req_not_internal(*ireq);
+
+ ret = psmi_mq_wait_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status_copy, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_wait)
+
+/* Blocking wait on an internal (never user-visible) request: no status
+ * copy, and do_lock=0 so no progress-lock handling is done here —
+ * presumably the caller already holds the lock; verify at call sites. */
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq)
+{
+ return psmi_mq_wait_inner(ireq, NULL, NULL, 0);
+}
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * Non-blocking completion test.  Returns PSM2_MQ_NO_COMPLETIONS when
+ * the request is not complete (no progress is driven here unless the
+ * request carries a testwait_callback).  On completion the status is
+ * copied out, the request is removed from the completed queue and
+ * freed, and *ireq is reset to PSM2_MQ_REQINVALID. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status,
+ psmi_mq_status_copy_t status_copy))
+{
+ psm2_mq_req_t req = *ireq;
+ psm2_error_t err = PSM2_OK;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == PSM2_MQ_REQINVALID) {
+ return PSM2_OK;
+ }
+
+ if (req->state != MQ_STATE_COMPLETE) {
+ if (req->testwait_callback) {
+ /* Give the protocol layer a chance to finish the
+ * request; callback runs under the progress lock. */
+ PSMI_LOCK(req->mq->progress_lock);
+ err = req->testwait_callback(ireq);
+ if (status != NULL) {
+ status_copy(req, status);
+ }
+ PSMI_UNLOCK(req->mq->progress_lock);
+ return err;
+ } else
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+
+ if (status != NULL)
+ status_copy(req, status);
+
+ _HFI_VDBG
+ ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n",
+ req, req->tag.tag[0], req->tag.tag[1], req->tag.tag[2], req->buf,
+ req->buf_len, req->error_code);
+
+ /* Complete: detach from the completed queue and release the request
+ * under the progress lock. */
+ PSMI_LOCK(req->mq->progress_lock);
+ mq_qq_remove(&req->mq->completed_q, req);
+ psmi_mq_req_free(req);
+ PSMI_UNLOCK(req->mq->progress_lock);
+
+ *ireq = PSM2_MQ_REQINVALID;
+
+ return err;
+}
+
+/* Non-blocking completion test, psm2_mq_status2_t flavor; on success
+ * the status is copied out and the request released. */
+psm2_error_t
+__psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_test_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_test2)
+
+/* Non-blocking completion test, legacy psm2_mq_status_t flavor; on
+ * success the status is copied out and the request released. */
+psm2_error_t
+__psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_test_inner(ireq, status,
+ (psmi_mq_status_copy_t) mq_status_copy);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_test)
+
+/* Non-blocking send, psm2_mq_tag_t flavor.  Dispatches to the
+ * destination's protocol layer (ptlctl->mq_isend) under the progress
+ * lock; the resulting send request is returned in *req for a later
+ * wait/test. */
+psm2_error_t
+__psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *req)
+{
+ psm2_error_t err;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(stag != NULL);
+
+ PSMI_LOCK(mq->progress_lock);
+ err =
+ dest->ptlctl->mq_isend(mq, dest, flags, stag, buf, len, context,
+ req);
+ PSMI_UNLOCK(mq->progress_lock);
+
+#if 0
+#ifdef PSM_VALGRIND
+ /* If the send isn't completed yet, make sure that we mark the memory as
+ * unaccessible
+ */
+ if (*req != PSM2_MQ_REQINVALID && (*req)->state != MQ_STATE_COMPLETE)
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+#endif
+#endif
+ psmi_assert(*req != NULL);
+ psmi_assert_req_not_internal(*req);
+
+ /* Record the peer so status copies can report the source later. */
+ (*req)->peer = dest;
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_isend2)
+
+/* Non-blocking send, legacy 64-bit tag interface.  Widens the tag to
+ * the 96-bit form (word 2 zeroed) and dispatches to the destination's
+ * protocol layer under the progress lock. */
+psm2_error_t
+__psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm2_mq_req_t *req)
+{
+ psm2_error_t err;
+ psm2_mq_tag_t tag;
+
+ PSM2_LOG_MSG("entering");
+
+ *((uint64_t *) tag.tag) = stag;
+ tag.tag[2] = 0;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+ err =
+ dest->ptlctl->mq_isend(mq, dest, flags, &tag, buf, len, context,
+ req);
+ PSMI_UNLOCK(mq->progress_lock);
+
+#if 0
+#ifdef PSM_VALGRIND
+ /* If the send isn't completed yet, make sure that we mark the memory as
+ * unaccessible
+ */
+ if (*req != PSM2_MQ_REQINVALID && (*req)->state != MQ_STATE_COMPLETE)
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+#endif
+#endif
+ psmi_assert(*req != NULL);
+ psmi_assert_req_not_internal(*req);
+
+ /* Record the peer so status copies can report the source later. */
+ (*req)->peer = dest;
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_mq_isend)
+
+/* Blocking send, psm2_mq_tag_t flavor: hand the message to the
+ * destination's protocol layer under the progress lock and return once
+ * the protocol-level send call completes. */
+psm2_error_t
+__psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+ psm2_mq_tag_t *stag, const void *buf, uint32_t len)
+{
+ psm2_error_t rc;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+ psmi_assert(stag != NULL);
+
+ PSMI_LOCK(mq->progress_lock);
+ rc = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return rc;
+}
+PSMI_API_DECL(psm2_mq_send2)
+
+/* Blocking send, legacy 64-bit tag interface. */
+psm2_error_t
+__psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len)
+{
+ psm2_mq_tag_t ltag;
+ psm2_error_t rc;
+
+ PSM2_LOG_MSG("entering");
+
+ /* Widen the 64-bit legacy tag to the 96-bit form (word 2 zero). */
+ *((uint64_t *) ltag.tag) = stag;
+ ltag.tag[2] = 0;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+ rc = dest->ptlctl->mq_send(mq, dest, flags, &ltag, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ PSM2_LOG_MSG("leaving");
+ return rc;
+}
+PSMI_API_DECL(psm2_mq_send)
+
+/*
+ * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv. This code assumes
+ * that the provided request has been matched, and begins copying message data
+ * that has already arrived to the user's buffer. Any remaining data is copied
+ * by PSM polling until the message is complete.
+ */
+static psm2_error_t
+psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
+{
+ uint32_t copysz;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(MQE_TYPE_IS_RECV(req->type));
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn_t psmi_mtucpy_fn;
+ if (req->is_buf_gpu_mem)
+ psmi_mtucpy_fn = psmi_mq_mtucpy;
+ else
+ psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+#endif
+
+ switch (req->state) {
+ case MQ_STATE_COMPLETE:
+ if (req->buf != NULL) { /* 0-byte messages don't alloc a sysbuf */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf, copysz);
+ psmi_mq_sysbuf_free(mq, req->buf);
+ }
+ req->buf = buf;
+ req->buf_len = len;
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_STATE_UNEXP: /* not done yet */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+ /* Copy What's been received so far and make sure we don't receive
+ * any more than copysz. After that, swap system with user buffer
+ */
+ req->recv_msgoff = min(req->recv_msgoff, copysz);
+ if (req->recv_msgoff) {
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf,
+ req->recv_msgoff);
+ }
+ /* What's "left" is no access */
+ VALGRIND_MAKE_MEM_NOACCESS((void *)((uintptr_t) buf +
+ req->recv_msgoff),
+ len - req->recv_msgoff);
+ psmi_mq_sysbuf_free(mq, req->buf);
+
+ req->state = MQ_STATE_MATCHED;
+ req->buf = buf;
+ req->buf_len = len;
+ break;
+
+ case MQ_STATE_UNEXP_RV: /* rendez-vous ... */
+ copysz = mq_set_msglen(req, len, req->send_msglen);
+ /* Copy What's been received so far and make sure we don't receive
+ * any more than copysz. After that, swap system with user buffer
+ */
+ req->recv_msgoff = min(req->recv_msgoff, copysz);
+ if (req->recv_msgoff) {
+#ifdef PSM_CUDA
+ psmi_mtucpy_fn
+#else
+ psmi_mq_mtucpy
+#endif
+ (buf, (const void *)req->buf,
+ req->recv_msgoff);
+ }
+ /* What's "left" is no access */
+ VALGRIND_MAKE_MEM_NOACCESS((void *)((uintptr_t) buf +
+ req->recv_msgoff),
+ len - req->recv_msgoff);
+ if (req->send_msgoff) {
+ psmi_mq_sysbuf_free(mq, req->buf);
+ }
+
+ req->state = MQ_STATE_MATCHED;
+ req->buf = buf;
+ req->buf_len = len;
+ req->rts_callback(req, 0);
+ break;
+
+ default:
+ fprintf(stderr, "Unexpected state %d in req %p\n", req->state,
+ req);
+ fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+ req->type, req->mq, req->tag.tag[0], req->tag.tag[1],
+ req->tag.tag[2]);
+ abort();
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/* Post a receive, psm2_mq_tag_t flavor.  First checks (and on a match
+ * removes from) the unexpected queue; a match starts delivery right
+ * away via psm2_mq_irecv_inner, otherwise a new request is added to
+ * the expected queue/hashes to await arrival.  The request is returned
+ * in *reqo for wait/test. */
+psm2_error_t
+__psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ uint32_t flags, void *buf, uint32_t len, void *context,
+ psm2_mq_req_t *reqo)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_req_t req;
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees the all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)buf);
+ gpu_mem = 1;
+ } else
+ gpu_mem = 0;
+#endif
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ PSMI_LOCK(mq->progress_lock);
+
+ /* First check unexpected Queue and remove req if found */
+ req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY);
+
+ if (req == NULL) {
+ /* prepost before arrival, add to expected q */
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ if_pf(req == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto ret;
+ }
+
+ req->peer = src;
+ req->tag = *tag;
+ req->tagsel = *tagsel;
+ req->state = MQ_STATE_POSTED;
+ req->buf = buf;
+ req->buf_len = len;
+ req->recv_msglen = len;
+ req->recv_msgoff = 0;
+ req->context = context;
+
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem = gpu_mem;
+#endif
+
+ /* Nobody should touch the buffer after it's posted */
+ VALGRIND_MAKE_MEM_NOACCESS(buf, len);
+
+ mq_add_to_expected_hashes(mq, req);
+ _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x "
+ " tagsel=%08x.%08x.%08x req=%p\n",
+ buf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+ tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+ } else {
+ _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x"
+ " tagsel=%08x.%08x.%08x req=%p\n", buf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2],
+ tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem = gpu_mem;
+#endif
+
+ req->context = context;
+
+ /* Message already (partially) here: begin delivery now. */
+ psm2_mq_irecv_inner(mq, req, buf, len);
+ }
+
+ret:
+ PSMI_UNLOCK(mq->progress_lock);
+ psmi_assert_req_not_internal(req);
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_irecv2)
+
+/* Post a receive, legacy 64-bit tag interface.  Widens tag/tagsel to
+ * the 96-bit form and forwards to __psm2_mq_irecv2 with any-source
+ * matching. */
+psm2_error_t
+__psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags,
+ void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo)
+{
+ psm2_error_t rv;
+ psm2_mq_tag_t rtag;
+ psm2_mq_tag_t rtagsel;
+
+ *reqo = NULL;
+
+ PSM2_LOG_MSG("entering");
+
+ /* Word 2 is masked out via rtagsel; zeroed only for debug builds. */
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+ rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel,
+ flags, buf, len, context, reqo);
+
+ psmi_assert_req_not_internal(*reqo);
+ PSM2_LOG_MSG("leaving");
+
+ return rv;
+}
+PSMI_API_DECL(psm2_mq_irecv)
+
+/* Receive a message previously matched with psm2_mq_improbe/improbe2:
+ * *reqo holds the matched request; begin delivering its data into buf.
+ * Fails with PSM2_PARAM_ERR if the request handle is invalid. */
+psm2_error_t
+__psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+ void *context, psm2_mq_req_t *reqo)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_mq_req_t req = *reqo;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == PSM2_MQ_REQINVALID) {
+ err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+ "Invalid request (req=%p)", req);
+ } else {
+ /* Message is already matched -- begin delivering message data to the
+ user's buffer. */
+ req->context = context;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees the all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)buf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ PSMI_LOCK(mq->progress_lock);
+ psm2_mq_irecv_inner(mq, req, buf, len);
+ PSMI_UNLOCK(mq->progress_lock);
+ }
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_imrecv)
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ *
+ * Peek at the first entry of the completed queue without dequeueing
+ * it; if the queue is empty, drive progress once under the lock and
+ * re-check.  The initial check reads completed_q.first without the
+ * progress lock — NOTE(review): presumably a benign racy fast path;
+ * confirm against the MQ threading model. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq,
+ void *status,
+ psmi_mq_status_copy_t status_copy))
+{
+ psm2_mq_req_t req;
+
+ PSMI_ASSERT_INITIALIZED();
+
+ if ((req = mq->completed_q.first) == NULL) {
+ PSMI_LOCK(mq->progress_lock);
+ psmi_poll_internal(mq->ep, 1);
+ if ((req = mq->completed_q.first) == NULL) {
+ PSMI_UNLOCK(mq->progress_lock);
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+ PSMI_UNLOCK(mq->progress_lock);
+ }
+ /* something in the queue */
+ *oreq = req;
+ if (status != NULL)
+ status_copy(req, status);
+
+ return PSM2_OK;
+}
+
+/* Peek at the first completed request without dequeueing it;
+ * psm2_mq_status2_t flavor. */
+psm2_error_t
+__psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status)
+{
+ psm2_error_t ret;
+
+ *oreq = NULL;
+
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_ipeek_inner(mq, oreq, status,
+ (psmi_mq_status_copy_t) mq_status2_copy);
+
+ psmi_assert_req_not_internal(*oreq);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_ipeek2)
+
+/* Peek at the first completed request without dequeueing it; legacy
+ * psm2_mq_status_t flavor. */
+psm2_error_t
+__psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
+{
+ psm2_error_t ret;
+
+ *oreq = NULL;
+ PSM2_LOG_MSG("entering");
+ ret = psmi_mq_ipeek_inner(mq, oreq, status,
+ (psmi_mq_status_copy_t) mq_status_copy);
+
+ psmi_assert_req_not_internal(*oreq);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_ipeek)
+
+/* Get (get != 0) or set a single MQ option.  Shared by
+ * psm2_mq_getopt/setopt and by option processing in psm2_mq_init.
+ * value points at a uint32_t for the keys handled here.  Unknown keys
+ * return PSM2_PARAM_ERR. */
+static
+psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t val32;
+
+ switch (key) {
+ case PSM2_MQ_RNDV_HFI_SZ:
+ /* HFI eager-to-rendezvous switchover threshold (bytes). */
+ if (get)
+ *((uint32_t *) value) = mq->hfi_thresh_rv;
+ else {
+ val32 = *((uint32_t *) value);
+ mq->hfi_thresh_rv = val32;
+ }
+ _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n",
+ mq->hfi_thresh_rv, get ? "GET" : "SET");
+ break;
+
+ case PSM2_MQ_RNDV_SHM_SZ:
+ /* Shared-memory eager-to-rendezvous switchover threshold. */
+ if (get)
+ *((uint32_t *) value) = mq->shm_thresh_rv;
+ else {
+ val32 = *((uint32_t *) value);
+ mq->shm_thresh_rv = val32;
+ }
+ _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n",
+ mq->shm_thresh_rv, get ? "GET" : "SET");
+ break;
+ case PSM2_MQ_MAX_SYSBUF_MBYTES:
+ /* Deprecated: this option no longer does anything. */
+ break;
+
+ default:
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown option key=%u", key);
+ break;
+ }
+ return err;
+}
+
+/* Public getter for MQ options; see psmi_mqopt_ctl for supported keys. */
+psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+ ret = psmi_mqopt_ctl(mq, key, value, 1);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_getopt)
+
+/* Public setter for MQ options; see psmi_mqopt_ctl for supported keys. */
+psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value)
+{
+ psm2_error_t ret;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+ ret = psmi_mqopt_ctl(mq, key, (void *)value, 0);
+ PSM2_LOG_MSG("leaving");
+ return ret;
+}
+PSMI_API_DECL(psm2_mq_setopt)
+
+/*
+ * This is the API for the user. We actually allocate the MQ much earlier, but
+ * the user can set options after obtaining an endpoint
+ */
+psm2_error_t
+__psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+ const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo)
+{
+ psm2_error_t err = PSM2_OK;
+
+ if (ep == NULL) {
+ err = PSM2_PARAM_ERR;
+ goto fail;
+ }
+
+ psm2_mq_t mq = ep->mq;
+ int i;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+ psmi_assert_always(mq != NULL);
+ psmi_assert_always(mq->ep != NULL);
+
+ /* Process options */
+ for (i = 0; err == PSM2_OK && i < numopts; i++)
+ err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0);
+ if (err != PSM2_OK) /* error already handled */
+ goto fail;
+
+ /* Initialize the unexpected system buffer allocator */
+ psmi_mq_sysbuf_init(mq);
+ char buf[128];
+ psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf);
+ _HFI_VDBG("%s", buf);
+
+ *mqo = mq;
+
+fail:
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_mq_init)
+
+/* Dump the MQ's statistics counters via _HFI_INFO; invoked from
+ * psm2_mq_finalize when mq->print_stats is set (PSM2_MQ_PRINT_STATS). */
+static
+void
+psmi_mq_print_stats(psm2_mq_t mq)
+{
+ psm2_mq_stats_t stats;
+
+ psm2_mq_get_stats(mq, &stats);
+ _HFI_INFO("rx_user_bytes %lu\n", stats.rx_user_bytes);
+ _HFI_INFO("rx_user_num %lu\n", stats.rx_user_num);
+ _HFI_INFO("rx_sys_bytes %lu\n", stats.rx_sys_bytes);
+ _HFI_INFO("rx_sys_num %lu\n", stats.rx_sys_num);
+
+ _HFI_INFO("tx_num %lu\n", stats.tx_num);
+ _HFI_INFO("tx_eager_num %lu\n", stats.tx_eager_num);
+ _HFI_INFO("tx_eager_bytes %lu\n", stats.tx_eager_bytes);
+ _HFI_INFO("tx_rndv_num %lu\n", stats.tx_rndv_num);
+ _HFI_INFO("tx_rndv_bytes %lu\n", stats.tx_rndv_bytes);
+
+ _HFI_INFO("tx_shm_num %lu\n", stats.tx_shm_num);
+ _HFI_INFO("rx_shm_num %lu\n", stats.rx_shm_num);
+
+ _HFI_INFO("rx_sysbuf_num %lu\n", stats.rx_sysbuf_num);
+ _HFI_INFO("rx_sysbuf_bytes %lu\n", stats.rx_sysbuf_bytes);
+}
+
+/* Finalize an MQ: optionally dump statistics when PSM2_MQ_PRINT_STATS
+ * was enabled.  Always returns PSM2_OK once initialization checks pass. */
+psm2_error_t __psm2_mq_finalize(psm2_mq_t mq)
+{
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+
+ if (mq->print_stats != 0)
+ psmi_mq_print_stats(mq);
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_finalize)
+
+/* Copy the MQ's accumulated statistics into *stats (struct copy,
+ * equivalent to the previous memcpy of sizeof(psm2_mq_stats_t)). */
+void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats)
+{
+ PSM2_LOG_MSG("entering");
+ *stats = mq->stats;
+ PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_mq_get_stats)
+
+/* Allocate and initialize a new MQ structure with built-in defaults
+ * (thresholds chosen per CPU model; later overridden by
+ * psmi_mq_initialize_defaults).  Returns PSM2_OK and the MQ in *mqo,
+ * or PSM2_NO_MEMORY / request-pool init error on failure. */
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo)
+{
+ psm2_error_t err = PSM2_OK;
+
+ psm2_mq_t mq =
+ (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq));
+ if (mq == NULL) {
+ err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for mq endpoint");
+ goto fail;
+ }
+
+ mq->ep = NULL;
+ /*mq->unexpected_callback = NULL; */
+ mq->memmode = psmi_parse_memmode();
+
+ /* Empty all match queues and hash tables. */
+ memset(mq->unexpected_htab, 0,
+ NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+ memset(mq->expected_htab, 0,
+ NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+ memset(&mq->expected_q, 0, sizeof(struct mqq));
+ memset(&mq->unexpected_q, 0, sizeof(struct mqq));
+ memset(&mq->completed_q, 0, sizeof(struct mqq));
+ memset(&mq->outoforder_q, 0, sizeof(struct mqq));
+ STAILQ_INIT(&mq->eager_q);
+
+
+ /* The values are overwritten in initialize_defaults, they're just set to
+ * sensible defaults until then */
+ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M)
+ {
+ mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2;
+ mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2;
+ } else {
+ mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON;
+ mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON;
+ }
+ mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ mq->hfi_base_window_rv = MQ_HFI_THRESH_RNDV_CUDA;
+#endif
+ mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
+
+ memset(&mq->stats, 0, sizeof(psm2_mq_stats_t));
+ err = psmi_mq_req_init(mq);
+ if (err)
+ goto fail;
+
+ *mqo = mq;
+
+ return PSM2_OK;
+fail:
+ if (mq != NULL)
+ psmi_free(mq);
+ return err;
+}
+
+/* Override the MQ's built-in defaults from PSM2_MQ_* environment
+ * variables: tiny-packet threshold (capped at 8), eager-to-rendezvous
+ * switchovers for HFI and shm, rendezvous window size (capped at 4MB),
+ * and whether stats are printed at finalize. */
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq)
+{
+ union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv,
+ env_shmrv, env_stats;
+
+ psmi_getenv("PSM2_MQ_TINY_HFI_THRESH",
+ "hfi tiny packet switchover (max 8, default 8)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny);
+ mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8);
+
+ psmi_getenv("PSM2_MQ_RNDV_HFI_THRESH",
+ "hfi eager-to-rendezvous switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv);
+ mq->hfi_thresh_rv = env_hfirv.e_uint;
+
+ psmi_getenv("PSM2_MQ_RNDV_HFI_WINDOW",
+ "hfi rendezvous window size, max 4M",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin);
+ mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint);
+
+ /* Re-evaluate this since it may have changed after initializing the shm
+ * device */
+ mq->shm_thresh_rv = psmi_shm_mq_rv_thresh;
+ psmi_getenv("PSM2_MQ_RNDV_SHM_THRESH",
+ "shm eager-to-rendezvous switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
+ mq->shm_thresh_rv = env_shmrv.e_uint;
+
+ psmi_getenv("PSM2_MQ_PRINT_STATS",
+ "Print MQ stats during finalization",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val) 0, &env_stats);
+ mq->print_stats = env_stats.e_uint;
+
+ /* Start on the non-hashed matching fast path until the protocol
+ * layer decides otherwise. */
+ mq->nohash_fastpath = 1;
+ return PSM2_OK;
+}
+
+/* Tear down an MQ: release the request pools and unexpected-message
+ * system buffers, then free the MQ itself.  MOCKABLE so unit tests can
+ * substitute their own implementation. */
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq)
+{
+ psmi_mq_req_fini(mq);
+ psmi_mq_sysbuf_fini(mq);
+ psmi_free(mq);
+ return PSM2_OK;
+}
+MOCK_DEF_EPILOGUE(psmi_mq_free);
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
new file mode 100644
index 0000000..f20bf34
--- /dev/null
+++ b/psm_mq_internal.h
@@ -0,0 +1,639 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef MQ_INT_H
+#define MQ_INT_H
+
+/* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */
+#ifdef malloc
+#undef malloc
+#endif
+#ifdef free
+#undef free
+#endif
+#include <smmintrin.h>
+#include "psm_user.h"
+#include "psm_sysbuf.h"
+
+#include "psm2_mock_testing.h"
+
+#if 0
+typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t)
+ (psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr,
+ uint64_t tag, uint32_t send_msglen, const void *payload,
+ uint32_t paylen);
+#endif
+
+#define NUM_HASH_BUCKETS 64
+#define HASH_THRESHOLD 65
+#define NUM_HASH_CONFIGS 3
+#define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1)
+#define REMOVE_ENTRY 1
+
+enum psm2_mq_tag_pattern {
+ PSM2_TAG_SRC = 0,
+ PSM2_TAG_ANYSRC,
+ PSM2_ANYTAG_SRC,
+ PSM2_ANYTAG_ANYSRC,
+};
+
+/*
+ * Matched-queue instance state.  Owns the send/receive request pools,
+ * the expected/unexpected matching structures (flat lists plus three
+ * hash tables indexed by tag pattern), and accumulated statistics.
+ */
+struct psm2_mq {
+ psm2_ep_t ep; /**> ep back pointer */
+ mpool_t sreq_pool;
+ mpool_t rreq_pool;
+
+ /* Hash tables used once the flat-list fastpath is disabled; first index
+ is the psm2_mq_tag_pattern, second the bucket */
+ struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+ struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+
+ /* in case the compiler can't figure out how to preserve the hashed values
+ between mq_req_match() and mq_add_to_unexpected_hashes() ... */
+ unsigned hashvals[NUM_HASH_CONFIGS];
+
+ /*psm_mq_unexpected_callback_fn_t unexpected_callback; */
+ struct mqq expected_q; /**> Preposted (expected) queue */
+ struct mqq unexpected_q; /**> Unexpected queue */
+ struct mqq completed_q; /**> Completed queue */
+
+ struct mqq outoforder_q; /**> OutofOrder queue */
+ STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */
+
+ /* Protocol switchover thresholds; defaults and environment overrides
+ are applied in psmi_mq_initialize_defaults() */
+ uint32_t hfi_thresh_tiny;
+ uint32_t hfi_thresh_rv;
+ uint32_t shm_thresh_rv;
+ uint32_t hfi_base_window_rv; /**> this is a base rndv window size,
+ will be further trimmed down per-connection based
+ on the peer's MTU */
+ int memmode;
+
+ uint64_t timestamp;
+ psm2_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */
+ int print_stats; /**> dump stats at finalization (PSM2_MQ_PRINT_STATS) */
+ /* Non-zero while matching uses only the flat PSM2_ANYTAG_ANYSRC lists;
+ cleared when the unexpected list reaches HASH_THRESHOLD */
+ int nohash_fastpath;
+ unsigned unexpected_hash_len;
+ unsigned unexpected_list_len;
+ unsigned expected_hash_len;
+ unsigned expected_list_len;
+
+ psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS];
+ int mem_ctrl_is_init;
+ uint64_t mem_ctrl_total_bytes;
+
+ psmi_lock_t progress_lock;
+};
+
+#define MQ_HFI_THRESH_TINY 8
+#define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16000 /* Eager Xeon non-blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */
+
+#define MQ_HFI_THRESH_RNDV_PHI2 200000
+#define MQ_HFI_THRESH_RNDV_XEON 64000
+
+#define MQ_HFI_WINDOW_RNDV_PHI2 4194304
+#define MQ_HFI_WINDOW_RNDV_XEON 131072
+
+#ifdef PSM_CUDA
+#define MQ_HFI_THRESH_RNDV_CUDA 2097152
+#endif
+
+#define MQ_SHM_THRESH_RNDV 16000
+
+#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND)
+#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV)
+
+#define MQE_TYPE_SEND 0x1000
+#define MQE_TYPE_RECV 0x2000
+#define MQE_TYPE_FLAGMASK 0x0fff
+#define MQE_TYPE_WAITING 0x0001
+#define MQE_TYPE_WAITING_PEER 0x0004
+#define MQE_TYPE_EAGER_QUEUE 0x0008
+
+#define MQ_STATE_COMPLETE 0
+#define MQ_STATE_POSTED 1
+#define MQ_STATE_MATCHED 2
+#define MQ_STATE_UNEXP 3
+#define MQ_STATE_UNEXP_RV 4
+#define MQ_STATE_FREE 5
+
+/*
+ * These must match the ips protocol message opcode.
+ */
+#define MQ_MSG_TINY 0xc1
+#define MQ_MSG_SHORT 0xc2
+#define MQ_MSG_EAGER 0xc3
+#define MQ_MSG_LONGRTS 0xc4
+
+/*
+ * Descriptor allocation limits.
+ * The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure
+ */
+#define MQ_SENDREQ_LIMITS { \
+ .env = "PSM2_MQ_SENDREQS_MAX", \
+ .descr = "Max num of isend requests in flight", \
+ .env_level = PSMI_ENVVAR_LEVEL_USER, \
+ .minval = 1, \
+ .maxval = ~0, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \
+ }
+
+#define MQ_RECVREQ_LIMITS { \
+ .env = "PSM2_MQ_RECVREQS_MAX", \
+ .descr = "Max num of irecv requests in flight", \
+ .env_level = PSMI_ENVVAR_LEVEL_USER, \
+ .minval = 1, \
+ .maxval = ~0, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \
+ }
+
+typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted);
+typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req);
+
+
+/* If request is marked as internal, then it will not
+ be exposed to the user, will not be added to the mq->completed_q.
+ This flag is set if request is used by e.g. MPI_SEND */
+#define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0)
+
+#define psmi_is_req_internal(req) ((req)->flags & PSMI_REQ_FLAG_IS_INTERNAL)
+
+#define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \
+ (!psmi_is_req_internal(req)))
+
+/* MQ request descriptor (receive flavor is the default).  One request
+ * tracks a single send or receive through matching and completion; the
+ * next/prev arrays let the same request be linked simultaneously into
+ * each per-tag-pattern sublist plus the flat list. */
+struct psm2_mq_req {
+ struct {
+ psm2_mq_req_t next[NUM_MQ_SUBLISTS];
+ psm2_mq_req_t prev[NUM_MQ_SUBLISTS];
+ STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */
+ };
+ struct mqq *q[NUM_MQ_SUBLISTS]; /* back-pointer to containing queue per sublist */
+ uint64_t timestamp; /* arrival/post order; used for match tie-breaking */
+ uint32_t state; /* MQ_STATE_* */
+ uint32_t type; /* MQE_TYPE_* bits */
+ psm2_mq_t mq;
+
+ /* Tag matching vars */
+ psm2_epaddr_t peer;
+ psm2_mq_tag_t tag;
+ psm2_mq_tag_t tagsel; /* used for receives */
+
+ /* Some PTLs want to get notified when there's a test/wait event */
+ mq_testwait_callback_fn_t testwait_callback;
+
+ /* Buffer attached to request. May be a system buffer for unexpected
+ * messages or a user buffer when an expected message */
+ uint8_t *buf;
+ uint32_t buf_len;
+ uint32_t error_code;
+
+ uint16_t msg_seqnum; /* msg seq num for mctxt */
+ uint32_t recv_msglen; /* Message length we are ready to receive */
+ uint32_t send_msglen; /* Message length from sender */
+ uint32_t recv_msgoff; /* Message offset into buf */
+ union {
+ uint32_t send_msgoff; /* Bytes received so far.. can be larger than buf_len */
+ uint32_t recv_msgposted;
+ };
+ uint32_t rts_reqidx_peer;
+
+ uint64_t flags; /* PSMI_REQ_FLAG_* (e.g. IS_INTERNAL) */
+
+ /* Used for request to send messages */
+ void *context; /* user context associated to sends or receives */
+
+ /* Used to keep track of unexpected rendezvous */
+ mq_rts_callback_fn_t rts_callback;
+ psm2_epaddr_t rts_peer;
+ uintptr_t rts_sbuf;
+
+#ifdef PSM_CUDA
+ /* is_buf_gpu_mem - used to indicate if the send or receive is issued
+ * on a device/host buffer.
+ * is_sendbuf_gpu_mem - Used to always select TID path on the receiver
+ * when send is on a device buffer
+ */
+ uint8_t is_buf_gpu_mem;
+ uint8_t is_sendbuf_gpu_mem;
+ STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch;
+ uint32_t prefetch_send_msgoff;
+ int cuda_hostbuf_used;
+ cudaIpcMemHandle_t cuda_ipc_handle;
+ cudaEvent_t cuda_ipc_event;
+ uint8_t cuda_ipc_handle_attached;
+#endif
+
+ /* PTLs get to store their own per-request data. MQ manages the allocation
+ * by allocating psm2_mq_req so that ptl_req_data has enough space for all
+ * possible PTLs.
+ */
+ union {
+ void *ptl_req_ptr; /* when used by ptl as pointer */
+ uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */
+ };
+};
+
+/* Hash a 64-bit tag word with the SSE4.2 CRC32 instruction. */
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_64(uint64_t a))
+{
+ return _mm_crc32_u64(0, a);
+}
+/* Hash a 32-bit tag word with the SSE4.2 CRC32 instruction. */
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_32(uint32_t a))
+{
+ return _mm_crc32_u32(0, a);
+}
+
+void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars);
+MOCK_DCL_EPILOGUE(psmi_mq_mtucpy);
+void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars);
+
+#if defined(__x86_64__)
+void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars);
+#else
+#define psmi_mq_mtucpy_safe psmi_mq_mtucpy
+#endif
+
+/*
+ * Copy a "tiny" payload (optimized for 0-8 bytes, but any length works).
+ * 0/4/8 bytes are copied as whole words; 1-3 and 5-7 finish with a byte
+ * tail; anything larger falls back to psmi_mq_mtucpy().  The switch
+ * fallthroughs below are intentional.
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) {
+ /* NOTE(review): this inner branch is unreachable -- the outer
+ * condition already requires PSMI_IS_CUDA_ENABLED to be true */
+ if (!PSMI_IS_CUDA_ENABLED) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Please enable PSM CUDA support when using GPU buffer \n");
+ return;
+ }
+ PSMI_CUDA_CALL(cudaMemcpy, dest, src, len, cudaMemcpyDefault);
+ return;
+ }
+#endif
+ switch (len) {
+ case 8:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 0:
+ return;
+ case 7:
+ case 6:
+ case 5:
+ *dest++ = *src++;
+ len -= 4;
+ /* fallthrough: 1-3 bytes remain */
+ case 3:
+ case 2:
+ case 1:
+ break;
+ default: /* greater than 8 */
+ psmi_mq_mtucpy(dest, src, len);
+ return;
+ }
+ /* copy the 1-3 byte tail */
+ uint8_t *dest1 = (uint8_t *) dest;
+ uint8_t *src1 = (uint8_t *) src;
+ switch (len) {
+ case 3:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 2:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 1:
+ *dest1++ = *src1++;
+ }
+}
+
+#ifdef PSM_CUDA
+typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len);
+
+/*
+ * Host-memory-only variant of mq_copy_tiny(): used when both buffers are
+ * known to reside in host memory, so the per-call CUDA pointer checks can
+ * be skipped.  Fallthroughs are intentional, as in mq_copy_tiny().
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+ switch (len) {
+ case 8:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 4:
+ *dest++ = *src++;
+ /* fallthrough */
+ case 0:
+ return;
+ case 7:
+ case 6:
+ case 5:
+ *dest++ = *src++;
+ len -= 4;
+ /* fallthrough: 1-3 bytes remain */
+ case 3:
+ case 2:
+ case 1:
+ break;
+ default: /* greater than 8 */
+ /* Fix: stay on the host-memory path.  The generic
+ * psmi_mq_mtucpy() would redo the CUDA pointer checks
+ * this variant exists to avoid. */
+ psmi_mq_mtucpy_host_mem(dest, src, len);
+ return;
+ }
+ /* copy the 1-3 byte tail */
+ uint8_t *dest1 = (uint8_t *) dest;
+ uint8_t *src1 = (uint8_t *) src;
+ switch (len) {
+ case 3:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 2:
+ *dest1++ = *src1++;
+ /* fallthrough */
+ case 1:
+ *dest1++ = *src1++;
+ }
+}
+#endif
+
+/* Typedef describing a function to populate a psm2_mq_status(2)_t given a
+ * matched request. The purpose of this typedef is to avoid duplicating
+ * code to handle both PSM v1 and v2 status objects. Outer routines pass in
+ * either mq_status_copy or mq_status2_copy and the inner routine calls that
+ * provided routine to fill in the correct status type.
+ */
+typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status);
+
+/*
+ * Given an req with buffer ubuf of length ubuf_len,
+ * fill in the req's status and return the amount of bytes the request
+ * can receive.
+ *
+ * The function sets status truncation errors. Basically what MPI_Status does.
+ */
+/* Fill a PSM v1 status object from a matched/completed request.  Only the
+ * low 64 bits of the tag are exposed in the v1 status format. */
+PSMI_ALWAYS_INLINE(
+void
+mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status))
+{
+ status->context = req->context;
+ status->error_code = req->error_code;
+ status->nbytes = req->recv_msglen;
+ status->msg_length = req->send_msglen;
+ status->msg_tag = *((uint64_t *) req->tag.tag);
+}
+
+/* Fill a PSM v2 status object from a matched/completed request; unlike the
+ * v1 variant this also reports the peer and the full tag. */
+PSMI_ALWAYS_INLINE(
+void
+mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status))
+{
+ status->context = req->context;
+ status->error_code = req->error_code;
+ status->nbytes = req->recv_msglen;
+ status->msg_length = req->send_msglen;
+ status->msg_tag = req->tag;
+ status->msg_peer = req->peer;
+}
+
+/*
+ * Record send/recv lengths on a matched request, flagging PSM2_MQ_TRUNCATION
+ * when the posted buffer (recvlen) is smaller than the incoming message
+ * (sendlen).  Returns the number of bytes that will actually be received --
+ * essentially what MPI_Status conveys.
+ */
+PSMI_ALWAYS_INLINE(
+uint32_t
+mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen))
+{
+ int truncated = (recvlen < sendlen);
+ uint32_t nbytes = truncated ? recvlen : sendlen;
+
+ req->send_msglen = sendlen;
+ req->recv_msglen = nbytes;
+ req->error_code = truncated ? PSM2_MQ_TRUNCATION : PSM2_OK;
+ return nbytes;
+}
+
+/*
+ * Of up to four candidate requests (NULL slots skipped), return the index
+ * of the one with the smallest timestamp, or -1 if all four are NULL.
+ */
+PSMI_ALWAYS_INLINE(
+int
+min_timestamp_4(psm2_mq_req_t *match))
+{
+ uint64_t best_ts = (uint64_t)-1;
+ int best_idx = -1;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (match[i] == NULL)
+ continue;
+ if (match[i]->timestamp < best_ts) {
+ best_ts = match[i]->timestamp;
+ best_idx = i;
+ }
+ }
+ return best_idx;
+}
+
+/* Append req to the tail of flat queue q (the PSM2_ANYTAG_ANYSRC sublist)
+ * and record the back-pointer.  The debug build uses a macro instead of an
+ * inline so __FILE__/__LINE__ in the trace point at the caller; it also
+ * asserts the request is not internal-only. */
+#ifndef PSM_DEBUG
+/*! Append to Queue */
+PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req))
+{
+ req->next[PSM2_ANYTAG_ANYSRC] = NULL;
+ req->prev[PSM2_ANYTAG_ANYSRC] = q->last;
+ if (q->last)
+ q->last->next[PSM2_ANYTAG_ANYSRC] = req;
+ else
+ q->first = req;
+ q->last = req;
+ req->q[PSM2_ANYTAG_ANYSRC] = q;
+}
+#else
+#define mq_qq_append(qq, req) \
+ do { \
+ psmi_assert_req_not_internal(req); \
+ (req)->next[PSM2_ANYTAG_ANYSRC] = NULL; \
+ (req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last; \
+ if ((qq)->last) \
+ (qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req); \
+ else \
+ (qq)->first = (req); \
+ (qq)->last = (req); \
+ (req)->q[PSM2_ANYTAG_ANYSRC] = (qq); \
+ if (qq == &(req)->mq->completed_q) \
+ _HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \
+ (req), __FILE__, __LINE__); \
+ } while (0)
+#endif
+/* Append req to the tail of hash bucket q[table][bucket], linking through
+ * the per-table next/prev slot and recording the back-pointer. */
+PSMI_ALWAYS_INLINE(
+void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS],
+ int table, int bucket, psm2_mq_req_t req))
+{
+ req->next[table] = NULL;
+ req->prev[table] = q[table][bucket].last;
+ if (q[table][bucket].last)
+ q[table][bucket].last->next[table] = req;
+ else
+ q[table][bucket].first = req;
+ q[table][bucket].last = req;
+ req->q[table] = &q[table][bucket];
+}
+/* Unlink req from flat queue q (PSM2_ANYTAG_ANYSRC sublist only).  The
+ * req->q[] back-pointer is left unchanged here. */
+PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req))
+{
+ if (req->next[PSM2_ANYTAG_ANYSRC] != NULL)
+ req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] =
+ req->prev[PSM2_ANYTAG_ANYSRC];
+ else
+ q->last = req->prev[PSM2_ANYTAG_ANYSRC];
+ if (req->prev[PSM2_ANYTAG_ANYSRC])
+ req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] =
+ req->next[PSM2_ANYTAG_ANYSRC];
+ else
+ q->first = req->next[PSM2_ANYTAG_ANYSRC];
+}
+/* Unlink req from whichever queue its per-table back-pointer names, and
+ * clear that back-pointer. */
+PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table))
+{
+ struct mqq *q = req->q[table];
+
+ req->q[table] = NULL;
+ if (req->next[table] != NULL)
+ req->next[table]->prev[table] = req->prev[table];
+ else
+ q->last = req->prev[table];
+ if (req->prev[table])
+ req->prev[table]->next[table] = req->next[table];
+ else
+ q->first = req->next[table];
+}
+
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq);
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq);
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type);
+MOCK_DCL_EPILOGUE(psmi_mq_req_alloc);
+#define psmi_mq_req_free(req) psmi_mpool_put(req)
+
+/*
+ * Main receive progress engine, for shmops and hfi, in mq.c
+ */
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo);
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq);
+
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq);
+MOCK_DCL_EPILOGUE(psmi_mq_free);
+
+/* Three functions that handle all MQ stuff */
+#define MQ_RET_MATCH_OK 0
+#define MQ_RET_UNEXP_OK 1
+#define MQ_RET_UNEXP_NO_RESOURCES 2
+#define MQ_RET_DATA_OK 3
+#define MQ_RET_DATA_OUT_OF_ORDER 4
+
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req);
+int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+ uint32_t offset, const void *payload, uint32_t paylen);
+int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t msglen, const void *payload, uint32_t paylen,
+ int msgorder, mq_rts_callback_fn_t cb,
+ psm2_mq_req_t *req_o);
+int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t msglen, uint32_t offset,
+ const void *payload, uint32_t paylen, int msgorder,
+ uint32_t opcode, psm2_mq_req_t *req_o);
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req);
+
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn);
+
+void psmi_mq_fastpath_disable(psm2_mq_t mq);
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq);
+
+/* Scan the out-of-order queue for a request whose ptl_req_ptr matches
+ * msgctl and whose sequence number matches msg_seqnum; on a hit the
+ * request is removed from the queue and returned, otherwise NULL. */
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum))
+{
+ psm2_mq_req_t *curp;
+ psm2_mq_req_t cur;
+
+ for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) {
+ if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) {
+ /* match! */
+ mq_qq_remove(q, cur);
+ return cur;
+ }
+ }
+ return NULL; /* no match */
+}
+
+/* Walk the eager queue looking for an in-flight request from the given
+ * peer with the given sequence number; the entry is NOT removed.
+ * Returns NULL when nothing matches. */
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum))
+{
+ psm2_mq_req_t req;
+
+ for (req = STAILQ_FIRST(&mq->eager_q); req != NULL;
+ req = STAILQ_NEXT(req, nextq)) {
+ if (req->ptl_req_ptr == peer && req->msg_seqnum == msg_seqnum)
+ return req;
+ }
+ return NULL; /* no match */
+}
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+ psm_mq_unexpected_callback_fn_t fn);
+#endif
+
+/* Account a completed rendezvous transfer in the owning MQ's statistics:
+ * the send-side counters for send requests, the user-receive counters
+ * otherwise. */
+PSMI_ALWAYS_INLINE(void psmi_mq_stats_rts_account(psm2_mq_req_t req))
+{
+ psm2_mq_t mq = req->mq;
+ if (MQE_TYPE_IS_SEND(req->type)) {
+ mq->stats.tx_num++;
+ mq->stats.tx_rndv_num++;
+ mq->stats.tx_rndv_bytes += req->send_msglen;
+ } else {
+ mq->stats.rx_user_num++;
+ mq->stats.rx_user_bytes += req->recv_msglen;
+ }
+ return;
+}
+
+#endif
diff --git a/psm_mq_recv.c b/psm_mq_recv.c
new file mode 100644
index 0000000..3217714
--- /dev/null
+++ b/psm_mq_recv.c
@@ -0,0 +1,593 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "ptl_ips/ips_proto_header.h"
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+ psm_mq_unexpected_callback_fn_t fn)
+{
+ psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback;
+ mq->unexpected_callback = fn;
+ return old_fn;
+}
+#endif
+
+/*
+ * Finish a rendezvous request: account it in the MQ stats, mark it
+ * COMPLETE, and (unless it is internal-only) append it to the completed
+ * queue so psm2_mq_test/wait can observe it.
+ */
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req)
+{
+ psm2_mq_t mq = req->mq;
+
+ /* Stats on rendez-vous messages */
+ psmi_mq_stats_rts_account(req);
+ req->state = MQ_STATE_COMPLETE;
+ /* barrier: publish the COMPLETE state before the request becomes
+ * visible on the completed queue */
+ ips_barrier();
+ if(!psmi_is_req_internal(req))
+ mq_qq_append(&mq->completed_q, req);
+#ifdef PSM_VALGRIND
+ if (MQE_TYPE_IS_RECV(req->type))
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ req->recv_msglen);
+ else
+ VALGRIND_MAKE_MEM_DEFINED(req->buf, req->buf_len);
+#endif
+ _HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n",
+ req, req->recv_msglen);
+ return;
+}
+
+/*
+ * Copy an incoming fragment of nbytes at message offset into the request's
+ * buffer, clamping the copy to recv_msglen (truncated receives).  Always
+ * advances send_msgoff by the full fragment size -- even for dropped data --
+ * so completion detection in psmi_mq_handle_data() stays correct.
+ */
+static void
+psmi_mq_req_copy(psm2_mq_req_t req,
+ uint32_t offset, const void *buf, uint32_t nbytes)
+{
+ /* recv_msglen may be changed by unexpected receive buf. */
+ uint32_t msglen_this, end;
+ uint8_t *msgptr = (uint8_t *) req->buf + offset;
+
+ /* out of receiving range. */
+ if (offset >= req->recv_msglen) {
+ req->send_msgoff += nbytes;
+ return;
+ }
+
+ end = offset + nbytes;
+ if (end > req->recv_msglen) {
+ msglen_this = req->recv_msglen - offset;
+ end = req->recv_msglen;
+ } else {
+ msglen_this = nbytes;
+ }
+
+ VALGRIND_MAKE_MEM_DEFINED(msgptr, msglen_this);
+ psmi_mq_mtucpy(msgptr, buf, msglen_this);
+
+ /* fragments can arrive out of order; only advance the high-water mark */
+ if (req->recv_msgoff < end) {
+ req->recv_msgoff = end;
+ }
+
+ req->send_msgoff += nbytes;
+ return;
+}
+
+/*
+ * Absorb a data fragment for an eager message already matched (or queued
+ * unexpected).  When the last fragment arrives the request is taken off
+ * the eager queue and completed.  Returns MQ_RET_MATCH_OK for matched
+ * requests, MQ_RET_UNEXP_OK for unexpected ones.
+ */
+int
+psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+ uint32_t offset, const void *buf, uint32_t nbytes)
+{
+ psmi_assert(req != NULL);
+ int rc;
+
+ if (req->state == MQ_STATE_MATCHED)
+ rc = MQ_RET_MATCH_OK;
+ else {
+ psmi_assert(req->state == MQ_STATE_UNEXP);
+ rc = MQ_RET_UNEXP_OK;
+ }
+
+ psmi_mq_req_copy(req, offset, buf, nbytes);
+
+ /*
+ * the reason to use >= is because send_msgoff
+ * may be DW pad included.
+ */
+ if (req->send_msgoff >= req->send_msglen) {
+ if (req->type & MQE_TYPE_EAGER_QUEUE) {
+ STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq);
+ }
+
+ if (req->state == MQ_STATE_MATCHED) {
+ req->state = MQ_STATE_COMPLETE;
+ /* publish COMPLETE before queue append (see
+ * psmi_mq_handle_rts_complete) */
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ } else { /* MQ_STATE_UNEXP */
+ req->state = MQ_STATE_COMPLETE;
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * File a new unexpected request: always onto the flat unexpected queue,
+ * and -- once the fastpath is off -- also into the three pattern hash
+ * tables using the hash values mq_req_match() stashed in mq->hashvals.
+ * The fastpath itself is disabled here when the flat list reaches
+ * HASH_THRESHOLD entries.
+ */
+static
+void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
+{
+ int table;
+ mq_qq_append(&mq->unexpected_q, req);
+ /* NOTE(review): mq_qq_append() already sets this back-pointer; the
+ * store below is redundant but harmless */
+ req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q;
+ mq->unexpected_list_len++;
+ if_pt (mq->nohash_fastpath) {
+ if_pf (mq->unexpected_list_len >= HASH_THRESHOLD)
+ psmi_mq_fastpath_disable(mq);
+ return;
+ }
+
+ for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+ mq_qq_append_which(mq->unexpected_htab,
+ table, mq->hashvals[table], req);
+ mq->unexpected_hash_len++;
+}
+
+
+/*
+ * Scan one sublist for the oldest request matching (src, tag) under each
+ * entry's tagsel mask, considering only entries older than *time_threshold.
+ * On a hit *time_threshold is lowered to the winner's timestamp, so
+ * successive calls across sublists converge on the globally oldest match.
+ */
+psm2_mq_req_t
+mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold)
+{
+ psm2_mq_req_t *curp, cur;
+
+ for (curp = &q->first;
+ ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold);
+ curp = &cur->next[which]) {
+ /* a request matches when its peer is ANY_ADDR or equal to src,
+ * and the tag agrees on every bit selected by tagsel */
+ if ((cur->peer == PSM2_MQ_ANY_ADDR || src == cur->peer) &&
+ !((tag->tag[0] ^ cur->tag.tag[0]) & cur->tagsel.tag[0]) &&
+ !((tag->tag[1] ^ cur->tag.tag[1]) & cur->tagsel.tag[1]) &&
+ !((tag->tag[2] ^ cur->tag.tag[2]) & cur->tagsel.tag[2])) {
+ *time_threshold = cur->timestamp;
+ return cur;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Find the oldest posted (expected) request matching (src, tag).  On the
+ * nohash fastpath only the flat list is scanned; otherwise all three hash
+ * buckets plus the flat list are scanned and the oldest of the candidates
+ * wins.  When remove is non-zero the winner is unlinked and the fastpath
+ * may be re-enabled.  The computed hash values are cached in mq->hashvals
+ * for a subsequent mq_add_to_unexpected_hashes().  Returns NULL when
+ * nothing matches.
+ */
+psm2_mq_req_t
+mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove)
+{
+ psm2_mq_req_t match[4];
+ int table;
+ uint64_t best_ts = -1;
+
+ if (mq->nohash_fastpath) {
+ table = PSM2_ANYTAG_ANYSRC;
+ match[table] =
+ mq_list_scan(&mq->expected_q,
+ src, tag, PSM2_ANYTAG_ANYSRC, &best_ts);
+ if (match[table] && remove) {
+ mq->expected_list_len--;
+ mq_qq_remove_which(match[table], table);
+ }
+ return match[table];
+ }
+
+ mq->hashvals[PSM2_TAG_SRC] = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS;
+ mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
+ mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
+
+ for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+ match[table] =
+ mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]],
+ src, tag, table, &best_ts);
+ table = PSM2_ANYTAG_ANYSRC;
+ match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts);
+
+ /* pick the oldest candidate across all four sublists */
+ table = min_timestamp_4(match);
+ if (table == -1)
+ return NULL;
+
+ if (remove) {
+ if_pt (table == PSM2_ANYTAG_ANYSRC)
+ mq->expected_list_len--;
+ else
+ mq->expected_hash_len--;
+ mq_qq_remove_which(match[table], table);
+ psmi_mq_fastpath_try_reenable(mq);
+ }
+ return match[table];
+}
+/*
+ * Handle a rendezvous (RTS) MPI envelope; the packet may carry the whole
+ * message payload or none of it.  Returns MQ_RET_MATCH_OK when a posted
+ * receive matched, MQ_RET_UNEXP_NO_RESOURCES to ask the caller to retry
+ * the packet later (first out-of-order attempt), or MQ_RET_UNEXP_OK when
+ * a new unexpected request was queued (*req_o set; rts_callback recorded).
+ */
+int
+psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t send_msglen, const void *payload, uint32_t paylen,
+ int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req;
+ uint32_t msglen;
+ int rc;
+
+ PSMI_LOCK_ASSERT(mq->progress_lock);
+
+ if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+ /* we have a match, no need to callback */
+ msglen = mq_set_msglen(req, req->buf_len, send_msglen);
+ /* reset send_msglen because sender only sends this many */
+ req->send_msglen = msglen;
+ req->state = MQ_STATE_MATCHED;
+ req->peer = src;
+ req->tag = *tag;
+
+ if (paylen > msglen) paylen = msglen;
+ if (paylen) {
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ }
+ req->recv_msgoff = req->send_msgoff = paylen;
+ *req_o = req; /* yes match */
+ PSM_LOG_EPM(OPCODE_LONG_RTS,PSM_LOG_EPM_RX,src->epid,mq->ep->epid,
+ "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+ rc = MQ_RET_MATCH_OK;
+ } else if (msgorder > 1) {
+ /* There is NO request match, and this is the first time
+ * to try to process this packet, we leave the packet in
+ * hardware queue for retry in hope there is a request
+ * match next time, this is for performance
+ * consideration.
+ */
+ rc = MQ_RET_UNEXP_NO_RESOURCES;
+ } else { /* No match, keep track of callback */
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ psmi_assert(req != NULL);
+ /* We don't know recv_msglen yet but we set it here for
+ * mq_iprobe */
+ req->send_msglen = req->recv_msglen = send_msglen;
+ PSM_LOG_EPM_COND(req->send_msglen > mq->hfi_thresh_rv,
+ OPCODE_LONG_RTS,PSM_LOG_EPM_RX,src->epid,mq->ep->epid,
+ "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+ req->state = MQ_STATE_UNEXP_RV;
+ req->peer = src;
+ req->tag = *tag;
+ req->rts_callback = cb;
+ /* stash any piggybacked payload in a system buffer */
+ if (paylen > send_msglen) paylen = send_msglen;
+ if (paylen) {
+ req->buf = psmi_mq_sysbuf_alloc(mq, paylen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ }
+ req->recv_msgoff = req->send_msgoff = paylen;
+
+ if (msgorder) {
+ mq_add_to_unexpected_hashes(mq, req);
+ }
+ /* caller will handle out of order case */
+ *req_o = req; /* no match, will callback */
+ rc = MQ_RET_UNEXP_OK;
+ }
+
+#ifdef PSM_DEBUG
+ if (req)
+ _HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d "
+ "sendlen=%d errcode=%d\n",
+ rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+ psmi_epaddr_get_name(src->epid),
+ req->tag.tag[0], req->tag.tag[1], req->tag.tag[2],
+ req->recv_msglen, req->send_msglen, req->error_code);
+ else
+ _HFI_VDBG("match=%s (req=%p) src=%s\n",
+ rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+ psmi_epaddr_get_name(src->epid));
+#endif /* #ifdef PSM_DEBUG */
+ return rc;
+}
+
+/*
+ * Handle a regular (i.e. non-rendezvous) MPI envelope: TINY, SHORT, or
+ * the first packet of an EAGER message.  Returns MQ_RET_MATCH_OK when a
+ * posted receive matched (request possibly completed on the spot),
+ * MQ_RET_UNEXP_NO_RESOURCES to ask the caller to retry the packet later,
+ * or MQ_RET_UNEXP_OK when a new unexpected request was queued.
+ */
+int
+psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+ uint32_t send_msglen, uint32_t offset,
+ const void *payload, uint32_t paylen, int msgorder,
+ uint32_t opcode, psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req;
+ uint32_t msglen;
+
+ if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+ /* we have a match */
+ psmi_assert(MQE_TYPE_IS_RECV(req->type));
+ req->peer = src;
+ req->tag = *tag;
+ msglen = mq_set_msglen(req, req->buf_len, send_msglen);
+
+ _HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x"
+ " msglen=%d paylen=%d\n", req, opcode,
+ psmi_epaddr_get_name(src->epid),
+ tag->tag[0], tag->tag[1], tag->tag[2], msglen,
+ paylen);
+
+ switch (opcode) {
+ case MQ_MSG_TINY:
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ msglen);
+ /* mq_copy_tiny() can handle zero byte */
+ mq_copy_tiny((uint32_t *) req->buf,
+ (uint32_t *) payload, msglen);
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_MSG_SHORT: /* message fits in 1 payload */
+ PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
+ msglen);
+ if (msglen <= paylen) {
+ psmi_mq_mtucpy(req->buf, payload, msglen);
+ } else {
+ psmi_assert((msglen & ~0x3) == paylen);
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ /*
+ * there are nonDW bytes attached in header,
+ * copy after the DW payload.
+ */
+ mq_copy_tiny((uint32_t *)(req->buf+paylen),
+ (uint32_t *)&offset, msglen & 0x3);
+ }
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ mq_qq_append(&mq->completed_q, req);
+ break;
+
+ case MQ_MSG_EAGER:
+ /* multi-packet message: park on the eager queue until
+ * the remaining fragments arrive */
+ req->state = MQ_STATE_MATCHED;
+ req->type |= MQE_TYPE_EAGER_QUEUE;
+ req->send_msgoff = req->recv_msgoff = 0;
+ STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+ _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n",
+ msglen, paylen);
+ if (paylen > 0)
+ psmi_mq_handle_data(mq, req, offset, payload,
+ paylen);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Internal error, unknown packet 0x%x",
+ opcode);
+ }
+
+ mq->stats.rx_user_bytes += msglen;
+ mq->stats.rx_user_num++;
+
+ *req_o = req; /* yes match */
+ return MQ_RET_MATCH_OK;
+ }
+
+ /* unexpected message or out of order message. */
+
+#if 0
+ /*
+ * Keep a callback here in case we want to fit some other high-level
+ * protocols over MQ (i.e. shmem). These protocols would bypass the
+ * normal message handling and go to higher-level message handlers.
+ */
+ if (msgorder && mq->unexpected_callback) {
+ mq->unexpected_callback(mq, opcode, epaddr, tag, send_msglen,
+ payload, paylen);
+ *req_o = NULL;
+ return MQ_RET_UNEXP_OK;
+ }
+#endif
+
+ if (msgorder > 1) {
+ /* There is NO request match, and this is the first time
+ * to try to process this packet, we leave the packet in
+ * hardware queue for retry in hope there is a request
+ * match next time, this is for performance
+ * consideration.
+ */
+ return MQ_RET_UNEXP_NO_RESOURCES;
+ }
+
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+ psmi_assert(req != NULL);
+
+ req->peer = src;
+ req->tag = *tag;
+ req->recv_msgoff = 0;
+ req->recv_msglen = req->send_msglen = req->buf_len = msglen =
+ send_msglen;
+
+ _HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x"
+ " send_msglen=%d\n", req, opcode,
+ psmi_epaddr_get_name(src->epid),
+ tag->tag[0], tag->tag[1], tag->tag[2], send_msglen);
+
+ /* unexpected path: buffer the payload in a system buffer until a
+ * matching receive is posted */
+ switch (opcode) {
+ case MQ_MSG_TINY:
+ if (msglen > 0) {
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ mq_copy_tiny((uint32_t *) req->buf,
+ (uint32_t *) payload, msglen);
+ } else
+ req->buf = NULL;
+ req->state = MQ_STATE_COMPLETE;
+ break;
+
+ case MQ_MSG_SHORT:
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ if (msglen <= paylen) {
+ psmi_mq_mtucpy(req->buf, payload, msglen);
+ } else {
+ psmi_assert((msglen & ~0x3) == paylen);
+ psmi_mq_mtucpy(req->buf, payload, paylen);
+ /*
+ * there are nonDW bytes attached in header,
+ * copy after the DW payload.
+ */
+ mq_copy_tiny((uint32_t *)(req->buf+paylen),
+ (uint32_t *)&offset, msglen & 0x3);
+ }
+ req->state = MQ_STATE_COMPLETE;
+ break;
+
+ case MQ_MSG_EAGER:
+ req->send_msgoff = 0;
+ req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += paylen;
+ req->state = MQ_STATE_UNEXP;
+ req->type |= MQE_TYPE_EAGER_QUEUE;
+ STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+ _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n",
+ msglen, paylen);
+ if (paylen > 0)
+ psmi_mq_handle_data(mq, req, offset, payload, paylen);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Internal error, unknown packet 0x%x",
+ opcode);
+ }
+
+ mq->stats.rx_sys_bytes += msglen;
+ mq->stats.rx_sys_num++;
+
+ if (msgorder) {
+ mq_add_to_unexpected_hashes(mq, req);
+ }
+ /* caller will handle out of order case */
+ *req_o = req; /* no match, will callback */
+ return MQ_RET_UNEXP_OK;
+}
+
+/*
+ * Re-process an unexpected request (ureq) that has now reached its in-order
+ * position: try to match it against a posted (expected) receive.
+ *
+ * If no posted receive matches, ureq is parked on the unexpected hashes and
+ * kept alive.  Otherwise its payload/state is transferred to the matched
+ * expected request (ereq) and ureq is freed.  Always returns 0.
+ */
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+{
+	psm2_mq_req_t ereq;
+	uint32_t msglen;
+
+	ereq = mq_req_match(mq, ureq->peer, &ureq->tag, 1);
+	if (ereq == NULL) {
+		/* No posted receive matches: keep ureq as unexpected. */
+		mq_add_to_unexpected_hashes(mq, ureq);
+		return 0;
+	}
+
+	psmi_assert(MQE_TYPE_IS_RECV(ereq->type));
+	ereq->peer = ureq->peer;
+	ereq->tag = ureq->tag;
+	msglen = mq_set_msglen(ereq, ereq->buf_len, ureq->send_msglen);
+
+	switch (ureq->state) {
+	case MQ_STATE_COMPLETE:
+		/* Whole message already landed in a system buffer: copy it
+		 * into the user buffer and complete the expected request. */
+		if (ureq->buf != NULL) {	/* 0-byte don't alloc a sysbuf */
+			psmi_mq_mtucpy(ereq->buf, (const void *)ureq->buf,
+				       msglen);
+			psmi_mq_sysbuf_free(mq, ureq->buf);
+		}
+		ereq->state = MQ_STATE_COMPLETE;
+		ips_barrier();
+		mq_qq_append(&mq->completed_q, ereq);
+		break;
+	case MQ_STATE_UNEXP:	/* not done yet */
+		/* Eager data still arriving: hand over matching state so the
+		 * remaining fragments land directly in the user buffer. */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->msg_seqnum = ureq->msg_seqnum;
+		ereq->ptl_req_ptr = ureq->ptl_req_ptr;
+		ereq->send_msgoff = ureq->send_msgoff;
+		/* Never copy past the posted buffer length. */
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->recv_msgoff) {
+			psmi_mq_mtucpy(ereq->buf,
+				       (const void *)ureq->buf,
+				       ereq->recv_msgoff);
+		}
+		psmi_mq_sysbuf_free(mq, ureq->buf);
+		ereq->type = ureq->type;
+		/* Replace ureq by ereq at the same position in eager_q:
+		 * insert after, then remove the old node. */
+		STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq);
+		STAILQ_REMOVE(&mq->eager_q, ureq, psm2_mq_req, nextq);
+		break;
+	case MQ_STATE_UNEXP_RV:	/* rendez-vous ... */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->rts_peer = ureq->rts_peer;
+		ereq->rts_sbuf = ureq->rts_sbuf;
+		ereq->send_msgoff = ureq->send_msgoff;
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->recv_msgoff) {
+			psmi_mq_mtucpy(ereq->buf,
+				       (const void *)ureq->buf,
+				       ereq->recv_msgoff);
+		}
+		/* send_msgoff != 0 appears to mean a sysbuf was used for the
+		 * eager prefix of the rendezvous — TODO confirm. */
+		if (ereq->send_msgoff) {
+			psmi_mq_sysbuf_free(mq, ureq->buf);
+		}
+		ereq->rts_callback = ureq->rts_callback;
+		ereq->rts_reqidx_peer = ureq->rts_reqidx_peer;
+		ereq->type = ureq->type;
+		/* Kick the rendezvous protocol for the matched request. */
+		ereq->rts_callback(ereq, 0);
+		break;
+	default:
+		fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state,
+			ureq);
+		fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+			ureq->type, ureq->mq, ureq->tag.tag[0],
+			ureq->tag.tag[1], ureq->tag.tag[2]);
+		abort();
+	}
+
+	psmi_mq_req_free(ureq);
+	return 0;
+}
diff --git a/psm_mq_utils.c b/psm_mq_utils.c
new file mode 100644
index 0000000..ff8a52a
--- /dev/null
+++ b/psm_mq_utils.c
@@ -0,0 +1,273 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * MQ request allocator
+ *
+ */
+
+/*
+ * Allocate an MQ request descriptor of the given type (MQE_TYPE_SEND or
+ * MQE_TYPE_RECV) from the corresponding mpool and reset it to a clean state.
+ * Pool exhaustion is treated as a fatal user/configuration error via
+ * psmi_handle_error(PSMI_EP_NORETURN, ...).
+ */
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
+{
+	psm2_mq_req_t req;
+
+	psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND);
+
+	if (type == MQE_TYPE_SEND)
+		req = psmi_mpool_get(mq->sreq_pool);
+	else
+		req = psmi_mpool_get(mq->rreq_pool);
+
+	if_pt(req != NULL) {
+		/* A while ago there were issues about forgetting to zero-out parts of the
+		 * structure, I'm leaving this as a debug-time option */
+#ifdef PSM_DEBUG
+		memset(req, 0, sizeof(struct psm2_mq_req));
+#endif
+		req->type = type;
+		req->state = MQ_STATE_FREE;
+		/* Clear all sublist linkage so the request sits on no queue. */
+		memset(req->next, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->prev, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->q, 0, NUM_MQ_SUBLISTS * sizeof(struct mqq *));
+		req->error_code = PSM2_OK;
+		req->mq = mq;
+		req->testwait_callback = NULL;
+		req->rts_peer = NULL;
+		req->peer = NULL;
+		req->ptl_req_ptr = NULL;
+		req->flags = 0;
+		return req;
+	} else { /* we're out of reqs */
+		int issend = (type == MQE_TYPE_SEND);
+		uint32_t reqmax, reqchunk;
+		psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool,
+					&reqchunk, &reqmax);
+
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR,
+				  "Exhausted %d MQ %s request descriptors, which usually indicates "
+				  "a user program error or insufficient request descriptors (%s=%d)",
+				  reqmax, issend ? "isend" : "irecv",
+				  issend ? "PSM2_MQ_SENDREQS_MAX" :
+				  "PSM2_MQ_RECVREQS_MAX", reqmax);
+		/* Presumably unreachable given PSMI_EP_NORETURN — TODO confirm. */
+		return NULL;
+	}
+}
+MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
+
+#ifdef PSM_CUDA
+/*
+ * mpool element-lifecycle callback for the receive-request pool:
+ * creates (is_alloc != 0) or destroys the per-request CUDA IPC event.
+ * 'context' is unused.
+ */
+void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
+	psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
+	if (is_alloc)
+		PSMI_CUDA_CALL(cudaEventCreate, &recvreq->cuda_ipc_event);
+	else
+		PSMI_CUDA_CALL(cudaEventDestroy, recvreq->cuda_ipc_event);
+	return;
+}
+#endif
+
+/*
+ * Create the send and receive MQ request mpools and warm each one up with a
+ * single alloc/free.  Pool limits come from MQ_SENDREQ_LIMITS /
+ * MQ_RECVREQ_LIMITS, possibly overridden via psmi_parse_mpool_env().
+ * Returns PSM2_OK on success or PSM2_NO_MEMORY / env-parse error on failure.
+ */
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
+{
+	psm2_mq_req_t warmup_req;
+	psm2_error_t err = PSM2_OK;
+
+	_HFI_VDBG("mq element sizes are %d bytes\n",
+		  (int)sizeof(struct psm2_mq_req));
+
+	/*
+	 * Send MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		if ((mq->sreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Receive MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+		/* The receive-request mpool gets a callback that creates
+		 * and destroys a CUDA event per request element.
+		 */
+#ifdef PSM_CUDA
+		if (PSMI_IS_CUDA_ENABLED) {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
+							maxsz, 0, DESCRIPTORS, NULL,
+							NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+		else {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+					       maxsz, 0, DESCRIPTORS, NULL,
+					       NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+#else
+		if ((mq->rreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+#endif
+	}
+
+	/* Warm up the allocators */
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+fail:
+	/* Success path also falls through here with err == PSM2_OK. */
+	return err;
+}
+
+/* Destroy the request pools created by psmi_mq_req_init(). */
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq)
+{
+	psmi_mpool_destroy(mq->rreq_pool);
+	psmi_mpool_destroy(mq->sreq_pool);
+	return PSM2_OK;
+}
+
+
+/*
+ * Hooks to plug into QLogic MPI stats
+ */
+
+/*
+ * mpspawn request callback: copy the current MQ statistics into
+ * args->stats.  Entry order must match the desc[] table set up in
+ * psmi_mq_stats_register().  If fewer than 8 slots were provided,
+ * nothing is written.
+ */
+static
+void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args)
+{
+	uint64_t *entry = args->stats;
+	psm2_mq_t mq = (psm2_mq_t) args->context;
+	psm2_mq_stats_t mqstats;
+
+	psm2_mq_get_stats(mq, &mqstats);
+
+	if (args->num < 8)
+		return;
+
+	entry[0] = mqstats.tx_eager_num;
+	entry[1] = mqstats.tx_eager_bytes;
+	entry[2] = mqstats.tx_rndv_num;
+	entry[3] = mqstats.tx_rndv_bytes;
+
+	entry[4] = mqstats.rx_user_num;
+	entry[5] = mqstats.rx_user_bytes;
+	entry[6] = mqstats.rx_sys_num;
+	entry[7] = mqstats.rx_sys_bytes;
+}
+
+/*
+ * Register the 8 MQ-level statistics with mpspawn via add_fn.  The desc[]
+ * order here must stay in sync with the entry[] order filled in by
+ * psmi_mq_stats_callback().  desc/flags live on the stack; add_fn is
+ * expected to consume (copy) them during the call.
+ */
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn)
+{
+	char *desc[8];
+	uint16_t flags[8];
+	int i;
+	struct mpspawn_stats_add_args mp_add;
+	/*
+	 * Hardcode flags until we correctly move mpspawn to its own repo.
+	 * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN;
+	 */
+	for (i = 0; i < 8; i++)
+		flags[i] = MPSPAWN_STATS_REDUCTION_ALL;
+
+	desc[0] = "Eager count sent";
+	desc[1] = "Eager bytes sent";
+	desc[2] = "Rendezvous count sent";
+	desc[3] = "Rendezvous bytes sent";
+	desc[4] = "Expected count received";
+	desc[5] = "Expected bytes received";
+	desc[6] = "Unexpect count received";
+	desc[7] = "Unexpect bytes received";
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = 8;
+	mp_add.header = "MPI Statistics Summary (max,min @ rank)";
+	mp_add.req_fn = psmi_mq_stats_callback;
+	mp_add.desc = desc;
+	mp_add.flags = flags;
+	mp_add.context = mq;
+
+	add_fn(&mp_add);
+}
diff --git a/psm_perf.c b/psm_perf.c
new file mode 100644
index 0000000..f3d7e94
--- /dev/null
+++ b/psm_perf.c
@@ -0,0 +1,246 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include "psm_user.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <asm/unistd.h>
+
+struct rdpmc_ctx global_rdpmc_ctx;
+
+u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+unsigned int global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE;
+unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+
+struct rdpmc_ctx {
+ int fd;
+ struct perf_event_mmap_page *buf;
+};
+
+typedef unsigned long long u64;
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#include "immintrin.h"
+#endif
+
+/**
+ * DOC: Ring 3 counting for CPU performance counters
+ *
+ * This library allows accessing CPU performance counters from ring 3
+ * using the perf_events subsystem. This is useful to measure specific
+ * parts of programs (e.g. excluding initialization code)
+ *
+ * Requires a Linux 3.3+ kernel
+ */
+
+/**
+ * rdpmc_open_attr - initialize a raw ring 3 readable performance counter
+ * @attr: perf struct %perf_event_attr for the counter
+ * @ctx: Pointer to struct %rdpmc_ctx that is initialized.
+ * @leader_ctx: context of group leader or NULL
+ *
+ * This allows more flexible setup with a custom &perf_event_attr.
+ * For simple uses rdpmc_open() should be used instead.
+ * Must be called for each thread using the counter.
+ * Must be closed with rdpmc_close()
+ */
+/* Returns 0 on success, -1 on failure (perf_event_open or mmap error,
+ * reported via perror).  On mmap failure the fd is closed before return. */
+PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx,
+		       struct rdpmc_ctx *leader_ctx))
+{
+	ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1,
+			  leader_ctx ? leader_ctx->fd : -1, 0);
+	if (ctx->fd < 0) {
+		perror("perf_event_open");
+		return -1;
+	}
+	/* Map the one-page self-monitoring area used by rdpmc_read(). */
+	ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0);
+	if (ctx->buf == MAP_FAILED) {
+		close(ctx->fd);
+		perror("mmap on perf fd");
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * rdpmc_open - initialize a simple ring 3 readable performance counter
+ * @counter: Raw event descriptor (UUEE UU unit mask EE event)
+ * @ctx: Pointer to struct &rdpmc_ctx that is initialized
+ *
+ * The counter will be set up to count CPU events excluding the kernel.
+ * Must be called for each thread using the counter.
+ * The caller must make sure counter is suitable for the running CPU.
+ * Only works in 3.3+ kernels.
+ * Must be closed with rdpmc_close()
+ */
+
+PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx))
+{
+	struct perf_event_attr attr = {
+		/* Heuristic: small values are treated as generic
+		 * PERF_TYPE_HARDWARE ids, larger ones as raw event codes
+		 * — TODO confirm the threshold choice of 10. */
+		.type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE,
+		.size = PERF_ATTR_SIZE_VER0,
+		.config = counter,
+		.sample_type = PERF_SAMPLE_READ,
+		.exclude_kernel = 1,
+	};
+	return rdpmc_open_attr(&attr, ctx, NULL);
+}
+
+/**
+ * rdpmc_close: free a ring 3 readable performance counter
+ * @ctx: Pointer to &rdpmc_ctx context.
+ *
+ * Must be called by each thread for each context it initialized.
+ */
+PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx))
+{
+	/* Release the fd and the one-page mmap made by rdpmc_open_attr(). */
+	close(ctx->fd);
+	munmap(ctx->buf, sysconf(_SC_PAGESIZE));
+}
+
+/**
+ * rdpmc_read: read a ring 3 readable performance counter
+ * @ctx: Pointer to initialized &rdpmc_ctx structure.
+ *
+ * Read the current value of a running performance counter.
+ */
+unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
+{
+	u64 val;
+	unsigned seq;
+	u64 offset = 0;
+
+	typeof (ctx->buf) buf = ctx->buf;
+	/* Seqlock-style retry loop against the kernel-updated mmap page:
+	 * re-read if buf->lock changed while we sampled. */
+	do {
+		seq = buf->lock;
+		ips_rmb();
+		/* index <= 0: event not currently scheduled on a PMU
+		 * counter; the kernel-maintained offset is the count. */
+		if (buf->index <= 0)
+			return buf->offset;
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+		val = _rdpmc(buf->index - 1);
+#else /* GCC */
+		val = __builtin_ia32_rdpmc(buf->index - 1);
+#endif
+		offset = buf->offset;
+		ips_rmb();
+	} while (buf->lock != seq);
+	return val + offset;
+}
+
+/*
+ * Initialize the global RDPMC counter context.  The event type/config can
+ * be overridden via the RDPMC_PERF_TYPE and RDPMC_PERF_CONFIG environment
+ * variables (hex); if either is missing, BOTH revert to the compiled-in
+ * defaults.  Exits the process if the counter cannot be opened.
+ */
+void psmi_rdpmc_perf_framework_init()
+{
+	int rdpmc_retval;
+
+	struct rdpmc_ctx *leader = NULL;
+
+	int env_result = 1;
+	char * env_type = NULL;
+	char * env_config = NULL;
+
+	env_type = getenv("RDPMC_PERF_TYPE");
+
+	if (env_type)
+	{
+		global_rdpmc_type = (int)strtoll(env_type, NULL, 16);
+	}
+	else
+	{
+		env_result = 0;
+	}
+
+	env_config = getenv("RDPMC_PERF_CONFIG");
+
+	if (env_config)
+	{
+		global_rdpmc_config = (int)strtoll(env_config, NULL, 16);
+	}
+	else
+	{
+		env_result = 0;
+	}
+
+	/* Both variables must be present for a custom event; otherwise
+	 * fall back to the defaults as a pair. */
+	if (env_result != 1)
+	{
+		global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE;
+		global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+	}
+
+	struct perf_event_attr attr = {
+		.type = global_rdpmc_type,
+		.size = sizeof(struct perf_event_attr),
+		.config = global_rdpmc_config,
+		.sample_type = PERF_SAMPLE_READ,
+	};
+
+	rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader);
+
+	if (rdpmc_retval < 0)
+	{
+		printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval);
+		exit(-1);
+	}
+}
+
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/psm_perf.h b/psm_perf.h
new file mode 100644
index 0000000..6fa06d2
--- /dev/null
+++ b/psm_perf.h
@@ -0,0 +1,142 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include <linux/perf_event.h>
+
+/* Configuration */
+
+#define RDPMC_PERF_DEFAULT_TYPE (PERF_TYPE_HARDWARE)
+#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
+
+#define RDPMC_PERF_MAX_SLOT_NUMBER (8)
+#define RDPMC_PERF_MAX_SLOT_NAME (256)
+
+/* RDPMC infrastructure */
+
+extern struct rdpmc_ctx global_rdpmc_ctx;
+
+typedef unsigned long long u64;
+
+extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+extern unsigned int global_rdpmc_type;
+extern unsigned int global_rdpmc_config;
+
+extern void psmi_rdpmc_perf_framework_init();
+
+extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx);
+
+/* Reset all measurement slots: sums, counts, begin values and names. */
+#define RDPMC_PERF_INIT() \
+{ \
+	int i; \
+	for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+	{ \
+		global_rdpmc_begin[i] = 0; \
+		global_rdpmc_summ[i] = 0; \
+		global_rdpmc_number[i] = 0; \
+		global_rdpmc_slot_name[i][0] = '\0'; \
+	} \
+}
+
+/* There is no slot_number max range check */
+
+/* Label a slot; name is truncated to RDPMC_PERF_MAX_SLOT_NAME-1 chars
+ * and always NUL-terminated. */
+#define RDPMC_PERF_SET_SLOT_NAME(slot_number, name) \
+{ \
+	strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \
+	global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0'; \
+}
+
+/* Snapshot the counter at the start of a measured region. */
+#define RDPMC_PERF_BEGIN(slot_number) \
+{ \
+	global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \
+}
+
+/* Accumulate the delta since the matching RDPMC_PERF_BEGIN and bump the
+ * sample count for the slot. */
+#define RDPMC_PERF_END(slot_number) \
+{ \
+	global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \
+	global_rdpmc_number[(slot_number)]++; \
+}
+
+/* Print the per-slot average (sum/count) for every named slot. */
+#define RDPMC_PERF_DUMP(stream) \
+{ \
+	int i; \
+	for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+	{ \
+		if (global_rdpmc_slot_name[i][0]) \
+		{ \
+			fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \
+				global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \
+				(double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \
+			fflush((stream)); \
+		} \
+	} \
+}
+
+#define GENERIC_PERF_INIT() RDPMC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number) RDPMC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number) RDPMC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream) RDPMC_PERF_DUMP(stream)
+#else /* RDPMC_PERF_FRAMEWORK */
+#define GENERIC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream)
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/psm_stats.c b/psm_stats.c
new file mode 100644
index 0000000..0015174
--- /dev/null
+++ b/psm_stats.c
@@ -0,0 +1,664 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+struct psmi_stats_type {
+ STAILQ_ENTRY(psmi_stats_type) next;
+ struct psmi_stats_entry *entries;
+
+ int num_entries;
+ void *heading;
+ uint32_t statstype;
+ void *context;
+};
+
+static STAILQ_HEAD(, psmi_stats_type) psmi_stats =
+STAILQ_HEAD_INITIALIZER(psmi_stats);
+
+/*
+ * Register a group of statistics entries under a heading.  The entry array
+ * is copied, so entries_i may live on the caller's stack; the heading
+ * string is NOT copied and must outlive the registration.
+ * Returns PSM2_OK, or an allocation error (PSMI_CHECKMEM presumably jumps
+ * to fail on NULL — confirm against the macro definition).
+ */
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries_i,
+			 int num_entries, void *context)
+{
+	struct psmi_stats_entry *entries;
+	struct psmi_stats_type *type;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	entries =
+	    psmi_calloc(PSMI_EP_NONE, STATS, num_entries,
+			sizeof(struct psmi_stats_entry));
+	type =
+	    psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type));
+	PSMI_CHECKMEM(err, entries);
+	PSMI_CHECKMEM(err, type);
+
+	type->entries = entries;
+	type->num_entries = num_entries;
+	type->statstype = statstype;
+	type->context = context;
+	type->heading = (char *)heading;
+
+	/* Copy the caller's entries; u.val aliases u.off in the union. */
+	for (i = 0; i < num_entries; i++) {
+		type->entries[i].desc = entries_i[i].desc;
+		type->entries[i].flags = entries_i[i].flags;
+		type->entries[i].getfn = entries_i[i].getfn;
+		type->entries[i].u.val = entries_i[i].u.val;
+	}
+
+	STAILQ_INSERT_TAIL(&psmi_stats, type, next);
+	return err;
+
+fail:
+	if (entries)
+		psmi_free(entries);
+	if (type)
+		psmi_free(type);
+	return err;
+}
+
+/* Free every registered stats type and its copied entry array. */
+psm2_error_t psmi_stats_deregister_all(void)
+{
+	struct psmi_stats_type *type;
+
+	/* Currently our mpi still reads stats after finalize so this isn't
+	 * safe yet */
+	while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) {
+		STAILQ_REMOVE_HEAD(&psmi_stats, next);
+		psmi_free(type->entries);
+		psmi_free(type);
+	}
+
+	return PSM2_OK;
+}
+
+/*
+ * Map a stats type name to its PSMI_STATSTYPE_* bit; returns 0 for unknown
+ * names.  Each strncasecmp length is strlen(literal)+1 so the terminating
+ * NUL participates in the comparison, i.e. these are exact
+ * (case-insensitive) matches, not prefix matches.
+ */
+static uint32_t typestring_to_type(const char *typestr)
+{
+	if (strncasecmp(typestr, "all", 4) == 0)
+		return PSMI_STATSTYPE_ALL;
+	else if (strncasecmp(typestr, "p2p", 4) == 0)
+		return PSMI_STATSTYPE_P2P;
+	else if (strncasecmp(typestr, "hfi", 6) == 0)
+		return PSMI_STATSTYPE_HFI;
+	else if (strncasecmp(typestr, "ips", 4) == 0)
+		return PSMI_STATSTYPE_IPSPROTO;
+	else if ((strncasecmp(typestr, "intr", 5) == 0) ||
+		 (strncasecmp(typestr, "thread", 7) == 0) ||
+		 (strncasecmp(typestr, "rcvthread", 10) == 0))
+		return PSMI_STATSTYPE_RCVTHREAD;
+	else if ((strncasecmp(typestr, "mq", 3) == 0) ||
+		 (strncasecmp(typestr, "mpi", 4) == 0))
+		return PSMI_STATSTYPE_MQ;
+	else if ((strncasecmp(typestr, "tid", 4) == 0) ||
+		 (strncasecmp(typestr, "tids", 5) == 0))
+		return PSMI_STATSTYPE_TIDS;
+	else if ((strncasecmp(typestr, "counter", 8) == 0) ||
+		 (strncasecmp(typestr, "counters", 9) == 0))
+		return PSMI_STATSTYPE_DEVCOUNTERS;
+	else if (strncasecmp(typestr, "devstats", 9) == 0)
+		return PSMI_STATSTYPE_DEVSTATS;
+	else if ((strncasecmp(typestr, "memory", 7) == 0) ||
+		 (strncasecmp(typestr, "alloc", 6) == 0) ||
+		 (strncasecmp(typestr, "malloc", 7) == 0))
+		return PSMI_STATSTYPE_MEMORY;
+	else
+		return 0;
+}
+
+/*
+ * Parse a list of stats type names separated by any of ',', '+', '.', '|'
+ * or ':' into an OR-ed mask of PSMI_STATSTYPE_* bits.  Tokens longer than
+ * 127 chars are truncated; unknown tokens contribute nothing.
+ */
+static uint32_t stats_parse_enabled_mask(const char *stats_string)
+{
+	char *b = (char *)stats_string;
+	char *e = b;
+	char buf[128];
+
+	uint32_t stats_enabled_mask = 0;
+
+	while (*e) {
+		b = e;
+		/* Advance e to the next delimiter or end of string. */
+		while (*e && *e != ',' && *e != '+' && *e != '.' &&
+		       *e != '|' && *e != ':')
+			e++;
+		if (e > b) {	/* something new to parse */
+			int len = ((e - b) > (sizeof(buf) - 1)) ?
+			    (sizeof(buf) - 1) : (e - b);
+			strncpy(buf, b, len);
+			buf[len] = '\0';	/* strncpy does not terminate here */
+			stats_enabled_mask |= typestring_to_type(buf);
+		}
+		if (*e)
+			e++;	/* skip delimiter */
+	}
+	return stats_enabled_mask;
+}
+
+/*
+ * mpspawn request callback: fill args->stats (args->num entries, which must
+ * equal the registered num_entries) for one stats type.
+ *   - Device counters/stats are read in bulk from the hfi layer into a
+ *     temporary array; each entry's u.off indexes into that array.  If the
+ *     hfi data is unavailable, every slot is set to MPSPAWN_NAN_U64.
+ *   - Memory stats read u.off offsets into psmi_stats_memory.
+ *   - Everything else uses the entry's getfn or dereferences u.val.
+ */
+static
+void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args)
+{
+	const struct psmi_stats_entry *entry;
+	struct psmi_stats_type *type = (struct psmi_stats_type *)args->context;
+	int i, num = args->num;
+	uint64_t *stats = args->stats;
+	uint64_t *c = NULL;
+	uint64_t *s = NULL;
+
+	psmi_assert(num == type->num_entries);
+
+	if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS ||
+	    type->statstype == PSMI_STATSTYPE_DEVSTATS) {
+		int unit_id = ((psm2_ep_t) type->context)->unit_id;
+		int portno = ((psm2_ep_t) type->context)->portnum;
+		uintptr_t off;
+		uint8_t *p = NULL;
+		int nc, npc, ns;
+		int nstats = hfi_get_stats_names_count();
+		int nctrs = hfi_get_ctrs_unit_names_count(unit_id);
+		int npctrs = hfi_get_ctrs_port_names_count(unit_id);
+
+		/* -1 counts mean the names are unavailable; skip the alloc. */
+		if (nctrs != -1 && npctrs != -1)
+			c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs + npctrs,
+					sizeof(uint64_t));
+		if (nstats != -1)
+			s = psmi_calloc(PSMI_EP_NONE, STATS, nstats,
+					sizeof(uint64_t));
+
+		/*
+		 * If hfifs is not loaded, we set NAN everywhere.  We don't want
+		 * stats to break just because 1 node didn't have hfi-stats
+		 */
+		if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) {
+			/* Unit counters first, then port counters appended. */
+			nc = hfi_get_ctrs_unit(unit_id, c, nctrs);
+			if (nc != -1 && nc == nctrs)
+				p = (uint8_t *) c;
+			if (nc == -1)
+				nc = 0;
+			npc =
+			    hfi_get_ctrs_port(unit_id, portno, c + nc, npctrs);
+			if (!p && npc > 0 && npc == npctrs)
+				p = (uint8_t *) c;
+		} else if (s != NULL) {
+			ns = hfi_get_stats(s, nstats);
+			if (ns != -1)
+				p = (uint8_t *) s;
+		}
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			if (p) {
+				off = (uintptr_t) entry->u.off;
+				stats[i] = *((uint64_t *) (p + off));
+			} else
+				stats[i] = MPSPAWN_NAN_U64;
+		}
+	} else if (type->statstype == PSMI_STATSTYPE_MEMORY) {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			stats[i] =
+			    *(uint64_t *) ((uintptr_t) &psmi_stats_memory +
+					   (uintptr_t) entry->u.off);
+		}
+	} else {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			if (entry->getfn != NULL)
+				stats[i] = entry->getfn(type->context);
+			else
+				stats[i] = *entry->u.val;
+		}
+	}
+
+	if (c != NULL)
+		psmi_free(c);
+	if (s != NULL)
+		psmi_free(s);
+}
+
+/*
+ * Register one stats type with mpspawn through add_fn.
+ *
+ * The desc/flags arrays are built on the stack (alloca) because mpspawn
+ * is only guaranteed to need them for the duration of the add_fn upcall.
+ * The add_fn return code is deliberately ignored: a failed registration
+ * only means mpspawn will not report these stats.
+ */
+static
+void
+stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn,
+      char *heading,
+      int num_entries,
+      struct psmi_stats_entry *entries,
+      mpspawn_stats_req_fn req_fn, void *context)
+{
+ int i;
+ struct mpspawn_stats_add_args mp_add;
+
+ mp_add.version = MPSPAWN_STATS_VERSION;
+ mp_add.num = num_entries;
+ mp_add.header = heading;
+ mp_add.req_fn = req_fn;
+ mp_add.context = context;
+
+ mp_add.desc = (char **)alloca(sizeof(char *) * num_entries);
+ psmi_assert_always(mp_add.desc != NULL);
+
+ /* fix: was sizeof(uint16_t *), over-allocating (the array element
+  * type is uint16_t, not a pointer) */
+ mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t) * num_entries);
+ psmi_assert_always(mp_add.flags != NULL);
+
+ for (i = 0; i < num_entries; i++) {
+ mp_add.desc[i] = (char *)entries[i].desc;
+ mp_add.flags[i] = entries[i].flags;
+ }
+
+ /* Ignore return code, doesn't matter to *us* if register failed */
+ add_fn(&mp_add);
+
+ return;
+}
+
+static void stats_register_hfi_counters(psm2_ep_t ep);
+static void stats_register_hfi_stats(psm2_ep_t ep);
+static void stats_register_mem_stats(psm2_ep_t ep);
+static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args
+ *args);
+
+/*
+ * Downcall from QLogic MPI into PSM, so we can register stats
+ *
+ * Parses the requested stats-type mask from args->stats_types, lets each
+ * PSM/hfi component register its stats internally, then forwards every
+ * enabled registered type to mpspawn via add_fn.  Always returns NULL.
+ */
+void *psmi_stats_register(struct mpspawn_stats_init_args *args)
+{
+ struct psmi_stats_type *type;
+ uint32_t statsmask;
+
+ /*
+ * Args has a version string in it, but we can ignore it since mpspawn
+ * will decide if it supports *our* version
+ */
+
+ /*
+ * Eventually, parse the stats_types to add various "flavours" of stats
+ */
+ if (args->stats_types == NULL)
+ return NULL;
+
+ statsmask = stats_parse_enabled_mask(args->stats_types);
+
+ /* MQ (MPI-level) statistics */
+ if (statsmask & PSMI_STATSTYPE_MQ)
+ psmi_mq_stats_register(args->mq, args->add_fn);
+
+ /* PSM and hfi level statistics */
+ if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS)
+ stats_register_hfi_counters(args->mq->ep);
+
+ if (statsmask & PSMI_STATSTYPE_DEVSTATS)
+ stats_register_hfi_stats(args->mq->ep);
+
+ if (statsmask & PSMI_STATSTYPE_MEMORY)
+ stats_register_mem_stats(args->mq->ep);
+
+ /*
+ * At this point all PSM and hfi-level components have registered stats
+ * with the PSM stats interface. We register with the mpspawn stats
+ * interface with an upcall in add_fn
+ */
+ STAILQ_FOREACH(type, &psmi_stats, next) {
+ if (type->statstype & statsmask)
+ stats_register_mpspawn_single(args->add_fn,
+ type->heading,
+ type->num_entries,
+ type->entries,
+ psmi_stats_mpspawn_callback,
+ type);
+ }
+
+ /*
+ * Special handling for per-endpoint statistics
+ * Only MPI knows what the endpoint-addresses are in the running program,
+ * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how
+ * many endpoints it anticipates having and PSM simply reserves that amount
+ * of stats entries X the amount of per-endpoint stats.
+ */
+ if (statsmask & PSMI_STATSTYPE_P2P)
+ psmi_stats_epaddr_register(args);
+
+ return NULL;
+}
+
+/*
+ * Context passed to psmi_stats_epaddr_callback: the local endpoint plus
+ * the MPI-supplied mapping from rank index to psm2_epaddr_t.
+ */
+struct stats_epaddr {
+ psm2_ep_t ep;
+ mpspawn_map_epaddr_fn epaddr_map_fn; /* rank index -> epaddr (or NULL) */
+ int num_ep; /* number of endpoints MPI anticipates */
+ int num_ep_stats; /* per-endpoint stats entry count */
+};
+
+/*
+ * mpspawn request callback for per-endpoint (P2P) statistics.
+ *
+ * The stats array is laid out as num_ep consecutive groups of
+ * num_ep_stats slots, one group per rank.  Every slot is first set to
+ * NAN; ranks whose epaddr cannot be mapped stay NAN.  For a mapped
+ * epaddr, each ptl device either fills its own slots (when it owns the
+ * epaddr) or advances the offset by its stats count so the layout stays
+ * aligned across devices.
+ */
+static
+void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args)
+{
+ int i, num, off;
+ uint64_t *statsp;
+ struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context;
+ psm2_ep_t ep = stats_ctx->ep;
+ psm2_epaddr_t epaddr;
+
+ num = stats_ctx->num_ep * stats_ctx->num_ep_stats;
+
+ /* First always NAN the entire stats request */
+ for (i = 0; i < num; i++) {
+ if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE)
+ args->stats[i] = MPSPAWN_NAN;
+ else
+ args->stats[i] = MPSPAWN_NAN_U64;
+ }
+
+ for (i = 0; i < stats_ctx->num_ep; i++) {
+ statsp = args->stats + i * stats_ctx->num_ep_stats;
+ off = 0;
+ epaddr = stats_ctx->epaddr_map_fn(i);
+ if (epaddr == NULL)
+ continue;
+
+ /* Self */
+ if (&ep->ptl_self == epaddr->ptlctl) {
+ if (ep->ptl_self.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_self.epaddr_stats_get(epaddr,
+       statsp + off);
+ } else {
+ /* not ours: skip this device's slots, leaving them NAN */
+ if (ep->ptl_self.epaddr_stats_num != NULL)
+ off += ep->ptl_self.epaddr_stats_num();
+ }
+
+ /* Shm */
+ if (&ep->ptl_amsh == epaddr->ptlctl) {
+ if (ep->ptl_amsh.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_amsh.epaddr_stats_get(epaddr,
+       statsp + off);
+ } else {
+ if (ep->ptl_amsh.epaddr_stats_num != NULL)
+ off += ep->ptl_amsh.epaddr_stats_num();
+ }
+
+ /* ips */
+ if (&ep->ptl_ips == epaddr->ptlctl) {
+ if (ep->ptl_ips.epaddr_stats_get != NULL)
+ off +=
+     ep->ptl_ips.epaddr_stats_get(epaddr,
+      statsp + off);
+ } else {
+ if (ep->ptl_ips.epaddr_stats_num != NULL)
+ off += ep->ptl_ips.epaddr_stats_num();
+ }
+ }
+ return;
+}
+
+/*
+ * Register per-endpoint (P2P) statistics with mpspawn.
+ *
+ * Collects the per-epaddr stat descriptions from every ptl device, then
+ * clones each description once per anticipated endpoint with a "<rank>"
+ * prefix of fixed width.  The cloned strings are freed before returning;
+ * stats_ctx intentionally survives as the callback context.
+ *
+ * Returns PSM2_OK, or PSM2_NO_MEMORY when an allocation fails.
+ */
+static
+psm2_error_t
+psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args)
+{
+ int i = 0, j;
+ int num_ep = args->num_epaddr;
+ int num_ep_stats = 0;
+ int nz;
+ char **desc, **desc_i;
+ uint16_t *flags, *flags_i;
+ char *p;
+ char buf[128];
+ psm2_ep_t ep;
+ struct mpspawn_stats_add_args mp_add;
+ struct stats_epaddr *stats_ctx;
+ psm2_error_t err = PSM2_OK;
+
+ if (args->mq == NULL)
+ return PSM2_OK;
+ ep = args->mq->ep;
+
+ /* Figure out how many stats there are in an endpoint from all devices */
+ if (ep->ptl_self.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_self.epaddr_stats_num();
+ if (ep->ptl_amsh.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_amsh.epaddr_stats_num();
+ if (ep->ptl_ips.epaddr_stats_num != NULL)
+ num_ep_stats += ep->ptl_ips.epaddr_stats_num();
+
+ /* Allocate desc and flags and let each device initialize their
+  * descriptions and flags.  The first num_ep_stats slots hold the
+  * device-provided templates; the remaining num_ep groups hold the
+  * per-rank clones. */
+ desc =
+     psmi_malloc(ep, STATS,
+ sizeof(char *) * num_ep_stats * (num_ep + 1));
+ if (desc == NULL)
+ return PSM2_NO_MEMORY;
+ flags =
+     psmi_malloc(ep, STATS,
+ sizeof(uint16_t) * num_ep_stats * (num_ep + 1));
+ if (flags == NULL) {
+ psmi_free(desc);
+ return PSM2_NO_MEMORY;
+ }
+
+ /* Get the descriptions/flags from each device */
+ i = 0;
+ i += ep->ptl_self.epaddr_stats_num != NULL ?
+     ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0;
+ i += ep->ptl_amsh.epaddr_stats_num != NULL ?
+     ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0;
+ i += ep->ptl_ips.epaddr_stats_num != NULL ?
+     ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0;
+ psmi_assert_always(i == num_ep_stats);
+
+ /*
+ * Clone the descriptions for each endpoint but append "rank %d" to it
+ * beforehand.  nz is the decimal width of the largest rank index
+ * (cheap log10).  Fix: the original repeated the 1000 threshold,
+ * making the width-4 branch unreachable.
+ */
+ nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */
+ (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 :
+       (num_ep <
+        100000 ? 5 : 6)))));
+
+ desc_i = desc + num_ep_stats;
+ flags_i = flags + num_ep_stats;
+ memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats);
+
+ for (i = 0; i < num_ep; i++) {
+ for (j = 0; j < num_ep_stats; j++) {
+ snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i,
+ desc[j]);
+ buf[sizeof(buf) - 1] = '\0';
+ p = psmi_strdup(ep, buf);
+ if (p == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto clean;
+ }
+ desc_i[i * num_ep_stats + j] = p;
+ flags_i[i * num_ep_stats + j] = flags[j];
+ }
+ }
+
+ mp_add.version = MPSPAWN_STATS_VERSION;
+ mp_add.num = num_ep_stats * num_ep;
+ mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)";
+ mp_add.req_fn = psmi_stats_epaddr_callback;
+ mp_add.desc = desc_i;
+ mp_add.flags = flags_i;
+ stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr));
+ if (stats_ctx == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto clean;
+ }
+ stats_ctx->ep = ep;
+ stats_ctx->epaddr_map_fn = args->epaddr_map_fn;
+ stats_ctx->num_ep = num_ep;
+ stats_ctx->num_ep_stats = num_ep_stats;
+ mp_add.context = stats_ctx;
+
+ args->add_fn(&mp_add);
+
+clean:
+ /* Now we can free all the descriptions (add_fn has copied what it
+  * needed; desc_i entries are NULL past any allocation failure) */
+ for (i = 0; i < num_ep; i++) {
+ for (j = 0; j < num_ep_stats; j++)
+ if (desc_i[i * num_ep_stats + j])
+ psmi_free(desc_i[i * num_ep_stats + j]);
+ }
+
+ psmi_free(desc);
+ psmi_free(flags);
+
+ return err;
+}
+
<br>
+/*
+ * Build and register the "OPA device counters" stats type for ep's unit:
+ * one entry per unit counter followed by one per port counter, each
+ * addressed by byte offset into the snapshot buffer used by
+ * psmi_stats_mpspawn_callback.  On any failure everything acquired so
+ * far is released and registration is silently skipped.
+ */
+static
+void stats_register_hfi_counters(psm2_ep_t ep)
+{
+ int i, nc, npc;
+ char *cnames = NULL, *pcnames = NULL;
+ struct psmi_stats_entry *entries = NULL;
+
+ nc = hfi_get_ctrs_unit_names(ep->unit_id, &cnames);
+ if (nc == -1 || cnames == NULL)
+ goto bail;
+ npc = hfi_get_ctrs_port_names(ep->unit_id, &pcnames);
+ if (npc == -1 || pcnames == NULL)
+ goto bail;
+ entries =
+     psmi_calloc(ep, STATS, nc + npc, sizeof(struct psmi_stats_entry));
+ if (entries == NULL)
+ goto bail;
+
+ for (i = 0; i < nc; i++) {
+ entries[i].desc = hfi_get_next_name(&cnames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ /* port counters continue at offset nc in the same snapshot buffer */
+ for (i = nc; i < nc + npc; i++) {
+ entries[i].desc = hfi_get_next_name(&pcnames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ psmi_stats_register_type("OPA device counters",
+ PSMI_STATSTYPE_DEVCOUNTERS,
+ entries, nc + npc, ep);
+ return;
+
+bail:
+ if (cnames != NULL)
+ hfi_release_names(cnames);
+ if (pcnames != NULL)
+ hfi_release_names(pcnames);
+ if (entries != NULL)
+ psmi_free(entries);
+}
+
+/*
+ * Build and register the "OPA device statistics" stats type: one entry
+ * per hfi statistic, addressed by byte offset into the snapshot buffer
+ * used by psmi_stats_mpspawn_callback.  Cleans up and skips registration
+ * on any failure.
+ */
+static
+void stats_register_hfi_stats(psm2_ep_t ep)
+{
+ int i, ns;
+ char *snames = NULL;
+ struct psmi_stats_entry *entries = NULL;
+
+ ns = hfi_get_stats_names(&snames);
+ if (ns == -1 || snames == NULL)
+ goto bail;
+ entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry));
+ if (entries == NULL)
+ goto bail;
+
+ for (i = 0; i < ns; i++) {
+ entries[i].desc = hfi_get_next_name(&snames);
+ entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+     MPSPAWN_STATS_SKIP_IF_ZERO;
+ entries[i].getfn = NULL;
+ entries[i].u.off = i * sizeof(uint64_t);
+ }
+ psmi_stats_register_type("OPA device statistics",
+ PSMI_STATSTYPE_DEVSTATS, entries, ns, ep);
+ return;
+
+bail:
+ if (snames != NULL)
+ hfi_release_names(snames);
+ if (entries != NULL)
+ psmi_free(entries);
+}
+
+/* _SDECL: declare one memory-stat entry read by byte offset into the
+ * global psmi_stats_memory structure. */
+#undef _SDECL
+#define _SDECL(_desc, _param) { \
+     .desc = _desc, \
+     .flags = MPSPAWN_STATS_REDUCTION_ALL \
+       | MPSPAWN_STATS_SKIP_IF_ZERO, \
+     .getfn = NULL, \
+     .u.off = offsetof(struct psmi_stats_malloc, _param) \
+ }
+
+/*
+ * Register the "PSM memory allocation statistics" type: current/max
+ * byte counts per allocation category, sourced from psmi_stats_memory.
+ * The entries array is copied by psmi_stats_register_type, so a stack
+ * array is fine here.
+ */
+static
+void stats_register_mem_stats(psm2_ep_t ep)
+{
+ struct psmi_stats_entry entries[] = {
+ _SDECL("Total (current)", m_all_total),
+ _SDECL("Total (max)", m_all_max),
+ _SDECL("All Peers (current)", m_perpeer_total),
+ _SDECL("All Peers (max)", m_perpeer_max),
+ _SDECL("Network Buffers (current)", m_netbufs_total),
+ _SDECL("Network Buffers (max)", m_netbufs_max),
+ /* NOTE(review): "desctors" below looks like a typo for
+  * "descriptors", but the string is an exported stat label so it
+  * is left unchanged here — confirm before renaming. */
+ _SDECL("PSM desctors (current)", m_descriptors_total),
+ _SDECL("PSM desctors (max)", m_descriptors_max),
+ _SDECL("Unexp. buffers (current)", m_unexpbufs_total),
+ _SDECL("Unexp. Buffers (max)", m_unexpbufs_max),
+ _SDECL("Other (current)", m_undefined_total),
+ _SDECL("Other (max)", m_undefined_max),
+ };
+
+ psmi_stats_register_type("PSM memory allocation statistics",
+ PSMI_STATSTYPE_MEMORY,
+ entries, PSMI_STATS_HOWMANY(entries), ep);
+}
diff --git a/psm_stats.h b/psm_stats.h
new file mode 100644
index 0000000..9e9e0a9
--- /dev/null
+++ b/psm_stats.h
@@ -0,0 +1,120 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_stats.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_STATS_H
+#define _PSM_STATS_H
+
+#include "mpspawn_stats.h"
+
+#define PSMI_STATSTYPE_MQ 0x00001
+#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakups, ratio, etc. */
+#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */
+#define PSMI_STATSTYPE_TIDS 0x00400
+#define PSMI_STATSTYPE_MEMORY 0x01000
+#define PSMI_STATSTYPE_HFI (PSMI_STATSTYPE_RCVTHREAD| \
+ PSMI_STATSTYPE_IPSPROTO | \
+ PSMI_STATSTYPE_MEMORY | \
+ PSMI_STATSTYPE_TIDS)
+#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */
+#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000
+#define PSMI_STATSTYPE_DEVSTATS 0x20000
+#define PSMI_STATSTYPE_ALL 0xfffff
+#define _PSMI_STATSTYPE_DEVMASK 0xf0000
+
+/* Used to determine how many stats in static array decl. */
+#define PSMI_STATS_HOWMANY(entries) \
+ (sizeof(entries)/sizeof(entries[0]))
+
+#define PSMI_STATS_NO_HEADING NULL
+
+#define PSMI_STATS_DECL(_desc, _flags, _getfn, _val) \
+ { .desc = _desc, \
+ .flags = _flags, \
+ .getfn = _getfn, \
+ .u.val = _val, \
+ }
+
+#define PSMI_STATS_DECLU64(_desc, _val) \
+ PSMI_STATS_DECL(_desc, \
+ MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \
+ NULL, \
+ _val)
+
+/*
+ * One statistic.  The value is obtained either by calling getfn(context)
+ * when getfn is non-NULL, by dereferencing u.val, or — for stats types
+ * that snapshot a memory block (device counters/stats, memory stats) —
+ * by reading at byte offset u.off into that block.
+ */
+struct psmi_stats_entry {
+ const char *desc;
+ uint16_t flags;
+ uint64_t(*getfn) (void *context); /* optional fn ptr to get value */
+ union {
+ uint64_t *val; /* where value is stored if getfn is NULL */
+ uint64_t off; /* of offset if that makes more sense */
+ } u;
+};
+
+/*
+ * Copy the array of entries and keep track of the context
+ */
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+ uint32_t statstype,
+ const struct psmi_stats_entry *entries,
+ int num_entries, void *context);
+
+psm2_error_t psmi_stats_deregister_all(void);
+
+#endif /* PSM_STATS_H */
diff --git a/psm_sysbuf.c b/psm_sysbuf.c
new file mode 100644
index 0000000..04298f0
--- /dev/null
+++ b/psm_sysbuf.c
@@ -0,0 +1,234 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * System buffer (unexpected message) allocator
+ *
+ */
+
+#define MM_FLAG_NONE 0
+#define MM_FLAG_TRANSIENT 0x1
+
+/*
+ * Header prepended to every sysbuf block.  While the block is handed
+ * out, mem_handler records the owning pool (used by psmi_mq_sysbuf_free
+ * to return it); while the block sits on a pool's free list the same
+ * storage holds the next-free link.  The valgrind redzone sits between
+ * this header and the user-visible buffer.
+ */
+struct psmi_mem_block_ctrl {
+ union {
+ psmi_mem_ctrl_t *mem_handler;
+ struct psmi_mem_block_ctrl *next;
+ };
+ char _redzone[PSM_VALGRIND_REDZONE_SZ];
+};
+
+
+/* Per MQ allocators */
+void psmi_mq_sysbuf_init(psm2_mq_t mq)
+{
+ int i;
+ uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
+ uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
+
+ if (mq->mem_ctrl_is_init)
+ return;
+ mq->mem_ctrl_is_init = 1;
+
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ mq->handler_index[i].block_size = block_sizes[i];
+ mq->handler_index[i].current_available = 0;
+ mq->handler_index[i].free_list = NULL;
+ mq->handler_index[i].total_alloc = 0;
+ mq->handler_index[i].replenishing_rate = replenishing_rate[i];
+
+ if (block_sizes[i] == -1) {
+ psmi_assert_always(replenishing_rate[i] == 0);
+ mq->handler_index[i].flags = MM_FLAG_TRANSIENT;
+ }
+ else {
+ psmi_assert_always(replenishing_rate[i] > 0);
+ mq->handler_index[i].flags = MM_FLAG_NONE;
+ }
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mq, PSM_VALGRIND_REDZONE_SZ,
+ PSM_VALGRIND_MEM_UNDEFINED);
+
+ /* Hit once on each block size so we have a pool that's allocated */
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ void *ptr;
+ if (block_sizes[i] == -1)
+ continue;
+ ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]);
+ psmi_mq_sysbuf_free(mq, ptr);
+ }
+}
+
+void psmi_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently not used
+{
+ struct psmi_mem_block_ctrl *block;
+ int i;
+
+ if (mq->mem_ctrl_is_init == 0)
+ return;
+
+ VALGRIND_DESTROY_MEMPOOL(mq);
+
+ /* Drain every pool's free list; blocks still handed out to callers
+  * are not tracked here and are not freed by this function. */
+ for (i=0; i < MM_NUM_OF_POOLS; i++) {
+ while ((block = mq->handler_index[i].free_list) != NULL) {
+ mq->handler_index[i].free_list = block->next;
+ psmi_free(block);
+ }
+ }
+ mq->mem_ctrl_is_init = 0;
+}
+
+/*
+ * Write a human-readable summary of sysbuf byte consumption into buf.
+ *
+ * Fix: the previous snprintf(buf, len - 1, ...) underflowed to SIZE_MAX
+ * when len == 0 and then wrote buf[len - 1] out of bounds; snprintf
+ * already NUL-terminates for any len > 0, so the manual termination was
+ * also redundant.
+ */
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len)
+{
+ if (buf == NULL || len == 0)
+ return;
+ snprintf(buf, len, "Sysbuf consumption: %"PRIu64" bytes\n",
+  mq->mem_ctrl_total_bytes);
+ return;
+}
+
+/*
+ * Allocate a system (unexpected-message) buffer of at least alloc_size
+ * bytes from the smallest pool whose block size fits; returns NULL only
+ * if the underlying psmi_malloc() fails.  Oversize requests fall into
+ * the transient pool and are malloc'd individually rather than cached.
+ * The returned pointer is just past the psmi_mem_block_ctrl header.
+ */
+void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
+{
+ psmi_mem_ctrl_t *mm_handler = mq->handler_index;
+ struct psmi_mem_block_ctrl *new_block;
+ int replenishing;
+
+ /* There is a timing race with ips initialization, fix later.
+ * * XXX */
+ if (!mq->mem_ctrl_is_init)
+ psmi_mq_sysbuf_init(mq);
+
+ mq->stats.rx_sysbuf_num++;
+ mq->stats.rx_sysbuf_bytes += alloc_size;
+
+ /* Pools are in ascending block_size order and the last pool's size is
+  * (uint32_t)-1, so this scan always terminates. */
+ while (mm_handler->block_size < alloc_size)
+ mm_handler++;
+
+ replenishing = mm_handler->replenishing_rate;
+
+ if (mm_handler->current_available == 0) { // allocate more buffers
+ if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+ uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl)
+ + PSM_VALGRIND_REDZONE_SZ;
+ new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+ if (new_block) {
+ new_block->mem_handler = mm_handler;
+ new_block++;
+ mm_handler->total_alloc++;
+ mq->mem_ctrl_total_bytes += newsz;
+ VALGRIND_MEMPOOL_ALLOC(mq, new_block, alloc_size);
+ }
+ return new_block;
+ }
+
+ do {
+ uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl) +
+ PSM_VALGRIND_REDZONE_SZ;
+
+ new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+ if (new_block) {
+ /* Fix: account bytes only on success; the old
+  * code charged mem_ctrl_total_bytes even when
+  * psmi_malloc returned NULL. */
+ mq->mem_ctrl_total_bytes += newsz;
+ mm_handler->current_available++;
+ mm_handler->total_alloc++;
+
+ new_block->next = mm_handler->free_list;
+ mm_handler->free_list = new_block;
+ }
+
+ } while (--replenishing && new_block);
+ }
+
+ if (mm_handler->current_available) {
+ mm_handler->current_available--;
+
+ new_block = mm_handler->free_list;
+ mm_handler->free_list = new_block->next;
+
+ new_block->mem_handler = mm_handler;
+ new_block++;
+
+ VALGRIND_MEMPOOL_ALLOC(mq, new_block, mm_handler->block_size);
+ return new_block;
+ }
+ return NULL;
+}
+
+/*
+ * Return a buffer obtained from psmi_mq_sysbuf_alloc.  Steps back to the
+ * psmi_mem_block_ctrl header to find the owning pool: transient blocks
+ * are freed outright, pooled blocks are pushed back on the pool's free
+ * list for reuse.  mem_to_free must be a live sysbuf pointer.
+ */
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free)
+{
+ struct psmi_mem_block_ctrl * block_to_free;
+ psmi_mem_ctrl_t *mm_handler;
+
+ psmi_assert_always(mq->mem_ctrl_is_init);
+
+ block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1;
+ mm_handler = block_to_free->mem_handler;
+
+ VALGRIND_MEMPOOL_FREE(mq, mem_to_free);
+
+ if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+ psmi_free(block_to_free);
+ } else {
+ block_to_free->next = mm_handler->free_list;
+ mm_handler->free_list = block_to_free;
+ mm_handler->current_available++;
+ }
+
+ return;
+}
diff --git a/psm_sysbuf.h b/psm_sysbuf.h
new file mode 100644
index 0000000..07ab593
--- /dev/null
+++ b/psm_sysbuf.h
@@ -0,0 +1,81 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef SYSBUF_INT_H
+#define SYSBUF_INT_H
+
+#include "psm_user.h"
+
+#define MM_NUM_OF_POOLS 7
+
+/* One sysbuf pool: a free list of fixed-size blocks plus bookkeeping.
+ * The transient (oversize) pool uses block_size == (uint32_t)-1 and a
+ * replenishing_rate of 0. */
+typedef struct psmi_mem_ctrl {
+ struct psmi_mem_block_ctrl *free_list; /* cached, currently-unused blocks */
+ uint32_t total_alloc; /* blocks ever allocated for this pool */
+ uint32_t current_available; /* length of free_list */
+ uint32_t block_size; /* usable bytes per block */
+ uint32_t flags; /* MM_FLAG_* */
+ uint32_t replenishing_rate; /* blocks to pre-allocate when empty */
+} psmi_mem_ctrl_t;
+
+/*
+ * MQ unexpected buffer management
+ */
+void psmi_mq_sysbuf_init(psm2_mq_t mq);
+void psmi_mq_sysbuf_fini(psm2_mq_t mq);
+void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes);
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void *);
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len);
+
+#endif /* SYSBUF_INT_H */
diff --git a/psm_timer.c b/psm_timer.c
new file mode 100644
index 0000000..9a8dddd
--- /dev/null
+++ b/psm_timer.c
@@ -0,0 +1,198 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#if PSMI_TIMER_STATS
+# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++)
+# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++)
+#else
+# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl)
+# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl)
+#endif
+
+/* Initialize a timer control block: empty timer queue, no pending
+ * expiration, and zeroed insertion/traversal counters when the optional
+ * PSMI_TIMER_STATS instrumentation is compiled in.  Always PSM2_OK. */
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl)
+{
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+
+#if PSMI_TIMER_STATS
+ ctrl->num_insertions = 0;
+ ctrl->num_traversals = 0;
+#endif
+
+ TAILQ_INIT(&ctrl->timerq);
+ return PSM2_OK;
+}
+
+/* Tear down a timer control block.  With PSMI_TIMER_STATS enabled this
+ * logs the average queue traversals per insertion; otherwise it is a
+ * no-op.  Always returns PSM2_OK. */
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl)
+{
+#if PSMI_TIMER_STATS
+ if (ctrl->num_insertions > 0) {
+ _HFI_INFO("avg elem traversals/insertion = %3.2f %%\n",
+   100.0 * (double)ctrl->num_traversals /
+   ctrl->num_insertions);
+ }
+#endif
+ return PSM2_OK;
+}
+
+/*
+ * Insert t_insert (which must not already be pending) into ctrl's timer
+ * queue with expiration t_cyc_expire.  Expirations at or below
+ * PSMI_TIMER_PRIO_LAST take a tail-biased scan; later expirations take a
+ * head-biased scan.  ctrl->t_cyc_next_expire is updated when the new
+ * timer becomes the soonest to fire.
+ */
+void
+psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+     struct psmi_timer *t_insert, uint64_t t_cyc_expire)
+{
+ struct psmi_timer *t_cursor;
+
+ psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING));
+
+ t_insert->t_timeout = t_cyc_expire;
+ t_insert->flags |= PSMI_TIMER_FLAG_PENDING;
+
+ /*
+ * We keep the list from oldest (head) to newest (tail), with the
+ * assumption that insert and remove occur much more often than search
+ * (when the timer expires). Newly added timers are more likely to expire
+ * later rather than sooner, which is why the head is older.
+ */
+ PSMI_TIMER_STATS_ADD_INSERTION(ctrl);
+
+ if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */
+ TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+ ctrl->t_cyc_next_expire = t_cyc_expire;
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ return;
+ } else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) {
+ /* expected-late timer: walk from the head (oldest) forward */
+ TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) {
+ if (t_cursor->t_timeout <= t_cyc_expire) {
+ TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer);
+ return;
+ }
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ }
+ /* Got to the end of the list -- We're the next to expire */
+ ctrl->t_cyc_next_expire = t_cyc_expire;
+ TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+ return;
+ } else {
+ /* high-priority timer: walk from the tail (newest) backward */
+ TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) {
+ if (t_cursor->t_timeout >= t_cyc_expire) {
+ TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor,
+    t_insert, timer);
+ ctrl->t_cyc_next_expire =
+     min(t_cyc_expire, ctrl->t_cyc_next_expire);
+ return;
+ }
+ PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+ }
+ TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer);
+ /* No need to check if we inserted last, given first branch case */
+ /* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */
+ /* ctrl->t_cyc_next_expire = t_cyc_expire; */
+ return;
+ }
+
+ return;
+}
+
+/*
+ * Fire every pending timer whose timeout is <= t_cyc_expire, walking
+ * from the tail (soonest to expire) backward.  Each timer is removed
+ * and its PENDING flag cleared *before* its callback runs, so the
+ * callback may legally re-arm it.  Returns PSM2_OK if any timer fired,
+ * else PSM2_OK_NO_PROGRESS.
+ */
+psm2_error_t
+psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire)
+{
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq);
+
+ PSM2_LOG_MSG("entering");
+
+ while (t_cursor) {
+ if (t_cursor->t_timeout > t_cyc_expire)
+ break;
+
+ err = PSM2_OK;
+ psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING);
+ t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING;
+ TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer);
+ t_cursor->expire_callback(t_cursor, t_cyc_expire);
+ t_cursor = TAILQ_PREV(t_cursor, timerq, timer);
+ }
+
+ /* next expiration is whatever now sits at the tail */
+ if (TAILQ_EMPTY(&ctrl->timerq))
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+ else
+ ctrl->t_cyc_next_expire =
+     TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+/*
+ * Remove a pending timer from the queue and clear its PENDING flag.
+ * t_remove must currently be pending (asserted).  The next-expiration
+ * time is recomputed from the new tail, or set to PSMI_TIMER_INFINITE
+ * when the queue empties.
+ */
+void
+psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_remove)
+{
+
+ psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING);
+
+ t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING;
+ TAILQ_REMOVE(&ctrl->timerq, t_remove, timer);
+
+ /*
+ * If we're removing the last entry, we need to reset the
+ * expiration cycle time.
+ */
+ if (TAILQ_EMPTY(&ctrl->timerq))
+ ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+ else
+ ctrl->t_cyc_next_expire =
+     TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+ return;
+}
diff --git a/psm_timer.h b/psm_timer.h
new file mode 100644
index 0000000..a57fd7a
--- /dev/null
+++ b/psm_timer.h
@@ -0,0 +1,164 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_timer.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_TIMER_H
+#define _PSMI_TIMER_H
+
+#include "psm_user.h"
+
+/* Keep timer stats */
+#define PSMI_TIMER_STATS 0
+
+typedef struct psmi_timer psmi_timer;
+typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *,
+ uint64_t);
+
+/* A single timer entry.  Fields marked "opaque" are owned by the timer
+ * core; users only set the callback/context via psmi_timer_entry_init(). */
+struct psmi_timer {
+	TAILQ_ENTRY(psmi_timer) timer;	/* opaque -- queue linkage */
+	uint64_t t_timeout;	/* opaque -- absolute expiry time */
+	uint8_t flags;		/* opaque -- PSMI_TIMER_FLAG_* bits */
+
+	psmi_timer_expire_callback_t expire_callback;	/* user -- callback fn */
+	void *context;		/* user -- callback param */
+};
+
+/* Per-queue control block.  timerq is sorted by t_timeout with the
+ * earliest-expiring entry at the tail; t_cyc_next_expire caches that
+ * tail timeout (PSMI_TIMER_INFINITE when the queue is empty). */
+struct psmi_timer_ctrl {
+	uint64_t t_cyc_next_expire;
+	TAILQ_HEAD(timerq, psmi_timer) timerq;
+
+#if PSMI_TIMER_STATS
+	uint64_t num_insertions;
+	uint64_t num_traversals;
+#endif
+};
+
+/*
+ * Some events need to be unconditionally enqueued at the beginning of the
+ * timerq -- they are not timers meant to expire but merely operations that
+ * need to be delayed. For delayed operations, there are 5 levels of
+ * priority.
+ */
+#define PSMI_TIMER_PRIO_0 0ULL
+#define PSMI_TIMER_PRIO_1 1ULL
+#define PSMI_TIMER_PRIO_2 2ULL
+#define PSMI_TIMER_PRIO_3 3ULL
+#define PSMI_TIMER_PRIO_4 4ULL
+#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4
+
+#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL
+#define PSMI_TIMER_FLAG_PENDING 0x01
+
+/*
+ * Timer control initialization and finalization
+ */
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl);
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl);
+
+/*
+ * Timer entry initialization (a timer must be initialized before it can be
+ * added to the timer request queue).
+ */
+
+/*
+ * Initialize a timer entry with its expire callback and callback
+ * argument.  Clears all flags (so the entry is NOT pending); the entry
+ * must be initialized before it is ever passed to psmi_timer_request().
+ */
+PSMI_ALWAYS_INLINE(
+void
+psmi_timer_entry_init(struct psmi_timer *t_init,
+		      psmi_timer_expire_callback_t expire_fn,
+		      void *context))
+{
+	t_init->flags = 0;
+	t_init->expire_callback = expire_fn;
+	t_init->context = context;
+	return;
+}
+
+/*
+ * Timer requests, conditional (macro) or unconditional
+ */
+#define psmi_timer_request(ctrl, t_insert, t_cyc) \
+ if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \
+ psmi_timer_request_always((ctrl), (t_insert), (t_cyc))
+
+void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_insert,
+ uint64_t t_cyc_expire);
+
+/*
+ * Timer cancelations, conditional (macro) only (cancel_inner is internal)
+ */
+#define psmi_timer_cancel(ctrl, t_remove) \
+ if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \
+ psmi_timer_cancel_inner(ctrl, t_remove)
+void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+ struct psmi_timer *t_remove);
+
+/*
+ * Timer processing, conditional or unconditional.
+ */
+#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \
+ (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? \
+ psmi_timer_process_expired(ctrl, t_cyc_expire) : \
+ PSM2_OK_NO_PROGRESS)
+
+#define psmi_timer_is_expired(ctrl, t_cyc_expire) \
+ ((ctrl)->t_cyc_next_expire <= (t_cyc_expire))
+
+psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl,
+ uint64_t t_cyc_expire);
+
+#endif /* _PSMI_TIMER_H */
diff --git a/psm_user.h b/psm_user.h
new file mode 100644
index 0000000..dd5384f
--- /dev/null
+++ b/psm_user.h
@@ -0,0 +1,500 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_USER_H
+#define _PSMI_USER_H
+
+#include <inttypes.h>
+#include <pthread.h>
+
+#include <sched.h>
+#include <numa.h>
+
+#include "psm2.h"
+#include "psm2_mq.h"
+
+#include "ptl.h"
+
+#include "opa_user.h"
+#include "opa_queue.h"
+
+#ifdef PSM_VALGRIND
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+#endif
+
+#include "psm_log.h"
+#include "psm_perf.h"
+
+#ifdef PSM_VALGRIND
+#define PSM_VALGRIND_REDZONE_SZ 8
+#define PSM_VALGRIND_DEFINE_MQ_RECV(buf, posted_len, recv_len) do { \
+ VALGRIND_MAKE_MEM_DEFINED((void *)(buf), (posted_len)); \
+ if ((recv_len) < (posted_len)) \
+ VALGRIND_MAKE_MEM_UNDEFINED( \
+ (void *) ((uintptr_t) (buf) + (recv_len)), \
+ (posted_len) - (recv_len)); \
+ } while (0)
+
+#else
+#define PSM_VALGRIND_REDZONE_SZ 0
+#define PSM_VALGRIND_DEFINE_MQ_RECV(buf, posted_len, recv_len)
+#define VALGRIND_CREATE_MEMPOOL(ARG1,ARG2,ARG3)
+#define VALGRIND_MAKE_MEM_DEFINED(ARG1,ARG2)
+#define VALGRIND_DESTROY_MEMPOOL(ARG1)
+#define VALGRIND_MEMPOOL_ALLOC(ARG1,ARG2,ARG3)
+#define VALGRIND_MEMPOOL_FREE(ARG1,ARG2)
+#define VALGRIND_MAKE_MEM_NOACCESS(ARG1,ARG2)
+#endif
+
+/* Parameters for use in valgrind's "is_zeroed" */
+#define PSM_VALGRIND_MEM_DEFINED 1
+#define PSM_VALGRIND_MEM_UNDEFINED 0
+
+#define PSMI_LOCK_NO_OWNER ((pthread_t)(-1))
+
+#ifdef PSM_DEBUG
+#define PSMI_LOCK_IS_MUTEXLOCK_DEBUG
+#else
+#define PSMI_LOCK_IS_SPINLOCK
+/* #define PSMI_LOCK_IS_MUTEXLOCK */
+/* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */
+/* #define PSMI_PLOCK_IS_NOLOCK */
+#endif
+
+#define _PSMI_IN_USER_H
+#include "psm_help.h"
+#include "psm_error.h"
+#include "psm_context.h"
+#include "psm_utils.h"
+#include "psm_timer.h"
+#include "psm_mpool.h"
+#include "psm_ep.h"
+#include "psm_lock.h"
+#include "psm_stats.h"
+#include "psm2_mock_testing.h"
+#undef _PSMI_IN_USER_H
+
+#define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff))
+#define PSMI_VERNO PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR)
+#define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff)
+#define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff)
+
+int psmi_verno_client();
+int psmi_verno_isinteroperable(uint16_t verno);
+int MOCKABLE(psmi_isinitialized)();
+MOCK_DCL_EPILOGUE(psmi_isinitialized);
+
+psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh);
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq);
+
+int psmi_get_current_proc_location();
+
+extern int psmi_epid_ver;
+extern uint32_t non_dw_mul_sdma;
+extern psmi_lock_t psmi_creation_lock;
+
+extern psm2_ep_t psmi_opened_endpoint;
+
+/* Return the process-wide epid wire-format version (global psmi_epid_ver,
+ * see PSMI_EPID_V1/V2); exposed through the PSMI_EPID_VERSION macro. */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_get_epid_version()) {
+	return psmi_epid_ver;
+}
+
+#define PSMI_EPID_VERSION_SHM 0
+#define PSMI_EPID_SHM_ONLY 1
+#define PSMI_EPID_IPS_SHM 0
+#define PSMI_EPID_VERSION _psmi_get_epid_version()
+#define PSMI_MAX_EPID_VERNO_SUPPORTED 2
+#define PSMI_MIN_EPID_VERNO_SUPPORTED 1
+#define PSMI_EPID_VERNO_DEFAULT 2
+#define PSMI_EPID_V1 1
+#define PSMI_EPID_V2 2
+
+#define PSMI_EPID_GET_LID(epid) (PSMI_EPID_VERSION == PSMI_EPID_V1) ? \
+ (int)PSMI_EPID_GET_LID_V1(epid) \
+ : (int)PSMI_EPID_GET_LID_V2(epid)
+
+#define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffff)
+/*
+ * Default setting for Receive thread
+ *
+ * 0 disables rcvthread by default
+ * 0x1 enables ips receive thread by default
+ */
+#define PSMI_RCVTHREAD_FLAGS 0x1
+
+/*
+ * Define one of these below.
+ *
+ * Spinlock gives the best performance and makes sense with the progress thread
+ * only because the progress thread does a "trylock" and then goes back to
+ * sleep in a poll.
+ *
+ * Mutexlock should be used for experimentation while the more useful
+ * mutexlock-debug should be enabled during development to catch potential
+ * errors.
+ */
+#ifdef PSMI_LOCK_IS_SPINLOCK
+#define _PSMI_LOCK_INIT(pl) psmi_spin_init(&((pl).lock))
+#define _PSMI_LOCK_TRY(pl) psmi_spin_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) psmi_spin_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) psmi_spin_unlock(&((pl).lock))
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+
+/*
+ * Debug trylock: asserts the calling thread does not already hold the
+ * lock (recursive acquisition is a bug here), then records the caller
+ * as owner on success.  Returns pthread_mutex_trylock()'s result.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_trylock_inner(pthread_mutex_t *mutex,
+			  const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_trylock(mutex);
+	if (ret == 0)
+		*lock_owner = pthread_self();
+	return ret;
+}
+
+/*
+ * Debug lock: asserts against self-deadlock (caller must not already
+ * own the lock), acquires the mutex, double-checks for EDEADLK, and
+ * records ownership.  Returns pthread_mutex_lock()'s result.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_lock_inner(pthread_mutex_t *mutex,
+		       const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_lock(mutex);
+	psmi_assert_always_loc(ret != EDEADLK, curloc);
+	*lock_owner = pthread_self();
+	return ret;
+}
+
+/*
+ * Debug unlock: asserts the calling thread is the recorded owner,
+ * clears ownership BEFORE releasing (so a racing locker sees no stale
+ * owner), then unlocks, asserting the mutex was actually held (EPERM).
+ */
+PSMI_ALWAYS_INLINE(
+void
+_psmi_mutex_unlock_inner(pthread_mutex_t *mutex,
+			 const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner == pthread_self(),
+			       curloc);
+	*lock_owner = PSMI_LOCK_NO_OWNER;
+	psmi_assert_always_loc(pthread_mutex_unlock(mutex) !=
+			       EPERM, curloc);
+	return;
+}
+
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) \
+ _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+#define _PSMI_LOCK(pl) \
+ _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+#define _PSMI_UNLOCK(pl) \
+ _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \
+ &((pl).lock_owner))
+/* Ownership asserts for the debug-mutex build.  The macro parameter is
+ * parenthesized so any lvalue expression works, and the trailing
+ * semicolon is left to the call site so the macro can't produce an
+ * empty extra statement inside unbraced if/else bodies. */
+#define _PSMI_LOCK_ASSERT(pl)					\
+	psmi_assert_always((pl).lock_owner == pthread_self())
+#define _PSMI_UNLOCK_ASSERT(pl)					\
+	psmi_assert_always((pl).lock_owner != pthread_self())
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) pthread_mutex_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) pthread_mutex_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) pthread_mutex_unlock(&((pl).lock))
+#define PSMI_LOCK_DISABLED 0
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+
+#elif defined(PSMI_PLOCK_IS_NOLOCK)
+#define _PSMI_LOCK_TRY(pl) 0 /* 0 *only* so progress thread never succeeds */
+#define _PSMI_LOCK(pl)
+#define _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_DISABLED 1
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#else
+#error No LOCK lock type declared
+#endif
+
+#define PSMI_YIELD(pl) \
+ do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0)
+
+#ifdef PSM2_MOCK_TESTING
+/* If this is a mocking tests build, all the operations on the locks
+ * are routed through functions which may be mocked, if necessary. */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_init);
+
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_try);
+
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock);
+
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock);
+
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert);
+
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert);
+
+#define PSMI_LOCK_INIT(pl) psmi_mockable_lock_init(&(pl))
+#define PSMI_LOCK_TRY(pl) psmi_mockable_lock_try(&(pl))
+#define PSMI_LOCK(pl) psmi_mockable_lock(&(pl))
+#define PSMI_UNLOCK(pl) psmi_mockable_unlock(&(pl))
+#define PSMI_LOCK_ASSERT(pl) psmi_mockable_lock_assert(&(pl))
+#define PSMI_UNLOCK_ASSERT(pl) psmi_mockable_unlock_assert(&(pl))
+#else
+#define PSMI_LOCK_INIT(pl) _PSMI_LOCK_INIT(pl)
+#define PSMI_LOCK_TRY(pl) _PSMI_LOCK_TRY(pl)
+#define PSMI_LOCK(pl) _PSMI_LOCK(pl)
+#define PSMI_UNLOCK(pl) _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_ASSERT(pl) _PSMI_LOCK_ASSERT(pl)
+#define PSMI_UNLOCK_ASSERT(pl) _PSMI_UNLOCK_ASSERT(pl)
+#endif
+
+#ifdef PSM_PROFILE
+void psmi_profile_block() __attribute__ ((weak));
+void psmi_profile_unblock() __attribute__ ((weak));
+void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
+
+#define PSMI_PROFILE_BLOCK() psmi_profile_block()
+#define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock()
+#define PSMI_PROFILE_REBLOCK(noprog) psmi_profile_reblock(noprog)
+#else
+#define PSMI_PROFILE_BLOCK()
+#define PSMI_PROFILE_UNBLOCK()
+#define PSMI_PROFILE_REBLOCK(noprog)
+#endif
+
+#ifdef PSM_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <driver_types.h>
+
+#if CUDART_VERSION < 4010
+#error Please update CUDA runtime, required minimum version is 4.1
+#endif
+
+extern int is_cuda_enabled;
+extern int device_support_gpudirect;
+extern int cuda_runtime_version;
+
+extern CUcontext ctxt;
+/*
+ * Dynamically-resolved CUDA driver/runtime entry points and library
+ * handles, presumably filled in at init via dlopen/dlsym -- resolution
+ * code is not in this header; confirm against the loader.
+ *
+ * NOTE(review): these are tentative definitions in a header without
+ * 'extern' -- every translation unit that includes psm_user.h emits
+ * them, relying on the linker merging common symbols (breaks under
+ * -fno-common).  Consider extern declarations here plus a single
+ * definition in one .c file.
+ */
+void *psmi_cudart_lib;
+void *psmi_cuda_lib;
+CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c);
+CUresult (*psmi_cuCtxSetCurrent)(CUcontext c);
+CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+cudaError_t (*psmi_cudaRuntimeGetVersion)(int *runtime_version);
+cudaError_t (*psmi_cudaGetDeviceCount)(int *n);
+cudaError_t (*psmi_cudaGetDeviceProperties)(struct cudaDeviceProp *p, int d);
+cudaError_t (*psmi_cudaGetDevice)(int *n);
+cudaError_t (*psmi_cudaSetDevice)(int n);
+cudaError_t (*psmi_cudaStreamCreate)(cudaStream_t *s);
+cudaError_t (*psmi_cudaStreamCreateWithFlags)(cudaStream_t *s, unsigned f);
+cudaError_t (*psmi_cudaStreamSynchronize)(cudaStream_t s);
+cudaError_t (*psmi_cudaDeviceSynchronize)();
+cudaError_t (*psmi_cudaEventCreate)(cudaEvent_t *event);
+cudaError_t (*psmi_cudaEventDestroy)(cudaEvent_t event);
+cudaError_t (*psmi_cudaEventQuery)(cudaEvent_t event);
+cudaError_t (*psmi_cudaEventRecord)(cudaEvent_t event, cudaStream_t stream);
+cudaError_t (*psmi_cudaEventSynchronize)(cudaEvent_t event);
+cudaError_t (*psmi_cudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+cudaError_t (*psmi_cudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t s);
+cudaError_t (*psmi_cudaMalloc)(void **devPtr, size_t size);
+cudaError_t (*psmi_cudaHostAlloc)(void **devPtr, size_t size, unsigned int flags);
+cudaError_t (*psmi_cudaFreeHost)(void *ptr);
+
+cudaError_t (*psmi_cudaIpcGetMemHandle)(cudaIpcMemHandle_t* handle, void* devPtr);
+cudaError_t (*psmi_cudaIpcOpenMemHandle)(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+cudaError_t (*psmi_cudaIpcCloseMemHandle)(void* devPtr);
+
+/*
+ * Invoke a CUDA driver-API entry point through its psmi_ function
+ * pointer; any result other than CUDA_SUCCESS is fatal
+ * (psmi_handle_error with PSMI_EP_NORETURN).
+ *
+ * Fix: the adjacent string literals lacked separating spaces, so the
+ * messages rendered as "initializedbefore" and "(at file:NN)returned".
+ */
+#define PSMI_CUDA_DRIVER_API_CALL(func, args...) do {			\
+		CUresult cudaerr;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != CUDA_SUCCESS) {				\
+			if (ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if cuda runtime is initialized "	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d) "	\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+/*
+ * Invoke a CUDA runtime-API entry point through its psmi_ function
+ * pointer; any result other than cudaSuccess is fatal.
+ *
+ * Fix: the adjacent string literals lacked separating spaces, so the
+ * messages rendered as "initializedbefore" and "(at file:NN)returned".
+ */
+#define PSMI_CUDA_CALL(func, args...) do {				\
+		cudaError_t cudaerr;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != cudaSuccess) {				\
+			if (ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if cuda runtime is initialized "	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d) "	\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+/*
+ * Poll a CUDA event: stores cudaEventQuery()'s result in 'cudaerr'.
+ * cudaSuccess (done) and cudaErrorNotReady (still running) are both
+ * acceptable outcomes; anything else is fatal.
+ */
+#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
+		cudaerr = psmi_cudaEventQuery(event);			\
+		if ((cudaerr != cudaSuccess) &&				\
+		    (cudaerr != cudaErrorNotReady)) {			\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() returned %d\n",	\
+				"cudaEventQuery", cudaerr);		\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+
+
+/*
+ * Return 1 if 'ptr' is CUDA device memory, 0 otherwise.  Uses
+ * cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_MEMORY_TYPE); a query
+ * failure (e.g. an ordinary host pointer unknown to CUDA) is treated
+ * as "not device memory" rather than an error.
+ */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_mem(void *ptr))
+{
+	CUresult cres;
+	CUmemorytype mt;
+	cres = psmi_cuPointerGetAttribute(
+		&mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr);
+	if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE))
+		return 1;
+	else
+		return 0;
+}
+
+/* Return nonzero when CUDA support was enabled at init (global flag). */
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_enabled())
+{
+	return is_cuda_enabled;
+}
+
+#define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
+
+#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p)
+/* XXX TODO: Getting the gpu page size from driver at init time */
+#define PSMI_GPU_PAGESIZE 65536
+
+struct ips_cuda_hostbuf {
+ STAILQ_ENTRY(ips_cuda_hostbuf) req_next;
+ STAILQ_ENTRY(ips_cuda_hostbuf) next;
+ uint32_t size, offset, bytes_read;
+ /* This flag indicates whether a chb is
+ * pulled from a mpool or dynamically
+ * allocated using calloc. */
+ uint8_t is_tempbuf;
+ cudaEvent_t copy_status;
+ psm2_mq_req_t req;
+ void *host_buf, *gpu_buf;
+};
+
+struct ips_cuda_hostbuf_mpool_cb_context {
+ unsigned bufsz;
+};
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+
+#define CUDA_HOSTBUFFER_LIMITS { \
+ .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \
+ .descr = "Max CUDA bounce buffers (in MB)", \
+ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \
+ .minval = 1, \
+ .maxval = 1<<30, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 16, 256 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \
+ }
+
+#define CUDA_SMALLHOSTBUF_SZ (256*1024)
+#define CUDA_WINDOW_PREFETCH_DEFAULT 2
+#define GPUDIRECT_THRESH_RV 3
+
+extern uint32_t gpudirect_send_threshold;
+extern uint32_t gpudirect_recv_threshold;
+
+enum psm2_chb_match_type {
+ /* Complete data found in a single chb */
+ PSMI_CUDA_FULL_MATCH_FOUND = 0,
+ /* Data is spread across two chb's */
+ PSMI_CUDA_SPLIT_MATCH_FOUND = 1,
+ /* Data is only partially prefetched */
+ PSMI_CUDA_PARTIAL_MATCH_FOUND = 2,
+ PSMI_CUDA_CONTINUE = 3
+};
+typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+
+#endif /* PSM_CUDA */
+#endif /* _PSMI_USER_H */
diff --git a/psm_utils.c b/psm_utils.c
new file mode 100644
index 0000000..37446a4
--- /dev/null
+++ b/psm_utils.c
@@ -0,0 +1,2553 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <netdb.h> /* gethostbyname */
+#include <malloc.h> /* malloc_usable_size */
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+struct psmi_epid_table psmi_epid_table;
+
+/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested
+ */
+/* Begin iterating the epid table, restricted to endpoint 'ep' (NULL for
+ * all endpoints).  Takes the table lock and HOLDS it until
+ * psmi_epid_itor_fini() -- callers must always pair init with fini. */
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep)
+{
+	itor->i = 0;
+	itor->ep = ep;
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+}
+
+/*
+ * Return the next live entry (skipping empty slots and EPADDR_DELETED
+ * tombstones, and entries of other endpoints when the iterator was
+ * bound to one), or NULL when the table is exhausted.  Relies on the
+ * table lock taken in psmi_epid_itor_init().
+ */
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor)
+{
+	int i;
+	struct psmi_epid_tabentry *e;
+
+	if (itor->i >= psmi_epid_table.tabsize)
+		return NULL;
+	for (i = itor->i; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+		if (!e->entry || e->entry == EPADDR_DELETED)
+			continue;
+		if (itor->ep && e->ep != itor->ep)
+			continue;
+		itor->i = i + 1;
+		return e->entry;
+	}
+	itor->i = psmi_epid_table.tabsize;	/* put at end of table */
+	return NULL;
+}
+
+/* End iteration: releases the table lock taken by psmi_epid_itor_init()
+ * and resets the cursor. */
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor)
+{
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	itor->i = 0;
+}
+
+/*
+ * 64-bit mixing step (appears to be Bob Jenkins' lookup8 mix64 --
+ * TODO confirm): scrambles a, b and c in place so every input bit
+ * influences the result.  Used only by hash_this() below; the caller
+ * reads the final value of its third argument.
+ */
+#define mix64(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>43); \
+	b -= c; b -= a; b ^= (a<<9);  \
+	c -= a; c -= b; c ^= (b>>8);  \
+	a -= b; a -= c; a ^= (c>>38); \
+	b -= c; b -= a; b ^= (a<<23); \
+	c -= a; c -= b; c ^= (b>>5);  \
+	a -= b; a -= c; a ^= (c>>35); \
+	b -= c; b -= a; b ^= (a<<49); \
+	c -= a; c -= b; c ^= (b>>11); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<18); \
+	c -= a; c -= b; c ^= (b>>22); \
+}
+
+/*
+ * Initialize the global epid hash table bookkeeping.  The table storage
+ * itself is allocated lazily by psmi_epid_add(); here we only reset the
+ * fields and create the recursive mutex that serializes table access.
+ *
+ * Always returns PSM2_OK.
+ *
+ * Fixes: the comma-operator statement is split into two plain
+ * assignments, and the stray ';' after the function's closing brace
+ * (an empty file-scope declaration) is removed.
+ */
+psm2_error_t psmi_epid_init()
+{
+	pthread_mutexattr_t attr;
+	psmi_epid_table.table = NULL;
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutex_init(&psmi_epid_table.tablock, &attr);
+	pthread_mutexattr_destroy(&attr);
+	return PSM2_OK;
+}
+
+/* Release the epid table storage and reset bookkeeping.  The table
+ * mutex is intentionally left alone (statically initialized storage).
+ * Always returns PSM2_OK. */
+psm2_error_t psmi_epid_fini()
+{
+	if (psmi_epid_table.table != NULL) {
+		psmi_free(psmi_epid_table.table);
+		psmi_epid_table.table = NULL;
+	}
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	return PSM2_OK;
+}
+
+/* Hash the (ep, epid) pair to a 64-bit key: the endpoint pointer and
+ * epid are mixed with a golden-ratio seed via mix64(); the scrambled
+ * third operand is the key. */
+PSMI_ALWAYS_INLINE(
+uint64_t
+hash_this(const psm2_ep_t ep, const psm2_epid_t epid))
+{
+	uint64_t ep_i = (uint64_t) (uintptr_t) ep;
+	uint64_t epid_i = (uint64_t) epid;
+	uint64_t hash = 0x9e3779b97f4a7c13LL;
+	mix64(ep_i, epid_i, hash);
+	return hash;
+}
+
+/*
+ * Core open-addressing lookup: linear-probe from key % tabsize until an
+ * empty slot (miss) or a matching key (hit).  EPADDR_DELETED tombstones
+ * keep probe chains intact across removals.  When 'remove' is set a hit
+ * is tombstoned instead of merely read.  Takes/releases the table lock;
+ * returns the stored entry or NULL.
+ *
+ * NOTE(review): a hit is decided by e->key == key alone (the 64-bit
+ * hash of (ep, epid)); distinct pairs colliding on the full hash would
+ * alias each other -- confirm this is an accepted risk.
+ */
+PSMI_ALWAYS_INLINE(
+void *
+psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove))
+{
+	uint64_t key = hash_this(ep, epid);
+	struct psmi_epid_tabentry *e;
+	void *entry = NULL;
+	int idx;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	if (!psmi_epid_table.table)
+		goto ret;
+	idx = (int)(key % psmi_epid_table.tabsize);
+	/* Probe loop terminates because psmi_epid_add() grows the table
+	 * before the load factor would let it fill completely. */
+	while (psmi_epid_table.table[idx].entry != NULL) {
+		/* An epid can be added twice if there's more than one opened endpoint,
+		 * but really we match on epid *and* on endpoint */
+		e = &psmi_epid_table.table[idx];
+		if (e->entry != EPADDR_DELETED && e->key == key) {
+			entry = e->entry;
+			if (remove)
+				psmi_epid_table.table[idx].entry =
+				    EPADDR_DELETED;
+			goto ret;
+		}
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+	}
+ret:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return entry;
+}
+
+/* Public lookup wrapper: find the entry for (ep, epid) without removing
+ * it.  Debug tracing is suppressed for the hostname pseudo-endpoint. */
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid)
+{
+	void *entry = psmi_epid_lookup_inner(ep, epid, 0);
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid,
+			  entry);
+	return entry;
+}
+
+/* Remove the entry for (ep, epid) from the table (tombstoning its slot)
+ * and return it, or NULL if absent.  Caller owns freeing the entry. */
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
+{
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid);
+	return psmi_epid_lookup_inner(ep, epid, 1);
+}
+
+/*
+ * Insert 'entry' for (ep, epid) into the hash table, growing and
+ * rehashing when the load factor would be exceeded.  Rehashing drops
+ * EPADDR_DELETED tombstones, reclaiming their slots.  Returns PSM2_OK
+ * or PSM2_NO_MEMORY.  Takes/releases the table lock.
+ *
+ * NOTE(review): tabsize_used is incremented before the allocation
+ * check and is not rolled back on the PSM2_NO_MEMORY path -- the
+ * counter drifts by one per failed add; confirm whether intentional.
+ */
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
+{
+	uint64_t key;
+	int idx, i, newsz;
+	struct psmi_epid_tabentry *e;
+	psm2_error_t err = PSM2_OK;
+
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid,
+			  entry);
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	/* Leave this here, mostly for sanity and for the fact that the epid
+	 * table is currently not used in the critical path */
+	if (++psmi_epid_table.tabsize_used >
+	    (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) {
+		struct psmi_epid_tabentry *newtab;
+		newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK;
+		newtab = (struct psmi_epid_tabentry *)
+		    psmi_calloc(ep, PER_PEER_ENDPOINT,
+				newsz, sizeof(struct psmi_epid_tabentry));
+		if (newtab == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		if (psmi_epid_table.table) {	/* rehash the table */
+			for (i = 0; i < psmi_epid_table.tabsize; i++) {
+				e = &psmi_epid_table.table[i];
+				if (e->entry == NULL)
+					continue;
+				/* When rehashing, mark deleted as free again */
+				if (e->entry == EPADDR_DELETED) {
+					psmi_epid_table.tabsize_used--;
+					continue;
+				}
+				idx = (int)(e->key % newsz);
+				while (newtab[idx].entry != NULL)
+					if (++idx == newsz)
+						idx = 0;
+				newtab[idx].entry = e->entry;
+				newtab[idx].key = e->key;
+				newtab[idx].ep = e->ep;
+				newtab[idx].epid = e->epid;
+			}
+			psmi_free(psmi_epid_table.table);
+		}
+		psmi_epid_table.table = newtab;
+		psmi_epid_table.tabsize = newsz;
+	}
+	key = hash_this(ep, epid);
+	idx = (int)(key % psmi_epid_table.tabsize);
+	e = &psmi_epid_table.table[idx];
+	/* First free or tombstoned slot on the probe chain takes the entry */
+	while (e->entry && e->entry != EPADDR_DELETED) {
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+		e = &psmi_epid_table.table[idx];
+	}
+	e->entry = entry;
+	e->key = key;
+	e->epid = epid;
+	e->ep = ep;
+
+fail:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return err;
+}
+
+/*
+ * Return the short (domain-stripped) hostname, cached in a static
+ * buffer on first use.
+ *
+ * XXX this will need a lock in a multi-threaded environment
+ *
+ * Fix: gethostname()'s return value was unchecked; on failure the
+ * buffer contents are unspecified, and the old code would cache and
+ * return that garbage forever.  We now reset the cache and return an
+ * empty string, so the next call retries.
+ */
+char *psmi_gethostname(void)
+{
+	/* XXX this will need a lock in a multi-threaded environment */
+	static char hostname[80] = { '\0' };
+	char *c;
+
+	if (hostname[0] == '\0') {
+		if (gethostname(hostname, sizeof(hostname)) != 0) {
+			hostname[0] = '\0';	/* don't cache garbage */
+			return hostname;
+		}
+		hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
+		if ((c = strchr(hostname, '.')))
+			*c = '\0';
+	}
+
+	return hostname;
+}
+
+/*
+ * Hostname stuff. We really only register the network portion of the epid
+ * since all epids from the same nid are assumed to have the same hostname.
+ */
+/*
+ * Register 'hostname' for network id 'nid' under the PSMI_EP_HOSTNAME
+ * pseudo-endpoint.  A NULL hostname is a no-op; an existing entry is
+ * kept unless 'overwrite' is set, in which case it is removed and
+ * freed first.  The stored copy is truncated to PSMI_EP_HOSTNAME_LEN.
+ *
+ * NOTE(review): if psmi_epid_add() fails, the freshly allocated copy
+ * 'h' is leaked -- consider freeing it on error.
+ */
+psm2_error_t
+psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite)
+{
+	size_t hlen;
+	char *h;
+	psm2_error_t err = PSM2_OK;
+
+	if (hostname == NULL)
+		return PSM2_OK;
+	/* First see if a hostname already exists */
+	if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) {
+		if (!overwrite)
+			return PSM2_OK;
+
+		h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid);
+		if (h != NULL)	/* free the previous hostname if so exists */
+			psmi_free(h);
+	}
+
+	hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1);
+	h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen);
+	if (h == NULL)
+		return PSM2_NO_MEMORY;
+	snprintf(h, hlen, "%s", hostname);
+	h[hlen - 1] = '\0';
+	err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h);
+	return err;
+}
+
+/* XXX These two functions are not thread safe, we'll use a rotating buffer
+ * trick whenever we need to make them thread safe */
+/*
+ * Return a printable host string for 'epid': the registered hostname if
+ * one exists, otherwise a synthesized "LID=lid:context.subcontext"
+ * string built in one of four rotating static buffers (so up to four
+ * results can be alive at once, e.g. in a single printf).  Not
+ * thread-safe (see note above).
+ */
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid)
+{
+	static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	uint64_t nid = psm2_epid_nid(epid);
+	char *h, *hostname;
+
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 4;
+
+	/* First, if we have registered a host for this epid, just return that, or
+	 * else try to return something with lid and context */
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid);
+	if (h != NULL)
+		return h;
+	else {
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d:%d.%d",
+			 (int)PSMI_EPID_GET_LID(epid),
+			 (int)PSMI_EPID_GET_CONTEXT(epid),
+			 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+		hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+		return hostname;
+	}
+}
+
+/* This one gives the hostname with a lid */
+/* Like psmi_epaddr_get_hostname(), but when a hostname is registered
+ * the result also carries the LID/context: "host (LID=l:c.s)".  Uses
+ * the same rotating-static-buffer scheme; not thread-safe. */
+const char *psmi_epaddr_get_name(psm2_epid_t epid)
+{
+	static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	char *h, *hostname;
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 4;
+
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid));
+	if (h == NULL)
+		return psmi_epaddr_get_hostname(epid);
+	else {
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1,
+			 "%s (LID=%d:%d.%d)", h,
+			 (int)PSMI_EPID_GET_LID(epid),
+			 (int)PSMI_EPID_GET_CONTEXT(epid),
+			 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+		hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+	}
+	return hostname;
+}
+
+/* Wrapper, in case we port to OS xyz that doesn't have sysconf */
+uintptr_t psmi_getpagesize(void)
+{
+ static uintptr_t pagesz = (uintptr_t) -1;
+ long sz;
+ if (pagesz != (uintptr_t) -1)
+ return pagesz;
+ sz = sysconf(_SC_PAGESIZE);
+ if (sz == -1) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Can't query system page size");
+ }
+
+ pagesz = (uintptr_t) sz;
+ return pagesz;
+}
+
/* If PSM2_VERBOSE_ENV is set in the environment, determine its verbosity
 * level once and cache it: 0 = unset/unparsable, 2 = extra verbose, any
 * other parsable integer = 1.  Returns non-zero when 'printlevel' is
 * within the cached verbosity. */
static int psmi_getenv_verblevel = -1;
static int psmi_getenv_is_verblevel(int printlevel)
{
	if (psmi_getenv_verblevel < 0) {
		const char *env = getenv("PSM2_VERBOSE_ENV");
		int level = 0;

		if (env != NULL && *env != '\0') {
			char *end;
			long parsed = strtol(env, &end, 0);

			if (end != env)	/* parsed something */
				level = (parsed == 2) ? 2 : 1;
		}
		psmi_getenv_verblevel = level;
	}
	return printlevel <= psmi_getenv_verblevel;
}
+
+#define GETENV_PRINTF(_level, _fmt, ...) \
+ do { \
+ int nlevel = _level; \
+ if (psmi_getenv_is_verblevel(nlevel)) \
+ nlevel = 0; \
+ _HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__); \
+ } while (0)
+
/*
 * Read environment variable 'name', parsed according to 'type', into
 * *newval.  'defval' is used when the variable is unset, empty, or fails
 * to parse.  'descr' and 'level' only affect the diagnostic echo through
 * GETENV_PRINTF (PSM2_VERBOSE_ENV).  Returns non-zero iff the default
 * was used, 0 when the environment supplied the value.
 */
int
MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
		      int type, union psmi_envvar_val defval,
		      union psmi_envvar_val *newval)
{
	int used_default = 0;
	union psmi_envvar_val tval;
	char *env = getenv(name);
#if _HFI_DEBUGGING
	/* FLAGS-typed variables are echoed in hexadecimal. */
	int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS ||
		     type == PSMI_ENVVAR_TYPE_UINT_FLAGS);
#endif

	/* If we're not using the default, always reset the print
	 * level to '1' so the changed value gets seen at low
	 * verbosity */
#define _GETENV_PRINT(used_default, fmt, val, defval) \
	do { \
		if (used_default) \
			GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt \
				"\n", level > 1 ? "*" : " ", name, \
				descr, ishex ? "0x" : " ", val); \
		else \
			GETENV_PRINTF(1, "%s%-25s %-40s =>%s" \
				fmt " (default was%s" fmt ")\n", \
				level > 1 ? "*" : " ", name, descr, \
				ishex ? " 0x" : " ", val, \
				ishex ? " 0x" : " ", defval); \
	} while (0)

	switch (type) {
	case PSMI_ENVVAR_TYPE_YESNO:
		/* Accept a leading y/Y or n/N; otherwise parse as an
		 * integer and collapse any non-zero value to 1. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else if (env[0] == 'Y' || env[0] == 'y')
			tval.e_int = 1;
		else if (env[0] == 'N' || env[0] == 'n')
			tval.e_int = 0;
		else {
			char *ep;
			tval.e_ulong = strtoul(env, &ep, 0);
			if (ep == env) {	/* nothing parsed */
				used_default = 1;
				tval = defval;
			} else if (tval.e_ulong != 0)
				tval.e_ulong = 1;
		}
		/* NOTE(review): reads tval.e_long although e_int/e_ulong was
		 * written above -- relies on union member aliasing of the
		 * small 0/1 values stored here. */
		_GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO",
			      defval.e_int ? "YES" : "NO");
		break;

	case PSMI_ENVVAR_TYPE_STR:
		/* Note: the caller receives a pointer into the process
		 * environment, not a copy. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else
			tval.e_str = env;
		_GETENV_PRINT(used_default, "%s", tval.e_str, defval.e_str);
		break;

	case PSMI_ENVVAR_TYPE_INT:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_int = (int)strtol(env, &ep, 0);
			if (ep == env) {	/* unparsable -> default */
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int);
		break;

	case PSMI_ENVVAR_TYPE_UINT:
	case PSMI_ENVVAR_TYPE_UINT_FLAGS:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			/* NOTE(review): stores into e_int rather than e_uint;
			 * same storage via the union, read back as e_uint. */
			tval.e_int = (unsigned int)strtoul(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS)
			_GETENV_PRINT(used_default, "%x", tval.e_uint,
				      defval.e_uint);
		else
			_GETENV_PRINT(used_default, "%u", tval.e_uint,
				      defval.e_uint);
		break;

	case PSMI_ENVVAR_TYPE_LONG:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_long = strtol(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long);
		break;
	case PSMI_ENVVAR_TYPE_ULONG_ULONG:
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_ulonglong =
			    (unsigned long long)strtoull(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		_GETENV_PRINT(used_default, "%llu",
			      tval.e_ulonglong, defval.e_ulonglong);
		break;
	case PSMI_ENVVAR_TYPE_ULONG:
	case PSMI_ENVVAR_TYPE_ULONG_FLAGS:
	default:
		/* Unknown types are treated as unsigned long. */
		if (!env || *env == '\0') {
			tval = defval;
			used_default = 1;
		} else {
			char *ep;
			tval.e_ulong = (unsigned long)strtoul(env, &ep, 0);
			if (ep == env) {
				used_default = 1;
				tval = defval;
			}
		}
		if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS)
			_GETENV_PRINT(used_default, "%lx", tval.e_ulong,
				      defval.e_ulong);
		else
			_GETENV_PRINT(used_default, "%lu", tval.e_ulong,
				      defval.e_ulong);
		break;
	}
#undef _GETENV_PRINT
	*newval = tval;

	return used_default;
}
MOCK_DEF_EPILOGUE(psmi_getenv);
+
+/*
+ * Parsing int parameters set in string tuples.
+ * Output array int *vals should be able to store 'ntup' elements.
+ * Values are only overwritten if they are parsed.
+ * Tuples are always separated by colons ':'
+ */
+int psmi_parse_str_tuples(const char *string, int ntup, int *vals)
+{
+ char *b = (char *)string;
+ char *e = b;
+ int tup_i = 0;
+ int n_parsed = 0;
+ char *buf = psmi_strdup(NULL, string);
+ psmi_assert_always(buf != NULL);
+
+ while (*e && tup_i < ntup) {
+ b = e;
+ while (*e && *e != ':')
+ e++;
+ if (e > b) { /* something to parse */
+ char *ep;
+ int len = e - b;
+ long int l;
+ strncpy(buf, b, len);
+ buf[len] = '\0';
+ l = strtol(buf, &ep, 0);
+ if (ep != buf) { /* successful conversion */
+ vals[tup_i] = (int)l;
+ n_parsed++;
+ }
+ }
+ if (*e == ':')
+ e++; /* skip delimiter */
+ tup_i++;
+ }
+ psmi_free(buf);
+ return n_parsed;
+}
+
+/*
+ * Memory footprint/usage mode.
+ *
+ * This can be used for debug or for separating large installations from
+ * small/medium ones. The default is to assume a medium installation. Large
+ * is not that much larger in memory footprint, but we make a conscious effort
+ * an consuming only the amount of memory we need.
+ */
+int psmi_parse_memmode(void)
+{
+ union psmi_envvar_val env_mmode;
+ int used_default =
+ psmi_getenv("PSM2_MEMORY", "Memory usage mode (normal or large)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)"normal", &env_mmode);
+ if (used_default || !strcasecmp(env_mmode.e_str, "normal"))
+ return PSMI_MEMMODE_NORMAL;
+ else if (!strcasecmp(env_mmode.e_str, "min"))
+ return PSMI_MEMMODE_MINIMAL;
+ else if (!strcasecmp(env_mmode.e_str, "large") ||
+ !strcasecmp(env_mmode.e_str, "big"))
+ return PSMI_MEMMODE_LARGE;
+ else {
+ _HFI_PRDBG("PSM2_MEMORY env value %s unrecognized, "
+ "using 'normal' memory mode instead\n",
+ env_mmode.e_str);
+ return PSMI_MEMMODE_NORMAL;
+ }
+}
+
+static
+const char *psmi_memmode_string(int mode)
+{
+ psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM);
+ switch (mode) {
+ case PSMI_MEMMODE_NORMAL:
+ return "normal";
+ case PSMI_MEMMODE_MINIMAL:
+ return "minimal";
+ case PSMI_MEMMODE_LARGE:
+ return "large";
+ default:
+ return "unknown";
+ }
+}
+
+psm2_error_t
+psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+ const struct psmi_rlimit_mpool *rlim,
+ uint32_t *valo, uint32_t *chunkszo)
+{
+ uint32_t val;
+ const char *env = rlim->env;
+ int mode = mq->memmode;
+ psm2_error_t err = PSM2_OK;
+ union psmi_envvar_val env_val;
+
+ psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL
+ && mode < PSMI_MEMMODE_NUM);
+
+ psmi_getenv(rlim->env, rlim->descr, rlim->env_level,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val);
+
+ val = env_val.e_uint;
+ if (val < rlim->minval || val > rlim->maxval) {
+ err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Env. var %s=%u is invalid (valid settings in mode PSM2_MEMORY=%s"
+ " are inclusively between %u and %u)",
+ env, val, psmi_memmode_string(mode),
+ rlim->minval, rlim->maxval);
+ goto fail;
+ }
+
+ _HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n",
+ env, val, rlim->mode[mode].obj_chunk,
+ psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval);
+
+ *valo = val;
+ *chunkszo = rlim->mode[mode].obj_chunk;
+
+fail:
+ return err;
+}
+
/*
 * Convert a relative timeout in nanoseconds into the number of
 * timestamp-counter cycles still remaining, measured from 'start_cycles'.
 * Returns 0 when the timeout is negative or already expired, and ~0ULL
 * ("wait forever") when timeout_ns is 0.
 *
 * NOTE(review): the `timeout_ns == ~0ULL` comparison can never be true
 * here -- timeout_ns is signed, so -1 is caught by the `< 0` test first.
 * Preserved as-is to keep behavior identical to upstream.
 */
uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns)
{
	if (timeout_ns < 0)
		return 0ULL;
	else if (timeout_ns == 0ULL || timeout_ns == ~0ULL)
		return ~0ULL;
	else {
		uint64_t t_end = nanosecs_to_cycles(timeout_ns);
		uint64_t t_now = get_cycles() - start_cycles;

		if (t_now >= t_end)
			return 0ULL;
		else
			return (t_end - t_now);
	}
}
+
+uint32_t psmi_get_ipv4addr()
+{
+ struct hostent *he;
+ uint32_t addr = 0;
+
+ he = gethostbyname(psmi_gethostname());
+ if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) {
+ memcpy(&addr, he->h_addr, sizeof(uint32_t));
+ return addr;
+ } else
+ return 0;
+}
+
/* True when 'ptr' is a real endpoint pointer rather than one of the
 * special sentinel values at/above PSMI_EP_LOGEVENT. */
#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT)

/*
 * Emit a syslog message (optionally echoed to the console) tagged "PSM".
 * The first time a given endpoint logs through here, a one-shot context
 * line (uuid, unit, context, subcontext) is emitted so later messages can
 * be correlated with the endpoint.
 */
void
psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...)
{
	va_list ap;

	/* If we've never syslogged anything from this ep at the PSM level, make
	 * sure we log context information */
	if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) {
		char uuid_str[64];
		/* hfi != 0 when a HW context is attached; otherwise the
		 * unit/context fields are logged as -1. */
		int hfi = ep->context.ctrl != NULL;
		ep->did_syslog = 1;

		memset(&uuid_str, 0, sizeof(uuid_str));
		psmi_uuid_unparse(ep->uuid, uuid_str);
		hfi_syslog("PSM", 0, LOG_WARNING,
			   "uuid_key=%s,unit=%d,context=%d,subcontext=%d",
			   uuid_str,
			   hfi ? ep->context.ctrl->ctxt_info.unit : -1,
			   hfi ? ep->context.ctrl->ctxt_info.ctxt : -1,
			   hfi ? ep->context.ctrl->ctxt_info.subctxt : -1);
	}

	va_start(ap, format);
	hfi_vsyslog("PSM", to_console, level, format, ap);
	va_end(ap);
}
+
/* Table of CRCs of all 8-bit messages (CRC-32, reflected polynomial
 * 0xedb88320, as in zlib's sample code). */
static uint32_t crc_table[256];

/* Flag: has the table been computed? Initially false. */
static int crc_table_computed;

/* Lazily build the 256-entry lookup table for the byte-at-a-time CRC. */
static void make_crc_table(void)
{
	int n;

	for (n = 0; n < 256; n++) {
		uint32_t c = (uint32_t) n;
		int k;

		for (k = 0; k < 8; k++)
			c = (c & 1) ? (0xedb88320 ^ (c >> 1)) : (c >> 1);
		crc_table[n] = c;
	}
	crc_table_computed = 1;
}

/* Update a running CRC with the bytes buf[0..len-1].  The running CRC
 * must start as all 1's, and the transmitted value is the 1's complement
 * of the final running CRC (see psmi_crc() below). */
static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len)
{
	uint32_t c = crc;
	int n;

	if (!crc_table_computed)
		make_crc_table();
	for (n = 0; n < len; n++)
		c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
	return c;
}

/* Return the CRC of the bytes buf[0..len-1]. */
uint32_t psmi_crc(unsigned char *buf, int len)
{
	return update_crc(0xffffffff, buf, len) ^ 0xffffffff;
}
+
/* Return the HFI type being used for a context.  Only first-generation
 * Omni-Path (OPA1) is supported in this code base, so the 'context'
 * argument is currently unused and the constant is always returned. */
uint32_t psmi_get_hfi_type(const psmi_context_t *context)
{
	return PSMI_HFI_TYPE_OPA1;
}
+
#define PSMI_FAULTINJ_SPEC_NAMELEN 32
/* One named fault-injection point.  Faults fire pseudo-randomly based on
 * num/denom (see psmi_faultinj_is_fault()); num_faults/num_calls record
 * what actually happened for the end-of-run report. */
struct psmi_faultinj_spec {
	STAILQ_ENTRY(psmi_faultinj_spec) next;	/* list linkage */
	char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN];	/* NUL-terminated name */

	unsigned long long num_faults;	/* faults actually injected */
	unsigned long long num_calls;	/* times this point was evaluated */

	struct drand48_data drand48_data;	/* private PRNG stream */
	int num;	/* numerator of the fault ratio */
	int denom;	/* denominator of the fault ratio */

};
+
+int psmi_multi_ep_enabled = 0;
+void psmi_multi_ep_init()
+{
+ union psmi_envvar_val env_fi;
+
+ psmi_getenv("PSM2_MULTI_EP", "PSM2 Multiple Endpoints (yes/no)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+ PSMI_ENVVAR_VAL_NO, &env_fi);
+
+ psmi_multi_ep_enabled = env_fi.e_uint;
+}
+
+int psmi_faultinj_enabled = 0;
+int psmi_faultinj_verbose = 0;
+char *psmi_faultinj_outfile = NULL;
+
+static struct psmi_faultinj_spec psmi_faultinj_dummy;
+static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head =
+STAILQ_HEAD_INITIALIZER(psmi_faultinj_head);
+
+void psmi_faultinj_init()
+{
+ union psmi_envvar_val env_fi;
+
+ psmi_getenv("PSM2_FI", "PSM Fault Injection (yes/no)",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO,
+ PSMI_ENVVAR_VAL_NO, &env_fi);
+
+ psmi_faultinj_enabled = !!env_fi.e_uint;
+
+ if (psmi_faultinj_enabled) {
+ char *def = NULL;
+ if (!psmi_getenv
+ ("PSM2_FI_TRACEFILE", "PSM Fault Injection output file",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)def, &env_fi)) {
+ psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str);
+ }
+ }
+
+ return;
+}
+
+void psmi_faultinj_fini()
+{
+ struct psmi_faultinj_spec *fi;
+ FILE *fp;
+ int do_fclose = 0;
+
+ if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL)
+ return;
+
+ if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0)
+ fp = stdout;
+ else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0)
+ fp = stderr;
+ else {
+ char *c = psmi_faultinj_outfile;
+ char buf[192];
+ int append = 0;
+ if (*c == '+') {
+ append = 1;
+ ++c;
+ }
+ do_fclose = 1;
+ snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel());
+ buf[sizeof(buf) - 1] = '\0';
+ fp = fopen(buf, append ? "a" : "w");
+ }
+
+ if (fp != NULL) {
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ fprintf(fp, "%s:%s PSM2_FI_%-12s %2.3f%% => "
+ "%2.3f%% %10lld faults/%10lld events\n",
+ __progname, hfi_get_mylabel(), fi->spec_name,
+ (double)fi->num * 100.0 / fi->denom,
+ (double)fi->num_faults * 100.0 / fi->num_calls,
+ fi->num_faults, fi->num_calls);
+ }
+ fflush(fp);
+ if (do_fclose)
+ fclose(fp);
+ }
+
+ psmi_free(psmi_faultinj_outfile);
+ return;
+}
+
+/*
+ * Intended to be used only once, not in the critical path
+ */
+struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, int num,
+ int denom)
+{
+ struct psmi_faultinj_spec *fi;
+
+ if (!psmi_faultinj_enabled)
+ return &psmi_faultinj_dummy;
+
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ if (strcmp(fi->spec_name, spec_name) == 0)
+ return fi;
+ }
+
+ /* We got here, so no spec -- allocate one */
+ fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ sizeof(struct psmi_faultinj_spec));
+ psmi_assert_always(fi != NULL);
+ strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1);
+ fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0';
+ fi->num = num;
+ fi->denom = denom;
+ fi->num_faults = 0;
+ fi->num_calls = 0;
+
+ /*
+ * See if we get a hint from the environment.
+ * Format is
+ * <num:denom:initial_seed>
+ *
+ * By default, we chose the initial seed to be the 'pid'. If users need
+ * repeatability, they should set initial_seed to be the 'pid' when the
+ * error was observed or force the initial_seed to be a constant number in
+ * each running process. Using 'pid' is useful because core dumps store
+ * pids and our backtrace format does as well so if a crash is observed for
+ * a specific seed, programs can reuse the 'pid' to regenerate the same
+ * error condition.
+ */
+ {
+ int fvals[3] = { num, denom, (int)getpid() };
+ union psmi_envvar_val env_fi;
+ char fvals_str[128];
+ char fname[128];
+ char fdesc[256];
+
+ snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num,
+ denom);
+ fvals_str[sizeof(fvals_str) - 1] = '\0';
+ snprintf(fname, sizeof(fname) - 1, "PSM2_FI_%s", spec_name);
+ fname[sizeof(fname) - 1] = '\0';
+ snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection %s <%s>",
+ fname, fvals_str);
+
+ if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)fvals_str, &env_fi)) {
+ /* not using default values */
+ int n_parsed =
+ psmi_parse_str_tuples(env_fi.e_str, 3, fvals);
+ if (n_parsed >= 1)
+ fi->num = fvals[0];
+ if (n_parsed >= 2)
+ fi->denom = fvals[1];
+ if (n_parsed >= 3)
+ srand48_r((long int) fvals[2], &fi->drand48_data);
+ }
+ }
+
+ STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next);
+ return fi;
+}
+
/* Decide whether the given fault-injection point should fire this time.
 * Draws from the spec's private PRNG stream so runs are reproducible for
 * a given seed.  Never fires when fault injection is disabled or the
 * spec's numerator is 0.
 *
 * NOTE(review): the comparison uses `<=`, so the effective fault rate is
 * (num + 1)/denom rather than num/denom, and a zero 'denom' would divide
 * by zero.  Both quirks are preserved as upstream behavior. */
int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
{
	if (!psmi_faultinj_enabled)	/* never fault if disabled */
		return 0;
	if (fi->num == 0)
		return 0;

	fi->num_calls++;
	long int rnum;
	lrand48_r(&fi->drand48_data, &rnum);
	if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) {
		fi->num_faults++;
		return 1;
	} else
		return 0;
}
+
+/* For memory allocation, we kind of break the PSM error handling rules.
+ * If the caller gets NULL, it has to assume that the error has been handled
+ * and should always return PSM2_NO_MEMORY */
+
+/*
+ * Log memory increments or decrements of type memstats_t.
+ */
/* Hidden header prepended to tracked allocations when memory statistics
 * are enabled.  'magic' (always 0x8c) catches frees of untracked
 * pointers; 'original_allocation' records the pointer actually returned
 * by malloc/posix_memalign so free can undo any alignment offset. */
struct psmi_memtype_hdr {
	struct {
		uint64_t size:48;	/* allocation size incl. this header */
		uint64_t magic:8;	/* always 0x8c */
		uint64_t type:8;	/* psmi_memtype_t of the allocation */
	};
	void *original_allocation;
};

/* Global running-total / high-water-mark byte counters per memtype. */
struct psmi_stats_malloc psmi_stats_memory;
+
/* Account 'nbytes' (positive on allocation, negative on free) against the
 * per-memtype running total and high-water mark, and against the "all"
 * aggregate.  TOTAL-typed calls update only the aggregate.
 *
 * NOTE(review): the trailing 'psmi_stats_memory.m_all_max++' bumps the
 * aggregate high-water mark on every call, which looks suspicious, but it
 * is preserved from upstream. */
void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes)
{
/* Multi-statement macro deliberately left unwrapped (no do{}while(0));
 * each use below is a complete statement sequence within its case. */
#define _add_max_total(type, nbytes) \
	psmi_stats_memory.m_ ## type ## _total += (nbytes); \
	psmi_stats_memory.m_ ## type ## _max = max( \
		psmi_stats_memory.m_ ## type ## _total, \
		psmi_stats_memory.m_ ## type ## _max);

	switch (type) {
	case PER_PEER_ENDPOINT:
		_add_max_total(perpeer, nbytes);
		break;
	case NETWORK_BUFFERS:
		_add_max_total(netbufs, nbytes);
		break;
	case DESCRIPTORS:
		_add_max_total(descriptors, nbytes);
		break;
	case UNEXPECTED_BUFFERS:
		_add_max_total(unexpbufs, nbytes);
		break;
	case STATS:
		_add_max_total(stats, nbytes);
		break;
	case UNDEFINED:
		_add_max_total(undefined, nbytes);
		break;
	default:
		psmi_assert_always(type == TOTAL);
		break;
	}
	_add_max_total(all, nbytes);
	psmi_stats_memory.m_all_max++;
#undef _add_max_total

	return;
}
+
+// Memory stats will only be collected under debug builds
+
+#ifdef PSM_DEBUG
+#define psmi_stats_mask PSMI_STATSTYPE_MEMORY
+#else
+#define psmi_stats_mask 0
+#endif
+
+#ifdef malloc
+#undef malloc
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* PSM HEAP DEBUG documentation:
+
+ In the following code, the acronym: 'HD' is short for "Heap Debug".
+
+ Each actual heap allocation will have a header and a trailer surrounding it,
+ and the header itself may have some vacant space preceding it due to alignment
+ needs:
+
+ 0. This area is the actual return value of posix_memalign and is due to
+ alignment requirements. (This area does not exist for heap allocations
+ from malloc()).
+ 1. HD HEADER
+ 2. Actual allocation
+ 3. HD TRAILER
+
+ malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code,
+ then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to
+ the caller. Thereafter, the HD code will inspect areas 1 and 3 of all heap
+ allocations to make sure they have retained their integrity.
+
+ Surrounding the actual allocation like this enables:
+
+ 1. Checking for heap overrun / underrun of all allocations.
+ 2. Checking for double frees.
+ 3. Use of an area that has been freed.
+ 4. Identifying orphaned heap allocations.
+
+Constant no-mans-land written to areas that no-one should be writing to:
+
+ */
+
/* Byte value scribbled into guard areas and freed allocations; any read
 * of a live value equal to this in a guard region indicates corruption. */
#define HD_NO_MANS_LAND -15

/* The following is the declaration of the HD header. */

/* Heap debug header magic number type: */
typedef char HD_Hdr_Magic_Type[8];

typedef struct HD_Header_Struct
{
	HD_Hdr_Magic_Type magic1;	/* Magic number to ensure this
					   allocation has integrity.
					   (guards against heap
					   overrun from above). */
	const char *allocLoc;		/* Source file name/line
					   number where this heap
					   allocation was made. */
	const char *freeLoc;		/* Source filename/line number
					   where this heap allocation
					   was freed. */
	struct HD_Header_Struct *nextHD_header;	/* Creates a singly-linked
						   list of all heap
						   allocations. */
	uint64_t sizeOfAlloc;		/* size of this heap
					   allocation. */
	void *systemAlloc;		/* The actual return value
					   from malloc()/posix_memalign(). */
	uint64_t systemAllocSize;	/* The size that is actually allocated
					   by malloc()/posix_memalign(). */
	HD_Hdr_Magic_Type magic2;	/* Second magic number to
					   ensure this allocation
					   has integrity.
					   (guards against heap
					   underrun from the actual
					   allocation that follows). */
} __attribute__ ((packed)) HD_Header_Type;

/* Node of the freed-allocations list; freed blocks are kept (never
 * returned to the system) so use-after-free can be detected. */
typedef struct HD_free_list_struct
{
	HD_Header_Type *freedStuct;
	struct HD_free_list_struct *next_free_struct;
} HD_Free_Struct_Type;

static HD_Free_Struct_Type *HD_free_list_root = NULL;
static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root;

typedef char HD_Trlr_Magic_Type[16];

/* String literals are shorter than their array types; the remainder is
 * zero-filled and compared as part of the magic via memcmp(). */
static const HD_Hdr_Magic_Type HD_HDR_MGC_1 = "Eric";
static const HD_Hdr_Magic_Type HD_HDR_MGC_2 = "Emily";
static const HD_Trlr_Magic_Type HD_TRLR_MGC = "Erin&Elaine";
+
+/* Convert a pointer of an actual allocation to a pointer to its HD header: */
+static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa)
+{
+ char *p = (char*)aa;
+ return (HD_Header_Type*)(p - sizeof(HD_Header_Type));
+}
+
+/* Convert a pointer to an HD header to the actual allocation: */
+static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr)
+{
+ char *p = (char*)phdHdr;
+ return p + sizeof(HD_Header_Type);
+}
+
+/* Get the address of the trailer that follows the actual allocation: */
+static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr)
+{
+ char *p = (char*)HD_HDR_TO_AA(phdr);
+ return p + phdr->sizeOfAlloc;
+}
+
static HD_Header_Type * HD_root_of_list = NULL; /* Root of singly linked list
						   of all heap allocations */
static HD_Header_Type **HD_end_of_list = &HD_root_of_list; /* Pointer to the
	last pointer of the singly linked list of all heap allocations. */

/* Number of allocations in the list. Maintained to assert the integrity
   of the singly linked list of heap allocations. */
static int n_allocations = 0;

/* HD_check_one_struct() checks one heap allocation for integrity:
 * header/trailer magics, the alignment gap preceding the header, and --
 * when 'checkAA' is set (freed blocks) -- that the entire body still
 * holds the HD_NO_MANS_LAND scribble (detects use after free).
 * 'curloc' is only used in the diagnostic printed on failure. */
static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc)
{
	/* First check the magic values in the header and trailer: */
	psmi_assert_always(0 == memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)));
	psmi_assert_always(0 == memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)));
	psmi_assert_always(0 == memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)));

	/* Next, check the area between systemAlloc and the start of the header */
	signed char *pchr = (signed char *)p->systemAlloc;
	while (pchr < (signed char*)p)
	{
		psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND);
		pchr++;
	}

	/* Lastly, check the actual allocation area if directed to do so: */
	if (checkAA)
	{
		uint64_t i;
		signed char *pchr = HD_HDR_TO_AA(p);
		for (i=0;i < p->sizeOfAlloc;i++)
			if (pchr[i] != (signed char) HD_NO_MANS_LAND)
			{
				fprintf(stderr,
					"use after free; ptr: %p,\n"
					"   allocated from: %s,\n"
					"   validated from: %s\n"
					"   freed from: %s\n",
					pchr+i,p->allocLoc,curloc,p->freeLoc);
				fflush(0);
				psmi_assert_always(0);
			}
	}
}
+
+/* _HD_validate_heap_allocations() walks the singly linked list and inspects all
+ * heap allocations to ensure all of them have integrity still. */
+void _HD_validate_heap_allocations(const char *curloc)
+{
+ /* first check current allocation list: */
+ HD_Header_Type *p = HD_root_of_list;
+ int cnt = 0;
+
+ while (p)
+ {
+ HD_check_one_struct(p,0,curloc);
+ p = p->nextHD_header;
+ cnt++;
+ }
+ psmi_assert_always(cnt == n_allocations);
+ /* Next check free list */
+ HD_Free_Struct_Type *pfreestruct = HD_free_list_root;
+ while (pfreestruct)
+ {
+ HD_check_one_struct(pfreestruct->freedStuct,1,curloc);
+ pfreestruct = pfreestruct->next_free_struct;
+ }
+}
+
/* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds
 * the header and trailer to the allocation. Lastly, it validates the existing singly-linked
 * list for integrity. */
static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc,
			    void *systemAlloc,
			    uint64_t systemSize,
			    uint64_t actualSize,
			    const char *curloc)
{
#if 0
	/* if we use this block of code, psm hangs running mpistress. See JIRA STL-5244. */
	memset(systemAlloc,HD_NO_MANS_LAND,systemSize);
#else
	/* write HD_NO_MANS_LAND to the area between the system allocation and the start of the hd header. */
	signed char *pchr = systemAlloc;
	for (;pchr < (signed char*) hd_alloc;pchr++)
		*pchr = (signed char) HD_NO_MANS_LAND;
#endif
	/* Write the HD header info: */
	memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1));
	hd_alloc->allocLoc = curloc;	/* caller's file:line for reports */
	hd_alloc->freeLoc = NULL;	/* set by hd_free() when freed */
	hd_alloc->nextHD_header = NULL;
	hd_alloc->sizeOfAlloc = actualSize;
	hd_alloc->systemAlloc = systemAlloc;
	hd_alloc->systemAllocSize = systemSize;
	memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2));
	memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC));
	/* Append to the singly linked list of live allocations. */
	*HD_end_of_list = hd_alloc;
	HD_end_of_list = &hd_alloc->nextHD_header;
	n_allocations++;
	HD_validate_heap_allocations();
}
+
+/* hd_malloc() is the heap debug version of malloc that will create the header and trailer
+ * and link the allocation into the singly linked list. */
+static inline void *hd_malloc(size_t sz, const char *curloc)
+{
+ const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC);
+ HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize);
+
+ hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc);
+ return HD_HDR_TO_AA(hd_alloc);
+}
+
/* hd_memalign() is the heap debug version of posix_memalign().  It
 * over-allocates by the header, trailer and up to (alignment-1) bytes so
 * that the HD header can sit immediately before an aligned payload;
 * the gap below the header is filled with HD_NO_MANS_LAND by
 * hd_est_hdr_trlr().  Returns posix_memalign()'s result code. */
static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc)
{
	void *systemAlloc = NULL;
	const uint64_t alignMask = alignment - 1;	/* alignment must be a power of 2 */
	uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC);
	int rv = posix_memalign(&systemAlloc,alignment,systemSize);
	char *actualAlloc = NULL;
	const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize;

	if (rv)
		return rv;

	/* Round up past the header to the next 'alignment' boundary; the
	 * payload starts there and the header ends exactly at it. */
	uint64_t actualAllocu64 = (uint64_t) systemAlloc;
	actualAllocu64 += sizeof(HD_Header_Type) + alignMask;
	actualAllocu64 &= ~ alignMask;
	actualAlloc = (char*)actualAllocu64;
	psmi_assert_always((actualAllocu64 & alignMask) == 0);
	psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc);
	psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type));

	hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc);
	*ptr = actualAlloc;
	return rv;
}
+
/* hd_free() is the heap debug version of free(). First, hd_free() ensures that the ptr to be
 * freed in fact is known by the HD code. Next, hd_free() removes the ptr from the list. Then,
 * hd_free scribbles to the ptr's area and -- rather than returning it to the
 * system -- parks it on the free list so later use-after-free can be caught. */
static inline void hd_free(void *ptr,const char *curloc)
{
	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
	HD_Header_Type *p = HD_root_of_list, *q = NULL;

	HD_validate_heap_allocations();
	/* Linear search of the live list; q trails p for unlinking. */
	while (p)
	{
		if (p == hd_alloc)
		{
			/* first, fix the next pointers: */
			if (q)
			{
				q->nextHD_header = p->nextHD_header;
			}
			else
			{
				psmi_assert_always(p == HD_root_of_list);
				HD_root_of_list = p->nextHD_header;
			}
			/* Now, handle the case of removing the last entry in the list. */
			if (&p->nextHD_header == HD_end_of_list)
			{
				if (q)
				{
					q->nextHD_header = NULL;
					HD_end_of_list = &q->nextHD_header;
				}
				else
				{
					HD_root_of_list = NULL;
					HD_end_of_list = &HD_root_of_list;
				}
			}
			/* Scribble to the actual allocation to make further access to the heap
			   area unusable. */
			n_allocations--;
			memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc);
			hd_alloc->freeLoc = curloc;	/* recorded for use-after-free reports */
			/* Add this allocation to the free list.
			 * NOTE(review): this malloc() result is not checked;
			 * on OOM the deref below would crash -- TODO confirm
			 * whether that is acceptable for a debug-only build. */
			HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type));
			*HD_free_list_bottom = pfreestruct;
			HD_free_list_bottom = &pfreestruct->next_free_struct;
			pfreestruct->freedStuct = hd_alloc;
			pfreestruct->next_free_struct = NULL;
			HD_validate_heap_allocations();
			return;
		}
		q = p;
		p = p->nextHD_header;
	}
	/* trying to free a heap allocation that we did not allocate. */
	psmi_assert_always(0);
}
+
/* Heap debug version of malloc_usable_size().  'curloc' is unused.
 * NOTE(review): returns systemAllocSize, which includes the HD header,
 * trailer and alignment slack -- larger than the caller-usable payload;
 * psmi_realloc_internal() relies on it only as an upper bound. */
size_t hd_malloc_usable_size(void *ptr,const char *curloc)
{
	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
	return hd_alloc->systemAllocSize;
}
+
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* For HD code, we retarget the malloc, memaligh and free calls to the hd versions
+ * of the code. */
+
+#define my_malloc(SZ,CURLOC) hd_malloc(SZ,CURLOC)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC) hd_memalign(PTR,ALIGN,SZ,CURLOC)
+#define my_free(PTR,CURLOC) hd_free(PTR,CURLOC)
+#define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC)
+
+#else
+
+/* For non-HD code, we target the code to the usual functions: */
+#define my_malloc(SZ,CURLOC) malloc(SZ)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC) posix_memalign(PTR,ALIGN,SZ)
+#define my_free(PTR,CURLOC) free(PTR)
+#define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR)
+
+#endif
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+ size_t sz, const char *curloc)
+{
+ size_t newsz = sz;
+ void *newa;
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+ newsz += sizeof(struct psmi_memtype_hdr);
+
+ newa = my_malloc(newsz,curloc);
+ if (newa == NULL) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Out of memory for malloc at %s", curloc);
+ return NULL;
+ }
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+ struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa;
+ hdr->size = newsz;
+ hdr->type = type;
+ hdr->magic = 0x8c;
+ hdr->original_allocation = newa;
+ psmi_log_memstats(type, newsz);
+ newa = (void *)(hdr + 1);
+ /* _HFI_INFO("alloc is %p\n", newa); */
+ }
+ return newa;
+}
+
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+ void *ptr, size_t nsz, const char *curloc)
+{
+ if (ptr)
+ {
+ size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc);
+ if (nsz > existingSize)
+ {
+ void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc);
+
+ memcpy(newPtr,ptr,existingSize);
+ psmi_free_internal(ptr,curloc);
+ return newPtr;
+ }
+ else
+ /* We will not support shrinking virtual space
+ for performance reasons. */
+ return ptr;
+ }
+ else
+ return psmi_malloc_internal(ep,type,nsz,curloc);
+}
+
+#ifdef memalign
+#undef memalign
+#endif
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type,
+ size_t alignment, size_t sz, const char *curloc)
+{
+ size_t newsz = sz;
+ void *newa;
+ int ret, preambleSize = 0;
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+ {
+ if (sizeof(struct psmi_memtype_hdr) > alignment)
+ {
+ int n = sizeof(struct psmi_memtype_hdr) / alignment;
+ int r = sizeof(struct psmi_memtype_hdr) % alignment;
+ if (r)
+ n++;
+ preambleSize = n * alignment;
+ }
+ else
+ preambleSize = alignment;
+ newsz += preambleSize;
+ }
+
+ ret = my_memalign(&newa, alignment, newsz, curloc);
+ if (ret) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Out of memory for malloc at %s", curloc);
+ return NULL;
+ }
+
+ if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+ void *rv = newa + preambleSize;
+ struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)(rv-sizeof(struct psmi_memtype_hdr));
+ hdr->size = newsz;
+ hdr->type = type;
+ hdr->magic = 0x8c;
+ hdr->original_allocation = newa;
+ psmi_log_memstats(type, newsz);
+ newa = rv;
+ /* _HFI_INFO("alloc is %p\n", newa); */
+ }
+ return newa;
+}
+
+#ifdef calloc
+#undef calloc
+#endif
+
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem,
+ size_t elemsz, const char *curloc)
+{
+ void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc);
+ if (newa == NULL) /* error handled above */
+ return NULL;
+ memset(newa, 0, nelem * elemsz);
+ return newa;
+}
+
+#ifdef strdup
+#undef strdup
+#endif
+
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc)
+{
+ size_t len = strlen(string) + 1;
+ void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc);
+ if (newa == NULL)
+ return NULL;
+ memcpy(newa, string, len); /* copy with \0 */
+ return newa;
+}
+
#ifdef free
#undef free
#endif

/* free() replacement for memory obtained from the psmi_*alloc_internal
   family.  When memory statistics are enabled, the bookkeeping header
   stored directly in front of 'ptr' is used to update the stats and to
   recover the original (possibly padded) allocation before releasing it. */
void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc)
{
	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
		/* The header sits immediately before the user pointer
		   (see psmi_memalign_internal, which writes it there). */
		struct psmi_memtype_hdr *hdr =
		    (struct psmi_memtype_hdr *)ptr - 1;
		/* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */
		psmi_memtype_t type = hdr->type;
		int64_t size = hdr->size;
		int magic = (int)hdr->magic;
		psmi_log_memstats(type, -size);
		/* 0x8c is written at allocation time; a mismatch means
		   'ptr' did not come from the psmi allocator or the
		   header was corrupted. */
		psmi_assert_always(magic == 0x8c);
		ptr = hdr->original_allocation;
	}
	my_free(ptr,curloc);
}
MOCK_DEF_EPILOGUE(psmi_free_internal);
+
#ifdef malloc_usable_size
#undef malloc_usable_size
#endif

/* malloc_usable_size() replacement: reports the usable size of an
   allocation made through the psmi allocation wrappers. */
size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc)
{
	const size_t usable = my_malloc_usable_size(ptr, curLoc);
	return usable;
}
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_coreopt_ctl(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen, int get))
+{
+ psm2_error_t err = PSM2_OK;
+
+ switch (optname) {
+ case PSM2_CORE_OPT_DEBUG:
+ /* Sanity check length */
+ if (*optlen < sizeof(unsigned)) {
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Option value length error");
+ *optlen = sizeof(unsigned);
+ return err;
+ }
+
+ if (get) {
+ *((unsigned *)optval) = hfi_debug;
+ } else
+ hfi_debug = *(unsigned *)optval;
+ break;
+ case PSM2_CORE_OPT_EP_CTXT:
+ {
+ /* core object is epaddr */
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+
+ /* Sanity check epaddr */
+ if (!epaddr) {
+ return psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Invalid endpoint address");
+ }
+
+ /* Sanity check length */
+ if (*optlen < sizeof(unsigned long)) {
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Option value length error");
+ *optlen = sizeof(void *);
+ return err;
+ }
+
+ if (get) {
+ *((unsigned long *)optval) =
+ (unsigned long)epaddr->usr_ep_ctxt;
+ } else
+ epaddr->usr_ep_ctxt = optval;
+ }
+ break;
+ default:
+ /* Unknown/unrecognized option */
+ err = psmi_handle_error(NULL,
+ PSM2_PARAM_ERR,
+ "Unknown PSM2_CORE option %u.",
+ optname);
+ break;
+ }
+ return err;
+}
+
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1);
+}
+
+/* PSM AM component option handling */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_amopt_ctl(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen, int get))
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* AM object is a psm2_epaddr (or NULL for global minimum sz) */
+ /* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */
+
+ /* All AM options are read-only. */
+ if (!get) {
+ return err =
+ psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY,
+ "Attempted to set read-only option value");
+ }
+
+ /* Sanity check length -- all AM options are uint32_t. */
+ if (*optlen < sizeof(uint32_t)) {
+ *optlen = sizeof(uint32_t);
+ return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR,
+ "Option value length error");
+ }
+
+ switch (optname) {
+ case PSM2_AM_OPT_FRAG_SZ:
+ *((uint32_t *) optval) = psmi_am_parameters.max_request_short;
+ break;
+ case PSM2_AM_OPT_NARGS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_nargs;
+ break;
+ case PSM2_AM_OPT_HANDLERS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_handlers;
+ break;
+ default:
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown PSM2_AM option %u.", optname);
+ }
+
+ return err;
+}
+
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1);
+}
+
+#ifdef PSM_LOG
+
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include "ptl_ips/ips_proto_header.h"
+
/* A treeNode is used to store the list of Function Name Lists that
   are passed to the PSM_LOG facility via environment variables.
   See psm_log.h for more information.

   Note that treeNode is a node in a binary tree data structure. */
typedef struct _treeNode
{
	const char *name;		/* function name (strdup'd copy) */
	int line1,line2;		/* inclusive line-number range */
	struct _treeNode *left,*right;	/* BST children: ordered by name,
					   then by line range */
} treeNode;
+
/* An epmTreeNode is used to track the number of protocol packets
   that are sent/received, for a given opcode, and source epid
   to another epid. */
typedef struct _epmTreeNode
{
	int opcode,count,txrx;		/* packet opcode, tally, direction */
	uint64_t fromepid,toepid;	/* endpoint ids of the two sides */
	struct _epmTreeNode *left,*right;	/* BST children */
} epmTreeNode;
+
+
/* Grow the inclusive line range [*line1 .. *line2] to absorb 'line' when
   'line' falls inside the range or immediately abuts it (one before
   *line1 or one after *line2).  Returns 1 when the line was joinable
   (the range may be unchanged if 'line' was already inside it), 0 when
   it neither overlaps nor abuts.

   Examples, starting from the range [20 .. 30]:

	19  ->  [19 .. 30]   returns 1
	31  ->  [20 .. 31]   returns 1
	25  ->  [20 .. 30]   returns 1
	18  ->  unchanged    returns 0
	32  ->  unchanged    returns 0 */
static int joinOverlap(int *line1,int *line2,int line)
{
	/* Widen before the +/-1 arithmetic so INT_MIN/INT_MAX inputs
	   cannot overflow. */
	const long long candidate = line;

	if (candidate + 1 < *line1 || candidate - 1 > *line2)
		return 0;	/* neither inside nor abutting */

	if (line < *line1)
		*line1 = line;
	if (line > *line2)
		*line2 = line;
	return 1;
}
+
/* Merge the range [l1 .. l2] into [*line1 .. *line2] when they overlap
   or abut.  Returns the number of joinable endpoints (1 or 2) when a
   merge happened, 0 when the two ranges neither overlap nor abut.

   Examples for *line1=20, *line2=30:

	[20 30]  ->  [20 30]   2
	[19 30]  ->  [19 30]   2
	[19 20]  ->  [19 30]   2
	[10 15]  ->  unchanged 0
	[40 50]  ->  unchanged 0 */
static int joinOverlapRange(int *line1,int *line2,int l1,int l2)
{
	int joined = joinOverlap(line1, line2, l1);

	joined += joinOverlap(line1, line2, l2);
	return joined;
}
+
+/* inserts a new treeNode into the FNL tree, or, merges the lines that are already
+ present in the tree. */
+static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2)
+{
+ if (*root)
+ {
+ int c = strcmp(name,(*root)->name);
+ if (c < 0)
+ insertNodeInTree(&((*root)->left),name,line1,line2);
+ else if (c > 0)
+ insertNodeInTree(&((*root)->right),name,line1,line2);
+ else
+ {
+ if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2))
+ return;
+ else if (line1 < (*root)->line1)
+ insertNodeInTree(&((*root)->left),name,line1,line2);
+ else if (line2 > (*root)->line2)
+ insertNodeInTree(&((*root)->right),name,line1,line2);
+ else psmi_assert_always(0); /* should never happen. */
+ }
+ }
+ else
+ {
+ *root = malloc(sizeof(treeNode));
+ (*root)->name = strdup(name);
+ (*root)->line1 = line1;
+ (*root)->line2 = line2;
+ (*root)->left = (*root)->right = NULL;
+ }
+}
+
+/* Returns -1 if the data in the node is less than the data supplied as parameter, else
+ Returns 1 if the data in the node is greater than the data supplied as parameter, else
+ Returns 0.
+ */
+static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+#define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1
+ COMPARE_ONE(opcode);
+ COMPARE_ONE(txrx);
+ COMPARE_ONE(fromepid);
+ COMPARE_ONE(toepid);
+ return 0;
+}
+
+/* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree.
+ In either case, this code returns a pointer to the count in the node. */
+static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+ if (*root)
+ {
+ int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid);
+ if (a < 0)
+ return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid);
+ else if (a > 0)
+ return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid);
+ else
+ return &((*root)->count);
+ }
+ else
+ {
+ *root = malloc(sizeof(epmTreeNode));
+ (*root)->opcode = opcode;
+ (*root)->txrx = txrx;
+ (*root)->count = 0;
+ (*root)->fromepid = fromepid;
+ (*root)->toepid = toepid;
+ (*root)->left = (*root)->right = NULL;
+ return &((*root)->count);
+ }
+}
+
+/* returns 0, if the node is present, non-zero if it is absent. */
+static int lookupNodeInTree(const treeNode *root,const char *name,int line)
+{
+ if (root)
+ {
+ int c = strcmp(name,root->name);
+ if (c < 0)
+ return lookupNodeInTree(root->left,name,line);
+ else if (c > 0)
+ return lookupNodeInTree(root->right,name,line);
+ else
+ {
+ if (line < root->line1)
+ return lookupNodeInTree(root->left,name,line);
+ else if (line > root->line2)
+ return lookupNodeInTree(root->right,name,line);
+ else /* line must be >= root->line1 and line must be <= root->line2. */
+ return 0;
+ }
+ }
+ else
+ {
+ return 1;
+ }
+}
+
/* Declare a prototype for a parserFunc - referenced in the following code: */
typedef void parserFunc(char *,int,int,void *);

/* Splits 'ps' in place at every occurrence of the delimiter 'c' and
   invokes pf(substring, index, delimiterCount, ctx) for each piece —
   including the final piece after the last delimiter.  Note that 'ps'
   is mutated: each delimiter is overwritten with '\0'. */
static void parseString(char *ps,char c,parserFunc pf,void *ctx)
{
	int delims = 0;
	int piece;
	char *cursor;
	char *sep;

	/* Pre-count the delimiters; the count is handed to every callback. */
	for (cursor = ps; *cursor; cursor++)
		if (*cursor == c)
			delims++;

	/* Emit each delimiter-terminated piece in order. */
	piece = 0;
	cursor = ps;
	while (cursor && *cursor && (sep = strchr(cursor, c)) != NULL)
	{
		*sep = 0;
		pf(cursor, piece, delims, ctx);
		cursor = sep + 1;
		piece++;
	}
	/* finally, call pf on the final substring. */
	pf(cursor, piece, delims, ctx);
}
+
/* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */
typedef struct
{
	/* function name of the FN currently being parsed (points into
	   the scratch copy of the environment string). */
	const char *currentFuncName;
	/* lhs of a 'lineNumber1 - lineNumber2' range, cached until the
	   rhs is parsed. */
	int firstLineNumber;
	/* root of the FNL tree being built (include or exclude tree). */
	treeNode **root;
} funcNameCtx;
+
+/* This is the start of the parser code for parsing FNL's. Here is the grammar:
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number of a range of line numbers:
+(1) LINE1 :: lineNumber |
+(2) lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+(3) LINES :: LINE1 |
+(4) LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+(5) FN :: functionName |
+(6) functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+(7) FNL :: FN |
+(8) FN ':' FNL
+
+ # Examples:
+ foo:bar the two functions foo and bar
+ foo;1-10 lines 1 to 10 of function foo.
+ bar;1,3,5 lines 1, 3 and 5 of function bar
+
+*/
+
+/* p4() inserts a (function name and line number) pair into the FNL tree or a (function name and line number range) in the FNL tree.
+*/
+static void p4(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0) /* production (1) */
+ {
+ pfnc->firstLineNumber = atoi(s);
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber);
+ }
+ else if (n == 1) /* production (2) */
+ {
+ if (idx == 0) /* lhs of production (2) */
+ pfnc->firstLineNumber = atoi(s);
+ else /* rhs of production (2). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s));
+ }
+}
+
+/* p3 puts an entry into the FNL tree for all of the lines of a given functionname, or, it parses the list of line number ranges and
+ uses p4 to spill each individual range (or just one line number) into the tree */
+static void p3(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0 && *s == 0) /* production (5)/(7) */
+ {
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX);
+ }
+ else if (*s) /* production (2) */
+ {
+ /* breakdown the string into hyphen-delimited substrings, and further parses each substring with p4: */
+ parseString(s,'-',p4,ctx);
+ }
+}
+
+/* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */
+static void p2(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n)
+ {
+ if (idx == 0)
+ pfnc->currentFuncName = s;
+ else
+ {
+ /* production (4) */
+ /* breakdown the string into comma-delimited substrings, and further parses each substring with p3: */
+ parseString(s,',',p3,ctx);
+ }
+ }
+ else
+ {
+ /* production (7)/(5). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX);
+ }
+}
+
+/* p1 parses each function name and line range list. */
+static void p1(char *s,int idx,int n,void *ctx)
+{
+ /* production (5)/(6)) */
+ /* breakdown the string into semi-colon-delimited substrings, and further parses each substring with p2: */
+ parseString(s,';',p2,ctx);
+}
+
+static void parseAndInsertInTree(const char *buf,treeNode **root)
+{
+ funcNameCtx t;
+ t.root = root;
+ char *p = alloca(strlen(buf)+1);
+ strcpy(p,buf);
+ /* productions (7)/(8) */
+ /* separates string into colon-separated strings, and then parses each substring in p1: */
+ parseString(p,':',p1,(void*)&t);
+}
+
/* initialization code for the psmi log mechanism. */
/* One-time, thread-safe initialization: reads the PSM2_LOG_* environment
   variables and fills in the file-name stem, the optional format-string
   filter, and the include/exclude FNL trees.
   NOTE(review): this is classic double-checked locking on a plain
   'volatile int'; the unlocked first read is technically racy, but the
   mutex-protected re-check keeps initialization itself single-shot. */
static inline void psmi_initialize(const char **plmf_fileName_kernel,
				   const char **plmf_search_format_string,
				   treeNode **includeFunctionNamesTreeRoot,
				   treeNode **excludeFunctionNamesTreeRoot)
{
	static volatile int plmf_initialized = 0;

	if (!plmf_initialized)
	{
		static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER;

		if (pthread_mutex_lock(&plmf_init_mutex))
		{
			perror("cannot lock mutex for psmi_log_message facility");
			return;
		}
		/* CRITICAL SECTION BEGIN */
		if (!plmf_initialized)
		{
			/* initializing psmi log message facility here. */
			const char *env = getenv("PSM2_LOG_FILENAME");
			if (env)
				*plmf_fileName_kernel = env;
			env = getenv("PSM2_LOG_SRCH_FORMAT_STRING");
			if (env)
			{
				/* A format-string filter takes precedence:
				   the FNL trees are not built at all. */
				*plmf_search_format_string = env;
			}
			else
			{
				env = getenv("PSM2_LOG_INC_FUNCTION_NAMES");
				if (env)
				{
					parseAndInsertInTree(env,includeFunctionNamesTreeRoot);
				}
				env = getenv("PSM2_LOG_EXC_FUNCTION_NAMES");
				if (env)
				{
					parseAndInsertInTree(env,excludeFunctionNamesTreeRoot);
				}
			}
			/* initialization of psmi log message facility is completed. */
			plmf_initialized = 1;
		}
		/* CRITICAL SECTION END */
		if (pthread_mutex_unlock(&plmf_init_mutex))
		{
			perror("cannot unlock mutex for psmi_log_message facility");
			return;
		}
	}
}
+
+/* Utility function to map the integer txrx value to the given strings for emitting to the log file. */
+static const char * const TxRxString(int txrx)
+{
+ switch(txrx)
+ {
+ case PSM_LOG_EPM_TX: return "Sent";
+ case PSM_LOG_EPM_RX: return "Received";
+ default: return "Unknown";
+ }
+}
+
+/* Utility function to map an integer opcode value to the given strings for emitting to the log file. */
+static const char * const OpcodeString(int opcode)
+{
+ switch(opcode)
+ {
+ case OPCODE_LONG_RTS: return "RTS";
+ case OPCODE_LONG_CTS: return "CTS";
+ case OPCODE_LONG_DATA: return "DATA";
+ case OPCODE_EXPTID: return "EXPTID";
+ case OPCODE_EXPTID_COMPLETION: return "EXPTID_COMPLETION";
+ default: return "UNKNOWN";
+ }
+}
+
/* Log-facility state, overridden from the environment in psmi_initialize():
     PSM2_LOG_FILENAME               -> plmf_fileName_kernel (log path stem)
     PSM2_LOG_SRCH_FORMAT_STRING     -> plmf_search_format_string
     PSM2_LOG_INC/EXC_FUNCTION_NAMES -> include/exclude FNL trees */
static const char *plmf_fileName_kernel = "/tmp/psm2_log";
static const char *plmf_search_format_string = NULL;
static treeNode *includeFunctionNamesTreeRoot = NULL;
static treeNode *excludeFunctionNamesTreeRoot = NULL;
+
+void psmi_log_initialize(void)
+{
+ /* If not initialized, then, initialize in a single thread of execution. */
+ psmi_initialize(&plmf_fileName_kernel,
+ &plmf_search_format_string,
+ &includeFunctionNamesTreeRoot,
+ &excludeFunctionNamesTreeRoot);
+}
+
#ifdef PSM_LOG_FAST_IO

/* Per-thread buffered-logging state: with fast IO, log records are
   appended to an in-memory buffer and only written to disk when
   psmi_log_fini() flushes them. */
struct psmi_log_io_thread_info
{
	pthread_t thread_id;	/* owning thread */
	char *buff;		/* append-only in-memory log buffer */
	unsigned long max_buff_length, curr_buff_length;	/* capacity / used */
	pthread_mutex_t flags_mutex;	/* guards 'flags' */
	volatile int flags;
#define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1 /* io is currently in progress */
#define PSMI_LOG_IO_FLAG_IO_SHUTDOWN 2 /* we are shutting down logging. */
};
+
/* Please note that psmi_log_io_info is in thread local storage. */
/* Each thread owns one of these; it is registered in psmi_log_io_table
   on that thread's first log call (see psmi_log_register_tls()). */
static __thread struct psmi_log_io_thread_info psmi_log_io_info =
{
	.thread_id = 0,
	.buff = NULL,
	.max_buff_length = 0,
	.curr_buff_length = 0,
	.flags_mutex = PTHREAD_MUTEX_INITIALIZER,
	.flags = 0
};
+
/* Global registry of every thread's psmi_log_io_thread_info so that
   psmi_log_fini() can flush all per-thread buffers.  The table grows by
   doubling and is protected by table_mutex. */
static struct
{
	unsigned int nTableEntries,maxTableEntries;	/* used / capacity */
	pthread_mutex_t table_mutex;
	struct psmi_log_io_thread_info **table;
} psmi_log_io_table =
{
	.nTableEntries = 0,
	.maxTableEntries = 0,
	.table_mutex = PTHREAD_MUTEX_INITIALIZER,
	.table = NULL
};
+
/* Flush and tear down fast-IO logging: for every registered thread,
   signal shutdown, wait for any in-flight logging on that thread to
   drain, write its buffer to a per-thread "<stem>.<pid>.<tid>" file,
   and finally release the registry table. */
void psmi_log_fini()
{
	if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
	{
		perror("Cannot lock mutex for psmi_log_io_table");
		return;
	}
	/* Start critical section. */

	unsigned int i;
	for (i=0;i < psmi_log_io_table.nTableEntries;i++)
	{
		if (psmi_log_io_table.table[i])
		{
			struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i];
			int flags;

			if (pthread_mutex_lock(&pti->flags_mutex))
			{
				perror("can't lock the flags mutex.");
				continue;
			}
			/* critical section */
			flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN);
			/* end critical section */
			pthread_mutex_unlock(&pti->flags_mutex);
			/* if io is currently in progress, allow it to complete. */
			while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
			{
				sleep(1);
				if (pthread_mutex_lock(&pti->flags_mutex))
				{
					perror("can't lock the flags mutex.");
					continue;
				}
				flags = pti->flags;
				pthread_mutex_unlock(&pti->flags_mutex);
			}
			if (pti->buff)
			{
				char logFileName[256];
				FILE *fout;

				/* One output file per thread: stem.pid.tid */
				snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
					 plmf_fileName_kernel,getpid(),pti->thread_id);
				fout = fopen(logFileName,"w");
				if (!fout)
				{
					perror(logFileName);
					continue;
				}
				fwrite(pti->buff,pti->curr_buff_length,1,fout);
				fclose(fout);
			}
		}
		psmi_log_io_table.table[i] = NULL;
	}
	psmi_log_io_table.nTableEntries = 0;
	psmi_free(psmi_log_io_table.table);
	psmi_log_io_table.table = NULL;
	psmi_log_io_table.maxTableEntries = 0;
	/* End critical section. */
	pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
}
+
/* Register the calling thread's TLS logging state in the global table
   (on the thread's first log call) and try to mark IO as in-progress.
   Returns 0 when the caller may proceed to log into its buffer, -1 on
   error or when logging is shutting down. */
static int psmi_log_register_tls(void)
{
	if (psmi_log_io_info.thread_id != pthread_self())
	{
		/* First log call from this thread: add its TLS block to
		   the global registry, growing the table by doubling. */
		psmi_log_io_info.thread_id = pthread_self();
		if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
		{
			perror("cannot lock table mutex");
			return -1;
		}
		/* critical section start. */
		if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1)
		{
			if (psmi_log_io_table.maxTableEntries == 0)
			{
				psmi_log_io_table.maxTableEntries = 2;
				psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE,
								      PER_PEER_ENDPOINT,
								      psmi_log_io_table.maxTableEntries *
								      sizeof(struct psmi_log_io_thread_info *));
			}
			else
			{
				psmi_log_io_table.maxTableEntries *= 2;
				psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE,
								       PER_PEER_ENDPOINT,
								       psmi_log_io_table.table,
								       psmi_log_io_table.maxTableEntries *
								       sizeof(struct psmi_log_io_thread_info *));
			}
		}
		psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info;
		psmi_log_io_table.nTableEntries++;
		/* critical section end. */
		pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
	}
	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
	{
		perror("cannot lock table mutex");
		return -1;
	}
	/* critical section start. */
	int old_flags = psmi_log_io_info.flags;
	int new_flags = old_flags;
	/* Only claim IO_IN_PROGRESS when not shutting down; psmi_log_fini()
	   spins until this flag drops. */
	if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN))
		new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
	psmi_log_io_info.flags = new_flags;
	/* critical section end. */
	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
	if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
		return 0;
	return -1;
}
+
/* "fclose" replacement for fast-IO logging: clears this thread's
   IO_IN_PROGRESS flag so a concurrent psmi_log_fini() can proceed.
   'port' exists only to mirror the stdio-style interface; it is unused. */
static void psmi_buff_fclose(int port)
{
	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
	{
		perror("cannot lock table mutex");
		return;
	}
	/* critical section start. */
	psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
	/* critical section end. */
	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
}
+
+static void growBuff(size_t minExcess)
+{
+ while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length)
+ {
+ if (!psmi_log_io_info.buff)
+ psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.max_buff_length = 1 << 20);
+ else
+ {
+ psmi_log_io_info.max_buff_length *= 2;
+ psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.buff,
+ psmi_log_io_info.max_buff_length);
+ }
+ }
+}
+
+static int psmi_buff_vfprintf(int port, const char *format, va_list ap)
+{
+ int done = 0;
+ size_t excess = 1024;
+ int length;
+
+ while (!done)
+ {
+ growBuff(excess);
+
+ length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length,
+ excess, format, ap);
+ if (length >= excess)
+ excess *= 2;
+ else
+ done = 1;
+ }
+ psmi_log_io_info.curr_buff_length += length;
+ return length;
+}
+
/* printf-style front end to psmi_buff_vfprintf for the in-memory log
   buffer; returns the number of characters appended. */
static int psmi_buff_fprintf(int port,const char *format, ...)
{
	va_list args;
	int written;

	va_start(args, format);
	written = psmi_buff_vfprintf(port, format, args);
	va_end(args);

	return written;
}
+
+static int psmi_buff_fputc(int c, int port)
+{
+ growBuff(1024);
+ psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c;
+ psmi_log_io_info.curr_buff_length++;
+ return 1;
+}
+#endif
+
+
/* plmf is short for 'psm log message facility. All of the PSM_LOG macros defined in psm_log.h
   are serviced from this back end.

   Processing has two stages.  First, the message is filtered: either
   against the format-string pattern (PSM2_LOG_SRCH_FORMAT_STRING, via
   fnmatch) or against the include/exclude function-name trees.  Second,
   surviving messages are formatted and emitted — into the per-thread
   in-memory buffer under PSM_LOG_FAST_IO, or appended directly to a
   per-process/per-thread file otherwise.  Two sentinel values of
   'format' change the vararg calling convention: PSM_LOG_BT_MAGIC
   (backtrace dump) and PSM_LOG_EPM_MAGIC (protocol-packet accounting). */
void psmi_log_message(const char *fileName,
		      const char *functionName,
		      int lineNumber,
		      const char *format, ...)
{
	va_list ap;

	va_start(ap, format);

	/* Next, determine if this log message is signal or noise. */
	if (plmf_search_format_string)
	{
		if((format != PSM_LOG_BT_MAGIC) && (format != PSM_LOG_EPM_MAGIC))
		{
			if (fnmatch(plmf_search_format_string, format, 0))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}
	}
	else
	{
		if (includeFunctionNamesTreeRoot)
		{
			if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}

		if (excludeFunctionNamesTreeRoot)
		{
			if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber))
			{
				va_end(ap);
				/* tis noise, return. */
				return;
			}
		}
	}

	/* At this point, we think that this may be a message that we want to emit to the log.
	   But, there is one more test, to apply to the two cases where (format == PSM_LOG_BT_MAGIC
	   and format == PSM_LOG_EPM_MAGIC. */
	{
		void **voidarray = NULL; /*va_arg(ap,void **);*/
		int nframes = 0; /*va_arg(ap,int);*/
		const char *newFormat = format; /*va_arg(ap,const char *);*/
		int opcode = 0;
		int txrx = 0;
		uint64_t fromepid = 0;
		uint64_t toepid = 0;

/* Select the output back end: in-memory buffer (fast IO) or stdio. */
#ifdef PSM_LOG_FAST_IO
#define IO_PORT 0
#define MY_FPRINTF psmi_buff_fprintf
#define MY_VFPRINTF psmi_buff_vfprintf
#define MY_FPUTC psmi_buff_fputc
#define MY_FCLOSE psmi_buff_fclose
#else
		char logFileName[256];
		FILE *fout;
#define IO_PORT fout
#define MY_FPRINTF fprintf
#define MY_VFPRINTF vfprintf
#define MY_FPUTC fputc
#define MY_FCLOSE fclose
#endif
		struct timespec tp;

		if (format == PSM_LOG_BT_MAGIC)
		{
			/* Backtrace message: the real varargs are
			   (void **frames, int nframes, const char *fmt, ...). */
			voidarray = va_arg(ap,void **);
			nframes = va_arg(ap,int);
			newFormat = va_arg(ap,const char *);
			/* One last test to make sure that this message is signal: */
			if (plmf_search_format_string)
			{
				{
					if (fnmatch(plmf_search_format_string, newFormat, 0))
					{
						va_end(ap);
						/* tis noise, return. */
						return;
					}
				}
			}
		}
		else if (format == PSM_LOG_EPM_MAGIC)
		{
			/* Protocol-packet message: the real varargs are
			   (int opcode, int txrx, uint64_t from, uint64_t to,
			    const char *fmt, ...). */
			opcode = va_arg(ap,int);
			txrx = va_arg(ap,int);
			fromepid = va_arg(ap,uint64_t);
			toepid = va_arg(ap,uint64_t);
			newFormat = va_arg(ap,const char *);
			/* One last test to make sure that this message is signal: */
			if (plmf_search_format_string)
			{
				{
					if (fnmatch(plmf_search_format_string, newFormat, 0))
					{
						va_end(ap);
						/* tis noise, return. */
						return;
					}
				}
			}
		}

#ifdef PSM_LOG_FAST_IO
		if (psmi_log_register_tls() != 0)
		{
			va_end(ap);
			return;
		}
#else
		/* At this point we know that the message is not noise, and it is going to be emitted to the log. */
		snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
			 plmf_fileName_kernel,getpid(),
			 pthread_self());
		fout = fopen(logFileName,"a");
		if (!fout)
		{
			va_end(ap);
			return;
		}
#endif

/* M1 emits the "<timestamp> <function> <file>:<line>: " record prefix. */
#define M1() clock_gettime(CLOCK_REALTIME, &tp);			\
	MY_FPRINTF(IO_PORT,"%f %s %s:%d: ",				\
		   (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0), \
		   functionName,fileName,lineNumber)

		M1();

		if ((format != PSM_LOG_BT_MAGIC) && (format != PSM_LOG_EPM_MAGIC))
		{
			/* Plain message: format the varargs directly. */
			MY_VFPRINTF(IO_PORT,format,ap);
			MY_FPUTC('\n',IO_PORT);
		}
		else if (format == PSM_LOG_BT_MAGIC)
		{
			void *newframes[PSM_LOG_BT_BUFFER_SIZE];
			int newframecnt = backtrace(newframes,
						    PSM_LOG_BT_BUFFER_SIZE);
			int pframes = min(newframecnt,nframes);

			MY_VFPRINTF(IO_PORT,newFormat,ap);
			MY_FPUTC('\n',IO_PORT);

			/* Only dump the backtrace when it differs from the
			   one cached in the caller-supplied array. */
			if (memcmp(voidarray,newframes,pframes * sizeof(void*)))
			{
				int i;
				char **strings;

				memcpy(voidarray,newframes,sizeof(newframes));
				M1();
				MY_FPRINTF(IO_PORT,
					   "backtrace() returned %d addresses\n",
					   newframecnt);

				strings = backtrace_symbols(voidarray, pframes);
				if (strings == NULL)
				{
					perror("backtrace_symbols");
					exit(EXIT_FAILURE);
				}

				for (i = 0; i < pframes; i++)
				{
					M1();
					MY_FPRINTF(IO_PORT,"%s\n", strings[i]);
				}

				/* backtrace_symbols() memory must be released
				   with the system free(), not a psmi wrapper. */
#undef free
				free(strings);
			}

		}
		else /* (format == PSM_LOG_EPM_MAGIC) */
		{
			/* Packet accounting: bump the counter for this
			   (opcode, direction, from, to) tuple and emit it. */
			static epmTreeNode *root = 0;
			static pthread_mutex_t plmf_epm_mutex =
				PTHREAD_MUTEX_INITIALIZER;
			int *pcount = 0;
			if (pthread_mutex_lock(&plmf_epm_mutex))
			{
				perror("cannot lock mutex for "
				       "psmi_log_message facility");
				va_end(ap);
				return;
			}
			/* START OF CRITICAL SECTION */
			pcount = insertNodeInEpmTree(&root,opcode,txrx,
						     fromepid,toepid);
			/* END OF CRITICAL SECTION */
			if (pthread_mutex_unlock(&plmf_epm_mutex))
			{
				perror("cannot unlock mutex for "
				       "psmi_log_message facility");
				va_end(ap);
				return;
			}
			(*pcount)++;
			MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64
				   ", to: %" PRIx64 ", count: %d, ",
				   TxRxString(txrx),OpcodeString(opcode),
				   fromepid,toepid,*pcount);
			MY_VFPRINTF(IO_PORT,newFormat,ap);
			MY_FPUTC('\n',IO_PORT);
		}
		MY_FCLOSE(IO_PORT);
	}

	va_end(ap);
}
+#endif /* #ifdef PSM_LOG */
diff --git a/psm_utils.h b/psm_utils.h
new file mode 100644
index 0000000..07d198b
--- /dev/null
+++ b/psm_utils.h
@@ -0,0 +1,379 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_IN_USER_H
+#error psm_utils.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_UTILS_H
+#define _PSMI_UTILS_H
+
+#include <arpa/inet.h> /* ipv4addr */
+#include <stdlib.h> /* malloc/free */
+
+/*
+ * Endpoint 'id' hash table, with iterator interface
+ */
+struct psmi_epid_table {
+ struct psmi_epid_tabentry *table;
+ int tabsize;
+ int tabsize_used;
+ pthread_mutex_t tablock;
+};
+/*
+ * Endpoint address hash table
+ */
+struct psmi_epid_tabentry {
+ void *entry;
+ uint64_t key;
+ psm2_ep_t ep;
+ psm2_epid_t epid;
+};
+
+extern struct psmi_epid_table psmi_epid_table;
+#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */
+#define PSMI_EPID_TABSIZE_CHUNK 128
+#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7)
+
+psm2_error_t psmi_epid_init();
+psm2_error_t psmi_epid_fini();
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
+#define PSMI_EP_HOSTNAME ((psm2_ep_t) -1) /* Special endpoint handle we use
+ * to register hostnames */
+#define PSMI_EP_CROSSTALK ((psm2_ep_t) -2) /* Second special endpoint handle
+ * to log which nodes we've seen
+ * crosstalk from */
+struct psmi_eptab_iterator {
+ int i; /* last index looked up */
+ psm2_ep_t ep;
+};
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep);
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor);
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor);
+
+uint64_t psmi_epid_version(psm2_epid_t epid);
+
+/*
+ * Hostname manipulation
+ */
+#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */
+#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */
+char *psmi_gethostname(void);
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid);
+const char *psmi_epaddr_get_name(psm2_epid_t epid);
+psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname,
+ int overwrite);
+
+/*
+ * Memory allocation, use macros only.
+ *
+ * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE
+ * if no endpoint is available.
+ *
+ * psmi_malloc_usable_size(void *ptr)
+ * psmi_malloc(ep, memtype, size)
+ * psmi_realloc(ep, memtype, ptr, newsize)
+ * psmi_memalign(ep, memtype, alignment, size)
+ * psmi_calloc(ep, memtype, elemsz, numelems)
+ * psmi_strdup(ep, memtype, ptr)
+ * psmi_free(ptr)
+ *
+ */
+typedef enum psmi_memtype {
+ TOTAL = 0, /* Logged automatically by malloc/calloc */
+ UNDEFINED, /* For tracking "other types" of allocations */
+ PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */
+ NETWORK_BUFFERS, /* For tracking network buffers */
+ DESCRIPTORS, /* For tracking send/recv descriptors */
+ UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */
+ STATS, /* For tracking stats-related allocs */
+} psmi_memtype_t;
+
+/*
+ * We track allocation stats.
+ */
+struct psmi_stats_malloc {
+ int64_t m_all_total;
+ int64_t m_all_max;
+ int64_t m_perpeer_total;
+ int64_t m_perpeer_max;
+ int64_t m_netbufs_total;
+ int64_t m_netbufs_max;
+ int64_t m_descriptors_total;
+ int64_t m_descriptors_max;
+ int64_t m_unexpbufs_total;
+ int64_t m_unexpbufs_max;
+ int64_t m_undefined_total;
+ int64_t m_undefined_max;
+ int64_t m_stats_total;
+ int64_t m_stats_max;
+};
+
+extern struct psmi_stats_malloc psmi_stats_memory;
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz,
+ const char *curloc);
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr,
+ size_t newSz, const char *curloc);
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment,
+ size_t sz, const char *curloc);
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num,
+ size_t sz, const char *curloc);
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc);
+
+void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc);
+MOCK_DCL_EPILOGUE(psmi_free_internal);
+
+size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc);
+
+#ifdef PSM_HEAP_DEBUG
+/* When heap debugging is enabled, calls to HD_validate_heap_allocations()
+   can be sprinkled through the code; each call examines all heap allocations
+   to verify their integrity. */
+void _HD_validate_heap_allocations(const char *curloc);
+
+#define HD_validate_heap_allocations() _HD_validate_heap_allocations(PSMI_CURLOC)
+
+#else
+
+#define HD_validate_heap_allocations() /* nothing */
+
+#endif
+
+#define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC)
+#define psmi_calloc(ep, mt, nelem, elemsz) \
+ psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC)
+#define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC)
+#define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC)
+#define psmi_memalign(ep, mt, al, sz) \
+ psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC)
+#define psmi_free(ptr) psmi_free_internal(ptr, PSMI_CURLOC)
+#define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC)
+#ifndef PSM_IS_TEST
+#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc
+#define realloc(ptr,nsz) _use_psmi_realloc_instead_of_plain_realloc
+#define memalign(sz) _use_psmi_memalign_instead_of_plain_memalign
+#define calloc(sz, nelm) _use_psmi_calloc_instead_of_plain_calloc
+#ifdef strdup
+#undef strdup
+#endif
+#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup
+#define free(ptr) _use_psmi_free_instead_of_plain_free
+#define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size
+#endif /* PSM_IS_TEST */
+
+void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes);
+
+/*
+ * Parsing int parameters set in string tuples.
+ */
+int psmi_parse_str_tuples(const char *str, int ntup, int *vals);
+
+/*
+ * Resource Limiting based on PSM memory mode.
+ */
+#define PSMI_MEMMODE_NORMAL 0
+#define PSMI_MEMMODE_MINIMAL 1
+#define PSMI_MEMMODE_LARGE 2
+#define PSMI_MEMMODE_NUM 3
+
+struct psmi_rlimit_mpool {
+ const char *env;
+ const char *descr;
+ int env_level;
+ uint32_t minval;
+ uint32_t maxval;
+ struct {
+ uint32_t obj_chunk;
+ uint32_t obj_max;
+ } mode[PSMI_MEMMODE_NUM];
+};
+psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+ const struct psmi_rlimit_mpool *rlim,
+ uint32_t *valo, uint32_t *chunkszo);
+int psmi_parse_memmode(void);
+
+/*
+ * Parsing environment variables
+ */
+
+union psmi_envvar_val {
+ void *e_void;
+ char *e_str;
+ int e_int;
+ unsigned int e_uint;
+ long e_long;
+ unsigned long e_ulong;
+ unsigned long long e_ulonglong;
+};
+
+#define PSMI_ENVVAR_LEVEL_USER 1
+#define PSMI_ENVVAR_LEVEL_HIDDEN 2
+
+#define PSMI_ENVVAR_TYPE_YESNO 0
+#define PSMI_ENVVAR_TYPE_STR 1
+#define PSMI_ENVVAR_TYPE_INT 2
+#define PSMI_ENVVAR_TYPE_UINT 3
+#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4
+#define PSMI_ENVVAR_TYPE_LONG 5
+#define PSMI_ENVVAR_TYPE_ULONG 6
+#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7
+#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8
+
+#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1)
+#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0)
+
+int
+MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
+ int type, union psmi_envvar_val defval,
+ union psmi_envvar_val *newval);
+MOCK_DCL_EPILOGUE(psmi_getenv);
+/*
+ * Misc functionality
+ */
+uintptr_t psmi_getpagesize(void);
+uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns);
+uint32_t psmi_get_ipv4addr();
+void psmi_syslog(psm2_ep_t ep, int to_console, int level,
+ const char *format, ...);
+void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out);
+int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB);
+void *psmi_memcpyo(void *dst, const void *src, size_t n);
+uint32_t psmi_crc(unsigned char *buf, int len);
+uint32_t psmi_get_hfi_type(const psmi_context_t *context);
+
+/*
+ * Internal CPUID detection
+ */
+#define CPUID_FAMILY_MASK 0x00000f00
+#define CPUID_MODEL_MASK 0x000000f0
+#define CPUID_EXMODEL_MASK 0x000f0000
+
+/*
+ * CPUID return values
+ */
+#define CPUID_FAMILY_XEON 0x00000600
+#define CPUID_MODEL_PHI_GEN2 87
+#define CPUID_MODEL_PHI_GEN2M 133
+/*
+ * cpuid function 0, returns "GenuineIntel" in EBX,ECX,EDX
+ * due to Little Endian and Hex it is not so obvious
+ */
+#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */
+#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "letn" - Little Endian "ntel" */
+#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "Ieni" - Little Endian "ineI" */
+
+/*
+ * These values are internal only, not real register values
+ */
+#define CPUID_GENUINE_INTEL 0xf0000000
+#define CPUID_MODEL_UNDEFINED -1
+
+/*
+ * Global model so we can tune defaults better for specific cpu's
+ */
+uint32_t psmi_cpu_model;
+
+/*
+ * Diagnostics, all in psm_diags.c
+ */
+int psmi_diags(void);
+
+/*
+ * Multiple Endpoints
+ */
+extern int psmi_multi_ep_enabled;
+void psmi_multi_ep_init();
+
+/*
+ * Fault injection
+ */
+struct psmi_faultinj_spec;
+int psmi_faultinj_enabled; /* use macro to test */
+#if 1 /* possible to disable at compile time */
+#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled)
+#else
+#define PSMI_FAULTINJ_ENABLED() 0
+#endif
+
+void psmi_faultinj_init();
+void psmi_faultinj_fini();
+struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name,
+ int num, int denom);
+#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \
+ static struct psmi_faultinj_spec *var; \
+ if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \
+ (var) = psmi_faultinj_getspec((spec_name), (num), (denom));
+int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec);
+
+/*
+ * PSM core component set/get options
+ */
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+/*
+ * PSM AM component set/get options
+ */
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+#endif /* _PSMI_UTILS_H */
diff --git a/psmi_wrappers.c b/psmi_wrappers.c
new file mode 100644
index 0000000..ea857fc
--- /dev/null
+++ b/psmi_wrappers.c
@@ -0,0 +1,94 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include "opa_common.h"
+#include <unistd.h>
+#include "psmi_wrappers.h"
+
+/* The following indirection wrappers for external functions
+ * are only created if this is a mocking tests build
+ */
+#ifdef PSM2_MOCK_TESTING
+
+void MOCKABLE(psmi_exit)(int status)
+{
+ exit(status);
+}
+MOCK_DEF_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count)
+{
+ return write(fd, buf, count);
+}
+MOCK_DEF_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg)
+{
+ return ioctl(fd, cmd, arg);
+}
+MOCK_DEF_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact)
+{
+ return sigaction(signum, act, oldact);
+}
+MOCK_DEF_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void)
+{
+ return ips_rmb();
+}
+MOCK_DEF_EPILOGUE(psmi_rmb);
+
+#endif /* def PSM2_MOCK_TESTING */
diff --git a/psmi_wrappers.h b/psmi_wrappers.h
new file mode 100644
index 0000000..68f11c8
--- /dev/null
+++ b/psmi_wrappers.h
@@ -0,0 +1,98 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_WRAPPERS_H
+#define _PSMI_WRAPPERS_H
+
+#include <signal.h>
+#include "psm2_mock_testing.h"
+#include "opa_intf.h"
+
+#if defined( IB_IOCTL_MAGIC )
+#include <sys/ioctl.h>
+#endif
+
+/* If this is a mocking tests build, we introduce "incision points"
+ * through which we can easily mock external dependencies.
+ * For non-mocking-tests build, we bypass those indirections
+ * for performance reasons.
+ */
+
+#ifdef PSM2_MOCK_TESTING
+void MOCKABLE(psmi_exit)(int status);
+MOCK_DCL_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count);
+MOCK_DCL_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg);
+MOCK_DCL_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact);
+MOCK_DCL_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void);
+MOCK_DCL_EPILOGUE(psmi_rmb);
+
+#else /* def PSM2_MOCK_TESTING */
+
+#define psmi_exit exit
+#define psmi_write write
+#define psmi_ioctl ioctl
+#define psmi_sigaction sigaction
+#define psmi_rmb ips_rmb
+
+#endif /* def PSM2_MOCK_TESTING */
+
+#endif // _PSMI_WRAPPERS_H
+
diff --git a/ptl.h b/ptl.h
new file mode 100644
index 0000000..88d6fc1
--- /dev/null
+++ b/ptl.h
@@ -0,0 +1,211 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* Interface implemented by Packet Transport layers such as
+ * ips and active messages.
+ *
+ * This interface can be volatile, it is never seen by PSM clients, and it will
+ * probably change as the AM ptl is developed.
+ */
+
+#ifndef PSM_PTL_H
+#define PSM_PTL_H
+#include <inttypes.h>
+#include <psm2.h>
+#include <psm2_mq.h>
+#include <psm2_am.h>
+
+/* We currently have 3 PTLs, 0 is reserved. */
+#define PTL_DEVID_IPS 1
+#define PTL_DEVID_AMSH 2
+#define PTL_DEVID_SELF 3
+
+/* We can currently initialize up to 3 PTLs */
+#define PTL_MAX_INIT 3
+
+struct ptl;
+typedef struct ptl ptl_t;
+
+struct ptl_ctl;
+typedef struct ptl_ctl ptl_ctl_t;
+
+struct ptl_mq_req;
+typedef struct ptl_mq_req ptl_mq_req_t;
+
+struct ips_proto;
+typedef struct ips_proto ips_proto_t;
+
+/* To be filled in statically by all PTLs */
+struct ptl_ctl_init {
+ size_t(*sizeof_ptl) (void);
+
+ psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl);
+
+ psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns);
+
+ psm2_error_t
+ (*setopt) (const void *component_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+ psm2_error_t
+ (*getopt) (const void *component_obj, int optname,
+ void *optval, uint64_t *optlen);
+};
+
+typedef
+struct ptl_arg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ void *uptr;
+ };
+} ptl_arg_t;
+
+#include "ptl_self/ptl_fwd.h"
+#include "ptl_ips/ptl_fwd.h"
+#include "ptl_am/ptl_fwd.h"
+
+/* To be filled in as part of ptl_init */
+struct ptl_ctl {
+ ptl_t *ptl; /* pointer to ptl */
+ psm2_ep_t ep; /* pointer to ep */
+
+ /* EP-specific stuff */
+ psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly);
+
+ /* PTL-level connect
+ *
+ * This PTL-level is slightly different from the top-level PSM connect.
+ *
+ * pre 1: Caller has masked off epids in epid array that are already
+ * connected at the PSM level.
+ *
+ * post 0: PTL has allocated all epaddrs and whatever internal ptladdr
+ * that ptl needs.
+ * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i]
+ * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't
+ * be connected before a timeout occurred.
+ * post 3: PTL returns OK if all epids are either OK or UNREACHABLE
+ * post 4: PTL defines content of epaddr[i] only if epaddr[i] is OK.
+ */
+ psm2_error_t(*ep_connect) (ptl_t *ptl,
+ int num_ep,
+ const psm2_epid_t input_array_of_epid[],
+ const int array_of_epid_mask[],
+ psm2_error_t output_array_of_errors[],
+ psm2_epaddr_t output_array_of_epddr[],
+ uint64_t timeout_ns);
+
+ psm2_error_t (*ep_disconnect)(ptl_t *ptl,
+ int force,
+ int num_ep,
+ psm2_epaddr_t input_array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t output_array_of_errors[],
+ uint64_t timeout_ns);
+
+ /* MQ stuff */
+ psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags, psm2_mq_tag_t *stag,
+ const void *buf, uint32_t len);
+ psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags, psm2_mq_tag_t *stag,
+ const void *buf, uint32_t len,
+ void *ctxt, psm2_mq_req_t *req);
+
+ int (*epaddr_stats_num) (void);
+ int (*epaddr_stats_init) (char *desc[], uint16_t *flags);
+ int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats);
+
+ /* AM stuff */
+ psm2_error_t(*am_get_parameters) (psm2_ep_t ep,
+ struct psm2_am_parameters *
+ parameters);
+ psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ psm2_error_t(*am_short_reply) (psm2_am_token_t token,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ /* Long messages currently unsupported */
+#if 0
+ psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, void *dest,
+ int flags);
+ psm2_error_t(*am_long_reply) (psm2_am_token_t token,
+ psm2_handler_t handler, psm2_amarg_t *args,
+ int nargs, void *src, size_t len,
+ void *dest, int flags);
+#endif
+};
+#endif
diff --git a/ptl_am/Makefile b/ptl_am/Makefile
new file mode 100644
index 0000000..5aa5a46
--- /dev/null
+++ b/ptl_am/Makefile
@@ -0,0 +1,91 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o cmarwu.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+ $(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+ $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+ @if [ -d $(OUTDIR) ]; then \
+ cd $(OUTDIR); \
+ rm -f *.o *.d *.gcda *.gcno; \
+ cd -; \
+ fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+ @echo "Nothing to do for install."
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
new file mode 100644
index 0000000..959dbc3
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -0,0 +1,316 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#include "psm_user.h"
+#include "am_cuda_memhandle_cache.h"
+#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
+#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length))
+#define RBTREE_ASSERT psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems)
+
+#include "rbtree.c"
+
+#ifdef PSM_DEBUG
+static int cache_hit_counter;
+static int cache_miss_counter;
+#endif
+
+/*
+ * Creating mempool for cuda memhandle cache nodes.
+ *
+ * memcache_size: number of cl_map_item_t elements the pool holds;
+ * this count includes the two sentinel items (Root and NIL) later
+ * consumed by am_cuda_memhandle_cache_map_init().  The value is also
+ * recorded in the global cuda_memhandle_cache_size, which the
+ * eviction check in am_cuda_memhandle_cache_register() compares
+ * against NELEMS.
+ *
+ * Returns PSM2_OK, or the result of psmi_handle_error() on allocation
+ * failure.
+ */
+psm2_error_t
+am_cuda_memhandle_mpool_init(uint32_t memcache_size)
+{
+	psm2_error_t err;
+	cuda_memhandle_cache_size = memcache_size;
+	/* Creating a memory pool of size PSM2_CUDA_MEMCACHE_SIZE
+	 * which includes the Root and NIL items
+	 */
+	cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t),
+					cuda_memhandle_cache_size,
+					cuda_memhandle_cache_size, 0,
+					UNDEFINED, NULL, NULL,
+					psmi_cuda_memhandle_cache_alloc_func,
+					NULL);
+	if (cuda_memhandle_mpool == NULL) {
+		err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+				"Couldn't allocate CUDA host receive buffer pool");
+		return err;
+	}
+	return PSM2_OK;
+}
+
+/*
+ * Initialize the rbtree that backs the cuda memhandle cache.
+ *
+ * Pulls two sentinel items (Root and NIL) from cuda_memhandle_mpool,
+ * zeroes the NIL payload fields the lookup path tests against (a
+ * search miss returns an item whose payload.start is 0), and marks
+ * the cache enabled.
+ *
+ * Returns PSM2_NO_MEMORY if either sentinel cannot be obtained.
+ */
+psm2_error_t am_cuda_memhandle_cache_map_init()
+{
+	cl_map_item_t *root, *nil_item;
+	root = (cl_map_item_t *)psmi_mpool_get(cuda_memhandle_mpool);
+	if (root == NULL)
+		return PSM2_NO_MEMORY;
+	nil_item = (cl_map_item_t *)psmi_mpool_get(cuda_memhandle_mpool);
+	if (nil_item == NULL) {
+		/* Return the root sentinel to the pool so it is not leaked. */
+		psmi_mpool_put(root);
+		return PSM2_NO_MEMORY;
+	}
+	/* NIL payload must read as "no entry" to the search path. */
+	nil_item->payload.start = 0;
+	nil_item->payload.epid = 0;
+	nil_item->payload.length = 0;
+	cuda_memhandle_cache_enabled = 1;
+	ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item);
+	NELEMS = 0;
+	return PSM2_OK;
+}
+
+/*
+ * Tear down the memhandle cache.  Destroying the mpool invokes
+ * psmi_cuda_memhandle_cache_alloc_func() on every item, which closes
+ * any CUDA IPC handle still open for cached entries.
+ */
+void am_cuda_memhandle_cache_map_fini()
+{
+#ifdef PSM_DEBUG
+	_HFI_DBG("cache hit counter: %d\n", cache_hit_counter);
+	_HFI_DBG("cache miss counter: %d\n", cache_miss_counter);
+#endif
+	if(cuda_memhandle_cache_enabled)
+		psmi_mpool_destroy(cuda_memhandle_mpool);
+	return;
+}
+
+/*
+ * Insert at the head of Idleq.
+ *
+ * The idle queue is threaded through the i_prev/i_next payload fields
+ * via the FIRST/LAST/INEXT/IPREV macros; FIRST is the most recently
+ * used item, LAST the least recently used (eviction candidate).
+ */
+static void
+am_cuda_idleq_insert(cl_map_item_t* memcache_item)
+{
+	if (FIRST == NULL) {
+		/* Queue was empty: the item is both ends. */
+		FIRST = memcache_item;
+		LAST = memcache_item;
+		return;
+	}
+	/* Link the old head to the new item and advance FIRST.
+	 * NOTE(review): INEXT(memcache_item) is not reset here; the code
+	 * appears to rely on the head's i_next never being read before it
+	 * is overwritten by the next insert -- confirm against
+	 * am_cuda_idleq_remove(). */
+	INEXT(FIRST) = memcache_item;
+	IPREV(memcache_item) = FIRST;
+	FIRST = memcache_item;
+	return;
+}
+
+/*
+ * Remove least recent used element.
+ * Caller guarantees memcache_item is the current LAST.
+ */
+static void
+am_cuda_idleq_remove_last(cl_map_item_t* memcache_item)
+{
+	if (!INEXT(memcache_item)) {
+		/* Sole element: queue becomes empty.
+		 * NOTE(review): relies on INEXT of a single inserted item
+		 * reading as NULL; am_cuda_idleq_insert() never writes it
+		 * on the empty-queue path -- confirm mpool items are
+		 * zero-initialized. */
+		LAST = NULL;
+		FIRST = NULL;
+		return;
+	}
+	/* Promote the next-least-recently-used item to LAST. */
+	LAST = INEXT(memcache_item);
+	IPREV(LAST) = NULL;
+	return;
+}
+
+/* Unlink an arbitrary item from the idle queue. */
+static void
+am_cuda_idleq_remove(cl_map_item_t* memcache_item)
+{
+	if (LAST == memcache_item) {
+		am_cuda_idleq_remove_last(memcache_item);
+		return;
+	}
+	/* No successor and not LAST: treated as the head item.
+	 * NOTE(review): FIRST is not updated on this path; confirm callers
+	 * only reach it via am_cuda_idleq_reorder(), which immediately
+	 * re-inserts the same item. */
+	if (INEXT(memcache_item) == NULL) {
+		INEXT(IPREV(memcache_item)) = NULL;
+		return;
+	}
+	/* Middle of the queue: splice the neighbours together. */
+	INEXT(IPREV(memcache_item)) = INEXT(memcache_item);
+	IPREV(INEXT(memcache_item)) = IPREV(memcache_item);
+	return;
+}
+
+/* Move an item to the head of the idle queue (mark most recently used). */
+static void
+am_cuda_idleq_reorder(cl_map_item_t* memcache_item)
+{
+	if (FIRST == memcache_item && LAST == memcache_item ) {
+		/* Sole element: already in position. */
+		return;
+	}
+	am_cuda_idleq_remove(memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return;
+}
+
+/*
+ * After a successful cache hit, item is validated by doing a
+ * memcmp on the handle stored and the handle we receive from the
+ * sender. If the validation fails the item is removed from the idleq,
+ * the rbtree, is put back into the mpool and IpcCloseMemHandle function
+ * is called.
+ *
+ * Returns PSM2_OK on a valid hit; PSM2_OK_NO_PROGRESS after purging a
+ * stale entry (caller then opens the handle afresh).
+ */
+static psm2_error_t
+am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item,
+				 uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				 uint32_t length, psm2_epid_t epid)
+{
+	if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle,
+			 sizeof(cudaIpcMemHandle_t )))
+			 && sbuf == memcache_item->payload.start
+			 && epid == memcache_item->payload.epid) {
+		return PSM2_OK;
+	}
+	/* Stale entry: same key but different handle/owner -- purge it. */
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item);
+	PSMI_CUDA_CALL(cudaIpcCloseMemHandle,
+		       memcache_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove(memcache_item);
+	psmi_mpool_put(memcache_item);
+	return PSM2_OK_NO_PROGRESS;
+}
+
+/*
+ * Current eviction policy: Least Recently Used.
+ * Precondition: the cache is non-empty (LAST != NULL); only invoked
+ * from am_cuda_memhandle_cache_register() when NELEMS has reached
+ * cuda_memhandle_cache_size.
+ */
+static void
+am_cuda_memhandle_cache_evict()
+{
+	cl_map_item_t *p_item = LAST;
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
+	PSMI_CUDA_CALL(cudaIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove_last(p_item);
+	psmi_mpool_put(p_item);
+	return;
+}
+
+/*
+ * Insert a newly opened IPC mapping into the cache, evicting the LRU
+ * entry first if the cache is full.
+ *
+ * NOTE(review): payload.length is declared uint16_t in the header
+ * while length here is uint32_t, so lengths >= 64KiB are silently
+ * truncated in the cached entry -- confirm whether the field should
+ * be widened.
+ */
+static psm2_error_t
+am_cuda_memhandle_cache_register(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				 uint32_t length, psm2_epid_t epid,
+				 void* cuda_ipc_dev_ptr)
+{
+	if (NELEMS == cuda_memhandle_cache_size)
+		am_cuda_memhandle_cache_evict();
+	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
+	/* memcache_item cannot be NULL as we evict
+	 * before the call to mpool_get. Check has
+	 * been fixed to help with klockwork analysis.
+	 */
+	if (memcache_item == NULL)
+		return PSM2_NO_MEMORY;
+	memcache_item->payload.start = sbuf;
+	memcache_item->payload.cuda_ipc_handle = *handle;
+	memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr;
+	memcache_item->payload.length = length;
+	memcache_item->payload.epid = epid;
+	ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return PSM2_OK;
+}
+
+/*
+ * The key used to search the cache is the senders buf address pointer.
+ * Upon a successful hit in the cache, additional validation is required
+ * as multiple senders could potentially send the same buf address value.
+ *
+ * Returns a locally usable CUDA device pointer for the sender's
+ * buffer.  With the cache disabled, every call opens the IPC handle
+ * directly (and am_cuda_memhandle_release() closes it).
+ */
+void*
+am_cuda_memhandle_acquire(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				uint32_t length, psm2_epid_t epid)
+{
+	void* cuda_ipc_dev_ptr;
+	if(cuda_memhandle_cache_enabled) {
+		cl_qmap_t *p_map = &cuda_memhandle_cachemap;
+		cl_map_item_t *p_item;
+		unsigned long start = (unsigned long)sbuf;
+		unsigned long end = start + length;
+		p_item = ips_cl_qmap_search(p_map, start, end);
+		/* A miss yields the NIL item, whose start is 0
+		 * (see am_cuda_memhandle_cache_map_init). */
+		if (p_item->payload.start) {
+			if (am_cuda_memhandle_cache_validate(p_item, sbuf,
+					handle, length, epid) == PSM2_OK) {
+#ifdef PSM_DEBUG
+				cache_hit_counter++;
+#endif
+				am_cuda_idleq_reorder(p_item);
+				return p_item->payload.cuda_ipc_dev_ptr;
+			}
+		}
+#ifdef PSM_DEBUG
+		/* Counts both true misses and invalidated stale hits. */
+		cache_miss_counter++;
+#endif
+		PSMI_CUDA_CALL(cudaIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+				 *handle, cudaIpcMemLazyEnablePeerAccess);
+		am_cuda_memhandle_cache_register(sbuf, handle,
+					   length, epid, cuda_ipc_dev_ptr);
+		return cuda_ipc_dev_ptr;
+	} else {
+		PSMI_CUDA_CALL(cudaIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+				 *handle, cudaIpcMemLazyEnablePeerAccess);
+		return cuda_ipc_dev_ptr;
+	}
+}
+
+/* Close the IPC handle now only when the cache is disabled; cached
+ * handles stay open until eviction or cache teardown. */
+void
+am_cuda_memhandle_release(void* cuda_ipc_dev_ptr)
+{
+	if(!cuda_memhandle_cache_enabled)
+		PSMI_CUDA_CALL(cudaIpcCloseMemHandle, cuda_ipc_dev_ptr);
+	return;
+}
+
+/*
+ * This is the callback function invoked when the mempool is resized or
+ * destroyed.  Upon calling cache fini the mpool is destroyed, which in
+ * turn calls this callback, which helps in closing all memhandles.
+ */
+void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+	if (!is_alloc) {
+		/* Only items holding a live mapping (start != 0). */
+		if(memcache_item->payload.start)
+			PSMI_CUDA_CALL(cudaIpcCloseMemHandle,
+				       memcache_item->payload.cuda_ipc_dev_ptr);
+	}
+}
+
+#endif
diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h
new file mode 100644
index 0000000..13c972b
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.h
@@ -0,0 +1,124 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#ifndef _AM_CUDA_MEMHANDLE_CACHE_H
+#define _AM_CUDA_MEMHANDLE_CACHE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+struct _cl_map_item;
+
+/* Payload carried in each rbtree node of the cuda memhandle cache. */
+typedef struct
+{
+	unsigned long start; /* start virtual address */
+	cudaIpcMemHandle_t cuda_ipc_handle; /* cuda ipc mem handle */
+	void* cuda_ipc_dev_ptr;/* Cuda device pointer */
+	uint16_t length; /* length*/
+	/* NOTE(review): am_cuda_memhandle_acquire() passes length as
+	 * uint32_t; storing it here truncates values >= 64KiB -- confirm
+	 * whether this field should be uint32_t. */
+	psm2_epid_t epid;
+	struct _cl_map_item* i_prev; /* idle queue previous */
+	struct _cl_map_item* i_next; /* idle queue next */
+}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
+
+typedef struct {
+	uint32_t nelems; /* number of elements in the cache */
+} rbtree_cuda_memhandle_cache_map_pl_t;
+
+/* Payload types consumed by the generic rbtree template (rbtree.h). */
+#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+
+#include "rbtree.h"
+
+/* NOTE(review): the following globals are *definitions* placed in a
+ * header; if this header is ever included from more than one
+ * translation unit this relies on tentative-definition merging --
+ * consider extern declarations with one definition in a .c file. */
+cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+uint8_t cuda_memhandle_cache_enabled;
+mpool_t cuda_memhandle_mpool;
+uint32_t cuda_memhandle_cache_size;
+#define CUDA_MEMHANDLE_CACHE_SIZE 64
+
+/*
+ * Macro definition for easy programming.
+ */
+
+#define NELEMS cuda_memhandle_cachemap.payload.nelems
+
+/*
+ * Macro for idle queue management.  The idle queue is threaded through
+ * the i_prev/i_next fields of the root item's payload; FIRST is the
+ * most recently used entry, LAST the eviction candidate.
+ */
+#define IHEAD cuda_memhandle_cachemap.root
+#define LAST IHEAD->payload.i_prev
+#define FIRST IHEAD->payload.i_next
+#define INEXT(x) x->payload.i_next
+#define IPREV(x) x->payload.i_prev
+
+
+/* Create the node mpool (memcache_size entries incl. sentinels). */
+psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+
+/* Initialize the rbtree and enable the cache. */
+psm2_error_t am_cuda_memhandle_cache_map_init();
+
+/* Translate a sender's (sbuf, handle, epid) into a local device pointer. */
+void*
+am_cuda_memhandle_acquire(uintptr_t sbuf, cudaIpcMemHandle_t* handle,
+				uint32_t length, psm2_epid_t epid);
+/* Close the handle immediately when caching is disabled. */
+void
+am_cuda_memhandle_release(void* cuda_ipc_dev_ptr);
+
+/* mpool alloc/free callback; closes IPC handles on teardown. */
+void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj);
+
+void am_cuda_memhandle_cache_map_fini();
+
+#endif
+
+#endif
diff --git a/ptl_am/am_reqrep.c b/ptl_am/am_reqrep.c
new file mode 100644
index 0000000..5f90ec7
--- /dev/null
+++ b/ptl_am/am_reqrep.c
@@ -0,0 +1,118 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Shared-memory active-message short request.
+ *
+ * Prepends the handler index as arg[0], copies the caller's args
+ * after it, and issues a synchronous amsh short request; the
+ * completion callback (if any) is therefore invoked before return.
+ */
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+			   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			   void *src, size_t len, int flags,
+			   psm2_am_completion_fn_t completion_fn,
+			   void *completion_ctxt)
+{
+	psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC.
+	 * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly
+	 * used to optimize the IPS path though we could put a stricter interpretation
+	 * on it to disallow any replies.
+	 */
+
+	/* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry
+	 * the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	psmi_assert(epaddr->ptlctl->ptl != NULL);
+
+	req_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&req_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+	psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx,
+				req_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/*
+ * Shared-memory active-message short reply, issued from within a
+ * handler via its token.  Mirrors psmi_amsh_am_short_request():
+ * arg[0] carries the handler index, the caller's args follow, and the
+ * completion callback (if any) runs synchronously before return.
+ */
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+			 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			 void *src, size_t len, int flags,
+			 psm2_am_completion_fn_t completion_fn,
+			 void *completion_ctxt)
+{
+	psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry
+	 * the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	rep_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+
+	psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx,
+			      rep_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c
new file mode 100755
index 0000000..52d3ab2
--- /dev/null
+++ b/ptl_am/am_reqrep_shmem.c
@@ -0,0 +1,2590 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h> /* shm_open and signal handling */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+#include "psmi_wrappers.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/* Rendezvous threshold for the shm MQ path; lowered to the CMA value
+ * in psmi_shm_map_remote() when CMA kassist is available. */
+int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+
+/* AMLONG_SZ is the total size in memory of a bulk packet, including an
+ * am_pkt_bulk_t header struct.
+ * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */
+#define AMLONG_SZ 8192
+#define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t))
+
+/* Element counts for the request/reply short and long FIFOs. */
+static const amsh_qinfo_t amsh_qcounts = {
+	.qreqFifoShort = 1024,
+	.qreqFifoLong = 256,
+	.qrepFifoShort = 1024,
+	.qrepFifoLong = 256
+};
+
+/* Element sizes for the same FIFOs. */
+static const amsh_qinfo_t amsh_qelemsz = {
+	.qreqFifoShort = sizeof(am_pkt_short_t),
+	.qreqFifoLong = AMLONG_SZ,
+	.qrepFifoShort = sizeof(am_pkt_short_t),
+	.qrepFifoLong = AMLONG_SZ
+};
+
+/* State stashed for the SIGSEGV/SIGBUS fault handler: the remote shm
+ * mapping being probed and the signal handlers we displaced. */
+ustatic struct {
+	void *addr;
+	size_t len;
+	struct sigaction SIGSEGV_old_act;
+	struct sigaction SIGBUS_old_act;
+} action_stash;
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly);
+static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq);
+static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+
+/* Kassist helper functions */
+#if _HFI_DEBUGGING
+static const char *psmi_kassist_getmode(int mode);
+#endif
+static int psmi_get_kassist_mode();
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+/* Initialize a shared queue header: a process-shared spinlock plus an
+ * empty ring of elem_cnt elements of elem_sz bytes each. */
+static inline void
+am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz)
+{
+	pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED);
+	q->head = 0;
+	q->tail = 0;
+	q->elem_cnt = elem_cnt;
+	q->elem_sz = elem_sz;
+}
+
+/* Stamp each bulk packet in a contiguous region with its slot index,
+ * walking the region in elemsz-byte strides. */
+static void
+am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems)
+{
+	uintptr_t cursor = (uintptr_t) base_ptr;
+	int idx;
+
+	for (idx = 0; idx < nelems; idx++) {
+		((am_pkt_bulk_t *) cursor)->idx = idx;
+		cursor += elemsz;
+	}
+}
+
+/* _PA(type): page-aligned byte size of one FIFO (count * elemsz). */
+#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \
+			       PSMI_PAGESIZE)
+/* Total bytes of one endpoint's shm block: header, request control
+ * block + request FIFOs, reply control block + reply FIFOs; the whole
+ * thing rounded up to a page. */
+static inline uintptr_t am_ctl_sizeof_block()
+{
+	return PSMI_ALIGNUP(
+			PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) +
+			/* reqctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			_PA(reqFifoShort) + _PA(reqFifoLong) +
+			/* repctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			/* align to page size */
+			_PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE);
+}
+
+#undef _PA
+
+static void am_update_directory(struct am_ctl_nodeinfo *);
+
+/* atexit hook: unlink every opened endpoint's shm object so stale
+ * files are not left behind in /dev/shm.  Mutex-guarded so the
+ * cleanup runs at most once even if invoked from both atexit and the
+ * fault handler. */
+static
+void amsh_atexit()
+{
+	static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER;
+	static int atexit_once;
+	psm2_ep_t ep;
+	ptl_t *ptl;
+
+	pthread_mutex_lock(&mutex_once);
+	if (atexit_once) {
+		pthread_mutex_unlock(&mutex_once);
+		return;
+	} else
+		atexit_once = 1;
+	pthread_mutex_unlock(&mutex_once);
+
+	/* Walk the list of opened endpoints and unlink each shm file. */
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		ptl = ep->ptl_amsh.ptl;
+		if (ptl->self_nodeinfo &&
+		    ptl->amsh_keyname != NULL) {
+			_HFI_VDBG("unlinking shm file %s\n",
+				  ptl->amsh_keyname);
+			shm_unlink(ptl->amsh_keyname);
+		}
+		ep = ep->user_ep_next;
+	}
+
+	return;
+}
+
+/*
+ * SIGSEGV/SIGBUS handler installed while touching a freshly mmap'ed
+ * remote shm segment (see psmi_shm_map_remote).  A fault inside the
+ * stashed [addr, addr+len) window means the backing shm object is bad
+ * (e.g. stale or truncated): print advice, run cleanup, and exit.
+ * Faults elsewhere are forwarded to the previously installed handler,
+ * or re-raised under the default disposition.
+ */
+ustatic
+void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context)
+{
+	if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr &&
+	    (unsigned long int) siginfo->si_addr < (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) {
+
+		static char shm_errmsg[256];
+
+		snprintf(shm_errmsg, sizeof(shm_errmsg),
+			 "%s: Unable to allocate shared memory for intra-node messaging.\n"
+			 "%s: Delete stale shared memory files in /dev/shm.\n",
+			 psmi_gethostname(), psmi_gethostname());
+		amsh_atexit();
+		if (psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1)
+			psmi_exit(2);
+		else
+			psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */
+	} else {
+		if (signo == SIGSEGV) {
+			if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) {
+				/* Default disposition: restore it, re-raise,
+				 * then reinstall ourselves in case execution
+				 * resumes.
+				 * NOTE(review): act.sa_mask is never
+				 * initialized (no sigemptyset) before
+				 * psmi_sigaction -- confirm. */
+				psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+				raise(SIGSEGV);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGSEGV, &act, NULL);
+			} else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				/* Chain to the displaced handler. */
+				action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else if (signo == SIGBUS) {
+			if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) {
+				psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+				raise(SIGBUS);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGBUS, &act, NULL);
+			} else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else {
+			psmi_exit(signo);
+		}
+	}
+}
+
+/**
+ * Create endpoint shared-memory object, containing ep's info
+ * and message queues.
+ *
+ * On success ptl->self_nodeinfo points at the start of the mapping
+ * and ptl->amsh_keyname holds the shm object name (unlinked at exit
+ * by amsh_atexit()).  The fd is closed after mmap; the mapping is
+ * touched page-by-page via memset so faults surface here.
+ */
+psm2_error_t psmi_shm_create(ptl_t *ptl)
+{
+	psm2_ep_t ep = ptl->ep;
+	char shmbuf[256];
+	void *mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int shmfd;
+	char *amsh_keyname;
+	int iterator;
+	/* Get which kassist mode to use. */
+	ptl->psmi_kassist_mode = psmi_get_kassist_mode();
+
+	if (_HFI_PRDBG_ON) {
+		_HFI_PRDBG_ALWAYS
+			("kassist_mode %d %s use_kassist %d\n",
+			 ptl->psmi_kassist_mode,
+			 psmi_kassist_getmode(ptl->psmi_kassist_mode),
+			 (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF));
+	}
+
+	segsz = am_ctl_sizeof_block();
+	/* Probe for a free shm name: uid + epid + retry counter.  EACCES
+	 * means an object of that name belongs to another user (stale
+	 * file); bump the counter and try again.
+	 * NOTE(review): iterator <= INT_MAX means the final increment
+	 * overflows a signed int (UB) if the namespace is ever truly
+	 * exhausted -- confirm acceptable. */
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm2_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 ep->epid,
+			 iterator);
+		amsh_keyname = psmi_strdup(NULL, shmbuf);
+		if (amsh_keyname == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		shmfd =
+		    shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+		if (shmfd < 0) {
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error creating shared "
+							"memory object in "
+							"shm_open: %s",
+							strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object "
+							"with fstat: %s",
+							strerror(errno));
+				goto fail;
+			}
+			/* Only accept an object we own. */
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(shmfd);
+			}
+		}
+	}
+	if (err) {
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error creating shared memory object "
+					"in shm_open: namespace exhausted.");
+		goto fail;
+	}
+
+	/* Now register the atexit handler for cleanup, whether master or slave */
+	atexit(amsh_atexit);
+
+	_HFI_PRDBG("Opened shmfile %s\n", amsh_keyname);
+
+	if (ftruncate(shmfd, segsz) != 0) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+			"Error setting size of shared memory object to %u bytes in "
+			"ftruncate: %s\n",
+			(uint32_t) segsz,
+			strerror(errno));
+		goto fail;
+	}
+
+	mapptr = mmap(NULL, segsz,
+		      PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0);
+	if (mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping shared memory: %s",
+					strerror(errno));
+		goto fail;
+	}
+	close(shmfd);
+	memset((void *) mapptr, 0, segsz); /* touch all of my pages */
+
+	/* Our own ep's info for ptl_am resides at the start of the
+	   shm object. Other processes need some of this info to
+	   understand the rest of the queue structure and other details. */
+	ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr;
+	ptl->amsh_keyname = amsh_keyname;
+	ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr;
+
+fail:
+	return err;
+}
+
+/* Grow the per-peer directory by AMSH_DIRBLOCK_SIZE entries: copy the
+ * existing entries into a larger 64-byte-aligned array, zero the new
+ * tail, and free the old array.  On PSM2_NO_MEMORY the old directory
+ * is left untouched. */
+psm2_error_t psmi_epdir_extend(ptl_t *ptl)
+{
+	struct am_ctl_nodeinfo *new = NULL;
+
+	new = (struct am_ctl_nodeinfo *)
+		psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+			      (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) *
+			      sizeof(struct am_ctl_nodeinfo));
+	if (new == NULL)
+		return PSM2_NO_MEMORY;
+
+	memcpy(new, ptl->am_ep,
+	       ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+	memset(new + ptl->am_ep_size, 0,
+	       AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo));
+
+	psmi_free(ptl->am_ep);
+	ptl->am_ep = new;
+	ptl->am_ep_size += AMSH_DIRBLOCK_SIZE;
+
+	return PSM2_OK;
+}
+
+/**
+ * Unmap shm regions upon proper disconnect with other processes.
+ * Returns PSM2_OK on success, otherwise the error reported by
+ * psmi_handle_error() for the failed munmap.
+ */
+psm2_error_t psmi_do_unmap(uintptr_t shmbase)
+{
+	if (munmap((void *)shmbase, am_ctl_sizeof_block()) == 0)
+		return PSM2_OK;
+
+	return psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				 "Error with munmap of shared segment: %s",
+				 strerror(errno));
+}
+
+/**
+ * Map a remote process' shared memory object.
+ *
+ * If the remote process has a shared memory object available, add it to our own
+ * directory and return the shmidx. If the shared memory object does not exist,
+ * return -1, and the connect poll function will try to map again later.
+ *
+ * While the fresh mapping is probed, a SIGSEGV/SIGBUS handler
+ * (amsh_mmap_fault) guards against a stale/truncated backing file;
+ * the previous handlers are restored before returning.
+ */
+psm2_error_t psmi_shm_map_remote(ptl_t *ptl, psm2_epid_t epid, uint16_t *shmidx_o)
+{
+	int i;
+	int use_kassist;
+	uint16_t shmidx;
+	char shmbuf[256];
+	void *dest_mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int dest_shmfd;
+	struct am_ctl_nodeinfo *dest_nodeinfo;
+	int iterator;
+
+	shmidx = *shmidx_o = -1;
+
+	/* Already in the directory?  Return the existing index. */
+	for (i = 0; i <= ptl->max_ep_idx; i++) {
+		if (ptl->am_ep[i].epid == epid) {
+			*shmidx_o = shmidx = i;
+			return err;
+		}
+	}
+
+
+	use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF);
+
+	segsz = am_ctl_sizeof_block();
+	/* Probe the shm namespace with the same naming scheme used by
+	 * psmi_shm_create() until we find the object owned by our uid. */
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm2_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 epid,
+			 iterator);
+		dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU);
+		if (dest_shmfd < 0) {
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error opening remote "
+							"shared memory object "
+							"in shm_open: %s",
+							strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(dest_shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object "
+							"with fstat: %s",
+							strerror(errno));
+				goto fail;
+			}
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(dest_shmfd);
+			}
+		}
+	}
+	if (err) {
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error opening remote shared "
+					"memory object in shm_open: "
+					"namespace exhausted.");
+		goto fail;
+	}
+
+	dest_mapptr = mmap(NULL, segsz,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0);
+	if (dest_mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping remote shared memory: %s",
+					strerror(errno));
+		goto fail;
+	}
+	close(dest_shmfd);
+	dest_nodeinfo = (struct am_ctl_nodeinfo *)dest_mapptr;
+
+	/* We core dump right after here if we don't check the mmap */
+	action_stash.addr = dest_mapptr;
+	action_stash.len = segsz;
+
+	struct sigaction act;
+	act.sa_sigaction = amsh_mmap_fault;
+	act.sa_flags = SA_SIGINFO;
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+	{
+		/* Spin until the remote side has published its block. */
+		volatile uint16_t *is_init = &dest_nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		ips_sync_reads();
+		/* Fixed format string: was "size=%dn" (literal 'n', no
+		 * newline). */
+		_HFI_PRDBG("Got a published remote dirpage page at "
+			   "%p, size=%d\n", dest_mapptr, (int)segsz);
+	}
+
+	shmidx = -1;
+	/* Grow the directory if full, then refresh existing entries. */
+	if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) {
+		err = psmi_epdir_extend(ptl);
+		if (err)
+			goto fail;
+
+		for (i = 0; i <= ptl->max_ep_idx; i++) {
+			if (ptl->am_ep[i].epid != 0)
+				am_update_directory(&ptl->am_ep[i]);
+		}
+	}
+	/* Claim the first free directory slot for this epid. */
+	for (i = 0; i < ptl->am_ep_size; i++) {
+		psmi_assert(ptl->am_ep[i].epid != epid);
+		if (ptl->am_ep[i].epid == 0) {
+			ptl->am_ep[i].epid = epid;
+			ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno;
+			ptl->am_ep[i].pid = dest_nodeinfo->pid;
+			if (use_kassist) {
+				/* If we are able to use CMA assume everyone
+				 * else on the node can also use it.
+				 * Advertise that CMA is active via the
+				 * feature flag.
+				 */
+
+				if (cma_available()) {
+					ptl->am_ep[i].amsh_features |=
+						AMSH_HAVE_CMA;
+					psmi_shm_mq_rv_thresh =
+						PSMI_MQ_RV_THRESH_CMA;
+				} else {
+					ptl->psmi_kassist_mode =
+						PSMI_KASSIST_OFF;
+					use_kassist = 0;
+					psmi_shm_mq_rv_thresh =
+						PSMI_MQ_RV_THRESH_NO_KASSIST;
+				}
+			} else
+				psmi_shm_mq_rv_thresh =
+					PSMI_MQ_RV_THRESH_NO_KASSIST;
+			_HFI_PRDBG("KASSIST MODE: %s\n",
+				   psmi_kassist_getmode(ptl->psmi_kassist_mode));
+			shmidx = *shmidx_o = i;
+			_HFI_PRDBG("Mapped epid %lx into shmidx %d\n", epid, shmidx);
+			ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr;
+			ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes;
+			if (i > ptl->max_ep_idx)
+				ptl->max_ep_idx = i;
+			break;
+		}
+	}
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+	if (shmidx == (uint16_t)-1)
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Could not connect to local endpoint");
+fail:
+	return err;
+}
+
+/**
+ * Initialize pointer structure and locks for endpoint shared-memory AM.
+ */
+
+#define AMSH_QSIZE(type) \
+	PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \
+		     PSMI_PAGESIZE)
+
+/**
+ * Create this endpoint's shared-memory segment and lay out its short/long
+ * request and reply FIFOs.
+ *
+ * A SIGSEGV/SIGBUS handler is installed around the first touches of the
+ * freshly mapped block so that a bad mapping is reported instead of
+ * core dumping; the previous handlers are restored before returning.
+ *
+ * @param ptl  Shared-memory PTL; must have a valid ep/epaddr/epid.
+ * @return PSM2_OK, or the error from psmi_shm_create().
+ */
+static psm2_error_t amsh_init_segment(ptl_t *ptl)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Preconditions */
+	psmi_assert_always(ptl != NULL);
+	psmi_assert_always(ptl->ep != NULL);
+	psmi_assert_always(ptl->epaddr != NULL);
+	psmi_assert_always(ptl->ep->epid != 0);
+
+	if ((err = psmi_shm_create(ptl)))
+		goto fail;
+
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+
+	/* We core dump right after here if we don't check the mmap */
+
+	struct sigaction act;
+	/* BUGFIX: 'act' was previously passed to sigaction() with an
+	 * indeterminate sa_mask (and any other fields beyond the two set
+	 * below).  Zero the structure and start from an empty signal mask. */
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_sigaction = amsh_mmap_fault;
+	act.sa_flags = SA_SIGINFO;
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+	/*
+	 * Now that we know our epid, update it in the shmidx array
+	 */
+	ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL;
+	ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL;
+
+	am_update_directory(ptl->self_nodeinfo);
+
+	ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort));
+	ptl->reqH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) +
+		 amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort);
+
+	ptl->repH.head = ptl->repH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort));
+	ptl->repH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) +
+		 amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort);
+
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq,
+			 amsh_qcounts.qreqFifoShort,
+			 amsh_qelemsz.qreqFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq,
+			 amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq,
+			 amsh_qcounts.qrepFifoShort,
+			 amsh_qelemsz.qrepFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq,
+			 amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong);
+
+	/* Set bulkidx in every bulk packet */
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong,
+			    amsh_qelemsz.qreqFifoLong,
+			    amsh_qcounts.qreqFifoLong);
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong,
+			    amsh_qelemsz.qrepFifoLong,
+			    amsh_qcounts.qrepFifoLong);
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+fail:
+	return err;
+}
+
+/**
+ * Unlink and unmap this endpoint's own shared-memory segment.
+ *
+ * No-op if the segment was never created (self_nodeinfo == NULL).
+ * On success, self_nodeinfo is cleared so a second detach is harmless.
+ *
+ * @return PSM2_OK, or PSM2_SHMEM_SEGMENT_ERR if munmap() fails.
+ */
+psm2_error_t psmi_shm_detach(ptl_t *ptl)
+{
+	psm2_error_t err = PSM2_OK;
+	uintptr_t shmbase;
+
+	if (ptl->self_nodeinfo == NULL)
+		return err;
+
+	/* +1 skips the leading '/' of the shm object name for display */
+	_HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1);
+	shmbase = ptl->self_nodeinfo->amsh_shmbase;
+	shm_unlink(ptl->amsh_keyname);
+	psmi_free(ptl->amsh_keyname);
+	/* BUGFIX: clear the freed name so later code cannot dereference
+	 * or double-free a dangling pointer. */
+	ptl->amsh_keyname = NULL;
+
+	if (munmap((void *)shmbase, am_ctl_sizeof_block())) {
+		err =
+		    psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				      "Error with munmap of shared segment: %s",
+				      strerror(errno));
+		goto fail;
+	}
+	ptl->self_nodeinfo = NULL;
+	return PSM2_OK;
+
+fail:
+	return err;
+}
+
+/**
+ * Update locally shared-pointer directory. The directory must be
+ * updated when a new epaddr is connected to or on every epaddr already
+ * connected to whenever the shared memory segment is relocated via mremap.
+ *
+ * @param epaddr Endpoint address for which to update local directory.
+ */
+
+static
+void am_update_directory(struct am_ctl_nodeinfo *nodeinfo)
+{
+ uintptr_t base_this;
+
+ base_this = nodeinfo->amsh_shmbase +
+ AMSH_BLOCK_HEADER_SIZE;
+
+ /* Request queues */
+ nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this;
+ nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+
+ nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoShort +
+ nodeinfo->amsh_qsizes.qreqFifoShort);
+
+ /* Reply queues */
+ nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoLong +
+ nodeinfo->amsh_qsizes.qreqFifoLong);
+
+ nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+ nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepFifoShort +
+ nodeinfo->amsh_qsizes.qrepFifoShort);
+
+ _HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qreqH,
+ nodeinfo->qdir.qreqFifoShort,
+ nodeinfo->qdir.qreqFifoLong);
+ _HFI_VDBG("epaddr=%p Reply Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qrepH,
+ nodeinfo->qdir.qrepFifoShort,
+ nodeinfo->qdir.qrepFifoLong);
+
+ /* Sanity check */
+ uintptr_t base_next =
+ (uintptr_t) nodeinfo->qdir.qrepFifoLong +
+ nodeinfo->amsh_qsizes.qrepFifoLong;
+
+ psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block());
+}
+
+
+/* Thin wrapper over psm2_ep_epid_share_memory(): returns nonzero when
+ * 'epid' lives on the same node and is reachable via shared memory. */
+static
+int amsh_epid_reachable(ptl_t *ptl, psm2_epid_t epid)
+{
+	int shares_memory = 0;
+	psm2_error_t rc = psm2_ep_epid_share_memory(ptl->ep, epid,
+						    &shares_memory);
+	psmi_assert_always(rc == PSM2_OK);
+	return shares_memory;
+}
+
+/**
+ * Allocate and register an am_epaddr_t for peer 'epid' at directory
+ * slot 'shmidx'.
+ *
+ * On success *epaddr_o holds the new address, the directory slot is
+ * published and the epid is added to the ep lookup table.  On failure
+ * everything is rolled back and the allocation freed.
+ *
+ * @return PSM2_OK, PSM2_NO_MEMORY, or an error from the helpers below.
+ */
+static
+psm2_error_t
+amsh_epaddr_add(ptl_t *ptl, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o)
+{
+	psm2_epaddr_t epaddr;
+	am_epaddr_t *amaddr;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL);
+
+	/* The self PTL handles loopback communication. */
+	psmi_assert(epid != ptl->epid);
+
+	/* note the size of the memory is am_epaddr_t */
+	epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep,
+					     PER_PEER_ENDPOINT, 1,
+					     sizeof(am_epaddr_t));
+	if (epaddr == NULL) {
+		return PSM2_NO_MEMORY;
+	}
+	psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL);
+
+	if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid),
+					  psmi_gethostname(), 0)))
+		goto fail;
+
+	epaddr->ptlctl = ptl->ctl;
+	epaddr->epid = epid;
+
+	/* convert to am_epaddr_t */
+	amaddr = (am_epaddr_t *) epaddr;
+	/* tell the other endpoint their location in our directory */
+	amaddr->_shmidx = shmidx;
+	/* we haven't connected yet, so we can't give them the same hint */
+	amaddr->_return_shmidx = -1;
+	AMSH_CSTATE_OUTGOING_SET(amaddr, NONE);
+	AMSH_CSTATE_INCOMING_SET(amaddr, NONE);
+
+	/* other setup */
+	ptl->am_ep[shmidx].epaddr = epaddr;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	/* Finally, add to table */
+	if ((err = psmi_epid_add(ptl->ep, epid, epaddr)))
+		goto fail;
+	_HFI_VDBG("epaddr=%s added to ptl=%p\n",
+		  psmi_epaddr_get_name(epid), ptl);
+	*epaddr_o = epaddr;
+	return PSM2_OK;
+fail:
+	/* BUGFIX: if psmi_epid_add() failed, the directory slot was already
+	 * published above; clear it so it does not dangle after the free. */
+	if (ptl->am_ep[shmidx].epaddr == epaddr)
+		ptl->am_ep[shmidx].epaddr = NULL;
+	if (epaddr != ptl->epaddr)
+		psmi_free(epaddr);
+	return err;
+}
+
+/* Re-synchronize a connected peer's directory entry after its shm file
+ * was re-created (e.g. the peer restarted): reset the outgoing
+ * connection state, wait for the peer to publish its nodeinfo page
+ * again, then re-read version/pid/queue sizes and rebuild the
+ * directory pointers. */
+static
+void
+amsh_epaddr_update(ptl_t *ptl, psm2_epaddr_t epaddr)
+{
+	am_epaddr_t *amaddr;
+	uint16_t shmidx;
+	struct am_ctl_nodeinfo *nodeinfo;
+
+	amaddr = (am_epaddr_t *) epaddr;
+	shmidx = amaddr->_shmidx;
+	nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase;
+
+	/* restart the connection process */
+	amaddr->_return_shmidx = -1;
+	AMSH_CSTATE_OUTGOING_SET(amaddr, NONE);
+
+	/* wait for the other process to init again */
+	{
+		/* volatile read: is_init is written by the peer process */
+		volatile uint16_t *is_init = &nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		/* order the is_init read before reading the rest of the page */
+		ips_sync_reads();
+	}
+
+	/* get the updated values from the new nodeinfo page */
+	ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno;
+	ptl->am_ep[shmidx].pid = nodeinfo->pid;
+	ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	return;
+}
+
+/* Tracks one in-flight connect/disconnect request across the
+ * init/poll/fini state machine below.  One instance covers a whole
+ * array of peer epids. */
+struct ptl_connection_req {
+	int isdone;		/* all peers reached a terminal state */
+	int op;			/* connect or disconnect */
+	int numep;		/* total entries in the arrays below */
+	int numep_left;		/* entries not yet in a terminal state */
+	int phase;		/* ptl->connect_phase captured at init */
+
+	int *epid_mask;		/* per-peer AMSH_CMASK_* progress state */
+	const psm2_epid_t *epids;	/* input epid list */
+	psm2_epaddr_t *epaddr;
+	psm2_error_t *errors;	/* inout errors */
+
+	/* Used for connect/disconnect */
+	psm2_amarg_t args[4];
+};
+
+/* Remove 'epaddr' from its endpoint's epid lookup table and release
+ * its storage.  Caller must ensure no further references exist. */
+static
+void amsh_free_epaddr(psm2_epaddr_t epaddr)
+{
+	psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+	psmi_free(epaddr);
+	return;
+}
+
+/* Operation codes driving the connreq state machine below. */
+#define PTL_OP_CONNECT      0
+#define PTL_OP_DISCONNECT   1
+#define PTL_OP_ABORT        2
+
+/* First phase of the connect/disconnect state machine: classify every
+ * masked-in peer and build a ptl_connection_req.
+ *
+ * Each peer ends in one of: AMSH_CMASK_NONE (nothing to do, result
+ * already recorded in array_of_errors) or AMSH_CMASK_PREREQ (work
+ * remains for the poll phase).
+ *
+ * Returns PSM2_OK with *req_o == NULL when there is nothing left to do
+ * (the connect phase counter is bumped), PSM2_OK_NO_PROGRESS with a
+ * live request otherwise, or PSM2_NO_MEMORY. */
+static
+psm2_error_t
+amsh_ep_connreq_init(ptl_t *ptl, int op, /* connect, disconnect or abort */
+		     int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr,
+		     struct ptl_connection_req **req_o)
+{
+	int i, cstate;
+	psm2_epaddr_t epaddr;
+	psm2_epid_t epid;
+	struct ptl_connection_req *req = NULL;
+
+	req = (struct ptl_connection_req *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1,
+			sizeof(struct ptl_connection_req));
+	if (req == NULL)
+		return PSM2_NO_MEMORY;
+	req->isdone = 0;
+	req->op = op;
+	req->numep = numep;
+	req->numep_left = 0;
+	req->phase = ptl->connect_phase;
+	req->epid_mask = (int *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int));
+	if (req->epid_mask == NULL) {
+		psmi_free(req);
+		return PSM2_NO_MEMORY;
+	}
+	req->epaddr = array_of_epaddr;
+	req->epids = array_of_epid;
+	req->errors = array_of_errors;
+
+	/* First check if there's really something to connect/disconnect
+	 * for this PTL */
+	for (i = 0; i < numep; i++) {
+		req->epid_mask[i] = AMSH_CMASK_NONE;	/* no connect by default */
+		if (!array_of_epid_mask[i])
+			continue;
+		if (op == PTL_OP_CONNECT) {
+			epid = array_of_epid[i];
+
+			/* Connect only to other processes reachable by shared memory.
+			   The self PTL handles loopback communication, so explicitly
+			   refuse to connect to self. */
+			if (!amsh_epid_reachable(ptl, epid)
+			    || epid == ptl->epid) {
+				array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+				array_of_epaddr[i] = NULL;
+				continue;
+			}
+
+			_HFI_VDBG("looking at epid %llx\n",
+				  (unsigned long long)epid);
+			epaddr = psmi_epid_lookup(ptl->ep, epid);
+			if (epaddr != NULL) {
+				/* already known, but owned by another PTL? */
+				if (epaddr->ptlctl->ptl != ptl) {
+					array_of_errors[i] =
+					    PSM2_EPID_UNREACHABLE;
+					array_of_epaddr[i] = NULL;
+					continue;
+				}
+				cstate =
+				    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+				if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+					/* already connected: done immediately */
+					array_of_epaddr[i] = epaddr;
+					array_of_errors[i] = PSM2_OK;
+				} else {
+					/* stale epaddr: retry the handshake;
+					 * PSM2_TIMEOUT is the provisional result */
+					psmi_assert(cstate ==
+						    AMSH_CSTATE_OUTGOING_NONE);
+					array_of_errors[i] = PSM2_TIMEOUT;
+					array_of_epaddr[i] = epaddr;
+					req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				}
+			} else {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				array_of_epaddr[i] = NULL;
+			}
+		} else {	/* disc or abort */
+			epaddr = array_of_epaddr[i];
+			psmi_assert(epaddr != NULL);
+			cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+			if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				_HFI_VDBG
+				    ("Just set index %d to AMSH_CMASK_PREREQ\n",
+				     i);
+			}
+			/* XXX undef ? */
+		}
+		if (req->epid_mask[i] != AMSH_CMASK_NONE)
+			req->numep_left++;
+	}
+
+	if (req->numep_left == 0) {	/* nothing to do */
+		psmi_free(req->epid_mask);
+		psmi_free(req);
+		_HFI_VDBG("Nothing to connect, bump up phase\n");
+		ptl->connect_phase++;
+		*req_o = NULL;
+		return PSM2_OK;
+	} else {
+		*req_o = req;
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+/* Second phase of the connect/disconnect state machine: advance every
+ * peer through PREREQ -> POSTREQ -> DONE.
+ *
+ * Connect: in PREREQ, locate (or map, via psmi_shm_map_remote) the
+ * peer's shm segment, check version interoperability, create the
+ * epaddr if needed, and send a PSMI_AM_CONN_REQ; in POSTREQ, wait for
+ * the outgoing state to reach REPLIED, then mark ESTABLISHED.
+ *
+ * Disconnect/abort: in PREREQ, send a PSMI_AM_DISC_REQ (and unmap the
+ * peer segment if the peer already requested disconnect from us); in
+ * POSTREQ, wait for DISC_REPLIED.
+ *
+ * Returns PSM2_OK when every peer is done, PSM2_OK_NO_PROGRESS while
+ * work remains, or a hard error. */
+static
+psm2_error_t
+amsh_ep_connreq_poll(ptl_t *ptl, struct ptl_connection_req *req)
+{
+	int i, j, cstate;
+	uint16_t shmidx = (uint16_t)-1;
+	psm2_error_t err = PSM2_OK;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+
+	if (req == NULL || req->isdone)
+		return PSM2_OK;
+
+	psmi_assert_always(ptl->connect_phase == req->phase);
+
+	if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+		for (i = 0; i < req->numep; i++) {
+			if (req->epid_mask[i] == AMSH_CMASK_NONE ||
+			    req->epid_mask[i] == AMSH_CMASK_DONE)
+				continue;
+
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+				/* Make sure the target of the disconnect is still there */
+				if (ptl->am_ep[shmidx].
+				    epid != epaddr->epid) {
+					/* peer already gone: nothing to send */
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *)
+								 epaddr, NONE);
+				}
+			}
+
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				req->args[0].u32w0 = PSMI_AM_DISC_REQ;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				/* shmidx was set in the branch above */
+				psmi_assert(shmidx != (uint16_t)-1);
+				req->args[2].u16w0 = shmidx;
+				req->args[2].u32w1 = PSM2_OK;
+				req->args[3].u64w0 =
+				    (uint64_t) (uintptr_t) &req->errors[i];
+				psmi_amsh_short_request(ptl, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 DISC_REQUESTED);
+				/**
+				 * Only munmap if we have nothing more to
+				 * communicate with the other node, i.e. we
+				 * already received a disconnect req from the
+				 * other node.
+				 */
+				if (AMSH_CSTATE_INCOMING_GET((am_epaddr_t *) epaddr) ==
+					AMSH_CSTATE_INCOMING_DISC_REQUESTED)
+					err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+			} else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+				cstate =
+				    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+				if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) {
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *)
+								 epaddr, NONE);
+				}
+			}
+		}
+	} else {
+		/* First see if we've made progress on any postreqs */
+		int n_prereq = 0;
+		for (i = 0; i < req->numep; i++) {
+			/* NOTE(review): this inner 'cstate' shadows the one
+			 * declared at function scope. */
+			int cstate;
+			if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) {
+				if (req->epid_mask[i] == AMSH_CMASK_PREREQ)
+					n_prereq++;
+				continue;
+			}
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+
+			/* detect if a race has occurred on due to re-using an
+			 * old shm file - if so, restart the connection */
+			shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+			if (ptl->am_ep[shmidx].pid !=
+			    ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 NONE);
+				n_prereq++;
+				amsh_epaddr_update(ptl, epaddr);
+				continue;
+			}
+
+			cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr,
+							 ESTABLISHED);
+				req->epid_mask[i] = AMSH_CMASK_DONE;
+				continue;
+			}
+		}
+		if (n_prereq > 0) {
+			psmi_assert(req->numep_left > 0);
+			/* Go through the list of peers we need to connect to and find out
+			 * if they each shared ep is mapped into shm */
+			for (i = 0; i < req->numep; i++) {
+				if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+					continue;
+				epid = req->epids[i];
+				epaddr = req->epaddr[i];
+				/* Go through mapped epids and find the epid we're looking for */
+				for (shmidx = -1, j = 0;
+				     j <= ptl->max_ep_idx; j++) {
+					/* epid is connected and ready to go */
+					if (ptl->am_ep[j].
+					    epid == epid) {
+						shmidx = j;
+						break;
+					}
+				}
+				if (shmidx == (uint16_t)-1) {
+					/* Couldn't find peer's epid in dirpage.
+					   Check shmdir to see if epid is up now. */
+					if ((err = psmi_shm_map_remote(ptl, epid, &shmidx))) {
+						return err;
+					}
+					continue;
+				}
+				/* Before we even send the request out, check to see if
+				 * versions are interoperable */
+				if (!psmi_verno_isinteroperable
+				    (ptl->am_ep[shmidx].
+				     psm_verno)) {
+					char buf[32];
+					uint16_t their_verno =
+					    ptl->am_ep[shmidx].
+					    psm_verno;
+					snprintf(buf, sizeof(buf), "%d.%d",
+						 PSMI_VERNO_GET_MAJOR
+						 (their_verno),
+						 PSMI_VERNO_GET_MINOR
+						 (their_verno));
+
+					_HFI_INFO("Local endpoint id %" PRIx64
+						  " has version %s "
+						  "which is not supported by library version %d.%d",
+						  epid, buf, PSM2_VERNO_MAJOR,
+						  PSM2_VERNO_MINOR);
+					req->errors[i] =
+					    PSM2_EPID_INVALID_VERSION;
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					continue;
+				}
+				if (epaddr != NULL) {
+					psmi_assert(((am_epaddr_t *) epaddr)->
+						    _shmidx == shmidx);
+				} else
+				    if ((epaddr =
+					 psmi_epid_lookup(ptl->ep,
+							  epid)) == NULL) {
+					if ((err =
+					     amsh_epaddr_add(ptl, epid, shmidx,
+							     &epaddr))) {
+						return err;
+					}
+				}
+				req->epaddr[i] = epaddr;
+				req->args[0].u32w0 = PSMI_AM_CONN_REQ;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				/* tell the other process its shmidx here */
+				req->args[2].u16w0 = shmidx;
+				req->args[2].u32w1 = PSM2_OK;
+				req->args[3].u64w0 =
+				    (uint64_t) (uintptr_t) &req->errors[i];
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+				psmi_amsh_short_request(ptl, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				_HFI_PRDBG("epaddr=%p, epid=%" PRIx64
+					   " at shmidx=%d\n", epaddr, epid,
+					   shmidx);
+			}
+		}
+	}
+
+	if (req->numep_left == 0) {	/* we're all done */
+		req->isdone = 1;
+		return PSM2_OK;
+	} else {
+		sched_yield();
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+/* Final phase of the connect/disconnect state machine: force every
+ * remaining peer into a terminal state, record timeouts for any that
+ * never progressed, aggregate the per-peer errors into the return
+ * value, and free the request.
+ *
+ * Bumping ptl->connect_phase first invalidates any replies still in
+ * flight so they cannot reference the freed request. */
+static
+psm2_error_t
+amsh_ep_connreq_fini(ptl_t *ptl, struct ptl_connection_req *req)
+{
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	/* Wherever we are at in our connect process, we've been instructed to
+	 * finish the connection process */
+	if (req == NULL)
+		return PSM2_OK;
+
+	/* This prevents future connect replies from referencing data structures
+	 * that disappeared */
+	ptl->connect_phase++;
+
+	/* First process any leftovers in postreq or prereq */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE)
+			continue;
+		else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+			int cstate;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+			cstate =
+			    AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) (req->
+								      epaddr[i]));
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) (req->
+									  epaddr[i]),
+							 ESTABLISHED);
+			} else {	/* never actually got reply */
+				req->errors[i] = PSM2_TIMEOUT;
+			}
+		}
+		/* If we couldn't go from prereq to postreq, that means we couldn't
+		 * find the shmidx for an epid in time. This can only be a case of
+		 * time out */
+		else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+			req->errors[i] = PSM2_TIMEOUT;
+			req->numep_left--;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+		}
+	}
+
+	/* Whatever is left can only be in DONE or NONE state */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE)
+			continue;
+		psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE);
+
+		/* keep the most severe of the accumulated errors */
+		err = psmi_error_cmp(err, req->errors[i]);
+		/* XXX TODO: Report errors in connection. */
+		/* Only free epaddr if they have disconnected from us */
+		int cstate = AMSH_CSTATE_INCOMING_GET((am_epaddr_t *) req->epaddr[i]);
+		if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) {
+			if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+				psmi_assert(req->epaddr[i] != NULL);
+				amsh_free_epaddr(req->epaddr[i]);
+				req->epaddr[i] = NULL;
+			}
+		}
+	}
+
+	psmi_free(req->epid_mask);
+	psmi_free(req);
+
+	return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect. The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD  20
+/**
+ * Drive the init/poll/fini connect-or-disconnect state machine until
+ * completion or timeout.  PSM2_SHM_POLITE_ATTACH=1 in the environment
+ * enables yielding after repeated no-progress polls.
+ */
+static
+psm2_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl, int op,
+		     int numep,
+		     const psm2_epid_t *array_of_epid,
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	psm2_error_t err;
+	uint64_t t_start;
+	struct ptl_connection_req *req;
+	int num_polls_noprogress = 0;
+	static int shm_polite_attach = -1;
+
+	if (shm_polite_attach == -1) {
+		char *p = getenv("PSM2_SHM_POLITE_ATTACH");
+		if (p && *p && atoi(p) != 0) {
+			fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+				psmi_gethostname());
+			shm_polite_attach = 1;
+		} else {
+			/* BUGFIX: this assignment was unconditional, clobbering
+			 * the '1' set just above, so polite attach could never
+			 * actually engage. */
+			shm_polite_attach = 0;
+		}
+	}
+
+	/* Initialize */
+	err = amsh_ep_connreq_init(ptl, op, numep,
+				   array_of_epid, array_of_epid_mask,
+				   array_of_errors, array_of_epaddr, &req);
+	if (err != PSM2_OK_NO_PROGRESS)	/* Either we're all done with connect or
+					 * there was an error */
+		return err;
+
+	/* Poll until either
+	 * 1. We time out
+	 * 2. We are done with connecting
+	 */
+	t_start = get_cycles();
+	do {
+		psmi_poll_internal(ptl->ep, 1);
+		err = amsh_ep_connreq_poll(ptl, req);
+		if (err == PSM2_OK)
+			break;	/* Finished before timeout */
+		else if (err != PSM2_OK_NO_PROGRESS) {
+			psmi_free(req->epid_mask);
+			psmi_free(req);
+			goto fail;
+		} else if (shm_polite_attach &&
+			   ++num_polls_noprogress ==
+			   CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+			num_polls_noprogress = 0;
+			PSMI_YIELD(ptl->ep->mq->progress_lock);
+		}
+	}
+	while (psmi_cycles_left(t_start, timeout_ns));
+
+	err = amsh_ep_connreq_fini(ptl, req);
+
+fail:
+	return err;
+}
+
+/* PTL entry point for connection establishment: runs the generic
+ * connreq state machine with the CONNECT opcode. */
+static
+psm2_error_t
+amsh_ep_connect(ptl_t *ptl,
+		int numep,
+		const psm2_epid_t *array_of_epid,
+		const int array_of_epid_mask[],
+		psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	const int op = PTL_OP_CONNECT;
+	return amsh_ep_connreq_wrap(ptl, op, numep, array_of_epid,
+				    array_of_epid_mask, array_of_errors,
+				    array_of_epaddr, timeout_ns);
+}
+
+/* PTL entry point for teardown: 'force' selects an abortive close,
+ * otherwise an orderly disconnect handshake is performed. */
+static
+psm2_error_t
+amsh_ep_disconnect(ptl_t *ptl, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_ns)
+{
+	const int op = force ? PTL_OP_ABORT : PTL_OP_DISCONNECT;
+	return amsh_ep_connreq_wrap(ptl, op, numep, NULL,
+				    array_of_epaddr_mask, array_of_errors,
+				    array_of_epaddr, timeout_ns);
+}
+
+#undef CSWAP
+/* 32-bit compare-and-swap via x86 LOCK CMPXCHG: atomically stores
+ * new_value into *p iff *p == old_value.  Returns the value observed
+ * in *p, which equals old_value exactly when the swap happened.
+ * (CMPXCHG implicitly uses/updates EAX, hence the "+a" constraint.) */
+PSMI_ALWAYS_INLINE(
+int32_t
+cswap(volatile int32_t *p, int32_t old_value, int32_t new_value))
+{
+	asm volatile ("lock cmpxchg %2, %0" :
+		      "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+	return old_value;
+}
+
+/* Claim the next free slot from a circular packet FIFO, or return NULL
+ * if the slot at the tail is still in use.
+ *
+ * Two implementations: a pthread-spinlock path (default, CSWAP is
+ * #undef'd above) that backs off when the tail slot is busy, and a
+ * lock-free path that advances the tail with compare-and-swap and then
+ * spins until the claimed slot is actually free. */
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0))
+{
+	am_pkt_short_t *pkt;
+	uint32_t idx;
+#ifndef CSWAP
+	pthread_spin_lock(&shq->lock);
+	idx = shq->tail;
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	if (pkt->flag == QFREE) {
+		/* order the flag read before claiming and writing the slot */
+		ips_sync_reads();
+		pkt->flag = QUSED;
+		shq->tail += 1;
+		if (shq->tail == shq->elem_cnt)
+			shq->tail = 0;
+	} else {
+		/* tail slot still owned by the consumer: caller must retry */
+		pkt = 0;
+	}
+	pthread_spin_unlock(&shq->lock);
+#else
+	uint32_t idx_next;
+	do {
+		idx = shq->tail;
+		idx_next = (idx + 1 == shq->elem_cnt) ? 0 : idx + 1;
+	} while (cswap(&shq->tail, idx, idx_next) != idx);
+
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	/* busy-wait until the claimed slot is released by the consumer */
+	while (cswap(&pkt->flag, QFREE, QUSED) != QFREE);
+#endif
+	return pkt;
+}
+
+/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */
+#define am_ctl_getslot_bulkpkt_inner(shq, pkt0)	((am_pkt_bulk_t *) \
+	am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0)))
+
+/* Grab a free short-packet slot from peer 'shmidx', choosing the
+ * request or reply FIFO according to 'is_reply'.  NULL if full. */
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt(ptl_t *ptl, uint16_t shmidx, int is_reply))
+{
+	struct am_ctl_nodeinfo *peer = &ptl->am_ep[shmidx];
+	volatile am_ctl_qhdr_t *shq = is_reply
+	    ? &peer->qdir.qrepH->shortq
+	    : &peer->qdir.qreqH->shortq;
+	am_pkt_short_t *pkt0 = is_reply
+	    ? peer->qdir.qrepFifoShort
+	    : peer->qdir.qreqFifoShort;
+	return am_ctl_getslot_pkt_inner(shq, pkt0);
+}
+
+/* Grab a free long/bulk-packet slot from peer 'shmidx', choosing the
+ * request or reply FIFO according to 'is_reply'.  NULL if full. */
+PSMI_ALWAYS_INLINE(
+am_pkt_bulk_t *
+am_ctl_getslot_long(ptl_t *ptl, uint16_t shmidx, int is_reply))
+{
+	struct am_ctl_nodeinfo *peer = &ptl->am_ep[shmidx];
+	volatile am_ctl_qhdr_t *shq = is_reply
+	    ? &peer->qdir.qrepH->longbulkq
+	    : &peer->qdir.qreqH->longbulkq;
+	am_pkt_bulk_t *pkt0 = is_reply
+	    ? peer->qdir.qrepFifoLong
+	    : peer->qdir.qreqFifoLong;
+	return am_ctl_getslot_bulkpkt_inner(shq, pkt0);
+}
+
+/* Built-in AM handler table.  The array index of each entry is its
+ * wire handler index (e.g. amsh_conn_handler_hidx), so the order here
+ * is part of the protocol and must not change; slot 0 is reserved. */
+psmi_handlertab_t psmi_allhandlers[] = {
+	{0}
+	,
+	{amsh_conn_handler}
+	,
+	{psmi_am_mq_handler}
+	,
+	{psmi_am_mq_handler_data}
+	,
+	{psmi_am_mq_handler_rtsmatch}
+	,
+	{psmi_am_mq_handler_rtsdone}
+	,
+	{psmi_am_handler}
+};
+
+/* Release the slot just consumed and step the ring cursor, wrapping
+ * back to the base when it runs off the end. */
+PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr))
+{
+	QMARKFREE(hdr->head);
+	hdr->head = (hdr->head + 1 == hdr->end) ? hdr->base : hdr->head + 1;
+}
+
+#define AMSH_ZERO_POLLS_BEFORE_YIELD  64
+#define AMSH_POLLS_BEFORE_PSM_POLL    16
+
+/* XXX this can be made faster. Instead of checking the flag of the head, keep
+ * a cached copy of the integer value of the tail and compare it against the
+ * previous one we saw.
+ */
+/* Core progress routine: drain the reply FIFO, then (unless replyonly)
+ * the deferred request queue and the request FIFO.  When called
+ * internally it also applies yield backoff after repeated empty polls
+ * and periodically kicks the global PSM poll loop.
+ * Returns PSM2_OK if any packet was processed, PSM2_OK_NO_PROGRESS
+ * otherwise. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_poll_internal_inner(ptl_t *ptl, int replyonly,
+			 int is_internal))
+{
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+	/* poll replies */
+	if (!QISEMPTY(ptl->repH.head->flag)) {
+		do {
+			/* order the flag read before reading packet contents */
+			ips_sync_reads();
+			process_packet(ptl, (am_pkt_short_t *) ptl->repH.head,
+				       0);
+			advance_head(&ptl->repH);
+			err = PSM2_OK;
+		} while (!QISEMPTY(ptl->repH.head->flag));
+	}
+
+	if (!replyonly) {
+		/* Request queue not enable for 2.0, will be re-enabled to support long
+		 * replies */
+		if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+			psmi_am_reqq_drain(ptl);
+			err = PSM2_OK;
+		}
+		if (!QISEMPTY(ptl->reqH.head->flag)) {
+			do {
+				ips_sync_reads();
+				process_packet(ptl,
+					       (am_pkt_short_t *) ptl->reqH.
+					       head, 1);
+				advance_head(&ptl->reqH);
+				err = PSM2_OK;
+			} while (!QISEMPTY(ptl->reqH.head->flag));
+		}
+	}
+
+	if (is_internal) {
+		if (err == PSM2_OK)	/* some progress, no yields */
+			ptl->zero_polls = 0;
+		else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+			/* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+			sched_yield();
+			ptl->zero_polls = 0;
+		}
+
+		/* periodically give the rest of PSM a chance to progress */
+		if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+			psmi_poll_internal(ptl->ep, 0);
+			ptl->amsh_only_polls = 0;
+		}
+	}
+	return err;		/* if we actually did something */
+}
+
+/* Out-of-line internal poll: request-queue draining is skipped and the
+ * yield/PSM-poll backoff accounting is enabled. */
+static
+psm2_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+	const int is_internal = 1;
+	return amsh_poll_internal_inner(ptl, replyonly, is_internal);
+}
+
+/* Spin-poll the shared-memory PTL until 'cond' becomes true; the
+ * PSM_PROFILE build additionally accounts blocked time via the
+ * PSMI_PROFILE_* hooks. */
+#ifdef PSM_PROFILE
+#define AMSH_POLL_UNTIL(ptl, isreply, cond)	\
+	do {					\
+		PSMI_PROFILE_BLOCK();		\
+		while (!(cond)) {		\
+			PSMI_PROFILE_REBLOCK(	\
+				amsh_poll_internal(ptl, isreply) ==	\
+					PSM2_OK_NO_PROGRESS);		\
+		}				\
+		PSMI_PROFILE_UNBLOCK();		\
+	} while (0)
+#else
+#define AMSH_POLL_UNTIL(ptl, isreply, cond)	\
+	do {					\
+		while (!(cond)) {		\
+			amsh_poll_internal(ptl, isreply);	\
+		}				\
+	} while (0)
+#endif
+
+/* External poll entry point: is_internal=0, so the deferred request
+ * queue is drained and the backoff accounting is skipped. */
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly)
+{
+	const int is_internal = 0;
+	return amsh_poll_internal_inner(ptl, replyonly, is_internal);
+}
+
+/* Fill in and publish one short packet to peer 'destidx'.  Blocks
+ * (polling) until a FIFO slot is free.  At most NSHORT_ARGS arguments
+ * travel in the packet itself; extra args and payload ride in the bulk
+ * packet referenced by 'bulkidx'.  QMARKREADY must come last: it is
+ * what makes the packet visible to the consumer. */
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx,
+		  uint32_t bulkidx, uint16_t fmt, uint16_t nargs,
+		  uint16_t handleridx, psm2_amarg_t *args,
+		  const void *src, uint32_t len, int isreply))
+{
+	int i;
+	volatile am_pkt_short_t *pkt;
+	int copy_nargs;
+
+	AMSH_POLL_UNTIL(ptl, isreply,
+			(pkt =
+			 am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL);
+
+	/* got a free pkt... fill it in */
+	pkt->bulkidx = bulkidx;
+	pkt->shmidx = returnidx;
+	pkt->type = fmt;
+	pkt->nargs = nargs;
+	pkt->handleridx = handleridx;
+
+	/* Limit the number of args copied here to NSHORT_ARGS.  Additional args
+	   are carried in the bulkpkt. */
+	copy_nargs = nargs;
+	if (copy_nargs > NSHORT_ARGS) {
+		copy_nargs = NSHORT_ARGS;
+	}
+
+	for (i = 0; i < copy_nargs; i++)
+		pkt->args[i] = args[i];
+
+	/* inline payload is packed right after the used args */
+	if (fmt == AMFMT_SHORT_INLINE)
+		mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src,
+			     len);
+
+	_HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d,"
+		  "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx,
+		  pkt->flag, pkt->nargs, src, (int)len, (int)handleridx,
+		  src != NULL ? *((uint32_t *) src) : 0);
+	QMARKREADY(pkt);
+}
+
+#define amsh_shm_copy_short psmi_mq_mtucpy
+#define amsh_shm_copy_long  psmi_mq_mtucpy
+
+/* Workhorse for all shared-memory AM sends (request/reply, short/long).
+ *
+ * Short: payload that fits in the unused arg space goes inline
+ * (AMFMT_SHORT_INLINE, bulkidx carries the length); otherwise payload
+ * and overflow args go in one bulk packet (AMFMT_SHORT).
+ * Long: payload is streamed in AMLONG_MTU-sized bulk packets, each
+ * announced with its own short packet; the last one is tagged
+ * AMFMT_LONG_END.
+ *
+ * Always returns 1 (blocks, via AMSH_POLL_UNTIL, until slots free up). */
+PSMI_ALWAYS_INLINE(
+int
+psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, void *dst, int flags))
+{
+	uint16_t type;
+	uint32_t bulkidx;
+	uint16_t hidx = (uint16_t) handler;
+	int destidx = ((am_epaddr_t *) epaddr)->_shmidx;
+	int returnidx = ((am_epaddr_t *) epaddr)->_return_shmidx;
+	int is_reply = AM_IS_REPLY(amtype);
+	volatile am_pkt_bulk_t *bulkpkt;
+
+	_HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n",
+		  is_reply ? "reply" : "request",
+		  psmi_epaddr_get_name(epaddr->epid),
+		  ((am_epaddr_t *) epaddr)->_shmidx, amtype);
+	psmi_assert(epaddr != ptl->epaddr);
+
+	switch (amtype) {
+	case AMREQUEST_SHORT:
+	case AMREPLY_SHORT:
+		if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) {
+			/* Payload fits in args packet */
+			type = AMFMT_SHORT_INLINE;
+			bulkidx = len;
+		} else {
+			int i;
+
+			psmi_assert(len < amsh_qelemsz.qreqFifoLong);
+			psmi_assert(src != NULL || nargs > NSHORT_ARGS);
+			type = AMFMT_SHORT;
+
+			AMSH_POLL_UNTIL(ptl, is_reply,
+					(bulkpkt =
+					 am_ctl_getslot_long(ptl, destidx,
+							     is_reply)) !=
+					NULL);
+
+			bulkidx = bulkpkt->idx;
+			bulkpkt->len = len;
+			_HFI_VDBG("bulkpkt %p flag is %d from idx %d\n",
+				  bulkpkt, bulkpkt->flag, destidx);
+
+			/* overflow args beyond NSHORT_ARGS travel in the bulkpkt */
+			for (i = 0; i < nargs - NSHORT_ARGS; i++) {
+				bulkpkt->args[i] = args[i + NSHORT_ARGS];
+			}
+
+			amsh_shm_copy_short((void *)bulkpkt->payload, src,
+					    (uint32_t) len);
+			QMARKREADY(bulkpkt);
+		}
+		am_send_pkt_short(ptl, destidx, returnidx, bulkidx, type,
+				  nargs, hidx, args, src, len, is_reply);
+		break;
+
+	case AMREQUEST_LONG:
+	case AMREPLY_LONG:
+		{
+			uint32_t bytes_left = len;
+			uint8_t *src_this = (uint8_t *) src;
+			uint8_t *dst_this = (uint8_t *) dst;
+			uint32_t bytes_this;
+
+			type = AMFMT_LONG;
+
+			_HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n",
+				  is_reply ? "rep" : "req", src, dst,
+				  (uint32_t) len, hidx);
+			while (bytes_left) {
+				bytes_this = min(bytes_left, AMLONG_MTU);
+				AMSH_POLL_UNTIL(ptl, is_reply,
+						(bulkpkt =
+						 am_ctl_getslot_long(ptl,
+								     destidx,
+								     is_reply))
+						!= NULL);
+				bytes_left -= bytes_this;
+				/* the final chunk tells the receiver the
+				 * transfer is complete */
+				if (bytes_left == 0)
+					type = AMFMT_LONG_END;
+				bulkidx = bulkpkt->idx;
+				amsh_shm_copy_long((void *)bulkpkt->payload,
+						   src_this, bytes_this);
+
+				bulkpkt->dest = (uintptr_t) dst;
+				bulkpkt->dest_off =
+				    (uint32_t) ((uintptr_t) dst_this -
+						(uintptr_t) dst);
+				bulkpkt->len = bytes_this;
+				QMARKREADY(bulkpkt);
+				am_send_pkt_short(ptl, destidx, returnidx,
+						  bulkidx, type, nargs, hidx,
+						  args, NULL, 0, is_reply);
+				src_this += bytes_this;
+				dst_this += bytes_this;
+			}
+			break;
+		}
+	default:
+		break;
+	}
+	return 1;
+}
+
+/* A generic version that's not inlined */
+/* Out-of-line wrapper around psmi_amsh_generic_inner() so callers that do not
+ * want the always-inlined body (e.g. the deferred-request drain loop) can call
+ * through a single non-inlined entry point. Forwards all arguments verbatim. */
+int
+psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dst, int flags)
+{
+ return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args,
+ nargs, src, len, dst, flags);
+}
+
+/* Issue a short AM request (AMREQUEST_SHORT) over shared memory.
+ * No destination buffer is involved for short messages, so dst is NULL. */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags)
+{
+ return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler,
+ args, nargs, src, len, NULL, flags);
+}
+
+/* Issue a long AM request (AMREQUEST_LONG): the payload at src is delivered
+ * into the remote buffer dest, fragmented by the inner path as needed. */
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags)
+{
+ return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler,
+ args, nargs, src, len, dest, flags);
+}
+
+/* Send a short AM reply from within a handler. The destination endpoint is
+ * taken from the incoming token (tok->tok.epaddr_incoming), not passed in. */
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags)
+{
+ psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming,
+ handler, args, nargs, src, len, NULL, flags);
+ return;
+}
+
+/* Send a long AM reply from within a handler; payload src is copied into the
+ * remote buffer dest. Endpoint comes from the incoming token. */
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags)
+{
+ psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming,
+ handler, args, nargs, src, len, dest, flags);
+ return;
+}
+
+/* Reset the deferred-request FIFO to empty. lastp always points at the
+ * location where the next node's address will be stored (tail-pointer idiom),
+ * which is the head slot itself when the list is empty. */
+void psmi_am_reqq_init(ptl_t *ptl)
+{
+ ptl->psmi_am_reqq_fifo.first = NULL;
+ ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+}
+
+/* Replay every deferred AM request queued by psmi_am_reqq_add().
+ * Returns PSM2_OK if at least one request was processed, otherwise
+ * PSM2_OK_NO_PROGRESS. Frees each request node (and its temporary source
+ * copy, when AM_FLAG_SRC_TEMP is set) after dispatch. */
+psm2_error_t psmi_am_reqq_drain(ptl_t *ptl)
+{
+ am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first;
+ am_reqq_t *req;
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+
+ /* We're going to process the entire list, and running the generic handler
+ * below can cause other requests to be enqueued in the queue that we're
+ * processing. */
+ ptl->psmi_am_reqq_fifo.first = NULL;
+ ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+
+ while ((req = reqn) != NULL) {
+ err = PSM2_OK;
+ reqn = req->next;
+ _HFI_VDBG
+ ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n",
+ req, psmi_epaddr_get_hostname(req->epaddr->epid),
+ (void *)(uintptr_t) req->args[1].u64w0,
+ (void *)(uintptr_t) req->args[0].u64w0);
+ psmi_amsh_generic(req->amtype, req->ptl, req->epaddr,
+ req->handler, req->args, req->nargs, req->src,
+ req->len, req->dest, req->amflags);
+ if (req->flags & AM_FLAG_SRC_TEMP)
+ psmi_free(req->src);
+ psmi_free(req);
+ }
+ return err;
+}
+
+/* Enqueue an AM request for later replay by psmi_am_reqq_drain().
+ * Copies the nargs arguments (at most 8) into the node; appends the node at
+ * the FIFO tail via the lastp tail-pointer. Allocation failure is fatal
+ * (psmi_assert_always), not reported to the caller. */
+void
+psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, void *dest, int amflags)
+{
+ int i;
+ int flags = 0;
+ am_reqq_t *nreq =
+ (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t));
+ psmi_assert_always(nreq != NULL);
+ _HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, "
+ "localreq=%p, remotereq=%p\n", nreq,
+ psmi_epaddr_get_hostname(epaddr->epid), dest,
+ (int)len, (void *)(uintptr_t) args[1].u64w0,
+ (void *)(uintptr_t) args[0].u64w0);
+
+ psmi_assert(nargs <= 8);
+ nreq->next = NULL;
+ nreq->amtype = amtype;
+ nreq->ptl = ptl;
+ nreq->epaddr = epaddr;
+ nreq->handler = handler;
+ for (i = 0; i < nargs; i++)
+ nreq->args[i] = args[i];
+ nreq->nargs = nargs;
+ if (AM_IS_LONG(amtype) && src != NULL &&
+ len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) {
+ /* NOTE(review): abort() here makes the AM_FLAG_SRC_TEMP
+ * copy path below unreachable dead code — presumably a
+ * deliberate trap for an unsupported synchronous-source
+ * long enqueue, but confirm against upstream intent. */
+ abort();
+ flags |= AM_FLAG_SRC_TEMP;
+ nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len);
+ psmi_assert_always(nreq->src != NULL); /* XXX mem */
+ amsh_shm_copy_short(nreq->src, src, len);
+ } else
+ nreq->src = src;
+ nreq->len = len;
+ nreq->dest = dest;
+ nreq->amflags = amflags;
+ nreq->flags = flags;
+
+ nreq->next = NULL;
+ *(ptl->psmi_am_reqq_fifo.lastp) = nreq;
+ ptl->psmi_am_reqq_fifo.lastp = &nreq->next;
+}
+
+/* Dispatch one incoming shared-memory AM packet to its registered handler.
+ * isreq selects the request vs. reply bulk FIFO when the payload lives in a
+ * bulk slot. AMFMT_SHORT_INLINE payloads ride inside the args area of the
+ * short packet itself; AMFMT_SHORT/AMFMT_LONG[_END] dereference a bulk
+ * packet located by bulkidx in this process's own queue directory. */
+static
+void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq)
+{
+ amsh_am_token_t tok;
+ psmi_handler_fn_t fn;
+ psm2_amarg_t *args = pkt->args;
+ uint16_t shmidx = pkt->shmidx;
+ int nargs = pkt->nargs;
+
+ /* shmidx == (uint16_t)-1 means the sender is not yet known to us
+ * (pre-connection traffic), so no epaddr is attached to the token. */
+ tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0);
+ tok.ptl = ptl;
+ tok.mq = ptl->ep->mq;
+ tok.shmidx = shmidx;
+
+ uint16_t hidx = (uint16_t) pkt->handleridx;
+ uint32_t bulkidx = pkt->bulkidx;
+ uintptr_t bulkptr;
+ am_pkt_bulk_t *bulkpkt;
+
+ fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn;
+ psmi_assert(fn != NULL);
+ psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase);
+
+ if (pkt->type == AMFMT_SHORT_INLINE) {
+ _HFI_VDBG
+ ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n",
+ isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt,
+ hidx);
+
+ /* Inline payload immediately follows the args in the packet. */
+ fn(&tok, args, nargs, pkt->length > 0 ?
+ (void *)&args[nargs] : NULL, pkt->length);
+ } else {
+ int isend = 0;
+ switch (pkt->type) {
+ case AMFMT_LONG_END:
+ isend = 1;
+ /* fallthrough: LONG_END shares the bulk-slot lookup */
+ case AMFMT_LONG:
+ case AMFMT_SHORT:
+ if (isreq) {
+ bulkptr =
+ (uintptr_t) ptl->self_nodeinfo->qdir.
+ qreqFifoLong;
+ bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong;
+ } else {
+ bulkptr =
+ (uintptr_t) ptl->self_nodeinfo->qdir.
+ qrepFifoLong;
+ bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong;
+ }
+ break;
+ default:
+ bulkptr = 0;
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled packet type 0x%x",
+ pkt->type);
+ return;
+ }
+
+ bulkpkt = (am_pkt_bulk_t *) bulkptr;
+ _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d "
+ "from_idx=%d pkt=%p/%p hidx=%d\n",
+ ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag,
+ bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx);
+ psmi_assert(bulkpkt->flag == QREADY);
+
+ if (nargs > NSHORT_ARGS || isend == 1) {
+ /* Either there are more args in the bulkpkt, or this is the last
+ packet of a long payload. In either case, copy the args. */
+ int i;
+ args =
+ alloca((NSHORT_ARGS +
+ NBULK_ARGS) * sizeof(psm2_amarg_t));
+
+ for (i = 0; i < NSHORT_ARGS; i++) {
+ args[i] = pkt->args[i];
+ }
+
+ for (; i < nargs; i++) {
+ args[i] = bulkpkt->args[i - NSHORT_ARGS];
+ }
+ }
+
+ if (pkt->type == AMFMT_SHORT) {
+ fn(&tok, args, nargs,
+ (void *)bulkpkt->payload, bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ } else {
+ /* Long fragment: copy payload to its final destination
+ * at dest + dest_off, then free the bulk slot. */
+ amsh_shm_copy_long((void *)(bulkpkt->dest +
+ bulkpkt->dest_off),
+ bulkpkt->payload, bulkpkt->len);
+
+ /* If this is the last packet, copy args before running the
+ * handler */
+ if (isend) {
+ void *dest = (void *)bulkpkt->dest;
+ size_t len =
+ (size_t) (bulkpkt->dest_off + bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ fn(&tok, args, nargs, dest, len);
+ } else
+ QMARKFREE(bulkpkt);
+ }
+ }
+ return;
+}
+
+/* Start a shared-memory rendezvous send: transmit an RTS (MQ_MSG_LONGRTS)
+ * carrying length, tag, the local request pointer and the source buffer
+ * address. The receiver pulls the data (e.g. via CMA) and completes the
+ * request. With PSM_CUDA, a cudaIpcMemHandle for a GPU source buffer is
+ * sent as the RTS payload instead. */
+static
+psm2_error_t
+amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req,
+ psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf,
+ uint32_t len)
+{
+ psm2_amarg_t args[5];
+ psm2_error_t err = PSM2_OK;
+
+ args[0].u32w0 = MQ_MSG_LONGRTS;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ args[3].u64w0 = (uint64_t) (uintptr_t) req;
+ args[4].u64w0 = (uint64_t) (uintptr_t) buf;
+
+ psmi_assert(req != NULL);
+ req->type = MQE_TYPE_SEND;
+ req->buf = (void *)buf;
+ req->buf_len = len;
+ req->send_msglen = len;
+ req->send_msgoff = 0;
+
+#ifdef PSM_CUDA
+ /* If the send buffer is on gpu, we create a cuda IPC
+ * handle and send it as payload in the RTS
+ */
+ if (req->is_buf_gpu_mem) {
+ PSMI_CUDA_CALL(cudaIpcGetMemHandle,
+ (cudaIpcMemHandle_t *) &req->cuda_ipc_handle,
+ (void*) buf);
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, (void*)&req->cuda_ipc_handle,
+ sizeof(cudaIpcMemHandle_t), 0);
+ req->cuda_ipc_handle_attached = 1;
+ } else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, 0);
+#else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, 0);
+#endif
+
+ return err;
+}
+
+/*
+ * All shared am mq sends, req can be NULL
+ */
+/* Core shared-memory MQ send path, shared by blocking (req == NULL) and
+ * non-blocking sends. Protocol selection: tiny/short for len <= AMLONG_MTU
+ * (and no flags), eager fragmentation up to mq->shm_thresh_rv, rendezvous
+ * beyond that or when PSM2_MQ_FLAG_SENDSYNC is set. Eager completions are
+ * finalized here; rendezvous completion is deferred to the protocol. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf,
+ uint32_t len))
+{
+ psm2_amarg_t args[3];
+ psm2_error_t err = PSM2_OK;
+ int is_blocking = (req == NULL);
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ /* All sends from a gpu buffer use the rendezvous protocol */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ /* NOTE(review): this inner check can never be true — the
+ * enclosing condition already requires PSMI_IS_CUDA_ENABLED.
+ * Dead code; confirm against upstream before removing. */
+ if (!PSMI_IS_CUDA_ENABLED)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Please enable PSM CUDA support when using GPU buffer \n");
+ gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ gpu_mem = 0;
+#endif
+
+ if (!flags && len <= AMLONG_MTU) {
+ if (len <= 32)
+ args[0].u32w0 = MQ_MSG_TINY;
+ else
+ args[0].u32w0 = MQ_MSG_SHORT;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, ubuf, len, 0);
+ } else if (flags & PSM2_MQ_FLAG_SENDSYNC)
+ goto do_rendezvous;
+ else if (len <= mq->shm_thresh_rv) {
+ /* Eager: first fragment announces total length; subsequent
+ * fragments go to the data handler with args[2].u32w0 as the
+ * running byte offset. */
+ uint32_t bytes_left = len;
+ uint32_t bytes_this = min(bytes_left, AMLONG_MTU);
+ uint8_t *buf = (uint8_t *) ubuf;
+ args[0].u32w0 = MQ_MSG_EAGER;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, buf,
+ bytes_this, 0);
+ bytes_left -= bytes_this;
+ buf += bytes_this;
+ args[2].u32w0 = 0;
+ while (bytes_left) {
+ args[2].u32w0 += bytes_this;
+ bytes_this = min(bytes_left, AMLONG_MTU);
+ /* Here we kind of bend the rules, and assume that shared-memory
+ * active messages are delivered in order */
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_data_hidx, args,
+ 3, buf, bytes_this, 0);
+ buf += bytes_this;
+ bytes_left -= bytes_this;
+ }
+ } else {
+do_rendezvous:
+ if (is_blocking) {
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+ req->send_msglen = len;
+ req->tag = *tag;
+
+ /* Since SEND command is blocking, this request is
+ * entirely internal and we will not be exposed to user.
+ * Setting as internal so it will not be added to
+ * mq->completed_q */
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+ }
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (gpu_mem) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ err =
+ amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag,
+ ubuf, len);
+
+ if (err == PSM2_OK && is_blocking) { /* wait... */
+ err = psmi_mq_wait_internal(&req);
+ }
+ return err; /* skip eager accounting below */
+ }
+
+ /* All eager async sends are always "all done" */
+ if (req != NULL) {
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ }
+
+ mq->stats.tx_num++;
+ mq->stats.tx_shm_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/* Non-blocking MQ send entry point (ptl_ctl mq_isend hook): allocates a
+ * request, runs the common send path, and hands the request back to the
+ * caller via req_o. Returns PSM2_NO_MEMORY only on request allocation
+ * failure; send-path errors are not propagated here. */
+static
+psm2_error_t
+amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context,
+ psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->context = context;
+
+ _HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+ psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2]);
+
+ amsh_mq_send_inner(mq, req, epaddr, flags, tag, ubuf, len);
+
+ *req_o = req;
+ return PSM2_OK;
+}
+
+/* Blocking MQ send entry point (ptl_ctl mq_send hook): passes req == NULL so
+ * the inner path allocates an internal request and waits for completion. */
+static
+psm2_error_t
+amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+ _HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+ psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+ tag->tag[0], tag->tag[1], tag->tag[2]);
+
+ amsh_mq_send_inner(mq, NULL, epaddr, flags, tag, ubuf, len);
+
+ return PSM2_OK;
+}
+
+/* kassist-related handling */
+/* Return the OS pid of the peer process behind epaddr, looked up through the
+ * peer's shmidx slot in the local AM endpoint directory (used by kassist/CMA). */
+int psmi_epaddr_pid(psm2_epaddr_t epaddr)
+{
+ uint16_t shmidx = ((am_epaddr_t *) epaddr)->_shmidx;
+ return epaddr->ptlctl->ptl->am_ep[shmidx].pid;
+}
+#if _HFI_DEBUGGING
+/* Debug-only helper: human-readable name for a PSMI_KASSIST_* mode value. */
+static
+const char *psmi_kassist_getmode(int mode)
+{
+ switch (mode) {
+ case PSMI_KASSIST_OFF:
+ return "kassist off";
+ case PSMI_KASSIST_CMA_GET:
+ return "cma get";
+ case PSMI_KASSIST_CMA_PUT:
+ return "cma put";
+ default:
+ return "unknown";
+ }
+}
+#endif
+
+/* Determine the kernel-assist (CMA) mode to use. CUDA builds are pinned to
+ * CMA_GET; otherwise PSM2_KASSIST_MODE from the environment selects cma-put,
+ * cma-get, or off, defaulting to cma-get.
+ * NOTE(review): empty parameter list `()` — `(void)` would be the strictly
+ * correct C prototype; confirm against the file's prevailing style. */
+static
+int psmi_get_kassist_mode()
+{
+ int mode = PSMI_KASSIST_MODE_DEFAULT;
+ /* Cuda PSM only supports KASSIST_CMA_GET */
+#ifdef PSM_CUDA
+ mode = PSMI_KASSIST_CMA_GET;
+#else
+ union psmi_envvar_val env_kassist;
+
+ if (!psmi_getenv("PSM2_KASSIST_MODE",
+ "PSM Shared memory kernel assist mode "
+ "(cma-put, cma-get, none)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)
+ PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) {
+ char *s = env_kassist.e_str;
+ if (strcasecmp(s, "cma-put") == 0)
+ mode = PSMI_KASSIST_CMA_PUT;
+ else if (strcasecmp(s, "cma-get") == 0)
+ mode = PSMI_KASSIST_CMA_GET;
+ else
+ mode = PSMI_KASSIST_OFF;
+ } else {
+ /* cma-get is the fastest, so it's the default.
+ Availability of CMA is checked in psmi_shm_create();
+ if CMA is not available it falls back to 'none' there. */
+ mode = PSMI_KASSIST_CMA_GET;
+ }
+#endif
+ return mode;
+}
+
+/* Connection handling for shared memory AM.
+ *
+ * arg0 => conn_op, result (PSM error type)
+ * arg1 => epid (always)
+ * arg2 => version.
+ * arg3 => pointer to error for replies.
+ */
+/* AM handler for shared-memory connection management. Processes the four
+ * PSMI_AM_{CONN,DISC}_{REQ,REP} operations encoded in args[0].u32w0 (see the
+ * arg layout comment above): CONN_REQ maps the remote segment and replies
+ * in-place by rewriting args; CONN_REP/DISC_REP complete a locally-initiated
+ * (dis)connect, writing the result through the perr pointer the initiator
+ * embedded in args[3]; DISC_REQ replies and may unmap the peer's segment. */
+static
+void
+amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ int op = args[0].u32w0;
+ int phase = args[0].u32w1;
+ psm2_epid_t epid = args[1].u64w0;
+ int16_t return_shmidx = args[2].u16w0;
+ psm2_error_t err = (psm2_error_t) args[2].u32w1;
+ psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0;
+
+ psm2_epaddr_t epaddr;
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+ uint16_t shmidx = tok->shmidx;
+ int is_valid;
+ ptl_t *ptl = tok->ptl;
+ int cstate;
+
+ /* We do this because it's an assumption below */
+ psmi_assert_always(buf == NULL && len == 0);
+
+ _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n",
+ op, phase, (unsigned long long)epid, err);
+ switch (op) {
+ case PSMI_AM_CONN_REQ:
+ _HFI_VDBG("Connect from %d:%d\n",
+ (int)psm2_epid_nid(epid), (int)psm2_epid_context(epid));
+ epaddr = psmi_epid_lookup(ptl->ep, epid);
+ if (shmidx == (uint16_t)-1) {
+ /* incoming packet will never be from our shmidx slot 0
+ thus the other process doesn't know our return info.
+ attach_to will lookup or create the proper shmidx */
+ if ((err = psmi_shm_map_remote(ptl, epid, &shmidx))) {
+ psmi_handle_error(PSMI_EP_NORETURN, err,
+ "Fatal error in "
+ "connecting to shm segment");
+ }
+ am_update_directory(&ptl->am_ep[shmidx]);
+ tok->shmidx = shmidx;
+ }
+
+ if (epaddr == NULL) {
+ /* args points into the shared segment, which
+ * amsh_epaddr_add() may remap; recompute the pointer
+ * from the segment-relative offset afterwards. */
+ uintptr_t args_segoff =
+ (uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase;
+ if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr)))
+ /* Unfortunately, no way out of here yet */
+ psmi_handle_error(PSMI_EP_NORETURN, err,
+ "Fatal error "
+ "in connecting to shm segment");
+ args =
+ (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase +
+ args_segoff);
+ }
+
+ /* Rewrite args */
+ ptl->connect_incoming++;
+ args[0].u32w0 = PSMI_AM_CONN_REP;
+ args[1].u64w0 = (psm2_epid_t) ptl->epid;
+ /* and return our shmidx for the connecting process */
+ args[2].u16w0 = shmidx;
+ args[2].u32w1 = PSM2_OK;
+ AMSH_CSTATE_INCOMING_SET((am_epaddr_t *) epaddr, ESTABLISHED);
+ ((am_epaddr_t *)epaddr)->_return_shmidx = return_shmidx;
+ tok->tok.epaddr_incoming = epaddr; /* adjust token */
+ psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+ args, narg, NULL, 0, 0);
+ break;
+
+ case PSMI_AM_CONN_REP:
+ if (ptl->connect_phase != phase) {
+ _HFI_VDBG("Out of phase connect reply\n");
+ return;
+ }
+ epaddr = ptl->am_ep[shmidx].epaddr;
+ /* check if a race has occurred on shm-file reuse.
+ * if so, don't transition to the next state.
+ * the next call to connreq_poll() will restart the
+ * connection.
+ */
+ if (ptl->am_ep[shmidx].pid !=
+ ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid)
+ break;
+
+ *perr = err;
+ AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr, REPLIED);
+ ((am_epaddr_t *)epaddr)->_return_shmidx = return_shmidx;
+ ptl->connect_outgoing++;
+ _HFI_VDBG("CCC epaddr=%s connected to ptl=%p\n",
+ psmi_epaddr_get_name(epaddr->epid), ptl);
+ break;
+
+ case PSMI_AM_DISC_REQ:
+ epaddr = psmi_epid_lookup(ptl->ep, epid);
+ if (!epaddr) {
+ _HFI_VDBG("Dropping disconnect request from an epid that we are not connected to\n");
+ return;
+ }
+ args[0].u32w0 = PSMI_AM_DISC_REP;
+ args[2].u32w1 = PSM2_OK;
+ AMSH_CSTATE_INCOMING_SET((am_epaddr_t *) epaddr, DISC_REQUESTED);
+ ptl->connect_incoming--;
+ /* Before sending the reply, make sure the process
+ * is still connected */
+
+ if (ptl->am_ep[shmidx].epid != epaddr->epid)
+ is_valid = 0;
+ else
+ is_valid = 1;
+
+ if (is_valid) {
+ psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+ args, narg, NULL, 0, 0);
+ /**
+ * Only munmap if we have nothing more to
+ * communicate with the other node, i.e. we are
+ * already disconnected with the other node
+ * or have sent a disconnect request.
+ */
+ cstate = AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr);
+ if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) {
+ err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+ psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+ }
+ }
+ break;
+
+ case PSMI_AM_DISC_REP:
+ if (ptl->connect_phase != phase) {
+ _HFI_VDBG("Out of phase disconnect reply\n");
+ return;
+ }
+ *perr = err;
+ epaddr = tok->tok.epaddr_incoming;
+ AMSH_CSTATE_OUTGOING_SET((am_epaddr_t *) epaddr, DISC_REPLIED);
+ ptl->connect_outgoing--;
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled connect handler op=%d",
+ op);
+ break;
+ }
+ return;
+}
+
+/* ptl_ctl_init hook: size the caller must allocate for this PTL's state. */
+static
+size_t amsh_sizeof(void)
+{
+ return sizeof(ptl_t);
+}
+
+/* Fill in AM capabilities parameters */
+/* Report the AM capability limits of the shared-memory transport.
+ * Returns PSM2_PARAM_ERR if parameters is NULL, PSM2_OK otherwise.
+ * Both request and reply payloads are capped at AMLONG_MTU. */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+ if (parameters == NULL) {
+ return PSM2_PARAM_ERR;
+ }
+
+ parameters->max_handlers = PSMI_AM_NUM_HANDLERS;
+ parameters->max_nargs = PSMI_AM_MAX_ARGS;
+ parameters->max_request_short = AMLONG_MTU;
+ parameters->max_reply_short = AMLONG_MTU;
+
+ return PSM2_OK;
+}
+
+/**
+ * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid.
+ * @param ptl Pointer to caller-allocated space for PTL (fill in)
+ * @param ctl Pointer to caller-allocated space for PTL-control
+ * structure (fill in)
+ */
+/* Initialize the shared-memory PTL: set up back pointers and counters,
+ * allocate the peer directory (am_ep), create/attach the shared segment,
+ * publish this process's nodeinfo (pid/epid/epaddr, is_init after a memory
+ * barrier), and fill in the ptl_ctl dispatch table. With PSM_CUDA, also
+ * configures the cuda IPC memhandle cache from environment variables. */
+static
+psm2_error_t
+amsh_init(psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* Preconditions */
+ psmi_assert_always(ep != NULL);
+ psmi_assert_always(ep->epaddr != NULL);
+ psmi_assert_always(ep->epid != 0);
+
+ ptl->ep = ep; /* back pointer */
+ ptl->epid = ep->epid; /* cache epid */
+ ptl->epaddr = ep->epaddr; /* cache a copy */
+ ptl->ctl = ctl;
+ ptl->zero_polls = 0;
+
+ ptl->connect_phase = 0;
+ ptl->connect_incoming = 0;
+ ptl->connect_outgoing = 0;
+
+ memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt));
+ memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo));
+
+ ptl->max_ep_idx = -1;
+ ptl->am_ep_size = AMSH_DIRBLOCK_SIZE;
+
+ ptl->am_ep = (struct am_ctl_nodeinfo *)
+ psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+ ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+ if (ptl->am_ep == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+ if ((err = amsh_init_segment(ptl)))
+ goto fail;
+
+ ptl->self_nodeinfo->psm_verno = PSMI_VERNO;
+ /* Advertise CMA only if it is actually usable; otherwise disable
+ * kassist and use the higher no-kassist rendezvous threshold. */
+ if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) {
+ if (cma_available()) {
+ ptl->self_nodeinfo->amsh_features |=
+ AMSH_HAVE_CMA;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_CMA;
+ } else {
+ ptl->psmi_kassist_mode =
+ PSMI_KASSIST_OFF;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ }
+ } else {
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ }
+ ptl->self_nodeinfo->pid = getpid();
+ ptl->self_nodeinfo->epid = ep->epid;
+ ptl->self_nodeinfo->epaddr = ep->epaddr;
+
+ /* Ensure all nodeinfo stores are visible before is_init is set,
+ * since peers poll is_init to detect readiness. */
+ ips_mb();
+ ptl->self_nodeinfo->is_init = 1;
+
+ psmi_am_reqq_init(ptl);
+ memset(ctl, 0, sizeof(*ctl));
+
+ /* Fill in the control structure */
+ ctl->ep = ep;
+ ctl->ptl = ptl;
+ ctl->ep_poll = amsh_poll;
+ ctl->ep_connect = amsh_ep_connect;
+ ctl->ep_disconnect = amsh_ep_disconnect;
+
+ ctl->mq_send = amsh_mq_send;
+ ctl->mq_isend = amsh_mq_isend;
+
+ ctl->am_get_parameters = psmi_amsh_am_get_parameters;
+ ctl->am_short_request = psmi_amsh_am_short_request;
+ ctl->am_short_reply = psmi_amsh_am_short_reply;
+
+ /* No stats in shm (for now...) */
+ ctl->epaddr_stats_num = NULL;
+ ctl->epaddr_stats_init = NULL;
+ ctl->epaddr_stats_get = NULL;
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_memcache_enabled;
+ psmi_getenv("PSM2_CUDA_MEMCACHE_ENABLED",
+ "PSM cuda ipc memhandle cache enabled (default is enabled)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)
+ 1, &env_memcache_enabled);
+ if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) {
+ union psmi_envvar_val env_memcache_size;
+ psmi_getenv("PSM2_CUDA_MEMCACHE_SIZE",
+ "Size of the cuda ipc memhandle cache ",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)
+ CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size);
+ if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint)
+ != PSM2_OK))
+ goto fail;
+ if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK))
+ goto fail;
+ }
+#endif
+fail:
+ return err;
+}
+
+/* Tear down the shared-memory PTL. Two-pass epid iteration: first count
+ * still-established outgoing connections, then collect them into arrays and
+ * disconnect in bulk via amsh_ep_disconnect(). Afterwards, poll until both
+ * connect counters drain or timeout_ns expires, detach the shared segment,
+ * and point the recv-queue heads at a local empty packet so late polls do
+ * not touch unmapped memory. */
+static psm2_error_t amsh_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+ struct psmi_eptab_iterator itor;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ psm2_error_t err_seg;
+ uint64_t t_start = get_cycles();
+ int i = 0;
+
+ /* Close whatever has been left open -- this will be factored out for 2.1 */
+ if (ptl->connect_outgoing > 0) {
+ int num_disc = 0;
+ int *mask;
+ psm2_error_t *errs;
+ psm2_epaddr_t *epaddr_array;
+
+ psmi_epid_itor_init(&itor, ptl->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl != ptl)
+ continue;
+ if (AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr) ==
+ AMSH_CSTATE_OUTGOING_ESTABLISHED)
+ num_disc++;
+ }
+ psmi_epid_itor_fini(&itor);
+
+ mask =
+ (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(int));
+ errs = (psm2_error_t *)
+ psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(psm2_error_t));
+ epaddr_array = (psm2_epaddr_t *)
+ psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+ sizeof(psm2_epaddr_t));
+
+ if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+ if (epaddr_array)
+ psmi_free(epaddr_array);
+ if (errs)
+ psmi_free(errs);
+ if (mask)
+ psmi_free(mask);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ psmi_epid_itor_init(&itor, ptl->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl == ptl) {
+ if (AMSH_CSTATE_OUTGOING_GET((am_epaddr_t *) epaddr)
+ == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+ mask[i] = 1;
+ epaddr_array[i] = epaddr;
+ i++;
+ }
+ }
+ }
+ psmi_epid_itor_fini(&itor);
+ psmi_assert(i == num_disc && num_disc > 0);
+ err = amsh_ep_disconnect(ptl, force, num_disc, epaddr_array,
+ mask, errs, timeout_ns);
+ psmi_free(mask);
+ psmi_free(errs);
+ psmi_free(epaddr_array);
+ }
+
+ if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+ while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+ if (!psmi_cycles_left(t_start, timeout_ns)) {
+ err = PSM2_TIMEOUT;
+ _HFI_VDBG("CCC timed out with from=%d,to=%d\n",
+ ptl->connect_incoming, ptl->connect_outgoing);
+ break;
+ }
+ psmi_poll_internal(ptl->ep, 1);
+ }
+ } else
+ _HFI_VDBG("CCC complete disconnect from=%d,to=%d\n",
+ ptl->connect_incoming, ptl->connect_outgoing);
+
+ if ((err_seg = psmi_shm_detach(ptl))) {
+ err = err_seg;
+ goto fail;
+ }
+
+ /* This prevents poll calls between now and the point where the endpoint is
+ * deallocated to reference memory that disappeared */
+ ptl->repH.head = &ptl->amsh_empty_shortpkt;
+ ptl->reqH.head = &ptl->amsh_empty_shortpkt;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ am_cuda_memhandle_cache_map_fini();
+#endif
+ return PSM2_OK;
+fail:
+ return err;
+
+}
+
+/* ptl_ctl setopt hook: the shm PTL exposes no options, so any optname is
+ * rejected with PSM2_PARAM_ERR. */
+static
+psm2_error_t
+amsh_setopt(const void *component_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ /* No options for AM PTL at the moment */
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown AM ptl option %u.", optname);
+}
+
+/* ptl_ctl getopt hook: mirror of amsh_setopt — no options are defined. */
+static
+psm2_error_t
+amsh_getopt(const void *component_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ /* No options for AM PTL at the moment */
+ return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown AM ptl option %u.", optname);
+}
+
+/* Only symbol we expose out of here */
+/* PTL registration record: sizeof/init/fini/setopt/getopt entry points,
+ * in the order declared by struct ptl_ctl_init. */
+struct ptl_ctl_init
+psmi_ptl_amsh = {
+ amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt
+};
diff --git a/ptl_am/cmarw.h b/ptl_am/cmarw.h
new file mode 100644
index 0000000..0317ed4
--- /dev/null
+++ b/ptl_am/cmarw.h
@@ -0,0 +1,73 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+
+/*
+ * read from remote process pid
+ */
+int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n);
+
+/*
+ * write to remote process pid
+ */
+int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n);
+
+/*
+ * Test if CMA is available by trying a no-op call.
+ * Returns 1 if CMA is present, 0 if not.
+ */
+int cma_available(void);
diff --git a/ptl_am/cmarwu.c b/ptl_am/cmarwu.c
new file mode 100644
index 0000000..a9a1d83
--- /dev/null
+++ b/ptl_am/cmarwu.c
@@ -0,0 +1,207 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "cmarw.h"
+
+/* An iovec looks like this:
+ * struct iovec {
+ * void *iov_base; // Starting address
+ * size_t iov_len; // Number of bytes to transfer
+ * };
+ */
+
+#if 0
+#define __NR_process_vm_readv 310
+#define __NR_process_vm_writev 311
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+ syscall(__NR_process_vm_readv, \
+ pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+ syscall(__NR_process_vm_writev, \
+ pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+#endif
+
/*CMA syscall wrappers were added in glibc 2.15. For anything older than that,
  we need to define our own wrappers. Apparently older (and maybe newer?)
  (2.12 from RHEL6.3 definitely has this bug) glibcs only pass up to 5
  arguments via the generic syscall() function. These CMA functions, however,
  have 6 arguments. So for now, we hack our way around it by generating ASM
  code for doing a syscall directly.
*/

#if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15))

#ifdef __x86_64__

/* x86_64-specific syscall numbers for the CMA syscalls (Linux >= 3.2). */
#define __NR_process_vm_readv 310
#define __NR_process_vm_writev 311

/* Issue a raw 6-argument syscall using the x86_64 kernel calling
 * convention: number in rax, arguments in rdi, rsi, rdx, r10, r8, r9.
 *
 * NOTE(review): this returns the raw kernel result, i.e. a negative errno
 * value on failure, whereas the glibc wrappers return -1 and set errno.
 * Callers in this file only test for -1, so on pre-2.15 glibc other error
 * codes would slip through -- TODO confirm and align error handling. */
static inline ssize_t __x86_64_syscall6(int syscall,
					pid_t pid,
					const struct iovec *local_iov,
					unsigned long liovcnt,
					const struct iovec *remote_iov,
					unsigned long riovcnt,
					unsigned long flags)
{
	/*GCC inline ASM is annoying -- can't specify all the x86_64 registers
	   directly, so declare register-specific variables and use them. */
	register int64_t rax asm("rax") = syscall;
	register int64_t rdi asm("rdi") = pid;
	register int64_t rsi asm("rsi") = (intptr_t) local_iov;
	register int64_t rdx asm("rdx") = liovcnt;
	register int64_t r10 asm("r10") = (intptr_t) remote_iov;
	register int64_t r8 asm("r8") = riovcnt;
	register int64_t r9 asm("r9") = flags;

	/* The syscall instruction itself clobbers rcx and r11, hence the
	 * clobber list; "memory" because the kernel reads/writes the iovec
	 * target buffers behind the compiler's back. */
	asm volatile ("syscall\n" : "=a" (rax)
		      : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10),
		      "r"(r8), "r"(r9)
		      : "%rcx", "%r11", "cc", "memory");
	return rax;
}

#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
	__x86_64_syscall6(__NR_process_vm_readv, \
			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)

#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
	__x86_64_syscall6(__NR_process_vm_writev, \
			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)

#else /* ndef __x86_64__ */
#error "Can't compile CMA support for this architecture."
#endif /* __x86_64__ */
#endif /* __GLIBC__ < 2.15 */
+
/*
 * Read 'n' bytes from address 'src' in remote process 'pid' into local
 * buffer 'dst' using process_vm_readv() (Cross Memory Attach).
 *
 * Loops to cope with partial transfers. Returns the number of bytes read
 * (== n on full success), or -1 on syscall failure. A zero-byte transfer
 * (no forward progress) terminates the loop to avoid spinning forever and
 * returns the partial count -- the original code would loop indefinitely
 * in that case.
 */
int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n)
{
	int64_t sum = 0;
	struct iovec local = {
		.iov_base = dst,
		.iov_len = n
	};
	struct iovec remote = {
		.iov_base = (void *)src,
		.iov_len = n
	};

	while (sum != n) {
		ssize_t nr = process_vm_readv(pid, &local, 1, &remote, 1, 0);
		if (nr < 0)
			return -1;
		if (nr == 0)	/* no progress; bail out instead of spinning */
			break;
		sum += nr;
		/* Advance both iovecs past the transferred bytes.  Use
		 * char* arithmetic: arithmetic on void* is a GNU extension. */
		local.iov_base = (char *)local.iov_base + nr;
		local.iov_len -= nr;
		remote.iov_base = (char *)remote.iov_base + nr;
		remote.iov_len -= nr;
	}
	return sum;
}
+
/*
 * Write 'n' bytes from local buffer 'src' to address 'dst' in remote
 * process 'pid' using process_vm_writev() (Cross Memory Attach).
 *
 * Loops to cope with partial transfers. Returns the number of bytes
 * written (== n on full success), or -1 on syscall failure. A zero-byte
 * transfer (no forward progress) terminates the loop to avoid spinning
 * forever and returns the partial count -- the original code would loop
 * indefinitely in that case.
 */
int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n)
{
	int64_t sum = 0;
	struct iovec local = {
		.iov_base = (void *)src,
		.iov_len = n
	};
	struct iovec remote = {
		.iov_base = dst,
		.iov_len = n
	};

	while (sum != n) {
		ssize_t nr = process_vm_writev(pid, &local, 1, &remote, 1, 0);
		if (nr < 0)
			return -1;
		if (nr == 0)	/* no progress; bail out instead of spinning */
			break;
		sum += nr;
		/* Advance both iovecs past the transferred bytes.  Use
		 * char* arithmetic: arithmetic on void* is a GNU extension. */
		local.iov_base = (char *)local.iov_base + nr;
		local.iov_len -= nr;
		remote.iov_base = (char *)remote.iov_base + nr;
		remote.iov_len -= nr;
	}
	return sum;
}
+
/* Probe for CMA support with a zero-length process_vm_readv() on
 * ourselves.  A kernel that implements the syscall reports 0 bytes
 * transferred; anything else (e.g. -1/ENOSYS on kernels < 3.2) means
 * CMA is unusable.  Returns 1 when available, 0 otherwise. */
int cma_available(void)
{
	ssize_t probe = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0);

	return (probe == 0) ? 1 : 0;
}
diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000..a6ba9db
--- /dev/null
+++ b/ptl_am/psm_am_internal.h
@@ -0,0 +1,466 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef PSMI_AM_H
+#define PSMI_AM_H
+
+#include "../psm_am_internal.h"
+
/* Allocation granularity of the shm peer directory -- presumably entries
 * per directory block; confirm against the amsh directory code. */
#define AMSH_DIRBLOCK_SIZE 128

/* Per-peer address object for the shared-memory PTL. */
typedef
struct am_epaddr {
	/* must be the first field to be the same address */
	struct psm2_epaddr epaddr;
	/* PTL-private per-peer storage viewed as 16/32/64-bit slots; the
	 * _shmidx/_return_shmidx/_cstate macros below alias
	 * _ptladdr_u16[0..2]. */
	union {
		uint16_t _ptladdr_u16[4];
		uint32_t _ptladdr_u32[2];
		uint64_t _ptladdr_u64;
		uint8_t _ptladdr_data[0];
	};
} am_epaddr_t;

/* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining
   arguments are passed using space in am_pkt_bulk_t. One additional argument
   is added for passing the internal ptl_am handler index. */
#define NSHORT_ARGS 6
#define NBULK_ARGS (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1)
+
/* Token handed to AM handlers for messages received over shared memory;
 * wraps the generic psmi_am_token with shm-PTL specifics. */
typedef
struct amsh_am_token {
	struct psmi_am_token tok;

	ptl_t *ptl;	/**> What PTL was it received on */
	psm2_mq_t mq;	/**> What matched queue is this for ? */
	uint16_t shmidx;	/**> what shmidx sent this */
} amsh_am_token_t;

/* Signature of the internal ptl_am AM handlers (dispatched via the
 * *_hidx indices). */
typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs,
				   void *src, size_t len);

/* One entry of the internal handler dispatch table. */
typedef struct psmi_handlertab {
	psmi_handler_fn_t fn;
} psmi_handlertab_t;
+
/*
 * Can change the rendezvous threshold based on usage of cma (or not)
 */
#define PSMI_MQ_RV_THRESH_CMA 16000

/* If no kernel assisted copy is available this is the rendezvous threshold */
#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000

/* Shared-memory connection protocol opcodes. */
#define PSMI_AM_CONN_REQ 1
#define PSMI_AM_CONN_REP 2
#define PSMI_AM_DISC_REQ 3
#define PSMI_AM_DISC_REP 4

/* Kernel-assist (CMA) modes: in GET mode the receiver pulls with
 * process_vm_readv; in PUT mode the sender pushes with process_vm_writev. */
#define PSMI_KASSIST_OFF 0x0
#define PSMI_KASSIST_CMA_GET 0x1
#define PSMI_KASSIST_CMA_PUT 0x2

#define PSMI_KASSIST_CMA 0x3
#define PSMI_KASSIST_GET 0x1
#define PSMI_KASSIST_PUT 0x2
#define PSMI_KASSIST_MASK 0x3

#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET
#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get"

/* Resolve the OS pid behind a shared-memory epaddr (the CMA target). */
int psmi_epaddr_pid(psm2_epaddr_t epaddr);

/*
 * Eventually, we will allow users to register handlers as "don't reply", which
 * may save on some of the buffering requirements
 */
#define PSMI_HANDLER_NEEDS_REPLY(handler) 1
#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler))
+
+int psmi_amsh_poll(ptl_t *ptl, int replyonly);
+
+/* Shared memory AM, forward decls */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len);
+
+/* AM over shared memory (forward decls) */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+#define amsh_conn_handler_hidx 1
+#define mq_handler_hidx 2
+#define mq_handler_data_hidx 3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx 5
+#define am_handler_hidx 6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG 1
+#define AMREPLY_SHORT 2
+#define AMREPLY_LONG 3
+#define AM_IS_REPLY(x) ((x)&0x2)
+#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x) ((x)&0x1)
+#define AM_IS_SHORT(x) (!AM_IS_LONG(x))
+
+#define AM_FLAG_SRC_ASYNC 0x1
+#define AM_FLAG_SRC_TEMP 0x2
+
/*
 * Request Fifo.
 *
 * Deferred AM request, queued when it cannot be sent immediately and
 * drained later by psmi_am_reqq_drain().
 */
typedef
struct am_reqq {
	struct am_reqq *next;	/* singly-linked FIFO chaining */

	ptl_t *ptl;
	psm2_epaddr_t epaddr;
	int amtype;	/* AMREQUEST_SHORT / AMREQUEST_LONG (see above) */
	psm2_handler_t handler;
	psm2_amarg_t args[8];
	int nargs;
	uint32_t len;
	void *src;
	void *dest;
	int amflags;
	int flags;
} am_reqq_t;

/* FIFO head; lastp presumably points at the tail's next-pointer for O(1)
 * append -- confirm against the reqq implementation. */
struct am_reqq_fifo_t {
	am_reqq_t *first;
	am_reqq_t **lastp;
};

/* Flush all queued requests for this ptl. */
psm2_error_t psmi_am_reqq_drain(ptl_t *ptl);
/* Queue a request for later transmission. */
void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
		      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted * with amsh_ep_) and others are
+ * specific to the single shared memory context * (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
+ */
+
+#define QFREE 0
+#define QUSED 1
+#define QREADY 2
+#define QREADYMED 3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QREADY)
+#if defined(__x86_64__) || defined(__i386__)
+# define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compilerfence */
+#else
+# error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag) \
+ do { \
+ _QMARK_FLAG_FENCE(); \
+ (pkt_ptr)->flag = (_flag); \
+ } while (0)
+
+#define QMARKFREE(pkt_ptr) _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr) _QMARK_FLAG(pkt_ptr, QUSED)
+
+#define AMFMT_SYSTEM 1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT 3
+#define AMFMT_LONG 4
+#define AMFMT_LONG_END 5
+
+#define _shmidx _ptladdr_u16[0]
+#define _return_shmidx _ptladdr_u16[1]
+#define _cstate _ptladdr_u16[2]
+
+#define AMSH_CMASK_NONE 0
+#define AMSH_CMASK_PREREQ 1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE 3
+
+#define AMSH_CSTATE_OUTGOING_MASK 0x0f
+#define AMSH_CSTATE_OUTGOING_NONE 0x01
+#define AMSH_CSTATE_OUTGOING_REPLIED 0x02
+#define AMSH_CSTATE_OUTGOING_ESTABLISHED 0x03
+#define AMSH_CSTATE_OUTGOING_DISC_REPLIED 0x04
+#define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 0x05
+#define AMSH_CSTATE_OUTGOING_GET(amaddr) ((amaddr)->_cstate & AMSH_CSTATE_OUTGOING_MASK)
+#define AMSH_CSTATE_OUTGOING_SET(amaddr, state) \
+ (amaddr)->_cstate = (((amaddr)->_cstate & ~AMSH_CSTATE_OUTGOING_MASK) | \
+ ((AMSH_CSTATE_OUTGOING_ ## state) & AMSH_CSTATE_OUTGOING_MASK))
+
+#define AMSH_CSTATE_INCOMING_MASK 0xf0
+#define AMSH_CSTATE_INCOMING_NONE 0x10
+#define AMSH_CSTATE_INCOMING_DISC_REQUESTED 0x40
+#define AMSH_CSTATE_INCOMING_ESTABLISHED 0x50
+#define AMSH_CSTATE_INCOMING_GET(amaddr) ((amaddr)->_cstate & AMSH_CSTATE_INCOMING_MASK)
+#define AMSH_CSTATE_INCOMING_SET(amaddr, state) \
+ (amaddr)->_cstate = (((amaddr)->_cstate & ~AMSH_CSTATE_INCOMING_MASK) | \
+ ((AMSH_CSTATE_INCOMING_ ## state) & AMSH_CSTATE_INCOMING_MASK))
+
/**********************************
 * Shared memory packet formats
 **********************************/
/* Fixed-size (one cache line) packet slot in the short queue. */
typedef
struct am_pkt_short {
	uint32_t flag;	/**> Packet state */
	union {
		uint32_t bulkidx;	/**> index in bulk packet queue */
		uint32_t length;	/**> length when no bulkidx used */
	};
	uint16_t shmidx;	/**> index in shared segment */
	uint16_t type;
	uint16_t nargs;
	uint16_t handleridx;

	psm2_amarg_t args[NSHORT_ARGS];	/* AM arguments */

	/* We eventually will expose up to 8 arguments, but this isn't implemented
	 * For now. >6 args will probably require a medium instead of a short */
} __attribute__ ((aligned(64)))
am_pkt_short_t;
PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64);

/* Variable-size packet used for medium/long transfers; args holds the
 * spillover beyond NSHORT_ARGS, payload is a flexible trailing buffer. */
typedef struct am_pkt_bulk {
	uint32_t flag;
	uint32_t idx;
	uintptr_t dest;		/* Destination pointer in "longs" */
	uint32_t dest_off;	/* Destination pointer offset */
	uint32_t len;		/* Destination length within offset */
	psm2_amarg_t args[NBULK_ARGS];	/* Additional "spillover" for >6 args */
	uint8_t payload[0];
} am_pkt_bulk_t;
/* No strict size decl, used for mediums and longs */
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
/* Each pkt queue has the same header format, although the queue
 * consumers don't use the 'head' index in the same manner. */
typedef struct am_ctl_qhdr {
	uint32_t head;		/* Touched only by 1 consumer */
	/* pad so 'head' sits alone in its cache line (no false sharing
	 * with the producer-side fields below) */
	uint8_t _pad0[64 - 4];

	pthread_spinlock_t lock;	/* serializes producers on 'tail' */
	uint32_t tail;		/* XXX candidate for fetch-and-incr */
	uint32_t elem_cnt;	/* number of slots in the queue */
	uint32_t elem_sz;	/* size of one slot, in bytes */
	uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)];
} am_ctl_qhdr_t;
PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128);

/* Each block reserves some space at the beginning to store auxiliary data */
#define AMSH_BLOCK_HEADER_SIZE 4096

/* Each process has a reply qhdr and a request qhdr */
typedef struct am_ctl_blockhdr {
	volatile am_ctl_qhdr_t shortq;
	volatile am_ctl_qhdr_t longbulkq;
} am_ctl_blockhdr_t;
PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2);

/* We cache the "shorts" because that's what we poll on in the critical path.
 * We take care to always update these pointers whenever the segment is remapped.
 */
typedef struct am_ctl_qshort_cache {
	volatile am_pkt_short_t *base;	/* first slot of the short queue */
	volatile am_pkt_short_t *head;	/* next slot to consume */
	volatile am_pkt_short_t *end;	/* one past the last slot */
} am_ctl_qshort_cache_t;
+
/******************************************
 * Shared segment local directory (global)
 ******************************************
 *
 * Each process keeps a directory for where request and reply structures are
 * located at its peers. This directory must be re-initialized every time the
 * shared segment moves in the VM, and the segment moves every time we remap()
 * for additional memory.
 */
struct amsh_qdirectory {
	am_ctl_blockhdr_t *qreqH;	/* request block header */
	am_pkt_short_t *qreqFifoShort;	/* request short-packet slots */
	am_pkt_bulk_t *qreqFifoLong;	/* request bulk-packet slots */

	am_ctl_blockhdr_t *qrepH;	/* reply block header */
	am_pkt_short_t *qrepFifoShort;	/* reply short-packet slots */
	am_pkt_bulk_t *qrepFifoLong;	/* reply bulk-packet slots */
} __attribute__ ((aligned(64)));

/* Feature bits advertised in am_ctl_nodeinfo.amsh_features.
 * NOTE(review): both flags share value 0x1 -- looks intentional (CMA is
 * the only kassist flavor here) but worth confirming. */
#define AMSH_HAVE_CMA   0x1
#define AMSH_HAVE_KASSIST 0x1

/******************************************
 * Shared fifo element counts and sizes
 ******************************************
 * These values are context-wide, they can only be set early on and can't be *
 * modified at runtime. All endpoints are expected to use the same values.
 */
typedef
struct amsh_qinfo {
	int qreqFifoShort;	/* # of request short-packet slots */
	int qreqFifoLong;	/* # of request bulk-packet slots */

	int qrepFifoShort;	/* # of reply short-packet slots */
	int qrepFifoLong;	/* # of reply bulk-packet slots */
} amsh_qinfo_t;
+
+/******************************************
+ * Per-endpoint structures (ep-local)
+ ******************************************
+ * Each endpoint keeps its own information as to where it resides in the
+ * directory, and maintains its own cached copies of where the short header
+ * resides in shared memory.
+ *
+ * This structure is carefully arranged to optimize cache locality and
+ * performance. Do not modify without careful and thorough analysis.
+ */
/* Per-process record published in the shared segment so peers can find
 * this endpoint's queues. */
struct am_ctl_nodeinfo {
	uint16_t psm_verno;
	volatile uint16_t is_init;	/* set once the entry is usable */
	volatile pid_t pid;		/* OS pid, used as the CMA target */
	psm2_epid_t epid;
	psm2_epaddr_t epaddr;
	uintptr_t amsh_shmbase;		/* base of this process's shm segment */
	amsh_qinfo_t amsh_qsizes;	/* fifo sizes advertised to peers */
	uint32_t amsh_features;		/* AMSH_HAVE_* feature bits */
	struct amsh_qdirectory qdir;	/* where this process's queues live */
} __attribute__((aligned(64)));

/* Shared-memory PTL instance state (one per endpoint). */
struct ptl {
	psm2_ep_t ep;
	psm2_epid_t epid;
	psm2_epaddr_t epaddr;
	ptl_ctl_t *ctl;

	int connect_phase;
	int connect_outgoing;	/* # of outgoing connections */
	int connect_incoming;	/* # of incoming connections */

	int zero_polls;		/* consecutive polls with no work */
	int amsh_only_polls;
	int max_ep_idx, am_ep_size;
	int psmi_kassist_mode;	/* PSMI_KASSIST_* (may be demoted to OFF
				 * at runtime on CMA failure) */
	char *amsh_keyname;	/* shm segment key */

	/* These three items carefully picked to fit in one cache line. */
	am_ctl_qshort_cache_t reqH;	/* cached view of our request shortq */
	am_ctl_qshort_cache_t repH;	/* cached view of our reply shortq */
	struct am_reqq_fifo_t psmi_am_reqq_fifo;	/* deferred requests */

	am_pkt_short_t amsh_empty_shortpkt;

	struct am_ctl_nodeinfo *self_nodeinfo;
	struct am_ctl_nodeinfo *am_ep;	/* array of peer nodeinfo entries */
} __attribute__((aligned(64)));
+
+#endif
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
new file mode 100644
index 0000000..1f20cdf
--- /dev/null
+++ b/ptl_am/ptl.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/**
+ * Callback function when a receive request is matched with the
+ * tag obtained from the RTS packet.
+ */
+static
+psm2_error_t
+ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
+ amsh_am_token_t *tok)
+{
+ psm2_amarg_t args[5];
+ psm2_epaddr_t epaddr = req->rts_peer;
+ ptl_t *ptl = epaddr->ptlctl->ptl;
+ int cma_succeed = 0;
+ int pid = 0, cuda_ipc_send_completion = 0;
+
+ PSM2_LOG_MSG("entering.");
+ psmi_assert((tok != NULL && was_posted)
+ || (tok == NULL && !was_posted));
+
+ _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n",
+ req, req->buf, req->recv_msglen, tok);
+#ifdef PSM_CUDA
+ if (req->cuda_ipc_handle_attached) {
+
+ void* cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(req->rts_sbuf,
+ (cudaIpcMemHandle_t*)&req->cuda_ipc_handle,
+ req->recv_msglen,
+ req->rts_peer->epid);
+ /* cudaMemcpy into the receive side buffer
+ * based on its location */
+ if (req->is_buf_gpu_mem) {
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_dev_ptr,
+ req->recv_msglen, cudaMemcpyDeviceToDevice);
+ PSMI_CUDA_CALL(cudaEventRecord, req->cuda_ipc_event, 0);
+ PSMI_CUDA_CALL(cudaEventSynchronize, req->cuda_ipc_event);
+ } else
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_dev_ptr,
+ req->recv_msglen, cudaMemcpyDeviceToHost);
+ cuda_ipc_send_completion = 1;
+ am_cuda_memhandle_release(cuda_ipc_dev_ptr);
+ req->cuda_ipc_handle_attached = 0;
+ goto send_cts;
+ }
+#endif
+
+ if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET)
+ && req->recv_msglen > 0
+ && (pid = psmi_epaddr_pid(epaddr))) {
+#ifdef PSM_CUDA
+ /* If the buffer on the send side is on the host,
+ * we alloc a bounce buffer, use kassist and then
+ * do a cudaMemcpy if the buffer on the recv side
+ * resides on the GPU
+ */
+ if (req->is_buf_gpu_mem) {
+ void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->recv_msglen);
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ cuda_ipc_bounce_buf, req->recv_msglen);
+ psmi_assert_always(nbytes == req->recv_msglen);
+ PSMI_CUDA_CALL(cudaMemcpy, req->buf, cuda_ipc_bounce_buf,
+ req->recv_msglen, cudaMemcpyHostToDevice);
+ /* Cuda library has recent optimizations where they do
+ * not guarantee synchronus nature for Host to Device
+ * copies for msg sizes less than 64k. The event record
+ * and synchronize calls are to guarentee completion.
+ */
+ PSMI_CUDA_CALL(cudaEventRecord, req->cuda_ipc_event, 0);
+ PSMI_CUDA_CALL(cudaEventSynchronize, req->cuda_ipc_event);
+ psmi_free(cuda_ipc_bounce_buf);
+ } else {
+ /* cma can be done in handler context or not. */
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ req->buf, req->recv_msglen);
+ psmi_assert_always(nbytes == req->recv_msglen);
+ }
+#else
+ /* cma can be done in handler context or not. */
+ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+ req->buf, req->recv_msglen);
+ if (nbytes == -1) {
+ ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
+ _HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n");
+ }
+ else {
+ psmi_assert_always(nbytes == req->recv_msglen);
+ cma_succeed = 1;
+ }
+ psmi_assert_always(nbytes == req->recv_msglen);
+#endif
+ }
+
+#ifdef PSM_CUDA
+send_cts:
+#endif
+ args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr;
+ args[1].u64w0 = (uint64_t) (uintptr_t) req;
+ args[2].u64w0 = (uint64_t) (uintptr_t) req->buf;
+ args[3].u32w0 = req->recv_msglen;
+ args[3].u32w1 = tok != NULL ? 1 : 0;
+ args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process
+
+ if (tok != NULL) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl,
+ tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx,
+ args, 5, NULL, 0, NULL, 0);
+ } else
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_rtsmatch_hidx,
+ args, 5, NULL, 0, 0);
+
+ /* 0-byte completion or we used kassist */
+ if (pid || cma_succeed ||
+ req->recv_msglen == 0 || cuda_ipc_send_completion == 1) {
+ psmi_mq_handle_rts_complete(req);
+ }
+ PSM2_LOG_MSG("leaving.");
+ return PSM2_OK;
+}
+
/* Non-handler-context entry point for RTS match completion (registered as
 * the rtsmatch callback in psmi_am_mq_handler's RTS path). */
static
psm2_error_t
ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted)
{
	/* was_posted == 0 allows us to assume that we're not running this callback
	 * within am handler context (i.e. we can poll) */
	psmi_assert(was_posted == 0);
	return ptl_handle_rtsmatch_request(req, 0, NULL);
}
+
/* AM handler for MQ envelopes arriving over shared memory.
 *
 * args layout (from the sender):
 *   args[0].u32w0 = opcode, args[0].u32w1 = total msglen (for non-short),
 *   args[1]/args[2].u32w1 = 96-bit tag,
 *   and for RTS: args[3] = sender request ptr, args[4] = sender buffer.
 */
void
psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
		   size_t len)
{
	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
	psm2_mq_req_t req;
	psm2_mq_tag_t tag;
	int rc;
	uint32_t opcode = args[0].u32w0;
	/* tiny/short messages fit in this packet, so their length is just
	 * the payload length; larger ones carry the total in args[0] */
	uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1;

	tag.tag[0] = args[1].u32w1;
	tag.tag[1] = args[1].u32w0;
	tag.tag[2] = args[2].u32w1;
	psmi_assert(toki != NULL);
	_HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n",
		  tok->mq, opcode, (int)len, msglen);

	switch (opcode) {
	case MQ_MSG_TINY:
	case MQ_MSG_SHORT:
	case MQ_MSG_EAGER:
		/* NOTE(review): rc is not checked here; presumably
		 * psmi_mq_handle_envelope always produces a req -- confirm. */
		rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming,
					     &tag, msglen, 0, buf,
					     (uint32_t) len, 1, opcode, &req);

		/* for eager matching */
		req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming;
		req->msg_seqnum = 0;	/* using seqnum 0 */
		break;
	default:{
			/* rendezvous: record the sender's request handle and
			 * source buffer so the CTS can refer back to them */
			void *sreq = (void *)(uintptr_t) args[3].u64w0;
			uintptr_t sbuf = (uintptr_t) args[4].u64w0;
			psmi_assert(narg == 5);
			psmi_assert_always(opcode == MQ_MSG_LONGRTS);
			rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming,
						&tag, msglen, NULL, 0, 1,
						ptl_handle_rtsmatch, &req);

			req->rts_peer = tok->tok.epaddr_incoming;
			req->ptl_req_ptr = sreq;
			req->rts_sbuf = sbuf;
#ifdef PSM_CUDA
			/* Payload in RTS would mean an IPC handle has been
			 * sent. This would also mean the sender has to
			 * send from a GPU buffer
			 */
			if (buf && len > 0) {
				req->cuda_ipc_handle = *((cudaIpcMemHandle_t*)buf);
				req->cuda_ipc_handle_attached = 1;
			}
#endif

			if (rc == MQ_RET_MATCH_OK)	/* we are in handler context, issue a reply */
				ptl_handle_rtsmatch_request(req, 1, tok);
			/* else will be called later */
			break;
		}
	}
	return;
}
+
+void
+psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+ psmi_assert(toki != NULL);
+
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming;
+ psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */
+ psmi_assert_always(req != NULL);
+ psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len);
+
+ return;
+}
+
+/**
+ * Function to handle CTS on the sender.
+ */
/**
 * Function to handle CTS on the sender.
 *
 * args (set by ptl_handle_rtsmatch_request on the receiver):
 *   args[0] = our send request, args[1] = receiver's request,
 *   args[2] = receiver's destination buffer, args[3].u32w0 = msglen,
 *   args[4].u32w0 = receiver's current kassist mode.
 */
void
psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf,
			    size_t len)
{
	amsh_am_token_t *tok = (amsh_am_token_t *) toki;

	psmi_assert(toki != NULL);

	ptl_t *ptl = tok->ptl;
	psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
#ifdef PSM_CUDA
	/* If send side req has a cuda ipc handle attached then we can
	 * assume the data has been copied as soon as we get a CTS
	 */
	if (sreq->cuda_ipc_handle_attached) {
		sreq->cuda_ipc_handle_attached = 0;
		psmi_mq_handle_rts_complete(sreq);
		return;
	}
#endif
	void *dest = (void *)(uintptr_t) args[2].u64w0;
	uint32_t msglen = args[3].u32w0;
	psm2_amarg_t rarg[1];

	_HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n",
		  sreq, (void *)(uintptr_t) args[1].u64w0, sreq->buf, dest,
		  msglen);

	if (msglen > 0) {
		rarg[0].u64w0 = args[1].u64w0;	/* rreq */
		int kassist_mode = ptl->psmi_kassist_mode;
		int kassist_mode_peer = args[4].u32w0;
		// In general, peer process(es) shall have the same kassist mode set,
		// but due to dynamic CMA failure detection, we must align local and remote state,
		// and make protocol to adopt to that potential change.
		if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) {
			ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
			goto no_kassist;
		}

		if (kassist_mode & PSMI_KASSIST_PUT) {
			int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming);
			/* NOTE(review): cma_put returns int64_t; storing it
			 * in size_t makes the -1 check rely on unsigned
			 * wrap-around to SIZE_MAX.  It works, but int64_t
			 * would be clearer -- TODO confirm/clean up. */
			size_t nbytes = cma_put(sreq->buf, pid, dest, msglen);
			if (nbytes == -1) {
				_HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n");
				ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
				/* jumps into the else-branch below: fall back
				 * to pushing the data in a long reply */
				goto no_kassist;
			}

			psmi_assert_always(nbytes == msglen);

			/* Send response that PUT is complete */
			psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx,
					      rarg, 1, NULL, 0, 0);
		} else if (!(kassist_mode & PSMI_KASSIST_MASK)) {
			/* Only transfer if kassist is off, i.e. neither GET nor PUT. */
no_kassist:
			psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg,
					     1, sreq->buf, msglen, dest, 0);
		}
	}
	psmi_mq_handle_rts_complete(sreq);
}
+
+void
+psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf,
+ size_t len)
+{
+ psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+ psmi_assert(narg == 1);
+ _HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->buf,
+ rreq->recv_msglen);
+ psmi_mq_handle_rts_complete(rreq);
+}
+
+void
+psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len)
+{
+ amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+ psm2_am_handler_fn_t hfn;
+
+ psmi_assert(toki != NULL);
+
+ hfn = psm_am_get_handler_function(tok->mq->ep,
+ (psm2_handler_t) args[0].u32w0);
+
+ /* Invoke handler function. For AM we do not support break functionality */
+ hfn(toki, args + 1, narg - 1, buf, len);
+
+ return;
+}
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000..e1bd064
--- /dev/null
+++ b/ptl_am/ptl_fwd.h
@@ -0,0 +1,64 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_AMSH_H
+#define _PTL_FWD_AMSH_H
+
+/* Symbol in am ptl.  Declared 'extern' here: a header must only declare
+ * this object, not tentatively define it, or every translation unit that
+ * includes this file emits its own definition and the link fails with
+ * multiple-definition errors under -fno-common (the default since GCC 10).
+ * The single definition lives in the am ptl implementation. */
+extern struct ptl_ctl_init psmi_ptl_amsh;
+
+extern int psmi_shm_mq_rv_thresh;
+
+#endif
diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile
new file mode 100644
index 0000000..d48c883
--- /dev/null
+++ b/ptl_ips/Makefile
@@ -0,0 +1,96 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+# Resolve this directory to an absolute path so the shared build flags and
+# top-level includes work regardless of where make was invoked from.
+this_srcdir = $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+# Objects contributed to $(TARGLIB) by the ips ptl, rewritten below so each
+# one lands in $(OUTDIR).
+${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ipserror.o ips_recvq.o \
+		   ips_recvhdrq.o ips_spio.o ips_proto_recv.o ips_proto_connect.o \
+		   ips_proto_dump.o ips_proto_mq.o ips_subcontext.o \
+		   ips_writehdrq.o ips_proto_expected.o ips_tid.o \
+		   ips_scb.o ips_proto_am.o ips_opp_path_rec.o ips_tidflow.o \
+		   ips_epstate.o ips_crc32.o ips_path_rec.o ips_tidcache.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+# One generated .d dependency file per object.
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+# Generate auto-dependency files with the compiler (-MM skips system headers).
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+# Compile; order-only prerequisite on ${DEPS} ensures .d files exist first.
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_ips/ips_crc32.c b/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000..d6ed1bf
--- /dev/null
+++ b/ptl_ips/ips_crc32.c
@@ -0,0 +1,91 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+ modified from its original form to suit our requirements. The zlib
+ license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.3, July 18th, 2005
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup at gzip.org madler at alumni.caltech.edu
+
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64 at csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Lazily-built lookup table: the CRC of every possible byte value. */
+static uint32_t crc_table[256];
+
+/* Nonzero once crc_table has been filled in. */
+static int crc_table_computed;
+
+/* Populate crc_table for the reflected CRC-32 polynomial 0xedb88320
+ * (the same polynomial zlib uses), one bit at a time per table entry. */
+static void make_crc_table(void)
+{
+	int idx;
+
+	for (idx = 0; idx < 256; idx++) {
+		uint32_t rem = (uint32_t) idx;
+		int bit = 8;
+
+		while (bit-- > 0)
+			rem = (rem & 1) ? (0xedb88320 ^ (rem >> 1))
+					: (rem >> 1);
+		crc_table[idx] = rem;
+	}
+	crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see the
+ * crc() routine below)).
+ */
+
+/* Fold len bytes of data into the running CRC 'crc' and return the updated
+ * value.  Per the usual CRC-32 convention the caller seeds with all 1 bits
+ * and takes the one's complement of the final result.  The lookup table is
+ * built on first use. */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+	uint32_t remainder = crc;
+	uint32_t i;
+
+	if (!crc_table_computed)
+		make_crc_table();
+	for (i = 0; i < len; i++)
+		remainder = crc_table[(remainder ^ data[i]) & 0xff] ^
+		    (remainder >> 8);
+	return remainder;
+}
diff --git a/ptl_ips/ips_epstate.c b/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000..8206847
--- /dev/null
+++ b/ptl_ips/ips_epstate.c
@@ -0,0 +1,154 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver. Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index. This narrows the window in which
+ * two jobs communicating with the same set of indexes could suffer crosstalk.
+ */
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK 128
+
+/* Zero the endpoint-state table and derive a pseudo-random base index from
+ * the cycle counter; connection indexes are offsets from this base. */
+psm2_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+	memset(eps, 0, sizeof(*eps));
+	eps->context = context;
+	eps->eps_base_idx =
+	    ((ips_epstate_idx)get_cycles()) & (IPS_EPSTATE_CONNIDX_MAX - 1);
+	return PSM2_OK;
+}
+
+/* Release the slot table (if any) and wipe the bookkeeping structure. */
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps)
+{
+	if (eps->eps_tab != NULL)
+		psmi_free(eps->eps_tab);
+	memset(eps, 0, sizeof(*eps));
+	return PSM2_OK;
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'connidx'.
+ */
+psm2_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
+		ips_epstate_idx *connidx_o)
+{
+	int i, j;
+	ips_epstate_idx connidx;
+
+	/* Grow the table by PTL_EPADDR_ALLOC_CHUNK entries whenever the use
+	 * count would exceed the current capacity; existing entries are
+	 * copied into the fresh zeroed allocation. */
+	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
+		struct ips_epstate_entry *newtab;
+		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+		newtab = (struct ips_epstate_entry *)
+		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
+				eps->eps_tabsize,
+				sizeof(struct ips_epstate_entry));
+		if (newtab == NULL)
+			return PSM2_NO_MEMORY;
+		else if (eps->eps_tab) {	/* NOT first alloc */
+			for (i = 0;
+			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
+				newtab[i] = eps->eps_tab[i];	/* deep copy */
+			psmi_free(eps->eps_tab);
+		}
+		eps->eps_tab = newtab;
+	}
+	/* Find the next free hole. We can afford to do this since connect is not
+	 * in the critical path */
+	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+		if (j == eps->eps_tabsize)
+			j = 0;
+		if (eps->eps_tab[j].ipsaddr == NULL) {
+			eps->eps_tab_nextidx = j + 1;
+			if (eps->eps_tab_nextidx == eps->eps_tabsize)
+				eps->eps_tab_nextidx = 0;
+			break;
+		}
+	}
+	/* The grow-on-entry above guarantees at least one free slot exists. */
+	psmi_assert_always(i != eps->eps_tabsize);
+	/* Externally visible connidx is the table slot offset by the random
+	 * base chosen at init time, reduced mod IPS_EPSTATE_CONNIDX_MAX. */
+	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
+		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx,
+		  j);
+	/* NOTE(review): the slot is claimed here (and eps_tabsizeused was
+	 * already incremented) before the overflow check below; the error
+	 * path does not roll either back -- TODO confirm intentional. */
+	eps->eps_tab[j].ipsaddr = ipsaddr;
+	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
+		return psmi_handle_error(eps->context->ep,
+					 PSM2_TOO_MANY_ENDPOINTS,
+					 "Can't connect to more than %d non-local endpoints",
+					 IPS_EPSTATE_CONNIDX_MAX);
+	}
+	*connidx_o = connidx;
+	return PSM2_OK;
+}
+
+/* Clear the table slot backing 'connidx'.  The slot is only emptied; the
+ * table memory itself is never shrunk. */
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx)
+{
+	/* actual table index */
+	ips_epstate_idx idx =
+	    (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1);
+
+	psmi_assert_always(idx < eps->eps_tabsize);
+	_HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx);
+	eps->eps_tab[idx].ipsaddr = NULL;
+	/* We may eventually want to release memory, but probably not */
+	eps->eps_tabsizeused--;
+	return PSM2_OK;
+}
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000..7308040
--- /dev/null
+++ b/ptl_ips/ips_epstate.h
@@ -0,0 +1,100 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+/* Connection index: a peer's table slot expressed as an offset from
+ * eps_base_idx, always reduced modulo IPS_EPSTATE_CONNIDX_MAX. */
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_CONNIDX_MAX (1<<26)
+
+struct ips_epaddr;
+
+/* One table slot; an empty slot has ipsaddr == NULL. */
+struct ips_epstate_entry {
+	struct ips_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+	const psmi_context_t *context;
+	ips_epstate_idx eps_base_idx;	/* base offset (from cycle counter) */
+	int eps_tabsize;	/* allocated entries in eps_tab */
+	int eps_tabsizeused;	/* entries currently in use */
+	int eps_tab_nextidx;	/* hint for the next free-slot scan */
+
+	struct ips_epstate_entry *eps_tab;
+};
+
+psm2_error_t ips_epstate_init(struct ips_epstate *eps,
+			      const psmi_context_t *contextj);
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm2_error_t ips_epstate_add(struct ips_epstate *eps,
+			     struct ips_epaddr *ipsaddr,
+			     ips_epstate_idx *connidx);
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx);
+
+/* Translate a connection index back to its table entry; returns NULL when
+ * the index maps outside the allocated table. */
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	if (idx < eps->eps_tabsize)
+		return &eps->eps_tab[idx];
+	else
+		return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/ptl_ips/ips_expected_proto.h b/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000..a402b93
--- /dev/null
+++ b/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,397 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * Control and state structure for one instance of the expected protocol. The
+ * protocol depends on some upcalls from internal portions of the receive
+ * protocol (such as opcodes dedicated for expected protocol handling)
+ */
+
+/*
+ * Expected tid operations are carried out over "sessions". One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM2_MQ_RNDV_HFI_WINDOW). Since naks can cause
+ * retransmissions, each session has an session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify if retransmitted
+ * packets reference the correct session.
+ *
+ * index and generation count are each 4 bytes encoded in one ptl_arg. They
+ * could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+#define _desc_idx u32w0
+#define _desc_genc u32w1
+
+/*
+ * For debug and/or other reasons, we can log the state of each tid and
+ * optionally associate it to a particular receive descriptor
+ */
+
+#define TIDSTATE_FREE 0
+#define TIDSTATE_USED 1
+
+/* Per-TID bookkeeping: hardware TID value, FREE/USED state, and the
+ * receive descriptor the TID is currently associated with (if any). */
+struct ips_tidinfo {
+	uint32_t tid;
+	uint32_t state;		/* TIDSTATE_FREE or TIDSTATE_USED */
+	struct ips_tid_recv_desc *tidrecvc;
+};
+
+/* Generate an expected header every 16 packets */
+#define PSM_DEFAULT_EXPECTED_HEADER 16
+
+/* Control/state for one instance of the expected (TID) protocol; holds the
+ * descriptor pools, pending work queues and timers driving TID transfers. */
+struct ips_protoexp {
+	const struct ptl *ptl;
+	struct ips_proto *proto;
+	struct psmi_timer_ctrl *timerq;
+	struct ips_tid tidc;
+	struct ips_tf tfc;
+
+	psm_transfer_type_t ctrl_xfer_type;
+	psm_transfer_type_t tid_xfer_type;
+	struct ips_scbctrl tid_scbc_rv;
+	mpool_t tid_desc_send_pool;
+	mpool_t tid_getreq_pool;
+	mpool_t tid_sreq_pool;	/* backptr into proto->ep->mq */
+	mpool_t tid_rreq_pool;	/* backptr into proto->ep->mq */
+	struct drand48_data tidflow_drand48_data;
+	uint32_t tid_flags;
+	uint32_t tid_send_fragsize;
+	uint32_t tid_page_offset_mask;
+	uint64_t tid_page_mask;
+	uint32_t hdr_pkt_interval;
+	struct ips_tidinfo *tid_info;
+
+	STAILQ_HEAD(ips_tid_send_pend,	/* pending exp. sends */
+		    ips_tid_send_desc) pend_sendq;
+	struct psmi_timer timer_send;
+
+	STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq;	/* pending tid reqs */
+	struct psmi_timer timer_getreqs;
+
+#ifdef PSM_CUDA
+	STAILQ_HEAD(ips_tid_get_cudapend,	/* pending cuda transfers */
+		    ips_tid_get_request) cudapend_getreqsq;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg;
+	mpool_t cuda_hostbuf_pool_recv;
+	mpool_t cuda_hostbuf_pool_small_recv;
+	cudaStream_t cudastream_recv;
+#endif
+};
+
+/*
+ * TID member list format used in communication.
+ * Since the compiler does not make sure the bit fields order,
+ * we use mask and shift defined below.
+typedef struct {
+ uint32_t length:11; // in page unit, max 1024 pages
+ uint32_t reserved:9; // for future usage
+ uint32_t tidctrl:2; // hardware defined tidctrl value
+ uint32_t tid:10; // hardware only support 10bits
+}
+ips_tid_session_member;
+ */
+#define IPS_TIDINFO_LENGTH_SHIFT 0
+#define IPS_TIDINFO_LENGTH_MASK 0x7ff
+#define IPS_TIDINFO_TIDCTRL_SHIFT 20
+#define IPS_TIDINFO_TIDCTRL_MASK 0x3
+#define IPS_TIDINFO_TID_SHIFT 22
+#define IPS_TIDINFO_TID_MASK 0x3ff
+
+#define IPS_TIDINFO_GET_LENGTH(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK)
+#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK)
+#define IPS_TIDINFO_GET_TID(tidinfo) \
+ (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK)
+
+/* On-the-wire TID grant header: unaligned head/tail byte counts, TID-pair
+ * count, offsets, then the packed tsess_list[] entries (decoded with the
+ * IPS_TIDINFO_GET_* accessors). */
+typedef struct {
+	uint8_t tsess_unaligned_start;	/* unaligned bytes at starting */
+	uint8_t tsess_unaligned_end;	/* unaligned bytes at ending */
+	uint16_t tsess_tidcount;	/* tid number for the session */
+	uint32_t tsess_tidoffset;	/* offset in first tid */
+	uint32_t tsess_srcoff;	/* source offset from beginning */
+	uint32_t tsess_length;	/* session length, including start/end */
+
+	uint32_t tsess_list[0];	/* must be last in struct */
+} ips_tid_session_list;
+
+/*
+ * Send-side expected send descriptors.
+ *
+ * Descriptors are allocated when tid grant requests are received (the 'target'
+ * side of an RDMA get request). Descriptors are added to a pending queue of
+ * expected sends and processed one at a time (scb's are requested and messages
+ * sent until all fragments of the descriptor's length are put on the wire).
+ *
+ */
+#define TIDSENDC_SDMA_VEC_DEFAULT	260
+
+/* Send-side state for one expected-send session (allocated when a tid
+ * grant request arrives; queued on pend_sendq and drained by progress). */
+struct ips_tid_send_desc {
+	struct ips_protoexp *protoexp;
+	STAILQ_ENTRY(ips_tid_send_desc) next;
+
+	/* Filled in at allocation time */
+	ptl_arg_t sdescid;	/* sender descid */
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	psm2_mq_req_t mqreq;
+
+	/* tidflow to send tid traffic */
+	struct ips_flow tidflow;
+
+	/* Iterated during send progress */
+	void *userbuf;		/* user provided buffer */
+	void *buffer;
+	uint32_t length;	/* total length, including start/end */
+
+	uint32_t tidbytes;	/* bytes sent over tid so far */
+	uint32_t remaining_tidbytes;
+	uint32_t offset_in_tid;	/* could be more than page */
+	uint32_t remaining_bytes_in_tid;
+
+	uint16_t frame_send;
+	uint16_t tid_idx;
+	uint16_t is_complete;
+	uint16_t frag_size;
+	/* bitmap of queued control messages for flow */
+	uint16_t ctrl_msg_queued;
+
+#ifdef PSM_CUDA
+	/* As size of cuda_hostbuf is less than equal to window size,
+	 * there is a guarantee that the maximum number of host bufs we
+	 * would need to attach to a tidsendc would be 2
+	 */
+	struct ips_cuda_hostbuf *cuda_hostbuf[2];
+	/* Number of hostbufs attached */
+	uint8_t cuda_num_buf;
+#endif
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit tid grant message into a single transfer
+	 * unit, either PIO or SDMA, PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			       sizeof(ips_tid_session_list)];
+	};
+};
+
+#define TIDRECVC_STATE_FREE 0
+#define TIDRECVC_STATE_BUSY 1
+
+/* Per-receive-descriptor error/retransmit counters for the expected
+ * protocol. */
+struct ips_expected_recv_stats {
+	uint32_t nSeqErr;	/* sequence errors observed */
+	uint32_t nGenErr;	/* generation errors observed */
+	uint32_t nReXmit;	/* retransmissions */
+	uint32_t nErrChkReceived;
+};
+
+/* Receive-side state for one expected-protocol window (the 'target' side
+ * of the two-sided RDMA get). */
+struct ips_tid_recv_desc {
+	const psmi_context_t *context;
+	struct ips_protoexp *protoexp;
+
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	struct ips_tid_get_request *getreq;
+
+	/* scb to send tid grant CTS */
+	ips_scb_t *grantscb;
+	/* scb to send tid data completion */
+	ips_scb_t *completescb;
+
+	/* tidflow to only send ctrl msg ACK and NAK */
+	struct ips_flow tidflow;
+
+	/* TF protocol state (recv) */
+	uint32_t state;
+	uint32_t tidflow_active_gen;
+	uint32_t tidflow_nswap_gen;
+	psmi_seqnum_t tidflow_genseq;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *cuda_hostbuf;
+	uint8_t is_ptr_gpu_backed;
+#endif
+
+	void *buffer;
+	uint32_t recv_msglen;
+	uint32_t recv_tidbytes;	/* exclude start/end trim */
+
+	struct ips_expected_recv_stats stats;
+
+	/* bitmap of queued control messages for */
+	uint16_t ctrl_msg_queued;
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit tid grant message into a single transfer
+	 * unit, either PIO or SDMA, PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			       sizeof(ips_tid_session_list)];
+	};
+};
+
+/*
+ * Get requests, issued by MQ when there's a match on a large message. Unlike
+ * an RDMA get, the initiator identifies the location of the data at the target
+ * using a 'send token' instead of a virtual address. This, of course, assumes
+ * that the target has already registered the token and communicated it to the
+ * initiator beforehand (it actually sends the token as part of the initial
+ * MQ message that contains the MQ tag).
+ *
+ * The operation is semantically a two-sided RDMA get.
+ */
+/* Completion callback invoked (with tidgr_ucontext) once a get request
+ * has fully transferred. */
+typedef void (*ips_tid_completion_callback_t) (void *);
+
+/* One MQ-issued get request; identifies remote data by send token rather
+ * than virtual address (see block comment above). */
+struct ips_tid_get_request {
+	STAILQ_ENTRY(ips_tid_get_request) tidgr_next;
+	struct ips_protoexp *tidgr_protoexp;
+	psm2_epaddr_t tidgr_epaddr;
+
+	void *tidgr_lbuf;	/* local buffer; presumably the get destination */
+	uint32_t tidgr_length;
+	uint32_t tidgr_rndv_winsz;
+	uint32_t tidgr_sendtoken;
+	ips_tid_completion_callback_t tidgr_callback;
+	void *tidgr_ucontext;	/* opaque user context for the callback */
+
+	uint32_t tidgr_offset;	/* offset in bytes */
+	uint32_t tidgr_bytesdone;
+	uint32_t tidgr_flags;
+
+#ifdef PSM_CUDA
+	int cuda_hostbuf_used;
+	uint32_t tidgr_cuda_bytesdone;
+	STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend,	/* pending exp. sends */
+		    ips_cuda_hostbuf) pend_cudabuf;
+#endif
+};
+
+/*
+ * Descriptor limits, structure contents of struct psmi_rlimit_mpool for
+ * normal, min and large configurations.
+ */
+#define TID_SENDSESSIONS_LIMITS { \
+ .env = "PSM2_TID_SENDSESSIONS_MAX", \
+ .descr = "Tid max send session descriptors", \
+ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \
+ .minval = 1, \
+ .maxval = 1<<30, \
+ .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 }, \
+ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \
+ .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \
+ }
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to a MQ request. It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
+ */
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+ const struct ips_proto *proto,
+ uint32_t protoexp_flags, int num_of_send_bufs,
+ int num_of_send_desc,
+ struct ips_protoexp **protoexp_o);
+MOCK_DCL_EPILOGUE(ips_protoexp_init);
+
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp);
+void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev);
+void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev);
+
+int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev);
+int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev);
+
+psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc);
+
+/* Byte-wise copy for the unaligned head/tail of a TID transfer.  Copies
+ * from the highest byte down to the lowest; a zero len is a no-op. */
+PSMI_ALWAYS_INLINE(
+void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len))
+{
+	uint16_t i;
+
+	for (i = len; i > 0; i--)
+		dst[i - 1] = src[i - 1];
+}
+
+/*
+ * Peer is waiting (blocked) for this request
+ */
+#define IPS_PROTOEXP_TIDGET_WAIT 0x1
+#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2
+psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+ void *buf, uint32_t length,
+ psm2_epaddr_t epaddr,
+ uint32_t remote_tok, uint32_t flags,
+ ips_tid_completion_callback_t
+ callback, void *context);
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr, psm2_mq_req_t req,
+ ptl_arg_t rdescid, uint32_t tidflow_genseq,
+ ips_tid_session_list *tid_list,
+ uint32_t tid_list_size);
diff --git a/ptl_ips/ips_opp_path_rec.c b/ptl_ips/ips_opp_path_rec.c
new file mode 100644
index 0000000..b9c3904
--- /dev/null
+++ b/ptl_ips/ips_opp_path_rec.c
@@ -0,0 +1,602 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include <dlfcn.h>
+
+#define DF_OPP_LIBRARY "libopasadb.so.1.0.0"
+#define DATA_VFABRIC_OFFSET 8
+
+/* SLID and DLID are in network byte order */
+static psm2_error_t
+ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto,
+ uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+ ips_path_rec_t **ppath_rec)
+{
+ psm2_error_t err = PSM2_OK;
+ ibta_path_rec_t query, opp_response;
+#ifdef _HFI_DEBUGGING
+ int opp_response_set = 0;
+#endif
+ ips_path_rec_t *path_rec;
+ int opp_err;
+ ENTRY elid, *epath = NULL;
+ char eplid[128];
+ uint64_t timeout_ack_ms;
+
+ /* Query path record query cache first */
+ bzero(&query, sizeof(query));
+ bzero(eplid, sizeof(eplid));
+
+ /* Bulk service ID is control service id + 1 */
+ switch (type) {
+ case IPS_PATH_LOW_PRIORITY:
+ query.service_id =
+ __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET);
+ break;
+ case IPS_PATH_NORMAL_PRIORITY:
+ case IPS_PATH_HIGH_PRIORITY:
+ default:
+ query.service_id = __cpu_to_be64(proto->ep->service_id);
+ }
+
+ query.slid = slid;
+ query.dlid = dlid;
+
+ snprintf(eplid, sizeof(eplid), "%s_%x_%x",
+ (type == IPS_PATH_LOW_PRIORITY) ? "LOW" : "HIGH",
+ query.slid, query.dlid);
+ elid.key = eplid;
+ hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+ if (!epath) { /* Unable to find path record in cache */
+ elid.key =
+ psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+ path_rec = (ips_path_rec_t *)
+ psmi_calloc(proto->ep, UNDEFINED, 1,
+ sizeof(ips_path_rec_t));
+ if (!elid.key || !path_rec) {
+ if (elid.key)
+ psmi_free(elid.key);
+ if (path_rec)
+ psmi_free(path_rec);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ /* Get path record between local LID and remote */
+ opp_err =
+ proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt,
+ &query,
+ &opp_response);
+ if (opp_err) {
+ psmi_free(path_rec);
+ psmi_free(elid.key);
+ err = PSM2_EPID_PATH_RESOLUTION;
+ goto fail;
+ }
+#ifdef _HFI_DEBUGGING
+ opp_response_set = 1;
+#endif
+ /* Create path record */
+ path_rec->pr_slid = opp_response.slid;
+ path_rec->pr_dlid = opp_response.dlid;
+ path_rec->pr_mtu =
+ min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f),
+ proto->epinfo.ep_mtu);
+ path_rec->pr_pkey = ntohs(opp_response.pkey);
+ path_rec->pr_sl = ntohs(opp_response.qos_class_sl);
+ path_rec->pr_static_ipd =
+ proto->ips_ipd_delay[opp_response.rate & 0x3f];
+
+ /* Setup CCA parameters for path */
+ if (path_rec->pr_sl > PSMI_SL_MAX) {
+ psmi_free(path_rec);
+ psmi_free(elid.key);
+ err = PSM2_INTERNAL_ERR;
+ goto fail;
+ }
+ if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) {
+ _HFI_CCADBG("No CCA for sl %d, disable CCA\n",
+ path_rec->pr_sl);
+ proto->flags &= ~IPS_PROTO_FLAG_CCA;
+ proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+ }
+ if (!(proto->ep->context.runtime_flags &
+ HFI1_CAP_STATIC_RATE_CTRL)) {
+ _HFI_CCADBG("No Static-Rate-Control, disable CCA\n");
+ proto->flags &= ~IPS_PROTO_FLAG_CCA;
+ proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+ }
+
+ path_rec->proto = proto;
+ path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min;
+ path_rec->pr_timer_cca = NULL;
+
+ /* Determine active IPD for path. Is max of static rate and CCT table */
+ if (!(proto->flags & IPS_PROTO_FLAG_CCA)) {
+ path_rec->pr_active_ipd = 0;
+ path_rec->pr_cca_divisor = 0;
+ } else if ((path_rec->pr_static_ipd) &&
+ ((path_rec->pr_static_ipd + 1) >
+ (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
+ path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
+ path_rec->pr_cca_divisor = 0; /*Static rate has no CCA divisor */
+ } else {
+ /* Pick it from the CCT table */
+ path_rec->pr_active_ipd =
+ proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
+ path_rec->pr_cca_divisor =
+ proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;
+ }
+
+ /* Compute max timeout based on pkt life time for path */
+ timeout_ack_ms =
+ ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) /
+ 1000000UL);
+ timeout_ack_ms =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT +
+ timeout_ack_ms);
+ if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms)
+ proto->epinfo.ep_timeout_ack_max = timeout_ack_ms;
+
+ /* Add path record into cache */
+ strcpy(elid.key, eplid);
+ elid.data = (void *)path_rec;
+ hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+ } else /* Path record found in cache */
+ path_rec = (ips_path_rec_t *) epath->data;
+
+#ifdef _HFI_DEBUGGING
+ /* Dump path record stats */
+ _HFI_PRDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n",
+ (uint64_t) __be64_to_cpu(query.service_id),
+ __be16_to_cpu(slid), __be16_to_cpu(dlid));
+ if (opp_response_set)
+ {
+ _HFI_PRDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f),
+ path_rec->pr_mtu);
+ _HFI_PRDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey));
+ _HFI_PRDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl));
+ _HFI_PRDBG("Rate: %x, IPD: %x\n", (opp_response.rate & 0x3f),
+ path_rec->pr_static_ipd);
+ }
+ _HFI_PRDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n",
+ proto->epinfo.ep_timeout_ack,
+ proto->epinfo.ep_timeout_ack_max);
+#endif
+ /* Return the IPS path record */
+ *ppath_rec = path_rec;
+
+fail:
+ return err;
+}
+
/*
 * Build (or fetch from cache) the path group for a <slid, dlid> pair by
 * querying OPP for high, normal and low priority paths across all LMC
 * LIDs.  On success *ppathgrp points at the cached group.  Returns
 * PSM2_OK, PSM2_NO_MEMORY, PSM2_EPID_PATH_RESOLUTION (no usable path
 * for some priority), or PSM2_EP_DEVICE_FAILURE (pkey programming).
 */
static psm2_error_t
ips_opp_path_rec(struct ips_proto *proto,
		 uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
		 unsigned long timeout, ips_path_grp_t **ppathgrp)
{
	psm2_error_t err = PSM2_OK;
	uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
	ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
	ips_path_rec_t *path;
	ips_path_grp_t *pathgrp;
	uint16_t path_slid, path_dlid;
	ENTRY elid, *epath = NULL;
	char eplid[128];

	/*
	 * High Priority Path
	 * ------------------
	 *
	 * Uses the "base" Service ID. For now there exists only 1 high priority
	 * path between nodes even for non zero LMC fabrics.
	 *
	 * Normal/Low Priority Paths
	 * -------------------------
	 *
	 * Currently these paths are the same i.e. they are queried for the same
	 * Service ID/vFabric which is the Base Service ID for High Priority + 1.
	 *
	 * Use case Scenarios
	 * ------------------
	 *
	 * Since with vFabrics we have the capability to define different QoS
	 * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is
	 * setup in a separate vFabric for high priority traffic. The NORMAL paths
	 * are setup in a separate vFabric optimized for high bandwidth. This allows
	 * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked
	 * by bulk transfer data. All control messages (ACKs, NAKs, TID_GRANT etc.)
	 * also use the high priority control vFabric.
	 *
	 * NOTE: In order to distinguish between the different vFabrics the user
	 * specifies the service ID to use via mpirun (or environment variable).
	 * This is the service ID for the high priority control traffic. The bulk
	 * data vFabric is identified by service ID + 1. So for each MPI application
	 * one should specify two service IDs for the high priority and bulk data.
	 * Both these service IDs can be placed in the same vFabric which can be
	 * configured for high priority or bandwidth traffic giving us the default
	 * behavior up to the Infinipath 2.5 release.
	 *
	 * NOTE: All of the above would have really helped if the S20 silicon could
	 * correctly support IBTA QoS features. Due to S20 design we can only have
	 * high priority VLarb table (low priority VLarb table results in round
	 * robin arbitration ignoring the weights!). But if this is fixed in a
	 * subsequent chip respin then this may potentially help our scalability
	 * on large fabrics.
	 *
	 * Mesh/Torus and DOR routed networks
	 * ----------------------------------
	 *
	 * In a mesh/torus fabric we always have a non zero LMC (at least 1 can be
	 * more). We would like to take advantage of dispersive routing on these
	 * fabrics as well to obtain better "worst case/congested" bandwidth. For
	 * these networks currently the base LIDs are used for UPDN routing which
	 * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR
	 * routing (Dimension Ordered Routing) to avoid deadlocks and provide
	 * higher performance. If a fabric is disrupted then only the base UPDN
	 * routing is available. PSM should continue to operate in this environment
	 * albeit with degraded performance. In disrupted fabric the OPP path
	 * record queries may fail for some DOR routed LIDs i.e. no path exists.
	 * PSM should hence ignore path record failures as they indicate a disrupted
	 * fabric and only use valid paths that are returned from the replica. This
	 * will degenerate to only using the UPDN paths on disrupted fabrics and DOR
	 * routes only for fully configured fabrics. Note: For a clean fabric the
	 * base LIDs that are configured for UPDN route will not exist in the replica
	 * as DOR routes are preferred. Hence we will only dispersively route across
	 * the DOR routes only using the UPDN route for disrupted fabrics.
	 *
	 * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN
	 * TAKE PLACE.
	 */

	/* Check if this path grp is already in hash table */
	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
	elid.key = eplid;
	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);

	if (epath) {		/* Found path group in cache */
		*ppathgrp = (ips_path_grp_t *) epath->data;
		return err;
	}

	/* If base lids are only used then reset num_path to 1 */
	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
		num_path = 1;

	/* Allocate a new pathgroup; the path pointer array for all
	 * num_path x priority slots is tacked onto the end. */
	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
	pathgrp = (ips_path_grp_t *)
	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
			num_path * IPS_PATH_MAX_PRIORITY *
			sizeof(ips_path_rec_t *));
	if (!elid.key || !pathgrp) {
		if (elid.key)
			psmi_free(elid.key);
		if (pathgrp)
			psmi_free(pathgrp);
		err = PSM2_NO_MEMORY;
		goto fail;
	}

	/* dlid is the peer base lid */
	pathgrp->pg_base_lid = __be16_to_cpu(dlid);

	pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] =
	    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] =
	    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0;

	/* For now there is always only one high priority path between nodes:
	 * scan LMC LIDs until the first one resolves. */
	for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

		err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
					   path_slid, path_dlid,
					   desthfi_type, &path);

		if (err == PSM2_OK) {	/* Valid high priority path found */
			/* Resolved high priority path successfully */
			pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++;
			pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path;

			/* Increment current path index */
			cpath++;
		}
	}

	/* Make sure we have at least 1 high priority path */
	if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Once we have the high-priority path, set the partition key */
	if (hfi_set_pkey(proto->ep->context.ctrl,
			 (uint16_t) pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey) != 0) {
		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
					"Couldn't set device pkey 0x%x: %s",
					(int)pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey,
					strerror(errno));
		psmi_free(elid.key);
		psmi_free(pathgrp);
		goto fail;
	}


	/* Next setup the bulk paths. If the subnet administrator has misconfigured
	 * or rather not configured two separate service IDs we place the bulk
	 * paths in the same vFabric as the control paths.
	 */

	path_type = IPS_PATH_NORMAL_PRIORITY;
	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

retry_normal_path_res:
		err = ips_opp_get_path_rec(path_type, proto,
					   path_slid, path_dlid, desthfi_type,
					   &path);
		if (err != PSM2_OK) {
			if (path_type == IPS_PATH_NORMAL_PRIORITY) {
				/* Subnet may only be configured for one service ID/vFabric. Default
				 * to using the control vFabric/service ID for bulk data as well.
				 */
				path_type = IPS_PATH_HIGH_PRIORITY;
				goto retry_normal_path_res;
			}

			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
			 * for disrupted fabrics using DOR routing so continue to acquire paths
			 */
			err = PSM2_OK;
			continue;
		}

		/* Valid path. */
		pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path;
		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
		cpath++;
	}

	/* Make sure we have at least a single bulk data transfer path */
	if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Low priority paths: same procedure as the normal priority loop
	 * above, falling back to the control vFabric on failure. */
	path_type = IPS_PATH_LOW_PRIORITY;
	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);

retry_low_path_res:
		err = ips_opp_get_path_rec(path_type, proto,
					   path_slid, path_dlid, desthfi_type,
					   &path);
		if (err != PSM2_OK) {
			if (path_type == IPS_PATH_LOW_PRIORITY) {
				/* Subnet may only be configured for one service ID/vFabric. Default
				 * to using the control vFabric/service ID for bulk data as well.
				 */
				path_type = IPS_PATH_HIGH_PRIORITY;
				goto retry_low_path_res;
			}

			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
			 * for disrupted fabrics using DOR routing so continue to acquire paths
			 */
			err = PSM2_OK;
			continue;
		}

		/* Valid path. */
		pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path;
		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++;
		cpath++;
	}

	/* Make sure we have at least a single low priority path */
	if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) {
		psmi_free(elid.key);
		psmi_free(pathgrp);
		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
					"OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
					PRIx64 " defined?", ntohs(slid),
					ntohs(dlid),
					(uint64_t) proto->ep->service_id);
		goto fail;
	}

	/* Adaptive policy: stagger each endpoint's starting path by its
	 * context number to spread load across the available paths. */
	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
		    proto->epinfo.ep_context %
		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
		    proto->epinfo.ep_context %
		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
	}

	/* Add path group into cache */
	strcpy(elid.key, eplid);
	elid.data = (void *)pathgrp;
	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);

	*ppathgrp = pathgrp;

fail:
	if (err != PSM2_OK)
		_HFI_PRDBG
		    ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n",
		     slid, dlid);
	return err;
}
+
+static psm2_error_t ips_opp_fini(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+
+ if (proto->opp_lib)
+ dlclose(proto->opp_lib);
+
+ return err;
+}
+
+psm2_error_t ips_opp_init(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+ char hfiName[32];
+
+ proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+ if (!proto->opp_lib) {
+ char *err = dlerror();
+ _HFI_ERROR
+ ("Unable to open OFED Plus Plus library %s. Error: %s\n",
+ DF_OPP_LIBRARY, err ? err : "no dlerror()");
+ goto fail;
+ }
+
+ /* Resolve symbols that we require within opp library */
+ proto->opp_fn.op_path_find_hca =
+ dlsym(proto->opp_lib, "op_path_find_hfi");
+ proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+ proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+ proto->opp_fn.op_path_get_path_by_rec =
+ dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+ /* If we can't resovle any symbol then fail to load opp module */
+ if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+ !proto->opp_fn.op_path_close
+ || !proto->opp_fn.op_path_get_path_by_rec) {
+ _HFI_ERROR
+ ("Unable to resolve symbols in OPP library. Unloading.\n");
+ goto fail;
+ }
+
+ /* If PSM2_IDENTIFY is set display the OPP library location being used. */
+ if (getenv("PSM2_IDENTIFY")) {
+ Dl_info info_opp;
+ printf
+ ("PSM2 path record queries using OFED Plus Plus (%s) from %s\n",
+ DF_OPP_LIBRARY, dladdr(proto->opp_fn.op_path_open,
+ &info_opp) ? info_opp.
+ dli_fname :
+ "Unknown/unsupported version of OPP library found!");
+ }
+
+ /* Obtain handle to hfi (requires verbs on node) */
+ snprintf(hfiName, sizeof(hfiName), "hfi1_%d",
+ proto->ep->context.ctrl->__hfi_unit);
+ proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device);
+ if (!proto->hndl) {
+ _HFI_ERROR
+ ("OPP: Unable to find HFI %s. Disabling OPP interface for path record queries.\n",
+ hfiName);
+ goto fail;
+ }
+
+ /* Get OPP context */
+ proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1);
+ if (!proto->opp_ctxt) {
+ _HFI_ERROR
+ ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n");
+ goto fail;
+ }
+
+ /* Setup default errorcheck timeout. OPP may change it later. */
+ proto->epinfo.ep_timeout_ack =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+ proto->epinfo.ep_timeout_ack_max =
+ ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+ proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT;
+
+ /* OPP initialized successfully */
+ proto->ibta.get_path_rec = ips_opp_path_rec;
+ proto->ibta.fini = ips_opp_fini;
+ proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC;
+
+ return err;
+
+fail:
+ _HFI_ERROR("Make sure SM is running...\n");
+ _HFI_ERROR("Make sure service ibacm is running...\n");
+ _HFI_ERROR("to start ibacm: service ibacm start\n");
+ _HFI_ERROR("or enable it at boot time: opaconfig -E ibacm\n\n");
+
+ err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+ "Unable to initialize OFED Plus library successfully.\n");
+
+ if (proto->opp_lib)
+ dlclose(proto->opp_lib);
+
+ return err;
+}
diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c
new file mode 100644
index 0000000..647b111
--- /dev/null
+++ b/ptl_ips/ips_path_rec.c
@@ -0,0 +1,791 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+static void ips_gen_ipd_table(struct ips_proto *proto)
+{
+ uint8_t delay = 0, step = 1;
+ /* Based on our current link rate setup the IPD table */
+ memset(proto->ips_ipd_delay, 0xFF, sizeof(proto->ips_ipd_delay));
+
+ /*
+ * Based on the starting rate of the link, we let the code to
+ * fall through to next rate without 'break' in the code. The
+ * decrement is doubled at each rate level...
+ */
+ switch (proto->epinfo.ep_link_rate) {
+ case IBV_RATE_300_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_200_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_168_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_120_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ case IBV_RATE_112_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ case IBV_RATE_100_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_80_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_80_GBPS] = delay;
+ case IBV_RATE_60_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_60_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_40_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_40_GBPS] = delay;
+ case IBV_RATE_30_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_30_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_25_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_25_GBPS] = delay;
+ case IBV_RATE_20_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_20_GBPS] = delay;
+ delay += step;
+ step *= 2;
+ case IBV_RATE_10_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_10_GBPS] = delay;
+ case IBV_RATE_5_GBPS:
+ proto->ips_ipd_delay[IBV_RATE_5_GBPS] = delay;
+ default:
+ break;
+ }
+}
+
+static psm2_error_t ips_gen_cct_table(struct ips_proto *proto)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t cca_divisor, ipdidx, ipdval = 1;
+ uint16_t *cct_table;
+
+ /* The CCT table is static currently. If it's already created then return */
+ if (proto->cct)
+ goto fail;
+
+ /* Allocate the CCT table */
+ cct_table = psmi_calloc(proto->ep, UNDEFINED,
+ proto->ccti_size, sizeof(uint16_t));
+ if (!cct_table) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ if (proto->ccti_size)
+ {
+ /* The first table entry is always 0 i.e. no IPD delay */
+ cct_table[0] = 0;
+ }
+
+ /* Generate the remaining CCT table entries */
+ for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++)
+ for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) {
+ if ((ipdidx + cca_divisor) == proto->ccti_size)
+ break;
+ cct_table[ipdidx + cca_divisor] =
+ (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) |
+ (ipdval & 0x3FFF));
+ _HFI_CCADBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n",
+ ipdidx + cca_divisor,
+ cct_table[ipdidx + cca_divisor],
+ (cct_table[ipdidx + cca_divisor] >>
+ CCA_DIVISOR_SHIFT),
+ cct_table[ipdidx +
+ cca_divisor] & CCA_IPD_MASK);
+ }
+
+ /* On link up/down CCT is re-generated. If CCT table is previously created
+ * free it
+ */
+ if (proto->cct) {
+ psmi_free(proto->cct);
+ proto->cct = NULL;
+ }
+
+ /* Update to the new CCT table */
+ proto->cct = cct_table;
+
+fail:
+ return err;
+}
+
+static opa_rate ips_default_hfi_rate(uint16_t hfi_type)
+{
+ opa_rate rate;
+
+ switch (hfi_type) {
+ case PSMI_HFI_TYPE_OPA1:
+ rate = IBV_RATE_100_GBPS;
+ break;
+ case PSMI_HFI_TYPE_OPA2:
+ rate = IBV_RATE_120_GBPS;
+ break;
+ default:
+ rate = IBV_RATE_MAX;
+ }
+
+ return rate;
+}
+
+static opa_rate ips_rate_to_enum(int link_rate)
+{
+ opa_rate rate;
+
+ switch (link_rate) {
+ case 300:
+ rate = IBV_RATE_300_GBPS;
+ break;
+ case 200:
+ rate = IBV_RATE_200_GBPS;
+ break;
+ case 100:
+ rate = IBV_RATE_100_GBPS;
+ break;
+ case 25:
+ rate = IBV_RATE_25_GBPS;
+ break;
+ case 168:
+ rate = IBV_RATE_168_GBPS;
+ break;
+ case 112:
+ rate = IBV_RATE_112_GBPS;
+ break;
+ case 56:
+ rate = IBV_RATE_56_GBPS;
+ break;
+ case 14:
+ rate = IBV_RATE_14_GBPS;
+ break;
+ case 120:
+ rate = IBV_RATE_120_GBPS;
+ break;
+ case 80:
+ rate = IBV_RATE_80_GBPS;
+ break;
+ case 60:
+ rate = IBV_RATE_60_GBPS;
+ break;
+ case 40:
+ rate = IBV_RATE_40_GBPS;
+ break;
+ case 30:
+ rate = IBV_RATE_30_GBPS;
+ break;
+ case 20:
+ rate = IBV_RATE_20_GBPS;
+ break;
+ case 10:
+ rate = IBV_RATE_10_GBPS;
+ break;
+ case 5:
+ rate = IBV_RATE_5_GBPS;
+ break;
+ default:
+ rate = IBV_RATE_MAX;
+ }
+
+ return rate;
+}
+
/*
 * "None" (no SM query) path record resolution: synthesize a single path
 * record for <slid, dlid> from the local endpoint's own parameters
 * (MTU, pkey, SL) and cache it in proto->ips_path_rec_hash.
 *
 * slid/dlid are stored as given (network byte order from the callers).
 * 'timeout' is accepted for interface symmetry but unused here.
 *
 * Returns PSM2_OK, PSM2_NO_MEMORY, or PSM2_INTERNAL_ERR (SL > PSMI_SL_MAX).
 */
static psm2_error_t
ips_none_get_path_rec(struct ips_proto *proto,
		      uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
		      unsigned long timeout, ips_path_rec_t **ppath_rec)
{
	psm2_error_t err = PSM2_OK;
	ips_path_rec_t *path_rec;
	ENTRY elid, *epath = NULL;
	char eplid[128];

	/* Query the path record cache */
	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
	elid.key = eplid;
	hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);

	if (!epath) {
		elid.key =
		    psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
		path_rec = (ips_path_rec_t *)
		    psmi_calloc(proto->ep, UNDEFINED, 1,
				sizeof(ips_path_rec_t));
		if (!elid.key || !path_rec) {
			if (elid.key)
				psmi_free(elid.key);
			if (path_rec)
				psmi_free(path_rec);
			return PSM2_NO_MEMORY;
		}

		/* Create path record from local endpoint parameters (no SM
		 * to ask, so assume symmetric configuration). */
		path_rec->pr_slid = slid;
		path_rec->pr_dlid = dlid;
		path_rec->pr_mtu = proto->epinfo.ep_mtu;
		path_rec->pr_pkey = proto->epinfo.ep_pkey;
		path_rec->pr_sl = proto->epinfo.ep_sl;

		/* Determine the IPD based on our local link rate and default link rate for
		 * remote hfi type.
		 */
		path_rec->pr_static_ipd =
		    proto->ips_ipd_delay[ips_default_hfi_rate(desthfi_type)];

		_HFI_CCADBG("pr_static_ipd = %d\n", (int) path_rec->pr_static_ipd);

		/* Setup CCA parameters for path */
		if (path_rec->pr_sl > PSMI_SL_MAX) {
			psmi_free(elid.key);
			psmi_free(path_rec);
			return PSM2_INTERNAL_ERR;
		}
		/* NOTE: these disable CCA for the whole proto instance, not
		 * just this path. */
		if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) {
			_HFI_CCADBG("No CCA for sl %d, disable CCA\n",
				    path_rec->pr_sl);
			proto->flags &= ~IPS_PROTO_FLAG_CCA;
			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
		}
		if (!(proto->ep->context.runtime_flags &
		      HFI1_CAP_STATIC_RATE_CTRL)) {
			_HFI_CCADBG("No Static-Rate-Control, disable CCA\n");
			proto->flags &= ~IPS_PROTO_FLAG_CCA;
			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
		}

		path_rec->proto = proto;
		path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min;
		path_rec->pr_timer_cca = NULL;

		/* Determine active IPD for path. Is max of static rate and CCT table */
		if (!(proto->flags & IPS_PROTO_FLAG_CCA)) {
			_HFI_CCADBG("No IPS_PROTO_FLAG_CCA\n");

			path_rec->pr_active_ipd = 0;
			path_rec->pr_cca_divisor = 0;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		} else if ((path_rec->pr_static_ipd) &&
			   ((path_rec->pr_static_ipd + 1) >
			    (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
			_HFI_CCADBG("IPS_PROTO_FLAG_CCA set, Setting pr_active_ipd.\n");

			path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
			path_rec->pr_cca_divisor = 0;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		} else {
			/* Pick it from the CCT table */
			_HFI_CCADBG("Picking up active IPD from CCT table, index %d, value 0x%x\n",
				    (int) path_rec->pr_ccti, (int) proto->cct[path_rec->pr_ccti]);

			path_rec->pr_active_ipd =
			    proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
			path_rec->pr_cca_divisor =
			    proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;

			_HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd);
			_HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor);
		}

		/* Add path record into cache */
		strcpy(elid.key, eplid);
		elid.data = (void *)path_rec;
		hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
	} else
		path_rec = (ips_path_rec_t *) epath->data;

	/* Return IPS path record */
	*ppath_rec = path_rec;

	return err;
}
+
+/*
+ * Build (or fetch from cache) the path group between slid and dlid for the
+ * "none" (no external path record query) resolution mode.
+ *
+ * slid/dlid are base LIDs in network byte order; desthfi_type and timeout
+ * are forwarded to ips_none_get_path_rec() for each generated path.  On
+ * success *ppathgrp points at the (possibly cached) group.
+ */
+static psm2_error_t
+ips_none_path_rec(struct ips_proto *proto,
+		  uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+		  unsigned long timeout, ips_path_grp_t **ppathgrp)
+{
+	psm2_error_t err = PSM2_OK;
+	uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc);
+	uint16_t base_slid, base_dlid;
+	ips_path_rec_t *path;
+	ips_path_grp_t *pathgrp;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	/* For the "none" path record resolution all paths are assumed to be of equal
+	 * priority however since we want to isolate all control traffic (acks, naks)
+	 * to a separate path for non zero LMC subnets the "first path" between a
+	 * pair of endpoints is always the "higher" priority paths. The rest of the
+	 * paths are the normal (and low priority) paths.
+	 */
+
+	/* Query the path record cache */
+	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);
+
+	if (epath) {	/* Find path group in cache */
+		*ppathgrp = (ips_path_grp_t *) epath->data;
+		return err;
+	}
+
+	/* If base lids are only used then reset num_path to 1 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		num_path = 1;
+
+	/* Allocate a new pathgroup.  The group struct and its trailing
+	 * pg_path pointer matrix (num_path x IPS_PATH_MAX_PRIORITY) are
+	 * allocated as one chunk; elid.key is a separate heap copy of the
+	 * cache key whose ownership passes to the hash table on ENTER. */
+	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+	pathgrp = (ips_path_grp_t *)
+	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
+			num_path * IPS_PATH_MAX_PRIORITY *
+			sizeof(ips_path_rec_t *));
+	if (!elid.key || !pathgrp) {
+		if (elid.key)
+			psmi_free(elid.key);
+		if (pathgrp)
+			psmi_free(pathgrp);
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	/* dlid is the peer base lid */
+	pathgrp->pg_base_lid = __be16_to_cpu(dlid);
+
+	if (num_path > 1) {
+		/* One control path and (num_path - 1) norm and low priority paths */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1;
+	} else {
+		/* LMC of 0. Use the same path for all priorities */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1;
+	}
+
+	/* For "none" path record we just setup 2^lmc paths. To get better load
+	 * balance
+	 */
+	for (pidx = 0; pidx < num_path; pidx++) {
+		/* Each path offsets both base LIDs by pidx (LMC routing). */
+		base_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		base_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+		err =
+		    ips_none_get_path_rec(proto, base_slid, base_dlid,
+					  desthfi_type, timeout, &path);
+		if (err != PSM2_OK) {
+			psmi_free(elid.key);
+			psmi_free(pathgrp);
+			goto fail;
+		}
+
+		if (num_path > 1) {
+			if (pidx == 0) {
+				/* First path is always the high priority path */
+				pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] =
+				    path;
+			} else {
+				/* Remaining paths fill slots 0..num_path-2 of
+				 * the normal and low priority rows. */
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_NORMAL_PRIORITY] =
+				    path;
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_LOW_PRIORITY] =
+				    path;
+			}
+		} else {
+			pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path;
+		}
+	}
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* Stagger each endpoint's starting path by its context index
+		 * so concurrent endpoints spread load across paths. */
+		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
+		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
+	}
+
+	/* Add path record into cache.
+	 * NOTE(review): the hsearch_r ENTER result is not checked; if the
+	 * table is full, elid.key and pathgrp would leak — confirm intended. */
+	strcpy(elid.key, eplid);
+	elid.data = (void *)pathgrp;
+	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);
+
+	*ppathgrp = pathgrp;
+
+fail:
+	if (err != PSM2_OK)
+		_HFI_PRDBG
+		    ("Unable to get path record for LID %x <---> DLID %x.\n",
+		     slid, dlid);
+	return err;
+}
+
+/*
+ * Initialize "none" (query-less) path record resolution for this proto:
+ * copy SL/PKEY from the endpoint, parse error-check timeout tuning from the
+ * environment, install the get_path_rec callback and program the pkey into
+ * the device.
+ */
+static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* SL and PKEY were already resolved onto the endpoint (from the
+	 * environment, e.g. HFI_SL / PSM_KEY); cache them in epinfo. */
+	proto->epinfo.ep_sl = proto->ep->out_sl;
+	proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey;
+
+	/*
+	 * Parse the err_chk settings from the environment.
+	 * <min_timeout>:<max_timeout>:<timeout_factor>
+	 */
+	{
+		union psmi_envvar_val env_to;
+		char *errchk_to = PSM_TID_TIMEOUT_DEFAULT;
+		int tvals[3] = {
+			IPS_PROTO_ERRCHK_MS_MIN_DEFAULT,
+			IPS_PROTO_ERRCHK_MS_MAX_DEFAULT,
+			IPS_PROTO_ERRCHK_FACTOR_DEFAULT
+		};
+
+		if (!psmi_getenv("PSM2_ERRCHK_TIMEOUT",
+				 "Errchk timeouts in mS <min:max:factor>",
+				 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				 (union psmi_envvar_val)errchk_to, &env_to)) {
+			/* Not using default values, parse what we can */
+			errchk_to = env_to.e_str;
+			psmi_parse_str_tuples(errchk_to, 3, tvals);
+			/* Adjust for max smaller than min, things would break */
+			if (tvals[1] < tvals[0])
+				tvals[1] = tvals[0];
+		}
+
+		/* Store the min/max timeouts as cycle counts; the factor is
+		 * a plain multiplier. */
+		proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]);
+		proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]);
+		proto->epinfo.ep_timeout_ack_factor = tvals[2];
+	}
+
+	/* Hook up the "none" resolver; it needs no teardown. */
+	proto->ibta.get_path_rec = ips_none_path_rec;
+	proto->ibta.fini = NULL;
+
+	/* With no path records queries set pkey manually */
+	if (hfi_set_pkey(proto->ep->context.ctrl,
+			 (uint16_t) proto->ep->network_pkey) != 0) {
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Couldn't set device pkey 0x%x: %s",
+					(int)proto->ep->network_pkey,
+					strerror(errno));
+	}
+
+	return err;
+}
+
+/* (Re)load the SL2VL table */
+psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
+{
+	int idx;
+
+	/* Refresh the SL -> SC mapping for this unit/port; fall back to the
+	 * default SC for any entry the driver cannot report. */
+	for (idx = 0; idx < 32; idx++) {
+		int val = hfi_get_port_sl2sc(proto->ep->context.ctrl->__hfi_unit,
+					     proto->ep->context.ctrl->__hfi_port,
+					     (uint8_t) idx);
+
+		if (val < 0)
+			val = PSMI_SC_DEFAULT;
+		proto->sl2sc[idx] = (uint16_t) val;
+	}
+
+	/* Likewise refresh the SC -> VL mapping. */
+	for (idx = 0; idx < 32; idx++) {
+		int val = hfi_get_port_sc2vl(proto->ep->context.ctrl->__hfi_unit,
+					     proto->ep->context.ctrl->__hfi_port,
+					     (uint8_t) idx);
+
+		if (val < 0)
+			val = PSMI_VL_DEFAULT;
+		proto->sc2vl[idx] = (uint16_t) val;
+	}
+
+	return PSM2_OK;
+}
+
+/*
+ * Refresh link-dependent protocol state.  Invoked at startup and on link
+ * up/down events: re-reads the base LID, LMC and link rate (all of which
+ * may change across a link bounce) and regenerates the SL2SC/SC2VL, IPD
+ * and CCT tables that depend on them.
+ */
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	int ret;
+
+	/* Get base lid, lmc and rate as these may have changed if the link bounced */
+	proto->epinfo.ep_base_lid =
+	    __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid));
+
+	if ((ret = hfi_get_port_lmc(proto->ep->context.ctrl->__hfi_unit,
+				    proto->ep->context.ctrl->__hfi_port)) < 0) {
+		/* Fixed message: was "Could obtain LMC..." */
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Could not obtain LMC for unit %u:%u. Error: %s",
+					proto->ep->context.ctrl->__hfi_unit,
+					proto->ep->context.ctrl->__hfi_port,
+					strerror(errno));
+		goto fail;
+	}
+	/* Cap the LMC to the number of path bits PSM supports. */
+	proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC);
+
+	if ((ret = hfi_get_port_rate(proto->ep->context.ctrl->__hfi_unit,
+				     proto->ep->context.ctrl->__hfi_port)) <
+	    0) {
+		/* Fixed message: was "Could obtain link rate..." */
+		err =
+		    psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+				      "Could not obtain link rate for unit %u:%u. Error: %s",
+				      proto->ep->context.ctrl->__hfi_unit,
+				      proto->ep->context.ctrl->__hfi_port,
+				      strerror(errno));
+		goto fail;
+	}
+	proto->epinfo.ep_link_rate = ips_rate_to_enum(ret);
+
+	/* Load the SL2SC2VL table (always returns PSM2_OK). */
+	ips_ibta_init_sl2sc2vl_table(proto);
+
+	/* Regenerate new IPD table for the updated link rate. */
+	ips_gen_ipd_table(proto);
+
+	/* Generate the CCT table. */
+	err = ips_gen_cct_table(proto);
+
+fail:
+	return err;
+}
+
+/*
+ * One-time IBTA-related setup for a proto instance: select the path
+ * selection policy, configure CCA (Congestion Control Architecture) from
+ * driver-provided settings when available, create the path record/group
+ * hash tables, seed link state, and install the path record query backend.
+ */
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	union psmi_envvar_val psm_path_policy;
+	union psmi_envvar_val disable_cca;
+	union psmi_envvar_val cca_prescan;
+
+	/* Get the path selection policy */
+	psmi_getenv("PSM2_PATH_SELECTION",
+		    "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base. Default is adaptive.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)"adaptive", &psm_path_policy);
+
+	if (!strcasecmp((const char *)psm_path_policy.e_str, "adaptive"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE;
+	else if (!strcasecmp((const char *)psm_path_policy.e_str, "static_src"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC;
+	else if (!strcasecmp
+		 ((const char *)psm_path_policy.e_str, "static_dest"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST;
+	else if (!strcasecmp
+		 ((const char *)psm_path_policy.e_str, "static_base"))
+		proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE)
+		_HFI_PRDBG("Using adaptive path selection.\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		_HFI_PRDBG("Static path selection: Src Context\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		_HFI_PRDBG("Static path selection: Dest Context\n");
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		_HFI_PRDBG("Static path selection: Base LID\n");
+
+	/* Fixed typo in help text: "Architecure" -> "Architecture". */
+	psmi_getenv("PSM2_DISABLE_CCA",
+		    "Disable use of Congestion Control Architecture (CCA) [enabled] ",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &disable_cca);
+	if (disable_cca.e_uint)
+		_HFI_CCADBG("CCA is disabled for congestion control.\n");
+	else {
+		int i;
+		char ccabuf[256];
+		uint8_t *p;
+
+		proto->flags |= IPS_PROTO_FLAG_CCA;
+/*
+ * If user set any environment variable, use self CCA.
+ */
+		if (getenv("PSM2_CCTI_INCREMENT") || getenv("PSM2_CCTI_TIMER")
+		    || getenv("PSM2_CCTI_TABLE_SIZE")) {
+			goto disablecca;
+		}
+
+		psmi_getenv("PSM2_CCA_PRESCAN",
+			    "Enable Congestion Control Prescanning (disabled by default) ",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			    (union psmi_envvar_val)0, &cca_prescan);
+
+		if (cca_prescan.e_uint)
+			proto->flags |= IPS_PROTO_FLAG_CCA_PRESCAN;
+
+/*
+ * Check qib driver CCA setting, and try to use it if available.
+ * Fall to self CCA setting if errors.
+ */
+		i = hfi_get_cc_settings_bin(proto->ep->context.ctrl->__hfi_unit,
+					    proto->ep->context.ctrl->__hfi_port,
+					    ccabuf);
+		if (i <= 0) {
+			goto disablecca;
+		}
+		/* Binary layout: 4-byte ctrlmap, 2-byte portctrl, then 32
+		 * congestion entries of 6 bytes each
+		 * (increase, reserved, timer[2], threshold, min). */
+		p = (uint8_t *) ccabuf;
+		memcpy(&proto->ccti_ctrlmap, p, 4);
+		p += 4;
+		memcpy(&proto->ccti_portctrl, p, 2);
+		p += 2;
+		for (i = 0; i < 32; i++) {
+			proto->cace[i].ccti_increase = *p;
+			p++;
+			/* skip reserved u8 */
+			p++;
+			memcpy(&proto->cace[i].ccti_timer_cycles, p, 2);
+			p += 2;
+			proto->cace[i].ccti_timer_cycles =
+			    us_2_cycles(proto->cace[i].ccti_timer_cycles);
+			proto->cace[i].ccti_threshold = *p;
+			p++;
+			proto->cace[i].ccti_min = *p;
+			p++;
+		}
+
+		/* Returns the CCT limit (>0), 0 when no table, <0 on error. */
+		i = hfi_get_cc_table_bin(proto->ep->context.ctrl->__hfi_unit,
+					 proto->ep->context.ctrl->__hfi_port,
+					 &proto->cct);
+		if (i < 0) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		} else if (i == 0) {
+			goto disablecca;
+		}
+		proto->ccti_limit = i;
+		proto->ccti_size = proto->ccti_limit + 1;
+
+		_HFI_CCADBG("ccti_limit = %d\n", (int) proto->ccti_limit);
+		for (i = 0; i < proto->ccti_limit; i++)
+			_HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]);
+
+
+		goto finishcca;
+
+/*
+ * Disable CCA.
+ */
+disablecca:
+		proto->flags &= ~IPS_PROTO_FLAG_CCA;
+		proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+	}
+
+finishcca:
+	/* Initialize path record/group hash table.
+	 * NOTE(review): hcreate_r returns 0 on failure; results are not
+	 * checked here — confirm an allocation failure is acceptable to
+	 * surface later via hsearch_r misses. */
+	hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash);
+	hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash);
+
+	/* On startup treat it as a link up/down event to setup state . */
+	if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK)
+		goto fail;
+
+	/* Setup the appropriate query interface for the endpoint */
+	switch (proto->ep->path_res_type) {
+	case PSM2_PATH_RES_OPP:
+		err = ips_opp_init(proto);
+		if (err != PSM2_OK)
+			_HFI_ERROR
+			    ("Unable to use OFED Plus Plus for path record queries.\n");
+		break;
+	case PSM2_PATH_RES_UMAD:
+		_HFI_ERROR
+		    ("Path record queries using UMAD is not supported in PSM version %d.%dx\n",
+		     PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+		err = PSM2_EPID_PATH_RESOLUTION;
+		break;
+	case PSM2_PATH_RES_NONE:
+	default:
+		err = ips_none_path_rec_init(proto);
+	}
+
+fail:
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_ibta_init);
+
+/* Tear down IBTA state: run the backend's fini hook (if any), then destroy
+ * the path record/group hash tables created in ips_ibta_init(). */
+psm2_error_t ips_ibta_fini(struct ips_proto *proto)
+{
+	psm2_error_t rv = PSM2_OK;
+
+	/* Give the active path record backend a chance to clean up first. */
+	if (proto->ibta.fini != NULL)
+		rv = proto->ibta.fini(proto);
+
+	/* Destroy the path record/group hash */
+	hdestroy_r(&proto->ips_path_rec_hash);
+	hdestroy_r(&proto->ips_path_grp_hash);
+
+	return rv;
+}
diff --git a/ptl_ips/ips_path_rec.h b/ptl_ips/ips_path_rec.h
new file mode 100644
index 0000000..21cbef5
--- /dev/null
+++ b/ptl_ips/ips_path_rec.h
@@ -0,0 +1,185 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. */
+
+
+#ifndef _IPS_PATH_REC_H_
+#define _IPS_PATH_REC_H_
+
+#include <search.h>
+
+/* Default size of path record hash table */
+#define DF_PATH_REC_HASH_SIZE 2047
+
+/* Default size of path group hash table */
+#define DF_PATH_GRP_HASH_SIZE 255
+
+/* Default size of CCT table. Must be multiple of 64 */
+#define DF_CCT_TABLE_SIZE 128
+
+/* CCT max IPD delay. */
+#define DF_CCT_MAX_IPD_DELAY_US 21
+
+/* CCA divisor shift */
+#define CCA_DIVISOR_SHIFT 14
+
+/* CCA ipd mask */
+#define CCA_IPD_MASK 0x3FFF
+
+/* A lot of these are IBTA specific defines that are available in other header
+ * files. To minimize dependencies with PSM build process they are listed
+ * here. Most of this is used to implement IBTA compliance features with PSM
+ * like path record query etc.
+ */
+
+/* Wire encodings for MTU sizes: values 1-5 are the IBTA encodings
+ * (256..4096 bytes); 6 and 7 are OPA extensions (8192 and 10240). */
+enum opa_mtu {
+	IBTA_MTU_256 = 1,
+	IBTA_MTU_512 = 2,
+	IBTA_MTU_1024 = 3,
+	IBTA_MTU_2048 = 4,
+	IBTA_MTU_4096 = 5,
+	OPA_MTU_8192 = 6,	/* OPA-only */
+	OPA_MTU_10240 = 7,	/* OPA-only */
+	IBTA_MTU_MIN = IBTA_MTU_256,
+	OPA_MTU_MIN = IBTA_MTU_256,
+	OPA_MTU_MAX = OPA_MTU_10240,
+};
+
+/* Static link rate encodings, mirroring the verbs enum ibv_rate values
+ * (names are the nominal link speeds in Gbps). */
+typedef enum {
+	IBV_RATE_MAX = 0,
+	IBV_RATE_2_5_GBPS = 2,
+	IBV_RATE_5_GBPS = 5,
+	IBV_RATE_10_GBPS = 3,
+	IBV_RATE_20_GBPS = 6,
+	IBV_RATE_30_GBPS = 4,
+	IBV_RATE_40_GBPS = 7,
+	IBV_RATE_60_GBPS = 8,
+	IBV_RATE_80_GBPS = 9,
+	IBV_RATE_120_GBPS = 10,
+	IBV_RATE_14_GBPS = 11,
+	IBV_RATE_56_GBPS = 12,
+	IBV_RATE_112_GBPS = 13,
+	IBV_RATE_168_GBPS = 14,
+	IBV_RATE_25_GBPS = 15,
+	IBV_RATE_100_GBPS = 16,
+	IBV_RATE_200_GBPS = 17,
+	IBV_RATE_300_GBPS = 18
+} opa_rate;
+
+/* Convert an IBTA/OPA MTU encoding to its size in bytes; -1 if the
+ * encoding is not a recognized value. */
+static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
+{
+	/* Byte sizes indexed by the wire encoding (1..7); slot 0 unused. */
+	static const int mtu_bytes[] = {
+		0, 256, 512, 1024, 2048, 4096, 8192, 10240
+	};
+
+	if (mtu < IBTA_MTU_MIN || mtu > OPA_MTU_MAX)
+		return -1;
+	return mtu_bytes[mtu];
+}
+
+/* This is same as ib_path_rec from ib_types.h. Listed here to be self
+ * contained to minimize dependencies during build etc.
+ * All multi-byte fields marked "net order" are big-endian on the wire.
+ */
+typedef struct _ibta_path_rec {
+	uint64_t service_id;	/* net order */
+	uint8_t dgid[16];	/* destination GID (IBTA layout) */
+	uint8_t sgid[16];	/* source GID (IBTA layout) */
+	uint16_t dlid;		/* net order */
+	uint16_t slid;		/* net order */
+	uint32_t hop_flow_raw;	/* net order */
+	uint8_t tclass;
+	uint8_t num_path;
+	uint16_t pkey;		/* net order */
+	uint16_t qos_class_sl;	/* net order */
+	uint8_t mtu;		/* IBTA encoded */
+	uint8_t rate;		/* IBTA encoded */
+	uint8_t pkt_life;	/* IBTA encoded */
+	uint8_t preference;
+	uint8_t resv2[6];	/* reserved/padding */
+} ibta_path_rec_t;
+
+/*
+ * PSM IPS path record components for endpoint.
+ * One instance per (slid, dlid) pair, cached in proto->ips_path_rec_hash.
+ */
+struct ips_proto;
+typedef struct ips_path_rec {
+	uint16_t pr_slid;	/* For Torus/non zero LMC fabrics this can be diff */
+	uint16_t pr_dlid;
+	uint16_t pr_mtu;	/* < Path's MTU */
+	uint16_t pr_pkey;
+	uint16_t pr_static_ipd;	/* Static rate IPD from path record */
+	uint8_t pr_sl;		/* service level for this path */
+
+	/* IBTA CCA parameters per path */
+	uint8_t pr_cca_divisor;	/* CCA divisor [14:15] in CCT entry */
+	uint16_t pr_active_ipd;	/* The current active IPD. max(static,cct) */
+	uint16_t pr_ccti;	/* CCA table index */
+	psmi_timer *pr_timer_cca;	/* Congestion timer for epr_ccti increment. */
+	struct ips_proto *proto;	/* for global info */
+} ips_path_rec_t;
+
+psm2_error_t ips_opp_init(struct ips_proto *proto);
+
+#endif
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
new file mode 100644
index 0000000..150bda1
--- /dev/null
+++ b/ptl_ips/ips_proto.c
@@ -0,0 +1,2348 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * IPS - Interconnect Protocol Stack.
+ */
+
+#include <assert.h>
+#include <sys/uio.h> /* writev */
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_proto_help.h"
+#include "psmi_wrappers.h"
+
+/*
+ * Control message types have their own flag to determine whether a message of
+ * that type is queued or not. These flags are kept in a state bitfield.
+ */
+#define CTRL_MSG_ACK_QUEUED 0x0001
+#define CTRL_MSG_NAK_QUEUED 0x0002
+#define CTRL_MSG_BECN_QUEUED 0x0004
+#define CTRL_MSG_ERR_CHK_QUEUED 0x0008
+#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x0010
+#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020
+#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040
+#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080
+#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100
+
+#ifdef PSM_CUDA
+uint32_t gpudirect_send_threshold;
+uint32_t gpudirect_recv_threshold;
+#endif
+
+static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto);
+static psm2_error_t proto_sdma_init(struct ips_proto *proto,
+ const psmi_context_t *context);
+
+#ifdef PSM_CUDA
+/*
+ * mpool constructor/destructor callback for CUDA host bounce buffers.
+ * is_alloc != 0: allocate a pinned (portable) host buffer of ctxt->bufsz
+ * bytes and a CUDA event for tracking async copies into it.
+ * is_alloc == 0: release both, if the buffer was ever allocated.
+ */
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj)
+{
+	struct ips_cuda_hostbuf *icb;
+	struct ips_cuda_hostbuf_mpool_cb_context *ctxt =
+		(struct ips_cuda_hostbuf_mpool_cb_context *) context;
+
+	icb = (struct ips_cuda_hostbuf *)obj;
+	if (is_alloc) {
+		PSMI_CUDA_CALL(cudaHostAlloc,
+			       (void **) &icb->host_buf,
+			       ctxt->bufsz,
+			       cudaHostAllocPortable);
+		PSMI_CUDA_CALL(cudaEventCreate, &icb->copy_status);
+	} else {
+		/* host_buf may be NULL if allocation never ran/failed. */
+		if (icb->host_buf) {
+			PSMI_CUDA_CALL(cudaFreeHost, icb->host_buf);
+			PSMI_CUDA_CALL(cudaEventDestroy, icb->copy_status);
+		}
+	}
+	return;
+}
+#endif
+
+psm2_error_t
+ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
+ int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size,
+ const struct psmi_timer_ctrl *timerq,
+ const struct ips_epstate *epstate,
+ const struct ips_spio *spioc, struct ips_proto *proto)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+ uint32_t protoexp_flags, cksum_sz;
+ union psmi_envvar_val env_tid, env_cksum, env_mtu;
+ psm2_error_t err = PSM2_OK;
+
+ /*
+ * Checksum packets within PSM. Default is off.
+ * This is heavy weight and done in software so not recommended for
+ * production runs.
+ */
+
+ psmi_getenv("PSM2_CHECKSUM",
+ "Enable checksum of messages (0 disables checksum)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, &env_cksum);
+
+ memset(proto, 0, sizeof(struct ips_proto));
+ proto->ptl = (ptl_t *) ptl;
+ proto->ep = context->ep; /* cached */
+ proto->mq = context->ep->mq; /* cached */
+ proto->fd = context->fd; /* cached */
+ proto->pend_sends.proto = proto;
+ psmi_timer_entry_init(&proto->pend_sends.timer,
+ ips_proto_timer_pendq_callback,
+ &proto->pend_sends);
+ STAILQ_INIT(&proto->pend_sends.pendq);
+ proto->epstate = (struct ips_epstate *)epstate;
+ proto->timerq = (struct psmi_timer_ctrl *)timerq;
+ proto->spioc = (struct ips_spio *)spioc;
+
+ proto->epinfo.ep_baseqp = base_info->bthqp;
+ proto->epinfo.ep_context = ctxt_info->ctxt; /* "real" context */
+ proto->epinfo.ep_subcontext = ctxt_info->subctxt;
+ proto->epinfo.ep_hfi_type = psmi_get_hfi_type(context);
+ proto->epinfo.ep_jkey = base_info->jkey;
+
+ /* If checksums enabled we insert checksum at end of packet */
+ cksum_sz = env_cksum.e_uint ? PSM_CRC_SIZE_IN_BYTES : 0;
+ proto->epinfo.ep_mtu = context->ep->mtu;
+ /* Decrement checksum */
+ proto->epinfo.ep_mtu -= cksum_sz;
+
+ /* See if user specifies a lower MTU to use */
+ if (!psmi_getenv
+ ("PSM2_MTU", "MTU specified by user: 1-7,256-8192,10240]",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_mtu)) {
+ if (env_mtu.e_int != 256 && env_mtu.e_int != 512
+ && env_mtu.e_int != 1024 && env_mtu.e_int != 2048
+ && env_mtu.e_int != 4096 && env_mtu.e_int != 8192
+ && env_mtu.e_int != 10240) {
+ if (env_mtu.e_int < OPA_MTU_MIN ||
+ env_mtu.e_int > OPA_MTU_MAX)
+ env_mtu.e_int = OPA_MTU_8192;
+ env_mtu.e_int =
+ opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int);
+ }
+ if (proto->epinfo.ep_mtu > env_mtu.e_int)
+ proto->epinfo.ep_mtu = env_mtu.e_int;
+ }
+
+ /*
+ * The PIO size should not include the ICRC because it is
+ * stripped by HW before delivering to receiving buffer.
+ * We decide to use minimum 2 PIO buffers so that PSM has
+ * turn-around time to do PIO transfer. Each credit is a
+ * block of 64 bytes. Also PIO buffer size must not be
+ * bigger than MTU.
+ */
+ proto->epinfo.ep_piosize = (ctxt_info->credits / 2) * 64 -
+ (sizeof(struct ips_message_header) + HFI_PCB_SIZE_IN_BYTES +
+ cksum_sz);
+ proto->epinfo.ep_piosize =
+ min(proto->epinfo.ep_piosize, proto->epinfo.ep_mtu);
+
+ /* Keep PIO as multiple of cache line size */
+ if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES)
+ proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1);
+
+ /* Save back to hfi level. */
+ context->ctrl->__hfi_mtusize = proto->epinfo.ep_mtu;
+ context->ctrl->__hfi_piosize = proto->epinfo.ep_piosize;
+
+ /* sdma completion queue */
+ proto->sdma_comp_queue =
+ (struct hfi1_sdma_comp_entry *) base_info->sdma_comp_bufbase;
+ proto->sdma_queue_size = ctxt_info->sdma_ring_size;
+ /* don't use the last slot */
+
+ {
+ /* configure sdma_avail_counter */
+ union psmi_envvar_val env_sdma_avail;
+ int tmp_queue_size = proto->sdma_queue_size - 1;
+
+ psmi_getenv("PSM2_MAX_PENDING_SDMA_REQS",
+ "PSM maximum pending SDMA requests",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val) tmp_queue_size,
+ &env_sdma_avail);
+
+ if ((env_sdma_avail.e_int < 8) || (env_sdma_avail.e_int > proto->sdma_queue_size - 1))
+ proto->sdma_avail_counter = proto->sdma_queue_size - 1;
+ else
+ proto->sdma_avail_counter = env_sdma_avail.e_int;
+ }
+
+
+ proto->sdma_fill_index = 0;
+ proto->sdma_done_index = 0;
+ proto->sdma_scb_queue = (struct ips_scb **)
+ psmi_calloc(proto->ep, UNDEFINED,
+ proto->sdma_queue_size, sizeof(struct ips_scb *));
+ if (proto->sdma_scb_queue == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT);
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U;
+ proto->t_init = get_cycles();
+ proto->t_fini = 0;
+ proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0;
+ proto->runid_key = getpid();
+
+ proto->num_connected_outgoing = 0;
+ proto->num_connected_incoming = 0;
+ proto->num_disconnect_requests = 0;
+ proto->stray_warn_interval = (uint64_t) -1;
+ proto->done_warning = 0;
+ proto->done_once = 0;
+ proto->num_bogus_warnings = 0;
+ proto->psmi_logevent_tid_send_reqs.interval_secs = 15;
+ proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+ proto->psmi_logevent_tid_send_reqs.count = 0;
+#ifdef PSM_CUDA
+ /*
+ * We will need to add two extra bytes to iov_len
+ * when passing sdma hdr info to driver due to
+ * the new flags member in the struct.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ proto->ips_extra_sdmahdr_size = sizeof(struct sdma_req_info) -
+ sizeof(struct sdma_req_info_v6_3);
+ else
+#endif
+ if (sizeof(struct sdma_req_info) != sizeof(struct sdma_req_info_v6_3))
+ proto->ips_extra_sdmahdr_size = sizeof(struct sdma_req_info) -
+ sizeof(struct sdma_req_info_v6_3);
+ else
+ proto->ips_extra_sdmahdr_size = 0;
+
+ /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */
+ if ((err = ips_ibta_init(proto)))
+ goto fail;
+
+ {
+ /* User asks for HFI loopback? */
+ union psmi_envvar_val env_loopback;
+
+ psmi_getenv("PSM2_HFI_LOOPBACK",
+ "PSM uses HFI loopback (default is disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_loopback);
+
+ if (env_loopback.e_uint)
+ proto->flags |= IPS_PROTO_FLAG_LOOPBACK;
+ }
+
+ {
+ /* Disable coalesced ACKs? */
+ union psmi_envvar_val env_coalesce_acks;
+
+ psmi_getenv("PSM2_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, /* Enabled by default */
+ &env_coalesce_acks);
+
+ if (env_coalesce_acks.e_uint)
+ proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS;
+ }
+
+ {
+ /* Number of credits per flow */
+ union psmi_envvar_val env_flow_credits;
+ int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc);
+
+ psmi_getenv("PSM2_FLOW_CREDITS",
+ "Number of unacked packets (credits) per flow (default is 64)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)df_flow_credits,
+ &env_flow_credits);
+ proto->flow_credits = env_flow_credits.e_uint;
+ }
+
+ /*
+ * Pre-calculate the PSN mask to support 24 or 31 bits PSN.
+ */
+ if ((context->runtime_flags & HFI1_CAP_EXTENDED_PSN)) {
+ proto->psn_mask = 0x7FFFFFFF;
+ } else {
+ proto->psn_mask = 0xFFFFFF;
+ }
+
+ /*
+ * Initialize SDMA, otherwise, turn on all PIO.
+ */
+ if ((context->runtime_flags & HFI1_CAP_SDMA)) {
+ if ((err = proto_sdma_init(proto, context)))
+ goto fail;
+ } else {
+ proto->flags |= IPS_PROTO_FLAG_SPIO;
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ ~0U;
+ }
+
+ /*
+ * Setup the protocol wide short message ep flow.
+ */
+ if (proto->flags & IPS_PROTO_FLAG_SDMA) {
+ proto->msgflowid = EP_FLOW_GO_BACK_N_DMA;
+ } else {
+ proto->msgflowid = EP_FLOW_GO_BACK_N_PIO;
+ }
+
+ /*
+ * Clone sendreq mpool configuration for pend sends config
+ */
+ {
+ uint32_t chunks, maxsz;
+
+ psmi_assert_always(proto->ep->mq->sreq_pool != NULL);
+ psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks,
+ &maxsz);
+
+ proto->pend_sends_pool =
+ psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks,
+ maxsz, 0, DESCRIPTORS, NULL, NULL);
+ if (proto->pend_sends_pool == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ }
+
+ /*
+ * Create a pool of CCA timers for path_rec. The timers should not
+ * exceed the scb number num_of_send_desc(default 4K).
+ */
+ {
+ uint32_t chunks, maxsz;
+
+ chunks = 256;
+ maxsz = num_of_send_desc;
+
+ proto->timer_pool =
+ psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz,
+ 0, DESCRIPTORS, NULL, NULL);
+ if (proto->timer_pool == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ }
+
+ /*
+ * Register ips protocol statistics
+ *
+ * We put a (*) in the output to denote stats that may cause a drop in
+ * performance.
+ *
+ * We put a (**) in the output of those stats that "should never happen"
+ */
+ {
+ struct psmi_stats_entry entries[] = {
+ PSMI_STATS_DECLU64("pio busy count",
+ &proto->stats.pio_busy_cnt),
+ /* Throttling by kernel */
+ PSMI_STATS_DECLU64("writev busy cnt",
+ &proto->stats.writev_busy_cnt),
+ /* When local dma completion is in the way... */
+ PSMI_STATS_DECLU64("writev compl. eagain",
+ &proto->stats.writev_compl_eagain),
+ /* When remote completion happens before local completion */
+ PSMI_STATS_DECLU64("writev compl. delay (*)",
+ &proto->stats.writev_compl_delay),
+ PSMI_STATS_DECLU64("scb unavail eager count",
+ &proto->stats.scb_egr_unavail_cnt),
+ PSMI_STATS_DECLU64("scb unavail exp count",
+ &proto->stats.scb_exp_unavail_cnt),
+ PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */
+ &proto->stats.hdr_overflow),
+ PSMI_STATS_DECLU64("rcveager overflows",
+ &proto->stats.egr_overflow),
+ PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */
+ &proto->stats.lid_zero_errs),
+ PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */
+ &proto->stats.unknown_packets),
+ PSMI_STATS_DECLU64("stray packets (*)",
+ &proto->stats.stray_packets),
+ PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */
+ &proto->spioc->spio_num_stall_total),
+ PSMI_STATS_DECLU64("ICRC error (*)",
+ &proto->error_stats.num_icrc_err),
+ PSMI_STATS_DECLU64("ECC error ",
+ &proto->error_stats.num_ecc_err),
+ PSMI_STATS_DECLU64("Len error",
+ &proto->error_stats.num_len_err),
+ PSMI_STATS_DECLU64("TID error ",
+ &proto->error_stats.num_tid_err),
+ PSMI_STATS_DECLU64("DC error ",
+ &proto->error_stats.num_dc_err),
+ PSMI_STATS_DECLU64("DCUNC error ",
+ &proto->error_stats.num_dcunc_err),
+ PSMI_STATS_DECLU64("KHDRLEN error ",
+ &proto->error_stats.num_khdrlen_err),
+
+ };
+
+ err =
+ psmi_stats_register_type
+ ("OPA low-level protocol stats",
+ PSMI_STATSTYPE_IPSPROTO, entries,
+ PSMI_STATS_HOWMANY(entries), NULL);
+ if (err != PSM2_OK)
+ goto fail;
+ }
+
+ /*
+ * Control Queue and messaging
+ */
+ ctrlq_init(&proto->ctrlq, proto);
+
+ /*
+ * Receive-side handling
+ */
+ if ((err = ips_proto_recv_init(proto)))
+ goto fail;
+
+ /*
+ * Eager buffers. We don't care to receive a callback when eager buffers
+ * are newly released since we actively poll for new bufs.
+ */
+ {
+ /* configure PSM bounce buffer size */
+ union psmi_envvar_val env_bbs;
+
+ psmi_getenv("PSM2_BOUNCE_SZ",
+ "PSM bounce buffer size (default is 8192B)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)8192,
+ &env_bbs);
+
+ proto->scb_bufsize = env_bbs.e_uint;
+ }
+
+ if ((err = ips_scbctrl_init(context, num_of_send_desc,
+ num_of_send_bufs, imm_size,
+ proto->scb_bufsize, NULL, NULL,
+ &proto->scbc_egr)))
+ goto fail;
+
+ /*
+ * Expected protocol handling.
+ * If we enable tid-based expected rendezvous, the expected protocol code
+ * handles its own rv scb buffers. If not, we have to enable eager-based
+ * rendezvous and we allocate scb buffers for it.
+ */
+ psmi_getenv("PSM2_TID",
+ "Tid proto flags (0 disables protocol)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)IPS_PROTOEXP_FLAGS_DEFAULT,
+ &env_tid);
+ protoexp_flags = env_tid.e_uint;
+
+ if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED) {
+ if (cuda_runtime_version >= 7000) {
+ PSMI_CUDA_CALL(cudaStreamCreateWithFlags,
+ &proto->cudastream_send, cudaStreamNonBlocking);
+ } else {
+ PSMI_CUDA_CALL(cudaStreamCreate,
+ &proto->cudastream_send);
+ }
+ }
+#endif
+ proto->scbc_rv = NULL;
+ if ((err = ips_protoexp_init(context, proto, protoexp_flags,
+ num_of_send_bufs, num_of_send_desc,
+ &proto->protoexp)))
+ goto fail;
+ } else {
+ proto->protoexp = NULL;
+ proto->scbc_rv = (struct ips_scbctrl *)
+ psmi_calloc(proto->ep, DESCRIPTORS,
+ 1, sizeof(struct ips_scbctrl));
+ if (proto->scbc_rv == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /*
+ * Rendezvous buffers. We want to get a callback for rendezvous bufs
+ * since we asynchronously try to make progress on these sends and only
+ * schedule them on the timerq if there are pending sends and available
+ * bufs.
+ */
+ if ((err =
+ ips_scbctrl_init(context, num_of_send_desc,
+ 0 /* no bufs */ ,
+ 0, 0 /* bufsize==0 */ ,
+ ips_proto_rv_scbavail_callback,
+ proto, proto->scbc_rv)))
+ goto fail;
+ }
+
+ /*
+ * Parse the tid error settings from the environment.
+ * <interval_secs>:<max_count_before_exit>
+ */
+ {
+ int tvals[2];
+ char *tid_err;
+ union psmi_envvar_val env_tiderr;
+
+ tid_err = "-1:0"; /* no tiderr warnings, never exits */
+ tvals[0] = -1;
+ tvals[1] = 0;
+
+ if (!psmi_getenv("PSM2_TID_ERROR",
+ "Tid error control <intval_secs:max_errors>",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)tid_err, &env_tiderr)) {
+ /* not using default values */
+ tid_err = env_tiderr.e_str;
+ psmi_parse_str_tuples(tid_err, 2, tvals);
+ }
+ if (tvals[0] >= 0)
+ proto->tiderr_warn_interval = sec_2_cycles(tvals[0]);
+ else
+ proto->tiderr_warn_interval = UINT64_MAX;
+ proto->tiderr_max = tvals[1];
+ _HFI_PRDBG("Tid error control: warning every %d secs%s, "
+ "fatal error after %d tid errors%s\n",
+ tvals[0], (tvals[0] < 0) ? " (no warnings)" : "",
+ tvals[1], (tvals[1] == 0) ? " (never fatal)" : "");
+ }
+
+ /* Active Message interface. AM requests compete with MQ for eager
+ * buffers, since request establish the amount of buffering in the
+ * network (maximum number of requests in flight). The AM init function
+ * does not allow the number of send buffers to be set separately from
+ * the number of send descriptors, because otherwise it would have to
+ * impose extremely arcane constraints on the relative amounts to avoid
+ * a deadlock scenario. Thus, it handles it internally. The constraint
+ * is: In a node pair, the number of reply send buffers on at least one
+ * of the nodes must be at least double the number (optimal: double + 1)
+ * of send descriptors on the other node. */
+ if ((err = ips_proto_am_init(proto,
+ min(num_of_send_bufs, num_of_send_desc),
+ imm_size,
+ &proto->proto_am)))
+ goto fail;
+
+#if 0
+ if (!host_pid) {
+ char ipbuf[INET_ADDRSTRLEN], *p;
+ host_pid = (uint32_t) getpid();
+ host_ipv4addr = psmi_get_ipv4addr(); /* already be */
+ if (host_ipv4addr == 0) {
+ _HFI_DBG("Unable to obtain local IP address, "
+ "not fatal but some features may be disabled\n");
+ } else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) {
+ _HFI_INFO("Localhost IP address is set to the "
+ "loopback address 127.0.0.1, "
+ "not fatal but some features may be disabled\n");
+ } else {
+ p = (char *)inet_ntop(AF_INET,
+ (const void *)&host_ipv4addr,
+ ipbuf, sizeof(ipbuf));
+ _HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p,
+ host_pid);
+ }
+
+ /* Store in big endian for use in ERR_CHK */
+ host_pid = __cpu_to_be32(host_pid);
+ }
+#endif
+#ifdef PSM_CUDA
+ union psmi_envvar_val env_gpudirect_rdma;
+ psmi_getenv("PSM2_GPUDIRECT",
+ "Use GPUDirect RDMA support to allow the HFI to directly read"
+ " from the GPU for SDMA. Requires driver support.(default is "
+ " disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma);
+
+ /* Default Send threshold for Gpu-direct set to 30000 */
+ union psmi_envvar_val env_gpudirect_send_thresh;
+ psmi_getenv("PSM2_GPUDIRECT_SEND_THRESH",
+ "Threshold to switch off Gpu-Direct feature on send side",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)30000, &env_gpudirect_send_thresh);
+ gpudirect_send_threshold = env_gpudirect_send_thresh.e_uint;
+
+ union psmi_envvar_val env_gpudirect_recv_thresh;
+ psmi_getenv("PSM2_GPUDIRECT_RECV_THRESH",
+ "Threshold to switch off Gpu-Direct feature on receive side",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &env_gpudirect_recv_thresh);
+ gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint;
+
+ if (env_gpudirect_rdma.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ /* All pio, No SDMA*/
+ (proto->flags & IPS_PROTO_FLAG_SPIO) ||
+ !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) ||
+ !PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Requires hfi1 driver with GPU-Direct feature enabled.\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV;
+ } else {
+ /* The following environment variables are here for internal
+ * experimentation and will not be documented for any customers.
+ */
+ /* Use GPUDirect RDMA for SDMA send? */
+ union psmi_envvar_val env_gpudirect_rdma_send;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " read from the GPU for SDMA. Requires driver"
+ " support.(default is disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_send);
+
+ if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ /* All pio, No SDMA*/
+ (proto->flags & IPS_PROTO_FLAG_SPIO))
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unable to start run as PSM would require cuda, sdma"
+ "and TID support\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
+ }
+ /* Use GPUDirect RDMA for recv? */
+ union psmi_envvar_val env_gpudirect_rdma_recv;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " write into GPU. Requires driver support.(default is"
+ " disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_recv);
+
+ if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) {
+ if (!PSMI_IS_CUDA_ENABLED ||
+ !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED))
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unable to start run as PSM would require cuda,"
+ " sdma and TID support\n");
+ proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV;
+ }
+ }
+
+ if (PSMI_IS_CUDA_ENABLED &&
+ (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) {
+ struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+ uint32_t maxsz, chunksz, max_elements;
+
+ if ((err = psmi_parse_mpool_env(proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ /* the maxsz is the amount in MB, not the number of entries,
+ * since the element size depends on the window size */
+ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+ /* mpool requires max_elements to be power of 2. round down. */
+ max_elements = 1 << (31 - __builtin_clz(max_elements));
+ proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv;
+ proto->cuda_hostbuf_pool_send =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &proto->cuda_hostbuf_send_cfg);
+
+ if (proto->cuda_hostbuf_pool_send == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host send buffer pool");
+ goto fail;
+ }
+
+ /* use the same number of elements for the small pool */
+ proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ;
+ proto->cuda_hostbuf_pool_small_send =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &proto->cuda_hostbuf_small_send_cfg);
+
+ if (proto->cuda_hostbuf_pool_small_send == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host small send buffer pool");
+ goto fail;
+ }
+
+ /* Configure the amount of prefetching */
+ union psmi_envvar_val env_prefetch_limit;
+
+ psmi_getenv("PSM2_CUDA_PREFETCH_LIMIT",
+ "How many TID windows to prefetch at RTS time(default is 2)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT,
+ &env_prefetch_limit);
+ proto->cuda_prefetch_limit = env_prefetch_limit.e_uint;
+ }
+#endif
+fail:
+ return err;
+}
+
+/*
+ * Finalize the ips protocol instance.
+ *
+ * Disconnects any peers still connected on this ptl, then busy-waits
+ * through a configurable "grace" period so that late traffic from peers
+ * (incoming disconnects) can still be processed, and finally tears down
+ * the protocol sub-components (ibta, active messages, scb controllers,
+ * receive side, mpools).
+ *
+ * proto      - protocol instance being finalized
+ * force      - passed through to ips_proto_disconnect()
+ * timeout_in - overall close time-out in cycles; 0 propagates the
+ *              infinite time-out case
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY if the temporary
+ * disconnect bookkeeping arrays cannot be allocated, or the first
+ * error returned by a fini helper.
+ */
+psm2_error_t
+ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in)
+{
+ struct psmi_eptab_iterator itor;
+ uint64_t t_start;
+ uint64_t t_grace_start, t_grace_time, t_grace_interval;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ int i;
+ union psmi_envvar_val grace_intval;
+
+ psmi_getenv("PSM2_CLOSE_GRACE_PERIOD",
+ "Additional grace period in seconds for closing end-point.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &grace_intval);
+
+ /* The env var only takes effect when explicitly set; otherwise the
+ * grace period defaults to half the close time-out (or infinite). */
+ if (getenv("PSM2_CLOSE_GRACE_PERIOD")) {
+ t_grace_time = grace_intval.e_uint * SEC_ULL;
+ } else if (timeout_in > 0) {
+ /* default to half of the close time-out */
+ t_grace_time = timeout_in / 2;
+ } else {
+ /* propagate the infinite time-out case */
+ t_grace_time = 0;
+ }
+
+ if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT)
+ t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+ /* At close we will busy wait for the grace interval to see if any
+ * receive progress is made. If progress is made we will wait for
+ * another grace interval, until either no progress is made or the
+ * entire grace period has passed. If the grace interval is too low
+ * we may miss traffic and exit too early. If the grace interval is
+ * too large the additional time spent while closing the program
+ * will become visible to the user. */
+ psmi_getenv("PSM2_CLOSE_GRACE_INTERVAL",
+ "Grace interval in seconds for closing end-point.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)0, &grace_intval);
+
+ if (getenv("PSM2_CLOSE_GRACE_INTERVAL")) {
+ t_grace_interval = grace_intval.e_uint * SEC_ULL;
+ } else {
+ /* A heuristic is used to scale up the timeout linearly with
+ * the number of endpoints, and we allow one second per 1000
+ * endpoints. */
+ t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000;
+ }
+
+ /* Clamp the interval into its sane operating range */
+ if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL)
+ t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL;
+ if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL)
+ t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ t_start = proto->t_fini = get_cycles();
+
+ /* Close whatever has been left open */
+ if (proto->num_connected_outgoing > 0) {
+ int num_disc = 0;
+ int *mask;
+ psm2_error_t *errs;
+ psm2_epaddr_t *epaddr_array;
+
+ /* First pass: count endpoints on this ptl to size the arrays */
+ psmi_epid_itor_init(&itor, proto->ep);
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ if (epaddr->ptlctl->ptl == proto->ptl)
+ num_disc++;
+ }
+ psmi_epid_itor_fini(&itor);
+ mask =
+ (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(int));
+ errs = (psm2_error_t *)
+ psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(psm2_error_t));
+ epaddr_array = (psm2_epaddr_t *)
+ psmi_calloc(proto->ep, UNDEFINED, num_disc,
+ sizeof(psm2_epaddr_t));
+
+ /* On any allocation failure, free whatever did allocate */
+ if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+ if (epaddr_array)
+ psmi_free(epaddr_array);
+ if (errs)
+ psmi_free(errs);
+ if (mask)
+ psmi_free(mask);
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ /* Second pass: collect the epaddrs to disconnect */
+ psmi_epid_itor_init(&itor, proto->ep);
+ i = 0;
+ while ((epaddr = psmi_epid_itor_next(&itor))) {
+ /*
+ * if cstate_outgoing is CSTATE_NONE, then we know it
+ * is an uni-directional connect, in that the peer
+ * sent a connect request to us, but we never sent one
+ * out to the peer epid. Ignore handling those in
+ * ips_proto_disconnect() as we will do the right thing
+ * when a disconnect request for the epaddr comes in from the peer.
+ */
+ if (epaddr->ptlctl->ptl == proto->ptl &&
+ ((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) {
+ mask[i] = 1;
+ epaddr_array[i] = epaddr;
+ i++;
+ IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr);
+ }
+ }
+ psmi_epid_itor_fini(&itor);
+ err = ips_proto_disconnect(proto, force, num_disc, epaddr_array,
+ mask, errs, timeout_in);
+ psmi_free(mask);
+ psmi_free(errs);
+ psmi_free(epaddr_array);
+ }
+
+ t_grace_start = get_cycles();
+
+ /* Wait out the grace period, one interval at a time, as long as
+ * disconnect requests keep arriving from peers. */
+ while (psmi_cycles_left(t_grace_start, t_grace_time)) {
+ uint64_t t_grace_interval_start = get_cycles();
+ int num_disconnect_requests = proto->num_disconnect_requests;
+ PSMI_BLOCKUNTIL(
+ proto->ep, err,
+ proto->num_connected_incoming == 0 ||
+ (!psmi_cycles_left(t_start, timeout_in) &&
+ (!psmi_cycles_left(t_grace_interval_start,
+ t_grace_interval) ||
+ !psmi_cycles_left(t_grace_start, t_grace_time))));
+ if (num_disconnect_requests == proto->num_disconnect_requests) {
+ /* nothing happened in this grace interval so break out early */
+ break;
+ }
+ }
+
+#if _HFI_DEBUGGING
+ if (_HFI_PRDBG_ON) {
+ uint64_t t_grace_finish = get_cycles();
+
+ _HFI_PRDBG_ALWAYS(
+ "Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n",
+ proto->num_connected_outgoing, proto->num_connected_incoming,
+ (int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) /
+ MSEC_ULL), (int)(t_grace_time / MSEC_ULL));
+ }
+#endif
+
+ /* Tear down sub-components in reverse order of initialization */
+ if ((err = ips_ibta_fini(proto)))
+ goto fail;
+
+ if ((err = ips_proto_am_fini(&proto->proto_am)))
+ goto fail;
+
+ if ((err = ips_scbctrl_fini(&proto->scbc_egr)))
+ goto fail;
+
+ ips_proto_recv_fini(proto);
+
+ /* scbc_rv is only allocated when the expected protocol is disabled
+ * (see init); free whichever of the two paths was taken. */
+ if (proto->protoexp) {
+ if ((err = ips_protoexp_fini(proto->protoexp)))
+ goto fail;
+ } else {
+ ips_scbctrl_fini(proto->scbc_rv);
+ psmi_free(proto->scbc_rv);
+ }
+
+ psmi_mpool_destroy(proto->pend_sends_pool);
+ psmi_mpool_destroy(proto->timer_pool);
+
+ psmi_free(proto->sdma_scb_queue);
+
+fail:
+ proto->t_fini = proto->t_init = 0;
+ return err;
+}
+
+/*
+ * Configure the PIO/SDMA send-path selection for this protocol instance.
+ *
+ * Parses PSM2_SDMA (0 = pio only, 2 = sdma only, 1/default = both) into
+ * proto->flags, then sets the iovec thresholds that decide at which
+ * message size sends switch from PIO to SDMA:
+ *   - both paths enabled: CPU-model-specific defaults, overridable via
+ *     PSM2_MQ_EAGER_SDMA_SZ;
+ *   - all-sdma: thresholds 0 (every size goes through SDMA);
+ *   - all-pio: thresholds ~0U (SDMA is never selected).
+ *
+ * Caller must only invoke this when the context advertises SDMA
+ * capability (asserted below). Always returns PSM2_OK.
+ */
+static
+psm2_error_t
+proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context)
+{
+ union psmi_envvar_val env_sdma, env_hfiegr;
+ psm2_error_t err = PSM2_OK;
+
+ /*
+ * Only initialize if RUNTIME_SDMA is enabled.
+ */
+ psmi_assert_always(context->runtime_flags & HFI1_CAP_SDMA);
+
+ psmi_getenv("PSM2_SDMA",
+ "hfi send dma flags (0 disables send dma, 2 disables send pio, "
+ "1 for both sdma/spio, default 1)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)1, &env_sdma);
+ if (env_sdma.e_uint == 0)
+ proto->flags |= IPS_PROTO_FLAG_SPIO;
+ else if (env_sdma.e_uint == 2)
+ proto->flags |= IPS_PROTO_FLAG_SDMA;
+
+ if (!(proto->flags & (IPS_PROTO_FLAG_SDMA | IPS_PROTO_FLAG_SPIO))) {
+ /* use both spio and sdma */
+ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M)
+ {
+ proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2;
+ proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_PHI2;
+ } else {
+ proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_XEON;
+ proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_XEON;
+ }
+
+ /* Explicit env override applies to both thresholds at once */
+ if (!psmi_getenv("PSM2_MQ_EAGER_SDMA_SZ",
+ "hfi pio-to-sdma eager switchover",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val) proto->iovec_thresh_eager,
+ &env_hfiegr)) {
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ env_hfiegr.e_uint;
+ }
+ } else if (proto->flags & IPS_PROTO_FLAG_SDMA) { /* all sdma */
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ 0;
+ } else if (proto->flags & IPS_PROTO_FLAG_SPIO) { /* all spio */
+ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+ ~0U;
+ }
+
+ return err;
+}
+
+/*
+ * Initialize the pending control-message queue for this protocol.
+ *
+ * Zeroes the queue, builds the opcode -> queue-mask-bit lookup table
+ * used by message_type2index(), selects which message types may be
+ * queued when send resources are exhausted (ACK/NAK/BECN only — all
+ * payload-free), and arms the timer entry that later drains the queue
+ * via ips_proto_timer_ctrlq_callback().
+ */
+static
+void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto)
+{
+ /* clear the ctrl send queue */
+ memset(ctrlq, 0, sizeof(*ctrlq));
+
+ /* Map each control opcode to its "queued" mask bit */
+ proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED;
+ proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED;
+ proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED;
+ proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED;
+ proto->message_type_to_index[OPCODE_ERR_CHK_GEN] =
+ CTRL_MSG_ERR_CHK_GEN_QUEUED;
+ proto->message_type_to_index[OPCODE_CONNECT_REQUEST] =
+ CTRL_MSG_CONNECT_REQUEST_QUEUED;
+ proto->message_type_to_index[OPCODE_CONNECT_REPLY] =
+ CTRL_MSG_CONNECT_REPLY_QUEUED;
+ proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] =
+ CTRL_MSG_DISCONNECT_REQUEST_QUEUED;
+ proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] =
+ CTRL_MSG_DISCONNECT_REPLY_QUEUED;
+
+ ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0;
+ ctrlq->ctrlq_overflow = 0;
+ ctrlq->ctrlq_proto = proto;
+
+ /*
+ * We never enqueue ctrl messages with real payload. If we do,
+ * the queue 'elem_payload' size needs to be big enough.
+ * Note: enqueue nak/ack is very important for performance.
+ */
+ proto->ctrl_msg_queue_enqueue =
+ CTRL_MSG_ACK_QUEUED |
+ CTRL_MSG_NAK_QUEUED |
+ CTRL_MSG_BECN_QUEUED;
+
+ psmi_timer_entry_init(&ctrlq->ctrlq_timer,
+ ips_proto_timer_ctrlq_callback, ctrlq);
+
+ return;
+}
+
+/*
+ * Fill in the packet headers (LRH, BTH, KDETH) of a control-message scb.
+ *
+ * Control messages always travel over the high-priority control path of
+ * the destination's path group. Under the adaptive path policy the
+ * high-priority path index is rotated round-robin on every call.
+ * If the flow is flagged for congestion notification, a BECN bit is set
+ * in BTH[1] and the flag is cleared.
+ *
+ * paylen is the control payload size in bytes; the LRH packet length is
+ * expressed in dwords including header and CRC.
+ */
+static __inline__ void _build_ctrl_message(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ ips_scb_t *ctrlscb, uint32_t paylen)
+{
+ /* total packet length in dwords: header + CRC + payload */
+ uint32_t tot_paywords = (sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT;
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_message_header *p_hdr = &ctrlscb->ips_lrh;
+ ips_path_rec_t *ctrl_path =
+ ipsaddr->pathgrp->pg_path[ipsaddr->
+ hpp_index][IPS_PATH_HIGH_PRIORITY];
+
+ /* Adaptive policy: advance (and wrap) the high-priority path index */
+ if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) &&
+ (++ipsaddr->hpp_index >=
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]))
+ ipsaddr->hpp_index = 0;
+
+ /* Control messages go over the control path. */
+ p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+ ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) <<
+ HFI_LRH_SL_SHIFT) |
+ ((proto->sl2sc[ctrl_path->pr_sl] &
+ HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT));
+ p_hdr->lrh[1] = ctrl_path->pr_dlid;
+ p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK);
+ p_hdr->lrh[3] = ctrl_path->pr_slid;
+
+ /* BTH[0]: partition key + control opcode */
+ p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey |
+ (message_type << HFI_BTH_OPCODE_SHIFT));
+
+ /* If flow is congested then generate a BECN for path. */
+ if_pf(flow->flags & IPS_FLOW_FLAG_GEN_BECN) {
+ p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+ ipsaddr->
+ subcontext <<
+ HFI_BTH_SUBCTXT_SHIFT | flow->
+ flowid << HFI_BTH_FLOWID_SHIFT |
+ proto->epinfo.
+ ep_baseqp << HFI_BTH_QP_SHIFT | 1
+ << HFI_BTH_BECN_SHIFT);
+ flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN;
+ }
+ else {
+ p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+ ipsaddr->
+ subcontext <<
+ HFI_BTH_SUBCTXT_SHIFT | flow->
+ flowid << HFI_BTH_FLOWID_SHIFT |
+ proto->epinfo.
+ ep_baseqp << HFI_BTH_QP_SHIFT);
+ }
+
+ /* p_hdr->bth[2] already set by caller, or don't care */
+ /* p_hdr->ack_seq_num already set by caller, or don't care */
+
+ p_hdr->connidx = ipsaddr->connidx_outgoing;
+ p_hdr->flags = 0;
+
+ /* KDETH: interrupt flag + protocol version, plus the job key */
+ p_hdr->khdr.kdeth0 = __cpu_to_le32(
+ (ctrlscb->flags & IPS_SEND_FLAG_INTR) |
+ (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
+ p_hdr->khdr.kdeth1 = __cpu_to_le32(proto->epinfo.ep_jkey);
+
+ return;
+}
+
+/*
+ * Timer callback that drains the pending control-message queue.
+ *
+ * Walks the queue from the tail, transmitting each queued entry over
+ * its flow's transfer method (PIO or DMA). On success the entry's bit
+ * is cleared from the owner's message-queue mask and the tail advances.
+ * On PSM2_EP_NO_RESOURCES the appropriate busy counter is bumped and
+ * the timer is re-requested so the remaining entries are retried later.
+ *
+ * Always returns PSM2_OK (resource exhaustion is handled by re-arming,
+ * not by propagating an error).
+ */
+psm2_error_t
+ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire)
+{
+ struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context;
+ struct ips_proto *proto = ctrlq->ctrlq_proto;
+ struct ips_ctrlq_elem *cqe;
+ uint32_t have_cksum;
+ psm2_error_t err;
+
+ have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+ /* service ctrl send queue first */
+ while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) {
+ cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail];
+
+ if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) {
+ err = ips_spio_transfer_frame(proto,
+ cqe->msg_scb.flow, &cqe->msg_scb.pbc,
+ cqe->msg_scb.cksum, 0, PSMI_TRUE,
+ have_cksum, cqe->msg_scb.cksum[0]
+#ifdef PSM_CUDA
+ , 0
+#endif
+ );
+ } else {
+ err = ips_dma_transfer_frame(proto,
+ cqe->msg_scb.flow, &cqe->msg_scb,
+ cqe->msg_scb.cksum, 0,
+ have_cksum, cqe->msg_scb.cksum[0]);
+ }
+
+ if (err == PSM2_OK) {
+ /* Sent: clear the owner's queued bit and free the slot */
+ ips_proto_epaddr_stats_set(proto, cqe->message_type);
+ *cqe->msg_queue_mask &=
+ ~message_type2index(proto, cqe->message_type);
+ cqe->msg_queue_mask = NULL;
+ ctrlq->ctrlq_tail =
+ (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE;
+ } else {
+ psmi_assert(err == PSM2_EP_NO_RESOURCES);
+
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ proto->stats.writev_busy_cnt++;
+ else
+ proto->stats.pio_busy_cnt++;
+ /* re-request a timer expiration */
+ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+ PSMI_TIMER_PRIO_0);
+ return PSM2_OK;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/* Update cqe struct which is a single element from pending control message queue.
+ * Records the message type and the owner's queue-mask pointer, and copies
+ * the prebuilt header (and checksum word) into the queue element so the
+ * queued entry no longer depends on the caller's ctrlscb. */
+PSMI_ALWAYS_INLINE(
+void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask,
+ struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){
+
+ cqe->message_type = message_type;
+ cqe->msg_queue_mask = msg_queue_mask;
+ psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh,
+ &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh));
+ cqe->msg_scb.flow = flow;
+ cqe->msg_scb.cksum[0] = ctrlscb->cksum[0];
+}
+
+/*
+ * Build and transmit a single control message on a flow.
+ *
+ * Finishes the header via _build_ctrl_message(), optionally checksums
+ * the packet, and sends it over PIO or DMA depending on the flow's
+ * transfer type. ACK/NAK/BECN are redirected to the endpoint's fast
+ * message flow. If the send fails with PSM2_EP_NO_RESOURCES and the
+ * message type is queueable (payload-free ACK/NAK/BECN, per
+ * ctrlq_init()), the message is parked on the pending control queue and
+ * the drain timer is requested; a repeat ACK overwrites the previously
+ * queued ACK entry instead of taking a second slot.
+ *
+ * Returns PSM2_OK when sent or successfully queued,
+ * PSM2_EP_NO_RESOURCES when it could neither be sent nor queued
+ * (overflow counter incremented), or a transfer-layer error.
+ */
+psm2_error_t
+ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask, ips_scb_t *ctrlscb,
+ void *payload, uint32_t paylen)
+{
+ psm2_error_t err = PSM2_EP_NO_RESOURCES;
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+ struct ips_ctrlq *ctrlq = &proto->ctrlq;
+ struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe;
+ uint32_t have_cksum;
+
+ psmi_assert(message_type >= OPCODE_ACK &&
+ message_type <= OPCODE_DISCONNECT_REPLY);
+ psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */
+ psmi_assert(flow->frag_size >=
+ (paylen + PSM_CRC_SIZE_IN_BYTES));
+
+ /* Drain queue if non-empty */
+ if (cqe[ctrlq->ctrlq_tail].msg_queue_mask)
+ ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL);
+
+ /* finish setup control message header */
+ _build_ctrl_message(proto, flow, message_type, ctrlscb, paylen);
+
+ /* If enabled checksum control message */
+ have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+ if (have_cksum) {
+ ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+ ips_do_cksum(proto, &ctrlscb->ips_lrh,
+ payload, paylen, ctrlscb->cksum);
+ }
+
+ /*
+ * for ACK/NAK/BECN, we use the fast flow to send over, otherwise,
+ * we use the original flow
+ */
+ if (message_type == OPCODE_ACK ||
+ message_type == OPCODE_NAK ||
+ message_type == OPCODE_BECN)
+ {
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[proto->msgflowid];
+ }
+
+ /* Attempt the immediate send over the flow's transfer method */
+ switch (flow->transfer) {
+ case PSM_TRANSFER_PIO:
+ err = ips_spio_transfer_frame(proto, flow,
+ &ctrlscb->pbc, payload, paylen,
+ PSMI_TRUE, have_cksum, ctrlscb->cksum[0]
+#ifdef PSM_CUDA
+ , 0
+#endif
+ );
+ break;
+ case PSM_TRANSFER_DMA:
+ err = ips_dma_transfer_frame(proto, flow,
+ ctrlscb, payload, paylen,
+ have_cksum, ctrlscb->cksum[0]);
+ break;
+ default:
+ err = PSM2_INTERNAL_ERR;
+ break;
+ }
+
+ if (err == PSM2_OK)
+ ips_proto_epaddr_stats_set(proto, message_type);
+
+ _HFI_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d,"
+ "src=%p,len=%d returns %d\n",
+ (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh),
+ __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err);
+
+ /* Anything other than resource exhaustion is final (success or a
+ * hard error) — no queueing is attempted. */
+ if (err != PSM2_EP_NO_RESOURCES)
+ return err;
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ proto->stats.writev_busy_cnt++;
+ else
+ proto->stats.pio_busy_cnt++;
+
+ if (proto->ctrl_msg_queue_enqueue & proto->
+ message_type_to_index[message_type]) {
+ /* We only queue control msg without payload */
+ psmi_assert(paylen == 0);
+
+ if ((*msg_queue_mask) & proto->
+ message_type_to_index[message_type]) {
+
+ if (message_type == OPCODE_ACK) {
+ /* Pending queue should contain latest ACK type message,
+ * overwrite the previous one. */
+ ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask,
+ flow, ctrlscb, message_type);
+ }
+
+ /* Already queued: treat as success */
+ err = PSM2_OK;
+ } else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) {
+ /* entry is free */
+ if (message_type == OPCODE_ACK) {
+ /* Track the index of last ACK type message in queue*/
+ flow->ack_index = ctrlq->ctrlq_head;
+ }
+
+ *msg_queue_mask |=
+ message_type2index(proto, message_type);
+
+ ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask,
+ flow, ctrlscb, message_type);
+
+ ctrlq->ctrlq_head =
+ (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE;
+ /* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */
+ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+ PSMI_TIMER_PRIO_0);
+
+ err = PSM2_OK;
+ } else {
+ /* Queue full: count the overflow; err stays
+ * PSM2_EP_NO_RESOURCES for the caller to retry. */
+ proto->ctrl_msg_queue_overflow++;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Enqueue an scb onto a flow for (later) transmission.
+ *
+ * Prepares the scb's headers for the flow, checksums the packet when
+ * PSM2_CKSUM is enabled (single-fragment, non-tidctrl scbs only),
+ * adopts the scb's ack/send timers as the flow's timers on first use,
+ * seeds the pending list if it was empty, and appends the scb to the
+ * flow's unacked queue. Actual transmission happens in the flush paths
+ * (ips_proto_flow_flush_pio / _dma).
+ */
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb)
+{
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+ ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb);
+ if ((proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (scb->tidctrl == 0) && (scb->nfrag == 1)) {
+ scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+ ips_do_cksum(proto, &scb->ips_lrh,
+ ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]);
+ }
+
+ /* If this is the first scb on flow, pull in both timers. */
+ if (flow->timer_ack == NULL) {
+ psmi_assert(flow->timer_send == NULL);
+ flow->timer_ack = scb->timer_ack;
+ flow->timer_send = scb->timer_send;
+ }
+ psmi_assert(flow->timer_ack != NULL);
+ psmi_assert(flow->timer_send != NULL);
+
+ /* Every flow has a pending head that points into the unacked queue.
+ * If sends are already pending, process those first */
+ if (SLIST_EMPTY(&flow->scb_pend))
+ SLIST_FIRST(&flow->scb_pend) = scb;
+
+ /* Insert scb into flow's unacked queue */
+ STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq);
+
+#ifdef PSM_DEBUG
+ /* update scb counters in flow. */
+ flow->scb_num_pending++;
+ flow->scb_num_unacked++;
+#endif
+}
+MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue);
+
+/*
+ * This function attempts to flush the current list of pending
+ * packets through PIO.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Packet triggered through PIO.
+ * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ */
+psm2_error_t
+ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
+{
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
+ int num_sent = 0;
+ uint64_t t_cyc;
+ ips_scb_t *scb;
+ psm2_error_t err = PSM2_OK;
+
+ /* Caller must only flush a flow that actually has pending scbs */
+ psmi_assert(!SLIST_EMPTY(scb_pend));
+
+ /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */
+ if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) {
+ if (nflushed)
+ *nflushed = 0;
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* Send pending scbs one at a time while flow credits last; each
+ * scb must be a single fragment on the PIO path. */
+ while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) {
+ scb = SLIST_FIRST(scb_pend);
+ psmi_assert(scb->nfrag == 1);
+
+ if ((err = ips_spio_transfer_frame(proto, flow, &scb->pbc,
+ ips_scb_buffer(scb),
+ scb->payload_size,
+ PSMI_FALSE,
+ scb->ips_lrh.
+ flags &
+ IPS_SEND_FLAG_PKTCKSUM,
+ scb->cksum[0]
+#ifdef PSM_CUDA
+ , IS_TRANSFER_BUF_GPU_MEM(scb)
+#endif
+ )) == PSM2_OK) {
+ /* Sent: arm the ack timer, consume a credit, and pop
+ * the scb from the pending list (it stays on the
+ * unacked queue until acknowledged). */
+ t_cyc = get_cycles();
+ scb->flags &= ~IPS_SEND_FLAG_PENDING;
+ scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+ scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc;
+ psmi_timer_request(proto->timerq, flow->timer_ack,
+ scb->abs_timeout);
+ num_sent++;
+ flow->credits--;
+ SLIST_REMOVE_HEAD(scb_pend, next);
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+
+ } else
+ break;
+ }
+
+ /* If out of flow credits re-schedule send timer */
+ if (!SLIST_EMPTY(scb_pend)) {
+ proto->stats.pio_busy_cnt++;
+ psmi_timer_request(proto->timerq, flow->timer_send,
+ get_cycles() + proto->timeout_send);
+ }
+
+ if (nflushed != NULL)
+ *nflushed = num_sent;
+
+ return err;
+}
+
+/*
+ * Forward declaration: scb_dma_send() flushes a flow's pending packets
+ * via send DMA. It is defined later in this file and used below by
+ * ips_proto_flow_flush_dma().
+ */
+static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb_pendlist *slist, int *num_sent);
+
+/*
+ * Flush all packets queued up on a flow via send DMA.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Able to flush entire pending queue for DMA.
+ * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA.
+ * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma
+ * queue).
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure,
+ * rxe/txe parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+psm2_error_t
+ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed)
+{
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
+ ips_scb_t *scb = NULL;
+ psm2_error_t err = PSM2_OK;
+ int nsent = 0;
+
+ psmi_assert(!SLIST_EMPTY(scb_pend));
+
+ /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */
+ if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) {
+ if (nflushed)
+ *nflushed = 0;
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ err = scb_dma_send(proto, flow, scb_pend, &nsent);
+ if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES &&
+ err != PSM2_OK_NO_PROGRESS)
+ goto fail;
+
+ if (nsent > 0) {
+ uint64_t t_cyc = get_cycles();
+ int i = 0;
+ /*
+ * inflight counter proto->iovec_cntr_next_inflight should not drift
+ * from completion counter proto->iovec_cntr_last_completed away too
+ * far because we only have very small scb counter compared with
+ * uint32_t counter value.
+ */
+#ifdef PSM_DEBUG
+ flow->scb_num_pending -= nsent;
+#endif
+ /* Walk the first 'nsent' entries: clear PENDING, arm ack timeouts,
+ * and register each scb in the sdma completion ring slot that
+ * scb_dma_send assigned (comp_idx == sdma_fill_index here). */
+ SLIST_FOREACH(scb, scb_pend, next) {
+ if (++i > nsent)
+ break;
+ scb->flags &= ~IPS_SEND_FLAG_PENDING;
+ scb->ack_timeout =
+ scb->nfrag * proto->epinfo.ep_timeout_ack;
+ scb->abs_timeout =
+ scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc;
+
+ psmi_assert(proto->sdma_scb_queue
+ [proto->sdma_fill_index] == NULL);
+ proto->sdma_scb_queue[proto->sdma_fill_index] = scb;
+ scb->dma_complete = 0;
+
+ proto->sdma_avail_counter--;
+ proto->sdma_fill_index++;
+ if (proto->sdma_fill_index == proto->sdma_queue_size)
+ proto->sdma_fill_index = 0;
+
+ /* Flow credits can temporarily go to negative for
+ * packets tracking purpose, because we have sdma
+ * chunk processing which can't send exact number
+ * of packets as the number of credits.
+ */
+ flow->credits -= scb->nfrag;
+ }
+ /* scb now points at the first unsent entry (NULL if the loop
+ * consumed the whole list); it becomes the new pending head. */
+ SLIST_FIRST(scb_pend) = scb;
+ }
+
+ if (SLIST_FIRST(scb_pend) != NULL) {
+ psmi_assert(flow->scb_num_pending > 0);
+
+ switch (flow->protocol) {
+ case PSM_PROTOCOL_TIDFLOW:
+ /* For Tidflow we can cancel the ack timer if we have flow credits
+ * available and schedule the send timer. If we are out of flow
+ * credits then the ack timer is scheduled as we are waiting for
+ * an ACK to reclaim credits. This is required since multiple
+ * tidflows may be active concurrently.
+ */
+ if (flow->credits > 0) {
+ /* Cancel ack timer and reschedule send timer. Increment
+ * writev_busy_cnt as this really is DMA buffer exhaustion.
+ */
+ psmi_timer_cancel(proto->timerq,
+ flow->timer_ack);
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Re-instate ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+
+ break;
+ case PSM_PROTOCOL_GO_BACK_N:
+ default:
+ if (flow->credits > 0) {
+ /* Schedule send timer and increment writev_busy_cnt */
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Schedule ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+ break;
+ }
+ } else {
+ /* Schedule ack timer */
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ psmi_timer_request(proto->timerq, flow->timer_ack,
+ get_cycles() + proto->epinfo.ep_timeout_ack);
+ }
+
+ /* We overwrite error with its new meaning for flushing packets.
+ * Braced explicitly: the unbraced nested if/else form is a classic
+ * dangling-else hazard under future edits. */
+ if (nsent > 0) {
+ if (scb)
+ err = PSM2_OK_NO_PROGRESS; /* partial flush */
+ else
+ err = PSM2_OK; /* complete flush */
+ } else {
+ err = PSM2_EP_NO_RESOURCES; /* no flush at all */
+ }
+
+fail:
+ if (nflushed)
+ *nflushed = nsent;
+
+ return err;
+}
+
+/*
+ * Fault injection in dma sends. Since DMA through writev() is all-or-nothing,
+ * we don't inject faults on a packet-per-packet basis since the code gets
+ * quite complex. Instead, each call to flush_dma or transfer_frame is treated
+ * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND
+ * setting.
+ *
+ * The effect is as if the event was successful but dropped on the wire
+ * somewhere.
+ *
+ * Returns nonzero when this event should be treated as lost.
+ */
+PSMI_ALWAYS_INLINE(int dma_do_fault(void))
+{
+ /* (void) rather than (): an empty list is an old-style unprototyped
+ * declaration in C and defeats argument checking. */
+ if_pf(PSMI_FAULTINJ_ENABLED()) {
+ PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1,
+ IPS_FAULTINJ_DMALOST);
+ return psmi_faultinj_is_fault(fi);
+ }
+ else
+ return 0;
+}
+
+/*
+ * Driver defines the following sdma completion error code, returned
+ * as negative value:
+ * #define SDMA_TXREQ_S_OK 0
+ * #define SDMA_TXREQ_S_SENDERROR 1
+ * #define SDMA_TXREQ_S_ABORTED 2
+ * #define SDMA_TXREQ_S_SHUTDOWN 3
+ *
+ * When hfi is in freeze mode, driver will complete all the pending
+ * sdma request as aborted. Since PSM needs to recover from hfi
+ * freeze mode, this routine ignore aborted error.
+ */
+psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto)
+{
+ ips_scb_t *scb;
+ struct hfi1_sdma_comp_entry *comp;
+ uint32_t status;
+
+ /* Reap driver completions in ring order: done_index chases fill_index
+ * through the sdma completion queue. */
+ while (proto->sdma_done_index != proto->sdma_fill_index) {
+ comp = &proto->sdma_comp_queue[proto->sdma_done_index];
+ status = comp->status;
+ /* Read barrier: order the status load before reading errcode
+ * (written by the driver alongside the status). */
+ psmi_rmb();
+
+ /* Oldest entry still queued in the driver; later entries cannot
+ * be complete either, so stop reaping. */
+ if (status == QUEUED)
+ return PSM2_OK;
+
+ /* Mark sdma request is complete */
+ scb = proto->sdma_scb_queue[proto->sdma_done_index];
+ if (scb) {
+ scb->dma_complete = 1;
+ proto->sdma_scb_queue[proto->sdma_done_index] = NULL;
+ }
+
+ /* errcode is returned negated; -2 == SDMA_TXREQ_S_ABORTED is
+ * deliberately ignored so PSM can recover from hfi freeze mode
+ * (see the comment block above this function). */
+ if (status == ERROR && ((int)comp->errcode) != -2) {
+ psm2_error_t err =
+ psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+ "SDMA completion error: %d (fd=%d, index=%d)",
+ 0 - comp->errcode,
+ proto->fd,
+ proto->sdma_done_index);
+ return err;
+ }
+
+ /* Slot reclaimed: bump availability and advance (with wrap). */
+ proto->sdma_avail_counter++;
+ proto->sdma_done_index++;
+ if (proto->sdma_done_index == proto->sdma_queue_size)
+ proto->sdma_done_index = 0;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Handle ENOMEM from an SDMA writev(): try to free kernel memory by
+ * evicting idle TID-cache entries and retrying; otherwise report no
+ * progress, escalating to a device failure after ~30 seconds.
+ */
+static inline
+psm2_error_t
+handle_ENOMEM_on_DMA_completion(struct ips_proto *proto)
+{
+ psm2_error_t err;
+ time_t now = time(NULL);
+
+ /* If the TID cache has idle entries, evicting them may release the
+ * kernel memory writev() needs; a successful eviction asks the
+ * caller to retry the writev immediately. */
+ if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) {
+ uint64_t lengthEvicted =
+ ips_tidcache_evict(&proto->protoexp->tidc, -1);
+
+ /* Start the failure clock (used for the 30s cutoff below). */
+ if (!proto->writevFailTime)
+ proto->writevFailTime = now;
+
+ if (lengthEvicted)
+ return PSM2_OK; /* signals a retry of the writev command. */
+ else
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+ else if (!proto->writevFailTime)
+ {
+ /* First ENOMEM and nothing to evict: record when it started. */
+ proto->writevFailTime = now;
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+ else
+ {
+ /* ENOMEM persists; after 30 seconds give up and report a
+ * device failure instead of retrying forever. */
+ static const double thirtySeconds = 30.0;
+
+ if (difftime(now, proto->writevFailTime) >
+ thirtySeconds) {
+ err = psmi_handle_error(
+ proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "SDMA completion error: out of "
+ "memory (fd=%d, index=%d)",
+ proto->fd,
+ proto->sdma_done_index);
+ return err;
+ }
+ return PSM2_EP_NO_RESOURCES; /* should signal a return of
+ no progress, and retry later */
+ }
+}
+
+/* ips_dma_transfer_frame is used only for control messages, and is
+ * not enabled by default, and not tested by QA; expected send
+ * dma goes through scb_dma_send().
+ *
+ * Sends one frame (header iovec, plus optional payload iovec) through the
+ * driver via writev(), then synchronously waits for local completion when
+ * a payload buffer was referenced. */
+psm2_error_t
+ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
+ ips_scb_t *scb, void *payload, uint32_t paylen,
+ uint32_t have_cksum, uint32_t cksum)
+{
+ ssize_t ret;
+ psm2_error_t err;
+ struct sdma_req_info *sdmahdr;
+ uint16_t iovcnt;
+ struct iovec iovec[2];
+
+ /* See comments above for fault injection */
+ if_pf(dma_do_fault())
+ return PSM2_OK;
+
+ /*
+ * Check if there is a sdma queue slot.
+ */
+ if (proto->sdma_avail_counter == 0) {
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ return err;
+
+ if (proto->sdma_avail_counter == 0) {
+ return PSM2_EP_NO_RESOURCES;
+ }
+ }
+
+ /*
+ * If we have checksum, put to the end of payload. We make sure
+ * there is enough space in payload for us to put 8 bytes checksum.
+ * for control message, payload is internal PSM buffer, not user buffer.
+ */
+ if (have_cksum) {
+ uint32_t *ckptr = (uint32_t *) ((char *)payload + paylen);
+ *ckptr = cksum;
+ ckptr++;
+ *ckptr = cksum;
+ paylen += PSM_CRC_SIZE_IN_BYTES;
+ }
+
+ /*
+ * Setup PBC.
+ */
+ ips_proto_pbc_update(proto, flow, PSMI_TRUE,
+ &scb->pbc, HFI_MESSAGE_HDR_SIZE, paylen);
+
+ /*
+ * Setup SDMA header and io vector.
+ */
+ sdmahdr = (struct sdma_req_info *)
+ psmi_get_sdma_req_info(scb, proto->ips_extra_sdmahdr_size);
+ sdmahdr->npkts = 1;
+ sdmahdr->fragsize = flow->frag_size;
+
+ sdmahdr->comp_idx = proto->sdma_fill_index;
+ psmi_assert(proto->sdma_comp_queue
+ [proto->sdma_fill_index].status != QUEUED);
+
+ iovcnt = 1;
+ iovec[0].iov_base = sdmahdr;
+ iovec[0].iov_len = HFI_SDMA_HDR_SIZE +
+ proto->ips_extra_sdmahdr_size;
+ if (paylen > 0) {
+ iovcnt++;
+ iovec[1].iov_base = payload;
+ iovec[1].iov_len = paylen;
+ }
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ /* ctrl=2 tells a GPUDirect-capable driver that the extended
+ * request layout is in use. */
+ sdmahdr->ctrl = 2 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+
+ /*
+ * Write into driver to do SDMA work.
+ */
+retry:
+ ret = hfi_cmd_writev(proto->fd, iovec, iovcnt);
+
+ if (ret > 0) {
+ proto->writevFailTime = 0;
+ psmi_assert_always(ret == 1);
+
+ proto->sdma_avail_counter--;
+ proto->sdma_fill_index++;
+ if (proto->sdma_fill_index == proto->sdma_queue_size)
+ proto->sdma_fill_index = 0;
+
+ /*
+ * Wait for completion of this control message if
+ * stack buffer payload is used. This should not be
+ * a performance issue because sdma control message
+ * is not a performance code path.
+ */
+ if (iovcnt > 1) {
+ /* Setup scb ready for completion. */
+ psmi_assert(proto->sdma_scb_queue
+ [sdmahdr->comp_idx] == NULL);
+ proto->sdma_scb_queue[sdmahdr->comp_idx] = scb;
+ scb->dma_complete = 0;
+
+ /* Wait for completion */
+ err = ips_proto_dma_wait_until(proto, scb);
+ } else
+ err = PSM2_OK;
+ } else {
+ /*
+ * ret == 0: Driver did not queue packet. Try later.
+ * ENOMEM: No kernel memory to queue request, try later?
+ * ECOMM: Link may have gone down
+ * EINTR: Got interrupt while in writev
+ */
+ if (errno == ENOMEM) {
+ err = handle_ENOMEM_on_DMA_completion(proto);
+ if (err == PSM2_OK)
+ goto retry;
+ } else if (ret == 0 || errno == ECOMM || errno == EINTR) {
+ err = psmi_context_check_status(
+ (const psmi_context_t *)&proto->ep->context);
+ /*
+ * During a link bounce the err returned from
+ * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case
+ * the error code which we need to return to the calling flush
+ * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to
+ * signal it to restart the timers to flush the packets.
+ * Not doing so would leave the packet on the unacked and
+ * pending q without the sdma descriptors ever being updated.
+ */
+ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK)
+ err = PSM2_EP_NO_RESOURCES;
+ } else
+ /* Report the actual iovec pointer and count (was a
+ * hard-coded len=1 and &iovec, which misreported the
+ * two-iovec case); matches scb_dma_send()'s message. */
+ err = psmi_handle_error(proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Unhandled error in writev(): "
+ "%s (fd=%d,iovec=%p,len=%d)",
+ strerror(errno),
+ proto->fd,
+ iovec,
+ iovcnt);
+ }
+
+ return err;
+}
+
+/*
+ * Caller still expects num_sent to always be correctly set in case of an
+ * error.
+ *
+ * Recoverable errors:
+ * PSM2_OK: At least one packet was successfully queued up for DMA.
+ * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma
+ * queue).
+ * PSM2_OK_NO_PROGRESS: Cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected
+ * error in calling writev(), or chip failure, rxe/txe
+ * parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+static
+psm2_error_t
+scb_dma_send(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb_pendlist *slist, int *num_sent)
+{
+ psm2_error_t err = PSM2_OK;
+ struct sdma_req_info *sdmahdr;
+ struct ips_scb *scb;
+ struct iovec *iovec;
+ uint16_t iovcnt;
+
+ unsigned int vec_idx = 0;
+ unsigned int scb_idx = 0, scb_sent = 0;
+ unsigned int num = 0, max_elem;
+ uint32_t have_cksum;
+ uint32_t fillidx;
+ /* signed on purpose: credits may go negative within one scb (see
+ * the flow-credits comment in ips_proto_flow_flush_dma). */
+ int16_t credits;
+ ssize_t ret;
+
+ /* See comments above for fault injection */
+ if_pf(dma_do_fault()) goto fail;
+
+ /* Check how many SCBs to send based on flow credits */
+ credits = flow->credits;
+ psmi_assert(SLIST_FIRST(slist) != NULL);
+ SLIST_FOREACH(scb, slist, next) {
+ num++;
+ credits -= scb->nfrag;
+ if (credits <= 0)
+ break;
+ }
+ if (proto->sdma_avail_counter < num) {
+ /* if there is not enough sdma slot,
+ * update and use what we have.
+ */
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ goto fail;
+ if (proto->sdma_avail_counter == 0) {
+ err = PSM2_EP_NO_RESOURCES;
+ goto fail;
+ }
+ if (proto->sdma_avail_counter < num)
+ num = proto->sdma_avail_counter;
+ }
+
+ /* header, payload, checksum, tidarray */
+ max_elem = 4 * num;
+ iovec = alloca(sizeof(struct iovec) * max_elem);
+
+ /* NOTE(review): alloca() cannot return NULL (stack exhaustion is
+ * undefined behavior instead), so this check is purely defensive. */
+ if_pf(iovec == NULL) {
+ err = psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_NO_MEMORY,
+ "alloca for %d bytes failed in writev",
+ (int)(sizeof(struct iovec) * max_elem));
+ goto fail;
+ }
+
+ /* Local shadow of the fill index: comp_idx slots are assigned here,
+ * but proto->sdma_fill_index itself is only advanced by the caller
+ * (ips_proto_flow_flush_dma) for entries writev actually queued. */
+ fillidx = proto->sdma_fill_index;
+ SLIST_FOREACH(scb, slist, next) {
+ /* Can't exceed posix max writev count */
+ if (vec_idx + (int)!!(scb->payload_size > 0) >= UIO_MAXIOV)
+ break;
+
+ psmi_assert(vec_idx < max_elem);
+ psmi_assert_always(((scb->payload_size & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == non_dw_mul_sdma));
+
+ /* Checksum all eager packets */
+ have_cksum = scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM;
+
+ /*
+ * Setup PBC.
+ */
+ ips_proto_pbc_update(
+ proto,
+ flow,
+ PSMI_FALSE,
+ &scb->pbc,
+ HFI_MESSAGE_HDR_SIZE,
+ scb->payload_size +
+ (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0));
+
+ sdmahdr = (struct sdma_req_info *)
+ psmi_get_sdma_req_info(scb, proto->ips_extra_sdmahdr_size);
+
+ sdmahdr->npkts =
+ scb->nfrag > 1 ? scb->nfrag_remaining : scb->nfrag;
+ sdmahdr->fragsize =
+ scb->frag_size ? scb->frag_size : flow->frag_size;
+
+ sdmahdr->comp_idx = fillidx;
+ psmi_assert(proto->sdma_comp_queue[fillidx].status != QUEUED);
+ fillidx++;
+ if (fillidx == proto->sdma_queue_size)
+ fillidx = 0;
+
+ /*
+ * Setup io vector.
+ */
+ iovec[vec_idx].iov_base = sdmahdr;
+ iovec[vec_idx].iov_len = HFI_SDMA_HDR_SIZE +
+ proto->ips_extra_sdmahdr_size;
+ vec_idx++;
+ iovcnt = 1;
+ _HFI_VDBG("hdr=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+
+ if (scb->payload_size > 0) {
+ /*
+ * OPA1 supports byte-aligned payload. If it is
+ * single packet per scb, use payload_size, else
+ * multi-packets per scb, use remaining chunk_size.
+ * payload_size is the remaining chunk first packet
+ * length.
+ */
+ iovec[vec_idx].iov_base = ips_scb_buffer(scb);
+ iovec[vec_idx].iov_len = scb->nfrag > 1
+ ? scb->chunk_size_remaining
+ : scb->payload_size;
+ vec_idx++;
+ iovcnt++;
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && IS_TRANSFER_BUF_GPU_MEM(scb)) {
+ /* without this attr, CUDA memory accesses
+ * do not synchronize with gpudirect-rdma accesses.
+ * We set this field only if the currently loaded driver
+ * supports this field. If not, we have other problems
+ * where we have a non gpu-direct enabled driver loaded
+ * and PSM2 is trying to use GPU features.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ sdmahdr->flags = HFI1_BUF_GPU_MEM;
+ else
+ sdmahdr->flags = 0;
+ } else if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ sdmahdr->flags = 0;
+#endif
+
+ _HFI_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n",
+ scb->seq_num.psn_num,
+ iovec[vec_idx - 2].iov_base,
+ (int)iovec[vec_idx - 2].iov_len,
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ }
+
+ /* If checksum then update checksum */
+ if (have_cksum) {
+ scb->cksum[1] = scb->cksum[0];
+ iovec[vec_idx].iov_base = scb->cksum;
+ iovec[vec_idx].iov_len = PSM_CRC_SIZE_IN_BYTES;
+ vec_idx++;
+ iovcnt++;
+
+ _HFI_VDBG("chsum=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ }
+
+ /*
+ * If it is TID receive, attached tid info.
+ */
+ if (scb->tidctrl) {
+ iovec[vec_idx].iov_base = scb->tsess;
+ iovec[vec_idx].iov_len = scb->tsess_length;
+ vec_idx++;
+ iovcnt++;
+
+#ifdef PSM_CUDA
+ /*
+ * The driver knows to check for "flags" field in
+ * sdma_req_info only if ctrl=2.
+ */
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ sdmahdr->ctrl = 2 |
+ (EXPECTED << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EXPECTED << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+ _HFI_VDBG("tid-info=%p,%d\n",
+ iovec[vec_idx - 1].iov_base,
+ (int)iovec[vec_idx - 1].iov_len);
+ } else {
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) {
+ sdmahdr->ctrl = 2 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+ } else {
+#endif
+ sdmahdr->ctrl = 1 |
+ (EAGER << HFI1_SDMA_REQ_OPCODE_SHIFT) |
+ (iovcnt << HFI1_SDMA_REQ_IOVCNT_SHIFT);
+#ifdef PSM_CUDA
+ }
+#endif
+ }
+
+ /* Can bound the number to send by 'num' */
+ if (++scb_idx == num)
+ break;
+ }
+ psmi_assert(vec_idx > 0);
+retry:
+ ret = hfi_cmd_writev(proto->fd, iovec, vec_idx);
+
+ if (ret > 0) {
+ proto->writevFailTime = 0;
+ /* No need for inflight system call, we can infer it's value
+ * from
+ * writev's return value */
+ scb_sent += ret;
+ } else {
+ /*
+ * ret == 0: Driver did not queue packet. Try later.
+ * ENOMEM: No kernel memory to queue request, try later?
+ * ECOMM: Link may have gone down
+ * EINTR: Got interrupt while in writev
+ */
+ if (errno == ENOMEM) {
+ err = handle_ENOMEM_on_DMA_completion(proto);
+ if (err == PSM2_OK)
+ goto retry;
+ } else if (ret == 0 || errno == ECOMM || errno == EINTR) {
+ err = psmi_context_check_status(
+ (const psmi_context_t *)&proto->ep->context);
+ /*
+ * During a link bounce the err returned from
+ * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case
+ * the error code which we need to return to the calling flush
+ * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to
+ * signal the caller to restart the timers to flush the packets.
+ * Not doing so would leave the packet on the unacked and
+ * pending q without the sdma descriptors ever being updated.
+ */
+ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK)
+ err = PSM2_EP_NO_RESOURCES;
+ } else {
+ err = psmi_handle_error(
+ proto->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Unexpected error in writev(): %s (errno=%d) "
+ "(fd=%d,iovec=%p,len=%d)",
+ strerror(errno),
+ errno,
+ proto->fd,
+ iovec,
+ vec_idx);
+ goto fail;
+ }
+ }
+
+fail:
+ /* Caller depends on *num_sent being valid on every exit path,
+ * including errors (see the contract comment above). */
+ *num_sent = scb_sent;
+ psmi_assert(*num_sent <= num && *num_sent >= 0);
+ return err;
+}
+
+/*
+ * Because we only lazily reap send dma completions, it's possible that we
+ * receive a packet's remote acknowledgement before seeing that packet's local
+ * completion. As part of processing ack packets and releasing scbs, we issue
+ * a wait for the local completion if the scb is marked as having been sent via
+ * send dma.
+ */
+psm2_error_t
+ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb)
+{
+ psm2_error_t err = PSM2_OK;
+ int spins = 0;
+ int yielded = 0;
+
+ PSMI_PROFILE_BLOCK();
+
+ /* Spin on the completion queue until this scb's send DMA is locally
+ * complete; the queue is reaped at least once before checking. */
+ for (;;) {
+ if (spins++ == proto->ep->yield_spin_cnt) {
+ /* Have to yield holding the PSM lock, mostly because we don't
+ * support another thread changing internal state at this point in
+ * the code.
+ */
+ yielded = 1;
+ spins = 0;
+ sched_yield();
+ }
+
+ err = ips_proto_dma_completion_update(proto);
+ if (err)
+ return err;
+
+ if (scb->dma_complete)
+ break;
+ }
+
+ if (yielded)
+ proto->stats.writev_compl_delay++;
+
+ PSMI_PROFILE_UNBLOCK();
+
+ return err;
+}
+
+psm2_error_t
+ips_proto_timer_ack_callback(struct psmi_timer *current_timer,
+ uint64_t current)
+{
+ /* Fires when the oldest unacked scb on a flow may have timed out:
+ * sends an ERR_CHK (or ERR_CHK_GEN for tidflow) to solicit an
+ * ACK/NAK, with exponential backoff on the ack timeout. */
+ struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow;
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+ uint64_t t_cyc_next = get_cycles();
+ psmi_seqnum_t err_chk_seq;
+ ips_scb_t *scb, ctrlscb; /* ctrlscb is a stack-allocated control scb */
+ uint8_t message_type;
+
+ if (STAILQ_EMPTY(&flow->scb_unacked))
+ return PSM2_OK;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+
+ if (current >= scb->abs_timeout) {
+ int done_local = 0;
+
+ /* We have to ensure that the send is at least locally complete before
+ * sending an error check or else earlier data can get to the
+ * destination *after* we pio or dma this err_chk.
+ */
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ /* error is caught inside this routine */
+ ips_proto_dma_completion_update(proto);
+
+ if (scb->dma_complete)
+ done_local = 1;
+ else
+ proto->stats.writev_compl_eagain++;
+ } else
+ done_local = 1; /* Always done for PIO flows */
+
+ /* Exponential backoff: multiply the ack timeout, capped at
+ * ep_timeout_ack_max. */
+ scb->ack_timeout =
+ min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor,
+ proto->epinfo.ep_timeout_ack_max);
+ scb->abs_timeout = t_cyc_next + scb->ack_timeout;
+
+ if (done_local) {
+ _HFI_VDBG
+ ("sending err_chk flow=%d with first=%d,last=%d\n",
+ flow->flowid,
+ STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num,
+ STAILQ_LAST(&flow->scb_unacked, ips_scb,
+ nextq)->seq_num.psn_num);
+
+ ctrlscb.flags = 0;
+ if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD)
+ ctrlscb.flags |= IPS_SEND_FLAG_INTR;
+
+ /* The err_chk sequence is just before the first pending
+ * (unsent) scb, or the next transmit psn when nothing
+ * is pending. */
+ err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ?
+ flow->xmit_seq_num :
+ SLIST_FIRST(&flow->scb_pend)->seq_num;
+
+ if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+ message_type = OPCODE_ERR_CHK_GEN;
+ err_chk_seq.psn_seq -= 1;
+ /* Receive descriptor index */
+ ctrlscb.ips_lrh.data[0].u64 =
+ scb->tidsendc->rdescid.u64;
+ /* Send descriptor index */
+ ctrlscb.ips_lrh.data[1].u64 =
+ scb->tidsendc->sdescid.u64;
+ } else {
+ PSM2_LOG_MSG("sending ERR_CHK message");
+ message_type = OPCODE_ERR_CHK;
+ err_chk_seq.psn_num = (err_chk_seq.psn_num - 1)
+ & proto->psn_mask;
+ }
+ ctrlscb.ips_lrh.bth[2] =
+ __cpu_to_be32(err_chk_seq.psn_num);
+
+ ips_proto_send_ctrl_message(flow, message_type,
+ &flow->ipsaddr->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ t_cyc_next = get_cycles() + scb->ack_timeout;
+ } else
+ /* Timer fired early: re-arm for the remaining interval. */
+ t_cyc_next += (scb->abs_timeout - current);
+
+ psmi_timer_request(proto->timerq, current_timer, t_cyc_next);
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+ips_proto_timer_send_callback(struct psmi_timer *current_timer,
+ uint64_t current)
+{
+ ips_scb_t *ctx_scb = (ips_scb_t *)current_timer->context;
+ struct ips_flow *flow = ctx_scb->flow;
+ struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+
+ /* A congestion NAK marked this flow congested (see process-nak):
+ * clear the flag and decrease the injection rate before flushing. */
+ if_pf(flow->flags & IPS_FLOW_FLAG_CONGESTED) {
+ int incr = proto->cace[flow->path->pr_sl].ccti_increase;
+
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ if ((flow->path->pr_ccti + incr) <= proto->ccti_limit)
+ ips_cca_adjust_rate(flow->path, incr);
+ }
+
+ /* Push out whatever is still pending on this flow. */
+ if (!SLIST_EMPTY(&flow->scb_pend))
+ flow->flush(flow, NULL);
+
+ return PSM2_OK;
+}
+
+/* Adjust the CCA (congestion control) injection rate for a path by moving
+ * its CCT index: callers pass a positive increment on congestion and -1
+ * (from ips_cca_timer_callback) to recover. */
+psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment)
+{
+ struct ips_proto *proto = path_rec->proto;
+
+ /* Increment/decrement ccti for path */
+ psmi_assert_always(path_rec->pr_ccti >=
+ proto->cace[path_rec->pr_sl].ccti_min);
+ path_rec->pr_ccti += cct_increment;
+
+ /* Determine new active IPD. */
+#if _HFI_DEBUGGING
+ uint16_t prev_ipd = 0;
+ uint16_t prev_divisor = 0;
+ if (_HFI_CCADBG_ON) {
+ prev_ipd = path_rec->pr_active_ipd;
+ prev_divisor = path_rec->pr_cca_divisor;
+ }
+#endif
+ /* Use the static IPD when it is more conservative than the CCT
+ * table entry for the new ccti; otherwise take IPD and divisor
+ * from the CCT table. */
+ if ((path_rec->pr_static_ipd) &&
+ ((path_rec->pr_static_ipd + 1) >
+ (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
+ path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
+ path_rec->pr_cca_divisor = 0;
+ } else {
+ path_rec->pr_active_ipd =
+ proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
+ path_rec->pr_cca_divisor =
+ proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;
+ }
+
+#if _HFI_DEBUGGING
+ if (_HFI_CCADBG_ON) {
+ _HFI_CCADBG_ALWAYS("CCA: %s injection rate to <%x.%x> from <%x.%x>\n",
+ (cct_increment > 0) ? "Decreasing" : "Increasing",
+ path_rec->pr_cca_divisor, path_rec->pr_active_ipd,
+ prev_divisor, prev_ipd);
+ }
+#endif
+
+ /* Reschedule CCA timer if this path is still marked as congested */
+ if (path_rec->pr_ccti > proto->cace[path_rec->pr_sl].ccti_min) {
+ /* Lazily allocate the per-path CCA timer from the pool. */
+ if (path_rec->pr_timer_cca == NULL) {
+ path_rec->pr_timer_cca =
+ (struct psmi_timer *)psmi_mpool_get(proto->
+ timer_pool);
+ psmi_assert(path_rec->pr_timer_cca != NULL);
+ psmi_timer_entry_init(path_rec->pr_timer_cca,
+ ips_cca_timer_callback, path_rec);
+ }
+ psmi_timer_request(proto->timerq,
+ path_rec->pr_timer_cca,
+ get_cycles() +
+ proto->cace[path_rec->pr_sl].
+ ccti_timer_cycles);
+ } else if (path_rec->pr_timer_cca) {
+ /* Back at the minimum ccti: return the timer to the pool. */
+ psmi_mpool_put(path_rec->pr_timer_cca);
+ path_rec->pr_timer_cca = NULL;
+ }
+
+ return PSM2_OK;
+}
+
+psm2_error_t
+ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current)
+{
+ ips_path_rec_t *pr = (ips_path_rec_t *) current_timer->context;
+
+ /* Still above the CCTI floor: raise the injection rate one step by
+ * decrementing CCTI; adjust_rate reschedules this timer if needed. */
+ if (pr->pr_ccti > pr->proto->cace[pr->pr_sl].ccti_min)
+ return ips_cca_adjust_rate(pr, -1);
+
+ /* Path has fully recovered: release the CCA timer back to the pool. */
+ psmi_mpool_put(pr->pr_timer_cca);
+ pr->pr_timer_cca = NULL;
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
new file mode 100644
index 0000000..00da753
--- /dev/null
+++ b/ptl_ips/ips_proto.h
@@ -0,0 +1,687 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_H
+#define _IPS_PROTO_H
+
+#include "psm_user.h"
+
+#include "ips_recvhdrq.h"
+#include "ips_tid.h"
+#include "ips_scb.h"
+#include "ips_epstate.h"
+#include "ips_spio.h"
+#include "ips_stats.h"
+#include "ips_proto_am.h"
+#include "ips_tidflow.h"
+#include "ips_path_rec.h"
+
+/* Path priority classes used to index per-priority path arrays. */
+typedef enum ips_path_type {
+ IPS_PATH_LOW_PRIORITY,
+ IPS_PATH_NORMAL_PRIORITY,
+ IPS_PATH_HIGH_PRIORITY,
+ IPS_PATH_MAX_PRIORITY /* count of classes; used as array dimension below */
+} ips_path_type_t;
+
+/*
+ * Local Endpoint info.
+ *
+ * Contains information necessary for composing packets for the local endpoint
+ */
+struct ips_epinfo {
+ uint16_t ep_base_lid;
+ uint8_t ep_baseqp;
+ uint8_t ep_lmc;
+ opa_rate ep_link_rate;
+ uint16_t ep_context;
+ uint16_t ep_subcontext;
+ uint16_t ep_hfi_type;
+ uint16_t ep_sl; /* HFI_SL only when path record not used */
+ uint16_t ep_mtu;
+ uint16_t ep_piosize;
+ uint16_t ep_pkey; /* PSM2_PKEY only when path record not used */
+ uint16_t ep_jkey;
+ uint64_t ep_timeout_ack; /* PSM2_ERRCHK_TIMEOUT if no path record */
+ uint64_t ep_timeout_ack_max; /* cap for exponential ack-timeout backoff */
+ uint32_t ep_timeout_ack_factor; /* multiplier for ack-timeout backoff */
+};
+
+/*
+ * Remote Endpoint info.
+ *
+ * Contains information necessary for composing packets for a remote endpoint
+ */
+#define IPS_MAX_PATH_LMC 3
+typedef struct ips_path_grp {
+ /* For LMC/Torus keep list of base and max dlid. Used for pkt verification */
+ uint16_t pg_base_lid;
+ uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY]; /* paths available per priority */
+ uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY]; /* presumably a rotation cursor per priority — confirm at use sites */
+ /* flexible tail: one row of per-priority path pointers per path */
+ ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY];
+} ips_path_grp_t;
+
+/*
+ * Control messages.
+ *
+ * ips low-level control messages to ensure reliability of eager packets.
+ *
+ */
+struct ips_proto;
+psm2_error_t ips_proto_init(const psmi_context_t *context, const struct ptl *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, const struct psmi_timer_ctrl *timerq, /* PTL's timerq */
+ const struct ips_epstate *epstate, /* PTL's epstate */
+ const struct ips_spio *spioc, /* PTL's spio control */
+ struct ips_proto *proto); /* output protocol */
+
+psm2_error_t ips_proto_fini(struct ips_proto *proto, int force,
+ uint64_t timeout);
+
+/*
+ * Control message structures
+ */
+#define CTRL_MSG_QEUEUE_SIZE 64 /* power of two */
+
+/* One queued control message (held while pio is busy). */
+struct ips_ctrlq_elem {
+ uint8_t message_type;
+ uint16_t *msg_queue_mask;
+ ips_scb_t msg_scb;
+};
+
+struct ips_ctrlq {
+ /* Queued control messages, queued when pio is busy */
+ struct ips_proto *ctrlq_proto; /* back pointer to owning proto */
+
+ uint32_t ctrlq_head;
+ uint32_t ctrlq_tail;
+ uint32_t ctrlq_overflow;
+
+ struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN;
+ struct psmi_timer ctrlq_timer; /* when in timerq */
+};
+
+/* Connect/disconnect, as implemented by ips */
+
+/*
+ * Connections are not pairwise but we keep a single 'epaddr' for messages-from
+ * and messages-to a remote 'epaddr'. State transitions for connecting TO and
+ * FROM 'epaddrs' are the following:
+ * Connect TO (Connect OUTGOING):
+ * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE
+ *
+ * Connect FROM (we receive a connect request - Connect INCOMING)
+ * NONE -> ESTABLISHED -> NONE
+ */
+#define CSTATE_ESTABLISHED 1
+#define CSTATE_NONE 2
+#define CSTATE_OUTGOING_DISCONNECTED 3
+#define CSTATE_OUTGOING_WAITING 4
+#define CSTATE_OUTGOING_WAITING_DISC 5
+
+psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep,
+ const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask,
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr,
+ uint64_t timeout_in);
+
+psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+ psm2_epaddr_t array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t array_of_errors[],
+ uint64_t timeout_in);
+
+int ips_proto_isconnected(struct ips_epaddr *ipsaddr);
+
+/*
+ * Pending operation structures
+ */
+/* One pending send operation queued for later processing. */
+struct ips_pend_sreq {
+ STAILQ_ENTRY(ips_pend_sreq) next;
+ psm2_mq_req_t req;
+ uint32_t type; /* one of the IPS_PENDSEND_* values below */
+};
+
+#define IPS_PENDSEND_EAGER_DATA 1
+#define IPS_PENDSEND_EAGER_REQ 2
+#define IPS_PENDSEND_EXP_TIDS 3
+#define IPS_PENDSEND_EXP_SENDS 4
+
+STAILQ_HEAD(ips_pendsendq, ips_pend_sreq);
+
+/* Per-proto queue of pending sends, flushed from a timer. */
+struct ips_pend_sends {
+ struct ips_proto *proto; /* back ptr */
+ struct psmi_timer timer;
+ struct ips_pendsendq pendq;
+};
+
+/*
+ * One instance of the protocol
+ */
+
+struct ips_protoexp;
+
+struct ips_proto_stats {
+ uint64_t pio_busy_cnt;
+ uint64_t writev_busy_cnt; /* DMA writev lacked room; send was rescheduled */
+ uint64_t writev_compl_eagain; /* DMA not locally complete at err_chk time */
+ uint64_t writev_compl_delay; /* had to sched_yield waiting for completion */
+ uint64_t scb_egr_unavail_cnt;
+ uint64_t scb_exp_unavail_cnt;
+ uint64_t hdr_overflow;
+ uint64_t egr_overflow;
+ uint64_t lid_zero_errs;
+ uint64_t unknown_packets;
+ uint64_t stray_packets;
+};
+
+/* Receive-side packet error counters. */
+struct ips_proto_error_stats {
+ uint64_t num_icrc_err;
+ uint64_t num_ecc_err;
+ uint64_t num_len_err;
+ uint64_t num_tid_err;
+ uint64_t num_dc_err;
+ uint64_t num_dcunc_err;
+ uint64_t num_khdrlen_err;
+};
+
+/*
+ * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init
+ */
+struct ips_proto_epaddr_stats {
+ uint64_t err_chk_send;
+ uint64_t err_chk_recv;
+ uint64_t nak_send;
+ uint64_t nak_recv;
+ uint64_t connect_req;
+ uint64_t disconnect_req;
+ uint64_t tids_grant_send;
+ uint64_t tids_grant_recv;
+ uint64_t send_rexmit;
+ uint64_t congestion_pkts; /* IB CCA FECN packets */
+};
+
+/* OPP support structure. */
+struct opp_api {
+ void *(*op_path_find_hca) (const char *name, void **device);
+ void *(*op_path_open) (void *device, int port_num);
+ void (*op_path_close) (void *context);
+ int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query,
+ ibta_path_rec_t *response);
+};
+
+struct ips_ibta_compliance_fn {
+ psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid,
+ uint16_t dlid, uint16_t desthfi_type,
+ unsigned long timeout,
+ ips_path_grp_t **ppathgrp);
+ psm2_error_t(*fini) (struct ips_proto *proto);
+};
+
+/* please don't change the flow id order -- the numeric values are used
+ * as indices (e.g. ips_epaddr.flows[]) and appear on the wire/headers. */
+typedef enum ips_epaddr_flow {
+ EP_FLOW_GO_BACK_N_PIO,
+ EP_FLOW_GO_BACK_N_DMA,
+ EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */
+ EP_FLOW_LAST /* Keep this the last endpoint flow */
+} ips_epaddr_flow_t;
+
+/* How payload leaves the node: programmed I/O or send DMA. */
+typedef enum psm_transfer_type {
+ PSM_TRANSFER_PIO,
+ PSM_TRANSFER_DMA,
+ PSM_TRANSFER_LAST /* Keep this the last transfer type */
+} psm_transfer_type_t;
+
+/* Reliability protocol carried by a flow. */
+typedef enum psm_protocol_type {
+ PSM_PROTOCOL_GO_BACK_N,
+ PSM_PROTOCOL_TIDFLOW,
+ PSM_PROTOCOL_LAST /* Keep this the last protocol type */
+} psm_protocol_type_t;
+
+/* Central per-endpoint protocol state: cached handles, send-side
+ * resources (SCB pools, SDMA completion ring), timers, statistics,
+ * CCA tables and path-record caches. One instance per ips ptl. */
+struct ips_proto {
+ struct ptl *ptl; /* cached */
+ psm2_ep_t ep; /* cached, for errors */
+ psm2_mq_t mq; /* cached, for mq handling */
+ int fd; /* cached, for writev ops */
+
+ /* Pending sends */
+ struct ips_pend_sends pend_sends;
+ struct ips_epstate *epstate;
+ struct psmi_timer_ctrl *timerq;
+
+ struct ips_protoexp *protoexp;
+ struct ips_scbctrl *scbc_rv;
+ struct ips_spio *spioc;
+ struct ips_scbctrl scbc_egr;
+ struct ips_epinfo epinfo;
+
+ /* SDMA completion ring: fill/done indices walk sdma_comp_queue;
+ * avail_counter tracks free slots (presumably -- confirm against
+ * ips_proto_dma_completion_update). */
+ ips_scb_t **sdma_scb_queue;
+ struct hfi1_sdma_comp_entry *sdma_comp_queue;
+ uint16_t sdma_queue_size;
+ uint16_t sdma_fill_index;
+ uint16_t sdma_done_index;
+ uint16_t sdma_avail_counter;
+
+ uint64_t timeout_send;
+ uint32_t flags; /* < if IPS_PROTO_FLAG_SDMA is NOT set, SPIO flow will be initialized
+ * < if IPS_PROTO_FLAG_SPIO is NOT set, SDMA flow will be initialized
+ * < so both flows (SDMA and PIO) will be initialized if both of the
+ * < IPS_PROTO_FLAG_S{DMA,PIO} are CLEARED
+ */
+ uint32_t iovec_thresh_eager;
+ uint32_t iovec_thresh_eager_blocking;
+ uint32_t psn_mask;
+ uint32_t scb_bufsize;
+ uint16_t flow_credits;
+ mpool_t pend_sends_pool;
+ mpool_t timer_pool;
+ struct ips_ibta_compliance_fn ibta;
+ struct ips_proto_stats stats;
+ struct ips_proto_error_stats error_stats;
+ struct ips_proto_epaddr_stats epaddr_stats;
+
+ struct ips_proto_am proto_am;
+
+ struct ips_ctrlq ctrlq;
+ /* pure sdma mode, use dma flow, otherwise, use pio flow */
+ ips_epaddr_flow_t msgflowid;
+
+ /* Handling tid errors */
+ uint32_t tiderr_cnt;
+ uint32_t tiderr_max;
+ uint64_t tiderr_tnext;
+ uint64_t tiderr_warn_interval;
+
+ uint64_t t_init;
+ uint64_t t_fini;
+ uint32_t runid_key;
+
+ int num_connected_outgoing;
+ int num_connected_incoming;
+ int num_disconnect_requests;
+
+ /* misc state variables. */
+
+ /* Smallest interval in cycles between which we warn about stray
+ * messages This is a per-endpoint quantity, overridable with
+ * PSM_STRAY_WARN_INTERVAL We use the same interval to send the "die"
+ * message.
+ */
+ uint64_t stray_warn_interval;
+ int done_warning;
+ int done_once;
+ int num_bogus_warnings;
+ struct {
+ uint32_t interval_secs;
+ uint64_t next_warning;
+ uint64_t count;
+ } psmi_logevent_tid_send_reqs;
+
+ /* SL2SC and SC2VL table for protocol */
+ uint16_t sl2sc[32];
+ uint16_t sc2vl[32];
+
+ /* CCA per port */
+ uint16_t *cct; /* cct table */
+ uint16_t ccti_size; /* ccti table size */
+ uint16_t ccti_limit; /* should be <= size-1 */
+
+ uint16_t ccti_portctrl; /* QP or SL CC */
+ uint32_t ccti_ctrlmap; /* map for valid sl */
+ struct cace { /* CACongestionEntry */
+ uint8_t ccti_increase; /* steps to increase */
+ /* uint16_t ccti_timer;*/ /* CCTI Timer in units of 1.024 usec */
+ uint64_t ccti_timer_cycles; /* converted from us_2_cycles() */
+ uint8_t ccti_threshold; /* threshold to make log */
+ uint8_t ccti_min; /* min value for ccti */
+ } cace[32]; /* 32 service levels */
+
+ /* Path record support */
+ uint8_t ips_ipd_delay[IBV_RATE_300_GBPS + 1];
+ struct hsearch_data ips_path_rec_hash;
+ struct hsearch_data ips_path_grp_hash;
+ void *opp_lib;
+ void *hndl;
+ void *device;
+ void *opp_ctxt;
+ struct opp_api opp_fn;
+
+#ifdef PSM_CUDA
+ struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg;
+ struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg;
+ mpool_t cuda_hostbuf_pool_send;
+ mpool_t cuda_hostbuf_pool_small_send;
+ cudaStream_t cudastream_send;
+ unsigned cuda_prefetch_limit;
+#endif
+ int ips_extra_sdmahdr_size;
+/*
+ * Control message queue for pending messages.
+ *
+ * Control messages are queued as pending when no PIO is available for sending
+ * the message. They are composed on the fly and do not need buffering.
+ *
+ * Variables here are write once (at init) and read afterwards (except the msg
+ * queue overflow counters).
+ */
+ uint32_t ctrl_msg_queue_overflow;
+ uint32_t ctrl_msg_queue_enqueue;
+ uint32_t message_type_to_index[256];
+#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)])
+
+ time_t writevFailTime;
+};
+
+/*
+ * Endpoint address, encapsulates per-endpoint protocol metadata
+ *
+ * Directly implements the ptl epaddr.
+ */
+typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed);
+
+/**
+ * ips_flow is a structure that combines all information regarding a send
+ * from one endpoint to another one. Specifically, it is the place where
+ * the Maximum Transmission Unit for a send is calculated, given how many
+ * factors could possibly influence the MTU calculation. See ips_flow_init
+ * documentation for more details.
+ */
+struct ips_flow {
+ SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */
+ ips_flow_flush_fn_t flush; /* flush function for this flow */
+
+ struct ips_epaddr *ipsaddr; /* back pointer, remote endpoint */
+ ips_path_rec_t *path; /* Path to use for flow */
+
+ uint16_t frag_size; /* < This flow's fragment size, calculated as the
+ < minimum of all relevant MTUs involved */
+
+ uint16_t flowid:2; /* flow id: pio(0) or dma(1) or tidflow(2) */
+ uint16_t transfer:3; /* spio or sdma */
+ uint16_t protocol:3; /* go-back-n or tidflow */
+ uint16_t flags:8; /* flow state flags */
+
+ uint16_t cca_ooo_pkts; /* cca out of order packets */
+ uint16_t cwin; /* Size of congestion window */
+ uint16_t ack_interval; /* interval to ack packets */
+ uint16_t ack_counter; /* counter to ack packets */
+ int16_t credits; /* Current credits available to send on flow */
+ uint32_t ack_index; /* Index of the last ACK message type in pending message queue */
+
+ psmi_seqnum_t xmit_seq_num; /* transmit packet sequence number */
+ psmi_seqnum_t xmit_ack_num; /* acked packet sequence number */
+ psmi_seqnum_t recv_seq_num; /* recieved packet sequence number */
+
+ psmi_timer *timer_send; /* timer for frames that got a busy PIO */
+ psmi_timer *timer_ack; /* timer for unacked frames */
+
+ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked; /* unacked queue */
+ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend; /* pending queue */
+
+#ifdef PSM_DEBUG
+ uint32_t scb_num_pending; /* pending scb counter */
+ uint32_t scb_num_unacked; /* unacked scb counter */
+#endif
+};
+
+#define IPS_FLOW_MSG_TOGGLE_OOO_MASK (1 << 0) /* ooo msg check */
+#define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK (1 << 1) /* unexp msg check */
+/*
+ * Make sure ips_epaddr_t and psm2_epaddr_t can be converted each other.
+ * (struct psm2_epaddr must remain the first member so casts work.)
+ */
+struct ips_epaddr {
+ struct psm2_epaddr epaddr; /* inlined psm level epaddr */
+ struct ips_msgctl *msgctl; /* ips level msg control */
+
+ struct ips_epaddr *next; /* circular linklist (see IPS_MCTXT_APPEND) */
+
+ struct ips_flow flows[EP_FLOW_LAST - 1]; /* pio and dma */
+ ips_path_grp_t *pathgrp; /* pointer to slid/dlid group in hash */
+
+ uint32_t connidx_outgoing; /* peer's connection idx */
+ uint32_t connidx_incoming; /* my connection idx */
+
+ uint16_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */
+ uint32_t window_rv; /* RNDV window size per connection */
+
+ uint8_t hpp_index; /* high priority index */
+ uint8_t context; /* real context value */
+ uint8_t subcontext; /* sub context, 3 bits, 5 bits for future */
+ uint8_t msg_toggle; /* only 2 bits used, 6 bits for future */
+
+ /* this portion is only for connect/disconnect */
+ uint64_t s_timeout; /* used as a time in close */
+ uint32_t runid_key; /* peer process pid */
+ uint32_t credit:2; /* credit to connect/disconnect: 0 or 1 */
+ uint32_t cstate_outgoing:3; /* connection state to, max 7 */
+ uint32_t cstate_incoming:3; /* connection state from, max 7 */
+ uint32_t delay_in_ms:8; /* disconnect delay in ms */
+ uint32_t cerror_outgoing:8; /* error code during connection */
+ uint32_t cerror_incoming:8; /* error code during connection */
+};
+
+/*
+ * ips_msgctl_t is per connection struct.
+ * Shared by all rails of one connection; holds the message sequence
+ * counters used for in-order delivery across rails.
+ */
+struct ips_msgctl {
+ struct ips_epaddr master_epaddr; /* Master rail's epaddr */
+
+ struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */
+ uint16_t mq_send_seqnum; /* next sending message sequence */
+ uint16_t mq_recv_seqnum; /* next receiving message sequence */
+ uint16_t am_send_seqnum; /* next sending message sequence */
+ uint16_t am_recv_seqnum; /* next receiving message sequence */
+ uint16_t ipsaddr_count; /* number of ipsaddr to use */
+ uint16_t outoforder_count; /* number of outoforder messages */
+};
+
+/* Insert 'node' into the circular singly-linked rail list just before
+ * 'head'. Requires that 'head' is already on a well-formed circular
+ * list (a single element points to itself); O(n) in list length. */
+static inline __attribute__ ((unused))
+void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node)
+{
+ ips_epaddr_t *cur;
+
+ /* The new node is inserted before head. */
+ node->next = head;
+
+ /* Circle around the linked list to head's predecessor and update. */
+ for (cur = head; cur->next != head; cur = cur->next);
+ cur->next = node;
+}
+
+/* Unlink 'node' from its circular rail list, leaving it self-linked
+ * (a valid one-element circular list). O(n) predecessor walk. */
+static inline __attribute__ ((unused))
+void IPS_MCTXT_REMOVE(ips_epaddr_t *node)
+{
+ ips_epaddr_t *cur;
+
+ /* Circle around to node's predecessor and update. */
+ for (cur = node; cur->next != node; cur = cur->next);
+ cur->next = node->next;
+ node->next = node;
+}
+
+/*
+ * Initialize a flow, setting its attributes. Selects the path the flow will
+ * use as well as calculates the flow's fragment size defined as:
+ * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends
+ * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends
+ */
+void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index);
+MOCK_DCL_EPILOGUE(ips_flow_init);
+
+void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr,
+ struct ips_flow *flow);
+
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb);
+MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue);
+
+psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed);
+psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed);
+
+/* Wrapper for enqueue + flush */
+psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb);
+
+void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb);
+psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, int *nflushed);
+psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb);
+psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto);
+
+psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow, ips_scb_t *scb,
+ void *payload, uint32_t paylen,
+ uint32_t have_cksum, uint32_t cksum);
+
+/*
+ * Protocol receive processing
+ *
+ */
+/* Error handling for unknown packet, packet is unknown when epid doesn't match
+ * in epstate table */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev);
+/* Exposed for fastpath only */
+int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev);
+/* Handling error cases */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * Protocol exception handling and frame dumps
+ */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len);
+void ips_proto_dump_err_stats(struct ips_proto *proto);
+void ips_proto_show_rhf_errors(const uint32_t *rhdr);
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg);
+void ips_proto_dump_frame(void *frame, int lenght, char *message);
+void ips_proto_dump_data(void *data, int data_length);
+void ips_proto_dump_eager(uint32_t *curr_rcv_hdr);
+
+/*
+ * Checksum of ips packets
+ */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);
+
+/*
+ * Matched-Queue processing and sends
+ */
+psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto,
+ psm2_mq_req_t req);
+psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto,
+ psm2_mq_req_t req);
+int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev);
+void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl);
+int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev);
+
+psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag,
+ const void *ubuf, uint32_t len);
+
+psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr,
+ uint32_t flags, psm2_mq_tag_t *tag,
+ const void *ubuf, uint32_t len, void *context,
+ psm2_mq_req_t *req_o);
+
+#define IPS_NON_DW_MUL_NOT_ALLOWED 0
+#define IPS_NON_DW_MUL_ALLOWED 1
+void ips_proto_mq_set_non_dw_mul_sdma(uint32_t mode);
+
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * IPS packet service routine table.
+ */
+typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev);
+extern ips_packet_service_fn_t
+ ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED];
+
+/* IBTA feature related functions (path record, sl2sc2vl etc.) */
+psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto);
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto);
+
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto);
+MOCK_DCL_EPILOGUE(ips_ibta_init);
+
+psm2_error_t ips_ibta_fini(struct ips_proto *proto);
+
+/* Return the address of the sdma_req_info that precedes the scb's PBC
+ * in memory. With GPUDirect enabled, an additional sdmahdr_extra_bytes
+ * gap is reserved between the req info and the PBC; otherwise the extra
+ * argument is ignored. */
+PSMI_ALWAYS_INLINE(
+void* psmi_get_sdma_req_info(struct ips_scb *scb, int sdmahdr_extra_bytes))
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
+ return (void *)(((char*)&scb->pbc) - sizeof(struct sdma_req_info_v6_3) -
+ sdmahdr_extra_bytes);
+#endif
+ return (void *)(((char*)&scb->pbc) - sizeof(struct sdma_req_info_v6_3));
+}
+
+#ifdef PSM_CUDA
+/* Size of the next transfer window: the remaining bytes (len - offset)
+ * clamped to max_window. Caller must ensure offset <= len. */
+PSMI_ALWAYS_INLINE(
+uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset,
+ uint32_t len))
+{
+ uint32_t window_len;
+ window_len = len - offset;
+ if (window_len >= max_window)
+ window_len = max_window;
+ return window_len;
+}
+#endif
+
+#endif /* _IPS_PROTO_H */
diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c
new file mode 100644
index 0000000..98a7460
--- /dev/null
+++ b/ptl_ips/ips_proto_am.c
@@ -0,0 +1,595 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Token handed to AM handlers; wraps the generic psmi token plus the
+ * rail and proto_am needed to send a reply on the same rail. */
+struct ips_am_token {
+ struct psmi_am_token tok;
+
+ /* ptl-specific token stuff */
+ struct ips_epaddr *epaddr_rail;
+ struct ips_proto_am *proto_am;
+};
+
+/* A buffered out-of-order AM: copied header + sysbuf payload, queued
+ * until its seqnum matches the endpoint's am_recv_seqnum. */
+struct ips_am_message {
+ struct ips_message_header p_hdr;
+ struct ips_am_message *next;
+ struct ips_epaddr *ipsaddr;
+ struct ips_proto_am *proto_am;
+ uint64_t *payload;
+ uint32_t paylen;
+ uint16_t seqnum;
+};
+
+/* These variables are shared for all packet flows in a PSM process; they are
+ * shared across multiple rails. There is no single AM object to hang these
+ * off of, so they are declared here as globals. */
+static struct {
+ struct ips_am_message head; /* sentinel; real list starts at head.next */
+ struct ips_am_message *tail;
+} ips_am_outoforder_q;
+
+static mpool_t ips_am_msg_pool;
+
+/* This calculation ensures that the number of reply slots will always be at
+ * least twice as large + 1 as the number of request slots. This is optimal: the
+ * minimum amount required is actually only twice as many, but it is much
+ * slower. */
+#define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1)
+
+/* Initialize AM state for one proto: split num_send_slots into request
+ * and reply scb pools (replies get the larger share, see
+ * calc_optimal_num_reply_slots) and lazily create the process-wide
+ * out-of-order message pool on first call. Returns PSM2_OK or the
+ * scbctrl init error. */
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+ int num_send_slots,
+ uint32_t imm_size,
+ struct ips_proto_am *proto_am)
+{
+ psm2_error_t err = PSM2_OK;
+ int send_buf_size = proto->ep->context.ctrl->__hfi_piosize;
+ int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots);
+ int num_req_slots = num_send_slots - num_rep_slots;
+
+ proto_am->proto = proto;
+
+ /* In a node pair, the number of reply send buffers on at least one of
+ * the nodes must be at least double the number (optimal: double + 1) of
+ * send descriptors on the other node. While this constraint applies
+ * only to the reply send buffers, allowing the caller to tune only the
+ * number of request send buffers would be awkward, as they have no
+ * knowledge of the subdivision of the memory into separate mempools for
+ * requests and replies. It's an internal concern at this point. */
+ if ((err = ips_scbctrl_init(&proto->ep->context,
+ num_req_slots,
+ num_req_slots,
+ imm_size,
+ send_buf_size,
+ NULL,
+ NULL,
+ &proto_am->scbc_request)))
+ goto fail;
+
+ /* NOTE(review): if this second init fails, scbc_request is not
+ * torn down here -- presumably the caller's error path handles
+ * overall cleanup; confirm. */
+ if ((err = ips_scbctrl_init(&proto->ep->context,
+ num_rep_slots,
+ num_rep_slots,
+ imm_size,
+ send_buf_size,
+ NULL,
+ NULL,
+ &proto_am->scbc_reply)))
+ goto fail;
+
+ /* First proto to initialize creates the shared OOO queue + pool. */
+ if (ips_am_msg_pool == NULL) {
+ union psmi_envvar_val max_msgs;
+
+ ips_am_outoforder_q.head.next = NULL;
+ ips_am_outoforder_q.tail = &ips_am_outoforder_q.head;
+
+ psmi_getenv("PSM2_AM_MAX_OOO_MSGS",
+ "Maximum number of OOO Active Messages to queue before dropping.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)1024, &max_msgs);
+
+ ips_am_msg_pool = psmi_mpool_create(
+ sizeof(struct ips_am_message),
+ 32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL);
+ }
+fail:
+ return err;
+}
+MOCK_DEF_EPILOGUE(ips_proto_am_init);
+
+/* Tear down AM state: free both scb pools and, if still present, the
+ * process-wide OOO message pool (NULLed so a later init re-creates it).
+ * Always returns PSM2_OK. */
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am)
+{
+ ips_scbctrl_fini(&proto_am->scbc_request);
+ ips_scbctrl_fini(&proto_am->scbc_reply);
+ if (ips_am_msg_pool != NULL) {
+ psmi_mpool_destroy(ips_am_msg_pool);
+ ips_am_msg_pool = NULL;
+ }
+
+ return PSM2_OK;
+}
+
+/* Fill in AM capabilities parameters */
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+ int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS);
+ int max_payload =
+ ep->context.ctrl->__hfi_piosize -
+ ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t));
+
+ if (parameters == NULL) {
+ return PSM2_PARAM_ERR;
+ }
+
+ parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS;
+ parameters->max_nargs = max_nargs;
+ parameters->max_request_short = max_payload;
+ parameters->max_reply_short = max_payload;
+
+ return PSM2_OK;
+}
+
+/* Common send path for AM requests and replies.
+ * Packs args into the header qwords, then places the payload either
+ * inline in the remaining header space (AMISTINY), or in the scb buffer
+ * (copied, or attached when the buffer slot is empty). Finally stamps
+ * the opcode + AM sequence number and enqueues/flushes on the message
+ * flow. Always returns PSM2_OK. */
+static
+psm2_error_t
+am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr,
+ psm2_amarg_t *args, int nargs, uint8_t opcode,
+ void *src, size_t len, int flags, int pad_bytes)
+{
+ int i, hdr_qwords = IPS_AM_HDR_NARGS;
+ struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+
+ /* There are a limited number of bits for nargs in the header, making
+ overflow very easy. Make sure the values match. */
+ psmi_assert(nargs == scb->ips_lrh.amhdr_nargs);
+
+ _HFI_VDBG("%s src=%p len=%d, nargs=%d\n",
+ ((opcode == OPCODE_AM_REQUEST) ||
+ (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? "req" : "rep",
+ src, (int)len, nargs);
+
+ if (nargs == 1) { /* fastpath */
+ scb->ips_lrh.data[0].u64w0 = args[0].u64w0;
+ hdr_qwords--;
+ } else if (nargs > 1) {
+ /* Easily unrollable but leave as is in case we can increase
+ * qwords on the chip in the near future */
+ for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--)
+ scb->ips_lrh.data[i].u64w0 = args[i].u64w0;
+
+ if (nargs > IPS_AM_HDR_NARGS) {
+ /* Slow case -- we don't have iovec and not enough
+ * space in the message header, so we have to copy the
+ * user's arguments even if the payload is marked ASYNC
+ */
+ uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb);
+ size_t arg_payload_len =
+ sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS);
+
+ psmi_mq_mtucpy((void *)bufp,
+ &args[IPS_AM_HDR_NARGS],
+ arg_payload_len);
+ bufp += arg_payload_len;
+ scb->payload_size = arg_payload_len;
+
+ if (src != NULL && len > 0) {
+ psmi_mq_mtucpy((void *)bufp, src, len);
+ scb->payload_size += len;
+ }
+
+ /* amhdr_len carries only the dword padding here; the
+ * receiver subtracts it from the payload length. */
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+ scb->payload_size += pad_bytes;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ goto send_scb;
+ }
+ }
+
+ if (len == 0) {
+ scb->payload_size = 0;
+ scb->ips_lrh.amhdr_len = 0;
+ } else if (len <= (hdr_qwords << 3)) {
+ /* Inline the payload into the header. */
+ /* This path CANNOT handle length = 0 due to limited space
+ in the header. If IPS_SEND_FLAG_AMISTINY is set, an
+ amhdr_len value of 0 means a full payload, i.e.
+ 1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */
+ psmi_assert(len > 0);
+
+ psmi_mq_mtucpy(&scb->ips_lrh.
+ data[IPS_AM_HDR_NARGS - hdr_qwords], src, len);
+ scb->payload_size = 0;
+ psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS));
+ scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1);
+ scb->flags |= IPS_SEND_FLAG_AMISTINY;
+ } else { /* Whatever's left requires a separate payload */
+ if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */
+ ips_scb_buffer(scb) = src;
+ else /* May need to re-xmit user data, keep it around */
+ psmi_mq_mtucpy(ips_scb_buffer(scb), src, len);
+
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+ scb->payload_size = len + pad_bytes;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ }
+
+send_scb:
+ ips_scb_opcode(scb) = opcode;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++;
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ return PSM2_OK;
+}
+
+/* Number of bytes (0..3) needed to round len up to a dword boundary. */
+static inline int
+calculate_pad_bytes(size_t len)
+{
+ /* Align to dword (4 bytes) */
+ size_t dword_aligned_len = (len + 3) & ~3;
+ return dword_aligned_len - len;
+}
+
+/* Populate the AM fields of a freshly-allocated scb: handler index,
+ * arg count, pad byte count (provisional amhdr_len -- am_short_reqrep
+ * may overwrite it) and the completion callback. An ACK is requested
+ * whenever a completion callback is supplied. */
+static inline
+void
+ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs,
+ int pad_bytes,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+ psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+
+ scb->completion_am = completion_fn;
+ scb->cb_param = completion_ctxt;
+ scb->ips_lrh.amhdr_hidx = handler;
+ scb->ips_lrh.amhdr_len = pad_bytes;
+ scb->ips_lrh.amhdr_nargs = nargs;
+ scb->ips_lrh.flags = 0;
+ if (completion_fn)
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ return;
+}
+
+/* Send a short AM request.
+ * Sizes the scb from nargs (and len unless PSM2_AM_FLAG_ASYNC defers
+ * the payload), blocking until an scb is available, then round-robins
+ * across rails and hands off to am_short_reqrep. NOREPLY requests use
+ * a distinct opcode so the peer knows no reply slot is needed. */
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ struct ips_proto_am *proto_am = &epaddr->proto->proto_am;
+ psm2_error_t err;
+ ips_scb_t *scb;
+ ips_epaddr_t *ipsaddr;
+ int pad_bytes = calculate_pad_bytes(len);
+ int payload_sz = (nargs << 3);
+
+ if_pt(!(flags & PSM2_AM_FLAG_ASYNC))
+ payload_sz += len;
+
+ if (payload_sz > (IPS_AM_HDR_NARGS << 3)) {
+ /* Payload can't fit in header, allocate buffer to carry data */
+ int arg_sz = (nargs > IPS_AM_HDR_NARGS) ?
+ ((nargs - IPS_AM_HDR_NARGS) << 3) : 0;
+
+ /* len + pad_bytes + overflow_args */
+ PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+ err,
+ ((scb = ips_scbctrl_alloc(
+ &proto_am->scbc_request,
+ 1,
+ len + pad_bytes + arg_sz,
+ IPS_SCB_FLAG_ADD_BUFFER)) != NULL));
+ } else {
+ PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+ err,
+ ((scb = ips_scbctrl_alloc_tiny(
+ &proto_am->scbc_request)) != NULL));
+ }
+
+ psmi_assert_always(scb != NULL);
+ ips_am_scb_init(scb, handler, nargs, pad_bytes,
+ completion_fn, completion_ctxt);
+
+ /* Select the next ipsaddr for multi-rail */
+ ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+
+ return am_short_reqrep(scb, ipsaddr, args,
+ nargs,
+ (flags & PSM2_AM_FLAG_NOREPLY) ?
+ OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST,
+ src, len, flags, pad_bytes);
+}
+
+/* Send a short AM reply from inside a request handler.
+ * Only legal when the token came from OPCODE_AM_REQUEST (can_reply);
+ * the reply scb is drawn from scbc_reply, whose availability was
+ * guaranteed before the request handler ran (see ips_proto_am), so the
+ * allocation here is expected to succeed. The reply is sent on the
+ * same rail the request arrived on. */
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+ struct ips_am_token *token = (struct ips_am_token *)tok;
+ struct ips_proto_am *proto_am = token->proto_am;
+ struct ips_epaddr *ipsaddr = token->epaddr_rail;
+ int pad_bytes = calculate_pad_bytes(len);
+ int scb_flags = 0;
+ ips_scb_t *scb;
+
+ if (!token->tok.can_reply) {
+ _HFI_ERROR("Invalid AM reply for request!");
+ return PSM2_AM_INVALID_REPLY;
+ }
+
+ psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply));
+
+ if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) {
+ scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply);
+ } else {
+ int payload_sz = (nargs << 3);
+
+ payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ?
+ 0 : (len + pad_bytes);
+ scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ?
+ IPS_SCB_FLAG_ADD_BUFFER : 0;
+
+ scb =
+ ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz,
+ scb_flags);
+ }
+
+ psmi_assert_always(scb != NULL);
+ ips_am_scb_init(scb, handler, nargs, pad_bytes,
+ completion_fn, completion_ctxt);
+ am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY,
+ src, len, flags, pad_bytes);
+ return PSM2_OK;
+}
+
+/* Prepares and runs a handler from a receive event. */
+static int
+ips_am_run_handler(const struct ips_message_header *p_hdr,
+ struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am,
+ uint64_t *payload,
+ uint32_t paylen)
+{
+ struct ips_am_token token;
+ int nargs = p_hdr->amhdr_nargs;
+ psm2_am_handler_fn_t hfn;
+ psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data;
+
+ token.tok.flags = p_hdr->flags;
+ token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr;
+ token.tok.can_reply =
+ (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST);
+ token.epaddr_rail = ipsaddr;
+ token.proto_am = proto_am;
+
+ if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) {
+ /* Payload is packed into header after args */
+ payload = (uint64_t *)&p_hdr->data[nargs].u64;
+ paylen = p_hdr->amhdr_len;
+ /* Interpret amhdr_len == 0 as 16 bytes of payload */
+ if (paylen == 0)
+ paylen = 1 << IPS_AM_HDR_LEN_BITS;
+ } else {
+ if (nargs > IPS_AM_HDR_NARGS) {
+ /* Args are split across header and payload */
+ int payload_args_len =
+ (nargs - IPS_AM_HDR_NARGS) *
+ sizeof(psm2_amarg_t);
+
+ args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t));
+
+ args[0].u64 = p_hdr->data[0].u64;
+ args[1].u64 = p_hdr->data[1].u64;
+
+ memcpy(&args[2], payload, payload_args_len);
+
+ payload += nargs - IPS_AM_HDR_NARGS;
+ paylen -= payload_args_len;
+ }
+
+ /* Subtract off padding bytes (dword padding) for non-TINY. */
+ paylen -= p_hdr->amhdr_len;
+ }
+
+ hfn = psm_am_get_handler_function(proto_am->proto->ep,
+ p_hdr->amhdr_hidx);
+
+ int ret = hfn(&token, args, nargs, payload, paylen);
+ return ret;
+}
+
+/* Drain the shared out-of-order queue: run (and unlink/free) every
+ * queued message whose seqnum now matches its endpoint's expected
+ * am_recv_seqnum; leave the rest queued. Returns IPS_RECVHDRQ_BREAK if
+ * any handler asked to break, else IPS_RECVHDRQ_CONTINUE. */
+static int
+ips_proto_am_handle_outoforder_queue()
+{
+ struct ips_am_message *msg, *prev;
+ int ret = IPS_RECVHDRQ_CONTINUE;
+
+ prev = &ips_am_outoforder_q.head;
+ msg = ips_am_outoforder_q.head.next;
+
+ while (msg != NULL) {
+ struct ips_epaddr *ipsaddr = msg->ipsaddr;
+ if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) {
+ prev = msg;
+ msg = msg->next;
+ continue;
+ }
+
+ ipsaddr->msgctl->am_recv_seqnum++;
+
+ if (ips_am_run_handler(&msg->p_hdr,
+ ipsaddr, msg->proto_am,
+ msg->payload, msg->paylen))
+ ret = IPS_RECVHDRQ_BREAK;
+
+ /* Unlink; keep the tail pointer valid if we removed the
+ * last element. */
+ prev->next = msg->next;
+ if (prev->next == NULL)
+ ips_am_outoforder_q.tail = prev;
+
+ psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload);
+ psmi_mpool_put(msg);
+
+ msg = prev->next;
+ }
+
+ return ret;
+}
+
+/* Append a buffered out-of-order message to the shared FIFO queue. */
+static void
+ips_proto_am_queue_msg(struct ips_am_message *msg)
+{
+ msg->next = NULL;
+ ips_am_outoforder_q.tail->next = msg;
+ ips_am_outoforder_q.tail = msg;
+}
+
+/* Receive-side entry point for AM packets (installed in the packet
+ * service table). Enforces message ordering: in-order messages run
+ * their handler immediately and then drain the OOO queue; future
+ * messages are either revisited later or copied into the OOO queue.
+ * Returns an IPS_RECVHDRQ_* code to the receive loop. */
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ struct ips_am_message *msg = NULL;
+ int ret = IPS_RECVHDRQ_CONTINUE;
+ enum ips_msg_order msgorder;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ /*
+ * Based on AM request/reply traffic pattern, if we don't have a reply
+ * scb slot then we can't process the request packet, we just silently
+ * drop it. Otherwise, it will be a deadlock. note:
+ * ips_proto_is_expected_or_nak() can not be called in this case.
+ */
+ if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST &&
+ !ips_scbctrl_avail(&proto_am->scbc_reply))
+ return IPS_RECVHDRQ_CONTINUE;
+
+ if (!ips_proto_is_expected_or_nak(rcv_ev))
+ return IPS_RECVHDRQ_CONTINUE;
+
+ uint16_t send_msgseq =
+ __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+ msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq,
+ &ipsaddr->msgctl->am_recv_seqnum);
+
+ if (msgorder == IPS_MSG_ORDER_FUTURE)
+ return IPS_RECVHDRQ_REVISIT;
+ else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+ /* Packet-order OK but message arrived early: buffer a copy
+ * (header + payload) on the shared OOO queue. */
+ uint64_t *msg_payload;
+ uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(paylen == 0 || payload);
+ msg = psmi_mpool_get(ips_am_msg_pool);
+ if (unlikely(msg == NULL)) {
+ /* Out of memory, drop the packet; rewinding
+ * recv_seq_num forces the peer to resend it. */
+ flow->recv_seq_num.psn_num =
+ (flow->recv_seq_num.psn_num - 1) &
+ rcv_ev->proto->psn_mask;
+ return IPS_RECVHDRQ_BREAK;
+ }
+ msg_payload = psmi_mq_sysbuf_alloc(
+ msg->proto_am->proto->mq,
+ ips_recvhdrq_event_paylen(rcv_ev));
+ if (unlikely(msg_payload == NULL)) {
+ /* Out of memory, drop the packet. */
+ flow->recv_seq_num.psn_num =
+ (flow->recv_seq_num.psn_num - 1) &
+ rcv_ev->proto->psn_mask;
+ psmi_mpool_put(msg);
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header));
+ memcpy(msg_payload, payload, paylen);
+
+ msg->payload = msg_payload;
+ msg->ipsaddr = ipsaddr;
+ msg->proto_am = proto_am;
+ msg->paylen = paylen;
+ msg->seqnum =
+ __le32_to_cpu(p_hdr->khdr.kdeth0) &
+ HFI_KHDR_MSGSEQ_MASK;
+
+ ips_proto_am_queue_msg(msg);
+ } else if ((msgorder == IPS_MSG_ORDER_EXPECTED) ||
+ (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) {
+ uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(paylen == 0 || payload);
+ if (ips_am_run_handler(p_hdr, ipsaddr, proto_am,
+ payload, paylen))
+ ret = IPS_RECVHDRQ_BREAK;
+
+ /* This message may have unblocked queued successors. */
+ ips_proto_am_handle_outoforder_queue();
+ }
+
+ /* Look if the handler replied, if it didn't, ack the request */
+ if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+ (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+ ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+ ips_proto_process_ack(rcv_ev);
+ return ret;
+}
diff --git a/ptl_ips/ips_proto_am.h b/ptl_ips/ips_proto_am.h
new file mode 100644
index 0000000..3e0a271
--- /dev/null
+++ b/ptl_ips/ips_proto_am.h
@@ -0,0 +1,93 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_AM_H
+#define _IPS_PROTO_AM_H
+
+#include "psm_user.h"
+#include "ips_scb.h"
+
+/*
+ * Per-proto active-message (AM) state: a back pointer to the owning
+ * ips_proto plus two scb (send control block) controllers, named for
+ * outgoing AM requests and AM replies respectively.
+ */
+struct ips_proto_am {
+ struct ips_proto *proto; /* back pointer */
+ struct ips_scbctrl scbc_request;
+ struct ips_scbctrl scbc_reply;
+};
+
+/* Report the AM limits (arg count / message sizes) for endpoint 'ep'. */
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+/* Send a short AM reply for the request identified by token 'tok'. */
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt);
+
+/* Send a short AM request to the peer at 'epaddr'. */
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* Initialize 'proto_am'; MOCKABLE for unit testing (psm2_mock_testing.h). */
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+ int num_send_slots,
+ uint32_t imm_size,
+ struct ips_proto_am *proto_am);
+MOCK_DCL_EPILOGUE(ips_proto_am_init);
+
+/* Release resources held by 'proto_am'. */
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am);
+
+#endif /* _IPS_PROTO_AM_H */
diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c
new file mode 100644
index 0000000..e537d10
--- /dev/null
+++ b/ptl_ips/ips_proto_connect.c
@@ -0,0 +1,1551 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * define connection version. this is the basic version, optimized
+ * version will be added later for scalability.
+ */
+#define IPS_CONNECT_VERNO 0x0001
+
+/*
+ * Wire header common to all connect-protocol messages (connect and
+ * disconnect, request and reply). Disconnects carry only this header.
+ */
+struct ips_connect_hdr {
+ uint16_t connect_verno; /* should be ver 1 */
+ uint16_t psm_verno; /* should be 2.0 */
+ uint32_t connidx; /* ignore if 0xffffffff */
+ uint64_t epid; /* epid of connector process */
+};
+
+/*
+ * Full wire payload for connect request/reply. The first four fields
+ * must remain layout-identical to ips_connect_hdr (see comment below)
+ * so the receiver can decode the common header before dispatching.
+ * Additional (gid_hi, epid) uint64 pairs for extra rails may follow
+ * this struct in the packet payload.
+ */
+struct ips_connect_reqrep {
+ uint16_t connect_verno; /* should be ver 1 */
+ uint16_t psm_verno; /* should be 2.0 */
+ uint32_t connidx; /* ignore if 0xffffffff */
+ uint64_t epid; /* epid of connector process */
+ /* above should be same as ips_connect_hdr */
+
+ uint16_t connect_result; /* error code */
+ uint16_t sl; /* service level for matching */
+ uint16_t mtu; /* receive payload */
+ uint16_t job_pkey; /* partition key for verification */
+
+ uint32_t runid_key; /* one-time stamp connect key */
+ uint32_t initpsn; /* initial psn for flow */
+
+ char hostname[128]; /* sender's hostname string */
+};
+
+/* Startup protocol in PSM/IPS
+ *
+ * Start timer.
+ *
+ * For all nodes to connect to:
+ * Grab connect lock
+ * Look up epid in table
+ * MATCH.
+ * assert cstate_outgoing != CONNECT_WAITING (no re-entrancy)
+ * If cstate_outgoing == CONNECT_DONE
+ * return the already connected address.
+ * else
+ * assert cstate_outgoing == CONNECT_NONE
+ * assert cstate_incoming == CONNECT_DONE
+ * cstate_outgoing := CONNECT_WAITING
+ * assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN
+ * req->connidx := epaddr->connidx_incoming
+ * add to list of pending connect.
+ * NO MATCH
+ * allocate epaddr and put in table
+ * cstate_outgoing := CONNECT_WAITING
+ * cstate_incoming := CONNECT_NONE
+ * connidx_outgoing := UNKNOWN
+ * req->connidx := epaddr->connidx_incoming := NEW connidx integer
+ * add to list of pending connect
+ * Release connect lock
+ *
+ * expected_connect_count = ep->total_connect_count + num_to_connect
+ * while (expected_connect_count != ep->total_connect_count)
+ * check for timeout
+ * progress();
+ *
+ * For all connection requests received (within progress loop)
+ * If uuid doesn't match, NAK the connect and skip request
+ * Grab connect lock
+ * Lock up epid in table
+ * MATCH
+ * if cstate_incoming == CONNECT_DONE
+ * req->connidx := epaddr->connidx_incoming
+ * compose reply and send again (this is a dupe request).
+ * else
+ * assert cstate_incoming == CONNECT_NONE
+ * assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE)
+ * cstate_incoming := CONNECT_DONE
+ * epaddr->connidx_outgoing := req->connidx
+ * req->connidx := epaddr->connidx_incoming
+ * NO MATCH
+ * allocate epaddr and put in table
+ * cstate_incoming := CONNECT_DONE
+ * epaddr->connidx_outgoing = req->connidx;
+ * rep->connidx := epaddr->connidx_incoming := NEW connidx integer
+ * compose connect reply and send
+ * Release connect lock
+ *
+ * For all connection replies received:
+ * If connect_result != 0, process error and skip.
+ * assert cstate_outgoing == CONNECT_WAITING
+ * if cstate_incoming == CONNECT_DONE
+ * assert rep->connidx == epaddr->connidx_outgoing
+ * else
+ * epaddr->connidx_outgoing := rep->connidx
+ * cstate_outgoing := CONNECT_DONE
+ * ep->total_connect_count ++
+ *
+ * * Fill in a connection request:
+ * 1. Set connect protocol version and PSM versions
+ * 2. Set the uuid attached to current endpoint and add the job_pkey
+ * the node wishes to communicate post-connect.
+ * 3. Set our mtu, bitwidth and endianess to detect inconsistencies
+ *
+ */
+
+/**
+ * Configure flows for an ipsaddr.
+ *
+ * @arg ipsaddr - the ipsaddr to configure the flows for
+ * @arg proto - the protocol used
+ *
+ * @pre proto's flags must be set
+ *
+ * Flows should be configured:
+ * - immediately upon creation of an ipsaddr
+ * - whenever a connection is established and the receiver's characteristics
+ * (e.g. mtu) become known
+ */
+/* (Re)initializes both go-back-N flows; called at ipsaddr creation and
+ * again once the connect handshake has established the peer's MTU (see
+ * block comment above). */
+ustatic
+void
+ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+ /* PIO flow uses the normal priority path, to separate low
+ * priority path for bulk sdma data packets
+ */
+ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto,
+ ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO);
+
+ /* DMA flow uses the low priority path, multi MTU sized eager
+ * message uses the same flow to transfer to avoid out of order.
+ */
+ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], proto,
+ ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_DMA);
+}
+
+/*
+ * Teardown any unnecessary timers that could still be active and assign NULL
+ * to pointers in flow structs. We do this mainly for PIO and DMA flows.
+ * TidFlow teardowns are conducted in ips_protoexp_fini()
+ */
+static
+void
+ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+ struct ips_flow *flow;
+ int i;
+
+ /* Only the PIO/DMA flows (indices below EP_FLOW_TIDFLOW) are torn
+ * down here; TidFlow teardown happens in ips_protoexp_fini() (see
+ * block comment above). */
+ for (i = 0; i < EP_FLOW_TIDFLOW; i++) {
+ flow = &ipsaddr->flows[i];
+
+ /* Cancel any stale flow->timers in flight */
+ if (flow->timer_ack) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ }
+
+ if (flow->timer_send) {
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+ }
+
+ /* Clear back references so a stale flow pointer cannot reach
+ * state that is about to be freed. */
+ flow->flush = NULL;
+ flow->path = NULL;
+ flow->ipsaddr = NULL;
+ }
+}
+
+/* Forward declaration; implementation below. */
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+ const char *hostname, uint16_t hfi_type, unsigned long timeout);
+
+/*
+ * Given a connection request, set mtu, communication index and hdr length
+ * parameters.
+ *
+ * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu
+ * is our connecting peer's declared mtu (which may not be the same as our
+ * mtu). The approach is to take the smaller of both mtus when communicating
+ * with that peer. Also, when using pio, the size can be further restricted by
+ * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers).
+ */
+static
+psm2_error_t
+ips_ipsaddr_set_req_params(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr,
+ const struct ips_connect_reqrep *req,
+ uint32_t paylen)
+{
+ psm2_ep_t ep;
+ psm2_epaddr_t epaddr;
+ psm2_error_t err = PSM2_OK;
+ int i, start, count;
+ uint64_t *data;
+ psmi_assert_always(req->mtu > 0);
+ /* Use the smaller of the peer's advertised MTU and our own. */
+ uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu);
+ int ptype, pidx;
+
+ /*
+ * Make RNDV window size being dependent on MTU size;
+ * This is due to fact that number of send packets
+ * within a given window must not exceed 2048 (@ref PSM_TID_MAX_PKTS).
+ * Use smaller of two values:
+ * unified MTU * PSM_TID_MAX_PKTS vs already configured window size.
+ */
+ ipsaddr->window_rv = min(common_mtu * PSM_TID_MAX_PKTS, proto->mq->hfi_base_window_rv);
+
+ /*
+ * For static routes i.e. "none" path resolution update all paths to
+ * have the same profile (mtu, sl etc.).
+ *
+ * For path record queries the epr_mtu and epr_sl are setup correctly
+ * from the path itself.
+ */
+ for (ptype = IPS_PATH_LOW_PRIORITY;
+ ptype < IPS_PATH_MAX_PRIORITY; ptype++)
+ for (pidx = 0;
+ pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) {
+ if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) {
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+ common_mtu;
+ } else {
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+ min(common_mtu,
+ ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu);
+ }
+ }
+
+ /*
+ * We've got updated mtu/path records, need to re-initialize the flows to take
+ * into account _real_ (updated) remote endpoint characteristics
+ */
+ ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+ /*
+ * Save peer's info.
+ */
+ ipsaddr->connidx_outgoing = req->connidx;
+ ipsaddr->runid_key = req->runid_key;
+ /* ipsaddr->initpsn = req->initpsn; */
+
+ err =
+ psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid),
+ (char *)req->hostname, 0);
+ if (err)
+ return err;
+
+ /*
+ * Check if there is other rails to setup.
+ */
+ paylen -= sizeof(struct ips_connect_reqrep);
+ if (paylen == 0)
+ return PSM2_OK;
+
+ /*
+ * Yes, other rail's gid/epid is attached.
+ * The trailing payload is a flat array of (gid_hi, epid) uint64
+ * pairs, one pair per additional rail; reject anything that is not
+ * an exact multiple of a pair, or more pairs than HFI_MAX_RAILS.
+ */
+ if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) {
+ return PSM2_INTERNAL_ERR;
+ }
+ count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t));
+ if (count > HFI_MAX_RAILS)
+ return PSM2_INTERNAL_ERR;
+
+ /*
+ * Both side are ordered, so just search from small to big.
+ */
+ start = 0;
+ data = (uint64_t *) (req + 1);
+ ep = proto->ep->mctxt_next;
+
+ /* Seed a per-pair PRNG so both sides derive the same rail choice
+ * from the two epids. */
+ struct drand48_data drand48_data;
+ srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data);
+
+ /* Loop over all slave endpoints */
+ while (ep != ep->mctxt_master) {
+ for (i = start; i < count; i++) {
+
+ /* There is a gid match, create the epaddr */
+ if (data[2 * i] == ep->gid_hi) {
+
+ epaddr =
+ ips_alloc_epaddr(&ep->ptl_ips.ptl->proto, 0,
+ data[2 * i + 1], NULL,
+ PSMI_HFI_TYPE_OPA1,
+ 5000);
+ if (epaddr == NULL)
+ return PSM2_NO_MEMORY;
+
+ /* link the ipsaddr */
+ IPS_MCTXT_APPEND(ipsaddr,
+ (ips_epaddr_t *) epaddr);
+
+ /* Setup message control info to the same struct */
+ ((ips_epaddr_t *) epaddr)->msgctl =
+ ipsaddr->msgctl;
+ ipsaddr->msgctl->ipsaddr_count++;
+
+ /* randomize the rail to start traffic */
+ long int rnum;
+ lrand48_r(&drand48_data, &rnum);
+ if ((rnum % count) == i) {
+ ipsaddr->msgctl->ipsaddr_next =
+ (ips_epaddr_t *) epaddr;
+ }
+
+ /* update the starting point,
+ * all previous ones are not valid anymore */
+ start = i + 1;
+ break;
+ }
+ }
+
+ ep = ep->mctxt_next;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Build a connect-protocol message for flow->ipsaddr and retry sending
+ * it, polling the endpoint for progress between attempts, until it is
+ * sent, a poll error occurs, or 'timeout' (absolute, in cycles) passes.
+ */
+static psm2_error_t
+ips_proto_send_ctrl_message_request(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask, uint64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ips_scb_t ctrlscb;
+ /* msg header plus gid+epid for all rails plus checksum */
+ char payload[sizeof(struct ips_connect_reqrep) +
+ 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES];
+ uint32_t paylen;
+
+ ctrlscb.flags = 0;
+ paylen = ips_proto_build_connect_message(proto,
+ flow->ipsaddr, message_type, payload);
+ psmi_assert_always(paylen <= sizeof(payload));
+
+ do {
+ err = ips_proto_send_ctrl_message(flow, message_type,
+ msg_queue_mask, &ctrlscb, payload, paylen);
+ if (err == PSM2_OK) {
+ break;
+ }
+ if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) {
+ break;
+ }
+ } while (get_cycles() < timeout);
+
+ return err;
+}
+
+static psm2_error_t
+ips_proto_send_ctrl_message_reply(struct ips_proto *proto,
+ struct ips_flow *flow, uint8_t message_type,
+ uint16_t *msg_queue_mask)
+{
+ /* This will try up to 100 times until the message is sent. The code
+ * is persistent because dropping replies will lead to a lack of
+ * overall progress on the connection/disconnection. We do not want
+ * to poll from here, and we cannot afford a lengthy timeout, since
+ * this is called from the receive path.
+ */
+ psm2_error_t err = PSM2_OK;
+ int i;
+ ips_scb_t ctrlscb;
+ /* msg header plus gid+epid for all rails plus checksum */
+ char payload[sizeof(struct ips_connect_reqrep) +
+ 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES];
+ uint32_t paylen;
+
+ ctrlscb.flags = 0;
+ paylen = ips_proto_build_connect_message(proto,
+ flow->ipsaddr, message_type, payload);
+ psmi_assert_always(paylen <= sizeof(payload));
+
+ /* Bounded inline retry (no polling) -- see comment above. */
+ for (i = 0; i < 100; i++) {
+ err = ips_proto_send_ctrl_message(flow, message_type,
+ msg_queue_mask, &ctrlscb, payload, paylen);
+ if (err == PSM2_OK) {
+ break;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Fill 'payload' with the wire message for the given connect opcode and
+ * return the number of payload bytes written. Connect request/reply
+ * carry a full ips_connect_reqrep (plus one (gid_hi, epid) uint64 pair
+ * per additional rail when this is the multi-context master endpoint);
+ * disconnect request/reply carry only the ips_connect_hdr.
+ */
+int
+ips_proto_build_connect_message(struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr,
+ uint8_t opcode, void *payload)
+{
+ struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+ struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload;
+ uint32_t paylen = 0;
+
+ psmi_assert_always(proto != NULL);
+
+ /* Common header, present in every message variant. */
+ hdr->connect_verno = IPS_CONNECT_VERNO;
+ hdr->psm_verno = PSMI_VERNO;
+ hdr->connidx = (uint32_t) ipsaddr->connidx_incoming;
+ hdr->epid = proto->ep->epid;
+
+ switch (opcode) {
+ case OPCODE_CONNECT_REPLY:
+ case OPCODE_CONNECT_REQUEST:
+ if (opcode == OPCODE_CONNECT_REQUEST) {
+ req->connect_result = PSM2_OK;
+ req->runid_key = proto->runid_key;
+ } else {
+ req->connect_result = ipsaddr->cerror_incoming;
+ req->runid_key = ipsaddr->runid_key;
+ }
+
+ req->sl = proto->epinfo.ep_sl;
+ req->mtu = proto->epinfo.ep_mtu;
+ req->job_pkey = proto->epinfo.ep_pkey;
+
+ /* Explicitly NUL-terminated below, so strncpy is safe here. */
+ strncpy(req->hostname, psmi_gethostname(),
+ sizeof(req->hostname) - 1);
+ req->hostname[sizeof(req->hostname) - 1] = '\0';
+
+ paylen = sizeof(struct ips_connect_reqrep);
+
+ /* Attach all multi-context subnetids and epids. */
+ if (proto->ep->mctxt_master == proto->ep) {
+ psm2_ep_t ep = proto->ep->mctxt_next;
+ uint64_t *data = (uint64_t *) (req + 1);
+ while (ep != proto->ep) {
+ *data = ep->gid_hi;
+ paylen += sizeof(uint64_t);
+ data++;
+ *data = ep->epid;
+ paylen += sizeof(uint64_t);
+ data++;
+ ep = ep->mctxt_next;
+ }
+ }
+
+ break;
+
+ case OPCODE_DISCONNECT_REQUEST:
+ case OPCODE_DISCONNECT_REPLY:
+ paylen = sizeof(struct ips_connect_hdr);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unexpected/unhandled connection opcode 0x%x\n",
+ opcode);
+ break;
+ }
+
+ return paylen;
+}
+
+/*
+ * Initialize a single flow on 'ipsaddr': choose the flush routine (PIO
+ * vs DMA), select a path of the requested type, derive the fragment
+ * size, and reset sequence numbers, credits, flags and timers.
+ * MOCKABLE for unit testing (psm2_mock_testing.h).
+ */
+void
+MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+ ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index)
+{
+ psmi_assert_always(protocol < PSM_PROTOCOL_LAST);
+ psmi_assert_always(flow_index < EP_FLOW_LAST);
+
+ SLIST_NEXT(flow, next) = NULL;
+ if (transfer_type == PSM_TRANSFER_PIO) {
+ flow->flush = ips_proto_flow_flush_pio;
+ } else {
+ flow->flush = ips_proto_flow_flush_dma;
+ }
+
+ flow->path =
+ ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp);
+
+ /* Select the fragment size for this flow. Flow is the common
+ * denominator between the local endpoint, the remote endpoint,
+ * the path between those and whether it's a PIO or DMA send.
+ * Hence, it "owns" the maximum transmission unit in its frag_size
+ * member.
+ */
+
+ /* min of local MTU and path MTU */
+ flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu);
+ /* if PIO, need to consider local pio buffer size */
+ if (transfer_type == PSM_TRANSFER_PIO) {
+ flow->frag_size = min(flow->frag_size, proto->epinfo.ep_piosize);
+ _HFI_VDBG("[ipsaddr=%p] PIO flow->frag_size: %u = min("
+ "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u), proto->epinfo.ep_piosize(%u))\n",
+ ipsaddr, flow->frag_size, proto->epinfo.ep_mtu,
+ flow->path->pr_mtu, proto->epinfo.ep_piosize);
+ } else {
+ _HFI_VDBG("[ipsaddr=%p] SDMA flow->frag_size: %u = min("
+ "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n",
+ ipsaddr, flow->frag_size, proto->epinfo.ep_mtu,
+ flow->path->pr_mtu);
+ }
+
+ /* Reset all go-back-N bookkeeping to its initial state. */
+ flow->ipsaddr = ipsaddr;
+ flow->transfer = transfer_type;
+ flow->protocol = protocol;
+ flow->flowid = flow_index;
+ flow->xmit_seq_num.psn_val = 0;
+ flow->recv_seq_num.psn_val = 0;
+ flow->xmit_ack_num.psn_val = 0;
+ flow->flags = 0;
+ flow->cca_ooo_pkts = 0;
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1);
+ flow->ack_counter = 0;
+#ifdef PSM_DEBUG
+ flow->scb_num_pending = 0;
+ flow->scb_num_unacked = 0;
+#endif
+
+ flow->timer_ack = NULL;
+ flow->timer_send = NULL;
+
+ STAILQ_INIT(&flow->scb_unacked);
+ SLIST_INIT(&flow->scb_pend);
+ return;
+}
+MOCK_DEF_EPILOGUE(ips_flow_init);
+
+/*
+ * Allocate and initialize the ips-level epaddr for peer 'epid'.
+ * When 'master' is set, the per-peer ips_msgctl container (which embeds
+ * the master epaddr) is allocated; otherwise a bare ips_epaddr is
+ * allocated for an additional rail. Returns NULL on allocation or
+ * path-record failure; on success the epaddr is registered in the
+ * PSM epid table.
+ */
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+ const char *hostname, uint16_t hfi_type, unsigned long timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ ips_path_grp_t *pathgrp;
+ uint16_t lid;
+
+ /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl
+ * structures are collocated in memory for performance reasons -- this is
+ * why ips allocates memory for all three together.
+ *
+ * The PSM/PTL structure data is filled in upon successfully ep connect in
+ * ips_ptl_connect().
+ */
+ if (master) {
+ struct ips_msgctl *msgctl;
+
+ /* Although an ips_msgtl is allocated here, it can be safely casted to
+ both an ips_epaddr and a psm2_epaddr. It is eventually freed as an
+ ips_epaddr. */
+ msgctl =
+ (struct ips_msgctl *)psmi_calloc(proto->ep,
+ PER_PEER_ENDPOINT, 1,
+ sizeof(struct ips_msgctl));
+ if (msgctl == NULL)
+ return NULL;
+
+ ipsaddr = &msgctl->master_epaddr;
+ epaddr = (psm2_epaddr_t) ipsaddr;
+
+ ipsaddr->msgctl = msgctl;
+
+ /* initialize items in ips_msgctl_t */
+ msgctl->ipsaddr_next = ipsaddr;
+ msgctl->mq_send_seqnum = 0;
+ msgctl->mq_recv_seqnum = 0;
+ msgctl->am_send_seqnum = 0;
+ msgctl->am_recv_seqnum = 0;
+ msgctl->ipsaddr_count = 1;
+ msgctl->outoforder_count = 0;
+ } else {
+ epaddr =
+ (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1,
+ sizeof(struct ips_epaddr));
+ /* NOTE(review): unlike the master branch above, a calloc
+ * failure here trips an assert instead of returning NULL --
+ * confirm this asymmetry is intentional. */
+ psmi_assert_always(epaddr);
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ }
+
+ epaddr->ptlctl = proto->ptl->ctl;
+ epaddr->proto = proto;
+ epaddr->epid = epid;
+
+ /* IPS-level epaddr */
+ ipsaddr->next = ipsaddr;
+
+ ipsaddr->ctrl_msg_queued = 0;
+ ipsaddr->msg_toggle = 0;
+
+ /* Actual context of peer */
+ ipsaddr->context = PSMI_EPID_GET_CONTEXT(epid);
+ /* Subcontext */
+ ipsaddr->subcontext = PSMI_EPID_GET_SUBCONTEXT(epid);
+
+ /* Get path record for <service, slid, dlid> tuple */
+ lid = PSMI_EPID_GET_LID(epid);
+ err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid,
+ __cpu_to_be16(lid), hfi_type, timeout,
+ &pathgrp);
+ if (err != PSM2_OK) {
+ psmi_free(epaddr);
+ return NULL;
+ }
+ ipsaddr->pathgrp = pathgrp;
+
+ /* Setup high priority path index, control messages use the high
+ * priority CONTROL path.
+ */
+ if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE)
+ ipsaddr->hpp_index = 0;
+ else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+ ipsaddr->hpp_index = ipsaddr->context %
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+ else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+ ipsaddr->hpp_index = proto->epinfo.ep_context %
+ ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+ else /* Base LID */
+ ipsaddr->hpp_index = 0;
+
+ /*
+ * Set up the flows on this ipsaddr
+ */
+ ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+ /* clear connection state. */
+ ipsaddr->cstate_outgoing = CSTATE_NONE;
+ ipsaddr->cstate_incoming = CSTATE_NONE;
+
+ /* Add epaddr to PSM's epid table */
+ psmi_epid_add(proto->ep, epaddr->epid, epaddr);
+ psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr);
+
+ return epaddr;
+}
+
+/*
+ * Tear down the flows, remove the epaddr from the PSM epid table and
+ * the epstate table, and free the allocation (for a master epaddr this
+ * is the enclosing ips_msgctl; see ips_alloc_epaddr).
+ */
+static
+void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto)
+{
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+ ips_flow_fini(ipsaddr, proto);
+
+ _HFI_VDBG("epaddr=%p,ipsaddr=%p,connidx_incoming=%d\n", epaddr, ipsaddr,
+ ipsaddr->connidx_incoming);
+ psmi_epid_remove(epaddr->proto->ep, epaddr->epid);
+ ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming);
+ psmi_free(epaddr);
+ return;
+}
+
+/* Forward declaration; implementation below. */
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto,
+ psm2_epaddr_t epaddr, struct ips_connect_reqrep *req,
+ uint32_t paylen);
+
+/*
+ * Dispatch an incoming connect-protocol packet (connect/disconnect,
+ * request/reply). 'payload' begins with an ips_connect_hdr carrying
+ * the sender's epid, which is used to look up any existing epaddr.
+ * Must be called with the progress lock held (asserted below).
+ */
+psm2_error_t
+ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode,
+ struct ips_message_header *p_hdr, void *payload,
+ uint32_t paylen)
+{
+ struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ psm2_error_t err = PSM2_OK;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ epaddr = psmi_epid_lookup(proto->ep, hdr->epid);
+ ipsaddr = epaddr ? (ips_epaddr_t *) epaddr : NULL;
+
+ switch (opcode) {
+ case OPCODE_CONNECT_REQUEST:
+ err = ptl_handle_connect_req(proto, epaddr,
+ (struct ips_connect_reqrep *)hdr,
+ paylen);
+ break;
+
+ case OPCODE_CONNECT_REPLY:
+ {
+ struct ips_connect_reqrep *req =
+ (struct ips_connect_reqrep *)payload;
+
+ if (!ipsaddr || req->runid_key != proto->runid_key) {
+ _HFI_PRDBG
+ ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid %d:%d:%d\n",
+ ipsaddr, req->runid_key, proto->runid_key,
+ (int)PSMI_EPID_GET_LID(hdr->epid),
+ (int)PSMI_EPID_GET_CONTEXT(hdr->epid),
+ (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid));
+ } else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) {
+ /* possible dupe */
+ _HFI_VDBG("connect dupe, expected %d got %d\n",
+ CSTATE_OUTGOING_WAITING,
+ ipsaddr->cstate_outgoing);
+ } else {
+ /* Reply to our request for connection (i.e. outgoing connection) */
+ if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) {
+ err =
+ ips_ipsaddr_set_req_params(proto,
+ ipsaddr,
+ req,
+ paylen);
+ if (err)
+ goto fail;
+ }
+ ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED;
+ ipsaddr->cerror_outgoing = req->connect_result;
+ }
+ }
+ break;
+
+ case OPCODE_DISCONNECT_REQUEST:
+ {
+ ips_epaddr_t ipsaddr_f; /* fake a ptl addr */
+ int epaddr_do_free = 0;
+ psmi_assert_always(paylen ==
+ sizeof(struct ips_connect_hdr));
+ _HFI_VDBG("Got a disconnect from %s\n",
+ psmi_epaddr_get_name(hdr->epid));
+ proto->num_disconnect_requests++;
+ /* It's possible to get a disconnection request on a ipsaddr that
+ * we've since removed if the request is a dupe. Instead of
+ * silently dropping the packet, we "echo" the request in the
+ * reply. */
+ if (ipsaddr == NULL) {
+ ips_path_grp_t *pathgrp;
+ uint16_t lid;
+
+ /* Build a minimal stack-local ipsaddr, just good
+ * enough to send one reply from. */
+ ipsaddr = &ipsaddr_f;
+ memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t));
+ ipsaddr_f.context =
+ PSMI_EPID_GET_CONTEXT(hdr->epid);
+ ipsaddr_f.subcontext =
+ PSMI_EPID_GET_SUBCONTEXT(hdr->epid);
+
+ /* Get path record for peer */
+ lid = PSMI_EPID_GET_LID(hdr->epid);
+ err = proto->ibta.get_path_rec(proto,
+ proto->epinfo.
+ ep_base_lid,
+ __cpu_to_be16(lid),
+ PSMI_HFI_TYPE_OPA1,
+ 3000, &pathgrp);
+ if (err != PSM2_OK)
+ goto fail;
+
+ ipsaddr_f.pathgrp = pathgrp;
+ ((psm2_epaddr_t) &ipsaddr_f)->ptlctl =
+ proto->ptl->ctl;
+ ((psm2_epaddr_t) &ipsaddr_f)->proto = proto;
+ /* If the send fails because of pio_busy, don't let ips queue
+ * the request on an invalid ipsaddr, just drop the reply */
+ ipsaddr_f.ctrl_msg_queued = ~0;
+
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+
+ ips_flow_init(&ipsaddr_f.
+ flows[proto->msgflowid], proto,
+ &ipsaddr_f, PSM_TRANSFER_PIO,
+ PSM_PROTOCOL_GO_BACK_N,
+ IPS_PATH_LOW_PRIORITY,
+ EP_FLOW_GO_BACK_N_PIO);
+ _HFI_VDBG
+ ("Disconnect on unknown epaddr, just echo request\n");
+ } else if (ipsaddr->cstate_incoming != CSTATE_NONE) {
+ ipsaddr->cstate_incoming = CSTATE_NONE;
+ proto->num_connected_incoming--;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ epaddr_do_free = 1;
+ }
+ }
+
+ ips_proto_send_ctrl_message_reply(proto, &ipsaddr->
+ flows[proto->
+ msgflowid],
+ OPCODE_DISCONNECT_REPLY,
+ &ipsaddr->
+ ctrl_msg_queued);
+ /* We can safely free the ipsaddr if required since disconnect
+ * messages are never enqueued so no reference to ipsaddr is kept */
+ if (epaddr_do_free) {
+ ips_free_epaddr(epaddr, proto);
+ epaddr = NULL;
+ }
+ }
+ break;
+
+ case OPCODE_DISCONNECT_REPLY:
+ if (!ipsaddr) {
+ _HFI_VDBG
+ ("Unknown disconnect reply from epid %d:%d.%d\n",
+ (int)PSMI_EPID_GET_LID(hdr->epid),
+ (int)PSMI_EPID_GET_CONTEXT(hdr->epid),
+ (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid));
+ break;
+ } else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) {
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+ /* Freed in disconnect() if cstate_incoming == NONE */
+ } /* else dupe reply */
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unexpected/unhandled connect opcode 0x%x\n",
+ opcode);
+ }
+
+fail:
+ return err;
+}
+
+/*
+ * Handle an incoming connection request: abort on locally-detected
+ * duplicate LIDs, allocate a new epaddr when the peer is unknown,
+ * re-reply to duplicate requests, drop out-of-context requests,
+ * validate connect/psm versions, pkey and service level, then mark
+ * the incoming side established and send a CONNECT_REPLY.
+ */
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr,
+ struct ips_connect_reqrep *req, uint32_t paylen)
+{
+ ips_epaddr_t *ipsaddr;
+ psm2_error_t err = PSM2_OK;
+ uint16_t connect_result;
+ int newconnect = 0;
+
+ if (req->epid == proto->ep->epid) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR,
+ "Network connectivity problem: Locally detected duplicate "
+ "LIDs 0x%04x on hosts %s and %s. (Exiting)",
+ (uint32_t) psm2_epid_nid(req->epid),
+ psmi_epaddr_get_hostname(req->epid),
+ psmi_gethostname());
+ /* XXX no return */
+ abort();
+ } else if (epaddr == NULL) { /* new ep connect before we call into connect */
+ newconnect = 1;
+ if ((epaddr =
+ ips_alloc_epaddr(proto, 1, req->epid, req->hostname,
+ PSMI_HFI_TYPE_OPA1,
+ 5000)) == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) {
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ /* Duplicate lid detection. */
+ if (ipsaddr->runid_key == req->runid_key)
+ goto do_reply; /* duplicate request, not duplicate lid */
+ else { /* Some out of context message. Just drop it */
+ if (!proto->done_warning) {
+ psmi_syslog(proto->ep, 1, LOG_INFO,
+ "Non-fatal connection problem: Received an out-of-context "
+ "connection message from host %s LID=0x%x context=%d. (Ignoring)",
+ req->hostname,
+ (int)psm2_epid_nid(req->epid),
+ psm2_epid_context(req->epid));
+ proto->done_warning = 1;
+ }
+ goto no_reply;
+ }
+ } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) {
+ /* pre-created epaddr in multi-rail */
+ psmi_assert_always(epaddr->proto->ep !=
+ epaddr->proto->ep->mctxt_master);
+ newconnect = 1;
+ }
+
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE);
+
+ /* Check connect version and psm version */
+ if (req->connect_verno < 0x0001) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION,
+ "Connect protocol (%x,%x) is obsolete and incompatible",
+ (req->connect_verno >> 8) & 0xff,
+ req->connect_verno & 0xff);
+ connect_result = PSM2_EPID_INVALID_CONNECT;
+ } else if (!psmi_verno_isinteroperable(req->psm_verno)) {
+ connect_result = PSM2_EPID_INVALID_VERSION;
+ } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) &&
+ proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY &&
+ proto->epinfo.ep_pkey != req->job_pkey) {
+ connect_result = PSM2_EPID_INVALID_PKEY;
+ } else if (req->sl != proto->epinfo.ep_sl) {
+ connect_result = PSM2_EPID_INVALID_CONNECT;
+ _HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl);
+ } else {
+ connect_result = PSM2_OK;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ ips_epstate_idx idx;
+ psmi_assert_always(newconnect == 1);
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ }
+ }
+
+ /* Incoming connection request */
+ if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) {
+ err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen);
+ if (err)
+ goto fail;
+ }
+ ipsaddr->cstate_incoming = CSTATE_ESTABLISHED;
+ ipsaddr->cerror_incoming = connect_result;
+
+ ipsaddr->runid_key = req->runid_key;
+
+ proto->num_connected_incoming++;
+
+do_reply:
+ /* The reply carries 'connect_result' via cerror_incoming so the
+ * peer learns whether validation succeeded. */
+ ips_proto_send_ctrl_message_reply(proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_CONNECT_REPLY,
+ &ipsaddr->ctrl_msg_queued);
+no_reply:
+fail:
+ return err;
+}
+
+/*
+ * Actively connect to the peers listed in array_of_epid (only entries
+ * whose array_of_epid_mask[i] is nonzero are considered).  On return,
+ * array_of_errors[i] holds the per-peer result and array_of_epaddr[i]
+ * the endpoint address (NULL when the connect failed).  timeout_in
+ * bounds the retry loop (see psmi_cycles_left); caller must hold the
+ * MQ progress lock.  Returns the worst per-peer error encountered.
+ */
+psm2_error_t
+ips_proto_connect(struct ips_proto *proto, int numep,
+ const psm2_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+ int i, n, n_first;
+ psm2_error_t err = PSM2_OK;
+ psm2_epaddr_t epaddr;
+ ips_epaddr_t *ipsaddr;
+ ips_epstate_idx idx;
+ int numep_toconnect = 0, numep_left;
+ union psmi_envvar_val credits_intval;
+ int connect_credits;
+
+ /* Credits bound how many connect requests may be in flight at once. */
+ psmi_getenv("PSM2_CONNECT_CREDITS",
+ "End-point connect request credits.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)100, &credits_intval);
+
+ connect_credits = credits_intval.e_uint;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ /* All timeout values are in cycles */
+ uint64_t t_start = get_cycles();
+ /* Print a timeout at the warning interval */
+ union psmi_envvar_val warn_intval;
+ uint64_t to_warning_interval;
+ uint64_t to_warning_next;
+
+ /* Setup warning interval */
+ psmi_getenv("PSM2_CONNECT_WARN_INTERVAL",
+ "Period in seconds to warn if connections are not completed."
+ "Default is 300 seconds, 0 to disable",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)300, &warn_intval);
+
+ to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL);
+ to_warning_next = t_start + to_warning_interval;
+
+ /* Some sanity checks */
+ psmi_assert_always(array_of_epid_mask != NULL);
+
+ /* First pass: make sure array of errors is at least fully defined */
+ for (i = 0; i < numep; i++) {
+ _HFI_VDBG("epid-connect=%s connect to %d:%d:%d\n",
+ array_of_epid_mask[i] ? "YES" : " NO",
+ (int)PSMI_EPID_GET_LID(array_of_epid[i]),
+ (int)PSMI_EPID_GET_CONTEXT(array_of_epid[i]),
+ (int)PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i]));
+ if (array_of_epid_mask[i]) {
+ array_of_errors[i] = PSM2_EPID_UNKNOWN;
+ array_of_epaddr[i] = NULL;
+ }
+ }
+
+ /* Second pass: see what to connect and what is connectable. */
+ for (i = 0, numep_toconnect = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i])
+ continue;
+
+ /* Can't send to epid on same lid if not loopback */
+ if ((psm2_epid_nid(proto->ep->epid) ==
+ psm2_epid_nid(array_of_epid[i])) &&
+ !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) {
+ array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+ continue;
+ }
+
+ if ((PSMI_EPID_VERSION == PSMI_EPID_V2)
+ && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) !=
+ PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Trying to connect to a HFI (subnet id - %"PRIu64")on a"
+ " different subnet - %"PRIu64" \n",
+ PSMI_GET_SUBNET_ID(proto->ep->gid_hi),
+ PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]));
+ }
+
+ epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]);
+ if (epaddr == NULL) {
+ /* We're sending a connect request message before some other node
+ * has sent its connect message */
+ epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i],
+ NULL,
+ PSMI_HFI_TYPE_OPA1,
+ (timeout_in / 1000000UL));
+ if (epaddr == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { /* already connected */
+ psmi_assert_always(((ips_epaddr_t *) epaddr)->
+ cstate_outgoing == CSTATE_ESTABLISHED);
+ array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED;
+ array_of_epaddr[i] = epaddr;
+ continue;
+ } else if (((ips_epaddr_t *) epaddr)->cstate_incoming ==
+ CSTATE_NONE) {
+ /* pre-created epaddr in multi-rail */
+ psmi_assert_always(epaddr->proto->ep !=
+ epaddr->proto->ep->mctxt_master);
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+ if (err)
+ goto fail;
+ ipsaddr->connidx_incoming = idx;
+ } else {
+ /* We've already received a connect request message from a remote
+ * peer, it's time to send our own. */
+ ipsaddr = (ips_epaddr_t *) epaddr;
+ /* No re-entrancy sanity check and makes sure we are not connected
+ * twice (caller's precondition) */
+ psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE);
+ psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE);
+ }
+
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING;
+ ipsaddr->cerror_outgoing = PSM2_OK;
+ array_of_epaddr[i] = epaddr;
+ ipsaddr->s_timeout = get_cycles();
+ ipsaddr->delay_in_ms = 1;
+ ipsaddr->credit = 0;
+ numep_toconnect++;
+ }
+
+ /* Second pass: do the actual connect.
+ * PSM2_EPID_UNKNOWN: Not connected yet.
+ * PSM2_EPID_UNREACHABLE: Not to be connected.
+ * PSM2_OK: Successfully connected.
+ * Start sending connect messages at a random index between 0 and numep-1
+ */
+ numep_left = numep_toconnect;
+ n_first = ((uint32_t) get_cycles()) % numep;
+ while (numep_left > 0) {
+ for (n = 0; n < numep; n++) {
+ int keep_polling = 1;
+ i = (n_first + n) % numep;
+ if (!array_of_epid_mask[i])
+ continue;
+ switch (array_of_errors[i]) {
+ case PSM2_EPID_UNREACHABLE:
+ case PSM2_EPID_ALREADY_CONNECTED:
+ case PSM2_OK:
+ continue;
+ default:
+ break;
+ }
+ psmi_assert_always(array_of_epaddr[i] != NULL);
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+ /* This is not the real error code, we only set OK here
+ * so we know to stop polling for the reply. The actual
+ * error is in ipsaddr->cerror_outgoing */
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ connect_credits++;
+ ipsaddr->credit = 0;
+ continue;
+ }
+ while (keep_polling) {
+ if (!psmi_cycles_left(t_start, timeout_in)) {
+ err = PSM2_TIMEOUT;
+ goto err_timeout;
+ }
+ if (to_warning_interval
+ && get_cycles() >= to_warning_next) {
+#if _HFI_DEBUGGING
+ uint64_t waiting_time = 0;
+ if (_HFI_INFO_ON) {
+ waiting_time = cycles_to_nanosecs(
+ get_cycles() -
+ t_start) / SEC_ULL;
+ }
+#endif
+ const char *first_name = NULL;
+ int num_waiting = 0;
+
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i] ||
+ array_of_errors[i] !=
+ PSM2_EPID_UNKNOWN)
+ continue;
+ if (!first_name)
+ first_name =
+ psmi_epaddr_get_name
+ (array_of_epid[i]);
+ num_waiting++;
+ }
+ if (_HFI_INFO_ON) {
+ if (first_name) {
+ _HFI_INFO_ALWAYS
+ ("Couldn't connect to %s (and %d others). "
+ "Time elapsed %02i:%02i:%02i. Still trying...\n",
+ first_name, num_waiting,
+ (int)(waiting_time / 3600),
+ (int)((waiting_time / 60) -
+ ((waiting_time /
+ 3600) * 60)),
+ (int)(waiting_time -
+ ((waiting_time /
+ 60) * 60)));
+ }
+ }
+ to_warning_next =
+ get_cycles() + to_warning_interval;
+ }
+
+ /* Retry timer expired for this peer: (re)send its request,
+ * pacing resends with exponential backoff capped at 100ms. */
+ if (get_cycles() > ipsaddr->s_timeout) {
+ if (!ipsaddr->credit && connect_credits) {
+ ipsaddr->credit = 1;
+ connect_credits--;
+ }
+ if (ipsaddr->credit) {
+ _HFI_VDBG
+ ("Connect req to %u:%u:%u\n",
+ __be16_to_cpu(ipsaddr->
+ pathgrp->
+ pg_base_lid),
+ ipsaddr->context,
+ ipsaddr->subcontext);
+ if (
+ ips_proto_send_ctrl_message_request
+ (proto, &ipsaddr->
+ flows[proto->msgflowid],
+ OPCODE_CONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ 0) == PSM2_OK) {
+ keep_polling = 0;
+ ipsaddr->delay_in_ms =
+ min(100,
+ ipsaddr->
+ delay_in_ms <<
+ 1);
+ ipsaddr->s_timeout =
+ get_cycles() +
+ nanosecs_to_cycles
+ (ipsaddr->
+ delay_in_ms *
+ MSEC_ULL);
+ }
+ /* If not, send got "busy", keep trying */
+ } else {
+ keep_polling = 0;
+ }
+ }
+
+ if ((err =
+ psmi_err_only(psmi_poll_internal
+ (proto->ep, 1))))
+ goto fail;
+
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+ /* This is not the real error code, we only set OK here
+ * so we know to stop polling for the reply. The actual
+ * error is in ipsaddr->cerror_outgoing */
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ connect_credits++;
+ ipsaddr->credit = 0;
+ break;
+ }
+ }
+ }
+ }
+
+err_timeout:
+ /* Find the worst error to report */
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epid_mask[i])
+ continue;
+ switch (array_of_errors[i]) {
+ /* These are benign */
+ case PSM2_EPID_UNREACHABLE:
+ case PSM2_EPID_ALREADY_CONNECTED:
+ break;
+ case PSM2_EPID_UNKNOWN:
+ array_of_errors[i] = PSM2_TIMEOUT;
+ err = psmi_error_cmp(err, PSM2_TIMEOUT);
+ break;
+ case PSM2_OK:
+ /* Restore the real connect error */
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ array_of_errors[i] = ipsaddr->cerror_outgoing;
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ if (ipsaddr->cerror_outgoing != PSM2_OK) {
+ err = psmi_error_cmp(err, ipsaddr->cerror_outgoing);
+ ips_free_epaddr(array_of_epaddr[i], proto);
+ array_of_epaddr[i] = NULL;
+ } else {
+ proto->num_connected_outgoing++;
+ psmi_assert_always(ipsaddr->pathgrp->
+ pg_path[0]
+ [IPS_PATH_HIGH_PRIORITY]->
+ pr_mtu > 0);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+fail:
+ return err;
+}
+
+/* Repercussions on MQ.
+ *
+ * If num_connected==0, everything that exists in the posted queue should
+ * complete and the error must be marked epid_was_closed.
+ *
+ */
+
+/*
+ * Disconnect the peers in array_of_epaddr (entries whose
+ * array_of_epaddr_mask[i] is nonzero).  With force==0 a graceful close
+ * is attempted: wait for unacked sends to drain, then exchange
+ * DISCONNECT_REQUEST messages until the peer transitions to
+ * DISCONNECTED or timeout_in (nanoseconds; 0 means no limit) expires.
+ * With force!=0 pending timers are cancelled and a single best-effort
+ * DISCONNECT_REQUEST is sent before forcing the local state to
+ * DISCONNECTED.  Per-peer results land in array_of_errors.  Caller
+ * must hold the MQ progress lock.
+ */
+psm2_error_t
+ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+ psm2_epaddr_t array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+ ips_epaddr_t *ipsaddr;
+ int numep_left, numep_todisc, i, n;
+ int n_first;
+ int has_pending;
+ uint64_t timeout;
+ psm2_error_t err = PSM2_OK;
+ uint64_t reqs_sent = 0;
+ union psmi_envvar_val credits_intval;
+ int disconnect_credits;
+ uint64_t t_warning, t_start;
+ union psmi_envvar_val warn_intval;
+ unsigned warning_secs;
+
+ /* In case of a forced close, we cancel whatever timers are pending
+ * on the proto so that we don't have zombie timers coming back
+ * after the internal structures of PSM2 have been destroyed
+ */
+ if (force) {
+ struct psmi_timer *t_cursor;
+ TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) {
+ psmi_timer_cancel(proto->timerq, t_cursor);
+ }
+ }
+
+ psmi_assert_always(numep > 0);
+
+ /* Credits bound how many disconnect requests may be in flight at once. */
+ psmi_getenv("PSM2_DISCONNECT_CREDITS",
+ "End-point disconnect request credits.",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)100, &credits_intval);
+
+ disconnect_credits = credits_intval.e_uint;
+
+ /* Setup warning interval */
+ psmi_getenv("PSM2_DISCONNECT_WARN_INTERVAL",
+ "Period in seconds to warn if disconnections are not completed."
+ "Default is 300 seconds, 0 to disable.",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)300, &warn_intval);
+
+ warning_secs = warn_intval.e_uint;
+
+ PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+ /* First pass: see what to disconnect and what is disconnectable */
+ for (i = 0, numep_todisc = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i])
+ continue;
+ psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl ==
+ proto->ptl);
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ ipsaddr->credit = 0;
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+ array_of_errors[i] = PSM2_OK;
+ continue;
+ } else {
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ }
+ _HFI_VDBG("disconnecting %p\n", array_of_epaddr[i]);
+ array_of_errors[i] = PSM2_EPID_UNKNOWN;
+ numep_todisc++;
+ }
+ if (numep_todisc == 0)
+ goto success;
+
+ /* Wait for everyone to ack previous packets before putting */
+ if (timeout_in == 0)
+ timeout = ~0ULL;
+ else
+ timeout = get_cycles() + nanosecs_to_cycles(timeout_in);
+
+ t_start = get_cycles();
+ t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL);
+
+ /* Start at a random peer index to spread load across peers. */
+ n_first = ((uint32_t) get_cycles()) % numep;
+ if (!force) {
+ numep_left = numep_todisc;
+ do {
+ for (n = 0; n < numep; n++) {
+ i = (n_first + n) % numep;
+ if (!array_of_epaddr_mask[i]
+ || array_of_errors[i] == PSM2_OK)
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ switch (ipsaddr->cstate_outgoing) {
+ case CSTATE_OUTGOING_DISCONNECTED:
+ array_of_errors[i] = PSM2_OK;
+ numep_left--;
+ disconnect_credits++;
+ ipsaddr->credit = 0;
+ continue;
+ case CSTATE_OUTGOING_WAITING_DISC:
+ if (ipsaddr->s_timeout > get_cycles())
+ continue;
+ ipsaddr->delay_in_ms =
+ min(100, ipsaddr->delay_in_ms << 1);
+ ipsaddr->s_timeout = get_cycles() +
+ nanosecs_to_cycles(ipsaddr->
+ delay_in_ms *
+ MSEC_ULL);
+ ips_proto_send_ctrl_message_request
+ (proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ timeout);
+ reqs_sent++;
+ break;
+ case CSTATE_ESTABLISHED:
+ /* Still pending acks, hold off for now */
+ has_pending =
+ !STAILQ_EMPTY(&ipsaddr->flows
+ [EP_FLOW_GO_BACK_N_PIO].
+ scb_unacked)
+ ||
+ !STAILQ_EMPTY(&ipsaddr->flows
+ [EP_FLOW_GO_BACK_N_DMA].
+ scb_unacked);
+ if (has_pending)
+ continue;
+ if (!ipsaddr->credit
+ && disconnect_credits) {
+ ipsaddr->credit = 1;
+ disconnect_credits--;
+ }
+ if (!ipsaddr->credit)
+ continue;
+ ipsaddr->delay_in_ms = 1;
+ ipsaddr->cstate_outgoing =
+ CSTATE_OUTGOING_WAITING_DISC;
+ ipsaddr->s_timeout =
+ get_cycles() +
+ nanosecs_to_cycles(MSEC_ULL);
+ ips_proto_send_ctrl_message_request
+ (proto,
+ &ipsaddr->flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ timeout);
+ reqs_sent++;
+ break;
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN,
+ PSM2_INTERNAL_ERR,
+ "Unhandled/unknown close state %d",
+ ipsaddr->cstate_outgoing);
+ break;
+ }
+ }
+ if (numep_left == 0)
+ break;
+
+ if ((err =
+ psmi_err_only(psmi_poll_internal(proto->ep, 1))))
+ goto fail;
+
+ if (warning_secs && get_cycles() > t_warning) {
+ _HFI_INFO
+ ("graceful close in progress for %d/%d peers "
+ "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+ numep_left, numep_todisc,
+ (int)(cycles_to_nanosecs
+ (get_cycles() - t_start) / MSEC_ULL),
+ (int)(timeout_in / MSEC_ULL),
+ (unsigned long long)reqs_sent);
+ t_warning =
+ get_cycles() +
+ nanosecs_to_cycles(warning_secs * SEC_ULL);
+ }
+ }
+ while (timeout > get_cycles());
+
+ if (numep_left > 0) {
+ err = PSM2_TIMEOUT;
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNKNOWN) {
+ array_of_errors[i] = PSM2_TIMEOUT;
+ _HFI_VDBG
+ ("disc timeout on index %d, epaddr %s\n",
+ i,
+ psmi_epaddr_get_name
+ (array_of_epaddr[i]->epid));
+ }
+ }
+ _HFI_PRDBG("graceful close incomplete for %d/%d peers "
+ "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+ numep_left, numep_todisc,
+ (int)(cycles_to_nanosecs
+ (get_cycles() - t_start) / MSEC_ULL),
+ (int)(timeout_in / MSEC_ULL),
+ (unsigned long long)reqs_sent);
+ } else
+ _HFI_PRDBG
+ ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n",
+ numep_todisc,
+ (int)(cycles_to_nanosecs(get_cycles() - t_start) /
+ MSEC_ULL), (unsigned long long)reqs_sent);
+ } else {
+ for (n = 0; n < numep; n++) {
+ i = (n_first + n) % numep;
+ if (!array_of_epaddr_mask[i])
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_ESTABLISHED);
+ ips_proto_send_ctrl_message_request(proto, &ipsaddr->
+ flows[proto->msgflowid],
+ OPCODE_DISCONNECT_REQUEST,
+ &ipsaddr->ctrl_msg_queued,
+ 0);
+ /* Force state to DISCONNECTED */
+ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+ array_of_errors[i] = PSM2_OK;
+ }
+ _HFI_VDBG("non-graceful close complete from %d peers\n", numep);
+ }
+
+ /* Final pass: tear down local state for peers that disconnected. */
+ for (i = 0; i < numep; i++) {
+ if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK)
+ continue;
+ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+ if (ipsaddr->cstate_outgoing == CSTATE_NONE)
+ continue;
+ psmi_assert_always(ipsaddr->cstate_outgoing ==
+ CSTATE_OUTGOING_DISCONNECTED);
+ proto->num_connected_outgoing--;
+ /* Remote disconnect req arrived already, remove this epid. If it
+ * hasn't arrived yet, that's okay, we'll pick it up later and just
+ * mark our connect-to status as being "none". */
+ if (ipsaddr->cstate_incoming == CSTATE_NONE) {
+ ips_free_epaddr(array_of_epaddr[i], proto);
+ array_of_epaddr[i] = NULL;
+ } else
+ ipsaddr->cstate_outgoing = CSTATE_NONE;
+ }
+
+fail:
+success:
+ return err;
+}
+
+int ips_proto_isconnected(ips_epaddr_t *ipsaddr)
+{
+ if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED ||
+ ipsaddr->cstate_incoming == CSTATE_ESTABLISHED)
+ return 1;
+ else
+ return 0;
+}
diff --git a/ptl_ips/ips_proto_dump.c b/ptl_ips/ips_proto_dump.c
new file mode 100644
index 0000000..3e3e8e7
--- /dev/null
+++ b/ptl_ips/ips_proto_dump.c
@@ -0,0 +1,255 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_proto_header.h"
+#include "ips_proto_help.h"
+
/*
 * Hex-dump 'length' bytes starting at 'frame' to stdout, 16 bytes per
 * row with an extra gap every 4 bytes.  'message' optionally labels the
 * dump; NULL selects a default placeholder.
 *
 * Fix: parameter was misspelled "lenght"; renamed to "length" (local
 * name only, no effect on callers or ABI).
 */
void ips_proto_dump_frame(void *frame, int length, char *message)
{
	uint8_t *raw_frame = frame;
	int counter;
	char default_message[] = "<UNKNOWN>";

	if (!message)
		message = default_message;

	printf("\nHex dump of %i bytes at %p from %s\n", length, frame,
	       message);

	for (counter = 0; counter < length; counter++) {
		/* Row break every 16 bytes. */
		if ((counter % 16) == 0)
			printf("\n");

		/* Column gap every 4 bytes. */
		if ((counter % 4) == 0)
			printf(" ");

		printf("%02X ", raw_frame[counter]);
	}
	printf("\n");
}
+
/* Hex-dump an arbitrary payload to stdout: 16 bytes per row, each row
 * prefixed with its decimal offset, with a gap every 4 bytes. */
void ips_proto_dump_data(void *data, int data_length)
{
	uint8_t *bytes = (uint8_t *) data;
	int idx;

	printf("\nHex dump of data, length = %i\n", data_length);

	for (idx = 0; idx < data_length; idx++) {
		/* New row of 16 bytes, labeled with its starting offset. */
		if (idx % 16 == 0)
			printf("\n %04d: ", idx);

		/* Extra gap every 4 bytes for readability. */
		if (idx % 4 == 0)
			printf(" ");

		printf("%02X ", bytes[idx]);
	}
	printf("\n");
}
+
+/*
+ * Pretty-print the fields of an ips_message_header to stdout for
+ * debugging: the IB LRH and BTH words, the KDETH words, and the
+ * PSM-specific trailer fields.  'msg' is an optional label.  Values
+ * are byte-swapped from wire order before printing.
+ */
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg)
+{
+ psmi_seqnum_t ack_seq_num;
+
+ printf("\nHeader decoding in hex: %s\n", msg ? msg : "");
+
+ printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n",
+ __be16_to_cpu(p_hdr->lrh[0]));
+ printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1]));
+ printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2]));
+ printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3]));
+
+ printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n",
+ __be32_to_cpu(p_hdr->bth[0]));
+ printf("BTH: F1-B1-Res6-DestQP24 %x\n", __be32_to_cpu(p_hdr->bth[1]));
+ printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2]));
+
+ printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1));
+ printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n",
+ __le32_to_cpu(p_hdr->khdr.kdeth0));
+
+ printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr));
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ /* Expected (TID) packets carry flow/gen/seq in the BTH; TID-flow
+ * traffic splits ack_seq_num into gen/seq; everything else uses the
+ * plain PSN number. */
+ if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+ printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n",
+ (__be32_to_cpu(p_hdr->bth[1]) >>
+ HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK,
+ (__be32_to_cpu(p_hdr->bth[2]) >>
+ HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK,
+ (__be32_to_cpu(p_hdr->bth[2]) >>
+ HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK);
+ else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW)
+ printf("ack_seq_num gen %x, seq %x\n",
+ ack_seq_num.psn_gen, ack_seq_num.psn_seq);
+ else
+ printf("ack_seq_num %x\n", ack_seq_num.psn_num);
+
+ printf("src_rank/connidx %x\n", p_hdr->connidx);
+ if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+ printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc);
+ printf("flags %x\n", p_hdr->flags);
+}
+
/*
 * Minimal strlcat() replacement (glibc historically lacked it): append
 * 's' to 'd' without writing more than 'l' bytes total, always
 * NUL-terminating when there is room.  Used rarely, for short
 * diagnostic strings only.
 *
 * Not fully standards conforming: on truncation this returns the
 * existing length plus the bytes actually appended plus the
 * terminator, whereas BSD strlcat() returns the length it tried to
 * create.  Callers in this file ignore the return value.
 *
 * Fix: use size_t throughout instead of int — the original compared a
 * signed 'dlen' against the unsigned 'l' and truncated strlen() results
 * into int.  Behavior (including the return value) is unchanged.
 */
static size_t strlcat(char *d, const char *s, size_t l)
{
	size_t dlen = strlen(d), slen, max;

	/* Destination already fills (or exceeds) the buffer: caller bug,
	 * leave 'd' untouched. */
	if (l <= dlen)
		return l;
	slen = strlen(s);
	max = l - (dlen + 1);	/* room left, excluding the terminator */
	if (slen > max)
		slen = max;	/* truncate the source to fit */
	memcpy(d + dlen, s, slen);
	d[dlen + slen] = '\0';
	return dlen + slen + 1;	/* see conformance note above */
}
+
+/* decode RHF errors; only used one place now, may want more later */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len)
+{
+ *msg = '\0'; /* if no errors, and so don't need to check what's first */
+
+ if (err & HFI_RHF_ICRCERR)
+ strlcat(msg, "icrcerr ", len);
+ if (err & HFI_RHF_ECCERR)
+ strlcat(msg, "eccerr ", len);
+ if (err & HFI_RHF_LENERR)
+ strlcat(msg, "lenerr ", len);
+ if (err & HFI_RHF_TIDERR)
+ strlcat(msg, "tiderr ", len);
+ if (err & HFI_RHF_DCERR)
+ strlcat(msg, "dcerr ", len);
+ if (err & HFI_RHF_DCUNCERR)
+ strlcat(msg, "dcuncerr ", len);
+ if (err & HFI_RHF_KHDRLENERR)
+ strlcat(msg, "khdrlenerr ", len);
+}
+
+void ips_proto_dump_err_stats(struct ips_proto *proto)
+{
+ char err_stat_msg[2048];
+ char tmp_buf[128];
+ int len = sizeof(err_stat_msg);
+
+ if (!(hfi_debug & __HFI_PKTDBG))
+ return;
+
+ *err_stat_msg = '\0';
+
+ if (proto->error_stats.num_icrc_err ||
+ proto->error_stats.num_ecc_err ||
+ proto->error_stats.num_len_err ||
+ proto->error_stats.num_tid_err ||
+ proto->error_stats.num_dc_err ||
+ proto->error_stats.num_dcunc_err ||
+ proto->error_stats.num_khdrlen_err) {
+
+ snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: ");
+
+ if (proto->error_stats.num_icrc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %" PRIu64 " ",
+ proto->error_stats.num_icrc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_ecc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %" PRIu64 " ",
+ proto->error_stats.num_ecc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_len_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %" PRIu64 " ",
+ proto->error_stats.num_len_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_tid_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "TID: %" PRIu64 " ",
+ proto->error_stats.num_tid_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_dc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "DC: %" PRIu64 " ",
+ proto->error_stats.num_dc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_dcunc_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf),
+ "DCUNC: %" PRIu64 " ",
+ proto->error_stats.num_dcunc_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+
+ if (proto->error_stats.num_khdrlen_err) {
+ snprintf(tmp_buf, sizeof(tmp_buf),
+ "KHDRLEN: %" PRIu64 " ",
+ proto->error_stats.num_khdrlen_err);
+ strlcat(err_stat_msg, tmp_buf, len);
+ }
+ strlcat(err_stat_msg, "\n", len);
+ } else
+ strlcat(err_stat_msg, "No previous errors.\n", len);
+
+ _HFI_ERROR("%s", err_stat_msg);
+}
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
new file mode 100644
index 0000000..c0ca988
--- /dev/null
+++ b/ptl_ips/ips_proto_expected.c
@@ -0,0 +1,2957 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Nonzero once the hfi1 user API version has been probed and found to be
+ * >= 6.3, i.e. the driver supports SDMA without header suppression for
+ * messages of 8 dwords or less.  Latched once at protoexp init time. */
+static uint32_t hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw = 0;
+
+/* Probe the hfi1 user API major/minor version and latch the capability
+ * flag above.  Called from ips_protoexp_init(). */
+void
+ips_protoexp_hfi1_check_dma_no_hdrsupp_for_msgs_leq_8dw(void)
+{
+ if ((hfi_get_user_major_version() > 6) ||
+ (hfi_get_user_major_version() == 6 &&
+ hfi_get_user_minor_version() >= 3)) {
+ hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw = 1;
+ }
+}
+
+/*
+ * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _HFI_EXP _HFI_VDBG
+
+/*
+ * Timer callbacks. When we need work to be done out of the receive process
+ * loop, we schedule work on timers to be done at a later time.
+ */
+static psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static void
+ips_protoexp_do_tf_seqerr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr);
+static void
+ips_protoexp_do_tf_generr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr);
+
+static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+static void ips_tid_avail_callback(struct ips_tid *tidc, void *context);
+static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context);
+
+/* Defined at the ptl-level (breaks abstractions but needed for shared vs
+ * non-shared contexts */
+extern int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc);
+
+#ifdef PSM_CUDA
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+ struct ips_tid_send_desc *tidsendc);
+static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+ psm2_mq_req_t req,
+ struct ips_tid_send_desc *tidsendc,
+ struct ips_cuda_hostbuf *chb_prev,
+ uint32_t tsess_srcoff,
+ uint32_t tsess_length,
+ uint32_t tsess_unaligned_start,
+ psm2_chb_match_type_t type);
+#endif
+
+/*
+ * Allocate and initialize the expected-protocol (TID) state for an endpoint.
+ *
+ * On success *protoexp_o receives the new ips_protoexp and PSM2_OK is
+ * returned.  On failure everything allocated so far is torn down via the
+ * 'fail' label and an error code is returned.  Initialization order matters:
+ * tidflow control, then tid control, then scb control, then the send/getreq
+ * memory pools, then the requeue timers.
+ *
+ * NOTE(review): num_of_send_bufs is accepted but not referenced in this
+ * function body - presumably kept for interface compatibility; confirm
+ * against callers before removing.
+ */
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+ const struct ips_proto *proto,
+ uint32_t protoexp_flags,
+ int num_of_send_bufs,
+ int num_of_send_desc, struct ips_protoexp **protoexp_o)
+{
+ ips_protoexp_hfi1_check_dma_no_hdrsupp_for_msgs_leq_8dw();
+
+ struct ips_protoexp *protoexp = NULL;
+ uint32_t tidmtu_max;
+ psm2_error_t err = PSM2_OK;
+
+ protoexp = (struct ips_protoexp *)
+ psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+ if (protoexp == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ *protoexp_o = protoexp;
+
+ protoexp->ptl = (const struct ptl *)proto->ptl;
+ protoexp->proto = (struct ips_proto *)proto;
+ protoexp->timerq = proto->timerq;
+ /* Seed the per-protoexp reentrant PRNG (used for tidflow generation). */
+ srand48_r((long int) getpid(), &protoexp->tidflow_drand48_data);
+ protoexp->tid_flags = protoexp_flags;
+ /* Header suppression: honored only if the hardware/driver advertises
+ * the capability; PSM2_HDRSUPP=0 lets the user force it off, in which
+ * case the context's tidflow-valid flag is cleared as well. */
+ if (context->runtime_flags & HFI1_CAP_HDRSUPP) {
+ union psmi_envvar_val env_hdrsupp;
+
+ psmi_getenv("PSM2_HDRSUPP",
+ "header suppression(0 disables it)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)1, &env_hdrsupp);
+ if (env_hdrsupp.e_uint)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_HDR_SUPP;
+ else
+ /* user wants to turn off header suppression */
+ context->ctrl->__hfi_tfvalid = 0;
+ }
+
+ if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) {
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED;
+ }
+
+ {
+ /*
+ * Adjust the session window size so that tid-grant message can
+ * fit into a single frag size packet for single transfer, PSM
+ * must send tid-grant message with a single packet.
+ */
+ uint32_t fragsize, winsize;
+
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ fragsize = proto->epinfo.ep_mtu;
+ else
+ fragsize = proto->epinfo.ep_piosize;
+
+ winsize = 2 * PSMI_PAGESIZE /* bytes per tid-pair */
+ /* space in packet */
+ * min((fragsize - sizeof(ips_tid_session_list)),
+ /* space in tidsendc/tidrecvc descriptor */
+ PSM_TIDLIST_BUFSIZE)
+ / sizeof(uint32_t); /* convert to tid-pair */
+
+ if (proto->mq->hfi_base_window_rv > winsize)
+ proto->mq->hfi_base_window_rv = winsize;
+ }
+
+ /* Must be initialized already */
+ /* Commented out because of Klocwork scanning critical error. CQ 11/16/2012
+ psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+ proto->ep->mq->rreq_pool != NULL &&
+ proto->ep->mq->sreq_pool != NULL);
+ */
+ psmi_assert_always(proto->timerq != NULL);
+ /* Make sure pbc is at the right place before the message header */
+ psmi_assert_always(sizeof(struct hfi_pbc) == (size_t)
+ (offsetof(struct ips_scb, ips_lrh) -
+ offsetof(struct ips_scb, pbc)));
+
+ /* These request pools are managed by the MQ component */
+ protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+ protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+ /* tid traffic xfer type */
+ if (proto->flags & IPS_PROTO_FLAG_SPIO)
+ protoexp->tid_xfer_type = PSM_TRANSFER_PIO;
+ else
+ protoexp->tid_xfer_type = PSM_TRANSFER_DMA;
+
+ /* ctrl ack/nak xfer type */
+ if (proto->flags & IPS_PROTO_FLAG_SDMA)
+ protoexp->ctrl_xfer_type = PSM_TRANSFER_DMA;
+ else
+ protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO;
+
+ /* Initialize tid flow control. */
+ err = ips_tf_init(protoexp, context, &protoexp->tfc,
+ ips_tidflow_avail_callback);
+ if (err != PSM2_OK)
+ goto fail;
+
+ if (proto->flags & IPS_PROTO_FLAG_SPIO)
+ tidmtu_max = proto->epinfo.ep_piosize;
+ else
+ tidmtu_max = proto->epinfo.ep_mtu;
+
+ protoexp->tid_send_fragsize = tidmtu_max;
+
+ if ((err = ips_tid_init(context, protoexp,
+ ips_tid_avail_callback, protoexp)))
+ goto fail;
+
+ if ((err = ips_scbctrl_init(context, num_of_send_desc, 0,
+ 0, 0, ips_tid_scbavail_callback,
+ protoexp, &protoexp->tid_scbc_rv)))
+ goto fail;
+
+ {
+ /* Determine interval to generate headers (relevant only when header
+ * suppression is enabled) else headers will always be generated.
+ *
+ * The PSM2_EXPECTED_HEADERS environment variable can specify the
+ * packet interval to generate headers at. Else a header packet is
+ * generated every
+ * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize).
+ * Note: A header is always generated for the last packet in the flow.
+ */
+
+ union psmi_envvar_val env_exp_hdr;
+ uint32_t defval = min(PSM_DEFAULT_EXPECTED_HEADER,
+ proto->mq->hfi_base_window_rv /
+ protoexp->tid_send_fragsize);
+
+ psmi_getenv("PSM2_EXPECTED_HEADERS",
+ "Interval to generate expected protocol headers",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)defval, &env_exp_hdr);
+
+ protoexp->hdr_pkt_interval = env_exp_hdr.e_uint;
+ /* Account for flow credits - Should try to have at least 4 headers
+ * generated per window.
+ */
+ protoexp->hdr_pkt_interval =
+ max(min
+ (protoexp->hdr_pkt_interval, proto->flow_credits >> 2),
+ 1);
+
+ if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) {
+ _HFI_VDBG
+ ("Overriding PSM2_EXPECTED_HEADERS=%u to be '%u'\n",
+ env_exp_hdr.e_uint, protoexp->hdr_pkt_interval);
+ }
+
+ }
+
+ {
+ union psmi_envvar_val env_rts_cts_interleave;
+
+ psmi_getenv("PSM2_RTS_CTS_INTERLEAVE",
+ "Interleave the handling of RTS to provide a fair distribution between multiple senders",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, &env_rts_cts_interleave);
+ if (env_rts_cts_interleave.e_uint)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE;
+ }
+
+ /* Send descriptors.
+ *
+ * There can be up to 2^32 of these send descriptors. We conservatively
+ * allocate 256 but large node configurations can allocate up to sdesc_num
+ * of these (they are about 2k each).
+ * We impose a theoretical limit of 2^30.
+ */
+ {
+ struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS;
+ uint32_t maxsz, chunksz;
+
+ if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ protoexp->tid_desc_send_pool =
+ psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz,
+ maxsz, 0, DESCRIPTORS, NULL, NULL);
+
+ if (protoexp->tid_desc_send_pool == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate tid descriptor memory pool");
+ goto fail;
+ }
+ }
+
+ /* Receive descriptors are an array in tidflow structure. */
+
+ /* This pool can never be smaller than the max number of rreqs that can be
+ * allocated. */
+ {
+ uint32_t rreq_per_chunk, rreq_max;
+
+ psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL);
+
+ psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool,
+ &rreq_per_chunk, &rreq_max);
+
+ protoexp->tid_getreq_pool =
+ psmi_mpool_create(sizeof(struct ips_tid_get_request),
+ rreq_per_chunk, rreq_max, 0, DESCRIPTORS,
+ NULL, NULL);
+
+ if (protoexp->tid_getreq_pool == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate getreq descriptor memory pool");
+ goto fail;
+ }
+ }
+
+ /* Timers to handle requeueing of work out of the receive path */
+ psmi_timer_entry_init(&protoexp->timer_send,
+ ips_tid_pendsend_timer_callback, protoexp);
+ STAILQ_INIT(&protoexp->pend_sendq);
+ psmi_timer_entry_init(&protoexp->timer_getreqs,
+ ips_tid_pendtids_timer_callback, protoexp);
+ STAILQ_INIT(&protoexp->pend_getreqsq);
+
+ protoexp->tid_page_offset_mask = PSMI_PAGESIZE - 1;
+ protoexp->tid_page_mask = ~(PSMI_PAGESIZE - 1);
+
+ /*
+ * After ips_tid_init(), we know if we use tidcache or not.
+ * if tid cache is used, we can't use tid debug.
+ */
+#ifdef PSM_DEBUG
+ if (protoexp->tidc.tid_array == NULL)
+ protoexp->tid_flags |= IPS_PROTOEXP_FLAG_TID_DEBUG;
+#endif
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int i;
+ protoexp->tid_info = (struct ips_tidinfo *)
+ psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS,
+ sizeof(struct ips_tidinfo));
+ if (protoexp->tid_info == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ for (i = 0; i < IPS_TID_MAX_TIDS; i++) {
+ protoexp->tid_info[i].state = TIDSTATE_FREE;
+ protoexp->tid_info[i].tidrecvc = NULL;
+ protoexp->tid_info[i].tid = 0xFFFFFFFF;
+ }
+ } else
+ protoexp->tid_info = NULL;
+
+#ifdef PSM_CUDA
+ {
+ /* Host bounce-buffer pools are only needed when CUDA is enabled
+ * but GPUDirect RDMA receive is not available/selected. */
+ if (PSMI_IS_CUDA_ENABLED &&
+ !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+ struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+ uint32_t maxsz, chunksz, max_elements;
+
+ if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+ &rlim, &maxsz, &chunksz)))
+ goto fail;
+
+ /* the maxsz is the amount in MB, not the number of entries,
+ * since the element size depends on the window size */
+ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+ /* mpool requires max_elements to be power of 2. round down. */
+ max_elements = 1 << (31 - __builtin_clz(max_elements));
+ protoexp->cuda_hostbuf_recv_cfg.bufsz =
+ proto->mq->hfi_base_window_rv;
+
+ protoexp->cuda_hostbuf_pool_recv =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &protoexp->cuda_hostbuf_recv_cfg);
+
+ if (protoexp->cuda_hostbuf_pool_recv == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host receive buffer pool");
+ goto fail;
+ }
+
+ protoexp->cuda_hostbuf_small_recv_cfg.bufsz =
+ CUDA_SMALLHOSTBUF_SZ;
+ protoexp->cuda_hostbuf_pool_small_recv =
+ psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+ chunksz, max_elements, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_hostbuf_alloc_func,
+ (void *)
+ &protoexp->cuda_hostbuf_small_recv_cfg);
+
+ if (protoexp->cuda_hostbuf_pool_small_recv == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host small receive buffer pool");
+ goto fail;
+ }
+
+ /* cudaStreamCreateWithFlags(cudaStreamNonBlocking) requires
+ * CUDA runtime >= 7.0; fall back to the default stream ctor. */
+ if (cuda_runtime_version >= 7000) {
+ PSMI_CUDA_CALL(cudaStreamCreateWithFlags,
+ &protoexp->cudastream_recv,
+ cudaStreamNonBlocking);
+ } else {
+ PSMI_CUDA_CALL(cudaStreamCreate,
+ &protoexp->cudastream_recv);
+ }
+ STAILQ_INIT(&protoexp->cudapend_getreqsq);
+ } else {
+ protoexp->cuda_hostbuf_pool_recv = NULL;
+ protoexp->cuda_hostbuf_pool_small_recv = NULL;
+ }
+ }
+#endif
+ psmi_assert(err == PSM2_OK);
+ return err;
+
+fail:
+ /* Unified error exit: tear down whatever was successfully created.
+ * psmi_calloc zeroed the struct, so unallocated pointers are NULL. */
+#ifdef PSM_CUDA
+ if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL)
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+ if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL)
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+#endif
+ if (protoexp != NULL && protoexp->tid_getreq_pool != NULL)
+ psmi_mpool_destroy(protoexp->tid_getreq_pool);
+ if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL)
+ psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+ if (protoexp != NULL)
+ ips_scbctrl_fini(&protoexp->tid_scbc_rv);
+ if (protoexp != NULL)
+ psmi_free(protoexp);
+ return err;
+}
+MOCK_DEF_EPILOGUE(ips_protoexp_init);
+
+/*
+ * Tear down the expected-protocol state created by ips_protoexp_init(),
+ * in roughly the reverse order of construction: CUDA host-buffer pools
+ * (when they were created), getreq/send descriptor pools, scb control,
+ * tid control, tidflow control, optional tid debug array, and finally the
+ * protoexp struct itself.  Returns the first error encountered; the
+ * 'fail' label is also the shared success exit (falls through with
+ * err == PSM2_OK).
+ */
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp)
+{
+ psm2_error_t err = PSM2_OK;
+
+#ifdef PSM_CUDA
+ /* Pools exist only under the same condition they were created in init. */
+ if(PSMI_IS_CUDA_ENABLED &&
+ !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+ psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+ }
+#endif
+ psmi_mpool_destroy(protoexp->tid_getreq_pool);
+ psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+
+ if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv)))
+ goto fail;
+
+ if ((err = ips_tid_fini(&protoexp->tidc)))
+ goto fail;
+
+ if ((err = ips_tf_fini(&protoexp->tfc)))
+ goto fail;
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG)
+ psmi_free(protoexp->tid_info);
+
+ psmi_free(protoexp);
+
+fail:
+ return err;
+}
+
+/* New scbs now available. If we have pending sends or pending get requests,
+ * turn on the corresponding timer so the work is processed outside the
+ * receive path.  'scbc' is unused; the protoexp is recovered from the
+ * opaque callback context. */
+static
+void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_sendq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_send, PSMI_TIMER_PRIO_1);
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+/* New Tids are available. If there are pending get requests put the
+ * get timer on the timerq so it can be processed.  'tidc' is unused;
+ * the protoexp is recovered from the opaque callback context. */
+static
+void ips_tid_avail_callback(struct ips_tid *tidc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+/* New Tid Flows are available. If there are pending get requests put the
+ * get timer on the timerq so it can be processed.  'tfc' is unused;
+ * the protoexp is recovered from the opaque callback context. */
+static
+void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ }
+ return;
+}
+
+/*
+ * The tid get request is always issued from within the receive progress loop,
+ * which is why we always enqueue the request instead of issuing it directly.
+ * Eventually, if we expose tid_get to users, we will want to differentiate
+ * when the request comes from the receive progress loop from cases where the
+ * tid_get is issued directly from user code.
+ *
+ */
+/*
+ * Queue a TID get request for 'length' bytes into 'buf' from the peer
+ * 'epaddr', identified on the remote side by 'remote_tok'.  The request is
+ * always enqueued on pend_getreqsq (see block comment above); if both tids
+ * and tidflows are currently available the pending-tids callback is invoked
+ * inline, otherwise the getreqs timer is scheduled (unless either resource
+ * reported -1).  'callback' fires on completion with 'context' (which, in
+ * CUDA builds, is assumed to be the psm2_mq_req_t).  Always returns PSM2_OK;
+ * running out of getreq descriptors is treated as fatal (PSMI_EP_NORETURN)
+ * because the pool is sized to match the receive request pool.
+ */
+psm2_error_t
+ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+ void *buf,
+ uint32_t length,
+ psm2_epaddr_t epaddr,
+ uint32_t remote_tok,
+ uint32_t flags,
+ ips_tid_completion_callback_t callback,
+ void *context)
+{
+ struct ips_tid_get_request *getreq;
+ int count, nbytes, tids, tidflows;
+
+ PSM2_LOG_MSG("entering");
+ /* Rendezvous window must be page aligned. */
+ psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0);
+ getreq = (struct ips_tid_get_request *)
+ psmi_mpool_get(protoexp->tid_getreq_pool);
+
+ /* We can't *really* run out of these here because we always allocate as
+ * much as available receive reqs */
+ if_pf(getreq == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Ran out of 'getreq' descriptors");
+ }
+
+ getreq->tidgr_protoexp = protoexp;
+ getreq->tidgr_epaddr = epaddr;
+ getreq->tidgr_lbuf = buf;
+ getreq->tidgr_length = length;
+ getreq->tidgr_sendtoken = remote_tok;
+ getreq->tidgr_ucontext = context;
+ getreq->tidgr_callback = callback;
+ getreq->tidgr_offset = 0;
+ getreq->tidgr_bytesdone = 0;
+ getreq->tidgr_flags = flags;
+
+#ifdef PSM_CUDA
+ /* Use host bounce buffers when the destination is GPU memory and either
+ * GPUDirect RDMA recv is off, or the message exceeds the configured
+ * gpudirect receive threshold. */
+ psm2_mq_req_t req = (psm2_mq_req_t)context;
+ if ((req->is_buf_gpu_mem &&
+ !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) ||
+ ((req->is_buf_gpu_mem &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) &&
+ gpudirect_recv_threshold &&
+ length > gpudirect_recv_threshold))) {
+ getreq->cuda_hostbuf_used = 1;
+ getreq->tidgr_cuda_bytesdone = 0;
+ STAILQ_INIT(&getreq->pend_cudabuf);
+ } else
+ getreq->cuda_hostbuf_used = 0;
+#endif
+
+ /* nbytes is the bytes each channel should transfer. */
+ count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count;
+#ifdef PSM_CUDA
+ if (req->is_buf_gpu_mem)
+ nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE);
+ else
+#endif
+ nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE);
+ getreq->tidgr_rndv_winsz =
+ min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv);
+ /* must be within the tid window size */
+ if (getreq->tidgr_rndv_winsz > PSM_TID_WINSIZE)
+ getreq->tidgr_rndv_winsz = PSM_TID_WINSIZE;
+
+ STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next);
+ tids = ips_tid_num_available(&protoexp->tidc);
+ tidflows = ips_tf_available(&protoexp->tfc);
+
+ /* Make progress now if both resources are available; otherwise defer to
+ * the timer (unless a resource reported -1). */
+ if (tids > 0 && tidflows > 0)
+ ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0);
+ else if (tids != -1 && tidflows != -1)
+ psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/* List of perf events */
+#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */
+
+#define ips_logevent_id(event) _ips_logeventid_ ## event
+#define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr)
+
+/*
+ * Emit a rate-limited warning for a protocol event.  Currently only
+ * handles tid_send_reqs (exhaustion of tid send descriptors): increments
+ * the event counter and, if the per-event warning interval has elapsed,
+ * logs a non-fatal message with elapsed time, the peer's LID/context and
+ * the cumulative count, then arms the next warning deadline.
+ * 'context' is the peer psm2_epaddr_t for tid_send_reqs.
+ */
+static
+void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context)
+{
+ uint64_t t_now = get_cycles();
+
+ switch (eventid) {
+ case ips_logevent_id(tid_send_reqs):{
+ psm2_epaddr_t epaddr = (psm2_epaddr_t) context;
+ proto->psmi_logevent_tid_send_reqs.count++;
+
+ if (t_now >=
+ proto->psmi_logevent_tid_send_reqs.next_warning) {
+ psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK,
+ "Non-fatal temporary exhaustion of send tid dma descriptors "
+ "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)",
+ (double)
+ cycles_to_nanosecs(t_now -
+ proto->
+ t_init) /
+ 1.0e9,
+ (int)psm2_epid_nid(epaddr->
+ epid),
+ (int)psm2_epid_context(epaddr->
+ epid),
+ (long long)proto->
+ psmi_logevent_tid_send_reqs.
+ count);
+ proto->psmi_logevent_tid_send_reqs.
+ next_warning =
+ t_now +
+ sec_2_cycles(proto->
+ psmi_logevent_tid_send_reqs.
+ interval_secs);
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return;
+}
+
+/*
+ * Expected Protocol.
+ *
+ * We're granted tids (as part of a tid get request) and expected to fulfill
+ * the request by associating the request's sendtoken to a tid send descriptor.
+ *
+ * It's possible to be out of tid send descriptors when somehow all allocated
+ * descriptors can't complete all of their sends. For example, the targets of
+ * the sends may be busy in computation loops and not processing incoming
+ * packets.
+ */
+
+/*
+ * Send the tid grant (CTS) for a receive descriptor.  Fills the
+ * pre-allocated grantscb with OPCODE_LONG_CTS: the current tidflow
+ * gen/seq in mdata, the receive descriptor id, the total get length and
+ * the sender's token, with the tid list as payload; then enqueues it on
+ * the peer's message flow and flushes immediately.
+ */
+void
+ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc)
+{
+ ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr;
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ ips_scb_t *scb;
+
+ scb = tidrecvc->grantscb;
+
+ ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val;
+ scb->ips_lrh.data[0] = tidrecvc->rdescid;
+ scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length;
+ scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken;
+
+ /* The tid list travels as the CTS payload. */
+ ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list;
+ ips_scb_length(scb) = tidrecvc->tsess_tidlist_length;
+
+ PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d",
+ tidrecvc->getreq->tidgr_sendtoken);
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+}
+
+/*
+ * Notify the sender that the tid transfer identified by 'sdescid' is
+ * complete.  Fills the pre-allocated completescb with
+ * OPCODE_EXPTID_COMPLETION, the sender's descriptor id and the current
+ * tidflow gen/seq, enqueues it on the peer's message flow and flushes.
+ * Under CTS_SERIALIZED mode, also clears the skip-CTS flag and kicks the
+ * pending-tids callback so the next CTS can make progress.
+ */
+void
+ips_protoexp_send_tid_completion(struct ips_tid_recv_desc *tidrecvc,
+ ptl_arg_t sdescid)
+{
+ ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr;
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ ips_scb_t *scb;
+
+ PSM_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"sdescid._desc_idx: %d",
+ sdescid._desc_idx);
+ scb = tidrecvc->completescb;
+
+ ips_scb_opcode(scb) = OPCODE_EXPTID_COMPLETION;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.data[0] = sdescid;
+
+ /* Attached tidflow gen/seq */
+ scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ flow->flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */
+ ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */
+ }
+}
+
+#ifdef PSM_CUDA
+/* Release a CUDA host bounce buffer: free the pinned host allocation and
+ * its copy-completion event, then the descriptor itself.  Used for
+ * temporary (is_tempbuf) buffers that are not returned to an mpool. */
+static
+void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb)
+{
+ PSMI_CUDA_CALL(cudaFreeHost, chb->host_buf);
+ PSMI_CUDA_CALL(cudaEventDestroy, chb->copy_status);
+ psmi_free(chb);
+ return;
+}
+#endif
+
+/*
+ * Handle an incoming OPCODE_EXPTID_COMPLETION packet (receiver telling us
+ * a tid transfer finished).  Validates expected/NAK state, acks if
+ * requested, looks up the send descriptor by index and checks its
+ * generation is still live, force-completes the associated tidflow if it
+ * still has unacked scbs, recycles CUDA host buffers where applicable,
+ * and completes the MQ request once all of its bytes are accounted for.
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ struct ips_tid_send_desc *tidsendc;
+
+ PSM2_LOG_MSG("entering");
+ PSM_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+ rcv_ev->proto->ep->mq->ep->epid,"desc_id._desc_idx: %d",desc_id._desc_idx);
+
+ if (!ips_proto_is_expected_or_nak(rcv_ev))
+ {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+
+ /* Honor a piggybacked ACK request before processing the completion. */
+ if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ)
+ ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq,
+ &ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+ ips_proto_process_ack(rcv_ev);
+
+ /*
+ * Get the session send descriptor and complete.
+ */
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+ desc_id._desc_idx);
+ _HFI_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc);
+ if (tidsendc == NULL) {
+ _HFI_ERROR
+ ("exptid comp: Index %d is out of range\n",
+ desc_id._desc_idx);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ ptl_arg_t desc_tidsendc;
+
+ psmi_mpool_get_obj_index_gen_count(tidsendc,
+ &desc_tidsendc._desc_idx,
+ &desc_tidsendc._desc_genc);
+
+ _HFI_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n",
+ desc_id._desc_idx, desc_id._desc_genc,
+ desc_tidsendc._desc_idx, desc_tidsendc._desc_genc);
+
+ /* See if the reference is still live and valid */
+ if (desc_tidsendc.u64 != desc_id.u64) {
+ _HFI_ERROR("exptid comp: Genc %d does not match\n",
+ desc_id._desc_genc);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ }
+
+ if (!STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)) {
+ struct ips_message_header hdr;
+
+ /* Hack to handle the tidflow: fabricate a header carrying the
+ * receiver's final gen/seq so the ack path drains the flow. */
+ hdr.data[0] = rcv_ev->p_hdr->data[0];
+ hdr.ack_seq_num = rcv_ev->p_hdr->mdata;
+ hdr.khdr.kdeth0 = __cpu_to_le32(3 << HFI_KHDR_TIDCTRL_SHIFT);
+ rcv_ev->p_hdr = &hdr;
+
+ /*
+ * This call should directly complete the tidflow
+ * and free all scb on the unacked queue.
+ */
+ ips_proto_process_ack(rcv_ev);
+
+ /* Keep KW happy. */
+ rcv_ev->p_hdr = NULL;
+ psmi_assert(STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked));
+ }
+
+ psm2_mq_req_t req = tidsendc->mqreq;
+ /* Check if we can complete the send request. */
+ req->send_msgoff += tidsendc->length;
+
+#ifdef PSM_CUDA
+ /* Recycle or free the CUDA host bounce buffer once all of its bytes
+ * have been read, and keep the prefetcher pipeline moving. */
+ if (req->cuda_hostbuf_used) {
+ if (tidsendc->cuda_num_buf == 1) {
+ tidsendc->cuda_hostbuf[0]->bytes_read +=
+ tidsendc->tid_list.tsess_length;
+ if(tidsendc->cuda_hostbuf[0]->bytes_read ==
+ tidsendc->cuda_hostbuf[0]->size){
+ STAILQ_REMOVE(&req->sendreq_prefetch,
+ tidsendc->cuda_hostbuf[0],
+ ips_cuda_hostbuf, req_next);
+ if (tidsendc->cuda_hostbuf[0]->is_tempbuf)
+ psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]);
+ else {
+ tidsendc->cuda_hostbuf[0]->req = NULL;
+ tidsendc->cuda_hostbuf[0]->offset = 0;
+ tidsendc->cuda_hostbuf[0]->bytes_read = 0;
+ psmi_mpool_put(tidsendc->cuda_hostbuf[0]);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ } else
+ psmi_free(tidsendc->userbuf);
+ }
+#endif
+ if (req->send_msgoff == req->send_msglen) {
+ psmi_mq_handle_rts_complete(req);
+ }
+
+ psmi_mpool_put(tidsendc);
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ struct ips_protoexp *protoexp = proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t desc_id;
+ psmi_seqnum_t sequence_num, tf_sequence_num;
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ PSM2_LOG_MSG("entering");
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ PSM_LOG_EPM(OPCODE_EXPTID,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+ proto->ep->mq->ep->epid,"desc_id._desc_idx: %d", desc_id._desc_idx);
+
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE; /* skip */
+ }
+
+ /* IBTA CCA handling for expected flow. */
+ if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) {
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+ /* Update stats for congestion encountered */
+ proto->epaddr_stats.congestion_pkts++;
+ /* Clear FECN event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+ }
+
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ if_pf(protoexp->tid_flags & IPS_PROTOEXP_FLAG_HDR_SUPP) {
+ /* Drop packet if generation number does not match. There
+ * is a window that before we program the hardware tidflow
+ * table with new gen/seq, hardware might receive some
+ * packets with the old generation.
+ */
+ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen)
+ {
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+
+#ifdef PSM_DEBUG
+ /* Check if new packet falls into expected seq range, we need
+ * to deal with wrap around of the seq value from 2047 to 0
+ * because seq is only 11 bits. */
+ int16_t seq_off = (int16_t)(sequence_num.psn_seq -
+ tidrecvc->tidflow_genseq.psn_seq);
+ if (seq_off < 0)
+ seq_off += 2048; /* seq is 11 bits */
+ psmi_assert(seq_off < 1024);
+#endif
+ /* NOTE: with RSM in use, we should not automatically update
+ * our PSN from the HFI's PSN. The HFI doesn't know about
+ * RSM interceptions.
+ */
+ /* (DON'T!) Update the shadow tidflow_genseq */
+ /* tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; */
+
+ }
+ /* Always check the sequence number if we get a header, even if SH. */
+ if_pt(sequence_num.psn_num == tidrecvc->tidflow_genseq.psn_num) {
+ /* Update the shadow tidflow_genseq */
+ tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1;
+
+ /* update the fake tidflow table with new seq, this is for
+ * seqerr and err_chk_gen processing to get the latest
+ * valid sequence number */
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+ } else {
+ /* Generation mismatch */
+ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) {
+ ips_protoexp_do_tf_generr(protoexp,
+ tidrecvc, p_hdr);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ /* Possible sequence mismatch error */
+ /* First, check if this is a recoverable SeqErr -
+ * caused by a good packet arriving in a tidflow that
+ * has had a FECN bit set on some earlier packet.
+ */
+
+ /* If this is the first RSM packet, our own PSN state
+ * is probably old. Pull from the HFI if it has
+ * newer data.
+ */
+ tf_sequence_num.psn_val =
+ hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+ if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+
+ /* Now re-check the sequence numbers. */
+ if (sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq) {
+ /* It really was a sequence error. Restart. */
+ ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ } else {
+ /* False SeqErr. We can accept this packet. */
+ if (sequence_num.psn_seq == tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq++;
+ }
+ }
+ }
+
+ /* Reset the swapped generation count as we received a valid packet */
+ tidrecvc->tidflow_nswap_gen = 0;
+
+ /* Do some sanity checking */
+ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY);
+ int recv_completion = (tidrecvc->recv_tidbytes ==
+ (p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev)));
+
+ /* If sender requested an ACK with the packet and it is not the last
+ * packet, or if the incoming flow faced congestion, respond with an
+ * ACK packet. The ACK when congested will have the BECN bit set.
+ */
+ if (((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) &&
+ !recv_completion) ||
+ (tidrecvc->tidflow.flags & IPS_FLOW_FLAG_GEN_BECN)) {
+ ips_scb_t ctrlscb;
+
+ /* Ack sender with descriptor index */
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid;
+ ctrlscb.ips_lrh.ack_seq_num = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ OPCODE_ACK,
+ &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ /* If RSM has found a TID packet marked with FECN, the payload
+ * will be written to the eager buffer, and we will have a payload
+ * pointer here. In that case, copy the payload into the user's
+ * buffer. If RSM did not intercept this EXPTID packet, the HFI
+ * will handle the packet payload.
+ * Possibly should assert(0 < paylen < MTU).
+ */
+ if (ips_recvhdrq_event_payload(rcv_ev) &&
+ ips_recvhdrq_event_paylen(rcv_ev))
+ psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset,
+ ips_recvhdrq_event_payload(rcv_ev),
+ ips_recvhdrq_event_paylen(rcv_ev));
+
+ /* If last packet then we are done. We send a tid transfer completion
+ * packet back to sender, free all tids and close the current tidflow
+ * as well as tidrecvc descriptor.
+ * Note: If we were out of tidflow, this will invoke the callback to
+ * schedule pending transfer.
+ */
+ if (recv_completion) {
+ /* copy unaligned data if any */
+ uint8_t *dst, *src;
+
+ if (tidrecvc->tid_list.tsess_unaligned_start) {
+ dst = (uint8_t *)tidrecvc->buffer;
+ src = (uint8_t *)p_hdr->exp_ustart;
+#ifdef PSM_CUDA
+ if (tidrecvc->is_ptr_gpu_backed) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidrecvc->tid_list.tsess_unaligned_start,
+ cudaMemcpyHostToDevice);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidrecvc->tid_list.tsess_unaligned_start);
+ }
+
+ if (tidrecvc->tid_list.tsess_unaligned_end) {
+ dst = (uint8_t *)tidrecvc->buffer +
+ tidrecvc->recv_msglen -
+ tidrecvc->tid_list.tsess_unaligned_end;
+ src = (uint8_t *)p_hdr->exp_uend;
+#ifdef PSM_CUDA
+ if (tidrecvc->is_ptr_gpu_backed) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidrecvc->tid_list.tsess_unaligned_end,
+ cudaMemcpyHostToDevice);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidrecvc->tid_list.tsess_unaligned_end);
+ }
+
+ /* reply tid transfer completion packet to sender */
+ ips_protoexp_send_tid_completion(tidrecvc, p_hdr->exp_sdescid);
+
+ /* Mark receive as done */
+ ips_tid_recv_free(tidrecvc);
+ }
+ PSM2_LOG_MSG("leaving");
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+#ifndef PSM_DEBUG
+# define ips_dump_tids(tid_list, msg, ...)
+#else
+/* Debug-only helper: format "msg" (printf-style) followed by the
+ * comma-separated TID indices of tid_list into a bounded local buffer,
+ * then emit the line through the verbose-debug log channel.  Output is
+ * silently truncated once the 256-byte buffer fills up. */
+static
+void ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...)
+{
+ char line[256];
+ size_t used = 0;
+ int idx = 0;
+ int count = tid_list->tsess_tidcount;
+
+ va_list ap;
+ va_start(ap, msg);
+ used += vsnprintf(line, sizeof(line) - used, msg, ap);
+ va_end(ap);
+
+ /* Append "tid" or "tid," per entry; the final entry gets no comma. */
+ while (idx < count && used < (sizeof(line) - 1)) {
+ const char *sep = (idx == count - 1) ? "" : ",";
+ used += snprintf(line + used, sizeof(line) - used, "%d%s",
+ IPS_TIDINFO_GET_TID(tid_list->tsess_list[idx]), sep);
+ idx++;
+ }
+
+ _HFI_VDBG("%s\n", line);
+}
+#endif
+
+/* Fatal-error reporting path for an out-of-range tid index on a send
+ * descriptor: formats the remaining byte count, the offending tid_idx and
+ * the whole tid list into a local buffer, then aborts through
+ * psmi_handle_error(PSMI_EP_NORETURN, ...) — this function never returns
+ * to normal operation. */
+static
+void ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc)
+{
+ char buf[256];
+ size_t off = 0;
+ int i;
+
+ off += snprintf(buf + off, sizeof(buf) - off,
+ "Remaining bytes: %d Member id %d is not in tid_session_id=%d :",
+ tidsendc->remaining_tidbytes, tidsendc->tid_idx,
+ tidsendc->rdescid._desc_idx);
+
+ /* NOTE(review): the loop bound is tsess_tidcount + 1, so one slot past
+ * the valid tid list is also printed.  tsess_list is a fixed-capacity
+ * array so this stays in bounds, but the extra value is not a real tid
+ * — confirm whether the +1 is intentional diagnostic output. */
+ for (i = 0; i < tidsendc->tid_list.tsess_tidcount + 1; i++)
+ off += snprintf(buf + off, sizeof(buf) - off, "%d,",
+ IPS_TIDINFO_GET_TID(tidsendc->tid_list.
+ tsess_list[i]));
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Trying to use tid idx %d and there are %d members: %s\n",
+ tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount,
+ buf);
+ return;
+}
+
+#ifdef PSM_CUDA
+/* Walk the getreq's pending CUDA device-to-host copy queue in FIFO order
+ * and reclaim every host bounce buffer whose CUDA event has completed.
+ * Stops at the first copy still in flight, re-arms the getreqs timer and
+ * returns PSM2_OK_NO_PROGRESS; returns PSM2_OK once the queue is empty. */
+static
+psm2_error_t
+psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq)
+{
+ struct ips_protoexp *protoexp = getreq->tidgr_protoexp;
+ struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead =
+ &getreq->pend_cudabuf;
+ struct ips_cuda_hostbuf *chb;
+ cudaError_t status;
+
+ /* Get the getreq's first memcpy op */
+ while (!STAILQ_EMPTY(cmemcpyhead)) {
+ chb = STAILQ_FIRST(cmemcpyhead);
+ PSMI_CUDA_CHECK_EVENT(chb->copy_status, status);
+ if (status != cudaSuccess) {
+ /* At least one of the copies is still
+ * in progress. Schedule the timer,
+ * then leave the CUDA progress phase
+ * and check for other pending TID work.
+ */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ return PSM2_OK_NO_PROGRESS;
+ }
+ /* The getreq's oldest cudabuf is done. Reclaim it. */
+ getreq->tidgr_cuda_bytesdone += chb->size;
+ STAILQ_REMOVE_HEAD(cmemcpyhead, next);
+ psmi_mpool_put(chb);
+ }
+ return PSM2_OK;
+}
+
+/* Allocate a fresh ips_cuda_hostbuf tracking structure plus a pinned
+ * (cudaHostAllocPortable) host buffer of window_len bytes and a CUDA
+ * event used to track async copies into it.  psmi_handle_error() is
+ * invoked with PSMI_EP_NORETURN, so a failed calloc does not fall
+ * through to the CUDA calls on a NULL chb. */
+static
+struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len)
+{
+ struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*)
+ psmi_calloc(PSMI_EP_NONE,
+ UNDEFINED, 1,
+ sizeof(struct ips_cuda_hostbuf));
+ if (chb == NULL) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Couldn't allocate cuda host buffers ");
+ }
+ PSMI_CUDA_CALL(cudaHostAlloc,
+ (void **) &chb->host_buf,
+ window_len,
+ cudaHostAllocPortable);
+ PSMI_CUDA_CALL(cudaEventCreate, &chb->copy_status);
+ return chb;
+}
+
+/* Advance the GPU send prefetcher by at most one window: if unsent data
+ * remains on the request, grab a host bounce buffer from the small or
+ * large send pool, start an async device-to-host copy of the next window
+ * and record a CUDA event on it, then append the buffer to the request's
+ * prefetch list.  Best-effort: returns silently (without advancing
+ * prefetch_send_msgoff) when no pool buffer is available. */
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+ struct ips_tid_send_desc *tidsendc)
+{
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_cuda_hostbuf *chb = NULL;
+ psm2_mq_req_t req = tidsendc->mqreq;
+ uint32_t offset, window_len;
+
+ /* try to push the prefetcher forward */
+ if (req->prefetch_send_msgoff < req->send_msglen) {
+ /* some data remains to be sent */
+ offset = req->prefetch_send_msgoff;
+ window_len =
+ ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+ offset, req->buf_len);
+ /* Prefer the small-buffer pool for small windows; fall back
+ * to the regular send pool if the small pool is exhausted. */
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+ /* were any buffers available for the prefetcher? */
+ if (chb == NULL)
+ return;
+ req->prefetch_send_msgoff += window_len;
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ /* void-pointer arithmetic — GNU extension used file-wide */
+ chb->gpu_buf = (void *) req->buf + offset;
+ chb->bytes_read = 0;
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+ return;
+ }
+ return;
+}
+
+/* Drive the prefetcher until the window covering [tsess_srcoff,
+ * tsess_srcoff + tsess_length) is resident in host bounce buffers, then
+ * bind those buffers to the tidsendc.  On a PARTIAL match (chb_prev
+ * already covers the head of the range) the first freshly prefetched
+ * buffer overlapping the range completes a two-buffer attach.  Otherwise
+ * the loop searches for a single buffer containing the whole range, or a
+ * pair of adjacent buffers spanning it; a spanning pair gets a malloc'd
+ * bounce buffer (tidsendc->userbuf) that ips_tid_send_exp() later fills
+ * by stitching the two host buffers together.
+ * When both pools are exhausted a temporary buffer is force-allocated
+ * (chb->is_tempbuf) so progress is always possible. */
+static
+void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+ psm2_mq_req_t req,
+ struct ips_tid_send_desc *tidsendc,
+ struct ips_cuda_hostbuf *chb_prev,
+ uint32_t tsess_srcoff,
+ uint32_t tsess_length,
+ uint32_t tsess_unaligned_start,
+ psm2_chb_match_type_t type)
+{
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_cuda_hostbuf *chb = NULL;
+ uint32_t offset, window_len, attached=0;
+
+ /* try to push the prefetcher forward */
+ while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) {
+ /* some data remains to be sent */
+ offset = req->prefetch_send_msgoff;
+ window_len =
+ ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+ offset, req->buf_len);
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+
+ /* were any buffers available? If not force allocate */
+ if (chb == NULL) {
+ chb = psmi_allocate_chb(window_len);
+ chb->is_tempbuf = 1;
+ }
+ req->prefetch_send_msgoff += window_len;
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ chb->gpu_buf = (void *) req->buf + offset;
+ chb->bytes_read = 0;
+ /* Kick off the async copy for this window and record its
+ * completion event before queueing the buffer. */
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ cudaMemcpyDeviceToHost == cudaMemcpyDeviceToHost ?
+ proto->cudastream_send : proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+ if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+ if ((tsess_srcoff < chb->offset)
+ && ((tsess_srcoff + tsess_length) > chb->offset)) {
+ tidsendc->cuda_hostbuf[0] = chb_prev;
+ tidsendc->cuda_hostbuf[1] = chb;
+ tidsendc->cuda_num_buf = 2;
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start);
+ return;
+ }
+ } else {
+ if (attached) {
+ tidsendc->cuda_hostbuf[0] = chb_prev;
+ tidsendc->cuda_hostbuf[1] = chb;
+ tidsendc->cuda_num_buf = 2;
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start);
+ attached = 0;
+ return;
+ }
+ /* Range starts inside this buffer but spills past its
+ * end: remember it and pair it with the next window. */
+ if ((tsess_srcoff > chb->offset)
+ && (tsess_srcoff < (chb->offset + chb->size))
+ && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) {
+ chb_prev = chb;
+ attached = 1;
+ chb = NULL;
+ continue;
+ } else if ((chb->offset <= tsess_srcoff) &&
+ ((tsess_srcoff + tsess_length) <=
+ (chb->offset+chb->size))) {
+ /* Whole range inside one buffer: single attach. */
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 1;
+ tidsendc->userbuf =
+ (void *)((uintptr_t) chb->host_buf +
+ tsess_srcoff - chb->offset);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tsess_unaligned_start );
+ return;
+ } else
+ chb = NULL;
+ }
+ }
+}
+
+
+
+/* Classify how a tid list's source range [tsess_srcoff, +tsess_length)
+ * relates to an already-prefetched host buffer:
+ *   FULL    — entirely inside this chb;
+ *   SPLIT   — spans this chb and (by the size arithmetic) the next one,
+ *             and the whole range is already prefetched;
+ *   PARTIAL — starts in this chb but extends beyond what has been
+ *             prefetched so far;
+ *   CONTINUE — no overlap; caller keeps scanning.
+ * (The misspelling "prefeteched" is part of the established name; callers
+ * elsewhere in this file use it, so it is kept.) */
+static
+psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb,
+ ips_tid_session_list *tid_list,
+ uint32_t prefetch_send_msgoff)
+{
+ /* To get a match:
+ * 1. Tid list offset + length is contained within a chb
+ * 2. Tid list offset + length is contained within
+ * the prefetched offset of this req.
+ * 3. Tid list offset + length is partially prefetched
+ * within one chb. (A partial match)
+ */
+ if (chb->offset <= tid_list->tsess_srcoff) {
+ if ((chb->offset + chb->size) >=
+ (tid_list->tsess_srcoff + tid_list->tsess_length)) {
+ return PSMI_CUDA_FULL_MATCH_FOUND;
+ } else {
+ if((chb->offset + chb->size) > tid_list->tsess_srcoff){
+ if(((chb->offset + (2 * chb->size)) >
+ (tid_list->tsess_srcoff + tid_list->tsess_length)) &&
+ ((prefetch_send_msgoff) >=
+ (tid_list->tsess_srcoff + tid_list->tsess_length))){
+ return PSMI_CUDA_SPLIT_MATCH_FOUND;
+ } else if((tid_list->tsess_srcoff + tid_list->tsess_length)
+ > prefetch_send_msgoff) {
+ return PSMI_CUDA_PARTIAL_MATCH_FOUND;
+ }
+ }
+ }
+ }
+ return PSMI_CUDA_CONTINUE;
+}
+#endif
+
+
+/* Handle an incoming TID request (CTS) from the receiver: allocate a send
+ * descriptor, copy/merge the receiver's tid list, set up the tidflow with
+ * the receiver-supplied generation/sequence, locate the source data
+ * (including CUDA prefetched host buffers when applicable) and start the
+ * expected send via ips_tid_send_exp().
+ * Returns PSM2_OK on success, or PSM2_EP_NO_RESOURCES when no send
+ * descriptor is available (the event is logged and the caller retries). */
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr,
+ psm2_mq_req_t req,
+ ptl_arg_t rdescid,
+ uint32_t tidflow_genseq,
+ ips_tid_session_list *tid_list,
+ uint32_t tid_list_size)
+{
+ struct ips_tid_send_desc *tidsendc;
+ uint32_t i, j, *src, *dst;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(tid_list_size > sizeof(ips_tid_session_list));
+ psmi_assert(tid_list_size <= sizeof(tidsendc->filler));
+ psmi_assert(tid_list->tsess_tidcount > 0);
+ psmi_assert((rdescid._desc_genc>>16) == 0);
+
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_get(protoexp->tid_desc_send_pool);
+ if (tidsendc == NULL) {
+ PSM2_LOG_MSG("leaving");
+ ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr);
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ req->ptl_req_ptr = (void *)tidsendc;
+ tidsendc->protoexp = protoexp;
+
+ /* Uniquely identify this send descriptor in space and time */
+ tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+ tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+ tidsendc->rdescid = rdescid;
+ tidsendc->ipsaddr = ipsaddr;
+ tidsendc->mqreq = req;
+
+ /*
+ * Copy received tidinfo to local tidsendc buffer.
+ * while doing the copy, we try to merge the tids based on
+ * following rules:
+ * 1. both tids are virtually contiguous(i and i+1 in the array);
+ * 2. both tids have the same tidpair value;
+ * 3. first tid (i) has tidctrl=1;
+ * 4. second tid (i+1) has tidctrl=2;
+ * 5. total length does not exceed 512 pages (2M);
+ *
+ * The restriction of 512 pages comes from the limited number
+ * of bits we have for KDETH.OFFSET:
+ * - The entire mapping space provided through TIDs is to be
+ * viewed as a zero-based address mapping.
+ * - We have 15 bits in KDETH offset field through which we
+ * can address upto a maximum of 2MB.
+ * (with 64-byte offset mode or KDETH.OM = 1)
+ * - Assuming a 4KB page size, 2MB/4KB = 512 pages.
+ */
+ psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list,
+ sizeof(ips_tid_session_list));
+ ips_dump_tids(tid_list, "Received %d tids: ",
+ tid_list->tsess_tidcount);
+
+ /* In-place merge: 'j' tracks the last written (possibly merged)
+ * entry in dst, 'i' scans the remaining source entries. */
+ src = tid_list->tsess_list;
+ dst = tidsendc->tid_list.tsess_list;
+ dst[0] = src[0];
+ j = 0; i = 1;
+ while (i < tid_list->tsess_tidcount) {
+ if ((((dst[j]>>IPS_TIDINFO_TIDCTRL_SHIFT)+1) ==
+ (src[i]>>IPS_TIDINFO_TIDCTRL_SHIFT)) &&
+ (((dst[j]&IPS_TIDINFO_LENGTH_MASK)+
+ (src[i]&IPS_TIDINFO_LENGTH_MASK)) <=
+ PSM_MAX_NUM_PAGES_IN_TIDPAIR)) {
+ /*
+ * merge 'i' to 'j'
+ * (We need to specify "tidctrl" value as 3
+ * if we merge the individual tid-pairs.
+ * Doing that here)
+ */
+ dst[j] += (2 << IPS_TIDINFO_TIDCTRL_SHIFT) +
+ (src[i] & IPS_TIDINFO_LENGTH_MASK);
+ i++;
+ if (i == tid_list->tsess_tidcount) break;
+ }
+ j++;
+ /* copy 'i' to 'j' */
+ dst[j] = src[i];
+ i++;
+ }
+ tidsendc->tid_list.tsess_tidcount = j + 1;
+ /* From here on, operate on the merged local copy only. */
+ tid_list = &tidsendc->tid_list;
+
+ /* Initialize tidflow for window. Use path requested by remote endpoint */
+ ips_flow_init(&tidsendc->tidflow, protoexp->proto, ipsaddr,
+ protoexp->tid_xfer_type, PSM_PROTOCOL_TIDFLOW,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW);
+ tidsendc->tidflow.xmit_seq_num.psn_val = tidflow_genseq;
+ tidsendc->tidflow.xmit_ack_num.psn_val = tidflow_genseq;
+
+ tidsendc->userbuf =
+ (void *)((uintptr_t) req->buf + tid_list->tsess_srcoff);
+ tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ tidsendc->length = tid_list->tsess_length;
+ tidsendc->ctrl_msg_queued = 0;
+ tidsendc->frag_size = min(protoexp->tid_send_fragsize,
+ tidsendc->tidflow.frag_size);
+
+#ifdef PSM_CUDA
+ /* Matching on previous prefetches and initiating next prefetch */
+ struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL;
+ psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE;
+
+ /* check if the prefetcher has a buffer ready to use */
+ tidsendc->cuda_hostbuf[0] = NULL;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 0;
+ if (req->cuda_hostbuf_used) {
+ /* To get a match:
+ * 1. Tid list offset + length is contained within a chb
+ * 2. Tid list offset + length is contained within
+ * the prefetched offset of this req.
+ * 3. Tid list offset + length is partially prefetched
+ * within one chb. (A partial match)
+ */
+ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) {
+ rc = psmi_find_match_in_prefeteched_chb(chb,
+ tid_list,
+ req->prefetch_send_msgoff);
+ if (rc < PSMI_CUDA_CONTINUE)
+ break;
+ }
+ if (rc == PSMI_CUDA_FULL_MATCH_FOUND) {
+ tidsendc->userbuf =
+ (void *)((uintptr_t) chb->host_buf+
+ tid_list->tsess_srcoff - chb->offset);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ /* now associate the buffer with the tidsendc */
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = NULL;
+ tidsendc->cuda_num_buf = 1;
+ } else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){
+ /* Range spans two prefetched buffers; allocate a
+ * bounce buffer that ips_tid_send_exp() will fill. */
+ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ tid_list->tsess_length);
+ tidsendc->userbuf =
+ (void *)((uintptr_t) buffer);
+ tidsendc->buffer =
+ (void *)((uintptr_t)tidsendc->userbuf +
+ tid_list->tsess_unaligned_start);
+ chb_next = STAILQ_NEXT(chb, req_next);
+ tidsendc->cuda_hostbuf[0] = chb;
+ tidsendc->cuda_hostbuf[1] = chb_next;
+ tidsendc->cuda_num_buf = 2;
+ } else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+ psmi_attach_chb_to_tidsendc(protoexp, req,
+ tidsendc,
+ chb,
+ tid_list->tsess_srcoff,
+ tid_list->tsess_length,
+ tid_list->tsess_unaligned_start,
+ rc);
+ } else {
+ psmi_attach_chb_to_tidsendc(protoexp, req,
+ tidsendc,
+ NULL,
+ tid_list->tsess_srcoff,
+ tid_list->tsess_length,
+ tid_list->tsess_unaligned_start,
+ PSMI_CUDA_CONTINUE);
+ }
+ }
+#endif
+
+ /* frag size must be 64B multiples */
+ tidsendc->frag_size &= (~63);
+ tidsendc->is_complete = 0;
+ tidsendc->tid_idx = 0;
+ tidsendc->frame_send = 0;
+
+ /* Payload accounting: unaligned head/tail bytes travel in the
+ * completion header, not through the tid-mapped pages. */
+ tidsendc->tidbytes = 0;
+ tidsendc->remaining_tidbytes = tid_list->tsess_length -
+ tid_list->tsess_unaligned_start - tid_list->tsess_unaligned_end;
+ tidsendc->remaining_bytes_in_tid =
+ (IPS_TIDINFO_GET_LENGTH(tid_list->tsess_list[0]) << 12) -
+ tid_list->tsess_tidoffset;
+ tidsendc->offset_in_tid = tid_list->tsess_tidoffset;
+
+ _HFI_EXP
+ ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n",
+ tidsendc->sdescid._desc_idx, rdescid._desc_idx,
+ tid_list->tsess_srcoff, tid_list->tsess_length,
+ tid_list->tsess_unaligned_start, tid_list->tsess_unaligned_end);
+
+ ips_tid_send_exp(tidsendc);
+
+ /* Add as a pending op and ring up the timer */
+ if (tidsendc->is_complete == 0) {
+ STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+ psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+ PSMI_TIMER_PRIO_1);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ /* Consider breaking out of progress engine here */
+ return PSM2_OK;
+}
+
+
+/* Build one scb for the next chunk of an expected (TID) send: compute the
+ * KDETH offset/OM/TID fields for the first packet, walk the tid list to
+ * count how many frags (packets) this scb carries and their total payload,
+ * attach unaligned head/tail bytes on the final chunk, and decide per-scb
+ * header suppression vs. ACK-request flags.  Returns NULL when no scb is
+ * available (caller treats that as PSM2_EP_NO_RESOURCES). */
+static
+ips_scb_t *
+ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
+ struct ips_tid_send_desc *tidsendc)
+{
+ struct ips_protoexp *protoexp = tidsendc->protoexp;
+ uint32_t *tsess_list = tidsendc->tid_list.tsess_list;
+ uint32_t tid, omode, offset, chunk_size;
+ uint32_t startidx, endidx;
+ uint32_t frame_len, nfrag;
+ uint8_t *bufptr = tidsendc->buffer;
+ ips_scb_t *scb;
+ uint8_t is_payload_per_frag_leq_8dw = 0;
+ /* If payload in the first and last nfrag is less then or equal
+ * to 8DW we disable header suppression so as to detect uncorrectable
+ * errors which will otherwise be non-detectable(since header is
+ * suppressed we lose RHF.EccErr)
+ */
+ if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL)
+ return NULL;
+
+ /*
+ * Make sure the next offset is in 64B multiples with the tid.
+ */
+ frame_len =
+ min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes);
+ if (frame_len > tidsendc->frag_size) {
+ frame_len =
+ tidsendc->frag_size - (tidsendc->offset_in_tid & 63);
+ }
+ /*
+ * Frame length is the amount of payload to be included in a particular
+ * frag of the scb, so we check if frame len is less than or equal
+ * to 8DW. If length is less then then or equal to 8DW for the first
+ * frag then we avoid header suppression
+ */
+ if (frame_len <= 32)
+ is_payload_per_frag_leq_8dw = 1;
+
+ /*
+ * Using large offset mode based on offset length.
+ */
+ if (tidsendc->offset_in_tid < 131072) { /* 2^15 * 4 */
+ /* OM=0: offset expressed in dwords (4B units) */
+ psmi_assert((tidsendc->offset_in_tid % 4) == 0);
+ offset = tidsendc->offset_in_tid / 4;
+ omode = 0;
+ } else {
+ /* OM=1: offset expressed in 64B units */
+ psmi_assert((tidsendc->offset_in_tid % 64) == 0);
+ offset = tidsendc->offset_in_tid / 64;
+ omode = 1;
+ }
+ startidx = tidsendc->tid_idx;
+ tid = IPS_TIDINFO_GET_TID(tsess_list[startidx]);
+ scb->ips_lrh.khdr.kdeth0 = (offset & HFI_KHDR_OFFSET_MASK) |
+ (omode << HFI_KHDR_OM_SHIFT) | (tid << HFI_KHDR_TID_SHIFT);
+
+ scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(tsess_list[startidx]);
+ scb->tsess = (uint32_t *) &tsess_list[startidx];
+
+ /*
+ * Payload and buffer address for current packet. payload_size
+ * must be the first packet size because it is used to initialize
+ * the packet header.
+ */
+ scb->payload_size = frame_len;
+ ips_scb_buffer(scb) = (void *)bufptr;
+ scb->frag_size = tidsendc->frag_size;
+
+ /*
+ * Other packet fields.
+ */
+ PSM_LOG_EPM(OPCODE_EXPTID,PSM_LOG_EPM_TX, protoexp->proto->ep->epid,
+ flow->ipsaddr->epaddr.epid,
+ "psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid._desc_idx: %d, tidsendc->sdescid._desc_idx: %d",
+ psmi_mpool_get_obj_index(tidsendc->mqreq),tidsendc->rdescid._desc_idx,tidsendc->sdescid._desc_idx);
+ ips_scb_opcode(scb) = OPCODE_EXPTID;
+ scb->ips_lrh.exp_sdescid = tidsendc->sdescid;
+ scb->ips_lrh.exp_rdescid_genc = (uint16_t)tidsendc->rdescid._desc_genc;
+ scb->ips_lrh.exp_offset = tidsendc->tidbytes;
+
+ scb->tidsendc = tidsendc;
+ SLIST_NEXT(scb, next) = NULL;
+
+ /*
+ * Loop over the tid session list, count the frag number and payload size.
+ */
+ nfrag = 1;
+ chunk_size = frame_len;
+ while (1) {
+ /* Record last tididx used */
+ endidx = tidsendc->tid_idx;
+ /* Check if all tidbytes are done */
+ tidsendc->remaining_tidbytes -= frame_len;
+ if (!tidsendc->remaining_tidbytes) {
+ /* We do another frame length check for the last frag */
+ if (frame_len <= 32)
+ is_payload_per_frag_leq_8dw = 1;
+ break;
+ }
+
+ /* Update in current tid */
+ tidsendc->remaining_bytes_in_tid -= frame_len;
+ tidsendc->offset_in_tid += frame_len;
+ psmi_assert((tidsendc->offset_in_tid % 64) == 0);
+
+ /* Done with this tid, move on to the next tid */
+ if (!tidsendc->remaining_bytes_in_tid) {
+ tidsendc->tid_idx++;
+ psmi_assert(tidsendc->tid_idx <
+ tidsendc->tid_list.tsess_tidcount);
+ tidsendc->remaining_bytes_in_tid =
+ IPS_TIDINFO_GET_LENGTH(tsess_list
+ [tidsendc->tid_idx]) << 12;
+ tidsendc->offset_in_tid = 0;
+ }
+
+ /* For PIO, only single packet per scb allowed */
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ break;
+ }
+
+ frame_len =
+ min(tidsendc->remaining_bytes_in_tid,
+ tidsendc->remaining_tidbytes);
+ if (frame_len > tidsendc->frag_size)
+ frame_len = tidsendc->frag_size;
+ nfrag++;
+ chunk_size += frame_len;
+ }
+
+ scb->nfrag = nfrag;
+ if (nfrag > 1) {
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size = scb->chunk_size_remaining = chunk_size;
+ }
+ scb->tsess_length = (endidx - startidx + 1) * sizeof(uint32_t);
+
+ /* Keep track of latest buffer location so we restart at the
+ * right location, if we don't complete the transfer */
+ tidsendc->buffer = bufptr + chunk_size;
+ tidsendc->tidbytes += chunk_size;
+
+ /* On DMA flows with the driver workaround present, suppression is
+ * safe even for <=8DW payloads. */
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ hfi1_supports_dma_no_hdrsupp_for_msgs_leq_8dw) {
+ is_payload_per_frag_leq_8dw = 0;
+ }
+
+ /* If last packet, we want a completion notification */
+ if (!tidsendc->remaining_tidbytes) {
+ /* last packet/chunk, attach unaligned data */
+ uint8_t *dst, *src;
+
+ if (tidsendc->tid_list.tsess_unaligned_start) {
+ dst = (uint8_t *)scb->ips_lrh.exp_ustart;
+ src = (uint8_t *)tidsendc->userbuf;
+#ifdef PSM_CUDA
+ /* userbuf is a GPU address when no host bounce buffer
+ * was used; copy the head bytes down to the header. */
+ if (!tidsendc->mqreq->cuda_hostbuf_used) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidsendc->tid_list.tsess_unaligned_start,
+ cudaMemcpyDeviceToHost);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidsendc->tid_list.tsess_unaligned_start);
+
+ }
+
+ if (tidsendc->tid_list.tsess_unaligned_end) {
+ dst = (uint8_t *)&scb->ips_lrh.exp_uend;
+ src = (uint8_t *)tidsendc->userbuf +
+ tidsendc->length -
+ tidsendc->tid_list.tsess_unaligned_end;
+#ifdef PSM_CUDA
+ if (!tidsendc->mqreq->cuda_hostbuf_used) {
+ PSMI_CUDA_CALL(cudaMemcpy, dst, src,
+ tidsendc->tid_list.tsess_unaligned_end,
+ cudaMemcpyDeviceToHost);
+ } else
+#endif
+ ips_protoexp_unaligned_copy(dst, src,
+ tidsendc->tid_list.tsess_unaligned_end);
+ }
+ /*
+ * If the number of fragments is greater then one and
+ * "no header suppression" flag is unset then we go
+ * ahead and suppress the header */
+ if ((scb->nfrag > 1) && (!is_payload_per_frag_leq_8dw))
+ scb->flags |= IPS_SEND_FLAG_HDRSUPP;
+ else
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ tidsendc->is_complete = 1;
+ } else {
+ /* Do not suppress header every hdr_pkt_interval */
+ if ((++tidsendc->frame_send %
+ protoexp->hdr_pkt_interval) == 0)
+ /* Request an ACK */
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ else {
+ if (!is_payload_per_frag_leq_8dw) {
+ /* Request hdr supp */
+ scb->flags |= IPS_SEND_FLAG_HDRSUPP;
+ }
+ }
+ /* assert only single packet per scb */
+ psmi_assert(scb->nfrag == 1);
+ }
+
+#ifdef PSM_CUDA
+ if (tidsendc->mqreq->is_buf_gpu_mem && /* request's buffer comes from GPU realm */
+ !tidsendc->mqreq->cuda_hostbuf_used) { /* and it was NOT moved to HOST memory */
+ scb->mq_req = tidsendc->mqreq; /* so let's mark it per scb, not to check its locality again */
+ }
+#endif
+
+ return scb;
+}
+
+/*
+ * Returns:
+ *
+ * PSM2_OK: scb was allocated for at least one frame, the packet may be queued
+ * or actually sent.
+ *
+ * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ * to be enqueued before polling receive queue.
+ *
+ * PSM2_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ * scbs become available.
+ *
+ * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+static
+psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+ ips_scb_t *scb = NULL;
+ psm2_error_t err = PSM2_OK, err_f;
+ struct ips_protoexp *protoexp = tidsendc->protoexp;
+ struct ips_proto *proto = protoexp->proto;
+ struct ips_flow *flow = &tidsendc->tidflow;
+
+#ifdef PSM_CUDA
+ /* Before sending, make sure every CUDA device-to-host copy backing
+ * this descriptor has completed; otherwise bail out with
+ * PSM2_OK_NO_PROGRESS so the timer retries later. */
+ struct ips_cuda_hostbuf *chb, *chb_next;
+ cudaError_t chb_status;
+ uint32_t offset_in_chb, i;
+ for (i = 0; i < tidsendc->cuda_num_buf; i++) {
+ chb = tidsendc->cuda_hostbuf[i];
+ if (chb) {
+ PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status);
+ if (chb_status != cudaSuccess) {
+ err = PSM2_OK_NO_PROGRESS;
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+ }
+ }
+
+ if (tidsendc->cuda_num_buf == 2) {
+ chb = tidsendc->cuda_hostbuf[0];
+ chb_next = tidsendc->cuda_hostbuf[1];
+ offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset;
+ /* Copying data from multiple cuda
+ * host buffers into a bounce buffer.
+ */
+ memcpy(tidsendc->buffer, chb->host_buf +
+ offset_in_chb, chb->size-offset_in_chb);
+ memcpy(tidsendc->buffer+ chb->size -
+ offset_in_chb, chb_next->host_buf,
+ tidsendc->tid_list.tsess_srcoff +
+ tidsendc->tid_list.tsess_length - chb_next->offset);
+
+ chb->bytes_read += chb->size - offset_in_chb;
+ chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff +
+ tidsendc->tid_list.tsess_length -
+ chb_next->offset;
+ /* Recycle (or free, for force-allocated temps) each host
+ * buffer once every byte in it has been consumed, then let
+ * the prefetcher reuse the freed capacity. */
+ if(chb->bytes_read == chb->size) {
+ STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb,
+ ips_cuda_hostbuf, req_next);
+ if (chb->is_tempbuf)
+ psmi_deallocate_chb(chb);
+ else {
+ chb->req = NULL;
+ chb->offset = 0;
+ chb->bytes_read = 0;
+ psmi_mpool_put(chb);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ if(chb_next->bytes_read == chb_next->size) {
+ STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next,
+ ips_cuda_hostbuf, req_next);
+ if (chb_next->is_tempbuf)
+ psmi_deallocate_chb(chb_next);
+ else{
+ chb_next->req = NULL;
+ chb_next->offset = 0;
+ chb_next->bytes_read = 0;
+ psmi_mpool_put(chb_next);
+ }
+ psmi_cuda_run_prefetcher(protoexp, tidsendc);
+ }
+ }
+#endif
+ /*
+ * We aggressively try to grab as many scbs as possible, enqueue them to a
+ * flow and flush them when either we're out of scbs our we've completely
+ * filled the send request.
+ */
+ while (!tidsendc->is_complete) {
+ if_pf(tidsendc->tid_list.tsess_tidcount &&
+ (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount ||
+ tidsendc->tid_idx < 0))
+ ips_expsend_tiderr(tidsendc);
+
+ if ((scb =
+ ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) {
+ proto->stats.scb_exp_unavail_cnt++;
+ err = PSM2_EP_NO_RESOURCES;
+ break;
+ } else {
+ ips_proto_flow_enqueue(flow, scb);
+ }
+ }
+
+ if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */
+ int num_sent;
+
+ err_f = flow->flush(flow, &num_sent);
+
+ if (err != PSM2_EP_NO_RESOURCES) {
+ /* PSM2_EP_NO_RESOURCES is reserved for out-of-scbs */
+ if (err_f == PSM2_EP_NO_RESOURCES)
+ err = PSM2_TIMEOUT; /* force a resend reschedule */
+ else if (err_f == PSM2_OK && num_sent > 0 &&
+ !ips_ptl_recvq_isempty(protoexp->ptl))
+ err = PSM2_OK_NO_PROGRESS; /* force a rcvhdrq service */
+ }
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+
+/* Timer callback servicing the pending expected-send queue: retry each
+ * queued send descriptor in order, removing completed ones.  On resource
+ * exhaustion or busy hardware the loop stops and the timer is re-armed
+ * (or left to the sendbuf-availability callback).  Always returns
+ * PSM2_OK; per-descriptor errors only control rescheduling.  The `timer`
+ * and `current` parameters follow the psmi_timer callback signature;
+ * only timer->context is used here. */
+static
+psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+ struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context;
+ struct ips_tid_send_pend *phead = &protoexp->pend_sendq;
+ struct ips_tid_send_desc *tidsendc;
+ psm2_error_t err = PSM2_OK;
+
+ while (!STAILQ_EMPTY(phead)) {
+ tidsendc = STAILQ_FIRST(phead);
+
+ err = ips_tid_send_exp(tidsendc);
+
+ if (tidsendc->is_complete)
+ STAILQ_REMOVE_HEAD(phead, next);
+
+ if (err == PSM2_OK) {
+ /* Was able to complete the send, keep going */
+ } else if (err == PSM2_EP_NO_RESOURCES) {
+ /* No more sendbufs available, sendbuf callback will requeue this
+ * timer */
+ break;
+ } else if (err == PSM2_TIMEOUT) {
+ /* Always a case of try later:
+ * On PIO flow, means no send pio bufs available
+ * On DMA flow, means kernel can't queue request or would have to block
+ */
+ psmi_timer_request(protoexp->proto->timerq,
+ &protoexp->timer_send,
+ get_cycles() +
+ protoexp->proto->timeout_send);
+ break;
+ } else {
+ /* Forced to reschedule later so we can check receive queue */
+ psmi_assert(err == PSM2_OK_NO_PROGRESS);
+ psmi_timer_request(protoexp->proto->timerq,
+ &protoexp->timer_send,
+ PSMI_TIMER_PRIO_1);
+ break;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/* Right now, in the kernel we are allowing for virtually non-contiguous pages,
+ in a single call, and we are therefore locking one page at a time, but since
+ the intended use of this routine is for a single group of
+ virtually contiguous pages, that should change to improve
+ performance. That means possibly changing the calling MPI code.
+ Doing so gets rid of some of the loop stuff here, and in the driver,
+ and allows for a single call to the core VM code in the kernel,
+ rather than one per page, definitely improving performance. */
+
+static
+psm2_error_t
+ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ uint32_t nbytes_this)
+{
+ ips_tid_session_list *tid_list = &tidrecvc->tid_list;
+ uintptr_t bufptr = (uintptr_t) tidrecvc->buffer;
+ uint32_t size = nbytes_this;
+ psm2_error_t err = PSM2_OK;
+ uintptr_t pageaddr;
+ uint32_t tidoff, pageoff, pagelen, reglen, num_tids;
+
+ psmi_assert(size >= 4);
+
+ /*
+ * The following calculation does not work when size < 4
+ * and bufptr is byte aligned, it can get negative value.
+ */
+ tid_list->tsess_unaligned_start = (bufptr & 3) ? (4 - (bufptr & 3)) : 0;
+ size -= tid_list->tsess_unaligned_start;
+ bufptr += tid_list->tsess_unaligned_start;
+
+ tid_list->tsess_unaligned_end = size & 3;
+ size -= tid_list->tsess_unaligned_end;
+
+ psmi_assert(size > 0);
+
+#ifdef PSM_CUDA
+ /* Driver pins GPU pages when using GPU Direct RDMA for TID receives;
+ * to accommodate this the calculations of pageaddr, pagelen
+ * and pageoff have been modified to take the GPU page size into
+ * consideration.
+ */
+ if (tidrecvc->is_ptr_gpu_backed) {
+ uint64_t page_mask = ~(PSMI_GPU_PAGESIZE -1);
+ uint32_t page_offset_mask = (PSMI_GPU_PAGESIZE -1);
+ pageaddr = bufptr & page_mask;
+ pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
+ ((bufptr + size - 1) & page_mask) -
+ (bufptr & page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
+ } else {
+ pageaddr = bufptr & protoexp->tid_page_mask;
+ pagelen = (uint32_t) (PSMI_PAGESIZE +
+ ((bufptr + size - 1) & protoexp->tid_page_mask) -
+ (bufptr & protoexp->tid_page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+ }
+#else
+ pageaddr = bufptr & protoexp->tid_page_mask;
+ pagelen = (uint32_t) (PSMI_PAGESIZE +
+ ((bufptr + size - 1) & protoexp->tid_page_mask) -
+ (bufptr & protoexp->tid_page_mask));
+ tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+#endif
+
+ /* Acquire tids for the page-aligned region; the cache path is used
+ * when the tid registration cache is enabled (tid_array non-NULL). */
+ reglen = pagelen;
+ if (protoexp->tidc.tid_array) {
+ if ((err = ips_tidcache_acquire(&protoexp->tidc,
+ (void *)pageaddr, &reglen,
+ (uint32_t *) tid_list->tsess_list, &num_tids,
+ &tidoff
+#ifdef PSM_CUDA
+ , tidrecvc->is_ptr_gpu_backed
+#endif
+ )))
+ goto fail;
+ } else {
+ if ((err = ips_tid_acquire(&protoexp->tidc,
+ (void *)pageaddr, &reglen,
+ (uint32_t *) tid_list->tsess_list, &num_tids
+#ifdef PSM_CUDA
+ , tidrecvc->is_ptr_gpu_backed
+#endif
+ )))
+ goto fail;
+ }
+
+ /*
+ * PSM2 currently provides storage space enough to hold up to
+ * 1024 tids. (PSM_TIDLIST_BUFSIZE). So, make sure we
+ * don't get more than what we can hold from the tidcache here.
+ *
+ * The reason for 1024 tids comes from the PSM_TID_WINSIZE value
+ * (currently 4MB. So, if in future, there is a change to this macro,
+ * then you would need a change to PSM_TIDLIST_BUFSIZE as well).
+ *
+ * Assuming a 4KB page size, to be able to receive
+ * a message of 4MB size, we'd need a maximum of 4MB/4KB = 1024 tids.
+ */
+ psmi_assert(num_tids > 0);
+ psmi_assert(num_tids <= (PSM_TID_WINSIZE/PSM_TIDLIST_BUFSIZE));
+ if (reglen > pagelen) {
+ err = psmi_handle_error(protoexp->tidc.context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "PSM tid registration: "
+ "register more pages than asked");
+ goto fail;
+ } else if (reglen < pagelen) {
+ /*
+ * driver registered less pages, update PSM records.
+ */
+ tid_list->tsess_unaligned_end = 0;
+ tidrecvc->recv_tidbytes = reglen - pageoff;
+ tidrecvc->recv_msglen = tid_list->tsess_unaligned_start +
+ tidrecvc->recv_tidbytes;
+ } else {
+ tidrecvc->recv_tidbytes = size;
+ tidrecvc->recv_msglen = nbytes_this;
+ }
+
+ tid_list->tsess_tidcount = num_tids;
+ tid_list->tsess_tidoffset = tidoff;
+
+ ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);
+
+fail:
+ return err;
+}
+
+/*
+ * Allocate every resource needed to receive one rendezvous window:
+ * a grant scb, a complete scb, a tidflow entry, (with PSM_CUDA) an
+ * optional host bounce buffer, and finally the TIDs themselves.
+ * All-or-nothing: on any failure every resource acquired so far is
+ * released and the operation is rescheduled via the getreqs timer.
+ * On success the fully initialized descriptor is returned through
+ * *ptidrecvc.
+ */
+static
+psm2_error_t
+ips_tid_recv_alloc(struct ips_protoexp *protoexp,
+ ips_epaddr_t *ipsaddr,
+ const struct ips_tid_get_request *getreq,
+ uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc)
+{
+ psm2_error_t err;
+ ips_scb_t *grantscb, *completescb;
+ struct ips_tid_recv_desc *tidrecvc;
+
+ PSM2_LOG_MSG("entering");
+ /* Allocate all necessary resources. */
+
+ /* 1. allocate a tid grant scb. */
+ grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+ if (grantscb == NULL) {
+ /* ips_tid_scbavail_callback() will reschedule */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* 2. allocate a tid complete scb. */
+ completescb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+ if (completescb == NULL) {
+ ips_scbctrl_free(grantscb);
+ /* ips_tid_scbavail_callback() will reschedule */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ /* 3. allocate a tid flow entry. */
+ err = ips_tf_allocate(&protoexp->tfc, &tidrecvc);
+ if (err != PSM2_OK) {
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ /* Unable to get a tidflow for expected protocol. */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+
+#ifdef PSM_CUDA
+ psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext;
+
+ /* GPU Direct RDMA receive: target pointer is GPU memory only when no
+ * host bounce buffer is staged in between. */
+ if (req->is_buf_gpu_mem)
+ tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used;
+ else
+ tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem;
+
+ /* 4. allocate a cuda bounce buffer, if required */
+ struct ips_cuda_hostbuf *chb = NULL;
+ if (getreq->cuda_hostbuf_used) {
+ /* Try the small pool first for small windows, fall back to the
+ * regular pool. */
+ if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ protoexp->cuda_hostbuf_pool_small_recv);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ protoexp->cuda_hostbuf_pool_recv);
+ if (chb == NULL) {
+ /* Unable to get a cudahostbuf for TID.
+ * Release the resources we're holding and reschedule.*/
+ ips_tf_deallocate(&protoexp->tfc,
+ tidrecvc->rdescid._desc_idx);
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ tidrecvc->cuda_hostbuf = chb;
+ tidrecvc->buffer = chb->host_buf;
+ chb->size = 0;
+ chb->gpu_buf = (void *)((uintptr_t) getreq->tidgr_lbuf +
+ getreq->tidgr_offset);
+ } else {
+ chb = NULL;
+ tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf +
+ getreq->tidgr_offset);
+ tidrecvc->cuda_hostbuf = NULL;
+ }
+#else
+ tidrecvc->buffer =
+ (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset);
+#endif
+
+ /* 5. allocate some tids from driver. */
+ err = ips_tid_recv_alloc_frag(protoexp, tidrecvc, nbytes_this);
+ if (err != PSM2_OK) {
+#ifdef PSM_CUDA
+ if (chb)
+ psmi_mpool_put(chb);
+#endif
+ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+ ips_scbctrl_free(completescb);
+ ips_scbctrl_free(grantscb);
+ /* Unable to register tids */
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+ PSM2_LOG_MSG("leaving");
+ return err;
+ }
+
+ /* TID_DEBUG bookkeeping: mark every acquired TID as used and record
+ * which descriptor owns it, asserting it was previously free. */
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int num_tids = tidrecvc->tid_list.tsess_tidcount;
+ int tid, i;
+ for (i = 0; i < num_tids; i++) {
+ tid =
+ IPS_TIDINFO_GET_TID(tidrecvc->tid_list.
+ tsess_list[i]) * 2 +
+ IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list.
+ tsess_list[i]) - 1;
+ psmi_assert(protoexp->tid_info[tid].state ==
+ TIDSTATE_FREE);
+ psmi_assert(protoexp->tid_info[tid].tidrecvc == NULL);
+ psmi_assert(protoexp->tid_info[tid].tid == 0xFFFFFFFF);
+ protoexp->tid_info[tid].state = TIDSTATE_USED;
+ protoexp->tid_info[tid].tidrecvc = tidrecvc;
+ protoexp->tid_info[tid].tid =
+ tidrecvc->tid_list.tsess_list[i];
+ }
+ }
+
+ /* Initialize recv descriptor */
+ tidrecvc->ipsaddr = ipsaddr;
+ tidrecvc->getreq = (struct ips_tid_get_request *)getreq;
+
+ /* Initialize tidflow, instead calling generic routine:
+ ips_flow_init(&tidrecvc->tidflow, protoexp->proto, ipsaddr,
+ protoexp->ctrl_xfer_type, PSM_PROTOCOL_TIDFLOW,
+ IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW);
+ * only reset following necessary field. */
+ tidrecvc->tidflow.ipsaddr = ipsaddr;
+ tidrecvc->tidflow.flags = 0;
+
+ tidrecvc->tidflow_nswap_gen = 0;
+ tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen;
+ tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero),
+ in order to prevent wraparound sequence numbers */
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+
+ tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset;
+ tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen;
+
+ tidrecvc->ctrl_msg_queued = 0;
+ tidrecvc->state = TIDRECVC_STATE_BUSY;
+
+ tidrecvc->stats.nSeqErr = 0;
+ tidrecvc->stats.nGenErr = 0;
+ tidrecvc->stats.nReXmit = 0;
+ tidrecvc->stats.nErrChkReceived = 0;
+
+ /* This gets sent out as a control message, so we need to force 4-byte IB
+ * alignment */
+ tidrecvc->tsess_tidlist_length = (uint16_t)
+ PSMI_ALIGNUP((sizeof(ips_tid_session_list) +
+ (tidrecvc->tid_list.tsess_tidcount *
+ sizeof(uint32_t))), 4);
+
+ _HFI_EXP("alloc tidrecv=%d, paylen=%d, ntid=%d\n",
+ tidrecvc->rdescid._desc_idx,
+ tidrecvc->tsess_tidlist_length,
+ tidrecvc->tid_list.tsess_tidcount);
+
+ tidrecvc->grantscb = grantscb;
+ tidrecvc->completescb = completescb;
+
+ *ptidrecvc = tidrecvc; /* return to caller */
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Timer callback that drains the pending TID-get request queue: for each
+ * queued request it carves out the next rendezvous window, allocates the
+ * receive resources (ips_tid_recv_alloc) and issues the TID grant (CTS).
+ * With PSM_CUDA it first reclaims idle host bounce buffers and retires
+ * requests whose device copies have completed.  Requests that cannot make
+ * progress remain queued; a later callback (scb/tid/tidflow free) resumes
+ * them.  'current' is unused here.
+ */
+static
+psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+ struct ips_tid_get_pend *phead =
+ &((struct ips_protoexp *)timer->context)->pend_getreqsq;
+ struct ips_protoexp *protoexp;
+ struct ips_tid_get_request *getreq;
+ struct ips_tid_recv_desc *tidrecvc;
+ ips_epaddr_t *ipsaddr;
+ uint32_t nbytes_this, count;
+ int ret;
+
+ PSM2_LOG_MSG("entering");
+
+#ifdef PSM_CUDA
+ if (!(((struct ips_protoexp *)timer->context)->proto->flags
+ & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) ||
+ ((((struct ips_protoexp *)timer->context)->proto->flags &
+ IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) &&
+ gpudirect_recv_threshold)) {
+ /* Before processing pending TID requests, first try to free up
+ * any CUDA host buffers that are now idle. */
+ struct ips_tid_get_cudapend *cphead =
+ &((struct ips_protoexp *)timer->context)->cudapend_getreqsq;
+ psm2_error_t err;
+
+ /* See if any CUDA memcpys are in progress. Grab the first getreq... */
+ while (!STAILQ_EMPTY(cphead)) {
+ getreq = STAILQ_FIRST(cphead);
+
+ err = psmi_cuda_reclaim_hostbufs(getreq);
+ if (err == PSM2_OK_NO_PROGRESS)
+ goto cudapend_exit;
+
+ /* This pending cuda getreq has no more CUDA ops queued up.
+ * Either it's completely done, or the CUDA copies have caught
+ * up with the TID data xfer, but the TID xfer itself is not
+ * finished.
+ */
+ if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) {
+ /* TID xfer is done.
+ * We should only get here if:
+ * this involved a cuda copy, and
+ * the TID xfer is done.
+ */
+ psmi_assert(getreq->cuda_hostbuf_used);
+ psmi_assert(getreq->tidgr_length ==
+ getreq->tidgr_offset);
+
+ /* Remove from the cudapend list, and reclaim */
+ getreq->tidgr_protoexp = NULL;
+ getreq->tidgr_epaddr = NULL;
+ STAILQ_REMOVE_HEAD(cphead, tidgr_next);
+
+ /* mark the req as done */
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+ } else
+ break; /* CUDA xfers in progress. Leave. */
+ }
+ }
+cudapend_exit:
+#endif
+
+ while (!STAILQ_EMPTY(phead)) {
+ getreq = STAILQ_FIRST(phead);
+ ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr);
+ count = ipsaddr->msgctl->ipsaddr_count;
+
+ipsaddr_next:
+ /* Round-robin over the rails (ipsaddrs) of this message control
+ * group; 'count' bounds the number of rails tried per request. */
+ ipsaddr = ipsaddr->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp;
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid];
+ if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) {
+ break; /* skip sending next CTS */
+ }
+ }
+
+#ifdef PSM_CUDA
+ if (getreq->cuda_hostbuf_used) {
+ /* If this is a large transfer, we may be able to
+ * start reclaiming before all of the data is sent. */
+ psmi_cuda_reclaim_hostbufs(getreq);
+ }
+#endif
+ /*
+ * Calculate the next window size, avoid the last
+ * window too small.
+ */
+ nbytes_this = getreq->tidgr_length - getreq->tidgr_offset;
+ if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz)
+ nbytes_this = getreq->tidgr_rndv_winsz;
+ else if (nbytes_this > getreq->tidgr_rndv_winsz)
+ nbytes_this /= 2;
+
+ /*
+ * If there is a next window and the next window
+ * length is greater than PAGESIZE, make sure the window
+ * starts on a page boundary.
+ */
+#ifdef PSM_CUDA
+ psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext;
+ if (req->is_buf_gpu_mem){
+ if (((getreq->tidgr_offset + nbytes_this) <
+ getreq->tidgr_length) &&
+ nbytes_this > PSMI_GPU_PAGESIZE) {
+ uint32_t pageoff =
+ (((uintptr_t)getreq->tidgr_lbuf) &
+ (PSMI_GPU_PAGESIZE - 1)) +
+ getreq->tidgr_offset + nbytes_this;
+ nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
+ }
+ } else {
+#endif
+ if ((getreq->tidgr_offset + nbytes_this) <
+ getreq->tidgr_length &&
+ nbytes_this > PSMI_PAGESIZE) {
+ uint32_t pageoff =
+ (((uintptr_t)getreq->tidgr_lbuf) &
+ (PSMI_PAGESIZE - 1)) +
+ getreq->tidgr_offset + nbytes_this;
+ nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
+ }
+#ifdef PSM_CUDA
+ }
+#endif
+
+ psmi_assert(nbytes_this >= 4);
+ psmi_assert(nbytes_this <= PSM_TID_WINSIZE);
+
+ if ((ret = ips_tid_num_available(&protoexp->tidc)) <= 0) {
+ /* We're out of tids. If this process used all the resource,
+ * the free callback will reschedule the operation, otherwise,
+ * we reschedule it here */
+ if (ret == 0)
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) {
+ /* We're out of tidflow. If this process used all the resource,
+ * the free callback will reschedule the operation, otherwise,
+ * we reschedule it here */
+ if (ret == 0)
+ {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else if (ips_tid_recv_alloc(protoexp, ipsaddr,
+ getreq, nbytes_this, &tidrecvc) == PSM2_OK) {
+ ips_protoexp_send_tid_grant(tidrecvc);
+
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+ /*
+ * Once the CTS was sent, we mark it per 'flow' object
+ * not to proceed with next CTSes until that one is done.
+ */
+ struct ips_proto *proto = tidrecvc->protoexp->proto;
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ flow->flags |= IPS_FLOW_FLAG_SKIP_CTS;
+ }
+
+ /*
+ * nbytes_this is the asked length for this session,
+ * ips_tid_recv_alloc() might register less pages, the
+ * real length is in tidrecvc->recv_msglen.
+ */
+ getreq->tidgr_offset += tidrecvc->recv_msglen;
+ psmi_assert(getreq->tidgr_offset <=
+ getreq->tidgr_length);
+ _HFI_VDBG("GRANT tididx=%d srcoff=%d nbytes=%d/%d\n",
+ tidrecvc->rdescid._desc_idx,
+ getreq->tidgr_offset, tidrecvc->recv_msglen,
+ getreq->tidgr_length);
+
+ if (getreq->tidgr_offset == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+ if (getreq->cuda_hostbuf_used) {
+ /* this completes the tid xfer setup.
+ move to the pending cuda ops queue,
+ set the timer to catch completion */
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ STAILQ_INSERT_TAIL(
+ &getreq->tidgr_protoexp->cudapend_getreqsq,
+ getreq, tidgr_next);
+ psmi_timer_request(getreq->tidgr_protoexp->timerq,
+ &getreq->tidgr_protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ continue;
+ }
+#endif
+ getreq->tidgr_protoexp = NULL;
+ getreq->tidgr_epaddr = NULL;
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ continue; /* try next grant request */
+ }
+ else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) {
+ /* In case of multi rail, PSM sends one CTS per request
+ * per card after which the request is moved to the end
+ * of the queue.
+ */
+ count--;
+ if (count)
+ goto ipsaddr_next;
+ STAILQ_REMOVE_HEAD(phead, tidgr_next);
+ STAILQ_INSERT_TAIL(phead, getreq ,tidgr_next);
+ continue;
+ }
+
+ /* created a tidrecvc, reset count */
+ count = ipsaddr->msgctl->ipsaddr_count;
+ goto ipsaddr_next; /* try next fragment on next ipsaddr */
+ }
+
+ /*
+ * We need to loop until we can't get a tidrecvc on all
+ * ipsaddrs, then the callbacks on the home protoexp where
+ * getreq is linked can resume this routine. Otherwise, we
+ * might make this getreq to be orphaned and cause deadlock.
+ */
+ count--;
+ if (count)
+ goto ipsaddr_next;
+ break;
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK; /* XXX err-broken */
+}
+
+#ifdef PSM_CUDA
+/*
+ * Queue an asynchronous copy of the received TID payload (including the
+ * unaligned head/tail bytes) from the host bounce buffer to the
+ * destination GPU buffer, record a CUDA event to detect completion, and
+ * move the bounce buffer onto the getreq's pending-copy list.  Finally
+ * kicks the pendtids callback directly so the window freed here can be
+ * granted again without waiting for the next timer tick.
+ */
+static
+void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc)
+{
+ struct ips_protoexp *protoexp = tidrecvc->protoexp;
+ struct ips_cuda_hostbuf *chb;
+
+ chb = tidrecvc->cuda_hostbuf;
+ chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
+ tidrecvc->tid_list.tsess_unaligned_end;
+
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->gpu_buf, chb->host_buf,
+ tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
+ tidrecvc->tid_list.tsess_unaligned_end,
+ cudaMemcpyHostToDevice,
+ protoexp->cudastream_recv);
+ PSMI_CUDA_CALL(cudaEventRecord, chb->copy_status,
+ protoexp->cudastream_recv);
+
+ STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next);
+ tidrecvc->cuda_hostbuf = NULL;
+ ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0);
+}
+#endif
+
+/*
+ * Tear down a completed TID receive descriptor: release its TIDs (via
+ * cache or directly), update the owning request's byte accounting, free
+ * the tidflow, invoke the request's completion callback when the whole
+ * transfer is done (CUDA completion is deferred to the async-copy path),
+ * and reschedule any requests still waiting on TID resources.
+ */
+static
+psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc)
+{
+ struct ips_protoexp *protoexp = tidrecvc->protoexp;
+ struct ips_tid_get_request *getreq = tidrecvc->getreq;
+ int tidcount = tidrecvc->tid_list.tsess_tidcount;
+ psm2_error_t err = PSM2_OK;
+
+ psmi_assert(getreq != NULL);
+ psmi_assert(tidcount > 0);
+ psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+#ifdef PSM_CUDA
+ /* Payload landed in a host bounce buffer; start the async copy back
+ * to the GPU before releasing the TIDs. */
+ if (tidrecvc->cuda_hostbuf)
+ psmi_cudamemcpy_tid_to_device(tidrecvc);
+#endif
+
+ /* TID_DEBUG bookkeeping: verify this descriptor owned each TID, then
+ * mark it free again (mirror of the marking in ips_tid_recv_alloc). */
+ if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) {
+ int tid, i;
+
+ for (i = 0; i < tidcount; i++) {
+ tid =
+ IPS_TIDINFO_GET_TID(tidrecvc->tid_list.
+ tsess_list[i]) * 2 +
+ IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list.
+ tsess_list[i]) - 1;
+ psmi_assert(protoexp->tid_info[tid].state ==
+ TIDSTATE_USED);
+ psmi_assert(protoexp->tid_info[tid].tidrecvc ==
+ tidrecvc);
+ psmi_assert(protoexp->tid_info[tid].tid ==
+ tidrecvc->tid_list.tsess_list[i]);
+ protoexp->tid_info[tid].state = TIDSTATE_FREE;
+ protoexp->tid_info[tid].tidrecvc = NULL;
+ protoexp->tid_info[tid].tid = 0xFFFFFFFF;
+ }
+ }
+
+ ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ",
+ tidrecvc->tid_list.tsess_tidcount);
+
+ if (protoexp->tidc.tid_array) {
+ if ((err = ips_tidcache_release(&protoexp->tidc,
+ tidrecvc->tid_list.tsess_list, tidcount)))
+ goto fail;
+ } else {
+ if ((err = ips_tid_release(&protoexp->tidc,
+ tidrecvc->tid_list.tsess_list, tidcount)))
+ goto fail;
+ }
+
+ getreq->tidgr_bytesdone += tidrecvc->recv_msglen;
+
+ _HFI_EXP("req=%p bytes=%d/%d\n",
+ getreq->tidgr_ucontext,
+ getreq->tidgr_bytesdone, getreq->tidgr_length);
+
+ tidrecvc->state = TIDRECVC_STATE_FREE;
+
+ /* finally free the tidflow */
+ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+
+ if (getreq->tidgr_bytesdone == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+ /* if cuda, we handle callbacks when the cuda xfer is done */
+ if (!getreq->cuda_hostbuf_used) {
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+ }
+#else
+ if (getreq->tidgr_callback)
+ getreq->tidgr_callback(getreq->tidgr_ucontext);
+ psmi_mpool_put(getreq);
+#endif
+ } else {
+ /* We just released some tids.
+ * If requests are waiting on tids to be
+ * freed, queue up the timer */
+ if (getreq->tidgr_offset < getreq->tidgr_length) {
+ ips_tid_pendtids_timer_callback(&getreq->
+ tidgr_protoexp->
+ timer_getreqs, 0);
+ }
+ }
+
+ if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) {
+ psmi_timer_request(protoexp->timerq,
+ &protoexp->timer_getreqs,
+ PSMI_TIMER_PRIO_1);
+ }
+
+fail:
+ return err;
+}
+
+/*
+ * RHF TID-error handler, active only when IPS_PROTOEXP_FLAG_TID_DEBUG is
+ * set: decodes the TID pair/tidctrl named in the packet's KDETH header,
+ * cross-checks it against the per-TID debug state table, and logs any
+ * inconsistency (unknown TID, wrong owner, wrong generation).  Purely
+ * diagnostic; no recovery is attempted here.
+ */
+void
+ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_tid_recv_desc *tidrecvc;
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+
+ ptl_arg_t desc_id;
+ int tidpair = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK;
+ int tidctrl = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TIDCTRL_SHIFT) & HFI_KHDR_TIDCTRL_MASK;
+ int tid0, tid1, tid;
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Expected sends not enabled */
+ if (protoexp == NULL)
+ return;
+
+ /* Not doing extra tid debugging or not really a tiderr */
+ if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) ||
+ !(rcv_ev->error_flags & HFI_RHF_TIDERR))
+ return;
+
+ if (rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) {
+ _HFI_ERROR("receive type %d is not "
+ "expected in tid debugging\n", rcv_ev->ptype);
+ return;
+ }
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ /* tidctrl == 3 names both TIDs of the pair; otherwise a single TID. */
+ if (tidctrl != 3)
+ tid0 = tid1 = tidpair * 2 + tidctrl - 1;
+ else {
+ tid0 = tidpair * 2;
+ tid1 = tid0 + 1;
+ }
+
+ for (tid = tid0; tid <= tid1; tid++) {
+ if (protoexp->tid_info[tid].state == TIDSTATE_USED)
+ continue;
+
+ char buf[128];
+ char *s = "invalid (not even in table)";
+
+ if (tidrecvc->rdescid._desc_genc ==
+ desc_id._desc_genc)
+ s = "valid";
+ else {
+ snprintf(buf, sizeof(buf) - 1,
+ "wrong generation (gen=%d,received=%d)",
+ tidrecvc->rdescid._desc_genc,
+ desc_id._desc_genc);
+ buf[sizeof(buf) - 1] = '\0';
+ s = buf;
+ }
+
+ if (protoexp->tid_info[tid].tidrecvc != tidrecvc) {
+ _HFI_ERROR
+ ("tid %d not a known member of tidsess %d\n",
+ tid, desc_id._desc_idx);
+ }
+
+ _HFI_ERROR("tid %d is marked unused (session=%d): %s\n", tid,
+ desc_id._desc_idx, s);
+ }
+ return;
+}
+
+/*
+ * Handle an RHF data error on an expected-TID packet.  Header-length
+ * errors are only logged.  Otherwise the packet is matched against its
+ * receive descriptor: stale generation or an already-completed rendezvous
+ * is dropped quietly; an error within the currently active generation
+ * triggers tidflow recovery via ips_protoexp_do_tf_seqerr(), while a
+ * generation mismatch is merely logged (NAK already in flight).
+ */
+void
+ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_tid_recv_desc *tidrecvc;
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ int hdr_err = rcv_ev->error_flags & HFI_RHF_KHDRLENERR;
+ uint8_t op_code = _get_proto_hfi_opcode(p_hdr);
+ char pktmsg[128];
+ char errmsg[256];
+
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Expected sends not enabled */
+ if (protoexp == NULL)
+ return;
+
+ ips_proto_get_rhf_errstring(rcv_ev->error_flags, pktmsg,
+ sizeof(pktmsg));
+
+ snprintf(errmsg, sizeof(errmsg),
+ "%s pkt type opcode 0x%x at hd=0x%x %s\n",
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "Eager" :
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "Expected" :
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "Non-kd" :
+ "<Error>", op_code, rcv_ev->recvq->state->hdrq_head, pktmsg);
+
+ if (!hdr_err) {
+ ptl_arg_t desc_id;
+ psmi_seqnum_t sequence_num;
+
+ desc_id._desc_idx = ips_proto_flowid(p_hdr);
+ desc_id._desc_genc = p_hdr->exp_rdescid_genc;
+
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ /* Print this at very verbose level. Noisy links can have a few of
+ * these! */
+ _HFI_VDBG
+ ("Data Error Pkt and Recv Generation Mismatch: %s",
+ errmsg);
+ return; /* skip */
+ }
+
+ if (tidrecvc->state == TIDRECVC_STATE_FREE) {
+ _HFI_EPDBG
+ ("Data Error Pkt for a Completed Rendezvous: %s",
+ errmsg);
+ return; /* skip */
+ }
+
+ /* See if CRC error for a previous packet */
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+ if (sequence_num.psn_gen == tidrecvc->tidflow_genseq.psn_gen) {
+ /* Try to recover the flow by restarting from previous known good
+ * sequence (possible if the packet with CRC error is after the "known
+ * good PSN" else we can't restart the flow.
+ */
+ return ips_protoexp_do_tf_seqerr(protoexp,
+ tidrecvc, p_hdr);
+ } else {
+ /* Print this at very verbose level */
+ _HFI_VDBG
+ ("Data Error Packet. GenMismatch: Yes. Tidrecvc: %p. "
+ "Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n",
+ tidrecvc, sequence_num.psn_gen,
+ sequence_num.psn_seq,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq, errmsg);
+ }
+
+ } else {
+ _HFI_VDBG("HDR_ERROR: %s\n", errmsg);
+ }
+
+}
+
+psm2_error_t
+ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc)
+{
+ const uint32_t flowidx = tidrecvc->rdescid._desc_idx;
+
+ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+ /* Obtain the next generation number for this tidflow slot. */
+ ips_tfgen_allocate(&tidrecvc->protoexp->tfc, flowidx,
+ &tidrecvc->tidflow_active_gen);
+
+ /* Mirror the new generation into our expected gen/seq state and into
+ * the hardware tidflow table (sequence number is left as-is). */
+ tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen;
+ hfi_tidflow_set_entry(tidrecvc->context->ctrl, flowidx,
+ tidrecvc->tidflow_genseq.psn_gen,
+ tidrecvc->tidflow_genseq.psn_seq);
+
+ /* Account for one more generation swap on this flow. */
+ tidrecvc->tidflow_nswap_gen++;
+
+ return PSM2_OK;
+}
+
+void
+ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t descid;
+
+ psmi_assert_always(protoexp != NULL);
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* Locate the receive descriptor named by the packet header. */
+ descid._desc_idx = ips_proto_flowid(p_hdr);
+ descid._desc_genc = p_hdr->exp_rdescid_genc;
+ tidrecvc = &protoexp->tfc.tidrecvc[descid._desc_idx];
+
+ /* Stale generation or no longer an active rendezvous: drop quietly. */
+ if (tidrecvc->rdescid._desc_genc != descid._desc_genc)
+ return;
+ if (tidrecvc->state != TIDRECVC_STATE_BUSY)
+ return;
+
+ ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr);
+}
+
+/*
+ * Recover a tidflow after a sequence error: swap the flow to a new
+ * generation, optionally request congestion control (BECN) when the flow
+ * has swapped generations repeatedly, and NAK the sender with the last
+ * known-good sequence number so it can restart from there.  Only errors
+ * for the currently active generation are acted upon.
+ */
+static
+void ips_protoexp_do_tf_seqerr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr)
+{
+ psmi_seqnum_t sequence_num, tf_sequence_num;
+ ips_scb_t ctrlscb;
+
+ /* Update stats for sequence errors */
+ tidrecvc->stats.nSeqErr++;
+
+ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ /* Only care about sequence error for currently active generation */
+ if (tidrecvc->tidflow_active_gen != sequence_num.psn_gen)
+ return;
+
+ /* If a "large" number of swapped generation we are losing packets
+ * for this flow. Request throttling of tidflow by generating a
+ * BECN. With header suppression we will miss some FECN packet
+ * on OPA hence keeping track of swapped generation is another
+ * mechanism to do congestion control for tidflows.
+ *
+ * For mismatched sender/receiver/link speeds we can get into a
+ * deadly embrace where minimal progress is made due to generation
+ * mismatch errors. This can occur if we wrap around the generation
+ * count without making progress. Hence in cases where the swapped
+ * generation count is > 254 stop sending BECN (and the NAK) so the
+ * send -> receiver pipeline is flushed with an error check and things
+ * can sync up. This should be an extremely rare event.
+ */
+
+ if_pf(tidrecvc->tidflow_nswap_gen >= 254)
+ return; /* Do not send NAK. Let error check kick in. */
+
+ if_pf((tidrecvc->tidflow_nswap_gen > 4) &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+ _HFI_CCADBG("Generating BECN. Number of swapped gen: %d.\n",
+ tidrecvc->tidflow_nswap_gen);
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ /* Update stats for congestion encountered */
+ protoexp->proto->epaddr_stats.congestion_pkts++;
+ }
+
+ /* Get the latest seq from hardware tidflow table, if that value is
+ * reliable. The value is not reliable if context sharing is used,
+ * because context sharing might drop packet even though hardware
+ * has received it successfully. The hardware table may also be
+ * incorrect if RSM is intercepting TID & FECN & SH packets.
+ * We can handle this condition by taking the most recent PSN whether
+ * it comes from the tidflow table or from PSM's own accounting.
+ */
+ if (!tidrecvc->context->tf_ctrl) {
+ tf_sequence_num.psn_val =
+ hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+ /* Compare like fields: psn_seq against psn_seq (the original
+ * compared the whole psn_val word with the seq bitfield, which
+ * is only correct while the non-seq bits happen to be zero). */
+ if (tf_sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq)
+ tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+ }
+
+ /* Swap generation for the flow. */
+ ips_protoexp_flow_newgen(tidrecvc);
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid;
+ /* Keep peer generation but use my last received sequence */
+ sequence_num.psn_seq = tidrecvc->tidflow_genseq.psn_seq;
+ ctrlscb.ips_lrh.ack_seq_num = sequence_num.psn_val;
+
+ /* My new generation and last received sequence */
+ ctrlscb.ips_lrh.data[1].u32w0 = tidrecvc->tidflow_genseq.psn_val;
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ OPCODE_NAK,
+ &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+
+ /* Update stats for retransmit */
+ tidrecvc->stats.nReXmit++;
+
+ return;
+}
+
+void
+ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_tid_recv_desc *tidrecvc;
+ ptl_arg_t descid;
+
+ psmi_assert_always(protoexp != NULL);
+ psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+ /* A generation error means our NAK crossed on the wire or this packet
+ * is stale; error recovery will resynchronize things, so any packet
+ * that does not match a live descriptor is simply dropped. */
+ descid._desc_idx = ips_proto_flowid(p_hdr);
+ descid._desc_genc = p_hdr->exp_rdescid_genc;
+ tidrecvc = &protoexp->tfc.tidrecvc[descid._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != descid._desc_genc)
+ return;
+ if (tidrecvc->state != TIDRECVC_STATE_BUSY)
+ return;
+
+ ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr);
+}
+
+static
+void ips_protoexp_do_tf_generr(struct ips_protoexp *protoexp,
+ struct ips_tid_recv_desc *tidrecvc,
+ struct ips_message_header *p_hdr)
+{
+ /* Bookkeeping only: count the generation error. If the packet faced
+ * congestion we may eventually want to generate a CN packet here to
+ * rate control the sender. */
+ tidrecvc->stats.nGenErr++;
+}
diff --git a/ptl_ips/ips_proto_header.h b/ptl_ips/ips_proto_header.h
new file mode 100644
index 0000000..6677162
--- /dev/null
+++ b/ptl_ips/ips_proto_header.h
@@ -0,0 +1,181 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HEADER_H
+#define _IPS_PROTO_HEADER_H
+
+/* Although defined as macros, the *_BITS values below are NOT meant to be
+ changed. They are defined this way so that their values are written in
+ exactly one place. These macros are used in struct ips_message_header
+ below, as well as in the active messages code for the purpose of
+ establishing how many arguments/handlers are supported, and to assert that
+ values written into the header fields are not too large for the number of
+ bits available. The preprocessor check below ensures less than 32 bits are
+ used.
+ */
+
+/* Number of bits to use for the amhdr_len field. */
+#define IPS_AM_HDR_LEN_BITS 4
+
+/* Number of bits to use for the amhdr_hidx field. Bounds the number of
+ * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */
+#define IPS_AM_HDR_HIDX_BITS 8
+
+/* Number of bits to use for the amhdr_nargs field. Bounds the number of
+   arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */
+#define IPS_AM_HDR_NARGS_BITS 4
+
+/* The three AM fields above share one 32-bit header word (see the amhdr_*
+   bitfields in struct ips_message_header); enforce that at compile time. */
+#if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32
+#error "Bad IPS header definition: AM fields must use 32 bits or less"
+#endif
+
+/* Number of AM arguments that can be packed into struct ips_message_header.
+   Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in
+   the data payload. */
+#define IPS_AM_HDR_NARGS \
+ (sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t))
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    OPA words contain LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol)
+ *    IPS hardware words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol)
+ *    IPS user words contain ips-protocol-specific data
+ *
+ * The header message size is computed as IWORDS + HWORDS + UWORDS
+ */
+/*
+ * On-the-wire IPS message header: OPA LRH (big-endian) + BTH (big-endian)
+ * followed by host-byte-order KDETH and ips-protocol words.  The trailing
+ * union overlays three interpretations of the same words: generic
+ * control/AM data, message-header packets, and expected-TID packets.
+ */
+struct ips_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+
+	/* fields below this point are in host byte order */
+	struct hfi_kdeth khdr;
+
+	struct {
+		__u32 flags:6;
+		__u32 connidx:26;	/* connection idx */
+	};
+
+	union {
+		struct {
+			struct {
+				__u32 ack_seq_num:31;
+				__u32 reserved:1;
+			};
+
+			union {
+				struct {	/* for active message */
+					__u32 amhdr_len:IPS_AM_HDR_LEN_BITS;
+					__u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS;
+					__u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS;
+				};
+				__u32 mdata;	/* for misc data */
+			};
+
+			/* Inline arguments and/or message payload */
+			union {
+				ptl_arg_t data[2];
+				__u32 uwords[4];
+			};
+		};
+
+		/* for message header packet only */
+		struct {
+			__u32 pad1;
+			__u32 tag[3];	/* 96 bits psm tag */
+			ptl_arg_t hdr_data;
+		};
+
+		/* for expected tid packet only */
+		struct {
+			__u8 exp_ustart[3];	/* unaligned start bytes */
+			__u8 exp_uend[3];	/* unaligned end bytes */
+			__u16 exp_rdescid_genc;	/* tidrecvc gen count */
+			ptl_arg_t exp_sdescid;	/* sender descriptor id */
+			__u32 exp_cksum;	/* optional checksum */
+			__u32 exp_offset;	/* packet offset */
+		};
+	};
+};
+
+/*
+ * OpCodes in BTH[0], 24-31 bits. Order is important!!!
+ * Packet dispatch indexes ips_packet_service_routine[] by
+ * (opcode - OPCODE_RESERVED), so the values must stay contiguous
+ * (see ips_proto_process_packet in ips_proto_help.h).
+ */
+#define OPCODE_RESERVED			0xC0	/* reserved */
+#define OPCODE_TINY			0xC1	/* 0 <= msglen <= 8 */
+#define OPCODE_SHORT			0xC2	/* 8 < msglen <= MTU */
+#define OPCODE_EAGER			0xC3	/* eager packet */
+#define OPCODE_LONG_RTS			0xC4	/* ready to send */
+#define OPCODE_LONG_CTS			0xC5	/* confirm to send */
+#define OPCODE_LONG_DATA		0xC6	/* long data packets */
+#define OPCODE_EXPTID			0xC7	/* expected tid data */
+#define OPCODE_EXPTID_COMPLETION	0xC8	/* expected tid completion */
+#define OPCODE_ACK			0xC9	/* explicit ACK packet */
+#define OPCODE_NAK			0xCA	/* explicit NAK packet */
+#define OPCODE_BECN			0xCB	/* congestion control */
+#define OPCODE_ERR_CHK			0xCC	/* query eager receiving */
+#define OPCODE_ERR_CHK_GEN		0xCD	/* query tid receiving */
+#define OPCODE_CONNECT_REQUEST		0xCE	/* connect request */
+#define OPCODE_CONNECT_REPLY		0xCF	/* connect reply */
+#define OPCODE_DISCONNECT_REQUEST	0xD0	/* disconnect request */
+#define OPCODE_DISCONNECT_REPLY		0xD1	/* disconnect reply */
+#define OPCODE_AM_REQUEST_NOREPLY	0xD2	/* AM request w/o reply */
+#define OPCODE_AM_REQUEST		0xD3	/* AM request */
+#define OPCODE_AM_REPLY			0xD4	/* AM reply */
+#define OPCODE_FUTURE_FROM		0xD5	/* reserved for expansion */
+#define OPCODE_FUTURE_TO		0xDF	/* reserved for expansion */
diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h
new file mode 100644
index 0000000..5434b02
--- /dev/null
+++ b/ptl_ips/ips_proto_help.h
@@ -0,0 +1,705 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HELP_H
+#define _IPS_PROTO_HELP_H
+
+#include "ips_recvhdrq.h"
+#include "ips_proto.h"
+#include "ipserror.h"
+#include "psm_mq_internal.h" /* psmi_mq_handle_tiny_envelope */
+#include "ptl_ips.h"
+
+/* hfi_opcode is not the ips-level opcode: this extracts the ips-level
+ * OPCODE_* value from the opcode field of big-endian BTH[0]. */
+PSMI_ALWAYS_INLINE(
+uint8_t
+_get_proto_hfi_opcode(const struct ips_message_header *p_hdr))
+{
+	return ((__be32_to_cpu(p_hdr->bth[0]) >>
+		 HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK);
+}
+
+/*
+ * Decide whether this scb should request an ACK and return the protocol
+ * flag bits to place in the packet header.  Multi-fragment sends (nfrag > 1)
+ * and scbs already marked ACKREQ reset the flow's ack counter; otherwise an
+ * ACK is requested once every ack_interval packets.
+ */
+PSMI_ALWAYS_INLINE(
+uint8_t
+ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow))
+{
+	/*
+	 * Setup ACK request if more than ack_interval packets
+	 * have not been requested an ACK
+	 */
+	if (scb->flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) {
+		flow->ack_counter = 0;
+	} else {
+		flow->ack_counter++;
+		if (flow->ack_counter > flow->ack_interval) {
+			flow->ack_counter = 0;
+			scb->flags |= IPS_SEND_FLAG_ACKREQ;
+		}
+	}
+
+	/* Bottom 6 bits wind up in protocol header fields, other bits
+	 * control other aspects of packet composition */
+	return (uint8_t) (scb->flags & IPS_SEND_FLAG_PROTO_OPTS);
+}
+
+/* Extract the flow id from the flowid field of big-endian BTH[1]. */
+PSMI_ALWAYS_INLINE(
+ips_epaddr_flow_t
+ips_proto_flowid(struct ips_message_header *p_hdr))
+{
+	return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >>
+				     HFI_BTH_FLOWID_SHIFT) &
+				    HFI_BTH_FLOWID_MASK);
+}
+
+/*
+ * Compute a software CRC over the packet header and (optional) payload,
+ * returned through *cksum.  Also rewrites lrh[2] to include the extra
+ * PSM_CRC_SIZE_IN_BYTES in the packet-length field so the wire length
+ * accounts for the appended checksum.  Always returns 0.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr,
+	     void *payload, uint32_t paylen, uint32_t *cksum))
+{
+	uint16_t paywords;
+
+	/* Update the payload words in header */
+	paywords = (sizeof(struct ips_message_header) + paylen +
+		    PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+	/* Need to regenerate KDETH checksum after updating payload length */
+	/* ips_kdeth_cksum(p_hdr); */
+
+	/* Seed CRC with all-ones, then fold in header and payload. */
+	*cksum = 0xffffffff;
+
+	/* Checksum header */
+	*cksum = ips_crc_calculate(sizeof(struct ips_message_header),
+				   (uint8_t *) p_hdr, *cksum);
+
+	/* Checksum payload (if any) */
+	if (paylen) {
+		psmi_assert_always(payload);
+		*cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum);
+	}
+
+	return 0;
+}
+
+/* Get pbc static rate value for flow for a given message length.
+ * Result is clamped to 16 bits; returns 0 for HFI types with no static
+ * rate-control mechanism. */
+PSMI_ALWAYS_INLINE(
+uint16_t
+ips_proto_pbc_static_rate(struct ips_proto *proto, struct ips_flow *flow,
+			  uint32_t msgLen))
+{
+	uint32_t rate = 0;
+
+	/* The PBC rate is based on which HFI type as different media have different
+	 * mechanism for static rate control.
+	 */
+
+	switch (proto->epinfo.ep_hfi_type) {
+	case PSMI_HFI_TYPE_OPA1:
+		{
+			/*
+			 * time_to_send is:
+			 *
+			 *  (packet_length) [bits] / (pkt_egress_rate) [bits/sec]
+			 *  -----------------------------------------------------
+			 *       fabric_clock_period == (1 / 805 * 10^6) [1/sec]
+			 *
+			 * (where pkt_egress_rate is assumed to be 100 Gbit/s.)
+			 */
+			uint32_t time_to_send = (8 * msgLen * 805) / (100000);
+			/* Scale by the path's inter-packet delay and CCA divisor. */
+			rate = (time_to_send >> flow->path->pr_cca_divisor) *
+			    (flow->path->pr_active_ipd);
+
+			/* Clamp to the 16-bit PbcStaticRateControlCnt field. */
+			if (rate > 65535)
+				rate = 65535;
+
+		}
+		break;
+
+	default:
+		rate = 0;
+	}
+
+	return (uint16_t) rate;
+}
+
+/* This is a helper function to convert Per Buffer Control to little-endian,
+ * converting each PBC field in place from CPU byte order. */
+PSMI_ALWAYS_INLINE(
+void ips_proto_pbc_to_le(struct hfi_pbc *pbc))
+{
+	pbc->pbc0 = __cpu_to_le32(pbc->pbc0);
+	pbc->PbcStaticRateControlCnt = __cpu_to_le16(pbc->PbcStaticRateControlCnt);
+	pbc->fill1 = __cpu_to_le16(pbc->fill1);
+}
+
+/* This is only used for SDMA cases; pbc is really a pointer to
+ * struct ips_pbc_header * or the equivalent un-named structure
+ * in ips_scb. Fills in packet length (in dwords, including the PBC
+ * itself), VL/SC routing bits and the static rate (control messages
+ * are never rate limited). Please note pcb will be in little-endian
+ * byte order on return */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow,
+		     uint32_t isCtrlMsg, struct hfi_pbc *pbc, uint32_t hdrlen,
+		     uint32_t paylen))
+{
+	int dw = (sizeof(struct hfi_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT;
+	int sc = proto->sl2sc[flow->path->pr_sl];
+	int vl = proto->sc2vl[sc];
+	uint16_t static_rate = 0;
+
+	if_pf(!isCtrlMsg && flow->path->pr_active_ipd)
+	    static_rate =
+	    ips_proto_pbc_static_rate(proto, flow, hdrlen + paylen);
+
+	pbc->pbc0 = (dw & HFI_PBC_LENGTHDWS_MASK) |
+	    ((vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT) |
+	    (((sc >> HFI_PBC_SC4_SHIFT) &
+	      HFI_PBC_SC4_MASK) << HFI_PBC_DCINFO_SHIFT);
+
+	pbc->PbcStaticRateControlCnt = static_rate & HFI_PBC_STATICRCC_MASK;
+
+	/* Per Buffer Control must be in little-endian */
+	ips_proto_pbc_to_le(pbc);
+
+	return;
+}
+
+/* Extract the destination context (low 8 bits) from big-endian BTH[1]. */
+PSMI_ALWAYS_INLINE(
+uint32_t
+ips_proto_dest_context_from_header(struct ips_proto *proto,
+				   struct ips_message_header *p_hdr))
+{
+	return (__be32_to_cpu(p_hdr->bth[1]) & 0xFF);
+}
+
+/*
+ * Compose the full packet header (LRH, BTH, KDETH and ips fields) for
+ * scb on the given flow.  Expected-TID packets (scb->tidctrl set) carry
+ * the TID send-descriptor index as their flowid and extra KDETH flags;
+ * eager packets use the flow's flowid and piggyback the current receive
+ * PSN in ack_seq_num.  Records flow in scb->flow on return.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+	      struct ips_flow *flow, ips_scb_t *scb, uint8_t flags))
+{
+	/* Packet length in dwords, including the trailing hardware CRC. */
+	uint32_t paywords = (sizeof(struct ips_message_header) +
+			     scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	struct ips_message_header *p_hdr = &scb->ips_lrh;
+
+#if 0
+	/*
+	 * This scb has been used by this connection last time,
+	 * so some of the header fields are already set.
+	 */
+	if (scb->flow == flow) {
+		p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+		p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+					      (scb->
+					       opcode << BTH_OPCODE_SHIFT) |
+					      (extra_bytes <<
+					       BTH_EXTRA_BYTE_SHIFT));
+		p_hdr->bth[2] =
+		    __cpu_to_be32(flow->xmit_seq_num.
+				  psn | (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset |
+						   (scb->
+						    offset_mode <<
+						    HFI_KHDR_OM_SHIFT) | (scb->
+									  tid <<
+									  HFI_KHDR_TID_SHIFT)
+						   | (scb->
+						      tidctrl <<
+						      HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (scb->
+						      flags &
+						      IPS_SEND_FLAG_HDR_SUPPRESS)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		/* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */
+
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn;
+		p_hdr->flags = flags;
+
+		return;
+	}
+#endif
+
+	/* Setup LRH fields */
+	p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+				      ((flow->path->pr_sl & HFI_LRH_SL_MASK) <<
+				       HFI_LRH_SL_SHIFT) |
+				      ((proto->sl2sc[flow->path->pr_sl] &
+					HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT));
+	p_hdr->lrh[1] = flow->path->pr_dlid;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+	p_hdr->lrh[3] = flow->path->pr_slid;
+
+	/* Setup BTH fields */
+	p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+				      (scb->opcode << HFI_BTH_OPCODE_SHIFT));
+	p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num |
+				      (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+	if (scb->tidctrl) {	/* expected receive packet */
+		p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+					      (ipsaddr->
+					       subcontext <<
+					       HFI_BTH_SUBCTXT_SHIFT) |
+					      (scb->tidsendc->
+					       rdescid._desc_idx
+					       << HFI_BTH_FLOWID_SHIFT)
+					      | (proto->epinfo.
+						 ep_baseqp <<
+						 HFI_BTH_QP_SHIFT));
+
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->
+						    tidctrl <<
+						    HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (scb->
+						      flags &
+						      IPS_SEND_FLAG_HDRSUPP) |
+						   (IPS_PROTO_VERSION <<
+						    HFI_KHDR_KVER_SHIFT));
+	} else {		/* eager receive packet */
+		p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context |
+					      (ipsaddr->
+					       subcontext <<
+					       HFI_BTH_SUBCTXT_SHIFT) |
+					      (flow->flowid
+					       << HFI_BTH_FLOWID_SHIFT)
+					      | (proto->epinfo.
+						 ep_baseqp <<
+						 HFI_BTH_QP_SHIFT));
+
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		/* Piggyback an ACK of the last in-order received PSN. */
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn_num;
+	}
+
+	p_hdr->khdr.job_key = __cpu_to_le32(proto->epinfo.ep_jkey);
+	p_hdr->connidx = ipsaddr->connidx_outgoing;
+	p_hdr->flags = flags;
+
+	scb->flow = flow;
+
+	return;
+}
+
+/*
+ * Assumes that the following fields are already set in scb:
+ * payload
+ * payload_size
+ * flags
+ *
+ * Builds the packet header, arms the ACK timeout, marks the scb pending,
+ * and advances the flow's transmit PSN by scb->nfrag (tidflow PSNs use the
+ * seq subfield; go-back-N flows wrap under proto->psn_mask).  scb->seq_num
+ * records the PSN of the scb's last fragment.
+ */
+PSMI_INLINE(
+void
+ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+			   struct ips_flow *flow, ips_scb_t *scb))
+{
+	psmi_assert((scb->payload_size & 3) == 0);
+	ips_proto_hdr(proto, ipsaddr, flow, scb,
+		      ips_flow_gen_ackflags(scb, flow));
+
+	scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+	scb->abs_timeout = TIMEOUT_INFINITE;
+	scb->flags |= IPS_SEND_FLAG_PENDING;
+
+	if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+		flow->xmit_seq_num.psn_seq += scb->nfrag;
+		scb->seq_num = flow->xmit_seq_num;
+		scb->seq_num.psn_seq--;
+	} else {
+		flow->xmit_seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask;
+		scb->seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask;
+	}
+
+	return;
+}
+
+/*
+ * Bump the per-endpoint statistics counter matching the control-message
+ * type being sent.  OPCODE_ACK and unknown types are intentionally not
+ * counted.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype))
+{
+	switch (msgtype) {
+	case OPCODE_ACK:
+		break;
+	case OPCODE_ERR_CHK:
+	case OPCODE_ERR_CHK_GEN:
+		proto->epaddr_stats.err_chk_send++;
+		break;
+	case OPCODE_NAK:
+		proto->epaddr_stats.nak_send++;
+		break;
+	case OPCODE_CONNECT_REQUEST:
+		proto->epaddr_stats.connect_req++;
+		break;
+	case OPCODE_DISCONNECT_REQUEST:
+		proto->epaddr_stats.disconnect_req++;
+		break;
+	default:
+		break;
+	}
+	return;
+}
+
+/*
+ * Exported here solely for inlining is_expected_or_nak and mq_tiny handling
+ */
+extern
+psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow,
+ uint8_t message_type, uint16_t *msg_queue_mask,
+ ips_scb_t *ctrlscb, void *payload, uint32_t paylen);
+
+/*
+ * Queue (or immediately send) an ACK on @flow.  With ACK coalescing
+ * enabled the flow is put on the recvq's pending_acks list and a pending
+ * NAK, if any, is cancelled (an ACK supersedes it); otherwise an OPCODE_ACK
+ * control message carrying the current receive PSN is sent right away.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;	/* ACK clears NAK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_ACK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send ACK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/*
+ * Queue (or immediately send) a NAK on @flow.  Mirror image of
+ * ips_proto_send_ack: with coalescing enabled a pending ACK is cancelled
+ * (the NAK supersedes it) and the flow joins the pending_acks list;
+ * otherwise an OPCODE_NAK control message is sent right away.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;	/* NAK clears ACK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_NAK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send NAK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/* return 1 if packet is next expected in flow
+ * return 0 if packet is not next expected in flow (and nak packet).
+ *
+ * Also implements the congestion-control side effects: a FECN-marked
+ * out-of-order packet schedules a BECN, and with IPS_PROTO_FLAG_CCA a
+ * BECN control message is generated once the out-of-order run exceeds
+ * the flow's ack interval.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev))
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	psmi_seqnum_t sequence_num;
+
+	/* Only go-back-N (PIO/DMA) flows carry sequenced eager traffic. */
+	psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) ||
+		    (flowid == EP_FLOW_GO_BACK_N_DMA)
+	    );
+	flow = &ipsaddr->flows[flowid];
+	/* If packet faced congestion generate BECN in NAK. */
+	if_pf((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) &&
+	      ((flow->cca_ooo_pkts & 0xf) == 0)) {
+		/* Generate a BECN for every 16th OOO packet marked with a FECN. */
+		flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+		flow->cca_ooo_pkts++;
+		rcv_ev->proto->epaddr_stats.congestion_pkts++;
+		rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;	/* Clear FECN event */
+	}
+
+	sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+	if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) {
+		/* In-order packet: advance the expected PSN and clear any
+		 * outstanding NAK state. */
+		flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND;
+
+		flow->recv_seq_num.psn_num =
+		    (flow->recv_seq_num.psn_num + 1) & proto->psn_mask;
+		flow->cca_ooo_pkts = 0;
+
+		/* don't process ack, caller will do it. */
+		return 1;
+
+	}
+
+	/* Signed 16-bit difference distinguishes future (diff > 0) from
+	 * stale/duplicate packets. */
+	int16_t diff = (int16_t) (sequence_num.psn_num -
+				  flow->recv_seq_num.psn_num);
+	if (diff > 0) {
+		if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) {
+			/* Queue/Send NAK to peer */
+			ips_proto_send_nak((struct ips_recvhdrq *)
+					   rcv_ev->recvq, flow);
+			flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+			flow->cca_ooo_pkts = 0;
+		} else if (proto->flags & IPS_PROTO_FLAG_CCA) {
+			flow->cca_ooo_pkts = diff;
+			if (flow->cca_ooo_pkts > flow->ack_interval) {
+				ips_scb_t ctrlscb;
+
+				rcv_ev->proto->epaddr_stats.congestion_pkts++;
+				flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+				_HFI_CCADBG
+				    ("BECN Generation. Expected: %d, Got: %d.\n",
+				     flow->recv_seq_num.psn_num,
+				     sequence_num.psn_num);
+
+				ctrlscb.flags = 0;
+				ctrlscb.ips_lrh.data[0].u32w0 =
+				    flow->cca_ooo_pkts;
+				/* Send Control message to throttle flow. Will clear flow flag and
+				 * reset cca_ooo_pkts.
+				 */
+				ips_proto_send_ctrl_message(flow,
+							    OPCODE_BECN,
+							    &flow->ipsaddr->
+							    ctrl_msg_queued,
+							    &ctrlscb, ctrlscb.cksum, 0);
+			}
+		}
+	}
+
+	/* process ack if packet is not in sequence. */
+	ips_proto_process_ack(rcv_ev);
+
+	return 0;
+}
+
+/*
+ * Note, some code depends on the literal values specified in this enum.
+ * Returned by ips_proto_check_msg_order to classify an incoming message
+ * relative to the flow's expected sequence number.
+ */
+enum ips_msg_order {
+	IPS_MSG_ORDER_PAST = 3,	/* Old message, recv & drop */
+	IPS_MSG_ORDER_EXPECTED_MATCH = 2,	/* Expected message, recv on match */
+	IPS_MSG_ORDER_EXPECTED = 1,	/* Expected message, always recv */
+	IPS_MSG_ORDER_FUTURE_RECV = 0,	/* Future message, buffer in OOO Q */
+	IPS_MSG_ORDER_FUTURE = -1,	/* Future message, leave on RHQ */
+};
+
+/*
+ * Classify an incoming message's sequence number against the flow's
+ * expected one (see enum ips_msg_order).  On the expected match the
+ * receive seqnum is advanced; msg_toggle bits alternate the treatment of
+ * repeated expected/out-of-order sightings.  The first sighting of a
+ * future message reverts the flow's receive PSN so the packet stays on
+ * the receive header queue and is retried.
+ */
+PSMI_ALWAYS_INLINE(
+enum ips_msg_order
+ips_proto_check_msg_order(ips_epaddr_t *ipsaddr,
+			  struct ips_flow *flow,
+			  uint16_t send_seqnum,
+			  uint16_t *recv_seqnum))
+
+{
+	int16_t diff = (int16_t) (*recv_seqnum - send_seqnum);
+
+	if (likely(diff == 0)) {
+		*recv_seqnum += 1;
+
+		ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK;
+		if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK)
+			return IPS_MSG_ORDER_EXPECTED_MATCH;
+
+		return IPS_MSG_ORDER_EXPECTED;
+	} else if (diff > 0) {
+		return IPS_MSG_ORDER_PAST;
+	}
+
+	ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK;
+	if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) {
+		/*
+		 * Second time to see the same ooo message, receive and put
+		 * into OOO queue.
+		 */
+		return IPS_MSG_ORDER_FUTURE_RECV;
+	}
+
+	/* The first time to see an OOO message, leave it there and try
+	 * next time. But we need to revert back the receiving flow PSN. */
+	uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+	flow->recv_seq_num.psn_num =
+	    (flow->recv_seq_num.psn_num - 1) & psn_mask;
+	return IPS_MSG_ORDER_FUTURE;
+}
+
+/*
+ * Dispatch a received packet to its opcode-specific service routine.
+ * The dispatch table is indexed by (opcode - OPCODE_RESERVED); opcodes
+ * outside [OPCODE_RESERVED, OPCODE_FUTURE_FROM) fall back to slot 0.
+ * Optional fault injection can drop packets before dispatch.
+ */
+PSMI_INLINE(
+int
+ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
+{
+	uint32_t index;
+
+	/* NOTE: Fault injection will currently not work with hardware
+	 * suppression. See note below for reason why as we currently
+	 * do not update the hardware tidflow table if FI is dropping
+	 * the packet.
+	 *
+	 * We need to look into the packet before dropping it and
+	 * if it's an expected packet AND we have hardware suppression
+	 * then we need to update the hardware tidflow table and the
+	 * associated tidrecvc state to fake having received a packet
+	 * until some point in the window defined by the loss rate.
+	 * This way the subsequent err chk will be NAKd and we can resync
+	 * the flow with the sender.
+	 *
+	 * Note: For real errors the hardware generates seq/gen errors
+	 * which are handled appropriately by the protocol.
+	 */
+
+	if_pf(PSMI_FAULTINJ_ENABLED()) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1,
+					  IPS_FAULTINJ_RECVLOST);
+		if (psmi_faultinj_is_fault(fi_recv))
+			return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	/* see file ips_proto_header.h for details */
+	index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
+	if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
+		index = 0;
+
+	return ips_packet_service_routine[index]
+	    ((struct ips_recvhdrq_event *)rcv_ev);
+}
+
+/*
+ * Breaks header encapsulation but needed in mq sends so we can pay
+ * "near-equal" attention to putting sends on the wire and servicing the
+ * receive queue.
+ */
+
+/*
+ * If a send failed with PSM2_EP_NO_RESOURCES, poll the PTL once to make
+ * receive progress and report PSM2_OK so the caller retries; any other
+ * error is passed through unchanged.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_recv_progress_if_busy(ptl_t *ptl, psm2_error_t err))
+{
+	if (err == PSM2_EP_NO_RESOURCES) {
+		ptl->ctl->ep_poll(ptl, 0);
+		return PSM2_OK;
+	} else
+		return err;
+}
+
+/* Find next lowest power of a two for a 32 bit number, i.e. the largest
+ * power of two <= v (e.g. 5 -> 4, 8 -> 8).  Note v == 0 yields 1. */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+	/* Bit masks and shift amounts for a binary-search style reduction:
+	 * each set group in v shifts its magnitude into r. */
+	const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 };
+	const unsigned int S[] = { 1, 2, 4, 8, 16 };
+	register unsigned int r = 1;
+	int i;
+
+	for (i = 4; i >= 0; i--) {
+		if (v & b[i]) {
+			v >>= S[i];
+			r <<= S[i];
+		}
+	}
+
+	return r;
+}
+
+/*
+ * Pick a path record of the given type from @pathgrp according to the
+ * protocol's path policy: adaptive round-robin, static keyed on the
+ * destination or source context, or (default) the first/base-LID path.
+ */
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *
+ips_select_path(struct ips_proto *proto, ips_path_type_t path_type,
+		ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp))
+{
+	uint32_t path_idx;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* If dispersive routes are configured then select the routes in round
+		 * robin order. We may want to use congestion information to select the
+		 * least lightly loaded path.
+		 */
+		path_idx = pathgrp->pg_next_path[path_type];
+		if (++pathgrp->pg_next_path[path_type] >=
+		    pathgrp->pg_num_paths[path_type])
+			pathgrp->pg_next_path[path_type] = 0;
+	} else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		path_idx =	/* Key on destination context */
+		    ipsaddr->context % pathgrp->pg_num_paths[path_type];
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		path_idx =	/* Key off src context */
+		    proto->epinfo.ep_context % pathgrp->pg_num_paths[path_type];
+	else			/* Base LID routed - Default in Infinhfi 2.5 (Oct 09). */
+		path_idx = 0;
+
+	return pathgrp->pg_path[path_idx][path_type];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/ptl_ips/ips_proto_internal.h b/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000..fb46d63
--- /dev/null
+++ b/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,96 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_INTERNAL_H
+#define _IPS_PROTO_INTERNAL_H
+
+#include "ips_proto_header.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+/*
+ * Connect protocol.
+ *
+ * On receive, handled by upcalling into the connect interface.
+ * On send, handled by ips_proto by having connect compose the message.
+ */
+psm2_error_t ips_proto_process_connect(struct ips_proto *proto,
+ uint8_t opcode,
+ struct ips_message_header *p_hdr,
+ void *payload, uint32_t paylen);
+int ips_proto_build_connect_message(struct ips_proto *proto,
+ ips_epaddr_t *ptladdr,
+ uint8_t opcode, void *payload);
+
+psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer,
+ uint64_t current);
+
+psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment);
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto);
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto);
+
+int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev);
+
+#endif /* _IPS_PROTO_INTERNAL_H */
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
new file mode 100644
index 0000000..e36492f
--- /dev/null
+++ b/ptl_ips/ips_proto_mq.c
@@ -0,0 +1,1733 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Nonzero when the driver permits SDMA payloads whose size is not a
+ * double-word (4-byte) multiple; consulted by ips_ptl_mq_eager() to decide
+ * whether per-packet padding is required. */
+uint32_t non_dw_mul_sdma = 0;
+
+/* Record the driver's non-DW-multiple SDMA capability (set once at init). */
+void
+ips_proto_mq_set_non_dw_mul_sdma(uint32_t mode)
+{
+ non_dw_mul_sdma = mode;
+}
+
+/*
+ * Slow path of scb allocation: count the egress-unavailable event, then
+ * block (making progress via PSMI_BLOCKUNTIL) until an scb of the requested
+ * kind — tiny or regular — can be allocated.  Kept out-of-line so the
+ * callers' fast paths stay small.
+ */
+PSMI_NEVER_INLINE(ips_scb_t *
+ ips_poll_scb(struct ips_proto *proto,
+ int npkts, int len, uint32_t flags, int istiny))
+{
+ ips_scb_t *scb = NULL;
+ psmi_assert(npkts > 0);
+ psm2_error_t err;
+
+ proto->stats.scb_egr_unavail_cnt++;
+
+ PSMI_BLOCKUNTIL(proto->ep, err,
+ ((scb =
+ (istiny ?
+ ips_scbctrl_alloc_tiny(&proto->scbc_egr) :
+ ips_scbctrl_alloc(&proto->scbc_egr, npkts, len,
+ flags))) != NULL));
+ psmi_assert(scb != NULL);
+ return scb;
+}
+
+/* Allocate a tiny scb, falling back to the blocking ips_poll_scb() slow
+ * path when the pool is momentarily exhausted.  Always returns non-NULL. */
+PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto))
+{
+ ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr);
+ /* common case should branch right through */
+ if_pt(scb != NULL)
+ return scb;
+ else
+ return ips_poll_scb(proto, 1, 0, 0, 1);
+}
+
+/* Allocate a regular scb (possibly with an attached buffer, per 'len' and
+ * 'flags'); blocks in ips_poll_scb() if none is available.  Always returns
+ * non-NULL. */
+PSMI_ALWAYS_INLINE(
+ips_scb_t *
+mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags))
+{
+ psmi_assert(npkts > 0);
+ ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags);
+ if_pt(scb != NULL) {
+ return scb;
+ }
+ else {
+ return ips_poll_scb(proto, npkts, len, flags,
+ 0 /* not tiny scb */);
+ }
+}
+
+/*
+ * scb completion callback for eager sends: advance the request's send
+ * offset by 'nbytes' and, once the whole message is accounted for, mark
+ * the request complete (appending it to the completed queue unless it is
+ * an internal request).  Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+static
+int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes)
+{
+ psm2_mq_req_t req = (psm2_mq_req_t) reqp;
+
+ /* This code path is executed when the send is on a device buffer
+ * and the receive is completed using eager buffers. As there is no
+ * completion notification sent to the sender, this is the only place
+ * where send side chb's can be freed and put back into the mpool.
+ */
+#ifdef PSM_CUDA
+ struct ips_cuda_hostbuf *chb;
+ if (req->cuda_hostbuf_used) {
+ while (!STAILQ_EMPTY(&req->sendreq_prefetch)) {
+ /* If any prefetched buffers weren't used, they
+ must be reclaimed here. */
+ chb = STAILQ_FIRST(&req->sendreq_prefetch);
+ STAILQ_REMOVE_HEAD(&req->sendreq_prefetch,
+ req_next);
+ psmi_mpool_put(chb);
+ }
+ }
+#endif
+
+ req->send_msgoff += nbytes;
+ /*
+ * the reason to use >= is because
+ * we may have DW pad in nbytes.
+ */
+ if (req->send_msgoff >= req->send_msglen) {
+ req->state = MQ_STATE_COMPLETE;
+ ips_barrier();
+ if(!psmi_is_req_internal(req))
+ mq_qq_append(&req->mq->completed_q, req);
+ }
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* Completion hook for rendezvous transfers: hand the request to the MQ
+ * layer's RTS-complete handling.  Always returns IPS_RECVHDRQ_CONTINUE. */
+static
+int ips_proto_mq_rv_complete(void *reqp)
+{
+ psm2_mq_req_t req = (psm2_mq_req_t) reqp;
+ psmi_mq_handle_rts_complete(req);
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* void-returning adapter so ips_proto_mq_rv_complete() can be used where
+ * the expected-protocol layer wants a void(*)(void *) callback. */
+static
+void ips_proto_mq_rv_complete_exp(void *reqp)
+{
+ ips_proto_mq_rv_complete(reqp);
+ return;
+}
+
+/*
+ * Copy 'nchars' bytes from vsrc to vdest: bulk of the copy is done a
+ * double word at a time via hfi_dwordcpy(), then the 0-3 trailing bytes
+ * are copied individually.  With PSM_CUDA, device memory on either side
+ * is routed through cudaMemcpy instead.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars))
+{
+ unsigned char *dest = vdest;
+ const unsigned char *src = vsrc;
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
+ PSMI_CUDA_CALL(cudaMemcpy,
+ vdest, vsrc, nchars, cudaMemcpyDefault);
+ return;
+ }
+#endif
+
+ if (nchars >> 2)
+ hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+ dest += (nchars >> 2) << 2;
+ src += (nchars >> 2) << 2;
+ switch (nchars & 0x03) {
+ case 3:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 2:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 1:
+ *dest++ = *src++;
+ }
+ return;
+}
+
+#ifdef PSM_CUDA
+/* Host-memory-only variant of ips_shortcpy(): identical dword-then-tail
+ * copy, but skips the CUDA pointer check for callers that already know
+ * both buffers are in host memory. */
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars))
+{
+ unsigned char *dest = vdest;
+ const unsigned char *src = vsrc;
+
+ if (nchars >> 2)
+ hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+ dest += (nchars >> 2) << 2;
+ src += (nchars >> 2) << 2;
+ switch (nchars & 0x03) {
+ case 3:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 2:
+ *dest++ = *src++;
+ /* FALLTHROUGH */
+ case 1:
+ *dest++ = *src++;
+ }
+ return;
+}
+#endif
+
+extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope
+ *
+ * Recoverable errors:
+ * PSM2_OK: If PIO, envelope is sent.
+ * If DMA, all queued up packets on flow were flushed.
+ *
+ * Recoverable errors converted to PSM2_OK just before return:
+ * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets.
+ * PSM2_EP_NO_RESOURCES:
+ * If PIO, no pio available or cable currently pulled.
+ * If DMA, can be that no scb's available to handle unaligned packets
+ * or writev returned a recoverable error (no mem for
+ * descriptors, dma interrupted or no space left in dma queue).
+ *
+ * Unrecoverable errors (PIO or DMA).
+ * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure,
+ * rxe/txe parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+/* Enqueue 'scb' on 'flow' and flush it (always for PIO; for DMA only when
+ * 'do_flush' is set).  Recoverable errors are folded to PSM2_OK per the
+ * error model described in the comment above. */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow,
+ struct ips_scb *scb, int do_flush))
+{
+ psm2_error_t err = PSM2_OK;
+
+ ips_proto_flow_enqueue(flow, scb);
+
+ if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush)
+ err = flow->flush(flow, NULL);
+
+ /* when flushing, also try to make receive progress if the send
+ * side is backed up */
+ if (do_flush)
+ err = ips_recv_progress_if_busy(proto->ptl, err);
+
+ /* As per the PSM error model (or lack thereof), PSM clients expect to see
+ * only PSM2_OK as a recoverable error */
+ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS)
+ err = PSM2_OK;
+ return err;
+}
+
+/*
+ * We don't use message striping for middle message protocol,
+ * Tests on sandy-bridge two HFIs show lower bandwidth if
+ * message striping is used.
+ */
+/*
+ * Segment an eager-protocol message of 'len' bytes into EAGER packets on
+ * 'flow' and enqueue/flush them.  For DMA flows a chunk may span many
+ * frags (up to the rendezvous window); for PIO each chunk is one frag.
+ * When non-DW-multiple SDMA is not allowed, a trailing non-DW remainder is
+ * handled by shortening one packet and padding it back up (the receiver
+ * discards the pad via padcnt in the header).
+ */
+ustatic
+psm2_error_t
+ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req,
+ struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf,
+ uint32_t len)
+{
+ ips_epaddr_t *ipsaddr = flow->ipsaddr;
+ psm2_error_t err = PSM2_OK;
+ uintptr_t buf = (uintptr_t) ubuf;
+ uint32_t nbytes_left, pktlen, offset, chunk_size;
+ uint16_t msgseq, padding;
+ ips_scb_t *scb;
+ uint32_t is_non_dw_mul_allowed = IPS_NON_DW_MUL_NOT_ALLOWED;
+
+ psmi_assert(len > 0);
+ psmi_assert(req != NULL);
+
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ /* max chunk size is the rv window size */
+ chunk_size = ipsaddr->window_rv;
+ is_non_dw_mul_allowed = non_dw_mul_sdma;
+ } else {
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ chunk_size = flow->frag_size;
+ }
+ /* one message sequence number for all packets of this message */
+ msgseq = ipsaddr->msgctl->mq_send_seqnum++;
+
+ nbytes_left = len;
+ offset = 0;
+ do {
+ if (is_non_dw_mul_allowed) {
+ // no need to care about padding if non-double word multiple message size is allowed.
+ padding = 0;
+ } else {
+ padding = nbytes_left & 0x3;
+ }
+
+ if (padding) {
+ psmi_assert(nbytes_left > flow->frag_size);
+ /* over reading should be OK on sender because
+ * the padding area is within the whole buffer,
+ * receiver will discard the extra bytes via
+ * padcnt in packet header
+ */
+ padding = 4 - padding;
+ pktlen = flow->frag_size - padding;
+ } else {
+ pktlen = min(chunk_size, nbytes_left);
+ psmi_assert(((pktlen & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == is_non_dw_mul_allowed));
+ }
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb != NULL);
+
+ ips_scb_opcode(scb) = OPCODE_EAGER;
+ scb->ips_lrh.khdr.kdeth0 = msgseq;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_hdrdata(scb).u32w0 = offset; /* initial offset */
+
+ _HFI_VDBG
+ ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n",
+ (void *)buf, pktlen, flow->frag_size, nbytes_left);
+ ips_scb_buffer(scb) = (void *)buf;
+
+ buf += pktlen;
+ offset += pktlen;
+ nbytes_left -= pktlen;
+
+ /* pad the wire length back to a DW multiple (padding is 0
+ * when non-DW-multiple sizes are allowed) */
+ pktlen += padding;
+ psmi_assert(((pktlen & 0x3) == 0) || (IPS_NON_DW_MUL_ALLOWED == is_non_dw_mul_allowed));
+
+ scb->frag_size = flow->frag_size;
+ scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size;
+ if (scb->nfrag > 1) {
+ ips_scb_length(scb) = flow->frag_size;
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size =
+ scb->chunk_size_remaining = pktlen;
+ } else
+ ips_scb_length(scb) = pktlen;
+
+ if (nbytes_left == 0) { /* last segment/packet */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+
+ /* Set ACKREQ if single packet per scb. For multi
+ * packets per scb, it is SDMA, driver will set
+ * ACKREQ in last packet, we only need ACK for
+ * last packet.
+ */
+ if (scb->nfrag == 1)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ } else {
+ req->send_msgoff += pktlen;
+ }
+
+ ips_proto_flow_enqueue(flow, scb);
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ /* we need to flush the pio pending queue as quick as possible */
+ err = flow->flush(flow, NULL);
+ }
+
+ } while (nbytes_left);
+
+ /* after all sdma setup, flush sdma queue,
+ * we want one system call to handle as many scbs as possible.
+ */
+ if (flow->transfer == PSM_TRANSFER_DMA) {
+ err = flow->flush(flow, NULL);
+ }
+
+ /* before return, try to make some progress. */
+ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) {
+ err =
+ ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES);
+ }
+
+ return err;
+}
+
+/*
+ * Start a rendezvous send: build and send the LONG_RTS envelope for 'req'.
+ * Small DW-multiple host payloads ride inside the RTS itself; otherwise
+ * the data moves later via CTS/LONG_DATA or expected-TID protocol.  With
+ * PSM_CUDA, large device-buffer sends are prefetched into intermediate
+ * host buffers here (bounded by cuda_prefetch_limit).
+ */
+static
+psm2_error_t
+ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
+ ips_epaddr_t *ipsaddr, const void *buf, uint32_t len)
+{
+ struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+ psm2_error_t err = PSM2_OK;
+ ips_scb_t *scb;
+
+ PSM2_LOG_MSG("entering");
+ req->buf = (void *)buf;
+ req->buf_len = len;
+ req->send_msglen = len;
+ req->recv_msgoff = 0;
+ req->rts_peer = (psm2_epaddr_t) ipsaddr;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_LONG_RTS;
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ if (req->type & MQE_TYPE_WAITING)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING;
+
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, req->tag.tag);
+ ips_scb_hdrdata(scb).u32w1 = len;
+ /* receiver echoes this index back in the CTS to locate 'req' */
+ ips_scb_hdrdata(scb).u32w0 = psmi_mpool_get_obj_index(req);
+
+ /* piggy-back the payload on the RTS when it fits in one frag, is
+ * host memory, and is a DW multiple */
+ if (len <= flow->frag_size &&
+#ifdef PSM_CUDA
+ !req->is_buf_gpu_mem &&
+#endif
+ !(len & 0x3)) {
+ ips_scb_buffer(scb) = (void *)buf;
+ ips_scb_length(scb) = len;
+ req->send_msgoff = len;
+ } else {
+ ips_scb_length(scb) = 0;
+ req->send_msgoff = 0;
+ }
+
+#ifdef PSM_CUDA
+ /* Used to indicate to the receiver that the send
+ * is issued on a device buffer. This helps the
+ * receiver select TID instead of using eager buffers.
+ */
+ if (req->is_buf_gpu_mem) {
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_GPU_BUF;
+ scb->mq_req = req; /* request comes from GPU domain (device) ... */
+ }
+ req->cuda_hostbuf_used = 0;
+ if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) &&
+ req->is_buf_gpu_mem &&
+ (len > GPUDIRECT_THRESH_RV)) ||
+ ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) &&
+ req->is_buf_gpu_mem &&
+ (len > gpudirect_send_threshold))) {
+ /* send from intermediate host buffer */
+ struct ips_cuda_hostbuf *chb;
+ uint32_t offset, window_len;
+ int prefetch_lookahead = 0;
+
+ STAILQ_INIT(&req->sendreq_prefetch);
+ offset = 0;
+ req->cuda_hostbuf_used = 1;
+ scb->mq_req = NULL; /* ... but it is transferred to host memory,
+ so setting req = NULL lets us take a faster
+ decision on scb's locality while sending
+ (see IS_CUDA_BUF() macro) */
+
+ /* start prefetching */
+ req->prefetch_send_msgoff = 0;
+ while ((offset < len) &&
+ (prefetch_lookahead < proto->cuda_prefetch_limit)) {
+ chb = NULL;
+ window_len =
+ ips_cuda_next_window(ipsaddr->window_rv,
+ offset, len);
+
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ proto->cuda_hostbuf_pool_small_send);
+ if (chb == NULL)
+ chb = (struct ips_cuda_hostbuf *)
+ psmi_mpool_get(
+ proto->cuda_hostbuf_pool_send);
+
+ /* any buffers available? */
+ if (chb == NULL)
+ break;
+
+ req->prefetch_send_msgoff += window_len;
+
+ chb->offset = offset;
+ chb->size = window_len;
+ chb->req = req;
+ chb->gpu_buf = (void *) buf + offset;
+ chb->bytes_read = 0;
+
+ PSMI_CUDA_CALL(cudaMemcpyAsync,
+ chb->host_buf, chb->gpu_buf,
+ window_len,
+ cudaMemcpyDeviceToHost,
+ proto->cudastream_send);
+ PSMI_CUDA_CALL(cudaEventRecord,
+ chb->copy_status,
+ proto->cudastream_send);
+
+ STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb,
+ req_next);
+ offset += window_len;
+ prefetch_lookahead++;
+ }
+ }
+#endif
+
+ PSM_LOG_EPM_COND(len > proto->mq->hfi_thresh_rv && proto->protoexp,OPCODE_LONG_RTS,PSM_LOG_EPM_TX,proto->ep->epid, req->rts_peer->epid,
+ "ips_scb_hdrdata(scb).u32w0: %d",ips_scb_hdrdata(scb).u32w0);
+
+ if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE)))
+ goto fail;
+
+ /* Assume that we already put a few rndv requests in flight. This helps
+ * for bibw microbenchmarks and doesn't hurt the 'blocking' case since
+ * we're going to poll anyway */
+ psmi_poll_internal(proto->ep, 1);
+
+fail:
+ _HFI_VDBG
+ ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n",
+ psmi_epaddr_get_name(proto->ep->epid),
+ psmi_epaddr_get_name(req->rts_peer->epid), buf, len,
+ req->tag.tag[0], req->tag.tag[1], req->tag.tag[2], req,
+ psmi_mpool_get_obj_index(req), psm2_error_get_string(err));
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+
+/*
+ * Non-blocking MQ send.  Picks a protocol by message size:
+ *   tiny  (len <= hfi_thresh_tiny)  — payload copied into the header scb;
+ *   short (len <= one frag)         — single packet from the user buffer;
+ *   eager (len <= hfi_thresh_rv)    — multi-packet via ips_ptl_mq_eager();
+ *   rendezvous (otherwise, SENDSYNC, or CUDA device buffer).
+ * Returns the request in *req_o; completion is reported asynchronously.
+ */
+psm2_error_t
+ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len,
+ void *context, psm2_mq_req_t *req_o)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_proto *proto;
+ struct ips_flow *flow;
+ ips_epaddr_t *ipsaddr;
+ ips_scb_t *scb;
+ psm2_mq_req_t req;
+
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+
+ /* round-robin over the endpoint addresses in this message control
+ * group */
+ ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->context = context;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+ goto do_rendezvous;
+ } else if (len <= mq->hfi_thresh_tiny) {
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = mq_alloc_tiny(proto);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_TINY;
+ /* tiny length is encoded in kdeth0 next to the msg seqnum */
+ scb->ips_lrh.khdr.kdeth0 =
+ ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+ ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *) &ips_scb_hdrdata(scb),
+ (uint32_t *) ubuf, len);
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /* We can mark this op complete since all the data is now copied
+ * into an SCB that remains live until it is remotely acked */
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ _HFI_VDBG
+ ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else if (len <= ipsaddr->flows[proto->msgflowid].frag_size) {
+ /* single-packet short: DW-aligned part goes as payload, the
+ * 0-3 trailing bytes ride in the header word */
+ uint32_t paylen = len & ~0x3;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+
+ ips_scb_opcode(scb) = OPCODE_SHORT;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+ ips_scb_buffer(scb) = (void *)ubuf;
+ ips_scb_length(scb) = paylen;
+ if (len > paylen) {
+ /* there are nonDW bytes, copy to header */
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
+ (uint32_t *)((uintptr_t)ubuf + paylen),
+ len - paylen);
+
+ /* for complete callback */
+ req->send_msgoff = len - paylen;
+ } else {
+ req->send_msgoff = 0;
+ }
+
+ /*
+ * Need ack for send side completion because we
+ * send from user buffer.
+ */
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+
+ flow = &ipsaddr->flows[proto->msgflowid];
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /*
+ * It should be OK to check the buffer address in
+ * 'scb' to be changed, when this scb is done, the
+ * address is set to NULL when scb is put back to
+ * scb pool. Even if the same scb is re-used, it
+ * is not possible to set to this 'buf' address.
+ */
+ if (ips_scb_buffer(scb) == (void *)ubuf) {
+ /* continue to send from user buffer */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+ } else {
+ /* mark the message done */
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ }
+ _HFI_VDBG
+ ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else if (len <= mq->hfi_thresh_rv) {
+ if (len <= proto->iovec_thresh_eager) {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ } else {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ }
+
+ req->send_msgoff = 0;
+ err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+
+ _HFI_VDBG
+ ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+ len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+ } else { /* skip eager accounting below */
+do_rendezvous:
+ err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+ *req_o = req;
+ return err;
+ }
+
+ *req_o = req;
+ mq->stats.tx_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/*
+ * Blocking MQ send.  Same size-based protocol selection as
+ * ips_proto_mq_isend() (tiny / short / eager / rendezvous), but waits
+ * for completion before returning: short sends either copy into a bounce
+ * buffer or block until the scb releases the user buffer; eager and
+ * rendezvous paths allocate an internal request and wait on it.
+ */
+psm2_error_t
+ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_proto *proto;
+ struct ips_flow *flow;
+ ips_epaddr_t *ipsaddr;
+ ips_scb_t *scb;
+
+ /* round-robin over the endpoint addresses in this message control
+ * group */
+ ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+ ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+ proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ gpu_mem = 0;
+#endif
+
+ if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+ goto do_rendezvous;
+ } else if (len <= mq->hfi_thresh_tiny) {
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = mq_alloc_tiny(proto);
+ psmi_assert(scb);
+ ips_scb_opcode(scb) = OPCODE_TINY;
+ /* tiny length is encoded in kdeth0 next to the msg seqnum */
+ scb->ips_lrh.khdr.kdeth0 =
+ ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+ ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *) &ips_scb_hdrdata(scb),
+ (uint32_t *) ubuf, len);
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ _HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else if (len <= ipsaddr->flows[proto->msgflowid].frag_size) {
+ /* single-packet short: DW-aligned part goes as payload, the
+ * 0-3 trailing bytes ride in the header word */
+ uint32_t paylen = len & ~0x3;
+
+ scb = mq_alloc_pkts(proto, 1, 0, 0);
+ psmi_assert(scb);
+
+ ips_scb_opcode(scb) = OPCODE_SHORT;
+ scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+ ips_scb_hdrdata(scb).u32w1 = len;
+ ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+ ips_scb_buffer(scb) = (void *)ubuf;
+ ips_scb_length(scb) = paylen;
+ if (len > paylen) {
+ /* there are nonDW bytes, copy to header */
+#ifdef PSM_CUDA
+ mq_copy_tiny_host_mem
+#else
+ mq_copy_tiny
+#endif
+ ((uint32_t *)&ips_scb_hdrdata(scb).u32w0,
+ (uint32_t *)((uintptr_t)ubuf + paylen),
+ len - paylen);
+ }
+
+ /*
+ * Need ack for send side completion because we
+ * send from user buffer.
+ */
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+
+ flow = &ipsaddr->flows[proto->msgflowid];
+ err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+ if (err != PSM2_OK)
+ return err;
+
+ /*
+ * It should be OK to check the buffer address in
+ * 'scb' to be changed, when this scb is done, the
+ * address is set to NULL when scb is put back to
+ * scb pool. Even if the same scb is re-used, it
+ * is not possible to set to this 'ubuf' address.
+ */
+ if (ips_scb_buffer(scb) == (void *)ubuf) {
+ if (flow->transfer != PSM_TRANSFER_PIO ||
+ paylen > proto->scb_bufsize ||
+ !ips_scbctrl_bufalloc(scb)) {
+ /* sdma transfer (can't change user buffer),
+ * or, payload is larger than bounce buffer,
+ * or, can't allocate bounce buffer,
+ * send from user buffer till complete */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ ips_scb_buffer(scb) != (void*)ubuf);
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+ err = PSM2_OK;
+ } else {
+ /* copy to bounce buffer */
+#ifdef PSM_CUDA
+ ips_shortcpy_host_mem
+#else
+ ips_shortcpy
+#endif
+ (ips_scb_buffer(scb),
+ (void*)ubuf, paylen);
+ }
+ }
+ _HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else if (len <= mq->hfi_thresh_rv) {
+ psm2_mq_req_t req;
+
+ if (len <= proto->iovec_thresh_eager_blocking) {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ } else {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ }
+
+ /* Block until we can get a req */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ (req =
+ psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ req->type |= MQE_TYPE_WAITING;
+ req->send_msglen = len;
+ req->tag = *tag;
+ req->send_msgoff = 0;
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+ err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+
+ psmi_mq_wait_internal(&req);
+
+ _HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+ psmi_epaddr_get_name(mq->ep->epid),
+ psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+ } else {
+ psm2_mq_req_t req;
+do_rendezvous:
+ /* Block until we can get a req */
+ PSMI_BLOCKUNTIL(mq->ep, err,
+ (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+ if (err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ req->type |= MQE_TYPE_WAITING;
+ req->tag = *tag;
+ req->flags |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (gpu_mem) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+ if (err != PSM2_OK)
+ return err;
+ psmi_mq_wait_internal(&req);
+ return err; /* skip accounting, done separately at completion time */
+ }
+
+ mq->stats.tx_num++;
+ mq->stats.tx_eager_num++;
+ mq->stats.tx_eager_bytes += len;
+
+ return err;
+}
+
+/*
+ * Called when a posted (or unexpected-matched) receive matches an RTS:
+ * either push a CTS so the sender completes the transfer via eager/
+ * LONG_DATA (queuing the CTS for retry if it cannot be sent now), or hand
+ * the request to the expected-TID protocol for large transfers.
+ */
+static
+psm2_error_t
+ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted)
+{
+ psm2_epaddr_t epaddr = req->rts_peer;
+ struct ips_proto *proto = epaddr->proto;
+
+ /* We have a match.
+ * We may already set with first packet,
+ * If we're doing eager-based r-v, just send back the sreq and length and
+ * have the sender complete the send.
+ */
+ PSM2_LOG_MSG("entering");
+#ifdef PSM_CUDA
+ /* Cases where we do not use TIDs:
+ * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes
+ * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv
+ * 3) Recv on gpu buf and len is less than 3 bytes
+ * 4) Expected protocol not initialized.
+ */
+ if ((!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem &&
+ req->recv_msglen <= GPUDIRECT_THRESH_RV)||
+ (!req->is_sendbuf_gpu_mem &&
+ req->recv_msglen <= proto->mq->hfi_thresh_rv))) ||
+ (req->is_buf_gpu_mem && req->recv_msglen <= GPUDIRECT_THRESH_RV) ||
+ proto->protoexp == NULL) { /* no expected tid receive */
+#else
+ if (req->recv_msglen <= proto->mq->hfi_thresh_rv ||/* less rv threshold */
+ proto->protoexp == NULL) { /* no expected tid receive */
+#endif
+ /* there is no order requirement, try to push CTS request
+ * directly, if fails, then queue it for later try. */
+ if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) {
+ struct ips_pend_sends *pends = &proto->pend_sends;
+ struct ips_pend_sreq *sreq =
+ psmi_mpool_get(proto->pend_sends_pool);
+ psmi_assert(sreq != NULL);
+ if (sreq == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_NO_MEMORY;
+ }
+ sreq->type = IPS_PENDSEND_EAGER_REQ;
+ sreq->req = req;
+
+ STAILQ_INSERT_TAIL(&pends->pendq, sreq, next);
+ psmi_timer_request(proto->timerq, &pends->timer,
+ PSMI_TIMER_PRIO_1);
+ }
+ } else {
+ ips_protoexp_tid_get_from_token(proto->protoexp, req->buf,
+ req->recv_msglen, epaddr,
+ req->rts_reqidx_peer,
+ req->
+ type & MQE_TYPE_WAITING_PEER ?
+ IPS_PROTOEXP_TIDGET_PEERWAIT :
+ 0, ips_proto_mq_rv_complete_exp,
+ req);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Build and send a LONG_CTS for a matched rendezvous receive, telling the
+ * sender where (receiver req index) and how much (recv_msglen) to send.
+ * Returns PSM2_OK_NO_PROGRESS when no scb is available so the caller can
+ * queue the CTS for a later retry.
+ */
+psm2_error_t
+ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req)
+{
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+ struct ips_flow *flow;
+ ips_scb_t *scb;
+ ptl_arg_t *args;
+
+ PSM2_LOG_MSG("entering");
+ psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[proto->msgflowid];
+ scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0);
+ if (scb == NULL)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ args = (ptl_arg_t *) ips_scb_uwords(scb);
+
+ ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ args[0].u32w0 = psmi_mpool_get_obj_index(req);
+ args[1].u32w1 = req->recv_msglen;
+ args[1].u32w0 = req->rts_reqidx_peer;
+
+ PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_TX, proto->ep->epid,
+ flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d",
+ req->rts_reqidx_peer);
+
+ ips_proto_flow_enqueue(flow, scb);
+ flow->flush(flow, NULL);
+
+ /* have already received enough bytes */
+ if (req->recv_msgoff == req->recv_msglen) {
+ ips_proto_mq_rv_complete(req);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
+
+/*
+ * Sender side of the eager-based rendezvous path: after receiving a CTS,
+ * push the remaining message bytes as LONG_DATA packets.  Any 1-3
+ * unaligned leading bytes are carried in the packet header.  Returns
+ * PSM2_OK_NO_PROGRESS if scb allocation stalls (caller retries later).
+ */
+psm2_error_t
+ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req)
+{
+ psm2_error_t err = PSM2_OK;
+ uintptr_t buf = (uintptr_t) req->buf + req->recv_msgoff;
+ ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+ uint32_t nbytes_left = req->send_msglen - req->recv_msgoff;
+ uint32_t nbytes_sent = 0;
+ uint32_t nbytes_this, chunk_size;
+ uint16_t frag_size, unaligned_bytes;
+ struct ips_flow *flow;
+ ips_scb_t *scb;
+
+ psmi_assert(nbytes_left > 0);
+
+ PSM2_LOG_MSG("entering.");
+ if (
+#ifdef PSM_CUDA
+ req->is_buf_gpu_mem ||
+#endif
+ req->send_msglen > proto->iovec_thresh_eager) {
+ /* use SDMA transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+ frag_size = flow->path->pr_mtu;
+ /* max chunk size is the rv window size */
+ chunk_size = ipsaddr->window_rv;
+ } else {
+ /* use PIO transfer */
+ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+ chunk_size = frag_size = flow->frag_size;
+ }
+
+ do {
+ /*
+ * don't try to call progression routine such as:
+ * ips_recv_progress_if_busy() in this loop,
+ * it will cause recursive call of this function.
+ */
+
+ /*
+ * When tid code path is enabled, we don't allocate scbc_rv
+ * objects. If the message is less than the hfi_thresh_rv,
+ * we normally use eager protocol to do the transfer.
+ * However, if it is sync send, we use the rendezvous
+ * rts/cts/rts-data protocol.
+ * In this case, because scbc_rv is null,
+ * we use scbc_egr instead.
+ */
+
+ scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv
+ : &proto->scbc_egr, 1, 0, 0);
+ if (scb == NULL) {
+ err = PSM2_OK_NO_PROGRESS;
+ break;
+ }
+
+ ips_scb_opcode(scb) = OPCODE_LONG_DATA;
+ scb->ips_lrh.khdr.kdeth0 = 0;
+ scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer;
+ scb->ips_lrh.data[1].u32w1 = req->send_msglen;
+
+ /* attached unaligned bytes into packet header */
+ unaligned_bytes = nbytes_left & 0x3;
+ if (unaligned_bytes) {
+ mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata,
+ (uint32_t *)buf, unaligned_bytes);
+
+ /* position to send */
+ buf += unaligned_bytes;
+ req->recv_msgoff += unaligned_bytes;
+ psmi_assert(req->recv_msgoff < 4);
+
+ /* for complete callback */
+ req->send_msgoff += unaligned_bytes;
+
+ nbytes_left -= unaligned_bytes;
+ nbytes_sent += unaligned_bytes;
+ }
+ scb->ips_lrh.data[1].u32w0 = req->recv_msgoff;
+ ips_scb_buffer(scb) = (void *)buf;
+
+ scb->frag_size = frag_size;
+ nbytes_this = min(chunk_size, nbytes_left);
+ if (nbytes_this > 0)
+ scb->nfrag = (nbytes_this + frag_size - 1) / frag_size;
+ else
+ scb->nfrag = 1;
+
+ if (scb->nfrag > 1) {
+ ips_scb_length(scb) = frag_size;
+ scb->nfrag_remaining = scb->nfrag;
+ scb->chunk_size =
+ scb->chunk_size_remaining = nbytes_this;
+ } else
+ ips_scb_length(scb) = nbytes_this;
+
+ buf += nbytes_this;
+ req->recv_msgoff += nbytes_this;
+ nbytes_sent += nbytes_this;
+ nbytes_left -= nbytes_this;
+ if (nbytes_left == 0) {
+ /* because of scb callback, use eager complete */
+ ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+ ips_scb_cb_param(scb) = req;
+
+ /* Set ACKREQ if single packet per scb. For multi
+ * packets per scb, it is SDMA, driver will set
+ * ACKREQ in last packet, we only need ACK for
+ * last packet.
+ */
+ if (scb->nfrag == 1)
+ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+ } else {
+ req->send_msgoff += nbytes_this;
+ }
+
+ ips_proto_flow_enqueue(flow, scb);
+ if (flow->transfer == PSM_TRANSFER_PIO) {
+ /* we need to flush the pio pending queue as quick as possible */
+ flow->flush(flow, NULL);
+ }
+
+ } while (nbytes_left);
+
+ /* for sdma, if some bytes are queued, flush them */
+ if (flow->transfer == PSM_TRANSFER_DMA && nbytes_sent) {
+ flow->flush(flow, NULL);
+ }
+
+ PSM2_LOG_MSG("leaving.");
+
+ return err;
+}
+
+/*
+ * Receive handler for a rendezvous CTS (clear-to-send) packet.
+ *
+ * Two cases, distinguished by the presence of a payload:
+ *  - payload present: it is a tid session list, and the transfer is
+ *    handed to the expected-tid send path via ips_tid_send_handle_tidreq().
+ *  - no payload: the peer asks us to push the data with the eager
+ *    LONG_DATA protocol; we push immediately or queue for the timer.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE normally, or PSM2_EP_NO_RESOURCES when a
+ * tid send descriptor could not be allocated (the CTS is NAKed so the
+ * peer retransmits it later).
+ */
+int
+ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+	psm2_mq_t mq = proto->ep->mq;
+	struct ips_flow *flow;
+	psm2_mq_req_t req;
+	uint32_t paylen;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+	/* locate our local send request from the index echoed by the peer */
+	req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0);
+	psmi_assert(req != NULL);
+
+	/*
+	 * if there is payload, it is expected tid protocol
+	 * with tid session info as the payload.
+	 */
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	if (paylen > 0) {
+		ips_tid_session_list *payload =
+			ips_recvhdrq_event_payload(rcv_ev);
+		psmi_assert(paylen == 0 || payload);
+		PSM_LOG_EPM(OPCODE_LONG_CTS,PSM_LOG_EPM_RX,rcv_ev->ipsaddr->epaddr.epid,
+			    mq->ep->epid,"p_hdr->data[1].u32w0 %d",
+			    p_hdr->data[1].u32w0);
+		proto->epaddr_stats.tids_grant_recv++;
+
+		psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv);
+		psmi_assert(proto->protoexp != NULL);
+
+		/* ptl_req_ptr will be set to each tidsendc */
+		if (req->ptl_req_ptr == NULL) {
+			req->send_msglen = p_hdr->data[1].u32w1;
+		}
+		psmi_assert(req->send_msglen == p_hdr->data[1].u32w1);
+
+		if (ips_tid_send_handle_tidreq(proto->protoexp,
+			    rcv_ev->ipsaddr, req, p_hdr->data[0],
+			    p_hdr->mdata, payload, paylen) == 0) {
+			proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+		} else {
+			/* No tidsendc available: NAK the CTS so the peer
+			 * retransmits it once resources free up. */
+			flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+			flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */
+			ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+			static unsigned int msg_cnt = 0;
+			if (msg_cnt++ == 0) { /* Report the message only once */
+				_HFI_INFO("PSM2 memory shortage detected. Please consider modifying PSM2_MEMORY setting\n");
+			}
+			return PSM2_EP_NO_RESOURCES;
+		}
+	} else {
+		req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */
+		req->send_msglen = p_hdr->data[1].u32w1;
+
+		if (req->send_msgoff >= req->send_msglen) {
+			/* already sent enough bytes, may truncate so using >= */
+			ips_proto_mq_rv_complete(req);
+		} else if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) {
+			/* there is no order requirement, tried to push RTS data
+			 * directly and not done, so queue it for later try. */
+			struct ips_pend_sreq *sreq =
+				psmi_mpool_get(proto->pend_sends_pool);
+			psmi_assert(sreq != NULL);
+
+			sreq->type = IPS_PENDSEND_EAGER_DATA;
+			sreq->req = req;
+			STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next);
+			/* Make sure it's processed by timer */
+			psmi_timer_request(proto->timerq, &proto->pend_sends.timer,
+					   PSMI_TIMER_PRIO_1);
+		}
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Receive handler for a rendezvous RTS (request-to-send) packet.
+ *
+ * Checks PSN and message ordering, hands the envelope to the MQ layer
+ * for tag matching, and either runs the match callback immediately
+ * (in-order match) or queues the request on the out-of-order queue.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	/* either no payload or whole message */
+	psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n",
+		  (long long)p_hdr->data[0].u64,
+		  p_hdr->data[1].u32w0, p_hdr->data[1].u32w1);
+
+	int rc = psmi_mq_handle_rts(mq,
+				    (psm2_epaddr_t) &ipsaddr->msgctl->
+				    master_epaddr,
+				    (psm2_mq_tag_t *) p_hdr->tag,
+				    p_hdr->data[1].u32w1, payload, paylen,
+				    msgorder, ips_proto_mq_rts_match_callback,
+				    &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back both the flow PSN and the message sequence
+		 * number so the revisited packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	req->rts_peer = (psm2_epaddr_t) ipsaddr;
+	req->rts_reqidx_peer = p_hdr->data[1].u32w0;
+	if (req->send_msglen > mq->hfi_thresh_rv)
+	{
+		PSM_LOG_EPM(OPCODE_LONG_RTS,PSM_LOG_EPM_RX,req->rts_peer->epid,mq->ep->epid,
+			    "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+	}
+	if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING)
+		req->type |= MQE_TYPE_WAITING_PEER;
+
+#ifdef PSM_CUDA
+	if (p_hdr->flags & IPS_SEND_FLAG_GPU_BUF)
+		req->is_sendbuf_gpu_mem = 1;
+	else
+		req->is_sendbuf_gpu_mem = 0;
+#endif
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		if (rc == MQ_RET_MATCH_OK)
+			ips_proto_mq_rts_match_callback(req, 1);
+
+		/* XXX if blocking, break out of progress loop */
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return ret;
+}
+
+/*
+ * Receive handler for a TINY packet: the entire user payload travels
+ * inside the packet header (p_hdr->hdr_data), with its length encoded
+ * in the KDETH tinylen field, so there is no separate payload buffer.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	/* tiny data is carried in the header itself */
+	payload = (void *)&p_hdr->hdr_data;
+	paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+		  HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK;
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_TINY, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag, paylen, 0,
+		payload, paylen, msgorder, OPCODE_TINY, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Receive handler for a SHORT packet: a single-packet message whose
+ * payload follows the header; hdr_data.u32w1 carries the message length
+ * and hdr_data.u32w0 the offset word passed to the envelope handler.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_SHORT, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag,
+		p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+		payload, paylen, msgorder, OPCODE_SHORT, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Receive handler for an EAGER packet (multi-packet eager message).
+ *
+ * Unlike tiny/short, a past or future-recv sequence may legitimately
+ * match an already-started eager receive (continuation packets), so an
+ * eager-queue match is tried first and, if found, only the data is
+ * consumed.  Otherwise the packet is treated as a first packet and goes
+ * through envelope matching.
+ *
+ * Returns IPS_RECVHDRQ_CONTINUE, IPS_RECVHDRQ_BREAK (out-of-order or
+ * unexpected match), or IPS_RECVHDRQ_REVISIT (future sequence or no
+ * resources; packet will be reprocessed).
+ */
+int
+ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	if (msgorder == IPS_MSG_ORDER_PAST ||
+	    msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+		req = mq_eager_match(mq, msgctl,
+			__le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK);
+		/*
+		 * It is future message sequence or past message sequence,
+		 * and there is request matching in eager queue, we handle
+		 * the packet data and return. We can't go continue to
+		 * match envelope.
+		 * Past message sequence must always have a matching!!!
+		 * error is caught below.
+		 */
+		if (req) {
+			/* continuation packet: append data to the
+			 * in-progress eager receive */
+			psmi_mq_handle_data(mq, req,
+				p_hdr->data[1].u32w0, payload, paylen);
+
+			if (msgorder == IPS_MSG_ORDER_FUTURE_RECV)
+				ret = IPS_RECVHDRQ_BREAK;
+
+			if ((__be32_to_cpu(p_hdr->bth[2]) &
+			     IPS_SEND_FLAG_ACKREQ) ||
+			    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+				ips_proto_send_ack((struct ips_recvhdrq *)
+						   rcv_ev->recvq, flow);
+
+			ips_proto_process_ack(rcv_ev);
+
+			return ret;
+		}
+
+		psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV);
+		/*
+		 * For future message sequence, since there is no eager
+		 * queue matching yet, this must be the first packet for
+		 * the message sequence. And of course, expected message
+		 * sequence is always the first packet for the sequence.
+		 */
+	}
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_EAGER, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+		(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+		(psm2_mq_tag_t *) p_hdr->tag,
+		p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+		payload, paylen, msgorder, OPCODE_EAGER, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		/* roll back PSN and message seqnum so the revisited
+		 * packet is accepted as in-order */
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	/* for both outoforder matching and eager matching */
+	req->msg_seqnum =
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+	req->ptl_req_ptr = (void *)msgctl;
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		/* an in-order arrival may unblock queued out-of-order ones */
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Drain the out-of-order queue for this message-control block: deliver
+ * every queued request whose sequence number matches the current
+ * expected receive sequence number, advancing the sequence number per
+ * delivery, until no match is found or the queue count reaches zero.
+ */
+void
+ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl)
+{
+	for (;;) {
+		psm2_mq_req_t match =
+			mq_ooo_match(&mq->outoforder_q, msgctl,
+				     msgctl->mq_recv_seqnum);
+		if (match == NULL)
+			break;
+
+		msgctl->outoforder_count--;
+		msgctl->mq_recv_seqnum++;
+		psmi_mq_handle_outoforder(mq, match);
+
+		if (!(msgctl->outoforder_count > 0))
+			break;
+	}
+}
+
+/*
+ * Receive handler for a LONG_DATA packet (rendezvous data pushed by
+ * the sender after our CTS).  data[0].u32w0 carries our receive request
+ * index, data[1].u32w0 the byte offset of this packet within the
+ * message, and data[1].u32w1 the total message length.
+ *
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+	struct ips_flow *flow;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	/* locate our local receive request from the echoed index */
+	req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0);
+	psmi_assert(req != NULL);
+	psmi_assert(p_hdr->data[1].u32w1 == req->send_msglen);
+
+	/*
+	 * if a packet has very small offset, it must have unaligned data
+	 * attached in the packet header, and this must be the first packet
+	 * for that message.
+	 */
+	if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) {
+		psmi_assert(p_hdr->data[1].u32w0 == (req->send_msglen&0x3));
+		/* the sender stuffed the 1-3 unaligned leading bytes
+		 * into the header's mdata field */
+		mq_copy_tiny((uint32_t *)req->buf,
+			     (uint32_t *)&p_hdr->mdata,
+			     p_hdr->data[1].u32w0);
+		req->send_msgoff += p_hdr->data[1].u32w0;
+	}
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen);
+
+	/* ack if the sender set ACKREQ or a BECN needs to be generated */
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h
new file mode 100644
index 0000000..6e5e49a
--- /dev/null
+++ b/ptl_ips/ips_proto_params.h
@@ -0,0 +1,264 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_PARAMS_H
+#define _IPS_PROTO_PARAMS_H
+
+/*
+ * send method: dma, pio;
+ * recv method: tid, egr;
+ *
+ * send-recv mode combinations: 1=on, 0=off
+ * A: dma:1, pio=1, tid=1, egr=1;
+ * B: dma:0, pio=1, tid=1, egr=1;
+ * C: dma:1, pio=0, tid=1, egr=1;
+ * D: dma:1, pio=1, tid=0, egr=1;
+ * E: dma:0, pio=1, tid=0, egr=1;
+ * F: dma:1, pio=0, tid=0, egr=1;
+ *
+ * message packet type:
+ * T: tiny; S: short; E: eager;
+ * LR: long rts; LC: long cts; LD: long data;
+ * ED: expected data; EC: expected completion;
+ * C: ctrl msg;
+ *
+ * send,recv method for each packet type and each send-recv mode
+ * -------------------------------------------------------------------
+ * | | A | B | C | D | E | F |
+ * -------------------------------------------------------------------
+ * | T | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | S | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | E | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | E | dma,egr | pio,egr | dma,egr | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | LR | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LC | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LD | x | x | x | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | LD | x | x | x | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | ED | dma,tid | pio,tid | dma,tid | x | x | x |
+ * -------------------------------------------------------------------
+ * | EC | pio,egr | pio,egr | dma,egr | x | x | x |
+ * -------------------------------------------------------------------
+ * | C | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ */
+
+/* Constants */
+#define BYTE2DWORD_SHIFT 2
+#define LOWER_16_BITS 0xFFFF
+#define PSM_CACHE_LINE_BYTES 64
+#define PSM2_FLOW_CREDITS 64
+#define PSM_CRC_SIZE_IN_BYTES 8
+
+/*
+ * version of protocol header (known to chip also).
+ * This value for OPA is defined in spec.
+ */
+#define IPS_PROTO_VERSION 0x1
+
+/* Send retransmission */
+#define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */
+
+#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 160 /* in millisecs */
+#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 640 /* in millisecs */
+#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2
+#define PSM_TID_TIMEOUT_DEFAULT "160:640:2" /* update from above params */
+
+/* time conversion macros */
+#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us))
+#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms))
+#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec))
+
+/* Per-flow flags */
+#define IPS_FLOW_FLAG_NAK_SEND 0x01
+#define IPS_FLOW_FLAG_PENDING_ACK 0x02
+#define IPS_FLOW_FLAG_PENDING_NAK 0x04
+#define IPS_FLOW_FLAG_GEN_BECN 0x08
+#define IPS_FLOW_FLAG_CONGESTED 0x10
+#define IPS_FLOW_FLAG_SKIP_CTS 0x20
+
+/* tid session expected send flags */
+#define EXP_SEND_FLAG_CLEAR_ALL 0x00
+#define EXP_SEND_FLAG_FREE_TIDS 0x01
+
+#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */
+
+/*
+ * scb flags for wire,
+ * Only the lower 6 bits are wire-protocol options
+ */
+#define IPS_SEND_FLAG_NONE 0x00
+#define IPS_SEND_FLAG_BLOCKING 0x01 /* blocking send */
+#define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */
+#define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate to the receiver when
+ * the send is issued on a device buffer. This helps in
+ * selecting the TID path on the receive side regardless of
+ * the receive buffer's locality. It is used
+ * in a special case where the send is on a device
+ * buffer and the receive is on a host buffer.
+ */
+#define IPS_SEND_FLAG_GPU_BUF 0x08
+#endif
+
+#define IPS_SEND_FLAG_PROTO_OPTS 0x3f /* only 6bits wire flags */
+
+/* scb flags */
+#define IPS_SEND_FLAG_PENDING 0x0100
+#define IPS_SEND_FLAG_PERSISTENT 0x0200
+
+/* 0x10000000, interrupt when done */
+#define IPS_SEND_FLAG_INTR (1<<HFI_KHDR_INTR_SHIFT)
+/* 0x20000000, header suppression */
+#define IPS_SEND_FLAG_HDRSUPP (1<<HFI_KHDR_SH_SHIFT)
+/* 0x80000000, request ack (normal) */
+#define IPS_SEND_FLAG_ACKREQ (1<<HFI_BTH_ACK_SHIFT)
+
+/* proto flags */
+#define IPS_PROTO_FLAG_SDMA 0x01 /* all sdma, no pio */
+#define IPS_PROTO_FLAG_SPIO 0x02 /* all spio, no dma */
+#define IPS_PROTO_FLAG_RCVTHREAD 0x04 /* psm recv thread is on */
+#define IPS_PROTO_FLAG_LOOPBACK 0x08 /* psm loopback over hfi */
+#define IPS_PROTO_FLAG_CKSUM 0x10 /* psm checksum is on */
+
+/* Coalesced ACKs (On by default) */
+#define IPS_PROTO_FLAG_COALESCE_ACKS 0x20
+
+/* Use Path Record query (off by default) */
+#define IPS_PROTO_FLAG_QUERY_PATH_REC 0x40
+
+/* Path selection policies:
+ *
+ * (a) Adaptive - Dynamically determine the least loaded paths using various
+ * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs.
+ *
+ * (b) Static schemes -
+ * (i) static_src - Use path keyed off source context
+ * (ii) static_dest - Use path keyed off destination context
+ * (iii) static_base - Use only the base lid path - default till Oct'09.
+ *
+ * The default is adaptive. If a zero lmc network is used then there exists
+ * just one path between endpoints the (b)(iii) case above.
+ *
+ */
+
+#define IPS_PROTO_FLAG_PPOLICY_ADAPTIVE 0x200
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_SRC 0x400
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_DST 0x800
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_BASE 0x1000
+
+/* All static policies */
+#define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00
+
+/* IBTA CCA Protocol support */
+#define IPS_PROTO_FLAG_CCA 0x2000
+#define IPS_PROTO_FLAG_CCA_PRESCAN 0x4000 /* Enable RAPID CCA prescanning */
+
+#ifdef PSM_CUDA
+/* Use RNDV (TID) for all message sizes */
+#define IPS_PROTO_FLAG_ALWAYS_RNDV 0x10000
+/* Use GPUDirect RDMA for SDMA */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND 0x20000
+/* Use GPUDirect RDMA for TID */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV 0x40000
+#endif
+
+#define IPS_PROTOEXP_FLAG_ENABLED 0x01 /* default */
+#define IPS_PROTOEXP_FLAG_HDR_SUPP 0x02 /* Header suppression enabled */
+#define IPS_PROTOEXP_FLAG_TID_DEBUG 0x04 /* *not* default */
+#define IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE 0x08 /* Interleave RTS handling. */
+#define IPS_PROTOEXP_FLAG_CTS_SERIALIZED 0x10 /* CTS serialized */
+#define IPS_PROTOEXP_FLAGS_DEFAULT IPS_PROTOEXP_FLAG_ENABLED
+
+
+/* We have to get an MTU of at least 2K, or else this breaks some assumptions
+ * in the packets that handle tid descriptors
+ */
+#define IPS_PROTOEXP_MIN_MTU 2048
+
+/* Fault injection, becomes parameters to psmi_faultinj_getspec so
+ * a comma-delimited list of
+ * "spec_name", num, denom
+ * Where num/denom means fault num out of every denom.
+ * The defines set 'denom' and assume that num is set to 1
+ *
+ * These values are all defaults, each is overridable via
+ * PSM2_FI_<spec_name> in the environment (and yes, spec_name is in lowercase
+ * *in the environment* just to minimize it appearing in the wild). The format
+ * there is <num:denom:initial_seed> so the same thing except that one can set
+ * a specific seed to the random number generator.
+ */
+#if 1
+#define IPS_FAULTINJ_DMALOST 20 /* 1 every 20 dma writev get lost */
+#define IPS_FAULTINJ_PIOLOST 100 /* 1 every 100 pio writes get lost */
+#define IPS_FAULTINJ_PIOBUSY 10 /* 1 every 10 pio sends get busy */
+#define IPS_FAULTINJ_RECVLOST 200 /* 1 every 200 pkts dropped at recv */
+#else
+#define IPS_FAULTINJ_DMALOST 500 /* 1 every 500 dma writev get lost */
+#define IPS_FAULTINJ_PIOLOST 3000 /* 1 every 3000 pio writes get lost */
+#define IPS_FAULTINJ_PIOBUSY 100 /* 1 every 100 pio sends get busy */
+#define IPS_FAULTINJ_RECVLOST 500 /* 1 every 500 pkts dropped at recv */
+#endif
+
+#endif /* _IPS_PROTO_PARAMS_H */
diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c
new file mode 100644
index 0000000..c55a57c
--- /dev/null
+++ b/ptl_ips/ips_proto_recv.c
@@ -0,0 +1,1447 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* receive service routine for each packet opcode */
+ips_packet_service_fn_t
+ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED] = {
+ips_proto_process_unknown_opcode, /* 0xC0 */
+ips_proto_mq_handle_tiny, /* OPCODE_TINY */
+ips_proto_mq_handle_short,
+ips_proto_mq_handle_eager,
+ips_proto_mq_handle_rts, /* RTS */
+ips_proto_mq_handle_cts, /* CTS */
+ips_proto_mq_handle_data, /* DATA */
+ips_protoexp_data, /* EXPTID */
+ips_protoexp_recv_tid_completion, /* EXPTID_COMPLETION */
+ips_proto_process_ack,
+ips_proto_process_nak,
+ips_proto_process_becn,
+ips_proto_process_err_chk,
+ips_proto_process_err_chk_gen,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_am,
+ips_proto_am,
+ips_proto_am /* OPCODE_AM_REPLY */
+};
+
+#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30
+static void ips_report_strays(struct ips_proto *proto);
+
+#define INC_TIME_SPEND(timer)
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto)
+{
+ uint32_t interval_secs;
+ union psmi_envvar_val env_stray;
+
+ psmi_getenv("PSM2_STRAY_WARNINTERVAL",
+ "min secs between stray process warnings",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS,
+ &env_stray);
+ interval_secs = env_stray.e_uint;
+ if (interval_secs > 0)
+ proto->stray_warn_interval = sec_2_cycles(interval_secs);
+ else
+ proto->stray_warn_interval = 0;
+
+ return PSM2_OK;
+}
+
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto)
+{
+ ips_report_strays(proto);
+ return PSM2_OK;
+}
+
+#define cycles_to_sec_f(cycles) \
+ (((double)cycles_to_nanosecs(cycles)) / 1000000000.0)
+
+struct ips_stray_epid {
+ psm2_epid_t epid;
+ uint32_t err_check_bad_sent;
+ uint32_t ipv4_addr;
+ uint32_t pid;
+ uint32_t num_messages;
+ uint64_t t_warn_next;
+ uint64_t t_first;
+ uint64_t t_last;
+};
+
+static
+void ips_report_strays(struct ips_proto *proto)
+{
+ struct ips_stray_epid *sepid;
+ struct psmi_eptab_iterator itor;
+ psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK);
+
+#if _HFI_DEBUGGING
+ double t_first = 0;
+ double t_last = 0;
+ double t_runtime = 0;
+ if (_HFI_INFO_ON) {
+ t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init);
+ }
+#endif
+
+ while ((sepid = psmi_epid_itor_next(&itor))) {
+ char ipbuf[INET_ADDRSTRLEN], *ip = NULL;
+ char bufpid[32];
+ uint32_t lid = psm2_epid_nid(sepid->epid);
+#if _HFI_DEBUGGING
+ if (_HFI_INFO_ON) {
+ t_first =
+ cycles_to_sec_f(sepid->t_first - proto->t_init);
+ t_last =
+ cycles_to_sec_f(sepid->t_last - proto->t_init);
+ }
+#endif
+ if (sepid->ipv4_addr)
+ ip = (char *)
+ inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf,
+ sizeof(ipbuf));
+ if (!ip)
+ snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid);
+
+ if (sepid->pid)
+ snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid);
+ else
+ snprintf(bufpid, sizeof(bufpid), "PID unknown");
+
+ if (_HFI_INFO_ON) {
+ _HFI_INFO_ALWAYS
+ ("Process %s on host %s=%s sent %d stray message(s) and "
+ "was told so %d time(s) (first stray message at %.1fs "
+ "(%d%%), last at %.1fs (%d%%) into application run)\n",
+ bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages,
+ sepid->err_check_bad_sent, t_first,
+ (int)(t_first * 100.0 / t_runtime), t_last,
+ (int)(t_last * 100.0 / t_runtime));
+ }
+
+ psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid);
+ psmi_free(sepid);
+ }
+ psmi_epid_itor_fini(&itor);
+ return;
+}
+
+/* New scbs now available. If we have pending sends because we were out of
+ * scbs, put the pendq on the timerq so it can be processed. */
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+ struct ips_proto *proto = (struct ips_proto *)context;
+ struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq);
+ if (sreq != NULL)
+ psmi_timer_request(proto->timerq,
+ &proto->pend_sends.timer, PSMI_TIMER_PRIO_1);
+ return;
+}
+
+psm2_error_t
+ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current)
+{
+ psm2_error_t err = PSM2_OK;
+ struct ips_pend_sends *pend_sends =
+ (struct ips_pend_sends *)timer->context;
+ struct ips_pendsendq *phead = &pend_sends->pendq;
+ struct ips_proto *proto = (struct ips_proto *)pend_sends->proto;
+ struct ips_pend_sreq *sreq;
+
+ while (!STAILQ_EMPTY(phead)) {
+ sreq = STAILQ_FIRST(phead);
+ switch (sreq->type) {
+ case IPS_PENDSEND_EAGER_REQ:
+ err = ips_proto_mq_push_cts_req(proto, sreq->req);
+ break;
+ case IPS_PENDSEND_EAGER_DATA:
+ err = ips_proto_mq_push_rts_data(proto, sreq->req);
+ break;
+
+ default:
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown pendq state %d\n",
+ sreq->type);
+ }
+
+ if (err == PSM2_OK) {
+ STAILQ_REMOVE_HEAD(phead, next);
+ psmi_mpool_put(sreq);
+ } else { /* out of scbs. wait for the next scb_avail callback */
+ /* printf("!!!!! breaking out of pendq progress\n"); */
+ break;
+ }
+ }
+
+ return err;
+}
+
+PSMI_INLINE(
+int
+between(int first_seq, int last_seq, int seq))
+{
+ if (last_seq >= first_seq) {
+ if (seq < first_seq || seq > last_seq) {
+ return 0;
+ }
+ } else {
+ if (seq > last_seq && seq < first_seq) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+PSMI_INLINE(
+int
+pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow,
+ psmi_seqnum_t ack_seq_num))
+{
+ uint32_t last_num;
+ struct ips_scb_unackedq *unackedq = &flow->scb_unacked;
+
+ if (STAILQ_EMPTY(unackedq))
+ return 0;
+
+ /* scb_pend will be moved back when an nak is received, but
+ * the packet may actually be received and acked after the nak,
+ * so we use the tail of unacked queue, which may include packets
+ * not being sent out yet, this is over do, but it should be OK. */
+ last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num;
+
+ return between(flow->xmit_ack_num.psn_num,
+ last_num, ack_seq_num.psn_num);
+}
+
+PSMI_INLINE(
+struct ips_flow *
+get_tidflow(struct ips_proto *proto, ips_epaddr_t *ipsaddr,
+ struct ips_message_header *p_hdr, psmi_seqnum_t ack_seq_num))
+{
+ struct ips_protoexp *protoexp = proto->protoexp;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ struct ips_tid_send_desc *tidsendc;
+ ptl_arg_t desc_tidsendc;
+ struct ips_flow *flow;
+ uint32_t last_seq;
+ struct ips_scb_unackedq *unackedq;
+
+ tidsendc = (struct ips_tid_send_desc *)
+ psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+ desc_id._desc_idx);
+ if (tidsendc == NULL) {
+ _HFI_ERROR
+ ("OPCODE_ACK: Index %d is out of range in tidflow ack\n",
+ desc_id._desc_idx);
+ return NULL;
+ }
+
+ /* Ensure generation matches */
+ psmi_mpool_get_obj_index_gen_count(tidsendc,
+ &desc_tidsendc._desc_idx,
+ &desc_tidsendc._desc_genc);
+ if (desc_tidsendc.u64 != desc_id.u64)
+ return NULL;
+
+ /* Ensure ack is within window */
+ flow = &tidsendc->tidflow;
+ unackedq = &flow->scb_unacked;
+
+ /* No unacked scbs */
+ if (STAILQ_EMPTY(unackedq))
+ return NULL;
+
+ /* Generation for ack should match */
+ if (STAILQ_FIRST(unackedq)->seq_num.psn_gen != ack_seq_num.psn_gen)
+ return NULL;
+
+ /* scb_pend will be moved back when an nak is received, but
+ * the packet may actually be received and acked after the nak,
+ * so we use the tail of unacked queue, which may include packets
+ * not being sent out yet, this is over do, but it should be OK. */
+ last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_seq;
+
+ if (between(flow->xmit_ack_num.psn_seq,
+ last_seq, ack_seq_num.psn_seq) == 0)
+ return NULL;
+
+ return flow;
+}
+
+/* NAK post process for tid flow */
+void ips_tidflow_nak_post_process(struct ips_proto *proto,
+ struct ips_flow *flow)
+{
+ ips_scb_t *scb;
+ uint32_t first_seq, ack_seq;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+ first_seq = __be32_to_cpu(scb->ips_lrh.bth[2]) & HFI_BTH_SEQ_MASK;
+ ack_seq = (flow->xmit_ack_num.psn_seq - 1) & HFI_BTH_SEQ_MASK;
+
+ /* If the ack SEQ falls into a multi-packets scb,
+ * don't re-send the packets already acked. */
+ if (scb->nfrag > 1 &&
+ between(first_seq, scb->seq_num.psn_seq, ack_seq)) {
+ uint32_t om, offset_in_tid, remaining_bytes_in_tid;
+ uint32_t npkt, pktlen, nbytes;
+ uint32_t idx, loop;
+
+ /* how many packets acked in this scb */
+ npkt = ((ack_seq - first_seq) & HFI_BTH_SEQ_MASK) + 1;
+
+ /* Get offset/om from current packet header */
+ offset_in_tid = __le32_to_cpu(scb->ips_lrh.khdr.kdeth0) &
+ HFI_KHDR_OFFSET_MASK;
+ om = (__le32_to_cpu(scb->ips_lrh.khdr.kdeth0) >>
+ HFI_KHDR_OM_SHIFT) & 0x1;
+ if (om)
+ offset_in_tid *= 64;
+ else
+ offset_in_tid *= 4;
+ /* bytes remaining in current tid */
+ remaining_bytes_in_tid =
+ (IPS_TIDINFO_GET_LENGTH(scb->tsess[0]) << 12) -
+ offset_in_tid;
+
+ /* packet length in current header */
+ pktlen = scb->payload_size;
+ psmi_assert(min(remaining_bytes_in_tid,
+ scb->frag_size) >= pktlen);
+ psmi_assert((((__be16_to_cpu(scb->ips_lrh.lrh[2]) &
+ HFI_LRH_PKTLEN_MASK) << BYTE2DWORD_SHIFT) -
+ sizeof(struct ips_message_header) -
+ HFI_CRC_SIZE_IN_BYTES) == pktlen);
+
+ /* Loop to find the position to start */
+ idx = 0;
+ nbytes = 0;
+ loop = npkt;
+ while (loop) {
+ remaining_bytes_in_tid -= pktlen;
+ offset_in_tid += pktlen;
+ nbytes += pktlen;
+ first_seq++;
+ loop--;
+
+ if (remaining_bytes_in_tid == 0) {
+ idx++;
+ remaining_bytes_in_tid =
+ IPS_TIDINFO_GET_LENGTH(scb->
+ tsess[idx]) << 12;
+ offset_in_tid = 0;
+ }
+
+ pktlen = min(remaining_bytes_in_tid, scb->frag_size);
+ }
+ psmi_assert((first_seq & HFI_BTH_SEQ_MASK) ==
+ ((ack_seq + 1) & HFI_BTH_SEQ_MASK));
+
+ /* 0. update scb info */
+ psmi_assert(scb->nfrag_remaining > npkt);
+ scb->nfrag_remaining -= npkt;
+ psmi_assert(scb->chunk_size_remaining > nbytes);
+ scb->chunk_size_remaining -= nbytes;
+ ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes);
+
+ /* 1. if last packet in sequence, set ACK, clear SH */
+ if (scb->nfrag_remaining == 1) {
+ psmi_assert(scb->chunk_size_remaining <=
+ scb->frag_size);
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ scb->flags &= ~IPS_SEND_FLAG_HDRSUPP;
+
+ /* last packet is what remaining */
+ pktlen = scb->chunk_size_remaining;
+ }
+
+ /* 2. set new packet sequence number */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ ((first_seq & HFI_BTH_SEQ_MASK) << HFI_BTH_SEQ_SHIFT) |
+ ((scb->seq_num.psn_gen &
+ HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT) |
+ (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+ /* 3. set new packet offset */
+ scb->ips_lrh.exp_offset += nbytes;
+
+ /* 4. if packet length is changed, set new length */
+ if (scb->payload_size != pktlen) {
+ scb->payload_size = pktlen;
+ scb->ips_lrh.lrh[2] = __cpu_to_be16((
+ (scb->payload_size +
+ sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES) >>
+ BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK);
+ }
+
+ /* 5. set new tidctrl and tidinfo array */
+ scb->tsess = &scb->tsess[idx];
+ scb->tsess_length -= idx * sizeof(uint32_t);
+ scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(scb->tsess[0]);
+
+ /* 6. calculate new offset mode */
+ if (offset_in_tid < 131072) { /* 2^15 * 4 */
+ offset_in_tid /= 4;
+ om = 0;
+ } else {
+ offset_in_tid /= 64;
+ om = 1;
+ }
+
+ /* 7. set new tidinfo */
+ scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32(
+ (offset_in_tid & HFI_KHDR_OFFSET_MASK) |
+ (om << HFI_KHDR_OM_SHIFT) |
+ (IPS_TIDINFO_GET_TID(scb->tsess[0])
+ << HFI_KHDR_TID_SHIFT) |
+ (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) |
+ (scb->flags & IPS_SEND_FLAG_INTR) |
+ (scb->flags & IPS_SEND_FLAG_HDRSUPP) |
+ (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
+ }
+
+ /* Update unacked scb's to use the new generation */
+ while (scb) {
+ /* update with new generation */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ (__be32_to_cpu(scb->ips_lrh.bth[2]) &
+ (~(HFI_BTH_GEN_MASK << HFI_BTH_GEN_SHIFT))) |
+ ((flow->xmit_seq_num.psn_gen &
+ HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT));
+ scb->seq_num.psn_gen = flow->xmit_seq_num.psn_gen;
+ scb = SLIST_NEXT(scb, next);
+ }
+}
+
+/* NAK post process for dma flow */
+void ips_dmaflow_nak_post_process(struct ips_proto *proto,
+ struct ips_flow *flow)
+{
+ ips_scb_t *scb;
+ uint32_t first_num, ack_num;
+ uint16_t padding = 0;
+
+ scb = STAILQ_FIRST(&flow->scb_unacked);
+ first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask;
+ ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask;
+
+
+ /* If the ack PSN falls into a multi-packets scb,
+ * don't re-send the packets already acked. */
+ psmi_assert(scb->nfrag > 1);
+ if (between(first_num, scb->seq_num.psn_num, ack_num)) {
+ uint32_t npkt, pktlen, nbytes;
+
+ /* how many packets acked in this scb */
+ npkt = ((ack_num - first_num) & proto->psn_mask) + 1;
+
+ /* how many bytes already acked in this scb, for eager receive
+ * packets, all payload size is frag_size except the last packet
+ * which is not acked yet */
+ pktlen = scb->frag_size;
+ nbytes = (((ack_num - first_num) &
+ proto->psn_mask) + 1) * pktlen;
+
+ /* 0. update scb info */
+ psmi_assert(scb->nfrag_remaining > npkt);
+ scb->nfrag_remaining -= npkt;
+ psmi_assert(scb->chunk_size_remaining > nbytes);
+ scb->chunk_size_remaining -= nbytes;
+ ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes);
+
+ /* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */
+ if (scb->chunk_size_remaining <= scb->frag_size) {
+ psmi_assert(scb->nfrag_remaining == 1);
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ /* last packet is what remaining */
+ /* check if padding is required*/
+ padding = scb->chunk_size_remaining & 0x3;
+ if_pf(padding) {
+ /* how much to pad with also equals how many bytes we need
+ * to rewind the source buffer offset by to keep it dw aligned */
+ padding = 4 - padding;
+ ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding);
+ scb->chunk_size_remaining += padding;
+ }
+ pktlen = scb->chunk_size_remaining;
+ }
+
+ /* 2. set new packet sequence number */
+ scb->ips_lrh.bth[2] = __cpu_to_be32(
+ ((ack_num + 1) & proto->psn_mask) |
+ (scb->flags & IPS_SEND_FLAG_ACKREQ));
+
+ /* 3. set new packet offset adjusted with padding */
+ ips_scb_hdrdata(scb).u32w0 += nbytes - padding;
+
+ /* 4. if packet length is changed, set new length */
+ if (scb->payload_size != pktlen) {
+ scb->payload_size = pktlen;
+ scb->ips_lrh.lrh[2] = __cpu_to_be16((
+ (scb->payload_size +
+ sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES) >>
+ BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK);
+ }
+ }
+}
+
+/* process an incoming ack message. Separate function to allow */
+/* for better optimization by compiler */
+int
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_flow *flow = NULL;
+ struct ips_scb_unackedq *unackedq;
+ struct ips_scb_pendlist *scb_pend;
+ psmi_seqnum_t ack_seq_num, last_seq_num;
+ ips_epaddr_flow_t flowid;
+ ips_scb_t *scb;
+ uint32_t tidctrl;
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+ if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+ ack_seq_num.psn_num =
+ (ack_seq_num.psn_num - 1) & proto->psn_mask;
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+ goto ret;
+ } else {
+ ack_seq_num.psn_seq -= 1;
+ flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num);
+ if (!flow) /* Invalid ack for flow */
+ goto ret;
+ }
+ flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+
+ unackedq = &flow->scb_unacked;
+ scb_pend = &flow->scb_pend;
+
+ if (STAILQ_EMPTY(unackedq))
+ goto ret;
+
+ last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+ INC_TIME_SPEND(TIME_SPEND_USER2);
+
+ /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+ * we can used general psn_num to compare the PSN. */
+ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+ last_seq_num.psn_num, ack_seq_num.psn_num)
+ ) {
+
+ /* take it out of the xmit queue and .. */
+ if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+ SLIST_REMOVE_HEAD(scb_pend, next);
+ }
+
+ STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+ flow->scb_num_unacked--;
+ psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+ flow->credits += scb->nfrag;
+
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ if (scb->callback)
+ (*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+ scb->chunk_size : scb->payload_size);
+
+ if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT))
+ ips_scbctrl_free(scb);
+
+ /* set all index pointer to NULL if all frames have been
+ * acked */
+ if (STAILQ_EMPTY(unackedq)) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+
+ SLIST_FIRST(scb_pend) = NULL;
+ psmi_assert(flow->scb_num_pending == 0);
+ /* Reset congestion window - all packets ACK'd */
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ goto ret;
+ } else if (flow->timer_ack == scb->timer_ack) {
+ /*
+ * Exchange timers with last scb on unackedq.
+ * timer in scb is used by flow, cancelling current
+ * timer and then requesting a new timer takes more
+ * time, instead, we exchange the timer between current
+ * freeing scb and the last scb on unacked queue.
+ */
+ psmi_timer *timer;
+ ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+ timer = scb->timer_ack;
+ scb->timer_ack = last->timer_ack;
+ last->timer_ack = timer;
+ timer = scb->timer_send;
+ scb->timer_send = last->timer_send;
+ last->timer_send = timer;
+
+ scb->timer_ack->context = scb;
+ scb->timer_send->context = scb;
+ last->timer_ack->context = last;
+ last->timer_send->context = last;
+ }
+ }
+
+ psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */
+
+ /* CCA: If flow is congested adjust rate */
+ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) {
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <=
+ proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].
+ ccti_increase);
+ /* Clear congestion event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+ }
+ else {
+ /* Increase congestion window if flow is not congested */
+ if_pf(flow->cwin < proto->flow_credits) {
+ flow->credits +=
+ min(flow->cwin << 1,
+ proto->flow_credits) - flow->cwin;
+ flow->cwin = min(flow->cwin << 1, proto->flow_credits);
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ }
+ }
+
+ /* Reclaimed some credits - attempt to flush flow */
+ if (!SLIST_EMPTY(scb_pend))
+ flow->flush(flow, NULL);
+
+ /*
+ * If the next packet has not even been put on the wire, cancel the
+ * retransmission timer since we're still presumably waiting on free
+ * pio bufs
+ */
+ if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE)
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+ret:
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* process an incoming nack message. Separate function to allow */
+/* for better optimization by compiler */
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_flow *flow = NULL;
+ struct ips_scb_unackedq *unackedq;
+ struct ips_scb_pendlist *scb_pend;
+ psmi_seqnum_t ack_seq_num, last_seq_num;
+ psm_protocol_type_t protocol;
+ ips_epaddr_flow_t flowid;
+ ips_scb_t *scb;
+ uint32_t tidctrl;
+
+ INC_TIME_SPEND(TIME_SPEND_USER3);
+
+ ack_seq_num.psn_num = p_hdr->ack_seq_num;
+ tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+ if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+ protocol = PSM_PROTOCOL_GO_BACK_N;
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+ goto ret;
+ ack_seq_num.psn_num =
+ (ack_seq_num.psn_num - 1) & proto->psn_mask;
+ flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+ } else {
+ protocol = PSM_PROTOCOL_TIDFLOW;
+ flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num);
+ if (!flow)
+ goto ret; /* Invalid ack for flow */
+ ack_seq_num.psn_seq--;
+
+ psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen);
+ psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen);
+ /* Update xmit_ack_num with both new generation and new
+ * acked sequence; update xmit_seq_num with the new flow
+ * generation, don't change the sequence number. */
+ flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0;
+ flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen;
+ psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen);
+ }
+
+ unackedq = &flow->scb_unacked;
+ scb_pend = &flow->scb_pend;
+
+ if (STAILQ_EMPTY(unackedq))
+ goto ret;
+
+ last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+ proto->epaddr_stats.nak_recv++;
+
+ _HFI_VDBG("got a nack %d on flow %d, "
+ "first is %d, last is %d\n", ack_seq_num.psn_num,
+ flow->flowid,
+ STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num.
+ psn_num, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq,
+ ips_scb,
+ nextq)->
+ seq_num.psn_num);
+
+ /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+ * we can used general psn_num to compare the PSN. */
+ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+ last_seq_num.psn_num, ack_seq_num.psn_num)
+ ) {
+ /* take it out of the xmit queue and .. */
+ if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+ flow->scb_num_pending--;
+#endif
+ SLIST_REMOVE_HEAD(scb_pend, next);
+ }
+
+ STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+ flow->scb_num_unacked--;
+ psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ if (scb->callback)
+ (*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+ scb->chunk_size : scb->payload_size);
+
+ if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT))
+ ips_scbctrl_free(scb);
+
+ /* set all index pointer to NULL if all frames has been acked */
+ if (STAILQ_EMPTY(unackedq)) {
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+ flow->timer_ack = NULL;
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ flow->timer_send = NULL;
+
+ SLIST_FIRST(scb_pend) = NULL;
+ psmi_assert(flow->scb_num_pending == 0);
+ /* Reset congestion window if all packets acknowledged */
+ flow->credits = flow->cwin = proto->flow_credits;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+ goto ret;
+ } else if (flow->timer_ack == scb->timer_ack) {
+ /*
+ * Exchange timers with last scb on unackedq.
+ * timer in scb is used by flow, cancelling current
+ * timer and then requesting a new timer takes more
+ * time, instead, we exchange the timer between current
+ * freeing scb and the last scb on unacked queue.
+ */
+ psmi_timer *timer;
+ ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+ timer = scb->timer_ack;
+ scb->timer_ack = last->timer_ack;
+ last->timer_ack = timer;
+ timer = scb->timer_send;
+ scb->timer_send = last->timer_send;
+ last->timer_send = timer;
+
+ scb->timer_ack->context = scb;
+ scb->timer_send->context = scb;
+ last->timer_ack->context = last;
+ last->timer_send->context = last;
+ }
+ }
+
+ psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */
+
+ if (protocol == PSM_PROTOCOL_TIDFLOW)
+ ips_tidflow_nak_post_process(proto, flow);
+ else if (scb->nfrag > 1)
+ ips_dmaflow_nak_post_process(proto, flow);
+
+ /* Always cancel ACK timer as we are going to restart the flow */
+ psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+ /* What's now pending is all that was unacked */
+ SLIST_FIRST(scb_pend) = scb;
+#ifdef PSM_DEBUG
+ flow->scb_num_pending = flow->scb_num_unacked;
+#endif
+ while (scb && !(scb->flags & IPS_SEND_FLAG_PENDING)) {
+ /* Wait for the previous dma completion */
+ if (flow->transfer == PSM_TRANSFER_DMA &&
+ scb->dma_complete == 0)
+ ips_proto_dma_wait_until(proto, scb);
+
+ scb->flags |= IPS_SEND_FLAG_PENDING;
+ scb = SLIST_NEXT(scb, next);
+ }
+
+ /* If NAK with congestion bit set - delay re-transmitting and THEN adjust
+ * CCA rate.
+ */
+ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) {
+ uint64_t offset;
+
+ /* Clear congestion event and mark flow as congested */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ flow->flags |= IPS_FLOW_FLAG_CONGESTED;
+
+ /* For congested flow use slow start i.e. reduce congestion window.
+ * For TIDFLOW we cannot reduce congestion window as peer expects
+ * header packets at regular intervals (protoexp->hdr_pkt_interval).
+ */
+ if (flow->protocol != PSM_PROTOCOL_TIDFLOW)
+ flow->credits = flow->cwin = 1;
+ else
+ flow->credits = flow->cwin;
+
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+
+ /* During congestion cancel send timer and delay retransmission by
+ * random interval
+ */
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE)
+ offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1);
+ else
+ offset = 0;
+ struct drand48_data drand48_data;
+ srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data);
+ double rnum;
+ drand48_r(&drand48_data, &rnum);
+ psmi_timer_request(proto->timerq, flow->timer_send,
+ (get_cycles() +
+ (uint64_t) (offset *
+ (rnum + 1.0))));
+ }
+ else {
+ int num_resent = 0;
+
+ /* Reclaim all credits upto congestion window only */
+ flow->credits = flow->cwin;
+ flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+
+ /* Flush pending scb's */
+ flow->flush(flow, &num_resent);
+
+ proto->epaddr_stats.send_rexmit += num_resent;
+ }
+
+ret:
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ psmi_seqnum_t seq_num;
+ int16_t seq_off;
+
+ INC_TIME_SPEND(TIME_SPEND_USER4);
+ PSM2_LOG_MSG("entering");
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ recvq->proto->epaddr_stats.err_chk_recv++;
+ /* Ignore FECN bit since this is the control path */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+ seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num);
+
+ if_pf(seq_off <= 0) {
+ _HFI_VDBG("naking for seq=%d, off=%d on flowid %d\n",
+ seq_num.psn_num, seq_off, flowid);
+
+ if (seq_off < -flow->ack_interval)
+ flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ ips_proto_send_nak(recvq, flow);
+ flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+ }
+ else {
+ ips_scb_t ctrlscb;
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+
+ ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+ &ipsaddr->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ struct ips_protoexp *protoexp = recvq->proto->protoexp;
+ struct ips_tid_recv_desc *tidrecvc;
+ ips_scb_t ctrlscb;
+ psmi_seqnum_t err_seqnum, recvseq;
+ ptl_arg_t desc_id = p_hdr->data[0];
+ ptl_arg_t send_desc_id = p_hdr->data[1];
+ int16_t seq_off;
+ uint8_t ack_type;
+
+ INC_TIME_SPEND(TIME_SPEND_USER4);
+ PSM2_LOG_MSG("entering");
+ recvq->proto->epaddr_stats.err_chk_recv++;
+
+ /* Ignore FECN bit since this is the control path */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ /* Get the flowgenseq for err chk gen */
+ err_seqnum.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+
+ /* Get receive descriptor */
+ psmi_assert(desc_id._desc_idx < HFI_TF_NFLOWS);
+ tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+ if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) {
+ /* Receive descriptor mismatch in time and space.
+ * Stale err chk gen, drop packet
+ */
+ _HFI_DBG
+ ("ERR_CHK_GEN: gen mismatch Pkt: 0x%x, Current: 0x%x\n",
+ desc_id._desc_genc, tidrecvc->rdescid._desc_genc);
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+ /*
+ * We change tidrecvc->tidflow_genseq here only when a new generation
+ * is allocated and programmed into hardware. Otherwise we use local
+ * variable recvseq to create the reply.
+ */
+ recvseq = tidrecvc->tidflow_genseq;
+
+ /* Get the latest seq from hardware tidflow table. But
+ * only do this when context sharing is not used, because
+ * context sharing might drop packet even though hardware
+ * has received it successfully.
+ */
+ if (!tidrecvc->context->tf_ctrl)
+ recvseq.psn_seq = hfi_tidflow_get_seqnum(
+ hfi_tidflow_get(tidrecvc->context->ctrl,
+ tidrecvc->rdescid._desc_idx));
+
+ if (err_seqnum.psn_gen != recvseq.psn_gen) {
+ ack_type = OPCODE_NAK;
+ /* NAK without allocating a new generation */
+
+ /* My current generation and last received seq */
+ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val;
+ } else {
+ /* Either lost packets or lost ack, we need to deal
+ * with wrap around of the seq value from 2047 to 0
+ * because seq is only 11 bits */
+ seq_off = (int16_t)(err_seqnum.psn_seq - recvseq.psn_seq);
+ if (seq_off < 0)
+ seq_off += 2048; /* seq is 11 bits */
+
+ if (seq_off < 1024) {
+ ack_type = OPCODE_NAK;
+ /* NAK with allocating a new generation */
+
+ /* set latest seq */
+ tidrecvc->tidflow_genseq.psn_seq = recvseq.psn_seq;
+ /* allocate and set a new generation */
+ ips_protoexp_flow_newgen(tidrecvc);
+ /* get the new generation */
+ recvseq.psn_gen = tidrecvc->tidflow_genseq.psn_gen;
+
+ /* My new generation and last received seq */
+ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val;
+ } else
+ /* ACK with last received seq,
+ * no need to set ips_lrh.data[1].u32w0 */
+ ack_type = OPCODE_ACK;
+ }
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0].u64 = send_desc_id.u64;
+ /* Keep peer generation but use my last received sequence */
+ err_seqnum.psn_seq = recvseq.psn_seq;
+ ctrlscb.ips_lrh.ack_seq_num = err_seqnum.psn_val;
+
+ /* May want to generate a BECN if a lot of swapped generations */
+ if_pf((tidrecvc->tidflow_nswap_gen > 4) &&
+ (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+ _HFI_CCADBG
+ ("ERR_CHK_GEN: Generating BECN. Number of swapped generations: %d.\n",
+ tidrecvc->tidflow_nswap_gen);
+ /* Mark flow to generate BECN in control packet */
+ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+ /* Update stats for congestion encountered */
+ recvq->proto->epaddr_stats.congestion_pkts++;
+ }
+
+ ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+ ack_type, &tidrecvc->ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+
+ /* Update stats for expected window */
+ tidrecvc->stats.nErrChkReceived++;
+ if (ack_type == OPCODE_NAK)
+ tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */
+
+ PSM2_LOG_MSG("leaving");
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Handle a received BECN (Backward Explicit Congestion Notification).
+ * Looks up the flow the packet belongs to and, while the flow's CCA
+ * congestion-control table index can still be raised without exceeding
+ * proto->ccti_limit, reduces the flow's rate via ips_cca_adjust_rate()
+ * and clears the BECN bit from the event so it is not acted on twice.
+ * Always returns IPS_RECVHDRQ_CONTINUE.
+ */
+int
+ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+ int flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ /* Only adjust while below the CCA table limit for this SL */
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].ccti_increase);
+ /* Clear congestion event */
+ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Log a packet whose opcode is not recognized.  Emits a debug message
+ * and, when __HFI_DBG verbosity is enabled, dumps the full protocol
+ * header for post-mortem analysis.  The packet itself is discarded by
+ * the caller.
+ */
+static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto)
+{
+ _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code);
+
+ if (hfi_debug & __HFI_DBG) {
+ ips_proto_show_header(proto, "received bad opcode");
+ ips_proto_dump_frame(proto, sizeof(struct ips_message_header),
+ "Opcode error protocol header dump");
+ }
+}
+
+/*
+ * Receive-path handler for packets carrying an opcode with no registered
+ * handler: bump the unknown-packet counter and log the offending header
+ * via ips_bad_opcode().  The packet is dropped; processing continues.
+ */
+int
+ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *protocol_header = rcv_ev->p_hdr;
+ struct ips_proto *proto = rcv_ev->proto;
+
+ proto->stats.unknown_packets++;
+ ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header);
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Dispatch a connect/disconnect request or reply to
+ * ips_proto_process_connect().  These control packets always carry a
+ * payload (asserted below).  Any processing error is fatal: it is
+ * reported through psmi_handle_error() with PSMI_EP_NORETURN.
+ */
+int
+ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev)
+{
+ psm2_error_t err = PSM2_OK;
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+ psmi_assert(payload);
+ err = ips_proto_process_connect(rcv_ev->proto,
+ _get_proto_hfi_opcode(rcv_ev->p_hdr),
+ rcv_ev->p_hdr,
+ payload,
+ paylen);
+ if (err != PSM2_OK)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Process connect/disconnect error: %d, opcode %d\n",
+ err, _get_proto_hfi_opcode(rcv_ev->p_hdr));
+
+ return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* Handler for packets arriving from a peer with no known epaddr.
+ * Connect/disconnect opcodes are forwarded to the connect machinery;
+ * anything else is treated as crosstalk/stray traffic and reported.
+ */
+/* Return 1 if packet is ok. */
+/* Return 0 if packet should be skipped */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+ uint8_t ptype = rcv_ev->ptype;
+ struct ips_proto *proto = rcv_ev->proto;
+ psm2_ep_t ep_err;
+ char *pkt_type;
+ int opcode = (int)_get_proto_hfi_opcode(p_hdr);
+
+ /*
+ * If the protocol is disabled or not yet enabled, no processing happens
+ * We set t_init to 0 when disabling the protocol
+ */
+ if (proto->t_init == 0)
+ return IPS_RECVHDRQ_CONTINUE;
+
+ /* Connect messages don't have to be from a known epaddr */
+ switch (opcode) {
+ case OPCODE_CONNECT_REQUEST:
+ case OPCODE_CONNECT_REPLY:
+ case OPCODE_DISCONNECT_REQUEST:
+ case OPCODE_DISCONNECT_REPLY:
+ ips_proto_connect_disconnect(
+ (struct ips_recvhdrq_event *)rcv_ev);
+ return IPS_RECVHDRQ_CONTINUE;
+ default:
+ break;
+ }
+
+ /* Packet from "unknown" peer. Log the packet and payload if at appropriate
+ * verbose level.
+ */
+ {
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+ ips_proto_dump_err_stats(proto);
+
+ if (hfi_debug & __HFI_PKTDBG) {
+ ips_proto_dump_frame(rcv_ev->p_hdr,
+ HFI_MESSAGE_HDR_SIZE, "header");
+ if (paylen)
+ ips_proto_dump_frame(payload, paylen, "data");
+ }
+ }
+
+ /* Other messages are definitely crosstalk. */
+ /* out-of-context expected messages are always fatal */
+ if (ptype == RCVHQ_RCV_TYPE_EXPECTED) {
+ ep_err = PSMI_EP_NORETURN;
+ pkt_type = "expected";
+ } else if (ptype == RCVHQ_RCV_TYPE_EAGER) {
+ ep_err = PSMI_EP_LOGEVENT;
+ pkt_type = "eager";
+ } else {
+ ep_err = PSMI_EP_NORETURN;
+ pkt_type = "unknown";
+ }
+
+ proto->stats.stray_packets++;
+
+ /* If we have debug mode, print the complete packet every time */
+ if (hfi_debug & __HFI_PKTDBG)
+ ips_proto_show_header(p_hdr, "invalid connidx");
+
+ /* At this point we are out of luck. */
+ psmi_handle_error(ep_err, PSM2_EPID_NETWORK_ERROR,
+ "Received %s message(s) ptype=0x%x opcode=0x%x"
+ " from an unknown process", pkt_type, ptype, opcode);
+
+ return 0; /* Always skip this packet unless the above call was a noreturn
+ * call */
+}
+
+/* get the error string as a number and a string */
+/* Formats "RHFerror <hex>: <decoded string>" into msg (size msglen).
+ * If the numeric prefix does not fit, the decode overwrites msg from
+ * the start instead.
+ */
+static void rhf_errnum_string(char *msg, size_t msglen, long err)
+{
+ int len;
+ char *errmsg;
+
+ len = snprintf(msg, msglen, "RHFerror %lx: ", err);
+ /* NOTE(review): int vs size_t comparison; safe here because len > 0
+ * is checked first, so len promotes to a small positive value. */
+ if (len > 0 && len < msglen) {
+ errmsg = msg + len;
+ msglen -= len;
+ } else
+ errmsg = msg;
+ *errmsg = 0;
+ ips_proto_get_rhf_errstring(err, errmsg, msglen);
+}
+
+/*
+ * Error handling
+ *
+ * Central dispatcher for packets flagged with RHF errors.  Classifies
+ * the error_flags into TID errors, tidflow gen/seq errors and data
+ * (CRC/ECC/length/...) errors, routes each to the matching protoexp
+ * handler, and emits rate-limited diagnostics.  Always returns 0 so
+ * the caller skips the packet.
+ */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_proto *proto = rcv_ev->proto;
+ int pkt_verbose_err = hfi_debug & __HFI_PKTDBG;
+ int tiderr = rcv_ev->error_flags & HFI_RHF_TIDERR;
+ int tf_seqerr = rcv_ev->error_flags & HFI_RHF_TFSEQERR;
+ int tf_generr = rcv_ev->error_flags & HFI_RHF_TFGENERR;
+ int data_err = rcv_ev->error_flags &
+ (HFI_RHF_ICRCERR | HFI_RHF_ECCERR | HFI_RHF_LENERR |
+ HFI_RHF_DCERR | HFI_RHF_DCUNCERR | HFI_RHF_KHDRLENERR);
+ char pktmsg[128];
+
+ *pktmsg = 0;
+ /*
+ * Tid errors on eager pkts mean we get a headerq overflow, perfectly
+ * safe. Tid errors on expected or other packets means trouble.
+ */
+ if (tiderr && rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) {
+ struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+
+ /* Payload dropped - Determine flow for this header and see if
+ * we need to generate a NAK.
+ *
+ * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO
+ * CONGESTION AS THE EAGER BUFFER IS FULL.
+ *
+ * Possible eager packet type:
+ *
+ * Ctrl Message - ignore
+ * MQ message - Can get flow and see if we need to NAK.
+ * AM message - Can get flow and see if we need to NAK.
+ */
+
+ proto->stats.hdr_overflow++;
+ /* Header also corrupted - nothing trustworthy to act on */
+ if (data_err)
+ return 0;
+
+ switch (_get_proto_hfi_opcode(p_hdr)) {
+ case OPCODE_TINY:
+ case OPCODE_SHORT:
+ case OPCODE_EAGER:
+ case OPCODE_LONG_RTS:
+ case OPCODE_LONG_CTS:
+ case OPCODE_LONG_DATA:
+ case OPCODE_AM_REQUEST:
+ case OPCODE_AM_REQUEST_NOREPLY:
+ case OPCODE_AM_REPLY:
+ {
+ ips_epaddr_flow_t flowid =
+ ips_proto_flowid(p_hdr);
+ struct ips_epstate_entry *epstaddr;
+ struct ips_flow *flow;
+ psmi_seqnum_t sequence_num;
+ int16_t diff;
+
+ /* Obtain ipsaddr for packet */
+ epstaddr =
+ ips_epstate_lookup(rcv_ev->recvq->epstate,
+ rcv_ev->p_hdr->connidx);
+ if_pf(epstaddr == NULL
+ || epstaddr->ipsaddr == NULL)
+ return 0; /* Unknown packet - drop */
+
+ rcv_ev->ipsaddr = epstaddr->ipsaddr;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &rcv_ev->ipsaddr->flows[flowid];
+ sequence_num.psn_val =
+ __be32_to_cpu(p_hdr->bth[2]);
+ /* int16_t cast handles PSN wraparound */
+ diff =
+ (int16_t) (sequence_num.psn_num -
+ flow->recv_seq_num.psn_num);
+
+ /* In-order or ahead, and no NAK already queued */
+ if (diff >= 0
+ && !(flow->
+ flags & IPS_FLOW_FLAG_NAK_SEND)) {
+ /* Mark flow as congested and attempt to generate NAK */
+ flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+ proto->epaddr_stats.congestion_pkts++;
+
+ flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+ flow->cca_ooo_pkts = 0;
+ ips_proto_send_nak((struct ips_recvhdrq
+ *)rcv_ev->recvq,
+ flow);
+ }
+
+ /* Safe to process ACKs from header */
+ ips_proto_process_ack(rcv_ev);
+ }
+ break;
+ case OPCODE_EXPTID:
+ /* If RSM is matching packets that are TID&FECN&SH,
+ * it is possible to have a EXPTID packet encounter
+ * the eager full condition and have the payload
+ * dropped (but the header delivered).
+ * Treat this condition as a data error (corruption,etc)
+ * and send a NAK.
+ */
+ ips_protoexp_handle_data_err(rcv_ev);
+ break;
+ default:
+ break;
+ }
+ } else if (tf_generr) /* handle generr, ignore tiderr if any */
+ ips_protoexp_handle_tf_generr(rcv_ev);
+ else if (tf_seqerr)
+ ips_protoexp_handle_tf_seqerr(rcv_ev);
+ else if (tiderr) { /* tid error, but not on an eager pkt */
+ psm2_ep_t ep_err = PSMI_EP_LOGEVENT;
+ uint16_t tid, offset;
+ uint64_t t_now = get_cycles();
+
+ proto->tiderr_cnt++;
+
+ /* Whether and how we will be logging this event:
+ * fatal past tiderr_max, rate-limited warning otherwise,
+ * silent (ep_err == NULL) inside the warn interval. */
+ if (proto->tiderr_max > 0
+ && proto->tiderr_cnt >= proto->tiderr_max)
+ ep_err = PSMI_EP_NORETURN;
+ else if (proto->tiderr_warn_interval != UINT64_MAX &&
+ proto->tiderr_tnext <= t_now)
+ proto->tiderr_tnext =
+ get_cycles() + proto->tiderr_warn_interval;
+ else
+ ep_err = NULL;
+
+ if (ep_err != NULL) {
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+
+ tid = (__le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) >>
+ HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK;
+ offset = __le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) &
+ HFI_KHDR_OFFSET_MASK;
+
+ psmi_handle_error(ep_err, PSM2_EP_DEVICE_FAILURE,
+ "%s with tid=%d,offset=%d,count=%d: %s %s",
+ "TID Error",
+ tid, offset, proto->tiderr_cnt,
+ pktmsg, ep_err == PSMI_EP_NORETURN ?
+ "(Terminating...)" : "");
+ }
+
+ ips_protoexp_handle_tiderr(rcv_ev);
+ } else if (data_err) {
+#if _HFI_DEBUGGING
+ if (_HFI_DBG_ON) {
+ uint8_t op_code
+ = _get_proto_hfi_opcode(rcv_ev->p_hdr);
+
+ if (!pkt_verbose_err) {
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+ _HFI_DBG_ALWAYS
+ ("Error %s pkt type opcode 0x%x at hd=0x%x %s\n",
+ (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER)
+ ? "eager" : (rcv_ev-> ptype ==
+ RCVHQ_RCV_TYPE_EXPECTED)
+ ? "expected" : (rcv_ev->ptype ==
+ RCVHQ_RCV_TYPE_NON_KD) ? "non-kd" :
+ "<error>", op_code,
+ rcv_ev->recvq->state->hdrq_head, pktmsg);
+ }
+ }
+#endif
+
+ if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED)
+ ips_protoexp_handle_data_err(rcv_ev);
+ } else { /* not a tid or data error -- some other error */
+#if _HFI_DEBUGGING
+ if (_HFI_DBG_ON) {
+ uint8_t op_code =
+ __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF;
+
+ if (!pkt_verbose_err)
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+
+ /* else RHFerr decode printed below */
+ _HFI_DBG_ALWAYS
+ ("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n",
+ rcv_ev->ptype, op_code,
+ rcv_ev->recvq->state->hdrq_head, pktmsg);
+ }
+#endif
+ }
+ if (pkt_verbose_err) {
+ if (!*pktmsg)
+ rhf_errnum_string(pktmsg, sizeof(pktmsg),
+ rcv_ev->error_flags);
+ ips_proto_show_header(rcv_ev->p_hdr, pktmsg);
+ }
+
+ return 0;
+}
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
new file mode 100644
index 0000000..4b2617f
--- /dev/null
+++ b/ptl_ips/ips_recvhdrq.c
@@ -0,0 +1,869 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+
+/*
+ * Receive header queue initialization.
+ *
+ * Deep-copies the hdrq/egrq parameters into 'recvq', allocates the
+ * eager-buffer index table, configures RHF tail/sequence tracking
+ * according to HFI1_CAP_DMA_RTAIL, resets all progress state and reads
+ * the PSM2_HEAD_UPDATE knob.  Returns PSM2_OK on success or the error
+ * from the eager-table allocation failure.
+ */
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+ const struct ips_epstate *epstate,
+ const struct ips_proto *proto,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ const struct ips_recvhdrq_callbacks *callbacks,
+ uint32_t runtime_flags,
+ uint32_t subcontext,
+ struct ips_recvhdrq *recvq,
+ struct ips_recvhdrq_state *recvq_state)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ psm2_error_t err = PSM2_OK;
+
+ memset(recvq, 0, sizeof(*recvq));
+ recvq->proto = (struct ips_proto *)proto;
+ recvq->state = recvq_state;
+ recvq->context = context;
+ recvq->subcontext = subcontext;
+ /* These runtime flags may be different from the context's runtime flags since
+ * a receive queue may be initialised to represent a "software" receive
+ * queue (shared contexts) or a hardware receive queue */
+ recvq->runtime_flags = runtime_flags;
+ recvq->hdrq = *hdrq_params; /* deep copy */
+ pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED);
+ /* dword offset of the RHF within each header queue entry */
+ recvq->hdrq_rhf_off =
+ (ctxt_info->rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT;
+
+ if (recvq->runtime_flags & HFI1_CAP_DMA_RTAIL) {
+ recvq->hdrq_rhf_notail = 0;
+ recvq->state->hdrq_rhf_seq = 0; /* _seq is ignored */
+ } else {
+ recvq->hdrq_rhf_notail = 1;
+ recvq->state->hdrq_rhf_seq = 1;
+ }
+ recvq->hdrq_elemlast = ((recvq->hdrq.elemcnt - 1) * recvq->hdrq.elemsz);
+
+ recvq->egrq = *egrq_params; /* deep copy */
+ recvq->egrq_buftable =
+ ips_recvq_egrbuf_table_alloc(context->ep, recvq->egrq.base_addr,
+ recvq->egrq.elemcnt,
+ recvq->egrq.elemsz);
+ if (recvq->egrq_buftable == NULL) {
+ err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+ "Couldn't allocate memory for eager buffer index table");
+ goto fail;
+ }
+
+ recvq->epstate = epstate;
+ recvq->recvq_callbacks = *callbacks; /* deep copy */
+ SLIST_INIT(&recvq->pending_acks);
+
+ recvq->state->hdrq_head = 0;
+ recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ recvq->state->num_hdrq_done = 0;
+ recvq->state->num_egrq_done = 0;
+ recvq->state->hdr_countdown = 0;
+ recvq->state->hdrq_cachedlastscan = 0;
+
+ {
+ union psmi_envvar_val env_hdr_update;
+ psmi_getenv("PSM2_HEAD_UPDATE",
+ "header queue update interval (0 to update after all entries are processed). Default is 64",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val) 64, &env_hdr_update);
+
+ /* Cap max header update interval to size of header/eager queue */
+ recvq->state->head_update_interval =
+ min(env_hdr_update.e_uint, recvq->hdrq.elemcnt - 1);
+ recvq->state->egrq_update_interval = 1;
+ }
+
+ /* NOTE: the success path also falls through this label (err == PSM2_OK) */
+fail:
+ return err;
+}
+
+/* Tear down a receive header queue: releases the eager buffer index
+ * table allocated by ips_recvhdrq_init().  Always returns PSM2_OK. */
+psm2_error_t ips_recvhdrq_fini(struct ips_recvhdrq *recvq)
+{
+ ips_recvq_egrbuf_table_free(recvq->egrq_buftable);
+ return PSM2_OK;
+}
+
+/* flush the eager buffers, by setting the eager index head to eager index tail
+ if eager buffer queue is full.
+
+ Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR
+ was set in RHF errors), and no good eager packets were received, so
+ that eager head wasn't advanced.
+*/
+/* NOTE: compiled out (#if 0); kept for reference.  As written it only
+ * logs and counts the overflow - it does not actually advance head. */
+#if 0
+static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq)
+{
+ const uint32_t tail = ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t head = ips_recvq_head_get(&recvq->egrq);
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+
+ if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) {
+ _HFI_DBG("eager array full after overflow, flushing "
+ "(head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ recvq->proto->stats.egr_overflow++;
+ }
+ return;
+}
+#endif
+
+/*
+ * Helpers for ips_recvhdrq_progress.
+ */
+
+/* Extract the destination subcontext field from BTH[1]. */
+static __inline__ int
+_get_proto_subcontext(const struct ips_message_header *p_hdr)
+{
+ return ((__be32_to_cpu(p_hdr->bth[1]) >>
+ HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK);
+}
+
+/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_fecn_set(const struct ips_message_header *p_hdr)
+{
+ return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
+}
+
+/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_becn_set(const struct ips_message_header *p_hdr)
+{
+ return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
+}
+
+/* Locate the protocol header inside a header queue entry using the
+ * header offset encoded in the RHF (used when RHF precedes the header). */
+static __inline__ struct ips_message_header *_get_proto_hdr_from_rhf(const
+ uint32_t *
+ rcv_hdr,
+ const
+ __le32 *
+ rhf)
+{
+ return (struct ips_message_header *)(rcv_hdr +
+ hfi_hdrget_hdrq_offset(rhf));
+}
+
+/* Locate the protocol header at its fixed offset (2 dwords) within a
+ * header queue entry, for the layout where hdrq_rhf_off is zero. */
+static __inline__ struct ips_message_header *_get_proto_hdr(const uint32_t *
+ rcv_hdr)
+{
+ return (struct ips_message_header *)&rcv_hdr[2];
+}
+
+/* Read the RHF sequence number from the queue entry's RHF dwords. */
+static __inline__ uint32_t
+_get_rhf_seq(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr)
+{
+ return hfi_hdrget_seq((const __le32 *)rcv_hdr + recvq->hdrq_rhf_off);
+}
+
+/* Read the packet length in bytes from the queue entry's RHF dwords. */
+static __inline__ uint32_t
+_get_rhf_len_in_bytes(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr)
+{
+ return hfi_hdrget_length_in_bytes((const __le32 *)rcv_hdr +
+ recvq->hdrq_rhf_off);
+}
+
+/* Diagnostic dump of an invalid packet: header always shown under
+ * PSM_DEBUG builds, header+payload hex dump under __HFI_PKTDBG. */
+static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
+{
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+#ifdef PSM_DEBUG
+ ips_proto_show_header((struct ips_message_header *)
+ rcv_ev->rcv_hdr, "received invalid pkt");
+#endif
+ if (hfi_debug & __HFI_PKTDBG) {
+ ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE,
+ "header");
+ if (paylen)
+ ips_proto_dump_frame(payload, paylen, "data");
+ }
+
+}
+
+/* Increment one protocol error counter per error bit set in 'err'
+ * (RHF error flags; several bits may be set at once). */
+static __inline__ void
+_update_error_stats(struct ips_proto *proto, uint32_t err)
+{
+ if (err & HFI_RHF_ICRCERR)
+ proto->error_stats.num_icrc_err++;
+ if (err & HFI_RHF_ECCERR)
+ proto->error_stats.num_ecc_err++;
+ if (err & HFI_RHF_LENERR)
+ proto->error_stats.num_len_err++;
+ if (err & HFI_RHF_TIDERR)
+ proto->error_stats.num_tid_err++;
+ if (err & HFI_RHF_DCERR)
+ proto->error_stats.num_dc_err++;
+ if (err & HFI_RHF_DCUNCERR)
+ proto->error_stats.num_dcunc_err++;
+ if (err & HFI_RHF_KHDRLENERR)
+ proto->error_stats.num_khdrlen_err++;
+}
+
+#ifdef PSM_DEBUG
+/* Debug-build sanity checks on a received header queue entry.
+ * Verifies RHF sequence number range, destination context, RHF vs LRH
+ * packet length agreement, and that the DLID falls within our LID+LMC
+ * range.  Returns 0 if the headers look sane, -1 to drop the packet
+ * (some failures terminate the process via PSMI_EP_NORETURN).
+ */
+static int _check_headers(struct ips_recvhdrq_event *rcv_ev)
+{
+ struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+ struct ips_proto *proto = rcv_ev->proto;
+ uint32_t *lrh = (uint32_t *) rcv_ev->p_hdr;
+ const uint32_t *rcv_hdr = rcv_ev->rcv_hdr;
+ uint32_t dest_context;
+ const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]);
+ const uint16_t base_dlid =
+ __be16_to_cpu(recvq->proto->epinfo.ep_base_lid);
+
+ /* Check that the receive header queue entry has a sane sequence number */
+ if (_get_rhf_seq(recvq, rcv_hdr) > LAST_RHF_SEQNO) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n",
+ _get_rhf_seq(recvq, rcv_hdr),
+ recvq->state->hdrq_rhf_seq, lrh[0], lrh[1]);
+ return -1;
+ }
+
+ /* Verify that the packet was destined for our context */
+ dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr);
+ if_pf(dest_context != recvq->proto->epinfo.ep_context) {
+
+ struct ips_recvhdrq_state *state = recvq->state;
+
+ /* Packet not targeted at us. Drop packet and continue */
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. Exiting.\n",
+ dest_context, recvq->proto->epinfo.ep_context,
+ state->hdrq_head);
+
+ return -1;
+ }
+
+ /* Verify that rhf packet length matches the length in LRH */
+ if_pf(_get_rhf_len_in_bytes(recvq, rcv_hdr) !=
+ (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << BYTE2DWORD_SHIFT)) {
+ _HFI_EPDBG
+ ("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n",
+ _get_rhf_len_in_bytes(recvq, rcv_hdr) >> 2,
+ __be16_to_cpu(rcv_ev->p_hdr->lrh[2]));
+
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+ return -1;
+ }
+
+ /* Verify that the DLID matches our local LID. */
+ if_pf(!((base_dlid <= pkt_dlid) &&
+ (pkt_dlid <=
+ (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) {
+ _HFI_EPDBG
+ ("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n",
+ rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid);
+ ips_proto_dump_err_stats(proto);
+ _dump_invalid_pkt(rcv_ev);
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
+/* Validate the software checksum appended to a packet when
+ * IPS_PROTO_FLAG_CKSUM is enabled.  The sender appends two identical
+ * 32-bit CRC words after the payload; both must match the CRC computed
+ * over header + padded payload.  Returns 1 if the checksum is good,
+ * 0 on mismatch (after logging a detailed diagnostic and dumping the
+ * packet). */
+static __inline__ int do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev)
+{
+ char *payload = ips_recvhdrq_event_payload(rcv_ev);
+ /* paylen includes the pad-count bits from BTH[0] (bits 20-21) */
+ uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+ uint32_t *ckptr;
+ uint32_t recv_cksum, cksum, dest_subcontext;
+
+ /* With checksum every packet has a payload */
+ psmi_assert_always(payload);
+
+ ckptr = (uint32_t *) (payload + paylen);
+ recv_cksum = ckptr[0];
+
+ /* Calculate checksum hdr + payload (includes any padding words) */
+ cksum = 0xffffffff;
+ cksum = ips_crc_calculate(HFI_MESSAGE_HDR_SIZE,
+ (uint8_t *) rcv_ev->p_hdr, cksum);
+ if (paylen)
+ cksum = ips_crc_calculate(paylen, (uint8_t *) payload, cksum);
+
+ /* ckptr[0] != ckptr[1] means the trailer itself was corrupted */
+ if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) {
+ struct ips_epstate_entry *epstaddr;
+ uint32_t lcontext;
+ uint32_t hd, tl;
+
+ epstaddr =
+ ips_epstate_lookup(rcv_ev->recvq->epstate,
+ rcv_ev->p_hdr->connidx);
+ epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL;
+
+ lcontext = epstaddr ? rcv_ev->proto->epinfo.ep_context : -1;
+
+ hd = rcv_ev->recvq->context->ctrl->__hfi_rcvhdrhead[0];
+ tl = rcv_ev->recvq->context->ctrl->__hfi_rcvhdrhead[-2];
+
+ dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr);
+
+ _HFI_ERROR
+ ("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%x,%x, rhfseq 0x%x\n",
+ (dest_subcontext !=
+ rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext,
+ epstaddr ? __be16_to_cpu(epstaddr->ipsaddr->pathgrp->
+ pg_base_lid) : -1, cksum,
+ ckptr[0], ckptr[1], _get_proto_hfi_opcode(rcv_ev->p_hdr),
+ rcv_ev->error_flags, hd, tl, rcv_ev->rhf[0],
+ rcv_ev->rhf[1],
+ _get_rhf_seq((struct ips_recvhdrq *)rcv_ev->recvq,
+ rcv_ev->rcv_hdr));
+
+ /* Dump packet */
+ _dump_invalid_pkt(rcv_ev);
+ return 0; /* Packet checksum error */
+ }
+
+ return 1;
+}
+
+/* Drain the receive queue's pending-ack list: for each queued flow,
+ * send the deferred ACK or NAK control message carrying the flow's
+ * current receive sequence number, and clear the corresponding
+ * PENDING_ACK/PENDING_NAK flag.  A flow is expected to have exactly
+ * one of the two flags set (asserted below). */
+PSMI_ALWAYS_INLINE(
+void
+process_pending_acks(struct ips_recvhdrq *recvq))
+{
+ ips_scb_t ctrlscb;
+
+ /* If any pending acks, dispatch them now */
+ while (!SLIST_EMPTY(&recvq->pending_acks)) {
+ struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks);
+
+ SLIST_REMOVE_HEAD(&recvq->pending_acks, next);
+ SLIST_NEXT(flow, next) = NULL;
+
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+
+ /* NOTE(review): ctrlscb.cksum is passed below without being
+ * initialized here - presumably ignored for ACK/NAK control
+ * messages; confirm against ips_proto_send_ctrl_message(). */
+ if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+ psmi_assert_always((flow->
+ flags & IPS_FLOW_FLAG_PENDING_NAK)
+ == 0);
+
+ flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;
+ ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ } else {
+ psmi_assert_always(flow->
+ flags & IPS_FLOW_FLAG_PENDING_NAK);
+
+ flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;
+ ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+ }
+}
+
+/*
+ * Core receive progress function
+ *
+ * recvhdrq_progress is the core function that services the receive header
+ * queue and optionally, the eager queue. At the lowest level, it identifies
+ * packets marked with errors by the chip and also detects and corrects when
+ * eager overflow conditions occur. At the highest level, it queries the
+ * 'epstate' interface to classify packets from "known" and "unknown"
+ * endpoints. In order to support shared contexts, it can also handle packets
+ * destined for other contexts (or "subcontexts").
+ */
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
+{
+ struct ips_recvhdrq_state *state = recvq->state;
+ const __le32 *rhf;
+ PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto =
+ recvq->proto,
+ .recvq = recvq
+ };
+ struct ips_epstate_entry *epstaddr;
+
+ uint32_t num_hdrq_done = 0;
+ const int num_hdrq_todo = recvq->hdrq.elemcnt;
+ const uint32_t hdrq_elemsz = recvq->hdrq.elemsz;
+ uint32_t dest_subcontext;
+
+ int ret = IPS_RECVHDRQ_CONTINUE;
+ int done = 0;
+ int do_hdr_update = 0;
+
+ /* Chip features */
+ const int has_rtail = recvq->runtime_flags & HFI1_CAP_DMA_RTAIL;
+
+ /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */
+#define next_hdrq_is_ready() \
+ (has_rtail ? \
+ state->hdrq_head != ips_recvq_tail_get(&recvq->hdrq) : \
+ recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, rcv_hdr))
+
+ const uint32_t *rcv_hdr =
+ (const uint32_t *)recvq->hdrq.base_addr + state->hdrq_head;
+ uint32_t tmp_hdrq_head;
+
+ PSM2_LOG_MSG("entering");
+ done = !next_hdrq_is_ready();
+
+ while (!done) {
+
+ rhf = (const __le32 *)rcv_hdr + recvq->hdrq_rhf_off;
+ rcv_ev.error_flags = hfi_hdrget_err_flags(rhf);
+ rcv_ev.ptype = hfi_hdrget_rcv_type(rhf);
+ rcv_ev.rhf = rhf;
+ rcv_ev.rcv_hdr = rcv_hdr;
+ rcv_ev.p_hdr =
+ recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(rcv_hdr, rhf)
+ : _get_proto_hdr(rcv_hdr);
+ rcv_ev.has_cksum =
+ ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM));
+
+ _HFI_VDBG
+ ("new packet: rcv_hdr %p, rhf_off %d, rhf %p (%x,%x), p_hdr %p\n",
+ rcv_hdr, recvq->hdrq_rhf_off, rhf, rhf[0], rhf[1],
+ rcv_ev.p_hdr);
+
+ /* If the hdrq_head is before cachedlastscan, that means that we have
+ * already prescanned this for BECNs and FECNs, so we should not check
+ * again
+ */
+ if_pt((recvq->proto->flags & IPS_PROTO_FLAG_CCA) &&
+ (state->hdrq_head >= state->hdrq_cachedlastscan)) {
+ /* IBTA CCA handling:
+ * If FECN bit set handle IBTA CCA protocol. For the
+ * flow that suffered congestion we flag it to generate
+ * a control packet with the BECN bit set - This is
+ * currently an unsolicited ACK.
+ *
+ * For all MQ packets the FECN processing/BECN
+ * generation is done in the is_expected_or_nak
+ * function as each eager packet is inspected there.
+ *
+ * For TIDFLOW/Expected data transfers the FECN
+ * bit/BECN generation is done in protoexp_data. Since
+ * header suppression can result in even FECN packets
+ * being suppressed the expected protocol generated
+ * additional BECN packets if a "large" number of
+ * generations are swapped without progress being made
+ * for receive. "Large" is set empirically to 4.
+ *
+ * FECN packets are ignored for all control messages
+ * (except ACKs and NAKs) since they indicate
+ * congestion on the control path which is not rate
+ * controlled. The CCA specification allows FECN on
+ * ACKs to be disregarded as well.
+ */
+ rcv_ev.is_congested =
+ _is_cca_fecn_set(rcv_ev.
+ p_hdr) & IPS_RECV_EVENT_FECN;
+ rcv_ev.is_congested |=
+ (_is_cca_becn_set(rcv_ev.p_hdr) <<
+ (IPS_RECV_EVENT_BECN - 1));
+ } else
+ rcv_ev.is_congested = 0;
+
+#ifdef PSM_DEBUG
+ if_pf(_check_headers(&rcv_ev))
+ goto skip_packet;
+#endif
+ dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr);
+
+ /* If the destination is not our subcontext, process
+ * message as subcontext message (shared contexts) */
+ if (dest_subcontext != recvq->subcontext) {
+ rcv_ev.ipsaddr = NULL;
+
+ ret = recvq->recvq_callbacks.callback_subcontext
+ (&rcv_ev, dest_subcontext);
+ if (ret == IPS_RECVHDRQ_REVISIT)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ goto skip_packet;
+ }
+
+ if_pf(rcv_ev.error_flags) {
+
+ _update_error_stats(recvq->proto, rcv_ev.error_flags);
+
+ recvq->recvq_callbacks.callback_error(&rcv_ev);
+
+ if ((rcv_ev.ptype != RCVHQ_RCV_TYPE_EAGER) ||
+ (!(rcv_ev.error_flags & HFI_RHF_TIDERR)))
+ goto skip_packet;
+
+ /* no pending eager update, header
+ * is not currently under tracing. */
+ if (state->hdr_countdown == 0 &&
+ state->rcv_egr_index_head == NO_EAGER_UPDATE) {
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+ const uint32_t etail =
+ ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t ehead =
+ ips_recvq_head_get(&recvq->egrq);
+
+ if (ehead == ((etail + 1) % egr_cnt)) {
+ /* eager is full,
+ * trace existing header entries */
+ uint32_t hdr_size =
+ recvq->hdrq_elemlast +
+ hdrq_elemsz;
+ const uint32_t htail =
+ ips_recvq_tail_get
+ (&recvq->hdrq);
+ const uint32_t hhead =
+ state->hdrq_head;
+
+ state->hdr_countdown =
+ (htail > hhead) ?
+ (htail - hhead) :
+ (htail + hdr_size - hhead);
+ }
+ }
+
+ /* Eager packet and tiderr.
+ * Don't consider updating egr head, unless we're in
+ * the congested state. If we're congested, we should
+ * try to keep the eager buffers free. */
+
+ if (!rcv_ev.is_congested)
+ goto skip_packet_no_egr_update;
+ else
+ goto skip_packet;
+ }
+
+ /* If checksum is enabled, verify that it is valid */
+ if_pf(rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev))
+ goto skip_packet;
+
+ _HFI_VDBG("opcode %x, payload %p paylen %d; "
+ "egrhead %lx egrtail %lx; "
+ "useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n",
+ _get_proto_hfi_opcode(rcv_ev.p_hdr),
+ ips_recvhdrq_event_payload(&rcv_ev),
+ ips_recvhdrq_event_paylen(&rcv_ev),
+ ips_recvq_head_get(&recvq->egrq),
+ ips_recvq_tail_get(&recvq->egrq),
+ hfi_hdrget_use_egrbfr(rhf),
+ hfi_hdrget_egrbfr_index(rhf),
+ hfi_hdrget_egrbfr_offset(rhf),
+ state->rcv_egr_index_head);
+
+ /* Classify packet from a known or unknown endpoint */
+ epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+ if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) {
+ rcv_ev.ipsaddr = NULL;
+ recvq->recvq_callbacks.
+ callback_packet_unknown(&rcv_ev);
+ } else {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+ ret = ips_proto_process_packet(&rcv_ev);
+ if (ret == IPS_RECVHDRQ_REVISIT)
+ {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK_NO_PROGRESS;
+ }
+ }
+
+skip_packet:
+ /*
+ * if eager buffer is used, record the index.
+ */
+ if (hfi_hdrget_use_egrbfr(rhf)) {
+ /* set only when a new entry is used */
+ if (hfi_hdrget_egrbfr_offset(rhf) == 0){
+ state->rcv_egr_index_head =
+ hfi_hdrget_egrbfr_index(rhf);
+ state->num_egrq_done++;
+ }
+ /* a header entry is using an eager entry, stop tracing. */
+ state->hdr_countdown = 0;
+ }
+
+skip_packet_no_egr_update:
+ /* Note that state->hdrq_head is sampled speculatively by the code
+ * in ips_ptl_shared_poll() when context sharing, so it is not safe
+ * for this shared variable to temporarily exceed the last element. */
+ tmp_hdrq_head = state->hdrq_head + hdrq_elemsz;
+ _HFI_VDBG
+ ("dma_rtail %d head %d, elemsz %d elemlast %d tmp %d\n",
+ has_rtail, state->hdrq_head, hdrq_elemsz,
+ recvq->hdrq_elemlast, tmp_hdrq_head);
+
+ if_pt(tmp_hdrq_head <= recvq->hdrq_elemlast)
+ state->hdrq_head = tmp_hdrq_head;
+ else
+ state->hdrq_head = 0;
+
+ if_pf(has_rtail == 0
+ && ++recvq->state->hdrq_rhf_seq > LAST_RHF_SEQNO)
+ recvq->state->hdrq_rhf_seq = 1;
+
+ state->num_hdrq_done++;
+ num_hdrq_done++;
+ rcv_hdr =
+ (const uint32_t *)recvq->hdrq.base_addr + state->hdrq_head;
+ done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK)
+ || (num_hdrq_done == num_hdrq_todo));
+
+ do_hdr_update = (state->head_update_interval ?
+ (state->num_hdrq_done ==
+ state->head_update_interval) : done);
+ if (do_hdr_update) {
+ ips_recvq_head_update(&recvq->hdrq, state->hdrq_head);
+ /* Reset header queue entries processed */
+ state->num_hdrq_done = 0;
+ }
+ if (state->num_egrq_done >= state->egrq_update_interval) {
+ /* Lazy update of egrq */
+ if (state->rcv_egr_index_head != NO_EAGER_UPDATE) {
+ ips_recvq_head_update(&recvq->egrq,
+ state->
+ rcv_egr_index_head);
+ state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ state->num_egrq_done = 0;
+ }
+ }
+ if (state->hdr_countdown > 0) {
+ /* a header entry is consumed. */
+ state->hdr_countdown -= hdrq_elemsz;
+ if (state->hdr_countdown == 0) {
+ /* header entry count reaches zero. */
+ const uint32_t tail =
+ ips_recvq_tail_get(&recvq->egrq);
+ const uint32_t head =
+ ips_recvq_head_get(&recvq->egrq);
+ uint32_t egr_cnt = recvq->egrq.elemcnt;
+
+ /* Checks eager-full again. This is a real false-egr-full */
+ if (head == ((tail + 1) % egr_cnt)) {
+ ips_recvq_head_update(&recvq->egrq,
+ tail);
+ _HFI_DBG
+ ("eager array full after overflow, flushing "
+ "(head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ recvq->proto->stats.egr_overflow++;
+ } else
+ _HFI_ERROR
+ ("PSM BUG: EgrOverflow: eager queue is not full\n");
+ }
+ }
+ }
+ /* while (hdrq_entries_to_read) */
+
+ /* Process any pending acks before exiting */
+ process_pending_acks(recvq);
+
+ PSM2_LOG_MSG("leaving");
+ return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS;
+}
+
+/* This function is designed to implement RAPID CCA. It iterates
+ through the recvq, checking each element for set FECN or BECN bits.
+ In the case of finding one, the proper response is executed, and the bits
+ are cleared.
+*/
+psm2_error_t ips_recvhdrq_scan_cca (struct ips_recvhdrq *recvq)
+{
+
+/* Looks at hdr and determines if it is the last item in the queue */
+/* NOTE(review): despite the name, this evaluates true while more entries
+ * remain to scan: the DMA_RTAIL arm compares against the tail register,
+ * the no-tail arm checks the RHF generation sequence.  The no-tail arm
+ * ignores the 'hdr' argument and always reads curr_hdr -- confirm that
+ * is intentional. */
+
+#define is_last_hdr(hdr) \
+ (has_rtail ? \
+ (hdr != ips_recvq_tail_get(&recvq->hdrq)) : \
+ (recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, curr_hdr)))
+
+ struct ips_recvhdrq_state *state = recvq->state;
+ const __le32 *rhf;
+ PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto,
+ .recvq = recvq
+ };
+
+ /* Resume where the previous prescan stopped; hdrq_cachedlastscan and
+ * elemsz are in 32-bit words (see ips_recvq_params). */
+ uint32_t num_hdrq_done = state->hdrq_cachedlastscan / recvq->hdrq.elemsz;
+ const int num_hdrq_todo = recvq->hdrq.elemcnt;
+ const uint32_t hdrq_elemsz = recvq->hdrq.elemsz;
+
+ int done;
+
+ /* Chip features */
+ const int has_rtail = recvq->runtime_flags & HFI1_CAP_DMA_RTAIL;
+
+ uint32_t *rcv_hdr =
+ (uint32_t *)recvq->hdrq.base_addr + state->hdrq_cachedlastscan;
+ uint32_t *curr_hdr = rcv_hdr;
+ uint32_t scan_head = state->hdrq_head + state->hdrq_cachedlastscan;
+
+ /* Skip the first element, since we're going to process it soon anyway */
+ if ( state->hdrq_cachedlastscan == 0 )
+ {
+ curr_hdr = curr_hdr + hdrq_elemsz;
+ scan_head += hdrq_elemsz;
+ num_hdrq_done++;
+ }
+
+ PSM2_LOG_MSG("entering");
+ done = !is_last_hdr(scan_head);
+
+ while (!done) {
+ /* Decode the RHF and locate the protocol header for this entry. */
+ rhf = (const __le32 *)curr_hdr + recvq->hdrq_rhf_off;
+ rcv_ev.error_flags = hfi_hdrget_err_flags(rhf);
+ rcv_ev.ptype = hfi_hdrget_rcv_type(rhf);
+ rcv_ev.rhf = rhf;
+ rcv_ev.rcv_hdr = curr_hdr;
+ rcv_ev.p_hdr =
+ recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(curr_hdr, rhf)
+ : _get_proto_hdr(curr_hdr);
+ rcv_ev.has_cksum =
+ ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+ (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM));
+
+ _HFI_VDBG
+ ("scanning packet for CCA: curr_hdr %p, rhf_off %d, rhf %p (%x,%x), p_hdr %p\n",
+ curr_hdr, recvq->hdrq_rhf_off, rhf, rhf[0], rhf[1],
+ rcv_ev.p_hdr);
+
+ /* FECN: the fabric marked forward congestion on this packet.
+ * Respond by sending a BECN control message back to the sender
+ * (carrying the flow's out-of-order packet count), then clear
+ * the FECN event bit. */
+ if_pt ( _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN ) {
+ struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+
+ if (epstaddr != NULL && epstaddr->ipsaddr != NULL)
+ {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+
+ /* Send BECN back */
+ ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr;
+ struct ips_message_header *p_hdr = rcv_ev.p_hdr;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+ struct ips_flow *flow;
+ ips_scb_t ctrlscb;
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ ctrlscb.flags = 0;
+ ctrlscb.ips_lrh.data[0].u32w0 =
+ flow->cca_ooo_pkts;
+
+ rcv_ev.proto->epaddr_stats.congestion_pkts++;
+ /* Clear FECN event */
+ rcv_ev.is_congested &= ~IPS_RECV_EVENT_FECN;
+
+ ips_proto_send_ctrl_message(flow,
+ OPCODE_BECN,
+ &flow->ipsaddr->
+ ctrl_msg_queued,
+ &ctrlscb, ctrlscb.cksum, 0);
+ }
+ }
+ /* BECN: the peer asked us to back off.  Raise the flow path's
+ * CCTI (i.e. slow the send rate) if still under ccti_limit,
+ * then clear the BECN event bit. */
+ else if_pt (0 != (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1))) {
+ struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate,
+ rcv_ev.p_hdr->connidx);
+
+ if (epstaddr != NULL && epstaddr->ipsaddr != NULL)
+ {
+ rcv_ev.ipsaddr = epstaddr->ipsaddr;
+
+ /* Adjust flow */
+ struct ips_proto *proto = rcv_ev.proto;
+ struct ips_message_header *p_hdr = rcv_ev.p_hdr;
+ ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr;
+ struct ips_flow *flow;
+ ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+
+ psmi_assert(flowid < EP_FLOW_LAST);
+ flow = &ipsaddr->flows[flowid];
+ if ((flow->path->pr_ccti +
+ proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) {
+ ips_cca_adjust_rate(flow->path,
+ proto->cace[flow->path->pr_sl].ccti_increase);
+ /* Clear congestion event */
+ rcv_ev.is_congested &= ~IPS_RECV_EVENT_BECN;
+ }
+ }
+ }
+
+ /* Advance to next entry and remember how far we scanned so the
+ * next invocation does not rescan these entries. */
+ curr_hdr = curr_hdr + hdrq_elemsz;
+
+ num_hdrq_done++;
+ scan_head += hdrq_elemsz;
+ state->hdrq_cachedlastscan += hdrq_elemsz;
+
+ done = (num_hdrq_done == num_hdrq_todo && !is_last_hdr(scan_head) );
+
+ }
+ /* while (hdrq_entries_to_read) */
+
+
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h
new file mode 100644
index 0000000..15761aa
--- /dev/null
+++ b/ptl_ips/ips_recvhdrq.h
@@ -0,0 +1,240 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ips_proto.h"
+#include "ips_proto_header.h"
+#include "ips_proto_params.h"
+#include "ips_recvq.h"
+
+#ifndef _IPS_RECVHDRQ_H
+#define _IPS_RECVHDRQ_H
+
+struct ips_recvhdrq;
+struct ips_recvhdrq_state;
+struct ips_epstate;
+
+/* Return codes of per-packet processing (ips_proto_process_packet et al.),
+ * telling the recvhdrq progress loop what to do next. */
+/* process current packet, continue on next packet */
+#define IPS_RECVHDRQ_CONTINUE 0
+/* process current packet, break and return to caller */
+#define IPS_RECVHDRQ_BREAK 1
+/* keep current packet, revisit the same packet next time */
+#define IPS_RECVHDRQ_REVISIT 2
+
+#define IPS_RECVHDRQ_ELEMSZ_MAX 32 /* 128 bytes */
+/* RHF generation sequence wraps back to 1 after this value
+ * (see the seq handling in ips_recvhdrq_progress). */
+#define LAST_RHF_SEQNO 13
+
+/* CCA related receive events; bit flags stored in
+ * ips_recvhdrq_event.is_congested */
+#define IPS_RECV_EVENT_FECN 0x1
+#define IPS_RECV_EVENT_BECN 0x2
+
+/* Per-packet receive event: decoded view of one header-queue entry,
+ * handed to packet processing and to the recvq callbacks. */
+struct ips_recvhdrq_event {
+ struct ips_proto *proto;
+ const struct ips_recvhdrq *recvq; /* where message received */
+ const uint32_t *rcv_hdr; /* rcv_hdr ptr */
+ const __le32 *rhf; /* receive header flags */
+ struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */
+ struct ips_epaddr *ipsaddr; /* peer ipsaddr, if available */
+ uint32_t error_flags; /* error flags */
+ uint8_t has_cksum; /* payload has cksum */
+ uint8_t is_congested; /* Packet faced congestion; bitmask of
+ * IPS_RECV_EVENT_FECN/BECN */
+ uint16_t ptype; /* packet type */
+};
+
+/* Callbacks the recvhdrq invokes for packets it cannot handle itself:
+ * unknown source endpoint, a different subcontext, or an error. */
+struct ips_recvhdrq_callbacks {
+ int (*callback_packet_unknown) (const struct ips_recvhdrq_event *);
+ int (*callback_subcontext) (const struct ips_recvhdrq_event *,
+ uint32_t subcontext);
+ int (*callback_error) (struct ips_recvhdrq_event *);
+};
+
+/* Initialize a receive header queue object (recvq) and its shared state
+ * (recvq_state) over the given hdrq/egrq hardware queue parameters. */
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+ const struct ips_epstate *epstate,
+ const struct ips_proto *proto,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ const struct ips_recvhdrq_callbacks *callbacks,
+ uint32_t flags,
+ uint32_t subcontext,
+ struct ips_recvhdrq *recvq,
+ struct ips_recvhdrq_state *recvq_state);
+
+/* Drain available header-queue entries; returns PSM2_OK when at least one
+ * entry was processed, PSM2_OK_NO_PROGRESS otherwise. */
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq);
+
+psm2_error_t ips_recvhdrq_fini(struct ips_recvhdrq *recvq);
+
+/*
+ * This function is designed to implement RAPID CCA. It iterates
+ * through the recvq, checking each element for set FECN or BECN bits.
+ * In the case of finding one, the proper response is executed, and the bits
+ * are cleared.
+ */
+psm2_error_t ips_recvhdrq_scan_cca(struct ips_recvhdrq *recvq);
+
+/*
+ * Structure containing state for recvhdrq reading. This is logically
+ * part of ips_recvhdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the context. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+/* Sentinel (all bits set) meaning no eager head update is pending. */
+#define NO_EAGER_UPDATE ~0U
+struct ips_recvhdrq_state {
+ uint32_t hdrq_head; /* software copy of head */
+ uint32_t rcv_egr_index_head; /* software copy of eager index head;
+ * NO_EAGER_UPDATE when none pending */
+ uint32_t hdrq_rhf_seq; /* last seq */
+ uint32_t head_update_interval; /* Header update interval */
+ uint32_t num_hdrq_done; /* Num header queue done */
+ uint32_t egrq_update_interval; /* Eager buffer update interval */
+ uint32_t num_egrq_done; /* num eager buffer done */
+ uint32_t hdr_countdown; /* for false-egr-full tracing */
+ uint32_t hdrq_cachedlastscan; /* last element to be prescanned
+ * by ips_recvhdrq_scan_cca */
+};
+
+/*
+ * Structure to read from recvhdrq
+ */
+/* Receive header queue object: hardware queue parameters, per-context
+ * shared state, eager buffer lookup table, endpoint lookup and callbacks.
+ * See ips_recvhdrq_state for the part placed in shared memory. */
+struct ips_recvhdrq {
+ struct ips_proto *proto;
+ const psmi_context_t *context; /* error handling, epid id, etc. */
+ struct ips_recvhdrq_state *state;
+ uint32_t context_flags; /* derived from base_info.spi_runtime_flags */
+ uint32_t subcontext; /* messages that don't match subcontext call
+ * recv_callback_subcontext */
+
+ /* Header queue handling */
+ pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */
+ uint32_t hdrq_rhf_off; /* rhf offset */
+ int hdrq_rhf_notail; /* rhf notail enabled */
+ uint32_t hdrq_elemlast; /* last element precomputed */
+ struct ips_recvq_params hdrq;
+
+ /* Eager queue handling */
+ void **egrq_buftable; /* table of eager idx-to-ptr */
+ struct ips_recvq_params egrq;
+
+ /* Lookup endpoints epid -> ptladdr (rank)) */
+ const struct ips_epstate *epstate;
+
+ /* Callbacks to handle recvq events */
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+
+ /* List of flows with pending acks for receive queue */
+ SLIST_HEAD(pending_flows, ips_flow) pending_acks;
+
+ uint32_t runtime_flags;
+ volatile __u64 *spi_status;
+};
+
+/* Nonzero when no unprocessed header-queue entry is available.  In no-tail
+ * mode the RHF generation sequence at the software head is compared with
+ * the expected sequence; otherwise the DMA'd tail register is compared
+ * with the software head copy. */
+PSMI_INLINE(int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq))
+{
+ if (recvq->hdrq_rhf_notail) /* use rhf-based reads */
+ return recvq->state->hdrq_rhf_seq !=
+ hfi_hdrget_seq(recvq->hdrq.base_addr +
+ recvq->state->hdrq_head +
+ recvq->hdrq_rhf_off);
+ else
+ return ips_recvq_tail_get(&recvq->hdrq) ==
+ recvq->state->hdrq_head;
+}
+
+/* Return a pointer to the packet's eager payload, or NULL when the RHF
+ * says no eager buffer was used for this packet. */
+PSMI_INLINE(
+void *
+ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
+{
+ /* XXX return NULL if no eager buffer allocated */
+ if (hfi_hdrget_use_egrbfr(rcv_ev->rhf))
+ /* RHF offset is scaled by 64 here, i.e. presumably expressed
+ * in 64-byte units -- TODO confirm against the HFI RHF spec. */
+ return ips_recvq_egr_index_2_ptr(rcv_ev->recvq->egrq_buftable,
+ hfi_hdrget_egrbfr_index
+ (rcv_ev->rhf),
+ hfi_hdrget_egrbfr_offset
+ (rcv_ev->rhf) * 64);
+ else
+ return NULL;
+}
+
+/* Payload length in bytes: the RHF total length minus the protocol header,
+ * the HFI CRC, and the optional software checksum trailer. */
+PSMI_INLINE(
+uint32_t
+ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0;
+
+ return hfi_hdrget_length_in_bytes(rcv_ev->rhf) -
+ (sizeof(struct ips_message_header) +
+ HFI_CRC_SIZE_IN_BYTES + cksum_len);
+ /* PSM does not use bth0.PadCnt, it figures out real datalen other way */
+}
+
+/* Spinlock wrappers around hdrq_lock.  Note the inverted convention:
+ * pthread_spin_* return 0 on success, so these return 1 on success. */
+PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_trylock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_lock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq))
+{
+ int ret = pthread_spin_unlock(&recvq->hdrq_lock);
+ return !ret;
+}
+
+#endif /* _IPS_RECVHDRQ_H */
diff --git a/ptl_ips/ips_recvq.c b/ptl_ips/ips_recvq.c
new file mode 100644
index 0000000..55b702c
--- /dev/null
+++ b/ptl_ips/ips_recvq.c
@@ -0,0 +1,91 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_recvq.h"
+
+/* We return a table of pointer indexes.
+ *
+ * From the point of view of the returned pointer, index -1 always points to
+ * the address to call psmi_free on (since we force page-alignment).
+ */
+/* Build the eager index -> buffer pointer table: entry i points to
+ * baseptr + i * bufsize.  Returns NULL on allocation failure.  Free with
+ * ips_recvq_egrbuf_table_free(), which recovers the raw allocation from
+ * the back-pointer stashed at index -1. */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr,
+ uint32_t bufnum, uint32_t bufsize)
+{
+ unsigned i;
+ void *ptr_alloc;
+ uintptr_t *buft;
+ uintptr_t base = (uintptr_t) baseptr;
+
+ /* One extra page so the table can be pushed up to a page boundary,
+ * plus one extra slot for the back-pointer at buft[-1]. */
+ ptr_alloc = psmi_malloc(ep, UNDEFINED,
+ PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum +
+ 1));
+ if (ptr_alloc == NULL)
+ return NULL;
+ /* First pointer is to the actual allocated address, so we can free it but
+ * buft[0] is first on the page boundary
+ */
+ /* NOTE(review): 'ptr_alloc + 1' is void* arithmetic (GCC extension,
+ * +1 byte); the page alignment guarantees >= sizeof(uintptr_t) bytes
+ * of slack before buft for the buft[-1] back-pointer. */
+ buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc + 1, PSMI_PAGESIZE);
+ buft[-1] = (uintptr_t) ptr_alloc;
+ for (i = 0; i < bufnum; i++)
+ buft[i] = (uintptr_t) ((char *)base + i * bufsize);
+ return (void **)buft;
+}
+
+/* Free a table returned by ips_recvq_egrbuf_table_alloc() by recovering
+ * the original allocation pointer stored at index -1. */
+void ips_recvq_egrbuf_table_free(void **buftable)
+{
+ uintptr_t *buft = (uintptr_t *) buftable;
+ void *ptr_alloc = (void *)buft[-1];
+ psmi_free(ptr_alloc);
+}
diff --git a/ptl_ips/ips_recvq.h b/ptl_ips/ips_recvq.h
new file mode 100644
index 0000000..3236da6
--- /dev/null
+++ b/ptl_ips/ips_recvq.h
@@ -0,0 +1,124 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_RECVQ_H
+#define _IPS_RECVQ_H
+
+#include "psm_user.h"
+
+/* Hardware receive queue descriptor shared by the header (hdrq) and
+ * eager (egrq) queues. */
+struct ips_recvq_params {
+ volatile __le64 *tail_register; /* location of tail */
+ volatile __le64 *head_register; /* location of head */
+ uint32_t *base_addr; /* base address of q */
+ uint32_t elemsz; /* size of q elements (in words) */
+ uint32_t elemcnt; /* num of q elements; used as the entry
+ * count (e.g. the modulo in the
+ * eager-full checks), not words */
+};
+
+/*
+ * Tables to map eager indexes into their buffer addresses
+ *
+ * If function returns NULL, no memory has been allocated and the error handler
+ * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY.
+ */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep,
+ void *base, uint32_t bufnum,
+ uint32_t bufsize);
+void ips_recvq_egrbuf_table_free(void **buftable);
+
+/*
+ * Accessor inlines for reading and writing to hdrq/egrq registers
+ */
+/* Translate an eager buffer index + byte offset into a payload pointer
+ * using the table built by ips_recvq_egrbuf_table_alloc(). */
+PSMI_ALWAYS_INLINE(
+void *
+ips_recvq_egr_index_2_ptr(void **egrq_buftable, int index, int offset))
+{
+ return (void *)((char *)egrq_buftable[index] + offset);
+}
+
+/* Register accessors: values are little-endian 64-bit; the two "get"
+ * variants issue a read barrier (ips_rmb) after the load, while the
+ * "update" variants are plain stores. */
+PSMI_INLINE(
+void
+ips_recvq_head_update(const struct ips_recvq_params *recvq, uint64_t newhead))
+{
+ *recvq->head_register = __cpu_to_le64(newhead);
+ return;
+}
+
+PSMI_INLINE(
+uint64_t
+ips_recvq_head_get(const struct ips_recvq_params *recvq))
+{
+ uint64_t res = __le64_to_cpu(*recvq->head_register);
+ ips_rmb();
+ return res;
+}
+
+PSMI_INLINE(
+void
+ips_recvq_tail_update(const struct ips_recvq_params *recvq, uint64_t newtail))
+{
+ *recvq->tail_register = __cpu_to_le64(newtail);
+ return;
+}
+
+PSMI_INLINE(
+uint64_t
+ips_recvq_tail_get(const struct ips_recvq_params *recvq))
+{
+ uint64_t res = __le64_to_cpu(*recvq->tail_register);
+ ips_rmb();
+ return res;
+}
+
+#endif /* _IPS_RECVQ_H */
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
new file mode 100644
index 0000000..0dbae1e
--- /dev/null
+++ b/ptl_ips/ips_scb.c
@@ -0,0 +1,364 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ips_proto.h"
+#include "ips_scb.h"
+#include "ips_proto_internal.h"
+
+/* Initialize a send-control-block (scb) pool: numscb scbs, an optional
+ * pool of numbufs send buffers of bufsize bytes, an optional per-scb
+ * immediate buffer of imm_size bytes, and one ack + one send timer per
+ * scb.  scb_avail_callback/scb_avail_context are invoked when scbs become
+ * available again (see ips_scbctrl_free).  Returns PSM2_OK on success or
+ * PSM2_NO_MEMORY.
+ * NOTE(review): on failure, allocations made before the failing one are
+ * not released here -- presumably the caller is expected to run
+ * ips_scbctrl_fini(); confirm. */
+psm2_error_t
+ips_scbctrl_init(const psmi_context_t *context,
+ uint32_t numscb, uint32_t numbufs,
+ uint32_t imm_size, uint32_t bufsize,
+ ips_scbctrl_avail_callback_fn_t scb_avail_callback,
+ void *scb_avail_context, struct ips_scbctrl *scbc)
+{
+ int i;
+ struct ips_scb *scb;
+ size_t scb_size;
+ size_t alloc_sz;
+ uintptr_t base, imm_base;
+ psm2_ep_t ep = context->ep;
+ /* scbc->context = context; */
+ psm2_error_t err = PSM2_OK;
+
+ psmi_assert_always(numscb > 0);
+ scbc->sbuf_num = scbc->sbuf_num_cur = numbufs;
+ SLIST_INIT(&scbc->sbuf_free);
+ scbc->sbuf_buf_size = bufsize;
+ scbc->sbuf_buf_base = NULL;
+ scbc->sbuf_buf_alloc = NULL;
+ scbc->sbuf_buf_last = NULL;
+
+ /* send buffers are not mandatory but when allocating them, make sure they
+ * are on a page boundary */
+ if (numbufs > 0) {
+ struct ips_scbbuf *sbuf;
+ int redzone = PSM_VALGRIND_REDZONE_SZ;
+
+ /* If the allocation requested is a page and we have redzones we have
+ * to allocate 2 pages so we end up using a redzone of 2048 bytes.
+ *
+ * if the allocation is not 4096, we relax that requirement and keep
+ * the redzones PSM_VALGRIND_REDZONE_SZ
+ */
+ if (redzone > 0 && bufsize % PSMI_PAGESIZE == 0)
+ redzone = PSMI_PAGESIZE / 2;
+ bufsize += 2 * redzone;
+ bufsize = PSMI_ALIGNUP(bufsize, 64);
+
+ alloc_sz = numbufs * bufsize + redzone + PSMI_PAGESIZE;
+ scbc->sbuf_buf_alloc =
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->sbuf_buf_alloc == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ base = (uintptr_t) scbc->sbuf_buf_alloc;
+ base = PSMI_ALIGNUP(base + redzone, PSMI_PAGESIZE);
+ scbc->sbuf_buf_base = (void *)base;
+ scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1));
+ _HFI_VDBG
+ ("sendbufs=%d, (redzone=%d|size=%d|redzone=%d),base=[%p..%p)\n",
+ numbufs, redzone, bufsize - 2 * redzone, redzone,
+ (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last);
+
+ /* Carve the allocation into buffers and push each onto the
+ * free list (the list link lives inside the buffer itself). */
+ for (i = 0; i < numbufs; i++) {
+ sbuf = (struct ips_scbbuf *)(base + bufsize * i);
+ SLIST_NEXT(sbuf, next) = NULL;
+ SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next);
+ }
+
+ VALGRIND_CREATE_MEMPOOL(scbc->sbuf_buf_alloc, 0,
+ /* Should be undefined but we stuff a next
+ * pointer in the buffer */
+ PSM_VALGRIND_MEM_DEFINED);
+ }
+
+ /* Optional immediate-payload area: one 64-byte-aligned slot per scb,
+ * used by ips_scbctrl_bufalloc for small payloads. */
+ imm_base = 0;
+ scbc->scb_imm_size = imm_size;
+ if (scbc->scb_imm_size) {
+ scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64);
+ alloc_sz = numscb * scbc->scb_imm_size + 64;
+ scbc->scb_imm_buf =
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->scb_imm_buf == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64);
+ } else
+ scbc->scb_imm_buf = NULL;
+
+ scbc->scb_num = scbc->scb_num_cur = numscb;
+ SLIST_INIT(&scbc->scb_free);
+ scb_size = sizeof(struct ips_scb) + 2 * PSM_VALGRIND_REDZONE_SZ;
+ scb_size = PSMI_ALIGNUP(scb_size, 64);
+ alloc_sz = numscb * scb_size + PSM_VALGRIND_REDZONE_SZ + 64;
+ scbc->scb_base = (void *)
+ psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+ if (scbc->scb_base == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+ base = (uintptr_t) scbc->scb_base;
+ base = PSMI_ALIGNUP(base + PSM_VALGRIND_REDZONE_SZ, 64);
+
+ /*
+ * Allocate ack/send timer for each scb object.
+ */
+ scbc->timers = (struct psmi_timer *)
+ psmi_calloc(ep, UNDEFINED, 2*numscb,
+ sizeof(struct psmi_timer));
+ if (scbc->timers == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ for (i = 0; i < numscb; i++) {
+ scb = (struct ips_scb *)(base + i * scb_size);
+ scb->scbc = scbc;
+ if (scbc->scb_imm_buf)
+ scb->imm_payload =
+ (void *)(imm_base + (i * scbc->scb_imm_size));
+ else
+ scb->imm_payload = NULL;
+
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+ /*
+ * Initialize timers.
+ * Associate the timers to each scb, the association is
+ * not fixed because later PSM may exchange the timers
+ * between scb, the reason for exchanging is that the
+ * timer is currently using by flow, but the scb is to
+ * be freed. see ack/nak processing in file ips_prot_recv.c
+ */
+ scb->timer_ack = &scbc->timers[2*i];
+ psmi_timer_entry_init(scb->timer_ack,
+ ips_proto_timer_ack_callback, scb);
+
+ scb->timer_send = &scbc->timers[2*i+1];
+ psmi_timer_entry_init(scb->timer_send,
+ ips_proto_timer_send_callback, scb);
+ }
+ scbc->scb_avail_callback = scb_avail_callback;
+ scbc->scb_avail_context = scb_avail_context;
+
+ /* It would be nice to mark the scb as undefined but we pre-initialize the
+ * "next" pointer and valgrind would see this as a violation.
+ */
+ VALGRIND_CREATE_MEMPOOL(scbc, PSM_VALGRIND_REDZONE_SZ,
+ PSM_VALGRIND_MEM_DEFINED);
+
+ /* Fall-through exit: reached on success too, with err == PSM2_OK. */
+fail:
+ return err;
+}
+
+/* Release the scb pool's main allocations.
+ * NOTE(review): scbc->timers and scbc->scb_imm_buf, allocated in
+ * ips_scbctrl_init, are not freed here -- confirm they are released
+ * elsewhere or this leaks.  Also note the free/destroy ordering differs
+ * between the two branches. */
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
+{
+ if (scbc->scb_base != NULL) {
+ psmi_free(scbc->scb_base);
+ VALGRIND_DESTROY_MEMPOOL(scbc);
+ }
+ if (scbc->sbuf_buf_alloc) {
+ VALGRIND_DESTROY_MEMPOOL(scbc->sbuf_buf_alloc);
+ psmi_free(scbc->sbuf_buf_alloc);
+ }
+ return PSM2_OK;
+}
+
+/* Attach a payload buffer to scb for scb->payload_size bytes: the scb's
+ * own immediate buffer when the payload fits, otherwise a buffer from the
+ * shared send-buffer pool.  Returns 1 on success, 0 when the pool is
+ * empty.  May set IPS_SEND_FLAG_ACKREQ under buffer pressure. */
+int ips_scbctrl_bufalloc(ips_scb_t *scb)
+{
+ struct ips_scbctrl *scbc = scb->scbc;
+
+ psmi_assert(scbc->sbuf_num > 0);
+ /* scb must not already hold a pool buffer */
+ psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+ (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)));
+ psmi_assert(scb->payload_size <= scbc->sbuf_buf_size);
+
+ if (scb->payload_size <= scbc->scb_imm_size) {
+ /* Attach immediate buffer */
+ ips_scb_buffer(scb) = scb->imm_payload;
+ return 1;
+ }
+
+ if (SLIST_EMPTY(&scbc->sbuf_free))
+ return 0;
+ else {
+ psmi_assert(scbc->sbuf_num_cur);
+ ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free);
+ scbc->sbuf_num_cur--;
+
+ /* If under memory pressure request ACK for packet to reclaim
+ * credits.
+ */
+ if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ VALGRIND_MEMPOOL_ALLOC(scbc->sbuf_buf_alloc, ips_scb_buffer(scb),
+ scb->payload_size);
+ SLIST_REMOVE_HEAD(&scbc->sbuf_free, next);
+ return 1;
+ }
+}
+
+/* Nonzero when both an scb and a send buffer are available. */
+int ips_scbctrl_avail(struct ips_scbctrl *scbc)
+{
+ return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0);
+}
+
+/* Allocate up to scbnum scbs and return them as a singly-linked list
+ * (most recently allocated first); NULL when none could be allocated.
+ * With IPS_SCB_FLAG_ADD_BUFFER, each scb also gets a payload buffer for
+ * len bytes; allocation stops early if scbs or buffers run out.
+ * NOTE(review): when ips_scbctrl_bufalloc fails, the current scb stays on
+ * the free list but its VALGRIND_MEMPOOL_ALLOC has already run -- confirm
+ * the valgrind bookkeeping tolerates this. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len,
+ uint32_t flags)
+{
+ ips_scb_t *scb, *scb_head = NULL;
+
+ psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? (scbc->sbuf_num > 0) : 1);
+ psmi_assert(scbc->sbuf_buf_size >= len);
+
+ while (scbnum--) {
+ if (SLIST_EMPTY(&scbc->scb_free))
+ break;
+ scb = SLIST_FIRST(&scbc->scb_free);
+ scb->flags = 0; /* Need to set this here as bufalloc may request
+ * an ACK under memory pressure
+ */
+ VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb));
+
+ if (flags & IPS_SCB_FLAG_ADD_BUFFER) {
+ scb->payload_size = len;
+ if (!ips_scbctrl_bufalloc(scb))
+ break;
+ } else {
+ ips_scb_buffer(scb) = NULL;
+ scb->payload_size = 0;
+ }
+
+ scb->tidsendc = NULL;
+ scb->callback = NULL;
+ scb->tidctrl = 0;
+ scb->nfrag = 1;
+ scb->frag_size = 0;
+#ifdef PSM_CUDA
+ scb->mq_req = NULL;
+#endif
+
+ /* Under scb pressure, request an ACK to reclaim credits sooner. */
+ scbc->scb_num_cur--;
+ if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+
+ SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+ SLIST_NEXT(scb, next) = scb_head;
+ scb_head = scb;
+ }
+ return scb_head;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc);
+
+/* Return an scb (and, if it came from the shared pool, its payload
+ * buffer) to the free lists.  Fires scb_avail_callback exactly when the
+ * scb free list transitions from empty to non-empty. */
+void ips_scbctrl_free(ips_scb_t *scb)
+{
+ struct ips_scbctrl *scbc = scb->scbc;
+ /* Only buffers inside the pool range are returned; immediate buffers
+ * belong to the scb itself and are not freed. */
+ if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+ (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) {
+ scbc->sbuf_num_cur++;
+ SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next);
+ VALGRIND_MEMPOOL_FREE(scbc->sbuf_buf_alloc, ips_scb_buffer(scb));
+ }
+
+ ips_scb_buffer(scb) = NULL;
+ scb->tidsendc = NULL;
+ scb->payload_size = 0;
+ scbc->scb_num_cur++;
+ if (SLIST_EMPTY(&scbc->scb_free)) {
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+ if (scbc->scb_avail_callback != NULL)
+ scbc->scb_avail_callback(scbc, scbc->scb_avail_context);
+ } else
+ SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+ VALGRIND_MEMPOOL_FREE(scbc, scb);
+ return;
+}
+
+/* Fast-path allocation of a single scb with no payload buffer attached
+ * (for tiny/control messages).  Returns NULL when the pool is exhausted.
+ * May set IPS_SEND_FLAG_ACKREQ under scb pressure. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc)
+{
+ ips_scb_t *scb;
+ if (SLIST_EMPTY(&scbc->scb_free))
+ return NULL;
+ scb = SLIST_FIRST(&scbc->scb_free);
+
+ VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb));
+ SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+ SLIST_NEXT(scb, next) = NULL;
+
+ ips_scb_buffer(scb) = NULL;
+ scb->payload_size = 0;
+ scb->flags = 0;
+ scb->tidsendc = NULL;
+ scb->callback = NULL;
+ scb->tidctrl = 0;
+ scb->nfrag = 1;
+ scb->frag_size = 0;
+#ifdef PSM_CUDA
+ scb->mq_req = NULL;
+#endif
+
+ scbc->scb_num_cur--;
+ if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+ scb->flags |= IPS_SEND_FLAG_ACKREQ;
+ return scb;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny);
diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h
new file mode 100644
index 0000000..62a509b
--- /dev/null
+++ b/ptl_ips/ips_scb.h
@@ -0,0 +1,226 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_SCB_H
+#define _IPS_SCB_H
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ips_proto_header.h"
+
+/* ips_alloc_scb flags */
+#define IPS_SCB_FLAG_NONE 0x0
+#define IPS_SCB_FLAG_ADD_BUFFER 0x1
+
+/* macros to update scb */
+/* These accessors expand to lvalues: callers both read and assign
+ * through them (e.g. "ips_scb_buffer(scb) = NULL").  Note the scb
+ * argument is expanded unparenthesized, so pass a plain pointer
+ * expression, not a compound one. */
+#define ips_scb_hdrdata(scb) scb->ips_lrh.hdr_data
+#define ips_scb_uwords(scb) scb->ips_lrh.data
+#define ips_scb_opcode(scb) scb->opcode
+#define ips_scb_buffer(scb) scb->payload
+#define ips_scb_length(scb) scb->payload_size
+#define ips_scb_flags(scb) scb->flags
+#define ips_scb_dma_cntr(scb) scb->dma_cntr
+#define ips_scb_epaddr(scb) scb->epaddr
+#define ips_scb_cb(scb) scb->callback
+#define ips_scb_cb_param(scb) scb->cb_param
+
+#define ips_scb_copy_tag(dst, src) \
+ (dst)[0] = (src)[0]; \
+ (dst)[1] = (src)[1]; \
+ (dst)[2] = (src)[2];
+
+struct ips_scbbuf;
+struct ips_scb;
+struct ips_scbctrl;
+struct ips_tid_send_desc;
+
+/* Invoked by ips_scbctrl_free() when scbs become available again. */
+typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *,
+ void *context);
+
+STAILQ_HEAD(ips_scb_stailq, ips_scb);
+SLIST_HEAD(ips_scb_slist, ips_scb);
+
+/*
+ * Pool of send control blocks (scbs) plus optional bounce buffers.
+ * One scb is consumed per send via ips_scbctrl_alloc*() and returned
+ * via ips_scbctrl_free(); an optional callback fires when the scb
+ * free list refills after exhaustion.
+ */
+struct ips_scbctrl {
+ /* const psmi_context_t *context; */
+
+ /* Send control blocks for each send */
+ uint32_t scb_num; /* total scbs owned by this pool */
+ uint32_t scb_num_cur; /* scbs currently on the free list */
+ SLIST_HEAD(scb_free, ips_scb) scb_free; /* free scbs */
+ void *scb_base; /* NOTE(review): presumably base of the scb array allocation — confirm in ips_scb.c */
+ ips_scbctrl_avail_callback_fn_t scb_avail_callback; /* fired on free-list refill */
+ void *scb_avail_context; /* opaque arg for scb_avail_callback */
+
+ /* Immediate data for send buffers */
+ uint32_t scb_imm_size; /* NOTE(review): presumably bytes of immediate buffer per scb — confirm */
+ void *scb_imm_buf;
+ psmi_timer *timers; /* ack/send timers */
+
+ /*
+ * Send buffers (or bounce buffers) to keep user data if we need to
+ * retransmit.
+ */
+ uint32_t sbuf_num; /* total bounce buffers (0 = no bounce pool) */
+ uint32_t sbuf_num_cur; /* bounce buffers currently free */
+ SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; /* free bounce buffers */
+ void *sbuf_buf_alloc; /* raw allocation; also the valgrind mempool handle */
+ uint32_t sbuf_buf_size;
+ void *sbuf_buf_base; /* first valid bounce-buffer address (ownership test) */
+ void *sbuf_buf_last; /* last valid bounce-buffer address (ownership test) */
+};
+
+/* A free bounce buffer; the list linkage lives inside the buffer itself. */
+struct ips_scbbuf {
+ SLIST_ENTRY(ips_scbbuf) next;
+};
+
+typedef struct ips_scb ips_scb_t;
+
+/*
+ * Send control block: per-send state for one packet (or a multi-packet
+ * fragment sequence when nfrag > 1), ending with the cache-aligned
+ * PBC + packet header that is written to the hardware.
+ */
+struct ips_scb {
+ union { /* free-list vs queue linkage — one at a time */
+ SLIST_ENTRY(ips_scb) next;
+ STAILQ_ENTRY(ips_scb) nextq;
+ };
+ union { /* payload doubles as the bounce-buffer free node on release */
+ void *payload;
+ struct ips_scbbuf *sbuf;
+ };
+ uint64_t ack_timeout; /* in cycles */
+ uint64_t abs_timeout; /* in cycles */
+
+ psmi_timer *timer_send; /* for sending packets */
+ psmi_timer *timer_ack; /* for acking packets */
+
+ /* Used when composing packet */
+ psmi_seqnum_t seq_num;
+ uint32_t cksum[2];
+ uint32_t flags; /* IPS_SEND_FLAG_* (e.g. ACKREQ) */
+ uint32_t payload_size; /* remaining first packet size */
+ uint32_t chunk_size; /* total buffer size if nfrag > 1 */
+ /* initially chunk_size_remaining = chunk_size. */
+ uint32_t chunk_size_remaining; /* buffer size to re-transmit */
+ uint16_t nfrag; /* total packets in sequence */
+ /* initially nfrag_remaining = nfrag */
+ uint16_t nfrag_remaining; /* number packets to re-transmit */
+ uint16_t dma_complete;
+ uint16_t tidctrl;
+ uint16_t frag_size; /* max packet size in sequence */
+ uint16_t opcode;
+
+ struct ips_flow *flow;
+ struct ips_tid_send_desc *tidsendc;
+ uint32_t *tsess;
+ uint16_t tsess_length;
+
+ struct ips_scbctrl *scbc; /* owning pool; used by ips_scbctrl_free */
+ void *imm_payload;
+
+ union { /* completion notification — callback or AM completion */
+ int (*callback) (void *, uint32_t);
+ psm2_am_completion_fn_t completion_am;
+ };
+ void *cb_param;
+#ifdef PSM_CUDA
+ psm2_mq_req_t mq_req; /* back pointer to original request */
+#endif
+
+ /* sdma header place holder, PSM2 code should access
+ * the sdma_req_info only using the psmi_get_sdma_req_info()
+ * accessor function. */
+ /*
+ * The size of struct sdma_req_info is variable. (10 bytes for
+ * GPU-direct and 8 bytes for non GPU-Direct)
+ * When GPU-Direct feature is used, all 10 bytes of the space is used.
+ * Otherwise, we only use upto 8 bytes. The usage is controlled by
+ * psmi_get_sdma_req_info() in ips_proto.h
+ */
+ union {
+ struct sdma_req_info _DO_NOT_USE_;
+ struct sdma_req_info_v6_3 _PLEASE_DO_NOT_USE_;
+ };
+ /* On-the-wire prefix: PBC immediately followed by the packet header. */
+ struct {
+ struct hfi_pbc pbc;
+ struct ips_message_header ips_lrh;
+ } PSMI_CACHEALIGN;
+};
+
+#ifdef PSM_CUDA
+#define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL)
+/* In case we need to be more precise about scb's locality
+ * we can expand the macro in place, e.g.
+ * #define IS_TRANSFER_BUF_GPU_MEM(scb) (scb->mq_req != NULL && \
+ * scb->mq_req->is_buf_gpu_mem && \
+ * !scb->mq_req->cuda_hostbuf_used)
+ */
+#endif
+
+/* Release an scb (and any pool-owned bounce buffer) back to its
+ * scbctrl; may fire the pool's availability callback. */
+void ips_scbctrl_free(ips_scb_t *scb);
+/* NOTE(review): presumably attaches a send buffer to an scb allocated
+ * without one — confirm against the definition in ips_scb.c. */
+int ips_scbctrl_bufalloc(ips_scb_t *scb);
+/* NOTE(review): presumably reports whether scbs/buffers are currently
+ * available — confirm against the definition in ips_scb.c. */
+int ips_scbctrl_avail(struct ips_scbctrl *scbc);
+/* Allocate scbs chained via SLIST_NEXT; flags (IPS_SCB_FLAG_*) and
+ * len presumably control attached-buffer allocation.  Mockable. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc,
+ int scbnum, int len, uint32_t flags);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc);
+/* Allocate one scb with no payload buffer; NULL when exhausted. */
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny);
+
+/* Initialize a pool of numscb scbs and numbufs bounce buffers of
+ * bufsize bytes (imm_size bytes of immediate data per scb); the
+ * callback fires when scbs become available after exhaustion. */
+psm2_error_t ips_scbctrl_init(const psmi_context_t *context,
+ uint32_t numscb, uint32_t numbufs,
+ uint32_t imm_size, uint32_t bufsize,
+ ips_scbctrl_avail_callback_fn_t,
+ void *avail_context, struct ips_scbctrl *);
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *);
+
+/* NOTE(review): presumably writes the scbs on slist to fd — confirm
+ * at the definition. */
+psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd);
+
+#endif /* _IPS_SCB_H */
diff --git a/ptl_ips/ips_spio.c b/ptl_ips/ips_spio.c
new file mode 100644
index 0000000..944ebf5
--- /dev/null
+++ b/ptl_ips/ips_spio.c
@@ -0,0 +1,951 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/* included header files */
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sched.h>
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_spio.h"
+#include "ipserror.h" /* ips error codes */
+#include "ips_proto_params.h"
+
+/* Report PIO stalls every 20 seconds at the least */
+#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9))
+#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */
+/* MAX_CONSECUTIVE_SEND_FAIL has to be a multiple of RESYNC_CONSECUTIVE */
+#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */
+
+static void spio_report_stall(struct ips_spio *ctrl,
+ uint64_t t_cyc_now, uint64_t send_failures);
+
+static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures);
+
+static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl);
+static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl);
+static psm2_error_t spio_credit_return_update(struct ips_spio *ctrl);
+static psm2_error_t spio_credit_return_update_shared(struct ips_spio *ctrl);
+
+/*
+ * Initialize the PIO send engine for one context.
+ *
+ * Maps the driver-exported credit-return word, PIO buffer bases and
+ * event word; clears stall accounting; sets up the (possibly
+ * context-shared) spio control state; and selects the widest PIO
+ * block-copy routine the executing CPU supports.
+ *
+ * Returns PSM2_OK, or PSM2_NO_MEMORY if the private control structure
+ * cannot be allocated.
+ */
+psm2_error_t
+ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
+ struct ips_spio *ctrl)
+{
+ const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ cpuid_t id;
+ int i;
+
+ ctrl->ptl = ptl;
+ ctrl->context = context;
+ /* Copy runtime flags */
+ ctrl->runtime_flags = ptl->runtime_flags;
+ ctrl->unit_id = context->ep->unit_id;
+ ctrl->portnum = context->ep->portnum;
+
+ /* Addresses exported by the driver for this send context. */
+ pthread_spin_init(&ctrl->spio_lock, PTHREAD_PROCESS_PRIVATE);
+ ctrl->spio_credits_addr =
+ (__le64 *) (ptrdiff_t) base_info->sc_credits_addr;
+ ctrl->spio_bufbase_sop =
+ (uint64_t *) (ptrdiff_t) base_info->pio_bufbase_sop;
+ ctrl->spio_bufbase =
+ (uint64_t *) (ptrdiff_t) base_info->pio_bufbase;
+ ctrl->spio_event = (uint64_t *) (ptrdiff_t) base_info->events_bufbase;
+
+ /* Stall accounting starts clean. */
+ ctrl->spio_consecutive_failures = 0;
+ ctrl->spio_num_stall = 0ULL;
+ ctrl->spio_num_stall_total = 0ULL;
+ ctrl->spio_next_stall_warning = 0ULL;
+ ctrl->spio_last_stall_cyc = 0ULL;
+ ctrl->spio_init_cyc = get_cycles();
+
+ ctrl->spio_total_blocks = ctxt_info->credits;
+ ctrl->spio_block_index = 0;
+
+ /* context->spio_ctrl is non-NULL only with context sharing, where it
+ * points at shared state; otherwise allocate a private control struct
+ * and use the non-shared (no-lock) routine variants. */
+ ctrl->spio_ctrl = (struct ips_spio_ctrl *)context->spio_ctrl;
+ if (!ctrl->spio_ctrl) {
+ ctrl->spio_ctrl = (volatile struct ips_spio_ctrl *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_spio_ctrl));
+ if (ctrl->spio_ctrl == NULL) {
+ return PSM2_NO_MEMORY;
+ }
+
+ ctrl->spio_reset_hfi = spio_reset_hfi;
+ ctrl->spio_credit_return_update =
+ spio_credit_return_update;
+ } else {
+ ctrl->spio_reset_hfi = spio_reset_hfi_shared;
+ ctrl->spio_credit_return_update =
+ spio_credit_return_update_shared;
+ }
+
+ /*
+ * Only the master process can initialize.
+ */
+ if (ctxt_info->subctxt == 0) {
+ pthread_spin_init(&ctrl->spio_ctrl->spio_ctrl_lock,
+ PTHREAD_PROCESS_SHARED);
+
+ ctrl->spio_ctrl->spio_write_in_progress = 0;
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ ctrl->spio_ctrl->spio_frozen_count = 0;
+
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks;
+ ctrl->spio_ctrl->spio_block_index = 0;
+ ctrl->spio_ctrl->spio_fill_counter = 0;
+
+ psmi_assert(SPIO_CREDITS_Counter
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+ psmi_assert(SPIO_CREDITS_Status
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+
+ /* Seed the credit-return shadow from the hardware word. */
+ ctrl->spio_ctrl->spio_credits.credit_return =
+ *ctrl->spio_credits_addr;
+ }
+
+ /*
+ * Setup the PIO block copying routine.
+ */
+ /* Candidate table indexed by store width: [0]=8B, [1]=16B (SSE2),
+ * [2]=32B (AVX2), [3]=64B (AVX-512F).  Entries exist only when the
+ * compile target enables them; the CPUID checks below then pick the
+ * widest routine the executing CPU actually supports. */
+ ctrl->spio_blockcpy_selected = NULL;
+ ctrl->spio_blockcpy_routines[0] = hfi_pio_blockcpy_64;
+
+ ctrl->spio_blockcpy_routines[1] = NULL;
+#ifdef __SSE2__
+ ctrl->spio_blockcpy_routines[1] = hfi_pio_blockcpy_128;
+#endif
+ ctrl->spio_blockcpy_routines[2] = NULL;
+#ifdef __AVX2__
+ ctrl->spio_blockcpy_routines[2] = hfi_pio_blockcpy_256;
+#endif
+ ctrl->spio_blockcpy_routines[3] = NULL;
+#ifdef __AVX512F__
+ ctrl->spio_blockcpy_routines[3] = hfi_pio_blockcpy_512;
+#endif
+
+ get_cpuid(0x7, 0, &id);
+ if (id.ebx & (1<<AVX512F_BIT)) {
+ /* avx512f supported */
+ for (i = 3; i>= 0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else if (id.ebx & (1<<AVX2_BIT)) {
+ /* 32B copying supported */
+ for (i = 2; i >=0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else {
+ get_cpuid(0x1, 0, &id);
+ if (id.edx & (1<<SSE2_BIT)) {
+ /* 16B copying supported */
+ for (i = 1; i >=0; i--) {
+ if (ctrl->spio_blockcpy_routines[i]) {
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[i];
+ break;
+ }
+ }
+ } else {
+ /* use 8B copying */
+ ctrl->spio_blockcpy_selected =
+ ctrl->spio_blockcpy_routines[0];
+ }
+ }
+ psmi_assert(ctrl->spio_blockcpy_selected != NULL);
+
+#ifdef PSM_CUDA
+ /* Host staging buffer: GPU payloads are first copied here, then
+ * PIO-copied to the HFI (see ips_spio_transfer_frame). */
+ if (PSMI_IS_CUDA_ENABLED) {
+ PSMI_CUDA_CALL(cudaHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+ 10240 /* Max MTU */, cudaHostAllocPortable);
+ }
+#endif
+
+ _HFI_PRDBG("ips_spio_init() done\n");
+
+ return PSM2_OK;
+}
+
+/*
+ * Tear down the PIO send engine: release the CUDA staging buffer (if
+ * enabled), log a final stall summary, and free the control struct
+ * only when it was privately allocated (context sharing points it at
+ * shared state that is not freed here).
+ */
+psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ PSMI_CUDA_CALL(cudaFreeHost, (void *) ctrl->cuda_pio_buffer);
+#endif
+ /* send_failures == 0 selects the summary form of the report */
+ spio_report_stall(ctrl, get_cycles(), 0ULL);
+ if (!ctrl->context->spio_ctrl)
+ psmi_free((void *)ctrl->spio_ctrl);
+ return PSM2_OK;
+}
+
+/*
+ * Log PIO-stall diagnostics.  With send_failures > 0 this reports a
+ * detailed in-progress stall (credit-state snapshot plus Tx/Rx port
+ * counters when available); with send_failures == 0 it logs the final
+ * summary (used from ips_spio_fini).  No-op until at least one stall
+ * has been counted.
+ */
+static
+void
+spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now,
+ uint64_t send_failures)
+{
+ size_t off = 0;
+ char buf[1024];
+
+ if (ctrl->spio_num_stall == 0)
+ return;
+
+ if (send_failures > 0) {
+ char bufctr[128];
+ uint64_t tx_stat, rx_stat;
+ int ret;
+
+ off = snprintf(buf, sizeof(buf) - 1,
+ "PIO Send context %d with total blocks %d , available blocks %d, "
+ "fill counter %d, free counter %d ",
+ (int)psm2_epid_context(ctrl->context->epid),
+ ctrl->spio_total_blocks,
+ ctrl->spio_ctrl->spio_available_blocks,
+ ctrl->spio_ctrl->spio_fill_counter,
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->
+ spio_credits.value));
+ /* snprintf returns the would-be length (or a negative
+ * value on error, which wraps huge as size_t); clamp
+ * before indexing so a truncated or failed format cannot
+ * write past the end of buf. */
+ if (off > sizeof(buf) - 1)
+ off = sizeof(buf) - 1;
+ buf[off] = '\0';
+
+ /* In case hfifs isn't running */
+ ret = hfi_get_single_portctr(ctrl->unit_id, ctrl->portnum,
+ "TxPkt", &tx_stat);
+ if (ret != -1) {
+ ret = hfi_get_single_portctr(ctrl->unit_id,
+ ctrl->portnum, "RxPkt",
+ &rx_stat);
+ if (ret != -1) {
+ snprintf(bufctr, sizeof(bufctr) - 1,
+ "(TxPktCnt=%llu,RxPktCnt=%llu)",
+ (unsigned long long)tx_stat,
+ (unsigned long long)rx_stat);
+ bufctr[sizeof(bufctr) - 1] = '\0';
+ } else
+ bufctr[0] = '\0';
+ } else
+ bufctr[0] = '\0';
+
+ _HFI_DBG
+ ("PIO Send Stall after at least %.2fM failed send attempts "
+ "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n",
+ send_failures / 1e6,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc),
+ (unsigned long long)ctrl->spio_num_stall,
+ bufctr[0] != '\0' ? bufctr : "", buf);
+ } else {
+ _HFI_DBG
+ ("PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs",
+ (unsigned long long)ctrl->spio_num_stall,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc));
+ }
+
+ return;
+}
+
+/*
+ * Account for one PIO send stall: bump the stall counters, emit a
+ * rate-limited report (at most every SPIO_STALL_WARNING_INTERVAL),
+ * refresh the credit-return shadow from hardware, and timestamp the
+ * stall.
+ */
+static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures)
+{
+ uint64_t t_cyc_now = get_cycles();
+
+ /* We handle the pio-stall every time but only report something every 20
+ * seconds. We print a summary at the end while closing the device */
+ ctrl->spio_num_stall++;
+ ctrl->spio_num_stall_total++;
+
+ if (ctrl->spio_next_stall_warning <= t_cyc_now) {
+ /* If context status is ok (i.e. no cables pulled or anything) */
+ if (psmi_context_check_status(ctrl->context) == PSM2_OK)
+ spio_report_stall(ctrl, t_cyc_now, send_failures);
+ ctrl->spio_next_stall_warning =
+ get_cycles() + SPIO_STALL_WARNING_INTERVAL;
+ }
+
+ /* re-initialize our shadow from the real registers; by this time,
+ * we know the hardware has to have done the update.
+ * Also, kernel check may have changed things.
+ */
+ ctrl->spio_credit_return_update(ctrl);
+
+ ctrl->spio_last_stall_cyc = t_cyc_now;
+
+ return;
+}
+
+/*
+ * A send context halt is detected in several ways:
+ * 1. during pio for normal credit return update;
+ * 2. during events process when no event;
+ * when a hfi is frozen, we recover hfi by calling this routine.
+ */
+/*
+ * Recover a halted send context: ask the driver to reset it (the call
+ * blocks until the reset completes), then reinitialize the local
+ * credit/block shadow state.  Aborts the process if resets repeat
+ * beyond IPS_CTXT_RESET_MAX.
+ */
+static void spio_reset_context(struct ips_spio *ctrl)
+{
+ /* if there are too many reset, teardown process */
+ ctrl->spio_ctrl->spio_reset_count++;
+ if (ctrl->spio_ctrl->spio_reset_count > IPS_CTXT_RESET_MAX)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Too many send context reset, teardown...\n");
+
+ /*
+ * Because there are many epaddrs and many flows using the
+ * same PIO queue, it is hard to search all the unacked
+ * queue and find the correct retry point. Instead we just
+ * let the upper level flow control to NAK the packets and
+ * do the retry from the right point.
+ */
+
+ /* Call into driver to reset send context, driver will
+ * block this routine until the send context is actually
+ * reset.
+ */
+ ips_wmb();
+ if (hfi_reset_context(ctrl->context->ctrl))
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Send context reset failed: %d.\n", errno);
+
+ /* Reset spio shared control struct. */
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks;
+ ctrl->spio_ctrl->spio_block_index = 0;
+ ctrl->spio_ctrl->spio_fill_counter = 0;
+ /* Get updated credit return again after reset. */
+ ctrl->spio_ctrl->spio_credits.credit_return =
+ *ctrl->spio_credits_addr;
+
+ psmi_assert(SPIO_CREDITS_Counter
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+ psmi_assert(SPIO_CREDITS_Status
+ (ctrl->spio_ctrl->spio_credits.value) == 0);
+}
+
+/*
+ * hfi frozen is detected when checking events from driver,
+ * psm calls to check events in the main receive loop
+ * when there is no normal traffic.
+ */
+/*
+ * Do the actual freeze recovery: receive-queue state first, then the
+ * send context, then the sdma completion queue (ordering rationale is
+ * inline).  Callers (spio_reset_hfi / spio_reset_hfi_shared) must
+ * have drained the receive header queue beforehand.
+ */
+static void spio_reset_hfi_internal(struct ips_spio *ctrl)
+{
+ struct ips_recvhdrq *recvq = &ctrl->ptl->recvq;
+ struct ips_proto *proto = (struct ips_proto *)&ctrl->ptl->proto;
+
+ /* Reset receive queue state, this must be done first
+ * because after send context reset, hardware start to
+ * receive new packets.
+ */
+ recvq->state->hdrq_head = 0;
+ recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE;
+ recvq->state->num_hdrq_done = 0;
+ recvq->state->hdr_countdown = 0;
+ if (!(recvq->runtime_flags & HFI1_CAP_DMA_RTAIL))
+ recvq->state->hdrq_rhf_seq = 1;
+
+ /* Reset send context */
+ spio_reset_context(ctrl);
+
+ /* Reset sdma completion queue, this should be done last
+ * because when send context is reset, driver will complete
+ * all the sdma requests with error code -2. This error
+ * code is ignored by PSM, but other error codes are
+ * caught inside the routine.
+ */
+ while (proto->sdma_done_index != proto->sdma_fill_index)
+ ips_proto_dma_completion_update(proto);
+}
+
+/* Freeze recovery entry point for the non-shared-context case. */
+static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl)
+{
+ /* Drain receive header queue before reset hfi, we use
+ * the main progression loop to do this so we return from
+ * here.
+ */
+ if (!ips_recvhdrq_isempty(&ctrl->ptl->recvq))
+ return PSM2_OK_NO_PROGRESS;
+
+ /* do the real reset work:
+ * 1. reset receive header queue;
+ * 2. reset send context;
+ * 3. drain sdma completion queue;
+ */
+ spio_reset_hfi_internal(ctrl);
+
+ return PSM2_OK;
+}
+
+/*
+ * There is a shared count and per process count, all initialized to
+ * zero. If a process' local count is equal to shared count, it is
+ * the first process and does the hfi reset, this process also move
+ * both counts up by one. If a process' local count is not equal to
+ * the shared count, it means other process has done the hfi reset,
+ * it just saves the shared count to local count and return. All the
+ * operation are locked by spio_ctrl_lock.
+ */
+static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl)
+{
+ volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
+
+ /* Drain receive header queue before reset hfi, we use
+ * the main progression loop to do this so we return from
+ * here. We don't reset software receive header queue.
+ */
+ if (!ips_recvhdrq_isempty(&ctrl->ptl->recvq))
+ return PSM2_OK_NO_PROGRESS;
+
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+
+ /*
+ * In context sharing mode, if there is a subcontext
+ * process in PIO writing, we need to wait till the PIO
+ * writing is done. So we spin wait here. If other
+ * process comes here and does the hfi reset, it should
+ * be perfectly fine.
+ */
+ while (ctrl->spio_ctrl->spio_write_in_progress) {
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ usleep(1000);
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ /* Local count == shared count: we are the first subcontext to see
+ * this freeze, so perform the reset and advance both counters;
+ * otherwise another process already did it — just catch up. */
+ if (ctrl->spio_frozen_count == ctrl->spio_ctrl->spio_frozen_count) {
+ ctrl->spio_frozen_count++;
+ ctrl->spio_ctrl->spio_frozen_count++;
+
+ spio_reset_hfi_internal(ctrl);
+ } else
+ ctrl->spio_frozen_count = ctrl->spio_ctrl->spio_frozen_count;
+
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+
+ return PSM2_OK;
+}
+
+/*
+ * return value:
+ * PSM2_OK: new credits updated;
+ * PSM2_OK_NO_PROGRESS: no new credits;
+ */
+/*
+ * Non-shared variant (no locking; a single process owns the send
+ * context — see ips_spio_init).  Refreshes the credit-return shadow
+ * from the hardware word and recomputes the available PIO block
+ * count; resets the send context when hardware reports it halted.
+ */
+static psm2_error_t
+spio_credit_return_update(struct ips_spio *ctrl)
+{
+ uint64_t credit_return;
+
+ credit_return = *ctrl->spio_credits_addr;
+ /* Update available blocks based on fill counter and free counter */
+ if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return)
+ return PSM2_OK_NO_PROGRESS;
+
+ ctrl->spio_ctrl->spio_credits.credit_return = credit_return;
+
+ /* If Status is set, then send context is halted */
+ if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
+ spio_reset_context(ctrl);
+ } else {
+ /*
+ * OPA1 has 1M PIO buffer, but each context can have max 64K,
+ * which is 1K 64B blocks, so the distance between fill counter
+ * and credit return counter is no more than 1024; Both fill
+ * counter and credit return counter are 11 bits value,
+ * representing range [0, 2047].
+ */
+ psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF)) <=
+ ctrl->spio_total_blocks);
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks -
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF);
+
+ /* a successful credit update, clear reset count */
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * return value:
+ * PSM2_OK: new credits updated;
+ * PSM2_OK_NO_PROGRESS: no new credits;
+ */
+/*
+ * Context-sharing variant of spio_credit_return_update(): identical
+ * credit logic, but performed under spio_ctrl_lock, and it waits for
+ * any in-progress PIO write before resetting a halted context.
+ */
+static psm2_error_t
+spio_credit_return_update_shared(struct ips_spio *ctrl)
+{
+ uint64_t credit_return;
+
+ pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);
+
+ credit_return = *ctrl->spio_credits_addr;
+ /* Update available blocks based on fill counter and free counter */
+ if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) {
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ ctrl->spio_ctrl->spio_credits.credit_return = credit_return;
+
+ /* If Status is set, then send context is halted */
+ if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
+ /*
+ * In context sharing mode, if there is a subcontext
+ * process in PIO writing, we need to wait till the PIO
+ * writing is done. So we spin wait here. Other processes
+ * won't come here because for them, there is NO new
+ * credit return change (the first 'if' check in this
+ * routine).
+ */
+ while (ctrl->spio_ctrl->spio_write_in_progress) {
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ usleep(1000);
+ pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);
+ }
+
+ spio_reset_context(ctrl);
+ } else {
+ /*
+ * OPA1 has 1M PIO buffer, but each context can have max 64K,
+ * which is 1K 64B blocks, so the distance between fill counter
+ * and credit return counter is no more than 1024; Both fill
+ * counter and credit return counter are 11 bits value,
+ * representing range [0, 2047].
+ */
+ psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF)) <=
+ ctrl->spio_total_blocks);
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks -
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF);
+
+ /* a successful credit update, clear reset count */
+ ctrl->spio_ctrl->spio_reset_count = 0;
+ }
+
+ pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
+
+ return PSM2_OK;
+}
+
+/*
+ * Check and process events
+ * return value:
+ * PSM2_OK: normal events processing;
+ * PSM2_OK_NO_PROGRESS: no event is processed;
+ */
+/*
+ * Poll the driver event word for this context and dispatch events:
+ * MMU-notifier invalidation, HFI freeze recovery, link down, and
+ * LID/LMC/SL2VL changes.  With no event pending, fall back to a
+ * credit-return update so a halted send context is still detected.
+ */
+psm2_error_t
+ips_spio_process_events(const struct ptl *ptl)
+{
+ struct ips_spio *ctrl = ptl->proto.spioc;
+ __u64 event_mask;
+
+ /*
+ * If there is no event, try do credit return update
+ * to catch send context halt.
+ */
+ if_pf(*ctrl->spio_event == 0)
+ return ctrl->spio_credit_return_update(ctrl);
+
+ /*
+ * Process mmu invalidation event, this will invalidate
+ * all caching items removed by mmu notifier.
+ */
+ if ((*ctrl->spio_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ /*
+ * driver will clear the event bit before return,
+ * PSM does not need to ack the event.
+ */
+ return ips_tidcache_invalidation(&ptl->proto.protoexp->tidc);
+ }
+
+ /* Get event mask for PSM to process */
+ event_mask = (uint64_t) *ctrl->spio_event;
+
+ /* Check if HFI is frozen */
+ if (event_mask & HFI1_EVENT_FROZEN) {
+ /* if no progress, return and retry */
+ if (ctrl->spio_reset_hfi(ctrl) != PSM2_OK)
+ return PSM2_OK_NO_PROGRESS;
+ }
+
+ /* First ack the driver the receipt of the events */
+ _HFI_VDBG("Acking event(s) 0x%" PRIx64 " to qib driver.\n",
+ (uint64_t) event_mask);
+ hfi_event_ack(ctrl->context->ctrl, event_mask);
+
+ if (event_mask & HFI1_EVENT_LINKDOWN) {
+ /* A link down event can clear the LMC and SL2VL
+ * change as those events are implicitly handled
+ * in the link up/down event handler.
+ */
+ event_mask &=
+ ~(HFI1_EVENT_LMC_CHANGE |
+ HFI1_EVENT_SL2VL_CHANGE);
+ ips_ibta_link_updown_event(&ctrl->ptl->proto);
+ _HFI_VDBG("Link down detected.\n");
+ }
+
+ if (event_mask & HFI1_EVENT_LID_CHANGE) {
+ /* Display a warning that LID change has occurred during
+ * the run. This is not supported in the current
+ * implementation and in general is bad for the SM to
+ * re-assign LIDs during a run.
+ */
+ _HFI_INFO
+ ("Warning! LID change detected during run. "
+ "Old LID: %d, New Lid: %d\n",
+ (int)PSMI_EPID_GET_LID(ctrl->context->epid),
+ (int)hfi_get_port_lid(ctrl->unit_id,
+ ctrl->portnum));
+ }
+
+ if (event_mask & HFI1_EVENT_LMC_CHANGE)
+ _HFI_INFO("Fabric LMC changed.\n");
+
+ if (event_mask & HFI1_EVENT_SL2VL_CHANGE) {
+ _HFI_INFO("SL2VL mapping changed for port.\n");
+ ips_ibta_init_sl2sc2vl_table(&ctrl->ptl->proto);
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Called periodically while PIO sends keep failing; escalates to the
+ * full stall handler once per SPIO_MAX_CONSECUTIVE_SEND_FAIL
+ * consecutive failures.  The constant is a power of two, so the
+ * modulo test below is exactly the original mask test.
+ */
+static void
+spio_handle_resync(struct ips_spio *ctrl, uint64_t consecutive_send_failed)
+{
+ /* hfi_force_pio_avail_update(ctrl->context->ctrl); */
+
+ if ((consecutive_send_failed % SPIO_MAX_CONSECUTIVE_SEND_FAIL) == 0)
+ spio_handle_stall(ctrl, consecutive_send_failed);
+}
+
+/*
+ * This function attempts to write a packet to a PIO.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Packet triggered through PIO.
+ * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ */
+psm2_error_t
+ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
+ struct hfi_pbc *pbc, uint32_t *payload,
+ uint32_t length, uint32_t isCtrlMsg,
+ uint32_t cksum_valid, uint32_t cksum
+#ifdef PSM_CUDA
+ , uint32_t is_cuda_payload
+#endif
+ )
+{
+ struct ips_spio *ctrl = proto->spioc;
+ volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
+ volatile uint64_t *pioaddr;
+ uint32_t paylen, nblks;
+ psm2_error_t err = PSM2_OK;
+ int do_lock = (ctrl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+
+ if (do_lock)
+ pthread_spin_lock(&ctrl->spio_lock);
+
+ if_pf(PSMI_FAULTINJ_ENABLED()) {
+ PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1,
+ IPS_FAULTINJ_PIOLOST);
+ PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1,
+ IPS_FAULTINJ_PIOBUSY);
+ if (psmi_faultinj_is_fault(fi_lost)) {
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return PSM2_OK;
+ } else if (psmi_faultinj_is_fault(fi_busy))
+ goto fi_busy;
+ /* else fall through normal processing path, i.e. no faults */
+ }
+
+ psmi_assert((length & 0x3) == 0);
+ paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0);
+ nblks = 1 + ((paylen + 63) >> 6);
+
+ if (spio_ctrl->spio_available_blocks < nblks) {
+ ctrl->spio_credit_return_update(ctrl);
+
+ if_pf(spio_ctrl->spio_available_blocks < nblks) {
+ /* Check unit status */
+fi_busy:
+ if ((err =
+ psmi_context_check_status(ctrl->context)) ==
+ PSM2_OK) {
+ if (0 ==
+ (++ctrl->
+ spio_consecutive_failures &
+ (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL - 1)))
+ spio_handle_resync(ctrl,
+ ctrl->
+ spio_consecutive_failures);
+ err = PSM2_EP_NO_RESOURCES;
+ }
+ /* If cable is pulled, we don't count it as a consecutive failure,
+ * we just make it as though no send pio was available */
+ else if (err == PSM2_OK_NO_PROGRESS)
+ err = PSM2_EP_NO_RESOURCES;
+ /* else something bad happened in check_status */
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return err;
+ }
+ }
+
+ /*
+ * if context->spio_ctrl is set, it is pointing to shared context ureg
+ * page, and we are using context sharing.
+ */
+ if (ctrl->context->spio_ctrl) {
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ if (spio_ctrl->spio_available_blocks < nblks) {
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+ return PSM2_EP_NO_RESOURCES;
+ }
+ }
+
+ _HFI_VDBG("credits: total %d, avail %d index %d, fill %d "
+ "free %d: %d %d %d %d %d; addr %llx\n",
+ ctrl->spio_total_blocks,
+ spio_ctrl->spio_available_blocks,
+ spio_ctrl->spio_block_index,
+ spio_ctrl->spio_fill_counter,
+ SPIO_CREDITS_Counter(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_Status(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToPbc(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToTheshold(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToErr(spio_ctrl->spio_credits.value),
+ SPIO_CREDITS_DueToForce(spio_ctrl->spio_credits.value),
+ *ctrl->spio_credits_addr);
+
+ /*
+ * Save the assigned locally, update the shared for other processes.
+ */
+ ctrl->spio_block_index = spio_ctrl->spio_block_index;
+ spio_ctrl->spio_available_blocks -= nblks;
+ /* fill counter should be 11 bits value, same as credit return counter */
+ spio_ctrl->spio_fill_counter =
+ (spio_ctrl->spio_fill_counter + nblks) & 0x7FF;
+ spio_ctrl->spio_block_index += nblks;
+ if (spio_ctrl->spio_block_index >= ctrl->spio_total_blocks)
+ spio_ctrl->spio_block_index -= ctrl->spio_total_blocks;
+
+ /*
+ * Unlock in context sharing mode, but increase refcount to
+ * indicate I am in progress to write to PIO blocks.
+ */
+ if (ctrl->context->spio_ctrl) {
+ spio_ctrl->spio_write_in_progress++;
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ ctrl->spio_num_stall = 0; /* now able to send, so clear if set */
+ ctrl->spio_consecutive_failures = 0;
+ if (do_lock)
+ pthread_spin_unlock(&ctrl->spio_lock);
+
+ _HFI_VDBG("PIO write: nblks %d length %d, paylen %d\n", nblks, length,
+ paylen);
+
+ /* Setup PBC for this packet */
+ ips_proto_pbc_update(proto, flow, isCtrlMsg,
+ pbc, sizeof(struct ips_message_header), paylen);
+
+ /* Write to PIO: SOP block */
+ pioaddr = ctrl->spio_bufbase_sop + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ ctrl->spio_blockcpy_selected(pioaddr, (uint64_t *) pbc, 1);
+ _HFI_VDBG("pio qw write sop %p: 8\n", pioaddr);
+
+ /* Write to PIO: other blocks of payload */
+#ifdef PSM_CUDA
+ if (is_cuda_payload) {
+ /* Since the implementation of cudaMemcpy is unknown,
+ and the HFI specifies several conditions for how PIO
+ writes must occur, for safety reasons we should not assume
+ that cudaMemcpy will follow the HFI's requirements.
+ The cudaMemcpy should instead write into a buffer in
+ host memory, and then PSM can copy to the HFI as usual. */
+ PSMI_CUDA_CALL(cudaMemcpy, ctrl->cuda_pio_buffer,
+ payload, paylen, cudaMemcpyDeviceToHost);
+ payload = (uint32_t *) ctrl->cuda_pio_buffer;
+ }
+#endif
+ if (length >= 64) {
+ uint32_t blks2send = length >> 6;
+ uint32_t blks2end =
+ ctrl->spio_total_blocks - ctrl->spio_block_index;
+
+ pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (blks2end >= blks2send) {
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, blks2send);
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, blks2send);
+ ctrl->spio_block_index += blks2send;
+ if (ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+ payload += blks2send*16;
+ } else {
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, blks2end);
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, blks2end);
+ payload += blks2end*16;
+
+ pioaddr = ctrl->spio_bufbase;
+ ctrl->spio_blockcpy_selected(pioaddr,
+ (uint64_t *)payload, (blks2send-blks2end));
+ _HFI_VDBG("pio blk write %p: %d\n",
+ pioaddr, (blks2send-blks2end));
+ ctrl->spio_block_index = blks2send - blks2end;
+ payload += (blks2send-blks2end)*16;
+ }
+
+ length -= blks2send*64;
+ }
+
+ /*
+ * The following code makes sure to write to pioaddr in
+ * qword granularity, this is required by hardware.
+ */
+ paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0);
+ if (paylen > 0) {
+ uint32_t blkbuf[32];
+ uint32_t qws = length >> 3;
+ uint32_t dws = 0;
+
+ pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ /* Write the remaining qwords of payload */
+ if (qws) {
+ hfi_qwordcpy_safe(pioaddr, (uint64_t *) payload, qws);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, qws);
+ payload += qws << 1;
+ length -= qws << 3;
+
+ pioaddr += qws;
+ paylen -= qws << 3;
+ }
+
+ /* if we have last one dword payload */
+ if (length > 0) {
+ blkbuf[dws++] = payload[0];
+ }
+ /* if we have checksum to attach */
+ if (paylen > length) {
+ blkbuf[dws++] = cksum;
+ blkbuf[dws++] = cksum;
+ }
+
+ /* Write the rest of qwords of current block */
+ hfi_qwordcpy_safe(pioaddr, (uint64_t *) blkbuf, 8 - qws);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8 - qws);
+
+ if (paylen > ((8 - qws) << 3)) {
+ /* We need another block */
+ pioaddr =
+ ctrl->spio_bufbase + ctrl->spio_block_index * 8;
+ if (++ctrl->spio_block_index == ctrl->spio_total_blocks)
+ ctrl->spio_block_index = 0;
+
+ /* Write the last block */
+ hfi_qwordcpy_safe(pioaddr,
+ (uint64_t *) &blkbuf[(8 - qws) << 1],
+ 8);
+ _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8);
+ }
+ }
+
+ /*
+ * In context sharing, we need to track who is in progress of
+ * writing to PIO block, this is for halted send context reset.
+ * I am done with PIO blocks writing, decrease the refcount.
+ */
+ if (ctrl->context->spio_ctrl) {
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ spio_ctrl->spio_write_in_progress--;
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ return PSM2_OK;
+} /* ips_spio_transfer_frame() */
diff --git a/ptl_ips/ips_spio.h b/ptl_ips/ips_spio.h
new file mode 100644
index 0000000..2d61cce
--- /dev/null
+++ b/ptl_ips/ips_spio.h
@@ -0,0 +1,189 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef IPS_SPIO_H
+#define IPS_SPIO_H
+
+#include "psm_user.h"
+
+#define IPS_CTXT_RESET_MAX 1000 /* max send context reset */
+struct ips_spio;
+struct ptl;
+
+/* 64B move instruction support */
+#define AVX512F_BIT 16 /* level 07h, ebx */
+/* 32B move instruction support */
+#define AVX2_BIT 5 /* level 07h, ebx */
+/* 16B move instruction support */
+#define SSE2_BIT 26 /* level 01h, edx */
+
+typedef
+void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#ifdef __AVX512F__
+void hfi_pio_blockcpy_512(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+#ifdef __AVX2__
+void hfi_pio_blockcpy_256(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+#ifdef __SSE2__
+void hfi_pio_blockcpy_128(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+#endif
+void hfi_pio_blockcpy_64(volatile uint64_t *dest,
+ const uint64_t *src, uint32_t nblock);
+
+
+psm2_error_t ips_spio_init(const psmi_context_t *context,
+ struct ptl *ptl, struct ips_spio *ctrl);
+psm2_error_t ips_spio_fini(struct ips_spio *ctrl);
+
+psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow, struct hfi_pbc *pbc,
+ uint32_t *payload, uint32_t length,
+ uint32_t isCtrlMsg, uint32_t cksum_valid,
+ uint32_t cksum
+#ifdef PSM_CUDA
+ , uint32_t is_cuda_payload
+#endif
+);
+
+psm2_error_t ips_spio_process_events(const struct ptl *ptl);
+
+#define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF)
+#define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1)
+#define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1)
+#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1)
+#define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1)
+#define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1)
+/* Hardware credit-return word for a PIO send context.  The low 16 bits
+ * ("value") pack the fields decoded by the SPIO_CREDITS_* macros above;
+ * the union lets the full 8-byte credit-return word be read in one load
+ * (see *ctrl->spio_credits_addr usage in ips_spio.c). */
+struct ips_spio_credits {
+/* don't use bit operation for performance reason,
+ * using above macro instead.
+ uint16_t Counter:11;
+ uint16_t Status:1;
+ uint16_t CreditReturnDueToPbc:1;
+ uint16_t CreditReturnDueToThreshold:1;
+ uint16_t CreditReturnDueToErr:1;
+ uint16_t CreditReturnDueToForce:1;
+*/
+	union {
+		struct {
+			/* packed bits; decode with SPIO_CREDITS_* macros */
+			uint16_t value;
+			uint16_t pad0;
+			uint32_t pad1;
+		};
+		/* whole 64-bit credit return word */
+		uint64_t credit_return;
+	};
+};
+
+/* Send-context credit state.  With context sharing this structure lives in
+ * the shared subcontext ureg page (carved out in ips_subcontext_ureg_get())
+ * and is updated by several processes, hence the spinlock and the volatile
+ * qualifiers; otherwise each process has a private copy. */
+struct ips_spio_ctrl {
+	/* credit return lock for context sharing */
+	pthread_spinlock_t spio_ctrl_lock;
+
+	/* PIO write in progress for context sharing */
+	volatile uint16_t spio_write_in_progress;
+	/* send context reset count */
+	volatile uint16_t spio_reset_count;
+	/* HFI frozen count, shared copy */
+	volatile uint16_t spio_frozen_count;
+
+	/* PIO blocks currently available for sending */
+	volatile uint16_t spio_available_blocks;
+	/* next block index to fill in the send buffer */
+	volatile uint16_t spio_block_index;
+	/* running 11-bit count of blocks handed to hardware,
+	 * kept in sync with the hardware credit return counter */
+	volatile uint16_t spio_fill_counter;
+	/* last credit-return word read from hardware */
+	volatile struct ips_spio_credits spio_credits;
+} __attribute__ ((aligned(64)));
+
+/* Per-process PIO send state for one HFI send context.  Fast-path fields
+ * used by ips_spio_transfer_frame() are grouped after the thread lock. */
+struct ips_spio {
+	const psmi_context_t *context;
+	struct ptl *ptl;
+	uint32_t runtime_flags;
+	uint16_t unit_id;
+	uint16_t portnum;
+
+	pthread_spinlock_t spio_lock;	/* thread lock */
+	/* mapped hardware credit-return location */
+	volatile __le64 *spio_credits_addr __attribute__ ((aligned(64)));
+	/* PIO buffer base for SOP (start-of-packet) blocks */
+	volatile uint64_t *spio_bufbase_sop;
+	/* PIO buffer base for payload blocks */
+	volatile uint64_t *spio_bufbase;
+	volatile uint64_t *spio_event;
+	/* shared credit state; non-NULL context->spio_ctrl implies
+	 * context sharing (see ips_spio_transfer_frame()) */
+	volatile struct ips_spio_ctrl *spio_ctrl;
+
+	uint16_t spio_frozen_count;	/* local copy */
+	uint16_t spio_total_blocks;
+	uint16_t spio_block_index;
+
+	/* consecutive send failures; triggers spio_handle_resync() every
+	 * SPIO_RESYNC_CONSECUTIVE_SEND_FAIL failures */
+	uint32_t spio_consecutive_failures;
+	uint64_t spio_num_stall;
+	uint64_t spio_num_stall_total;
+	uint64_t spio_next_stall_warning;
+	uint64_t spio_last_stall_cyc;
+	uint64_t spio_init_cyc;
+
+	psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl);
+	psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl);
+
+	/* 8B copying, 16B copying, 32B copying, and 64B copying */
+	ips_spio_blockcpy_fn_t spio_blockcpy_routines[4];
+	/* routine chosen from the table above at init time */
+	ips_spio_blockcpy_fn_t spio_blockcpy_selected;
+
+#ifdef PSM_CUDA
+	/* Use an intermediate buffer when writing PIO data from the
+	   GPU to ensure that we follow the HFI's write ordering rules. */
+	unsigned char *cuda_pio_buffer;
+#endif
+};
+
+#endif /* IPS_SPIO_H */
diff --git a/ptl_ips/ips_stats.h b/ptl_ips/ips_stats.h
new file mode 100644
index 0000000..046e0c3
--- /dev/null
+++ b/ptl_ips/ips_stats.h
@@ -0,0 +1,83 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_STATS_H
+#define _IPS_STATS_H
+
+struct psm2_epaddr; /* for non-PSM clients */
+
+/* Old stats */
+/* Legacy per-session statistics counters ("Old stats"), reported through
+ * ips_get_stat() below.  All counters are monotonically increasing. */
+typedef struct {
+	uint64_t err_chk_send;
+	uint64_t err_chk_recv;
+	uint64_t send_failed;
+	uint64_t recv_dropped;
+	union {
+		uint64_t recv_copied;	/* obsolete */
+		uint64_t nak_sent;	/* reuses the obsolete slot */
+	};
+	uint64_t nak_recv;
+	uint64_t total_send_eager;
+	uint64_t total_send_exp;
+	uint64_t acks_sent;
+	uint64_t retransmits;
+	uint64_t recv_matched;
+	uint64_t recv_unmatched;
+	uint64_t scb_alloc_yields;
+} ips_sess_stat;
+
+int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats);
+
+#endif /* _IPS_STATS_H */
diff --git a/ptl_ips/ips_subcontext.c b/ptl_ips/ips_subcontext.c
new file mode 100644
index 0000000..7e3d04b
--- /dev/null
+++ b/ptl_ips/ips_subcontext.c
@@ -0,0 +1,97 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_subcontext.h"
+#include "ips_spio.h"
+#include "ips_tid.h"
+#include "ips_tidflow.h"
+#include "ptl_ips.h"
+
+/*
+ * Carve the per-subcontext ureg pages and the shared control structures
+ * out of the driver-mapped subcontext ureg region.
+ *
+ * ptl            - IPS PTL; its recvshc->hwcontext_ctrl pointer is set here
+ * subcontext_cnt - number of subcontexts actually sharing the hw context
+ * context        - PSM context; spio_ctrl/tid_ctrl/tf_ctrl pointers into
+ *                  the shared region are filled in here
+ * uregp          - out array of HFI1_MAX_SHARED_CTXTS entries; entry i is
+ *                  subcontext i's ureg block, or NULL for i >= subcontext_cnt
+ *
+ * The layout is fixed: HFI1_MAX_SHARED_CTXTS ureg blocks, then
+ * ips_hwcontext_ctrl, ips_spio_ctrl, ips_tid_ctrl, ips_tf_ctrl.  Asserts
+ * that everything fits in one page.  Always returns PSM2_OK.
+ */
+psm2_error_t
+ips_subcontext_ureg_get(ptl_t *ptl, uint32_t subcontext_cnt,
+			psmi_context_t *context,
+			struct ips_subcontext_ureg **uregp)
+{
+	const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+	uintptr_t all_subcontext_uregbase =
+		(uintptr_t) base_info->subctxt_uregbase;
+	int i;
+
+	psmi_assert_always(all_subcontext_uregbase != 0);
+	/* hand out one ureg block per possible subcontext, NULL for unused */
+	for (i = 0; i < HFI1_MAX_SHARED_CTXTS; i++) {
+		struct ips_subcontext_ureg *subcontext_ureg =
+			(struct ips_subcontext_ureg *)all_subcontext_uregbase;
+		*uregp++ = (i < subcontext_cnt) ? subcontext_ureg : NULL;
+		all_subcontext_uregbase += sizeof(struct ips_subcontext_ureg);
+	}
+
+	/* shared structures follow the ureg blocks, in this fixed order */
+	ptl->recvshc->hwcontext_ctrl =
+		(struct ips_hwcontext_ctrl *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_hwcontext_ctrl);
+
+	context->spio_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_spio_ctrl);
+
+	context->tid_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_tid_ctrl);
+
+	context->tf_ctrl = (void *)all_subcontext_uregbase;
+	all_subcontext_uregbase += sizeof(struct ips_tf_ctrl);
+
+	psmi_assert((all_subcontext_uregbase -
+		     (uintptr_t) base_info->subctxt_uregbase) <= PSMI_PAGESIZE);
+
+	return PSM2_OK;
+}
diff --git a/ptl_ips/ips_subcontext.h b/ptl_ips/ips_subcontext.h
new file mode 100644
index 0000000..a35e080
--- /dev/null
+++ b/ptl_ips/ips_subcontext.h
@@ -0,0 +1,81 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef __IPS_SUBCONTEXT_H
+#define __IPS_SUBCONTEXT_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_writehdrq.h"
+
+/* This data structure is allocated in ureg page of each subcontext process */
+
+/* Per-subcontext slice of the shared ureg region (one per process). */
+struct ips_subcontext_ureg {
+	/* head/eager head/tail register storage, one per cacheline */
+	uint64_t subcontext_uregbase[ur_maxreg * 8];
+	struct ips_writehdrq_state writeq_state;	/* used in all ureg pages */
+} __attribute__ ((aligned(64)));
+
+/* Single shared instance following the per-subcontext ureg blocks
+ * (see ips_subcontext_ureg_get()). */
+struct ips_hwcontext_ctrl {
+	pthread_spinlock_t context_lock;	/* lock shared by all subctxts */
+	struct ips_recvhdrq_state recvq_state;	/* state shared by all subctxts */
+} __attribute__ ((aligned(64)));
+
+psm2_error_t
+ips_subcontext_ureg_get(ptl_t *ptl, uint32_t subcontext_cnt,
+ psmi_context_t *context,
+ struct ips_subcontext_ureg **uregp);
+
+#endif
diff --git a/ptl_ips/ips_tid.c b/ptl_ips/ips_tid.c
new file mode 100644
index 0000000..63f213b
--- /dev/null
+++ b/ptl_ips/ips_tid.c
@@ -0,0 +1,278 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_tid.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * Initialize the expected-TID allocator embedded in @protoexp.
+ *
+ * context    - opened HFI context (driver info, runtime flags)
+ * protoexp   - owning expected-protocol instance; tidc = &protoexp->tidc
+ * cb         - callback invoked when TIDs become available again
+ * cb_context - opaque pointer passed back to cb
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY on allocation failure, or
+ * the error from statistics registration.
+ */
+psm2_error_t
+ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp,
+	     ips_tid_avail_cb_fn_t cb, void *cb_context)
+{
+	const struct hfi1_user_info_dep *user_info = &context->user_info;
+	const struct hfi1_base_info *base_info = &context->ctrl->base_info;
+	const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+	struct ips_tid *tidc = &protoexp->tidc;
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL,
+				NULL, &tidc->tid_num_total),
+	};
+
+	tidc->context = context;
+	tidc->protoexp = protoexp;
+	tidc->tid_num_total = 0;
+	tidc->tid_num_inuse = 0;
+	tidc->tid_avail_cb = cb;
+	tidc->tid_avail_context = cb_context;
+	tidc->tid_array = NULL;
+	tidc->invalidation_event = (uint64_t *)
+		(ptrdiff_t) base_info->events_bufbase;
+
+	/*
+	 * PSM uses tid registration caching only if driver has enabled it.
+	 */
+	if (!(tidc->context->runtime_flags & HFI1_CAP_TID_UNMAP)) {
+		int i;
+		cl_qmap_t *p_map;
+		cl_map_item_t *root, *nil_item;
+
+		tidc->tid_array = (uint32_t *)
+			psmi_calloc(context->ep, UNDEFINED,
+				    context->ctrl->__hfi_tidexpcnt,
+				    sizeof(uint32_t));
+		if (tidc->tid_array == NULL)
+			return PSM2_NO_MEMORY;
+
+		/*
+		 * first is root node, last is terminator node.
+		 */
+		p_map = &tidc->tid_cachemap;
+		root = (cl_map_item_t *)
+			psmi_calloc(context->ep, UNDEFINED,
+				    context->ctrl->__hfi_tidexpcnt + 2,
+				    sizeof(cl_map_item_t));
+		if (root == NULL) {
+			/* Fix: don't leak the tid_array allocated just
+			 * above; release it before failing. */
+			psmi_free(tidc->tid_array);
+			tidc->tid_array = NULL;
+			return PSM2_NO_MEMORY;
+		}
+
+		nil_item = &root
+			[context->ctrl->__hfi_tidexpcnt + 1];
+
+		ips_tidcache_map_init(p_map, root, nil_item);
+
+		/* empty cache: no entries, no idle list */
+		NTID = 0;
+		NIDLE = 0;
+		IPREV(IHEAD) = INEXT(IHEAD) = IHEAD;
+		for (i = 1; i <= context->ctrl->__hfi_tidexpcnt; i++) {
+			INVALIDATE(i) = 1;
+		}
+
+		/*
+		 * if not shared context, all tids are used by the same
+		 * process. Otherwise, subcontext process can only cache
+		 * its own portion. Driver makes the same tid number
+		 * assignment to subcontext processes.
+		 */
+		tidc->tid_cachesize = context->ctrl->__hfi_tidexpcnt;
+		if (user_info->subctxt_cnt > 0) {
+			uint16_t remainder = tidc->tid_cachesize %
+				user_info->subctxt_cnt;
+			tidc->tid_cachesize /= user_info->subctxt_cnt;
+			if (ctxt_info->subctxt < remainder)
+				tidc->tid_cachesize++;
+		}
+	}
+
+	/*
+	 * Setup shared control structure.  A NULL context->tid_ctrl means
+	 * no context sharing, so allocate a private copy.
+	 */
+	tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl;
+	if (!tidc->tid_ctrl) {
+		tidc->tid_ctrl = (struct ips_tid_ctrl *)
+			psmi_calloc(context->ep, UNDEFINED, 1,
+				    sizeof(struct ips_tid_ctrl));
+		if (tidc->tid_ctrl == NULL) {
+			/* NOTE(review): the cache allocations above are not
+			 * released on this path; they are reclaimed only via
+			 * ips_tid_fini() — confirm callers treat init failure
+			 * as fatal for the endpoint. */
+			return PSM2_NO_MEMORY;
+		}
+	}
+
+	/*
+	 * Only the master process can initialize.
+	 */
+	if (ctxt_info->subctxt == 0) {
+		pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock,
+				  PTHREAD_PROCESS_SHARED);
+
+		tidc->tid_ctrl->tid_num_max =
+			context->ctrl->__hfi_tidexpcnt;
+		tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max;
+	}
+
+	return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+					PSMI_STATSTYPE_TIDS,
+					entries,
+					PSMI_STATS_HOWMANY(entries), tidc);
+}
+
+/* Tear down the TID allocator: flush the registration cache (if it was in
+ * use) and free the locally-allocated control structure.  tid_ctrl is only
+ * freed when the context did not supply a shared copy. */
+psm2_error_t ips_tid_fini(struct ips_tid *tidc)
+{
+	/* A non-NULL tid_array means registration caching was active. */
+	if (tidc->tid_array != NULL)
+		ips_tidcache_cleanup(tidc);
+
+	/* Private tid_ctrl (no shared context copy) was psmi_calloc'ed
+	 * by ips_tid_init() and must be released here. */
+	if (tidc->context->tid_ctrl == NULL)
+		psmi_free(tidc->tid_ctrl);
+
+	return PSM2_OK;
+}
+
+/*
+ * Acquire TID entries for a page-aligned buffer via the driver.
+ *
+ * tidc      - TID allocator
+ * buf       - buffer start; must be 4KB aligned (asserted)
+ * length    - in/out: bytes to register (4KB multiple, asserted); may be
+ *             clipped on output to what was actually registered
+ * tid_array - out: driver-filled TID entries
+ * tidcnt    - out: number of TID entries used
+ *
+ * Returns PSM2_OK, PSM2_EP_NO_RESOURCES when no TIDs are free (caller
+ * retries later), or PSM2_EP_DEVICE_FAILURE if the driver update fails.
+ * Takes tid_ctrl_lock only in context-sharing mode.
+ */
+psm2_error_t
+ips_tid_acquire(struct ips_tid *tidc,
+		const void *buf, uint32_t *length,
+		uint32_t *tid_array, uint32_t *tidcnt
+#ifdef PSM_CUDA
+		, uint8_t is_cuda_ptr
+#endif
+	)
+{
+	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
+	psm2_error_t err = PSM2_OK;
+	uint16_t flags = 0;
+	int rc;
+
+	/* driver requires page-aligned buffer and length */
+	psmi_assert(((uintptr_t) buf & 0xFFF) == 0);
+	psmi_assert(((*length) & 0xFFF) == 0);
+
+	/* non-NULL context->tid_ctrl means shared context: lock required */
+	if (tidc->context->tid_ctrl)
+		pthread_spin_lock(&ctrl->tid_ctrl_lock);
+
+	if (!ctrl->tid_num_avail) {
+		err = PSM2_EP_NO_RESOURCES;
+		goto fail;
+	}
+
+	/* Clip length if it exceeds worst case tid allocation,
+	   where each entry in the tid array can accommodate only
+	   1 page. */
+	if (*length > 4096*tidc->tid_ctrl->tid_num_max)
+	{
+		*length = 4096*tidc->tid_ctrl->tid_num_max;
+	}
+
+#ifdef PSM_CUDA
+	if (is_cuda_ptr)
+		flags = HFI1_BUF_GPU_MEM;
+#endif
+
+	/* ask the driver to pin the pages and program the TID entries */
+	rc = hfi_update_tid(tidc->context->ctrl,
+			    (uint64_t) (uintptr_t) buf, length,
+			    (uint64_t) (uintptr_t) tid_array, tidcnt, flags);
+	if (rc < 0) {
+		/* Unable to pin pages? retry later */
+		err = PSM2_EP_DEVICE_FAILURE;
+		goto fail;
+	}
+
+	/* account the entries we just consumed */
+	psmi_assert_always((*tidcnt) > 0);
+	psmi_assert(ctrl->tid_num_avail >= (*tidcnt));
+	ctrl->tid_num_avail -= (*tidcnt);
+	tidc->tid_num_total += (*tidcnt);
+	tidc->tid_num_inuse += (*tidcnt);
+
+fail:
+	if (tidc->context->tid_ctrl)
+		pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+	return err;
+}
+
+/*
+ * Release TID entries previously acquired with ips_tid_acquire().
+ *
+ * tid_array - TID entries to free (handed to the driver)
+ * tidcnt    - number of entries; must be > 0 (asserted)
+ *
+ * Returns PSM2_OK, or PSM2_EP_DEVICE_FAILURE (via psmi_handle_error) if
+ * the driver fails to unpin the pages — treated as fatal.  Takes
+ * tid_ctrl_lock only in context-sharing mode; the lock is dropped before
+ * the error path and before invoking the availability callback.
+ */
+psm2_error_t
+ips_tid_release(struct ips_tid *tidc,
+		uint32_t *tid_array, uint32_t tidcnt)
+{
+	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(tidcnt > 0);
+	if (tidc->context->tid_ctrl)
+		pthread_spin_lock(&ctrl->tid_ctrl_lock);
+
+	if (hfi_free_tid(tidc->context->ctrl,
+			 (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) {
+		/* drop the lock before the error handler */
+		if (tidc->context->tid_ctrl)
+			pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+		/* If failed to unpin pages, it's fatal error */
+		err = psmi_handle_error(tidc->context->ep,
+					PSM2_EP_DEVICE_FAILURE,
+					"Failed to tid free %d tids",
+					tidcnt);
+		goto fail;
+	}
+
+	ctrl->tid_num_avail += tidcnt;
+	if (tidc->context->tid_ctrl)
+		pthread_spin_unlock(&ctrl->tid_ctrl_lock);
+
+	tidc->tid_num_inuse -= tidcnt;
+	/* If an available callback is registered invoke it */
+	/* (fires only when this release transitions the pool from fully
+	 * exhausted back to having free entries) */
+	if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max)
+	    && tidc->tid_avail_cb)
+		tidc->tid_avail_cb(tidc, tidc->tid_avail_context);
+
+fail:
+	return err;
+}
diff --git a/ptl_ips/ips_tid.h b/ptl_ips/ips_tid.h
new file mode 100644
index 0000000..4fcdaf1
--- /dev/null
+++ b/ptl_ips/ips_tid.h
@@ -0,0 +1,169 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* included header files */
+
+#ifndef _IPS_TID_H
+#define _IPS_TID_H
+
+#include "psm_user.h"
+#include "ips_tidcache.h"
+
+struct ips_tid;
+
+typedef void (*ips_tid_avail_cb_fn_t) (struct ips_tid *, void *context);
+
+/* Max tids a context can support */
+#define IPS_TID_MAX_TIDS 2048
+/* Max tid-session buffer size */
+#define PSM_TIDLIST_BUFSIZE 4096
+/* Max tid-session window size */
+#define PSM_TID_WINSIZE (4*1024*1024)
+/* Max number of packets for a single TID flow, fitting tid-session window.
+ In PSM2 packet integrity is realized by PSN (Packet Sequence Number),
+ which is kept as 11 bits field (for 9B KDETH),
+ giving max value 2048 (0 - 2047) */
+#define PSM_TID_MAX_PKTS 2048
+/* Total number of combined pages from the Tid-pair to be merged */
+#define PSM_MAX_NUM_PAGES_IN_TIDPAIR 512
+
+/* Tid accounting, possibly shared between processes when a HW context
+ is shared; tid_ctrl_lock guards the counters in that case. Aligned
+ to 64 bytes, presumably to keep it on its own cache line. */
+struct ips_tid_ctrl {
+ pthread_spinlock_t tid_ctrl_lock;
+ uint32_t tid_num_max; /* total tids in this context */
+ uint32_t tid_num_avail; /* tids currently free */
+} __attribute__ ((aligned(64)));
+
+/* Per-endpoint tid management state (tid pool + optional tid cache). */
+struct ips_tid {
+ const psmi_context_t *context; /* owning HW context */
+ struct ips_protoexp *protoexp; /* associated expected-protocol state */
+
+ void *tid_avail_context; /* opaque arg passed to tid_avail_cb */
+ struct ips_tid_ctrl *tid_ctrl; /* tid counters (shared-context aware) */
+ /* driver event-flags word; tested against HFI1_EVENT_TID_MMU_NOTIFY
+ to detect pending tid invalidations */
+ volatile uint64_t *invalidation_event;
+
+ ips_tid_avail_cb_fn_t tid_avail_cb; /* invoked when tids free up */
+ uint64_t tid_num_total; /* running total of tids ever acquired */
+ uint32_t tid_num_inuse; /* tids currently held */
+ uint32_t tid_cachesize; /* items can be cached */
+ cl_qmap_t tid_cachemap; /* RB tree implementation */
+ /*
+ * tids storage.
+ * This is used in tid registration caching case for
+ * tid invalidation, acquire, replace and release,
+ * entries should be the assigned tid number.
+ */
+ uint32_t *tid_array;
+};
+
+psm2_error_t ips_tid_init(const psmi_context_t *context,
+ struct ips_protoexp *protoexp,
+ ips_tid_avail_cb_fn_t cb, void *cb_context);
+psm2_error_t ips_tid_fini(struct ips_tid *tidc);
+
+/* Acquiring tids.
+ * Buffer base has to be aligned on page boundary
+ * Buffer length has to be multiple pages
+ */
+psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc,
+ const void *buf, /* input buffer, aligned to page boundary */
+ uint32_t *length, /* buffer length, aligned to page size */
+ uint32_t *tid_array, /* output tidarray, */
+ uint32_t *tidcnt, /* output of tid count */
+ uint32_t *pageoff /* output of offset in first tid */
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ );
+
+psm2_error_t ips_tidcache_release(struct ips_tid *tidc,
+ uint32_t *tid_array, /* input tidarray, */
+ uint32_t tidcnt); /* input of tid count */
+
+psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc);
+psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc);
+
+psm2_error_t ips_tid_acquire(struct ips_tid *tidc,
+ const void *buf, /* input buffer, aligned to page boundary */
+ uint32_t *length, /* buffer length, aligned to page size */
+ uint32_t *tid_array, /* output tidarray, */
+ uint32_t *tidcnt
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ ); /* output of tid count */
+
+psm2_error_t ips_tid_release(struct ips_tid *tidc,
+ uint32_t *tid_array, /* input tidarray, */
+ uint32_t tidcnt); /* input of tid count */
+
+/* Number of tids currently available for acquisition.
+ Returns:
+ -1 - no tids free and this process holds them all (waiting on
+ other processes cannot help);
+ 0 - no tids free, but another process sharing the context holds
+ some, so the caller may retry later;
+ n>0 - number of free tids. */
+PSMI_INLINE(int ips_tid_num_available(struct ips_tid *tidc))
+{
+ if (tidc->tid_ctrl->tid_num_avail == 0) {
+ if (tidc->tid_ctrl->tid_num_max == tidc->tid_num_inuse)
+ return -1;
+ else
+ return 0;
+ }
+
+ return tidc->tid_ctrl->tid_num_avail;
+}
+
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+ */
+uint64_t ips_tidcache_evict(struct ips_tid *tidc, uint64_t length);
+
+#endif /* _IPS_TID_H */
diff --git a/ptl_ips/ips_tidcache.c b/ptl_ips/ips_tidcache.c
new file mode 100644
index 0000000..ecc0bba
--- /dev/null
+++ b/ptl_ips/ips_tidcache.c
@@ -0,0 +1,653 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
+#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length<<12))
+#define RBTREE_ASSERT psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->ntid)
+
+#include "rbtree.c"
+
+/* Thin wrapper over the rbtree.c initializer so callers of the tid
+ cache can set up the RB-tree map without including rbtree.c
+ (which is configured by the RBTREE_* macros above) themselves. */
+void ips_tidcache_map_init(cl_qmap_t *p_map,
+ cl_map_item_t* const root,
+ cl_map_item_t* const nil_item)
+{
+ ips_cl_qmap_init(p_map,root,nil_item);
+}
+
+/*
+ *
+ * Force to remove a tid, check invalidation event afterwards.
+ */
+/*
+ *
+ * Force to remove a tid, check invalidation event afterwards.
+ *
+ * The tids to free must already have been copied by the caller into
+ * tidc->tid_array[0..tidcnt-1]; they must be idle (refcount 0) and
+ * not yet invalidated. Frees them in the driver, marks them
+ * invalidated locally and unlinks them from the idle queue and the
+ * RB tree, then processes any invalidation event the kernel may have
+ * raised meanwhile.
+ */
+static psm2_error_t
+ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t idx;
+ psm2_error_t err;
+
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) {
+ /* If failed to unpin pages, it's fatal error.
+ Report the real count (this path is also taken from
+ ips_tidcache_evict with tidcnt > 1). */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", tidcnt);
+ return err;
+ }
+
+ while (tidcnt) {
+ tidcnt--;
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(INVALIDATE(idx) == 0);
+ psmi_assert(REFCNT(idx) == 0);
+
+ /*
+ * mark the tid invalidated.
+ */
+ INVALIDATE(idx) = 1;
+
+ /*
+ * remove the tid from RB tree.
+ */
+ IDLE_REMOVE(idx);
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+ }
+
+ /*
+ * Because the freed tid is not from invalidation list,
+ * it is possible that kernel just invalidated the tid,
+ * then we need to check and process the invalidation
+ * before we can re-use this tid. The reverse order
+ * will wrongly invalidate this tid again.
+ */
+ if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ err = ips_tidcache_invalidation(tidc);
+ if (err)
+ return err;
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Register a new buffer with driver, and cache the tidinfo.
+ */
+/*
+ * Register a new buffer with driver, and cache the tidinfo.
+ *
+ * start/length describe a page-aligned virtual range. On success
+ * *firstidx is the RB-array index of the first (lowest-address) tid
+ * programmed for the range. Returns PSM2_OK_NO_PROGRESS when the
+ * cache is full and no idle entry can be reclaimed, or
+ * PSM2_EP_DEVICE_FAILURE when the driver cannot pin the pages (the
+ * caller retries later).
+ */
+static psm2_error_t
+ips_tidcache_register(struct ips_tid *tidc,
+ unsigned long start, uint32_t length, uint32_t *firstidx
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ )
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t tidoff, tidlen;
+ uint32_t idx, tidcnt;
+ uint16_t flags = 0;
+ psm2_error_t err;
+
+ /*
+ * make sure we have at least one free tid to
+ * register the new buffer.
+ */
+ if (NTID == tidc->tid_cachesize) {
+ /* all tids are in active use, error? */
+ if (NIDLE == 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * free the first tid in idle queue.
+ */
+ idx = IPREV(IHEAD);
+ tidc->tid_array[0] = p_map->root[idx].payload.tidinfo;
+ err = ips_tidcache_remove(tidc, 1);
+ if (err)
+ return err;
+ }
+ psmi_assert(NTID < tidc->tid_cachesize);
+
+ /* Clip length if it exceeds worst case tid allocation,
+ where each entry in the tid array can accommodate only
+ 1 page. */
+ if (length > 4096*tidc->tid_ctrl->tid_num_max)
+ {
+ length = 4096*tidc->tid_ctrl->tid_num_max;
+ }
+ /*
+ * register the new buffer.
+ */
+
+retry:
+ tidcnt = 0;
+
+#ifdef PSM_CUDA
+ if (is_cuda_ptr)
+ flags = HFI1_BUF_GPU_MEM;
+#endif
+
+ if (hfi_update_tid(tidc->context->ctrl,
+ (uint64_t) start, &length,
+ (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt, flags) < 0) {
+ /* if driver reaches lockable memory limit */
+ if ((errno == ENOMEM
+#ifdef PSM_CUDA
+ /* This additional check is in place for just the cuda
+ * version. It is a temporary workaround for a known
+ * issue where nvidia driver returns EINVAL instead of
+ * ENOMEM when there is no BAR1 space left to pin pages.
+ * PSM frees tidcache enteries when the driver sends
+ * EINVAL there by unpinning pages and freeing some
+ * BAR1 space.*/
+ || (PSMI_IS_CUDA_ENABLED && errno == EINVAL)
+#endif
+ ) && NIDLE) {
+ uint64_t lengthEvicted = ips_tidcache_evict(tidc,length);
+
+ if (lengthEvicted >= length)
+ goto retry;
+ }
+
+ /* Unable to pin pages? retry later */
+ return PSM2_EP_DEVICE_FAILURE;
+ }
+ psmi_assert_always(tidcnt > 0);
+ psmi_assert((tidcnt+NTID) <= tidc->tid_cachesize);
+
+ /*
+ * backward processing because we want to return
+ * the first RB index in the array.
+ */
+ idx = 0;
+ tidoff = length;
+ while (tidcnt) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ tidcnt--;
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
+ tidlen = IPS_TIDINFO_GET_LENGTH(tidc->tid_array[tidcnt]);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(INVALIDATE(idx) != 0);
+ psmi_assert(REFCNT(idx) == 0);
+
+ /*
+ * clear the tid invalidated.
+ */
+ INVALIDATE(idx) = 0;
+
+ /*
+ * put the tid into a RB node.
+ */
+ tidoff -= tidlen << 12;
+ START(idx) = start + tidoff;
+ LENGTH(idx) = tidlen;
+ p_map->root[idx].payload.tidinfo = tidc->tid_array[tidcnt];
+
+ /*
+ * put the node into RB tree and idle queue head.
+ */
+ IDLE_INSERT(idx);
+ ips_cl_qmap_insert_item(p_map, &p_map->root[idx]);
+ }
+ psmi_assert(idx != 0);
+ psmi_assert(tidoff == 0);
+ *firstidx = idx;
+
+ return PSM2_OK;
+}
+
+/*
+ * Get mmu notifier invalidation info and update PSM's caching.
+ */
+/*
+ * Get mmu notifier invalidation info and update PSM's caching.
+ *
+ * Fetches the list of kernel-invalidated tids into tidc->tid_array,
+ * marks each one invalidated in the cache, and frees (in the driver)
+ * those that are idle; tids still referenced stay cached until their
+ * refcount drops (ips_tidcache_release frees them then).
+ */
+psm2_error_t
+ips_tidcache_invalidation(struct ips_tid *tidc)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t i, j, idx, tidcnt;
+ psm2_error_t err;
+
+ /*
+ * get a list of invalidated tids from driver,
+ * driver will clear the event bit before return.
+ */
+ tidcnt = 0;
+ if (hfi_get_invalidation(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt) < 0) {
+ /* If failed to get invalidation info, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to get invalidation info");
+ return err;
+ }
+ psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max);
+
+ /* j compacts tid_array in place: entries 0..j-1 collect the idle
+ invalidated tids that must also be freed in the driver. */
+ j = 0;
+ for (i = 0; i < tidcnt; i++) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) +
+ IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]);
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[i]);
+ psmi_assert(LENGTH(idx) ==
+ IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i]));
+
+ /*
+ * if the tid is already invalidated, ignore it,
+ * but do sanity check.
+ */
+ if (INVALIDATE(idx) != 0) {
+ psmi_assert(REFCNT(idx) == 0);
+ continue;
+ }
+
+ /*
+ * mark the tid invalidated.
+ */
+ INVALIDATE(idx) = 1;
+
+ /*
+ * if the tid is idle, remove the tid from RB tree
+ * and idle queue, put on free list.
+ */
+ if (REFCNT(idx) == 0) {
+ IDLE_REMOVE(idx);
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+
+ if (i != j)
+ tidc->tid_array[j] = tidc->tid_array[i];
+ j++;
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ * Acquire tids covering [buf, buf + *length), reusing cached entries
+ * where possible and registering new sub-ranges with the driver for
+ * the gaps.
+ *
+ * In/out:
+ * length - requested byte count on input; on return may be reduced
+ * to the contiguous prefix actually covered by tids.
+ * tidoff - incremented by the byte offset of buf within the first
+ * matched tid.
+ * Out:
+ * tid_array/tidcnt - tids covering the returned range (>= 1 on
+ * success); each acquired tid has its refcount bumped and is
+ * removed from the idle queue while in use.
+ */
+psm2_error_t
+ips_tidcache_acquire(struct ips_tid *tidc,
+ const void *buf, uint32_t *length,
+ uint32_t *tid_array, uint32_t *tidcnt,
+ uint32_t *tidoff
+#ifdef PSM_CUDA
+ , uint8_t is_cuda_ptr
+#endif
+ )
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ cl_map_item_t *p_item;
+ unsigned long start = (unsigned long)buf;
+ unsigned long end = start + (*length);
+ uint32_t idx, nbytes;
+ psm2_error_t err;
+
+ /*
+ * Before every tid caching search, we need to update the
+ * tid caching if there is invalidation event, otherwise,
+ * the cached address may be invalidated and we might have
+ * wrong matching.
+ */
+ if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
+ err = ips_tidcache_invalidation(tidc);
+ if (err)
+ return err;
+ }
+
+ /*
+ * Now we can do matching from the caching, because obsolete
+ * address in caching has been removed or identified.
+ */
+retry:
+ p_item = ips_cl_qmap_search(p_map, start, end);
+ idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) +
+ IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo);
+
+ /*
+ * There is tid matching.
+ */
+ if (idx) {
+ /*
+ * if there is a caching match, but the tid has been
+ * invalidated, we can't match this tid, and we also
+ * can't register this address, we need to wait this
+ * tid to be freed.
+ */
+ if (INVALIDATE(idx) != 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * if the page offset within the tid is not less than
+ * 128K, the address offset within the page is not 64B
+ * multiple, PSM can't handle this tid with any offset
+ * mode. We need to free this tid and re-register with
+ * the asked page address.
+ */
+ if (((start - START(idx)) >= 131072) && ((*tidoff) & 63)) {
+ /*
+ * If the tid is currently used, retry later.
+ */
+ if (REFCNT(idx) != 0)
+ return PSM2_OK_NO_PROGRESS;
+
+ /*
+ * free this tid.
+ */
+ tidc->tid_array[0] = p_map->root[idx].payload.tidinfo;
+ err = ips_tidcache_remove(tidc, 1);
+ if (err)
+ return err;
+
+ /* try to match a node again */
+ goto retry;
+ }
+ }
+
+ /*
+ * If there is no match node, or 'start' falls out of node range,
+ * whole or partial buffer from 'start' is not registered yet.
+ */
+ if (!idx || START(idx) > start) {
+ if (!idx)
+ nbytes = end - start;
+ else
+ nbytes = START(idx) - start;
+
+ /*
+ * Because we don't have any match tid yet, if
+ * there is an error, we return from here, PSM
+ * will try later.
+ */
+ err = ips_tidcache_register(tidc, start, nbytes, &idx
+#ifdef PSM_CUDA
+ , is_cuda_ptr
+#endif
+ );
+ if (err)
+ return err;
+ }
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(START(idx) <= start);
+ psmi_assert(INVALIDATE(idx) == 0);
+
+ /* first tid: account for buf's offset inside the tid's pages */
+ *tidoff += start - START(idx);
+ *tidcnt = 1;
+
+ tid_array[0] = p_map->root[idx].payload.tidinfo;
+ REFCNT(idx)++;
+ if (REFCNT(idx) == 1)
+ IDLE_REMOVE(idx);
+ start = END(idx);
+
+ /* walk forward through cache/driver until the request is covered
+ or we must stop at an invalidated or unregisterable range */
+ while (start < end) {
+ p_item = ips_cl_qmap_successor(p_map, &p_map->root[idx]);
+ idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) +
+ IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo);
+ if (!idx || START(idx) != start) {
+ if (!idx)
+ nbytes = end - start;
+ else
+ nbytes = (START(idx) > end) ?
+ (end - start) :
+ (START(idx) - start);
+
+ /*
+ * Because we already have at least one match tid,
+ * if it is error to register new pages, we break
+ * here and return the tids we already have.
+ */
+ err = ips_tidcache_register(tidc, start, nbytes, &idx
+#ifdef PSM_CUDA
+ , is_cuda_ptr
+#endif
+ );
+ if (err)
+ break;
+ } else if (INVALIDATE(idx) != 0) {
+ /*
+ * the tid has been invalidated, it is still in
+ * caching because it is still being used, but
+ * any new usage is not allowed, we ignore it and
+ * return the tids we already have.
+ */
+ psmi_assert(REFCNT(idx) != 0);
+ break;
+ }
+
+ /*
+ * sanity check.
+ */
+ psmi_assert(START(idx) == start);
+ psmi_assert(INVALIDATE(idx) == 0);
+
+ tid_array[(*tidcnt)++] = p_map->root[idx].payload.tidinfo;
+ REFCNT(idx)++;
+ if (REFCNT(idx) == 1)
+ IDLE_REMOVE(idx);
+ start = END(idx);
+ }
+
+ /* report the contiguous prefix we actually covered */
+ if (start < end)
+ *length = start - (unsigned long)buf;
+ /* otherwise, all pages are registered */
+ psmi_assert((*tidcnt) > 0);
+
+ return PSM2_OK;
+}
+
+/*
+ * Drop a reference on each tid in tid_array. A tid whose refcount
+ * reaches zero either returns to the idle queue (still cached), or -
+ * if it was invalidated while in use - is removed from the cache and
+ * freed in the driver.
+ */
+psm2_error_t
+ips_tidcache_release(struct ips_tid *tidc,
+ uint32_t *tid_array, uint32_t tidcnt)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ uint32_t i, j, idx;
+ psm2_error_t err;
+
+ psmi_assert(tidcnt > 0);
+
+ /* j counts invalidated-and-now-idle tids collected into
+ tidc->tid_array for a single driver free call below */
+ j = 0;
+ for (i = 0; i < tidcnt; i++) {
+ /*
+ * Driver only returns tidctrl=1 or tidctrl=2.
+ */
+ idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) +
+ IPS_TIDINFO_GET_TIDCTRL(tid_array[i]);
+ psmi_assert(idx != 0);
+ psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
+ psmi_assert(REFCNT(idx) != 0);
+
+ REFCNT(idx)--;
+ if (REFCNT(idx) == 0) {
+ if (INVALIDATE(idx) != 0) {
+ ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
+
+ tidc->tid_array[j] = tid_array[i];
+ j++;
+ } else {
+ IDLE_INSERT(idx);
+ }
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ return PSM2_OK;
+}
+
+/*
+ *
+ * Call driver to free all cached tids.
+ */
+/*
+ *
+ * Call driver to free all cached tids.
+ *
+ * Expects every tid to be idle (refcount 0). Frees all still-valid
+ * cached tids in one driver call, then releases the cache storage.
+ */
+psm2_error_t
+ips_tidcache_cleanup(struct ips_tid *tidc)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ psm2_error_t err;
+ /* NOTE(review): i is int while tid_num_max is uint32_t -
+ mixed-sign comparison below; harmless for realistic tid counts */
+ int i, j;
+
+ j = 0;
+ for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) {
+ psmi_assert(REFCNT(i) == 0);
+ if (INVALIDATE(i) == 0) {
+ tidc->tid_array[j++] = p_map->root[i].payload.tidinfo;
+ }
+ }
+
+ if (j > 0) {
+ /*
+ * call driver to free the tids.
+ */
+ if (hfi_free_tid(tidc->context->ctrl,
+ (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
+ /* If failed to unpin pages, it's fatal error */
+ err = psmi_handle_error(tidc->context->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Failed to tid free %d tids", j);
+ return err;
+ }
+ }
+
+ psmi_free(tidc->tid_array);
+ psmi_free(tidc->tid_cachemap.root);
+
+ return PSM2_OK;
+}
+
+
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+ */
+/* Note that the caller is responsible for making sure that NIDLE is non-zero
+ before calling ips_tidcache_evict. If NIDLE is 0 at the time of call,
+ ips_tidcache_evict is unstable.
+
+ Walks the idle queue from its tail (oldest idle entries first, since
+ IDLE_INSERT adds at the head) collecting tids until at least `length`
+ bytes are gathered or the idle queue is exhausted, then frees them.
+ Returns the number of bytes freed, or 0 if the driver free failed.
+ */
+uint64_t
+ips_tidcache_evict(struct ips_tid *tidc,uint64_t length)
+{
+ cl_qmap_t *p_map = &tidc->tid_cachemap;
+ /* NOTE(review): tidlen is uint32_t but the return type and `length`
+ are uint64_t - fine while windows stay below 4GB; confirm */
+ uint32_t idx = IHEAD, tidcnt = 0, tidlen = 0;
+ /*
+ * try to free the required
+ * pages from idle queue tids
+ */
+
+ do {
+ idx = IPREV(idx);
+ psmi_assert(idx != 0);
+ tidc->tid_array[tidcnt] =
+ p_map->root[idx].payload.tidinfo;
+ tidcnt++;
+
+ /* tid lengths are in 4KB pages; convert to bytes */
+ tidlen += IPS_TIDINFO_GET_LENGTH
+ (p_map->root[idx].payload.tidinfo)<<12;
+ } while (tidcnt < NIDLE && tidlen < length);
+
+ /*
+ * free the selected tids on successfully finding some.
+ */
+ if (tidcnt > 0 && ips_tidcache_remove(tidc, tidcnt))
+ return 0;
+
+ return tidlen;
+}
diff --git a/ptl_ips/ips_tidcache.h b/ptl_ips/ips_tidcache.h
new file mode 100644
index 0000000..20d45bf
--- /dev/null
+++ b/ptl_ips/ips_tidcache.h
@@ -0,0 +1,158 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _IPS_TIDCACHE_H
+#define _IPS_TIDCACHE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/*
+ * Design notes.
+ *
+ * PSM needs to call into driver to program receiving buffer pages to
+ * HFI gen1 hardware, each tid can be programmed with physically contiguous
+ * power-of-two pages from 1 pages to 512 pages. This procedure takes
+ * time.
+ *
+ * Lots of applications tend to re-use the same receiving buffer, caching
+ * such programmed tids in user space process will save time and improve
+ * application performance.
+ *
+ * This PSM tid registration caching design requires cooperation between
+ * PSM and driver. Here is what happen between PSM and driver.
+ *
+ * 1. PSM call into driver with a chunk of buffer with virtual address
+ * and length.
+ * 2. driver pins the buffer pages, program hardware with the physical
+ * pages, get a list of tids.
+ * 3. driver caches the tids with the corresponding virtual address in
+ * user space for each tid, and return the list of tids back to PSM.
+ * 4. PSM also caches the list of tids with the corresponding virtual
+ * address for each tid, and use the list of tids for transmission.
+ * 5. when process frees a buffer, kernel VM will catch the event and
+ * calls the callback in driver to notify that the virtual address
+ * range is gone in the process.
+ * 6. driver will search its cache system and find the tids with the
+ * removed virtual address, put these tid in an invalidation queue
+ * and notify PSM the event.
+ * 7. PSM will pick the event and remove the tids from its own cache
+ * as well.
+ * 8. PSM must check such invalidation event every time before searching
+ * its caching system to match tids for a 'new' buffer chunk.
+ * 9. when the caching system is full, and a new buffer chunk is asked
+ * to register, PSM picks a victim to remove.
+ */
+
+/* Per-tid payload stored in each RB-tree node of the tid cache. */
+typedef struct
+{
+ unsigned long start; /* start virtual address */
+ uint32_t tidinfo; /* tid encoding */
+ uint16_t length; /* length in pages */
+ uint16_t invalidate; /* invalidate flag */
+ uint16_t refcount; /* usage reference count */
+ uint16_t i_prev; /* idle queue previous */
+ uint16_t i_next; /* idle queue next */
+} rbtree_tidcache_mapitem_pl_t;
+
+/* Map-level payload: cache occupancy counters. */
+typedef struct {
+ uint32_t ntid; /* tids are cached */
+ uint32_t nidle; /* tids are idle */
+} rbtree_tidcache_map_pl_t;
+
+#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t
+
+#include "rbtree.h"
+
+/*
+ * Macro definition for easy programming.
+ *
+ * All of these expect a local `p_map` naming the cache's cl_qmap_t;
+ * `x` is an index into the RB node array (index 0 is reserved).
+ */
+
+#define NTID p_map->payload.ntid
+#define REFCNT(x) p_map->root[x].payload.refcount
+#define INVALIDATE(x) p_map->root[x].payload.invalidate
+
+#define LENGTH(x) p_map->root[x].payload.length
+#define START(x) p_map->root[x].payload.start
+#define END(x) (START(x) + (LENGTH(x)<<12))
+
+/*
+ * Macro for idle tid queue management.
+ * Doubly-linked list threaded through the RB nodes with node 0 as the
+ * sentinel head; new idle entries go in at the head, so IPREV(IHEAD)
+ * is the oldest idle entry (eviction starts there).
+ */
+#define NIDLE p_map->payload.nidle
+#define IHEAD 0
+#define INEXT(x) p_map->root[x].payload.i_next
+#define IPREV(x) p_map->root[x].payload.i_prev
+
+#define IDLE_REMOVE(x) do { \
+ INEXT(IPREV(x)) = INEXT(x); \
+ IPREV(INEXT(x)) = IPREV(x); \
+ NIDLE--; \
+ } while (0)
+
+#define IDLE_INSERT(x) do { \
+ INEXT(x) = INEXT(IHEAD); \
+ IPREV(x) = IHEAD; \
+ IPREV(INEXT(IHEAD)) = x; \
+ INEXT(IHEAD) = x; \
+ NIDLE++; \
+ } while (0)
+
+extern void ips_tidcache_map_init(cl_qmap_t *p_map,
+ cl_map_item_t* const root,
+ cl_map_item_t* const nil_item);
+
+#endif
diff --git a/ptl_ips/ips_tidflow.c b/ptl_ips/ips_tidflow.c
new file mode 100644
index 0000000..06b9c58
--- /dev/null
+++ b/ptl_ips/ips_tidflow.c
@@ -0,0 +1,267 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/*
+ * Initialize the tidflow container @tfc for endpoint context @context.
+ *
+ * Allocates the tidrecvc descriptor array (one per hardware flow) and,
+ * unless the context supplies a shared-memory control structure for
+ * context sharing, allocates a private ips_tf_ctrl.  Only the master
+ * subcontext (subctxt == 0) initializes the shared control state and
+ * resets the hardware flows.  @cb is invoked later when flows become
+ * available again after exhaustion.
+ *
+ * Returns PSM2_OK on success, PSM2_NO_MEMORY on allocation failure.
+ */
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+ const psmi_context_t *context,
+ struct ips_tf *tfc,
+ ips_tf_avail_cb_fn_t cb)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ int tf_idx;
+
+#if TF_ADD
+ struct psmi_stats_entry entries[] = {
+ PSMI_STATS_DECL("tidflow update count",
+ MPSPAWN_STATS_REDUCTION_ALL,
+ NULL, &tfc->tf_num_total),
+ };
+#endif
+
+ tfc->context = context;
+ tfc->tf_num_total = 0;
+ tfc->tf_num_inuse = 0;
+ tfc->tf_avail_cb = cb;
+ tfc->tf_avail_context = (void *)protoexp;
+ /* Generation width depends on whether extended PSNs are enabled. */
+ if ((context->runtime_flags & HFI1_CAP_EXTENDED_PSN)) {
+ tfc->tf_gen_mask = 0xFFFFF;
+ } else {
+ tfc->tf_gen_mask = 0x1FFF;
+ }
+
+ /* Allocate and Initialize tidrecvc array. */
+ tfc->tidrecvc = (struct ips_tid_recv_desc *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS);
+ if (tfc->tidrecvc == NULL)
+ return PSM2_NO_MEMORY;
+
+ for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+ tfc->tidrecvc[tf_idx].context = context;
+ tfc->tidrecvc[tf_idx].protoexp = protoexp;
+ tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx;
+ tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx;
+ tfc->tidrecvc[tf_idx].tidflow.flowid = EP_FLOW_TIDFLOW;
+ tfc->tidrecvc[tf_idx].tidflow.frag_size = protoexp->proto->epinfo.ep_mtu;
+ }
+
+ /* Shared control structure, it will be in shared memory
+ * for context sharing, otherwise calloc() it */
+ tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl;
+ if (!tfc->tf_ctrl) {
+ tfc->tf_ctrl = (struct ips_tf_ctrl *)
+ psmi_calloc(context->ep, UNDEFINED, 1,
+ sizeof(struct ips_tf_ctrl));
+ if (tfc->tf_ctrl == NULL) {
+ /* Don't leak the tidrecvc array allocated above;
+ * NULL it so a later ips_tf_fini() is harmless. */
+ psmi_free(tfc->tidrecvc);
+ tfc->tidrecvc = NULL;
+ return PSM2_NO_MEMORY;
+ }
+ }
+
+ /*
+ * Only the master process can initialize.
+ */
+ if (ctxt_info->subctxt == 0) {
+ pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock,
+ PTHREAD_PROCESS_SHARED);
+ tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS;
+ tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS;
+
+ for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+ /* Update flow state */
+ tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED;
+ tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx;
+ tfc->tf_ctrl->tf[tf_idx].next_gen = 0;
+ tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1;
+
+ hfi_tidflow_reset(context->ctrl, tf_idx,
+ tfc->tf_gen_mask, 0x7FF);
+ }
+ tfc->tf_ctrl->tf_head = 0;
+ }
+
+#if TF_ADD
+ /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */
+ /* Fixed: was 'tidc', an undeclared identifier; the stats context
+ * for this module is 'tfc'. */
+ return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+ PSMI_STATSTYPE_TIDS,
+ entries,
+ PSMI_STATS_HOWMANY(entries), tfc);
+#else
+ return PSM2_OK;
+#endif
+}
+
+/*
+ * Tear down a tidflow container created by ips_tf_init().
+ * tf_ctrl is freed only when it was privately calloc'd by ips_tf_init()
+ * (i.e. the context did NOT supply a shared-memory tf_ctrl); a shared
+ * control structure is owned by the shared-memory segment, not by us.
+ */
+psm2_error_t ips_tf_fini(struct ips_tf *tfc)
+{
+ if (!tfc->context->tf_ctrl)
+ psmi_free(tfc->tf_ctrl);
+ psmi_free(tfc->tidrecvc);
+ return PSM2_OK;
+}
+
+/* Allocate a tidflow */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+ struct ips_tid_recv_desc **tidrecvc)
+{
+ struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+ struct ips_tf_entry *entry;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+ if (!ctrl->tf_num_avail) {
+ psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS);
+ *tidrecvc = NULL;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ return PSM2_EP_NO_RESOURCES;
+ }
+
+ entry = &ctrl->tf[ctrl->tf_head];
+ ctrl->tf_head = entry->next_free;
+ ctrl->tf_num_avail--;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ tfc->tf_num_total++;
+ tfc->tf_num_inuse++;
+
+ psmi_assert(entry->state == TF_STATE_DEALLOCATED);
+ entry->state = TF_STATE_ALLOCATED;
+
+ *tidrecvc = &(tfc->tidrecvc[entry->tf_idx]);
+ /* initial tidflow generation */
+ (*tidrecvc)->tidflow_active_gen = entry->next_gen;
+
+ psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx);
+ psmi_assert_always(entry->next_gen < tfc->tf_gen_mask);
+
+ entry->next_gen++;
+ if (entry->next_gen == tfc->tf_gen_mask)
+ entry->next_gen = 0;
+
+ return PSM2_OK;
+}
+
+/* Deallocate a tidflow */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx)
+{
+ struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+ struct ips_tf_entry *entry;
+
+ psmi_assert(tf_idx < HFI_TF_NFLOWS);
+ psmi_assert(tf_idx >= 0);
+
+ entry = &ctrl->tf[tf_idx];
+ psmi_assert(entry->state == TF_STATE_ALLOCATED);
+ entry->state = TF_STATE_DEALLOCATED;
+
+ /*
+ * The wire protocol only uses 16bits tidrecvc generation
+ * count in exptid packet, this should be bigger enough,
+ * u16w3 is the lower 16bits of _desc_genc
+ */
+ tfc->tidrecvc[tf_idx].rdescid.u16w3++;
+
+ /* Mark invalid generation for flow (stale packets will be dropped) */
+ hfi_tidflow_reset(tfc->context->ctrl, tf_idx, tfc->tf_gen_mask, 0x7FF);
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+ entry->next_free = ctrl->tf_head;
+ ctrl->tf_head = tf_idx;
+ ctrl->tf_num_avail++;
+
+ if (tfc->context->tf_ctrl)
+ pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+ tfc->tf_num_inuse--;
+ /* If an available callback is registered invoke it */
+ if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb)
+ tfc->tf_avail_cb(tfc, tfc->tf_avail_context);
+
+ return PSM2_OK;
+}
+
+/* Allocate a generation for a flow */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+ uint32_t tf_idx, uint32_t *tfgen)
+{
+ struct ips_tf_entry *entry;
+ int ret = PSM2_OK;
+
+ psmi_assert(tf_idx < HFI_TF_NFLOWS);
+ psmi_assert(tf_idx >= 0);
+
+ entry = &tfc->tf_ctrl->tf[tf_idx];
+ psmi_assert(entry->state == TF_STATE_ALLOCATED);
+
+ *tfgen = entry->next_gen;
+
+ entry->next_gen++;
+ if (entry->next_gen == tfc->tf_gen_mask)
+ entry->next_gen = 0;
+
+ psmi_assert_always(*tfgen < tfc->tf_gen_mask);
+
+ return ret;
+}
diff --git a/ptl_ips/ips_tidflow.h b/ptl_ips/ips_tidflow.h
new file mode 100644
index 0000000..5578dc5
--- /dev/null
+++ b/ptl_ips/ips_tidflow.h
@@ -0,0 +1,133 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_TIDFLOW_H
+#define _IPS_TIDFLOW_H
+
+#include "psm_user.h"
+
+struct ips_tf;
+struct ips_protoexp;
+
+/* Callback invoked when tidflows become available again after
+ * exhaustion; 'context' is the pointer registered at init time. */
+typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context);
+/* Lifecycle state of a single tidflow entry. */
+typedef enum {
+ TF_STATE_INVALID = 0,
+ TF_STATE_ALLOCATED = 1,
+ TF_STATE_DEALLOCATED = 2
+} tf_state_t;
+
+/* One hardware tidflow: state, its own index, the next generation to
+ * hand out, and the free-list link (index of the next free entry). */
+struct ips_tf_entry {
+ tf_state_t state;
+ uint32_t tf_idx;
+ uint32_t next_gen;
+ uint32_t next_free;
+};
+
+/* Control state shared by all subcontexts of a context-sharing group
+ * (placed in shared memory in that case); the spinlock protects the
+ * free list (tf_head / tf_num_avail / next_free). */
+struct ips_tf_ctrl {
+ pthread_spinlock_t tf_ctrl_lock;
+ uint32_t tf_num_max; /* total number of flows (HFI_TF_NFLOWS) */
+ uint32_t tf_num_avail; /* flows currently on the free list */
+ uint32_t tf_head; /* free-list head index */
+ struct ips_tf_entry tf[HFI_TF_NFLOWS];
+} __attribute__ ((aligned(64)));
+
+/* Per-process tidflow container. */
+struct ips_tf {
+ const psmi_context_t *context;
+ ips_tf_avail_cb_fn_t tf_avail_cb; /* fires when flows free up */
+ void *tf_avail_context;
+ struct ips_tf_ctrl *tf_ctrl; /* shared or privately calloc'd */
+
+ uint64_t tf_num_total; /* cumulative allocations (stats) */
+ uint32_t tf_num_inuse; /* flows this process holds now */
+ uint32_t tf_gen_mask; /* generation wrap mask (PSN-size dependent) */
+
+#ifdef PSM_CUDA
+ void *host_to_gpu_bounce_buf_pool;
+#endif
+
+ /* Pointer to array of size HFI_TF_NFLOWS */
+ struct ips_tid_recv_desc *tidrecvc;
+};
+
+/*
+ * How many tidflows are available to allocate?
+ * Returns the shared free count when nonzero; when zero, returns -1 if
+ * this process itself holds every flow (nothing will free up from
+ * elsewhere), or 0 if some other subcontext holds flows that may be
+ * released.
+ */
+PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf))
+{
+ /* Fast path: free flows exist right now. */
+ if (tf->tf_ctrl->tf_num_avail != 0)
+ return tf->tf_ctrl->tf_num_avail;
+
+ /* None free: distinguish "we own them all" from "others own some". */
+ return (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse) ? -1 : 0;
+}
+
+/* Initialize/tear down the tidflow container; 'cb' is invoked when
+ * flows become available again after exhaustion. */
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+ const psmi_context_t *context,
+ struct ips_tf *tfc,
+ ips_tf_avail_cb_fn_t cb);
+psm2_error_t ips_tf_fini(struct ips_tf *tfc);
+
+/* Allocate a tidflow; on success *tidrecvc points at its receive
+ * descriptor, else PSM2_EP_NO_RESOURCES with *tidrecvc = NULL. */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+ struct ips_tid_recv_desc **tidrecvc);
+
+/* Deallocate a tidflow previously returned by ips_tf_allocate() */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx);
+
+/* Allocate a generation for a flow; *tfgen receives the generation */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+ uint32_t tf_idx, uint32_t *tfgen);
+
+#endif
diff --git a/ptl_ips/ips_writehdrq.c b/ptl_ips/ips_writehdrq.c
new file mode 100644
index 0000000..1bb8697
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.c
@@ -0,0 +1,110 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "ips_writehdrq.h"
+
+/*
+ * Initialize a write header queue: the queue a process uses to forward
+ * received headers/payloads to a subcontext sharing this port.
+ * 'state' lives in shared memory (see ips_writehdrq_state) and is only
+ * referenced, never copied; the hdrq/egrq params are deep-copied.
+ */
+psm2_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ struct ips_writehdrq *writeq,
+ struct ips_writehdrq_state *state, uint32_t runtime_flags)
+{
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+
+ memset(writeq, 0, sizeof(*writeq));
+ writeq->context = context;
+ writeq->state = state;
+ writeq->hdrq = *hdrq_params; /* deep copy */
+ /* offset (in dwords) of the last element in the header queue */
+ writeq->hdrq_elemlast =
+ ((writeq->hdrq.elemcnt - 1) * writeq->hdrq.elemsz);
+ writeq->egrq = *egrq_params; /* deep copy */
+ /* NOTE(review): allocation result is not checked here — presumably
+ * the helper aborts internally on failure; verify. */
+ writeq->egrq_buftable =
+ ips_recvq_egrbuf_table_alloc(context->ep, writeq->egrq.base_addr,
+ writeq->egrq.elemcnt,
+ writeq->egrq.elemsz);
+ writeq->runtime_flags = runtime_flags;
+ /* RHF sits in the last 8 bytes of each rcvhdrq entry */
+ writeq->hdrq_rhf_off =
+ (ctxt_info->rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT;
+
+ if (writeq->runtime_flags & HFI1_CAP_DMA_RTAIL) {
+ /* DMA_RTAIL: copy the full header entry, RHF included */
+ writeq->hdrq_hdr_copysz =
+ writeq->hdrq.elemsz * sizeof(uint32_t);
+ writeq->state->hdrq_rhf_seq = 0; /* _seq is ignored */
+ } else {
+ writeq->state->hdrq_rhf_seq = 1;
+ /*
+ * We don't allow readers to see the RHF until the writer can
+ * atomically write an updated RHF.
+ */
+ writeq->hdrq_hdr_copysz =
+ (writeq->hdrq.elemsz - 2) * sizeof(uint32_t);
+ /*
+ * Ensure 8-byte alignment of the RHF by looking at RHF of the second
+ * header, which is required for atomic RHF updates.
+ */
+ psmi_assert_always(!((uintptr_t) (writeq->hdrq.base_addr +
+ writeq->hdrq.elemsz +
+ writeq->hdrq_rhf_off) & 0x7));
+ }
+ writeq->state->enabled = 1;
+ return PSM2_OK;
+}
+
+/* Release the eager index-to-pointer table; the shared 'state' and the
+ * queue memory itself are owned elsewhere and are not touched here. */
+psm2_error_t ips_writehdrq_fini(struct ips_writehdrq *writeq)
+{
+ ips_recvq_egrbuf_table_free(writeq->egrq_buftable);
+ return PSM2_OK;
+}
diff --git a/ptl_ips/ips_writehdrq.h b/ptl_ips/ips_writehdrq.h
new file mode 100644
index 0000000..ff95000
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.h
@@ -0,0 +1,269 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_WRITEHDRQ_H
+#define _IPS_WRITEHDRQ_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_recvq.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Structure containing state for writehdrq writing. This is logically
+ * part of ips_writehdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the port. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+struct ips_writehdrq_state {
+ uint32_t hdrq_rhf_seq; /* last seq (unused when DMA_RTAIL) */
+ uint32_t egrq_offset; /* in bytes unit, not 64B */
+ uint32_t enabled; /* enables writing */
+};
+
+/* Per-process view of one subcontext's write header queue.  Unlike the
+ * state above, this may hold pointers since it is private memory. */
+struct ips_writehdrq {
+ const psmi_context_t *context;
+ struct ips_writehdrq_state *state; /* shared-memory state, see above */
+ struct ips_recvq_params hdrq;
+ uint32_t hdrq_elemlast; /* dword offset of last hdrq element */
+ uint32_t hdrq_rhf_off; /* rhf offset (dwords) within an element */
+ uint32_t hdrq_hdr_copysz; /* bytes of header copied per packet */
+ struct ips_recvq_params egrq;
+ void **egrq_buftable; /* table of eager idx-to-ptr */
+ uint32_t runtime_flags;
+};
+
+/* Initialize a write header queue; 'state' must live in shared memory
+ * when contexts are shared.  Returns PSM2_OK. */
+psm2_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+ const struct ips_recvq_params *hdrq_params,
+ const struct ips_recvq_params *egrq_params,
+ struct ips_writehdrq *writeq,
+ struct ips_writehdrq_state *state, uint32_t runtime_flags);
+
+/* Release resources allocated by ips_writehdrq_init() */
+psm2_error_t ips_writehdrq_fini(struct ips_writehdrq *writeq);
+
+/*
+ * Publish a finished RHF to the reader with a single 8-byte store.
+ * Init asserts 8-byte alignment of the destination, so on x86-64 this
+ * store is not torn and the reader sees either the old or the new RHF
+ * in full.  NOTE(review): the uint32_t* -> uint64_t* casts technically
+ * violate strict aliasing; presumably the build disables that
+ * optimization (-fno-strict-aliasing) — confirm against the Makefile.
+ */
+PSMI_ALWAYS_INLINE(
+void
+ips_writehdrq_write_rhf_atomic(uint32_t *rhf_dest, uint32_t *rhf_src))
+{
+ /*
+ * In 64-bit mode, we check in init that the rhf will always be 8-byte
+ * aligned
+ */
+ *((uint64_t *) rhf_dest) = *((uint64_t *) rhf_src);
+ return;
+}
+
+/*
+ * Forward one eager packet's payload (and header) to a subcontext's
+ * eager ring.  Payload is packed at 64B-aligned offsets within eager
+ * entries; when the current entry's remainder is too small we advance
+ * the tail to the next entry and retry.  On success the RHF is fixed
+ * up with the subcontext-local eager index/offset and
+ * IPS_RECVHDRQ_CONTINUE is returned; if the eager ring is full the
+ * payload is dropped, the header is marked with a TID error, and
+ * IPS_RECVHDRQ_BREAK is returned.
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_write_eager_packet(struct ips_writehdrq *writeq, uint32_t *write_hdr,
+ uint32_t *write_rhf,
+ const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t write_egr_tail = ips_recvq_tail_get(&writeq->egrq);
+ uint32_t next_write_egr_tail = write_egr_tail;
+ /* checksum is trimmed from paylen, we need to add back */
+ uint32_t rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+ (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0);
+ psmi_assert(rcv_paylen > 0);
+
+ /* Loop as long as the write eager queue is NOT full */
+ while (1) {
+ /* ring-buffer advance of the candidate tail */
+ next_write_egr_tail++;
+ if (next_write_egr_tail >= writeq->egrq.elemcnt)
+ next_write_egr_tail = 0;
+ if (next_write_egr_tail == ips_recvq_head_get(&writeq->egrq)) {
+ break; /* queue full: fall through to the drop path */
+ }
+
+ /* Move to next eager entry if leftover is not enough */
+ if ((writeq->state->egrq_offset + rcv_paylen) >
+ writeq->egrq.elemsz) {
+ writeq->state->egrq_offset = 0;
+ write_egr_tail = next_write_egr_tail;
+
+ /* Update the eager buffer tail pointer */
+ ips_recvq_tail_update(&writeq->egrq, write_egr_tail);
+ } else {
+ /* There is enough space in this entry! */
+ /* Use pre-calculated address from look-up table */
+ char *write_payload =
+ ips_recvq_egr_index_2_ptr(writeq->egrq_buftable,
+ write_egr_tail,
+ writeq->state->
+ egrq_offset);
+ const char *rcv_payload =
+ ips_recvhdrq_event_payload(rcv_ev);
+
+ psmi_assert(write_payload != NULL);
+ psmi_assert(rcv_payload != NULL);
+ psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen);
+
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr,
+ writeq->hdrq_hdr_copysz);
+
+ /* Fix up the header with the subcontext's eager index/offset */
+ hfi_hdrset_egrbfr_index((uint32_t *) write_rhf,
+ write_egr_tail);
+ /* offset is carried in 64B units, hence the >>6 */
+ hfi_hdrset_egrbfr_offset((uint32_t *) write_rhf,
+ (writeq->state->
+ egrq_offset >> 6));
+
+ /* Update offset to next 64B boundary */
+ writeq->state->egrq_offset =
+ (writeq->state->egrq_offset + rcv_paylen +
+ 63) & (~63);
+ return IPS_RECVHDRQ_CONTINUE;
+ }
+ }
+
+ /* At this point, the eager queue is full -- drop the packet. */
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr, writeq->hdrq_hdr_copysz);
+
+ /* Mark header with ETIDERR (eager overflow) */
+ hfi_hdrset_err_flags(write_rhf, HFI_RHF_TIDERR);
+
+ /* Clear UseEgrBfr bit because payload is dropped */
+ hfi_hdrset_use_egrbfr(write_rhf, 0);
+ return IPS_RECVHDRQ_BREAK;
+}
+
+/*
+ * Append one received packet to a subcontext's write header queue
+ * (and, for eager packets, to its eager ring via
+ * ips_write_eager_packet).  Returns IPS_RECVHDRQ_CONTINUE on success
+ * or IPS_RECVHDRQ_BREAK when the packet was dropped (queue disabled
+ * or full).  Without DMA_RTAIL, the RHF is staged in a local copy and
+ * published last with a single atomic 8-byte store, after a write
+ * barrier, so the reader never observes a half-written entry.
+ */
+PSMI_INLINE(
+int
+ips_writehdrq_append(struct ips_writehdrq *writeq,
+ const struct ips_recvhdrq_event *rcv_ev))
+{
+ uint32_t write_hdr_head;
+ uint32_t write_hdr_tail;
+ uint32_t *write_hdr;
+ uint32_t *write_rhf;
+ uint32_t next_write_hdr_tail;
+ /* local staging area for the RHF (non-DMA_RTAIL path) */
+ union {
+ uint32_t u32[2];
+ uint64_t u64;
+ } rhf;
+ int result = IPS_RECVHDRQ_CONTINUE;
+
+ /* Drop packet if write header queue is disabled */
+ if (!writeq->state->enabled) {
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ write_hdr_head = ips_recvq_head_get(&writeq->hdrq);
+ write_hdr_tail = ips_recvq_tail_get(&writeq->hdrq);
+ write_hdr = writeq->hdrq.base_addr + write_hdr_tail;
+ write_rhf = write_hdr + writeq->hdrq_rhf_off;
+
+ /* Drop packet if write header queue is full */
+ next_write_hdr_tail = write_hdr_tail + writeq->hdrq.elemsz;
+ if (next_write_hdr_tail > writeq->hdrq_elemlast) {
+ next_write_hdr_tail = 0;
+ }
+ if (next_write_hdr_tail == write_hdr_head) {
+ return IPS_RECVHDRQ_BREAK;
+ }
+
+ /*
+ * If not DMA_RTAIL, don't let consumer see RHF until it's ready.
+ * We copy the source rhf and operate on it until we are ready
+ * to atomically update it for the reader.
+ */
+ if (!(writeq->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ write_rhf = &rhf.u32[0];
+ rhf.u64 = *((uint64_t *) rcv_ev->rhf);
+ }
+
+ if (hfi_hdrget_use_egrbfr(rcv_ev->rhf)) {
+ result = ips_write_eager_packet(writeq,
+ write_hdr, write_rhf, rcv_ev);
+ } else {
+ /* Copy the header to the subcontext's header queue */
+ psmi_mq_mtucpy(write_hdr, rcv_ev->rcv_hdr,
+ writeq->hdrq_hdr_copysz);
+ }
+
+ /* Ensure previous writes are visible before writing rhf seq or tail */
+ ips_wmb();
+
+ if (!(writeq->runtime_flags & HFI1_CAP_DMA_RTAIL)) {
+ /* We accumulated a few changes to the RHF and now want to make it
+ * atomically visible for the reader.
+ */
+ uint32_t rhf_seq = writeq->state->hdrq_rhf_seq;
+ hfi_hdrset_seq((uint32_t *) write_rhf, rhf_seq);
+ /* seq counts 1..LAST_RHF_SEQNO, then wraps back to 1 */
+ if (rhf_seq >= LAST_RHF_SEQNO)
+ writeq->state->hdrq_rhf_seq = 1;
+ else
+ writeq->state->hdrq_rhf_seq = rhf_seq + 1;
+
+ /* Now write the new rhf */
+ ips_writehdrq_write_rhf_atomic(write_hdr + writeq->hdrq_rhf_off,
+ write_rhf);
+ }
+
+ /* The tail must be updated regardless of HFI1_CAP_DMA_RTAIL
+ * since this tail is also used to keep track of where
+ * ips_writehdrq_append will write to next. For subcontexts there is
+ * no separate shadow copy of the tail. */
+ ips_recvq_tail_update(&writeq->hdrq, next_write_hdr_tail);
+
+ return result;
+}
+
+#endif /* _IPS_WRITEHDRQ_H */
diff --git a/ptl_ips/ipserror.c b/ptl_ips/ipserror.c
new file mode 100644
index 0000000..608b73e
--- /dev/null
+++ b/ptl_ips/ipserror.c
@@ -0,0 +1,200 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* IPS - Interconnect Protocol Stack */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include "ipserror.h"
+
+/*
+ * Map an IPS_RC_* / IPS_RCPERF_* return code to a human-readable string.
+ *
+ * Most codes return a string literal; IPS_RC_SYSERR and unknown codes
+ * are formatted into a static buffer.  NOTE: the static buffer makes
+ * those two paths non-reentrant/non-thread-safe; callers must not hold
+ * the returned pointer across another ips_err_str() call from a
+ * different thread.
+ */
+char *ips_err_str(int ips_error)
+{
+ static char err_str[128];
+
+ switch (ips_error) {
+ case IPS_RC_OK:
+ return "OK!";
+
+ case IPS_RC_ERROR:
+ return "general error";
+
+ case IPS_RC_PENDING:
+ return "request pending";
+
+ case IPS_RC_EXIST:
+ /* fixed message typo: "entry exist" */
+ return "entry exists";
+
+ case IPS_RC_MAX_ENTRIES_EXCEEDED:
+ return "max entries has been exceeded";
+
+ case IPS_RC_NOT_ENOUGH_BUFFERS:
+ return "not enough buffers to complete request";
+
+ case IPS_RC_NO_FREE_MEM:
+ return "no free memory";
+
+ case IPS_RC_NAME_LOOKUP_FAILED:
+ return "name lookup failed";
+
+ case IPS_RC_PARAM_ERROR:
+ return "invalid parameter";
+
+ case IPS_RC_UNKNOWN_DEVICE:
+ return "unknown device";
+
+ case IPS_RC_DEVICE_INIT_FAILED:
+ return "device init failed";
+
+ case IPS_RC_DATA_TRUNCATED:
+ return "data truncated";
+
+ case IPS_RC_INVALID_RANK:
+ return "invalid rank";
+
+ case IPS_RC_INVALID_OPCODE:
+ return "invalid op code";
+
+ case IPS_RC_PEER_NOT_READY:
+ return "peer is not ready";
+
+ case IPS_RC_PEER_CLOSED:
+ return "peer is closed";
+
+ case IPS_RC_DEST_EQUAL_LOCAL_RANK:
+ return "src and dest rank is equal";
+
+ case IPS_RC_DEVICE_ERROR:
+ return
+ "OPA hardware not found, hardware problem, or disabled";
+
+ case IPS_RC_NETWORK_DOWN:
+ return "The link is down";
+
+ case IPS_RC_NOT_ENOUGH_FREE_TIDS:
+ return "Not enough free TIDS to complete request";
+
+ case IPS_RC_NO_RESOURCE_AVAILABLE:
+ return "Internal resources exhausted";
+
+ case IPS_RC_HW_UPDATE_FAILED:
+ /* fixed message typo: "rendevous" */
+ return "Failed TID update for rendezvous, allocation problem";
+
+ case IPS_RC_PARTITION_ERROR:
+ return "One or more nodes is on a different partition";
+
+ case IPS_RC_RUN_ERROR:
+ return "One or more nodes is still running the previous job";
+
+ case IPS_RC_ALREADY_OPEN:
+ return "Open/init has already been called";
+
+ case IPS_RC_WAS_CLOSED:
+ return "Close has already been called";
+
+ case IPS_RC_DEST_EQUAL_LOCAL_LID:
+ return "src and dest LID is equal";
+
+ case IPS_RC_BUFFER_ALIGMENT_ERROR:
+ return "Buffer start address is not 32 bit aligned";
+
+ case IPS_RC_LENGTH_ALIGMENT_ERROR:
+ return "Buffer length is not a whole # of 32 bit words";
+
+ case IPS_RC_INVALID_DATA_LENGTH:
+ return "invalid data length";
+
+ case IPS_RC_BUSY:
+ return "Device is busy";
+
+ case IPS_RC_INIT_TIMEOUT_EXPIRED:
+ return "Could not connect to other nodes";
+
+ case IPS_RC_NO_PORTS_AVAILABLE:
+ return "All OPA ports are in use.";
+
+ /* Performance Counters codes */
+ case IPS_RCPERF_INIT_FAILED:
+ return "Initialization of performance counters failed";
+
+ case IPS_RCPERF_EVENT_SETUP_FAILED:
+ return "Setting performance counter events failed";
+
+ case IPS_RCPERF_REG_DEFAULT_SET:
+ return "Default event set for one of the counters";
+
+ case IPS_RCPERF_UNSUPPORTED_CPU:
+ return "This CPU type is not supported";
+
+ case IPS_RCPERF_REG_GET_FAILED:
+ return "Failed to get register value for event";
+
+ case IPS_RCPERF_SET_EVENT_STR_FAILED:
+ return "Failed to find event description";
+
+ case IPS_RCPERF_INVALID_REGISTER:
+ return "Register index out of range of available counters";
+
+ case IPS_RC_SYSERR: /* we hope errno hasn't changed since this was set... */
+ snprintf(err_str, sizeof(err_str), "System error: %s",
+ strerror(errno));
+ return err_str;
+
+ default:
+ snprintf(err_str, sizeof(err_str),
+ "Error code %i: <no interpretation>", ips_error);
+ return err_str;
+ }
+}
diff --git a/ptl_ips/ipserror.h b/ptl_ips/ipserror.h
new file mode 100644
index 0000000..685f617
--- /dev/null
+++ b/ptl_ips/ipserror.h
@@ -0,0 +1,122 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/*
+ * interface to OPA Interconnect Protocol Stack
+ *
+ * This file contains the function prototypes of the interconnect protocol
+ * stack. It should be included in all the clients of the stack, such as MPI.
+ */
+
+#ifndef ipserror_h
+#define ipserror_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Return codes */
+/* Success / generic / in-progress codes. */
+#define IPS_RC_OK 0
+#define IPS_RC_ERROR (-1)
+#define IPS_RC_PENDING (-2)
+#define IPS_RC_EXIST (-3)
+#define IPS_RC_MAX_ENTRIES_EXCEEDED (-4)
+/* Detailed failure codes (-100 and below); each has a matching string in
+ * ips_err_str(). NOTE(review): "ALIGMENT" (sic) is the historical spelling
+ * of these identifiers and is part of the public API — do not rename. */
+#define IPS_RC_NOT_ENOUGH_BUFFERS (-100)
+#define IPS_RC_NO_FREE_MEM (-101)
+#define IPS_RC_NAME_LOOKUP_FAILED (-102)
+#define IPS_RC_PARAM_ERROR (-103)
+#define IPS_RC_UNKNOWN_DEVICE (-104)
+#define IPS_RC_DEVICE_INIT_FAILED (-105)
+#define IPS_RC_DATA_TRUNCATED (-106)
+#define IPS_RC_INVALID_RANK (-107)
+#define IPS_RC_INVALID_OPCODE (-108)
+#define IPS_RC_PEER_NOT_READY (-109)
+#define IPS_RC_PEER_CLOSED (-110)
+#define IPS_RC_DEST_EQUAL_LOCAL_RANK (-111)
+#define IPS_RC_DEVICE_ERROR (-112)
+#define IPS_RC_NETWORK_DOWN (-113)
+#define IPS_RC_NOT_ENOUGH_FREE_TIDS (-114)
+#define IPS_RC_NO_RESOURCE_AVAILABLE (-115)
+#define IPS_RC_HW_UPDATE_FAILED (-116)
+#define IPS_RC_PARTITION_ERROR (-117)
+#define IPS_RC_RUN_ERROR (-118)
+#define IPS_RC_ALREADY_OPEN (-119)
+#define IPS_RC_WAS_CLOSED (-120)
+#define IPS_RC_DEST_EQUAL_LOCAL_LID (-121)
+#define IPS_RC_BUFFER_ALIGMENT_ERROR (-122)
+#define IPS_RC_LENGTH_ALIGMENT_ERROR (-123)
+#define IPS_RC_INVALID_DATA_LENGTH (-124)
+#define IPS_RC_BUSY (-125)
+#define IPS_RC_INIT_TIMEOUT_EXPIRED (-126)
+#define IPS_RC_NO_PORTS_AVAILABLE (-127)
+#define IPS_RC_TRANSFER_INCOMPLETE (-128)
+#define IPS_RC_SYSERR (-129) /* errno has meaning, if no further errors since this error */
+#define IPS_RC_STARTUP_ERR (-130)
+
+/* Performance Counters Error Codes */
+#define IPS_RCPERF_INIT_FAILED (-200)
+#define IPS_RCPERF_EVENT_SETUP_FAILED (-201)
+#define IPS_RCPERF_REG_DEFAULT_SET (-202)
+#define IPS_RCPERF_UNSUPPORTED_CPU (-203)
+#define IPS_RCPERF_REG_GET_FAILED (-204)
+#define IPS_RCPERF_SET_EVENT_STR_FAILED (-205)
+#define IPS_RCPERF_INVALID_REGISTER (-206)
+
+/* Map an IPS_RC_* / IPS_RCPERF_* code to a human-readable message.
+ * Returns a pointer to static storage; not thread-safe for the
+ * IPS_RC_SYSERR and unknown-code cases (shared static buffer). */
+char *ips_err_str(int);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* ipserror_h */
diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c
new file mode 100644
index 0000000..01a0c3f
--- /dev/null
+++ b/ptl_ips/ptl.c
@@ -0,0 +1,950 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file implements the PSM PTL for ips */
+#include "psm_user.h"
+#include "ptl_ips.h"
+#include "ipserror.h"
+
+int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250
+
+#define HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED 6
+#define HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED 2
+
+/*
+ * No-op subcontext callback: discard the event for the foreign subcontext
+ * and tell the receive-queue loop to keep draining.
+ */
+static int ips_subcontext_ignore(const struct ips_recvhdrq_event *rcv_ev,
+				 uint32_t subcontext)
+{
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/*
+ * Subcontext callback used when context sharing is active: forward a packet
+ * that belongs to another subcontext into that subcontext's write header
+ * queue; drop (and log) packets that are for us or out of range.
+ */
+static
+int
+ips_subcontext_process(const struct ips_recvhdrq_event *rcv_ev,
+ uint32_t subcontext)
+{
+ struct ptl_shared *recvshc = rcv_ev->proto->ptl->recvshc;
+ /* Forward only if the target is a different, valid subcontext. */
+ if_pt(subcontext != recvshc->subcontext &&
+ subcontext < recvshc->subcontext_cnt) {
+ return ips_writehdrq_append(&recvshc->writeq[subcontext],
+ rcv_ev);
+ }
+ else {
+ _HFI_VDBG
+ ("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n",
+ (int)subcontext, (int)recvshc->subcontext_cnt,
+ (int)recvshc->subcontext, (unsigned)rcv_ev->error_flags);
+ return IPS_RECVHDRQ_BREAK;
+ }
+}
+
+/*
+ * Fill in the header-queue (hdrq) and eager-queue (egrq) parameters for a
+ * receive queue, either from the real HW register mappings (is_shared_context
+ * == 0) or from the per-subcontext software mirror pages (is_shared_context
+ * != 0, indexed by 'subcontext').
+ */
+static
+void
+recvhdrq_hw_params(const psmi_context_t *context,
+ struct ips_recvq_params *hdrq,
+ struct ips_recvq_params *egrq,
+ int is_shared_context, int subcontext)
+{
+ const struct hfi1_ctxt_info *cinfo = &context->ctrl->ctxt_info;
+ const struct hfi1_base_info *binfo = &context->ctrl->base_info;
+
+ hdrq->elemcnt = cinfo->rcvhdrq_cnt;
+ /* dwords */
+ hdrq->elemsz = cinfo->rcvhdrq_entsize >> BYTE2DWORD_SHIFT;
+
+ egrq->elemcnt = cinfo->egrtids;
+ /* bytes */
+ egrq->elemsz = cinfo->rcvegr_size;
+
+ if (!is_shared_context) {
+ volatile uint64_t *uregbase = /* HW registers */
+ (volatile uint64_t *)(uintptr_t) binfo->user_regbase;
+
+ hdrq->base_addr =
+ (uint32_t *) (uintptr_t) binfo->rcvhdr_bufbase;
+ hdrq->head_register =
+ (volatile __le64 *)&uregbase[ur_rcvhdrhead];
+ hdrq->tail_register =
+ (volatile __le64 *)(uintptr_t) binfo->rcvhdrtail_base;
+
+ egrq->base_addr = (void *)(uintptr_t) binfo->rcvegr_bufbase;
+ egrq->head_register =
+ (volatile __le64 *)&uregbase[ur_rcvegrindexhead];
+ egrq->tail_register =
+ (volatile __le64 *)&uregbase[ur_rcvegrindextail];
+ } else {
+ /* Subcontexts mimic the HW registers but use different addresses
+ * to avoid cache contention. */
+ volatile uint64_t *subcontext_uregbase;
+ uint32_t *rcv_hdr, *rcv_egr;
+ unsigned hdrsize, egrsize;
+ unsigned pagesize = getpagesize();
+ /* i is the page-alignment mask used to round sizes up below. */
+ unsigned i = pagesize - 1;
+
+ /* Round each region up to a whole number of pages. */
+ hdrsize =
+ (cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize + i) & ~i;
+ egrsize =
+ (cinfo->egrtids * cinfo->rcvegr_size + i) & ~i;
+
+ subcontext_uregbase = (uint64_t *)
+ (((uintptr_t) binfo->subctxt_uregbase) +
+ (sizeof(struct ips_subcontext_ureg) * subcontext));
+ rcv_hdr = (uint32_t *)
+ (((uintptr_t) binfo->subctxt_rcvhdrbuf +
+ (hdrsize * subcontext)));
+ rcv_egr = (uint32_t *)
+ (((uintptr_t) binfo->subctxt_rcvegrbuf +
+ (egrsize * subcontext)));
+
+ hdrq->base_addr = rcv_hdr;
+ /* NOTE(review): the '* 8' stride presumably spaces the mirrored
+ * registers one cacheline apart (8 x uint64_t = 64 bytes) to avoid
+ * false sharing — confirm against struct ips_subcontext_ureg. */
+ hdrq->head_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvhdrhead * 8];
+ hdrq->tail_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvhdrtail * 8];
+
+ egrq->base_addr = rcv_egr;
+ egrq->head_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvegrindexhead *
+ 8];
+ egrq->tail_register =
+ (volatile __le64 *)&subcontext_uregbase[ur_rcvegrindextail *
+ 8];
+ }
+}
+
+static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context);
+static psm2_error_t shrecvq_fini(ptl_t *ptl);
+
+/* Size a caller must allocate for one ips PTL instance (vtable hook). */
+static size_t ips_ptl_sizeof(void)
+{
+	return sizeof(ptl_t);
+}
+
+/* Number of uint64_t counters exported per endpoint address. */
+static int ips_ptl_epaddr_stats_num(void)
+{
+	return (int)(sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t));
+}
+
+/*
+ * Populate the per-epaddr statistics descriptions and reduction flags for
+ * mpspawn. Returns the number of counters.
+ *
+ * NOTE(review): exactly 10 descriptions are hard-coded below; this assumes
+ * struct ips_proto_epaddr_stats holds exactly 10 uint64_t counters — if a
+ * counter is added there, desc[] entries beyond index 9 stay uninitialized.
+ * TODO confirm against the struct definition.
+ */
+static
+int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags)
+{
+ int num_stats =
+ sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+ int i;
+
+ /* All stats are uint64_t */
+ for (i = 0; i < num_stats; i++)
+ flags[i] = MPSPAWN_STATS_REDUCTION_ALL |
+ MPSPAWN_STATS_SKIP_IF_ZERO;
+
+ desc[0] = "errchecks sent";
+ desc[1] = "errchecks recv";
+ desc[2] = "naks sent";
+ desc[3] = "naks recv";
+ desc[4] = "connect reqs sent";
+ desc[5] = "disconnect reqs sent";
+ desc[6] = "tid grants sent";
+ desc[7] = "tid grants recv";
+ desc[8] = "send rexmit";
+ desc[9] = "congestion packets";
+
+ return num_stats;
+}
+
+/*
+ * Snapshot the endpoint's protocol counters into stats_o.
+ * Returns the number of uint64_t counters copied.
+ */
+int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o)
+{
+	const int num_stats =
+	    sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+
+	/* The stats struct is laid out as a flat array of uint64_t. */
+	memcpy(stats_o, &epaddr->proto->epaddr_stats,
+	       num_stats * sizeof(uint64_t));
+
+	return num_stats;
+}
+
+/*
+ * Periodic timer callback: re-check the context/link status, let the PIO
+ * layer process pending events, then re-arm this timer for the next
+ * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS interval.
+ */
+static
+psm2_error_t
+psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current)
+{
+ struct ptl *ptl = (struct ptl *)t->context;
+ const uint64_t current_count = get_cycles();
+ psm2_error_t err;
+
+ err = psmi_context_check_status(ptl->context);
+ /* Only poke the spio event handler when the context itself is healthy. */
+ if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS)
+ err = ips_spio_process_events(ptl);
+
+ /* Unconditionally re-arm; this timer runs for the lifetime of the ptl. */
+ psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+ current_count + ptl->status_cyc_timeout);
+
+ return err;
+}
+
+/*
+ * Check whether a message whose size is not a multiple of a double word may
+ * be passed to the driver for SDMA. Starting with driver version 6.2, PSM
+ * may hand the driver an SDMA message whose size is not a double-word
+ * multiple.
+ */
+ustatic
+void ips_ptl_non_dw_mul_sdma_init(void)
+{
+	const uint16_t major = hfi_get_user_major_version();
+	const uint16_t minor = hfi_get_user_minor_version();
+	int allowed;
+
+	/* Non-double-word-multiple SDMA sizes are supported by driver
+	 * user-interface versions >= 6.2. */
+	allowed = (major > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) ||
+	    ((major == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) &&
+	     (minor >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED));
+
+	ips_proto_mq_set_non_dw_mul_sdma(allowed ? IPS_NON_DW_MUL_ALLOWED
+						 : IPS_NON_DW_MUL_NOT_ALLOWED);
+}
+
+/*
+ * Initialize the ips PTL for endpoint 'ep': fill in the control vtable
+ * 'ctl', start the status timer and timer queue, set up connection state,
+ * PIO send, the ips protocol, the receive queues (hardware, or software
+ * mirrors when context sharing is enabled), and the receive thread.
+ *
+ * Returns PSM2_OK on success, or the first sub-initializer's error.
+ * NOTE(review): on a mid-sequence failure, already-initialized pieces
+ * (timerq, epstate, recvshc allocation, ...) are not torn down here —
+ * presumably the caller finalizes via ips_ptl_fini; confirm.
+ */
+static
+psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+ psm2_error_t err = PSM2_OK;
+ uint32_t num_of_send_bufs = ep->hfi_num_sendbufs;
+ uint32_t num_of_send_desc = ep->hfi_num_descriptors;
+ uint32_t imm_size = ep->hfi_imm_size;
+ const psmi_context_t *context = &ep->context;
+ const struct hfi1_user_info_dep *user_info = &context->user_info;
+ const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
+ const int enable_shcontexts = (user_info->subctxt_cnt > 0);
+ const uint64_t current_count = get_cycles();
+
+ /* Preconditions */
+ psmi_assert_always(ep != NULL);
+ psmi_assert_always(ep->epaddr != NULL);
+ psmi_assert_always(ep->epid != 0);
+ psmi_assert_always(ep->hfi_num_sendbufs > 0);
+
+ memset(ptl, 0, sizeof(struct ptl));
+
+ ptl->ep = ep; /* back pointer */
+ ptl->epid = ep->epid; /* cache epid */
+ ptl->epaddr = ep->epaddr; /* cache a copy */
+ ptl->ctl = ctl;
+ ptl->context = context;
+ ptl->runtime_flags = context->runtime_flags;
+
+ memset(ctl, 0, sizeof(*ctl));
+ /* Fill in the control structure */
+ ctl->ep = ep;
+ ctl->ptl = ptl;
+ /* Shared contexts need the poll variant that also drains the software
+ * subcontext queue. */
+ ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll;
+ ctl->ep_connect = ips_ptl_connect;
+ ctl->ep_disconnect = ips_ptl_disconnect;
+ ctl->mq_send = ips_proto_mq_send;
+ ctl->mq_isend = ips_proto_mq_isend;
+
+ ctl->am_get_parameters = ips_am_get_parameters;
+
+ ctl->am_short_request = ips_am_short_request;
+ ctl->am_short_reply = ips_am_short_reply;
+
+ ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num;
+ ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init;
+ ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get;
+
+ /*
+ * Runtime flags in 'ptl' are different from runtime flags in 'context'.
+ * In 'context', runtime flags reflect what the driver is capable of.
+ * In 'ptl', runtime flags reflect the features we can or want to use in
+ * the driver's supported runtime flags.
+ */
+
+ /*
+ * This timer is to be used to check the context's status at every
+ * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when
+ * the link transitions from the DOWN state to the UP state. We can thus
+ * stop aggregating link failure messages once we detect that the link is
+ * up.
+ */
+ psmi_timer_entry_init(&ptl->status_timer,
+ psmi_context_check_status_callback, ptl);
+
+ /* cache the context's status timeout in cycles */
+ ptl->status_cyc_timeout =
+ ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS);
+
+ /*
+ * Retransmissions and pending operations are kept in a timer structure
+ * (queue). The timerq is shared to various internal IPS interfaces so
+ * that they too may schedule events on the timer queue. The timerq is
+ * drained in the progress function.
+ */
+ if ((err = psmi_timer_init(&ptl->timerq)))
+ goto fail;
+
+ /* start the context's status timer */
+ psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+ current_count + ptl->status_cyc_timeout);
+
+ /*
+ * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings
+ * are added/removed by the connect portion of the ips protocol and lookup
+ * is made by the receive queue processing component.
+ */
+ if ((err = ips_epstate_init(&ptl->epstate, context)))
+ goto fail;
+
+ ips_ptl_non_dw_mul_sdma_init();
+ /*
+ * Context sharing, setup subcontext ureg page.
+ */
+ if (enable_shcontexts) {
+ struct ptl_shared *recvshc;
+
+ recvshc = (struct ptl_shared *)
+ psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ptl_shared));
+ if (recvshc == NULL) {
+ err = PSM2_NO_MEMORY;
+ goto fail;
+ }
+
+ ptl->recvshc = recvshc;
+ recvshc->ptl = ptl;
+
+ /* Initialize recvshc fields */
+ recvshc->context = ctxt_info->ctxt;
+ recvshc->subcontext = ctxt_info->subctxt;
+ recvshc->subcontext_cnt = user_info->subctxt_cnt;
+ psmi_assert_always(recvshc->subcontext_cnt <=
+ HFI1_MAX_SHARED_CTXTS);
+ psmi_assert_always(recvshc->subcontext <
+ recvshc->subcontext_cnt);
+
+ /*
+ * Using ep->context to avoid const modifier since this function
+ * will modify the content in ep->context.
+ */
+ if ((err = ips_subcontext_ureg_get(ptl, recvshc->subcontext_cnt,
+ &ep->context,
+ recvshc->subcontext_ureg)))
+ goto fail;
+
+ memset(recvshc->subcontext_ureg[recvshc->subcontext], 0,
+ sizeof(struct ips_subcontext_ureg));
+ recvshc->context_lock = &recvshc->hwcontext_ctrl->context_lock;
+ /* Only the first (master) subcontext initializes the shared,
+ * process-shared spin lock; the others just reference it. */
+ if (recvshc->subcontext == 0) {
+ if (pthread_spin_init(recvshc->context_lock,
+ PTHREAD_PROCESS_SHARED) != 0) {
+ err =
+ psmi_handle_error(ptl->ep,
+ PSM2_EP_DEVICE_FAILURE,
+ "Couldn't initialize process-shared spin lock");
+ goto fail;
+ }
+ }
+ }
+
+ /*
+ * Hardware send pio used by eager and control messages.
+ */
+ if ((err = ips_spio_init(context, ptl, &ptl->spioc)))
+ goto fail;
+
+ /*
+ * Actual ips protocol handling.
+ */
+ if ((err =
+ ips_proto_init(context, ptl, num_of_send_bufs, num_of_send_desc,
+ imm_size, &ptl->timerq, &ptl->epstate, &ptl->spioc,
+ &ptl->proto)))
+ goto fail;
+
+ /*
+ * Hardware receive hdr/egr queue, services incoming packets and issues
+ * callbacks for protocol handling in proto_recv. It uses the epstate
+ * interface to determine if a packet is known or unknown.
+ */
+ if (!enable_shcontexts) {
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+ struct ips_recvq_params hdrq, egrq;
+ recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0);
+ recvq_callbacks.callback_packet_unknown =
+ ips_proto_process_unknown;
+ recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+ recvq_callbacks.callback_error = ips_proto_process_packet_error;
+ if ((err =
+ ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, 0, &ptl->recvq,
+ &ptl->recvq_state)))
+ goto fail;
+ }
+
+ /*
+ * Software receive hdr/egr queue, used in shared contexts.
+ */
+ else if ((err = shrecvq_init(ptl, context)))
+ goto fail;
+
+ /*
+ * Receive thread, always initialized but does not necessarily create a
+ * pthread.
+ */
+ if ((err = ips_ptl_rcvthread_init(ptl, &ptl->recvq)))
+ goto fail;
+fail:
+ return err;
+}
+
+/*
+ * Tear down the ips PTL in the reverse order of ips_ptl_init: protocol
+ * first, then the receive thread, connection state, PIO, timers, and
+ * finally the receive queue(s). Stops at (and returns) the first error.
+ */
+static psm2_error_t ips_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_in)
+{
+ const struct hfi1_user_info_dep *user_info = &ptl->context->user_info;
+ const int enable_shcontexts = (user_info->subctxt_cnt > 0);
+ psm2_error_t err = PSM2_OK;
+
+ if ((err = ips_proto_fini(&ptl->proto, force, timeout_in)))
+ goto fail;
+
+ /* We have to cancel the thread after terminating the protocol because
+ * connect/disconnect packets use interrupts and the kernel doesn't
+ * like to have no pollers waiting */
+ if ((err = ips_ptl_rcvthread_fini(ptl)))
+ goto fail;
+
+ if ((err = ips_epstate_fini(&ptl->epstate)))
+ goto fail;
+
+ if ((err = ips_spio_fini(&ptl->spioc)))
+ goto fail;
+
+ if ((err = psmi_timer_fini(&ptl->timerq)))
+ goto fail;
+
+ /* Non-shared contexts own ptl->recvq directly; shared contexts tear
+ * down both queues plus the write queues via shrecvq_fini(). */
+ if (!enable_shcontexts && (err = ips_recvhdrq_fini(&ptl->recvq)))
+ goto fail;
+
+ if (enable_shcontexts && (err = shrecvq_fini(ptl)))
+ goto fail;
+
+fail:
+ return err;
+}
+
+/*
+ * Shared implementation behind ips_ptl_setopt/ips_ptl_getopt.
+ *
+ * core_obj - psm2_epaddr_t for PSM2_IB_OPT_EP_SL, psm2_ep_t for
+ *            PSM2_IB_OPT_DF_SL (cast depends on optname).
+ * optname  - PSM2_IB_OPT_* selector.
+ * optval   - in/out option value buffer (a single uint8_t SL).
+ * optlen   - in/out option value length; on a too-short buffer it is
+ *            rewritten with the required length.
+ * get      - non-zero for a get operation, zero for set.
+ *
+ * Returns PSM2_OK or a PSM2_PARAM_ERR raised via psmi_handle_error().
+ */
+static
+psm2_error_t
+ips_ptl_optctl(const void *core_obj, int optname,
+	       void *optval, uint64_t *optlen, int get)
+{
+	psm2_error_t err = PSM2_OK;
+
+	switch (optname) {
+	case PSM2_IB_OPT_EP_SL:
+		{
+			/* Core object is psm2_epaddr */
+			psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+			ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+
+			/* If endpoint does not use IB ignore for set, complain for get */
+			if (epaddr->ptlctl->ep_connect != ips_ptl_connect) {
+				if (get)
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid EP transport");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				/* BUGFIX: report the required length of the
+				 * option value (uint8_t), consistent with the
+				 * DF_SL case below; this previously reported
+				 * sizeof(unsigned). */
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				/* Get returns the SL for the PIO flow */
+				*((uint8_t *) optval) =
+				    (uint8_t) ipsaddr->
+				    flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid SL value %u. %d<= SL <=%d.",
+							      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				/* Set new SL for all flows */
+				ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path->
+				    pr_sl = new_sl;
+				ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].path->
+				    pr_sl = new_sl;
+			}
+		}
+		break;
+	case PSM2_IB_OPT_DF_SL:
+		{
+			/* Set default SL to be used by an endpoint for all communication */
+			/* Core object is psm2_ep */
+			psm2_ep_t ep = (psm2_ep_t) core_obj;
+
+			/* Make sure ep is specified */
+			if (!ep) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Invalid PSM Endpoint");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				*((uint8_t *) optval) =
+				    ep->ptl_ips.ptl->proto.epinfo.ep_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid SL value %u. %d<= SL <=%d.",
+							      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				ep->ptl_ips.ptl->proto.epinfo.ep_sl =
+				    (uint8_t) new_sl;
+			}
+		}
+		break;
+	default:
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Unknown PSM2_IB option %u.", optname);
+	}
+
+exit_fn:
+	return err;
+}
+
+/* Set a PSM2_IB_OPT_* option; thin wrapper over ips_ptl_optctl(get=0). */
+static
+psm2_error_t
+ips_ptl_setopt(const void *component_obj, int optname,
+	       const void *optval, uint64_t optlen)
+{
+	uint64_t len = optlen;
+
+	/* optctl needs a mutable value pointer and length even for a set. */
+	return ips_ptl_optctl(component_obj, optname, (void *)optval, &len, 0);
+}
+
+/* Get a PSM2_IB_OPT_* option; thin wrapper over ips_ptl_optctl(get=1). */
+static
+psm2_error_t
+ips_ptl_getopt(const void *component_obj, int optname,
+	       void *optval, uint64_t *optlen)
+{
+	const int do_get = 1;
+
+	return ips_ptl_optctl(component_obj, optname, optval, optlen, do_get);
+}
+
+/*
+ * Progress function for a non-shared context: drain the hardware receive
+ * queue, then service expired timers. Locking is only needed when the
+ * receive thread may also touch the queue (PSMI_RUNTIME_RCVTHREAD).
+ */
+psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored)
+{
+ const uint64_t current_count = get_cycles();
+ const int do_lock = PSMI_LOCK_DISABLED &&
+ (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+
+ if (!ips_recvhdrq_isempty(&ptl->recvq)) {
+ /* If someone else already holds the queue, report no progress
+ * rather than blocking. */
+ if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq))
+ return err;
+ if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&ptl->recvq);
+ }
+
+ err = ips_recvhdrq_progress(&ptl->recvq);
+ if (do_lock)
+ ips_recvhdrq_unlock(&ptl->recvq);
+ /* Anything worse than "no progress" is an error; bail out. */
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+ err2 =
+ psmi_timer_process_if_expired(&(ptl->timerq),
+ current_count);
+ if (err2 != PSM2_OK_NO_PROGRESS)
+ return err2;
+ else
+ return err;
+ }
+
+ /*
+ * Process timer expirations after servicing receive queues (some packets
+ * may have been acked, some requests-to-send may have been queued).
+ *
+ * It's safe to look at the timer without holding the lock because it's not
+ * incorrect to be wrong some of the time.
+ */
+ if (psmi_timer_is_expired(&(ptl->timerq), current_count)) {
+ if (do_lock)
+ ips_recvhdrq_lock(&ptl->recvq);
+ err = psmi_timer_process_expired(&(ptl->timerq), current_count);
+ if (do_lock)
+ ips_recvhdrq_unlock(&ptl->recvq);
+ }
+
+ return err;
+}
+
+/* Non-blocking acquire of the process-shared context spin lock;
+ * returns 0 on success (pthread_spin_trylock semantics). */
+PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc))
+{
+ return pthread_spin_trylock(recvshc->context_lock);
+}
+
+/* Blocking acquire of the process-shared context spin lock. */
+PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc))
+{
+ pthread_spin_lock(recvshc->context_lock);
+}
+
+/* Release the process-shared context spin lock. */
+PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc))
+{
+ pthread_spin_unlock(recvshc->context_lock);
+}
+
+/*
+ * Progress function for a shared context: drain our software subcontext
+ * queue first, opportunistically drain the shared hardware queue under the
+ * process-shared lock, then service expired timers.
+ */
+psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored)
+{
+ const uint64_t current_count = get_cycles();
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ psm2_error_t err2;
+ struct ptl_shared *recvshc = ptl->recvshc;
+ psmi_assert(recvshc != NULL);
+
+ /* The following header queue checks are speculative (but safe)
+ * until this process has acquired the lock. The idea is to
+ * minimize lock contention due to processes spinning on the
+ * shared context. */
+ if (ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (!ips_recvhdrq_isempty(&ptl->recvq) &&
+ ips_try_lock_shared_context(recvshc) == 0) {
+ /* check that subcontext is empty while under lock to avoid
+ * re-ordering of incoming packets (since packets from
+ * hardware context will be processed immediately). */
+ if_pt(ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&ptl->recvq);
+ }
+
+ err = ips_recvhdrq_progress(&ptl->recvq);
+ }
+ ips_unlock_shared_context(recvshc);
+ }
+ }
+
+ /* Anything worse than "no progress" is an error; bail out. */
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ /* Drain packets that other subcontexts forwarded into our software
+ * queue (no lock needed: only we consume it). */
+ if (!ips_recvhdrq_isempty(&recvshc->recvq)) {
+ if (recvshc->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+ ips_recvhdrq_scan_cca(&recvshc->recvq);
+ }
+
+ err2 = ips_recvhdrq_progress(&recvshc->recvq);
+ if (err2 != PSM2_OK_NO_PROGRESS) {
+ err = err2;
+ }
+ }
+
+ if_pf(err > PSM2_OK_NO_PROGRESS)
+ return err;
+
+ /*
+ * Process timer expirations after servicing receive queues (some packets
+ * may have been acked, some requests-to-send may have been queued).
+ */
+ err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count);
+ if (err2 != PSM2_OK_NO_PROGRESS)
+ err = err2;
+
+ return err;
+}
+
+/*
+ * Return non-zero only when every receive queue is empty: the software
+ * subcontext queue (when context sharing is active) and the hardware queue.
+ */
+int ips_ptl_recvq_isempty(const ptl_t *ptl)
+{
+	struct ptl_shared *shc = ptl->recvshc;
+
+	if (shc != NULL && !ips_recvhdrq_isempty(&shc->recvq))
+		return 0;
+
+	return ips_recvhdrq_isempty(&ptl->recvq);
+}
+
+/*
+ * Legacy ips_get_stat entry point, retained for compatibility:
+ * reports all-zero session statistics and always succeeds.
+ */
+int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	return 0;
+}
+
+/*
+ * Set up all receive-side queues for a shared context:
+ *  - the shared hardware queue (ptl->recvq), whose subcontext callback
+ *    forwards foreign packets,
+ *  - our private software queue (recvshc->recvq), and
+ *  - one write header queue per subcontext for forwarding.
+ */
+static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context)
+{
+ struct ptl_shared *recvshc = ptl->recvshc;
+ struct ips_recvhdrq_callbacks recvq_callbacks;
+ struct ips_recvq_params hdrq, egrq;
+ psm2_error_t err = PSM2_OK;
+ int i;
+
+ /* Initialize (shared) hardware context recvq (ptl->recvq) */
+ /* NOTE: uses recvq in ptl structure for shared h/w context */
+ recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0);
+ recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown;
+ recvq_callbacks.callback_subcontext = ips_subcontext_process;
+ recvq_callbacks.callback_error = ips_proto_process_packet_error;
+ if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, recvshc->subcontext,
+ &ptl->recvq,
+ &recvshc->hwcontext_ctrl->recvq_state))) {
+ goto fail;
+ }
+
+ /* Initialize software subcontext (recvshc->recvq). Subcontexts do */
+ /* not require the rcvhdr copy feature. */
+ recvhdrq_hw_params(context, &hdrq, &egrq, 1, recvshc->subcontext);
+ recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+ if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+ &hdrq, &egrq, &recvq_callbacks,
+ ptl->runtime_flags, recvshc->subcontext,
+ &recvshc->recvq, &recvshc->recvq_state))) {
+ goto fail;
+ }
+
+ /* Initialize each recvshc->writeq for shared contexts */
+ for (i = 0; i < recvshc->subcontext_cnt; i++) {
+ recvhdrq_hw_params(context, &hdrq, &egrq, 1, i);
+ if ((err = ips_writehdrq_init(context, &hdrq, &egrq,
+ &recvshc->writeq[i],
+ &recvshc->subcontext_ureg[i]->
+ writeq_state,
+ ptl->runtime_flags))) {
+ goto fail;
+ }
+ }
+
+ if (err == PSM2_OK)
+ _HFI_DBG
+ ("Context sharing in use: lid %d, context %d, sub-context %d\n",
+ (int)psm2_epid_nid(ptl->epid), recvshc->context,
+ recvshc->subcontext);
+fail:
+ return err;
+}
+
+/*
+ * Tear down the shared-context receive queues created by shrecvq_init and
+ * free the ptl_shared block.
+ * NOTE(review): ptl->recvshc is freed but not set to NULL, and an early
+ * error return leaks it — callers must not touch recvshc afterwards.
+ */
+static psm2_error_t shrecvq_fini(ptl_t *ptl)
+{
+ psm2_error_t err = PSM2_OK;
+ int i;
+
+ /* disable my write header queue before deallocation */
+ i = ptl->recvshc->subcontext;
+ ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0;
+
+ if ((err = ips_recvhdrq_fini(&ptl->recvq)))
+ goto fail;
+
+ if ((err = ips_recvhdrq_fini(&ptl->recvshc->recvq)))
+ goto fail;
+
+ for (i = 0; i < ptl->recvshc->subcontext_cnt; i++) {
+ if ((err = ips_writehdrq_fini(&ptl->recvshc->writeq[i]))) {
+ goto fail;
+ }
+ }
+
+ psmi_free(ptl->recvshc);
+
+fail:
+ return err;
+}
+
+/*
+ * Connect 'numep' remote endpoints on this PTL, then replicate the
+ * connections on every additional local multi-context endpoint (the
+ * mctxt ring hanging off ptl->ep).
+ *
+ * array_of_epid/array_of_epid_mask/array_of_errors/array_of_epaddr follow
+ * the usual PSM connect conventions; timeout_in bounds each connect pass.
+ * Returns PSM2_OK, the first protocol-connect error, or PSM2_NO_MEMORY.
+ */
+psm2_error_t
+ips_ptl_connect(ptl_t *ptl, int numep, const psm2_epid_t *array_of_epid,
+		const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+	psm2_error_t err;
+	psm2_ep_t ep;
+	psm2_epid_t *epid_array = NULL;
+	psm2_error_t *error_array = NULL;
+	psm2_epaddr_t *epaddr_array = NULL;
+	ips_epaddr_t *ipsaddr_master, *ipsaddr;
+	int *mask_array = NULL;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+	err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+				array_of_epid_mask, array_of_errors,
+				array_of_epaddr, timeout_in);
+	if (err)
+		return err;
+
+	psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+	/* Single-context endpoint: nothing further to connect. */
+	if (ptl->ep->mctxt_next == ptl->ep)
+		return err;
+
+	/* make the additional multi-context connections. */
+	epid_array = (psm2_epid_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep);
+	mask_array = (int *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep);
+	error_array = (psm2_error_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep);
+	epaddr_array = (psm2_epaddr_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep);
+	if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+		/* BUGFIX: previously fell through with err == PSM2_OK, so an
+		 * allocation failure was silently reported as success. */
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	ep = ptl->ep->mctxt_next;
+	while (ep != ep->mctxt_master) {
+
+		/* Setup the mask array and epid array. */
+		for (i = 0; i < numep; i++) {
+			if (array_of_epid_mask[i]
+			    && array_of_errors[i] == PSM2_OK) {
+				ipsaddr_master =
+				    (ips_epaddr_t *) array_of_epaddr[i];
+				ipsaddr = ipsaddr_master->next;
+				mask_array[i] = 0;
+				/* Walk the ipsaddr ring to find the address
+				 * belonging to this local context 'ep'. */
+				while (ipsaddr != ipsaddr_master) {
+					if (((psm2_epaddr_t) ipsaddr)->proto->
+					    ep == ep) {
+						mask_array[i] = 1;
+						epid_array[i] =
+						    ((psm2_epaddr_t) ipsaddr)->
+						    epid;
+						break;
+					}
+					ipsaddr = ipsaddr->next;
+				}
+			} else {
+				mask_array[i] = 0;
+			}
+		}
+
+		/* Make the real protocol connections. */
+		err =
+		    ips_proto_connect(&ep->ptl_ips.ptl->proto, numep,
+				      epid_array, mask_array, error_array,
+				      epaddr_array, timeout_in);
+		if (err)
+			goto fail;
+
+		ep = ep->mctxt_next;
+	}
+
+fail:
+	/* psmi_free(NULL) is not guaranteed safe, so free only what was
+	 * actually allocated. */
+	if (epid_array)
+		psmi_free(epid_array);
+	if (mask_array)
+		psmi_free(mask_array);
+	if (error_array)
+		psmi_free(error_array);
+	if (epaddr_array)
+		psmi_free(epaddr_array);
+
+	return err;
+}
+
+/*
+ * Disconnect 'numep' remote endpoints on this PTL; a straight pass-through
+ * to the ips protocol layer. Caller must hold the MQ progress lock.
+ */
+psm2_error_t
+ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	return ips_proto_disconnect(&ptl->proto, force, numep,
+				    array_of_epaddr, array_of_epaddr_mask,
+				    array_of_errors, timeout_in);
+}
+
+/* Only symbol we expose out of here */
+/* Entry-point vtable for the ips PTL: sizeof, init, fini, setopt, getopt
+ * (order must match struct ptl_ctl_init). */
+struct ptl_ctl_init
+psmi_ptl_ips = {
+ ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt,
+ ips_ptl_getopt
+};
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
new file mode 100644
index 0000000..d2a903a
--- /dev/null
+++ b/ptl_ips/ptl_fwd.h
@@ -0,0 +1,65 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_IPS_H
+#define _PTL_FWD_IPS_H
+#include "ptl.h"
+
+/* Opaque forward typedefs so users of the ips ptl need not pull in the
+ * full ips protocol headers. */
+typedef struct ips_epaddr ips_epaddr_t;
+typedef struct ips_msgctl ips_msgctl_t;
+
+/* Symbol in ips ptl; defined (with its initializer) in ptl_ips/ptl.c.
+ * Declared extern here: a plain object declaration in a header creates a
+ * tentative definition in every translation unit that includes it, which
+ * fails to link with -fno-common (the default since GCC 10). */
+extern struct ptl_ctl_init psmi_ptl_ips;
+#endif /* _PTL_FWD_IPS_H */
diff --git a/ptl_ips/ptl_ips.h b/ptl_ips/ptl_ips.h
new file mode 100644
index 0000000..56adaf7
--- /dev/null
+++ b/ptl_ips/ptl_ips.h
@@ -0,0 +1,194 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PTL_H
+#define _IPS_PTL_H
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+#include "ips_proto_params.h"
+#include "ips_proto.h"
+#include "ips_spio.h"
+#include "ips_recvhdrq.h"
+#include "ips_writehdrq.h"
+#include "ips_epstate.h"
+#include "ips_stats.h"
+#include "ips_subcontext.h"
+
+struct ptl_shared;
+
+/*
+ * PTL at the ips level (for OPA)
+ *
+ * This PTL structure glues all the ips components together.
+ *
+ * * ips timer, shared by various components, allows each component to
+ * schedule time-based expiration callbacks on the timerq.
+ * * HW receive queue
+ * * send control block to handle eager messages
+ * * instantiation of the ips protocol
+ * * endpoint state, to map endpoint indexes into structures
+ *
+ * Receive-side
+ *
+ * ----[ proto ]
+ * / ^ ^
+ * | | |
+ * | packet packet
+ * | known unknown
+ * add_endpt \ /
+ * | |
+ * `----> [epstate]
+ * ^
+ * |
+ * lookup_endpt
+ * |
+ * [recvq]
+ * |
+ * poll
+ *
+ */
+/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h
+ * (NOTE(review): no PTL_IPS_SIZE is visible in ptl_fwd.h in this version;
+ * confirm whether ips_ptl_sizeof() is the authoritative size instead). */
+/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */
+struct ptl {
+	psm2_ep_t ep;		/* back ptr */
+	psm2_epid_t epid;	/* cached from ep */
+	psm2_epaddr_t epaddr;	/* cached from ep */
+	ips_epaddr_t *ipsaddr;	/* cached from epaddr */
+	ptl_ctl_t *ctl;		/* cached from init */
+	const psmi_context_t *context;	/* cached from init */
+
+	struct ips_spio spioc;	/* PIO send control */
+	struct ips_proto proto;	/* protocol instance: timerq, epstate, spio */
+
+	/* Receive header queue and receive queue processing */
+	uint32_t runtime_flags;
+	struct psmi_timer_ctrl timerq;
+	struct ips_epstate epstate;	/* map incoming packets */
+	struct ips_recvhdrq_state recvq_state;
+	struct ips_recvhdrq recvq;	/* HW recvq: epstate, proto */
+
+	/* timer to check the context's status */
+	struct psmi_timer status_timer;
+
+	/* context's status check timeout in cycles -- cached */
+	uint64_t status_cyc_timeout;
+
+	/* Shared contexts context; NULL unless subcontext sharing is in use */
+	struct ptl_shared *recvshc;
+
+	/* Rcv thread context; allocated by ips_ptl_rcvthread_init */
+	struct ptl_rcvthread *rcvthread;
+}
+#ifndef PACK_STRUCT_STL
+#define PACK_STRUCT_STL /* nothing */
+#endif
+	__attribute__ ((PACK_STRUCT_STL aligned(16)));
+
+/*
+ * Sample implementation of shared contexts context.
+ *
+ * In shared mode, the hardware queue is serviced by more than one process.
+ * Each process also mirrors the hardware queue in software (represented by an
+ * ips_recvhdrq). For packets we service in the hardware queue that are not
+ * destined for us, we write them in other processes' receive queues
+ * (represented by an ips_writehdrq).
+ *
+ */
+struct ptl_shared {
+	ptl_t *ptl;		/* backptr to main ptl */
+	uint32_t context;
+	uint32_t subcontext;
+	uint32_t subcontext_cnt;
+
+	pthread_spinlock_t *context_lock;
+	struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS];
+	struct ips_hwcontext_ctrl *hwcontext_ctrl;
+	struct ips_recvhdrq recvq;	/* subcontext receive queue */
+	struct ips_recvhdrq_state recvq_state;	/* subcontext receive queue state */
+	struct ips_writehdrq writeq[HFI1_MAX_SHARED_CTXTS];	/* peer subcontexts */
+};
+
+/*
+ * Connect/disconnect are wrappers around psm proto's connect/disconnect,
+ * mostly to abstract away PSM-specific stuff from ips internal structures.
+ * Both take parallel arrays of numep entries, consult the mask array to
+ * select which entries to act on, and report per-endpoint status in
+ * array_of_errors.
+ */
+psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep,
+			     const psm2_epid_t *array_of_epid,
+			     const int *array_of_epid_mask,
+			     psm2_error_t *array_of_errors,
+			     psm2_epaddr_t *array_of_epaddr,
+			     uint64_t timeout_in);
+
+psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+				psm2_epaddr_t array_of_epaddr[],
+				const int array_of_epaddr_mask[],
+				psm2_error_t array_of_errors[],
+				uint64_t timeout_in);
+
+/*
+ * Generic Poll function for ips-level ptl
+ */
+psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Support for receive thread (see ptl_rcvthread.c)
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq);
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl);
+
+#endif /* _IPS_PTL_H */
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
new file mode 100644
index 0000000..527e113
--- /dev/null
+++ b/ptl_ips/ptl_rcvthread.c
@@ -0,0 +1,506 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/poll.h>
+
+#include "ptl_ips.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+#include "psm_mq_internal.h"
+#include "psm_user.h"
+
+/* All in milliseconds */
+#define RCVTHREAD_TO_MIN_FREQ 10 /* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ 100 /* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT 1
+
+struct ptl_rcvthread;
+
+static void *ips_ptl_pollintr(void *recvthreadc);
+static psm2_error_t rcvthread_initstats(ptl_t *ptl);
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc);
+
+/* Per-ptl receive-thread state; allocated in ips_ptl_rcvthread_init and
+ * released in ips_ptl_rcvthread_fini. */
+struct ptl_rcvthread {
+	const psmi_context_t *context;
+	const ptl_t *ptl;
+	struct ips_recvhdrq *recvq;
+
+	pthread_t hdrq_threadid;
+	uint64_t t_start_cyc;
+	int pipefd[2];		/* [0] read end (poll thread), [1] write end (fini) */
+
+	/* stats and some for scheduling */
+	uint64_t pollcnt;	/* total poll() wakeups serviced */
+	uint64_t pollcnt_to;	/* wakeups caused by poll() timeout */
+	uint64_t pollcyc;	/* cycles spent in polls that made no progress */
+	uint64_t pollok;	/* polls that made progress */
+
+	/* For scheduling interrupt thread */
+	int timeout_period_min;
+	int timeout_period_max;
+	int timeout_shift;
+	uint64_t pollok_last;	/* pollok snapshot at last timeout adjustment */
+	uint64_t pollcnt_last;	/* pollcnt snapshot at last timeout adjustment */
+	uint32_t last_timeout;	/* current poll timeout in ms; (uint32_t)-1 => block forever */
+};
+
+#ifdef PSM_CUDA
+	/* This is a global cuda context (extern declaration in psm_user.h)
+	 * stored to provide hints during a cuda failure
+	 * due to a null cuda context.
+	 * Captured in ips_ptl_rcvthread_init (cuCtxGetCurrent) and adopted
+	 * by the receive thread in ips_ptl_pollintr (cuCtxSetCurrent). */
+	CUcontext ctxt;
+#endif
+
+/*
+ * The receive thread knows about the ptl interface, so it can muck with it
+ * directly.
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ptl_rcvthread *rcvc;
+
+	/* Allocate per-ptl receive-thread state; zeroed so the stats
+	 * counters start at 0. */
+	ptl->rcvthread =
+	    psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread));
+	if (ptl->rcvthread == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	rcvc = ptl->rcvthread;
+
+	rcvc->recvq = recvq;
+	rcvc->ptl = ptl;
+	rcvc->context = ptl->context;
+	rcvc->t_start_cyc = get_cycles();
+
+#ifdef PSM_CUDA
+	/* Capture the caller's CUDA context so the polling thread can adopt
+	 * it later (see ips_ptl_pollintr). */
+	if (PSMI_IS_CUDA_ENABLED)
+		PSMI_CUDA_DRIVER_API_CALL(cuCtxGetCurrent, &ctxt);
+#endif
+
+	if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) {
+
+		if ((err = rcvthread_initsched(rcvc)))
+			goto fail;
+
+		/* Create a pipe so we can synchronously terminate the thread */
+		if (pipe(rcvc->pipefd) != 0) {
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot create a pipe for receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+
+		if (pthread_create(&rcvc->hdrq_threadid, NULL,
+				   ips_ptl_pollintr, ptl->rcvthread)) {
+			close(rcvc->pipefd[0]);
+			close(rcvc->pipefd[1]);
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot start receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+
+	}
+
+	if ((err = rcvthread_initstats(ptl)))
+		goto fail;
+
+fail:
+	/* NOTE: the success path also flows through this label (err is
+	 * PSM2_OK then). On failure ptl->rcvthread stays allocated --
+	 * presumably reclaimed by ips_ptl_rcvthread_fini; confirm the
+	 * caller invokes fini on init failure. */
+	return err;
+}
+
+/*
+ * Tear down the receive thread: disable driver interrupts, wake the thread
+ * via its shutdown pipe, join it, then free the thread state.
+ * Requires the caller to hold the mq progress lock.
+ */
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	uint64_t t_now;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	if (ptl->rcvthread == NULL)
+		return err;
+
+	if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) {
+		t_now = get_cycles();
+
+		/* Disable interrupts then kill the receive thread */
+		if (psmi_context_interrupt_isenabled
+		    ((psmi_context_t *) ptl->context))
+			if ((err =
+			     psmi_context_interrupt_set((psmi_context_t *) ptl->
+							context, 0)))
+				goto fail;
+
+		/* Close the pipe so we can have the thread synchronously exit.
+		   On Linux just closing the pipe does not wake up the receive
+		   thread, so write a token first to guarantee a POLLIN event.
+		 */
+		if (write(rcvc->pipefd[1], (const void *)&t_now,
+			  sizeof(uint64_t)) == -1 ||
+		    close(rcvc->pipefd[1]) == -1) {
+			_HFI_VDBG
+			    ("unable to close pipe to receive thread cleanly\n");
+		}
+		pthread_join(rcvc->hdrq_threadid, NULL);
+
+		if (_HFI_PRDBG_ON) {
+			_HFI_PRDBG_ALWAYS
+			    ("rcvthread poll success %lld/%lld times, "
+			     "thread cancelled in %.3f us\n",
+			     (long long)rcvc->pollok, (long long)rcvc->pollcnt,
+			     (double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3);
+		}
+	}
+
+	psmi_free(ptl->rcvthread);
+	/* Clear the pointer so a repeated fini call takes the NULL
+	 * early-return above instead of touching freed memory. */
+	ptl->rcvthread = NULL;
+
+fail:
+	return err;
+}
+
+/*
+ * Parse PSM2_RCVTHREAD_FREQ (<min_freq[:max_freq[:shift_freq]]>, polls per
+ * second) and convert it into poll() timeout bounds for the receive thread.
+ * Invalid settings fall back to the build-time defaults; a zero min/max
+ * frequency selects interrupt-only operation (infinite poll timeout).
+ *
+ * Marked static to match the forward declaration above; the original
+ * definition omitted the storage class and relied on linkage inheritance
+ * from the earlier static declaration.
+ */
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc)
+{
+	union psmi_envvar_val env_to;
+	char buf[192];
+	char *rcv_freq = buf;
+	int no_timeout = 0;
+	int tvals[3] = { RCVTHREAD_TO_MIN_FREQ,
+		RCVTHREAD_TO_MAX_FREQ,
+		RCVTHREAD_TO_SHIFT
+	};
+	snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ,
+		 RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT);
+	buf[sizeof(buf) - 1] = '\0';
+
+	if (!psmi_getenv("PSM2_RCVTHREAD_FREQ",
+			 "Thread timeouts (per sec) <min_freq[:max_freq[:shift_freq]]>",
+			 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+			 (union psmi_envvar_val)rcv_freq, &env_to)) {
+		/* not using default values */
+		int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals);
+		int invalid = 0;
+
+		/* A zero min or max frequency means "never time out". */
+		if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) ||
+		    (nparsed > 1 && tvals[1] == 0)) {
+			no_timeout = 1;
+		} else {
+			if (nparsed > 0 && tvals[0] > 1000)
+				invalid = 1;
+			if (nparsed > 1
+			    && (tvals[1] > 1000 || tvals[1] < tvals[0]))
+				invalid = 1;
+			if (nparsed > 2 && tvals[2] > 10)
+				invalid = 1;
+		}
+
+		if (invalid) {
+			_HFI_INFO
+			    ("Overriding invalid request for RcvThread frequency"
+			     " settings of %s to be <%d:%d:%d>\n", env_to.e_str,
+			     RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ,
+			     RCVTHREAD_TO_SHIFT);
+			tvals[0] = RCVTHREAD_TO_MIN_FREQ;
+			tvals[1] = RCVTHREAD_TO_MAX_FREQ;
+			tvals[2] = RCVTHREAD_TO_SHIFT;
+		}
+	}
+
+	if (no_timeout) {
+		/* (uint32_t)-1 becomes -1 when handed to poll(), i.e. block
+		 * until an interrupt or shutdown event arrives. */
+		rcvc->last_timeout = -1;
+		_HFI_PRDBG("PSM2_RCVTHREAD_FREQ set to only interrupt "
+			   "(no timeouts)\n");
+	} else {
+		/* Convert freq (polls/sec) to period in milliseconds, the
+		 * unit poll() expects. (The original comment said
+		 * microseconds; 1000/freq is milliseconds, matching the
+		 * "min=%dms" debug output below.) */
+		rcvc->timeout_period_max = 1000 / tvals[0];
+		rcvc->timeout_period_min = 1000 / tvals[1];
+		rcvc->timeout_shift = tvals[2];
+		/* Start in the middle of min and max */
+		rcvc->last_timeout = (rcvc->timeout_period_min +
+				      rcvc->timeout_period_max) / 2;
+		_HFI_PRDBG("PSM2_RCVTHREAD_FREQ converted to period "
+			   "min=%dms,max=%dms,shift=%d\n",
+			   rcvc->timeout_period_min, rcvc->timeout_period_max,
+			   rcvc->timeout_shift);
+	}
+	return PSM2_OK;
+}
+
+static
+int rcvthread_next_timeout(struct ptl_rcvthread *rcvc)
+{
+	/* Adaptive poll timeout: shrink the timeout (poll more often) while
+	 * recent polls made progress, grow it (poll less often) otherwise.
+	 * pollok only ever increases, so the unsigned difference is the
+	 * number of successful polls since the last adjustment. */
+	uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last;
+
+	if (pollok_diff > 0) {
+		if (rcvc->last_timeout > rcvc->timeout_period_min)
+			/* By default, be less aggressive, but there's a more aggressive
+			 * alternative if need be */
+#if 1
+			rcvc->last_timeout >>= rcvc->timeout_shift;
+#else
+			rcvc->last_timeout = rcvc->timeout_period_min;
+#endif
+	} else {		/* we had less progress */
+		if (rcvc->last_timeout < rcvc->timeout_period_max)
+			rcvc->last_timeout <<= rcvc->timeout_shift;
+	}
+
+	/* Snapshot counters for the next adjustment interval. */
+	rcvc->pollok_last = rcvc->pollok;
+	rcvc->pollcnt_last = rcvc->pollcnt;
+	return (int)rcvc->last_timeout;
+}
+
+extern int ips_in_rcvthread;
+
+/*
+ * Receiver thread support.
+ *
+ * By default, polling in the driver asks the chip to generate an interrupt on
+ * every packet. When the driver supports POLLURG we can switch the poll mode
+ * to one that requests interrupts only for packets that contain an urgent bit
+ * (and optionally enable interrupts for hdrq overflow events). When poll
+ * returns an event, we *try* to make progress on the receive queue but simply
+ * go back to sleep if we notice that the main thread is already making
+ * progress.
+ */
+static
+void *ips_ptl_pollintr(void *rcvthreadc)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc;
+	struct ips_recvhdrq *recvq = rcvc->recvq;
+	psmi_context_t *context = (psmi_context_t *) rcvc->context;
+	int fd_dev = context->fd;	/* device fd: interrupt source */
+	int fd_pipe = rcvc->pipefd[0];	/* read end: shutdown signal from fini */
+	psm2_ep_t ep;
+	struct pollfd pfd[2];
+	int ret;
+	int next_timeout = rcvc->last_timeout;
+	uint64_t t_cyc;
+	psm2_error_t err;
+
+#ifdef PSM_CUDA
+	/* Adopt the CUDA context captured at init time so CUDA calls made
+	 * from this thread see the application's context. */
+	if (PSMI_IS_CUDA_ENABLED && ctxt != NULL)
+		PSMI_CUDA_DRIVER_API_CALL(cuCtxSetCurrent, ctxt);
+#endif
+
+	PSM2_LOG_MSG("entering");
+	/* No reason to have many of these, keep this as a backup in case the
+	 * recvhdrq init function is misused */
+	psmi_assert_always((recvq->runtime_flags & PSMI_RUNTIME_RCVTHREAD));
+
+	/* Switch driver to a mode where it can interrupt on urgent packets */
+	if (psmi_context_interrupt_set((psmi_context_t *)
+				       rcvc->context, 1) == PSM2_EP_NO_RESOURCES) {
+		_HFI_PRDBG
+		    ("hfi_poll_type feature not present in driver, turning "
+		     "off internal progress thread\n");
+		return NULL;
+	}
+
+	_HFI_PRDBG("Enabled communication thread on URG packets\n");
+
+	while (1) {
+		/* Wait on both the device (urgent-packet interrupt) and the
+		 * shutdown pipe, with the adaptive timeout. */
+		pfd[0].fd = fd_dev;
+		pfd[0].events = POLLIN;
+		pfd[0].revents = 0;
+		pfd[1].fd = fd_pipe;
+		pfd[1].events = POLLIN;
+		pfd[1].revents = 0;
+
+		ret = poll(pfd, 2, next_timeout);
+		t_cyc = get_cycles();
+		if_pf(ret < 0) {
+			if (errno == EINTR)
+				_HFI_DBG("got signal, keep polling\n");
+			else
+				psmi_handle_error(PSMI_EP_NORETURN,
+						  PSM2_INTERNAL_ERR,
+						  "Receive thread poll() error: %s",
+						  strerror(errno));
+		} else if (pfd[1].revents) {
+			/* Any type of event on this fd means exit, should be POLLHUP */
+			_HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents);
+			close(fd_pipe);
+			break;
+		} else {
+			rcvc->pollcnt++;
+			/* Skip this pass entirely if endpoint creation or
+			 * teardown currently holds the creation lock. */
+			if (!PSMI_LOCK_TRY(psmi_creation_lock)) {
+
+				if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) {
+					if (PSMI_LOCK_DISABLED) {
+						/* We do this check without acquiring the lock, no sense to
+						 * adding the overhead and it doesn't matter if we're
+						 * wrong. */
+						if (ips_recvhdrq_isempty(recvq))
+							continue;
+						/* NOTE(review): these continues bypass the
+						 * PSMI_UNLOCK(psmi_creation_lock) below; this is
+						 * only safe because locking is disabled in this
+						 * branch (PSMI_LOCK_DISABLED). */
+						if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) {
+							ips_recvhdrq_scan_cca(recvq);
+						}
+						if (!ips_recvhdrq_trylock(recvq))
+							continue;
+						err = ips_recvhdrq_progress(recvq);
+						if (err == PSM2_OK)
+							rcvc->pollok++;
+						else
+							rcvc->pollcyc += get_cycles() - t_cyc;
+						ips_recvhdrq_unlock(recvq);
+					} else {
+
+						ep = psmi_opened_endpoint;
+
+						/* CCA prescan of this ptl's recvq under the
+						 * primary endpoint's progress lock. */
+						if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) {
+							if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN ) {
+								ips_recvhdrq_scan_cca(recvq);
+							}
+							PSMI_UNLOCK(ep->mq->progress_lock);
+						}
+
+						/* Go through all master endpoints. */
+						do{
+							if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) {
+								/* If we time out, we service shm and hfi. If not, we
+								 * assume to have received an hfi interrupt and service
+								 * only hfi.
+								 */
+								err = psmi_poll_internal(ep,
+											 ret ==
+											 0 ? PSMI_TRUE :
+											 PSMI_FALSE);
+
+								if (err == PSM2_OK)
+									rcvc->pollok++;
+								else
+									rcvc->pollcyc += get_cycles() - t_cyc;
+								PSMI_UNLOCK(ep->mq->progress_lock);
+							}
+
+							/* get next endpoint from multi endpoint list */
+							ep = ep->user_ep_next;
+						} while(NULL != ep);
+					}
+				}
+
+				PSMI_UNLOCK(psmi_creation_lock);
+			}
+
+			if (ret == 0) {	/* change timeout only on timed out poll */
+				rcvc->pollcnt_to++;
+				next_timeout = rcvthread_next_timeout(rcvc);
+			}
+
+		}
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return NULL;
+}
+
+static uint64_t rcvthread_stats_pollok(void *context)
+{
+	/* Report the poll success rate as a percentage, bit-copied into the
+	 * u64 stats slot; the consumer re-interprets the bits as a double
+	 * (registered with MPSPAWN_STATS_TYPE_DOUBLE). */
+	struct ptl_rcvthread *rc = (struct ptl_rcvthread *)context;
+	uint64_t out;
+	double pct = (rc->pollcnt > 0)
+	    ? ((double)rc->pollok * 100.0 / rc->pollcnt)
+	    : 0.0;
+
+	memcpy(&out, &pct, sizeof(out));
+	return out;
+}
+
+static uint64_t rcvthread_stats_pollcyc(void *context)
+{
+	/* Report the accumulated wasted poll time, converted from cycles to
+	 * milliseconds for the stats log. */
+	struct ptl_rcvthread *rc = (struct ptl_rcvthread *)context;
+	double ns = (double)cycles_to_nanosecs(rc->pollcyc);
+
+	return (uint64_t) (ns / 1.0e6);
+}
+
+static psm2_error_t rcvthread_initstats(ptl_t *ptl)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	/* Stats exported to the mpspawn layer; entries either point at the
+	 * live counters in rcvc or supply a getter callback. */
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("intrthread schedule count",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_SKIP_IF_ZERO,
+				NULL, &rcvc->pollcnt),
+		PSMI_STATS_DECL("intrthread schedule success (%)",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_TYPE_DOUBLE,
+				rcvthread_stats_pollok, NULL),
+		PSMI_STATS_DECL("intrthread timeout count",
+				MPSPAWN_STATS_REDUCTION_ALL |
+				MPSPAWN_STATS_SKIP_IF_ZERO,
+				NULL, &rcvc->pollcnt_to),
+		PSMI_STATS_DECL("intrthread wasted time (ms)",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				rcvthread_stats_pollcyc, NULL)
+	};
+
+	/* If we don't want a thread, make sure we still initialize the counters
+	 * but set them to NaN instead */
+	if (!(ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD)) {
+		int i;
+		static uint64_t ctr_nan = MPSPAWN_NAN;
+		for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) {
+			entries[i].getfn = NULL;
+			entries[i].u.val = &ctr_nan;
+		}
+	}
+
+	return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+					PSMI_STATSTYPE_RCVTHREAD,
+					entries,
+					PSMI_STATS_HOWMANY(entries), rcvc);
+}
diff --git a/ptl_self/Makefile b/ptl_self/Makefile
new file mode 100644
index 0000000..daeac5b
--- /dev/null
+++ b/ptl_self/Makefile
@@ -0,0 +1,90 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir = $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir)
+
+# Object list for the self ptl (a single translation unit), remapped into
+# $(OUTDIR) so out-of-tree object directories work.
+${TARGLIB}-objs := ptl.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+# Generate a .d dependency fragment per source so header changes trigger
+# rebuilds on the second pass.
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c
new file mode 100644
index 0000000..da613d9
--- /dev/null
+++ b/ptl_self/ptl.c
@@ -0,0 +1,394 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/*
+ * This file implements the PSM PTL for self (loopback)
+ */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/* Private per-endpoint state of the self (loopback) ptl. */
+struct ptl {
+	psm2_ep_t ep;		/* owning endpoint */
+	psm2_epid_t epid;	/* our own epid -- the only reachable "peer" */
+	psm2_epaddr_t epaddr;	/* loopback epaddr returned by self_connect() */
+	ptl_ctl_t *ctl;		/* dispatch table filled in by self_ptl_init() */
+} __attribute__((aligned(16)));
+
+/* Complete a matched rendezvous: copy the sender's buffer straight into
+ * the receiver's buffer (pure loopback, no wire protocol involved).
+ * recv_req->ptl_req_ptr was cross-linked to the send request by
+ * self_mq_isend(). */
+static
+psm2_error_t
+ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted)
+{
+	psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr;
+
+	if (recv_req->recv_msglen > 0) {
+		PSM_VALGRIND_DEFINE_MQ_RECV(recv_req->buf, recv_req->buf_len,
+					    recv_req->recv_msglen);
+		VALGRIND_MAKE_MEM_DEFINED(send_req->buf, send_req->buf_len);
+		VALGRIND_MAKE_MEM_DEFINED(send_req->buf, recv_req->recv_msglen);
+
+		/* Only recv_msglen bytes are copied: a longer send is
+		 * truncated to the receiver's accepted length. */
+		psmi_mq_mtucpy(recv_req->buf, send_req->buf,
+			       recv_req->recv_msglen);
+	}
+
+	psmi_mq_handle_rts_complete(recv_req);
+
+	/* If the send is already marked complete, that's because it was internally
+	 * buffered. */
+	if (send_req->state == MQ_STATE_COMPLETE) {
+		psmi_mq_stats_rts_account(send_req);
+		/* send_req->buf is then a system buffer allocated by
+		 * self_mq_send_testwait(); return it to the mq pool. */
+		if (send_req->buf != NULL && send_req->send_msglen > 0)
+			psmi_mq_sysbuf_free(send_req->mq, send_req->buf);
+		/* req was left "live" even though the sender was told that the
+		 * send was done */
+		psmi_mq_req_free(send_req);
+	} else
+		psmi_mq_handle_rts_complete(send_req);
+
+	_HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+		  recv_req->buf, send_req, recv_req);
+	return PSM2_OK;
+}
+
+/* Test/wait callback installed on an unmatched self send (see
+ * self_mq_isend()).  Copies the caller's payload into a system buffer
+ * and marks the send complete, so the sender can return even though no
+ * matching receive exists yet. */
+static
+psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq)
+{
+	uint8_t *ubuf;
+	psm2_mq_req_t req = *ireq;
+
+	PSMI_LOCK_ASSERT(req->mq->progress_lock);
+
+	/* We're waiting on a send request, and the matching receive has not been
+	 * posted yet. This is a deadlock condition in MPI but we accommodate it
+	 * here in the "self ptl" by using system-allocated memory.
+	 */
+	req->testwait_callback = NULL;	/* no more calls here */
+
+	/* Swap the user buffer for a sysbuf copy; ptl_handle_rtsmatch()
+	 * frees the sysbuf once the receive finally matches. */
+	ubuf = req->buf;
+	if (ubuf != NULL && req->send_msglen > 0) {
+		req->buf = psmi_mq_sysbuf_alloc(req->mq, req->send_msglen);
+		if (req->buf == NULL)
+			return PSM2_NO_MEMORY;
+		psmi_mq_mtucpy(req->buf, ubuf, req->send_msglen);
+	}
+
+	/* Mark it complete but don't free the req, it's freed when the receiver
+	 * does the match */
+	req->state = MQ_STATE_COMPLETE;
+	*ireq = PSM2_MQ_REQINVALID;
+	return PSM2_OK;
+}
+
+/* Self is different. We do everything as rendezvous. */
+static
+psm2_error_t
+self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	      psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context,
+	      psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t send_req;
+	psm2_mq_req_t recv_req;
+	int rc;
+
+	send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	if_pf(send_req == NULL)
+		return PSM2_NO_MEMORY;
+
+#ifdef PSM_CUDA
+	/* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+	 * when the buffer pointer received into PSM has been allocated
+	 * by the application. This guarantees the all memory operations
+	 * to this region of memory (used by multiple layers of the stack)
+	 * always synchronize
+	 */
+	if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+		int trueflag = 1;
+		PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+			       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+			       (CUdeviceptr)ubuf);
+		send_req->is_buf_gpu_mem = 1;
+	} else
+		send_req->is_buf_gpu_mem = 0;
+#endif
+
+	/* Post the RTS locally.  On MQ_RET_MATCH_OK a receive was already
+	 * posted and ptl_handle_rtsmatch() copies the data right away;
+	 * otherwise the send stays pending and self_mq_send_testwait()
+	 * buffers it if the caller ends up waiting on it. */
+	rc = psmi_mq_handle_rts(mq, epaddr, tag,
+				len, NULL, 0, 1,
+				ptl_handle_rtsmatch, &recv_req);
+	send_req->tag = *tag;
+	send_req->buf = (void *)ubuf;
+	send_req->send_msglen = len;
+	send_req->context = context;
+	/* Cross-link the requests so the match handler can reach the
+	 * sender's buffer and state. */
+	recv_req->ptl_req_ptr = (void *)send_req;
+	recv_req->rts_sbuf = (uintptr_t) ubuf;
+	recv_req->rts_peer = epaddr;
+	if (rc == MQ_RET_MATCH_OK)
+		ptl_handle_rtsmatch(recv_req, 1);
+	else
+		send_req->testwait_callback = self_mq_send_testwait;
+
+	_HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n",
+		  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+		  rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req);
+	*req_o = send_req;
+	return PSM2_OK;
+}
+
+/* Blocking send: issue the nonblocking rendezvous send, then wait in
+ * the progress engine until the request completes. */
+static
+psm2_error_t
+self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	     psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	psm2_mq_req_t sreq;
+	psm2_error_t rv = self_mq_isend(mq, epaddr, flags, tag, ubuf, len,
+					NULL, &sreq);
+	psmi_mq_wait_internal(&sreq);
+	return rv;
+}
+
+/* Report AM capability limits for the self ptl.  Loopback delivery has
+ * no real restrictions, so each limit is advertised as INT_MAX. */
+static psm2_error_t
+self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+	if (parameters == NULL)
+		return PSM2_PARAM_ERR;
+
+	parameters->max_handlers = INT_MAX;
+	parameters->max_nargs = INT_MAX;
+	parameters->max_request_short = INT_MAX;
+	parameters->max_reply_short = INT_MAX;
+	return PSM2_OK;
+}
+
+/* Deliver an AM short request by invoking the handler synchronously;
+ * loopback means there is no wire to cross. */
+static
+psm2_error_t
+self_am_short_request(psm2_epaddr_t epaddr,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      void *src, size_t len, int flags,
+		      psm2_am_completion_fn_t completion_fn,
+		      void *completion_ctxt)
+{
+	struct psmi_am_token tok;
+	psm2_ep_t ep = epaddr->ptlctl->ptl->ep;
+	psm2_am_handler_fn_t fn = psm_am_get_handler_function(ep, handler);
+
+	tok.epaddr_incoming = epaddr;
+	fn(&tok, args, nargs, src, len);
+
+	/* The request is complete as soon as the handler returns. */
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/* Deliver an AM reply by calling the handler directly with the token
+ * that arrived on the request; nothing is queued in loopback. */
+static
+psm2_error_t
+self_am_short_reply(psm2_am_token_t token,
+		    psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		    void *src, size_t len, int flags,
+		    psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+	struct psmi_am_token *tok = token;
+	psm2_ep_t ep = tok->epaddr_incoming->ptlctl->ptl->ep;
+	psm2_am_handler_fn_t fn = psm_am_get_handler_function(ep, handler);
+
+	fn(token, args, nargs, src, len);
+
+	/* The reply is complete once the handler returns. */
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+/* "Connect": the only peer reachable through this ptl is our own epid.
+ * Matching entries get the loopback epaddr; all others are reported
+ * PSM2_EPID_UNREACHABLE so another ptl can claim them. */
+static
+psm2_error_t
+self_connect(ptl_t *ptl,
+	     int numep,
+	     const psm2_epid_t array_of_epid[],
+	     const int array_of_epid_mask[],
+	     psm2_error_t array_of_errors[],
+	     psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns)
+{
+	psmi_assert_always(ptl->epaddr != NULL);
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	for (i = 0; i < numep; i++) {
+		if (!array_of_epid_mask[i])
+			continue;	/* caller not interested in this slot */
+
+		if (array_of_epid[i] == ptl->epid) {
+			array_of_epaddr[i] = ptl->epaddr;
+			array_of_epaddr[i]->ptlctl = ptl->ctl;
+			array_of_epaddr[i]->epid = ptl->epid;
+			/* Record our hostname for diagnostics; a nonzero
+			 * return indicates allocation failure. */
+			if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid),
+						   psmi_gethostname(), 0)) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+			psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr);
+			array_of_errors[i] = PSM2_OK;
+		} else {
+			array_of_epaddr[i] = NULL;
+			array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+		}
+	}
+
+fail:
+	return err;
+}
+
+/* Disconnect: drop our own epid from the ep's address table for every
+ * masked-in entry that refers to this ptl's loopback epaddr. */
+static
+psm2_error_t
+self_disconnect(ptl_t *ptl, int force, int numep,
+		psm2_epaddr_t array_of_epaddr[],
+		const int array_of_epaddr_mask[],
+		psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	int idx;
+
+	for (idx = 0; idx < numep; idx++) {
+		if (!array_of_epaddr_mask[idx])
+			continue;
+		if (array_of_epaddr[idx] != ptl->epaddr)
+			continue;
+		psmi_epid_remove(ptl->ep, ptl->epid);
+		array_of_errors[idx] = PSM2_OK;
+	}
+	return PSM2_OK;
+}
+
+/* Report how much private state the core must allocate for this ptl. */
+static
+size_t self_ptl_sizeof(void)
+{
+	return sizeof(ptl_t);
+}
+
+/* Initialize the self ptl: cache the endpoint identity in ptl and fill
+ * in the ptl_ctl_t dispatch table the PSM2 core calls through.
+ * ('ustatic' presumably expands to static except under mock/unit
+ * testing -- see psm2_mock_testing.h; confirm.) */
+ustatic
+psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl)
+{
+	psmi_assert_always(ep != NULL);
+	psmi_assert_always(ep->epaddr != NULL);
+	psmi_assert_always(ep->epid != 0);
+
+	ptl->ep = ep;
+	ptl->epid = ep->epid;
+	ptl->epaddr = ep->epaddr;
+	ptl->ctl = ctl;
+
+	memset(ctl, 0, sizeof(*ctl));
+	/* Fill in the control structure */
+	ctl->ptl = ptl;
+	ctl->ep = ep;
+	ctl->ep_poll = NULL;	/* loopback needs no progress polling */
+	ctl->ep_connect = self_connect;
+	ctl->ep_disconnect = self_disconnect;
+
+	ctl->mq_send = self_mq_send;
+	ctl->mq_isend = self_mq_isend;
+
+	ctl->am_get_parameters = self_am_get_parameters;
+	ctl->am_short_request = self_am_short_request;
+	ctl->am_short_reply = self_am_short_reply;
+
+	/* No stats in self */
+	ctl->epaddr_stats_num = NULL;
+	ctl->epaddr_stats_init = NULL;
+	ctl->epaddr_stats_get = NULL;
+
+	return PSM2_OK;
+}
+
+/* Finalize hook: the self ptl holds no resources, so nothing to release. */
+static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+	return PSM2_OK;	/* nothing to do */
+}
+
+/* Option setter: the self PTL exposes no settable options, so every
+ * request is rejected with PSM2_PARAM_ERR.
+ * Fix: optname is an int, so the diagnostic must use %d -- the old %u
+ * was a printf format/argument type mismatch (-Wformat). */
+static
+psm2_error_t
+self_ptl_setopt(const void *component_obj, int optname,
+		const void *optval, uint64_t optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %d.", optname);
+}
+
+/* Option getter: the self PTL exposes no readable options, so every
+ * request is rejected with PSM2_PARAM_ERR.
+ * Fix: optname is an int, so the diagnostic must use %d -- the old %u
+ * was a printf format/argument type mismatch (-Wformat). */
+static
+psm2_error_t
+self_ptl_getopt(const void *component_obj, int optname,
+		void *optval, uint64_t *optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %d.", optname);
+}
+
+/* Only symbol we expose out of here */
+/* Positional initializer -- field order presumed to be sizeof, init,
+ * fini, setopt, getopt per struct ptl_ctl_init; confirm against ptl.h. */
+struct ptl_ctl_init
+psmi_ptl_self = {
+	self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt,
+	self_ptl_getopt
+};
diff --git a/ptl_self/ptl_fwd.h b/ptl_self/ptl_fwd.h
new file mode 100644
index 0000000..77ee7f9
--- /dev/null
+++ b/ptl_self/ptl_fwd.h
@@ -0,0 +1,62 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_SELF_H
+#define _PTL_FWD_SELF_H
+
+/* Control-structure entry points of the self (loopback) ptl; the single
+ * definition lives in ptl_self/ptl.c.  Declared 'extern' so including
+ * this header does not create a tentative definition in every
+ * translation unit (duplicate-symbol errors under -fno-common). */
+extern struct ptl_ctl_init psmi_ptl_self;
+
+#endif
diff --git a/rpm_release_extension b/rpm_release_extension
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++ b/rpm_release_extension
@@ -0,0 +1 @@
+8
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/libpsm2.git
More information about the Pkg-ofed-commits
mailing list