[lua-torch-torch7] 01/07: New upstream version 0~20170511-gae1a805
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Sat May 20 10:07:56 UTC 2017
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.
commit d98615a032d5fbd16e4563e1815d839a7ebb3663
Author: Zhou Mo <cdluminate at gmail.com>
Date: Sat May 20 09:10:04 2017 +0000
New upstream version 0~20170511-gae1a805
---
CMakeLists.txt | 2 +-
CmdLine.lua | 4 +-
FFI.lua => FFInterface.lua | 4 +
README.md | 6 +-
TensorMath.lua | 27 +-
doc/tensor.md | 24 +-
init.lua | 2 +-
lib/TH/CMakeLists.txt | 15 +-
lib/TH/README.md | 6 +-
lib/TH/THGenerateAllTypes.h | 10 +-
lib/TH/THGenerateByteType.h | 24 +
lib/TH/THGenerateCharType.h | 24 +
...GenerateFloatTypes.h => THGenerateDoubleType.h} | 21 +-
...HGenerateFloatTypes.h => THGenerateFloatType.h} | 21 +-
lib/TH/THGenerateFloatTypes.h | 44 +-
lib/TH/THGenerateHalfType.h | 2 +-
lib/TH/THGenerateIntType.h | 24 +
lib/TH/THGenerateIntTypes.h | 97 +-
lib/TH/THGenerateLongType.h | 24 +
lib/TH/THGenerateShortType.h | 24 +
lib/TH/THMath.h | 17 +-
lib/TH/THStorage.c | 37 +
lib/TH/THStorage.h | 1 +
lib/TH/THTensorApply.h | 10 +-
lib/TH/cmake/FindSSE.cmake | 2 +-
lib/TH/generic/THBlas.c | 9 +-
lib/TH/generic/THLapack.c | 16 +
lib/TH/generic/THLapack.h | 1 +
lib/TH/generic/THTensor.c | 23 +-
lib/TH/generic/THTensor.h | 1 +
lib/TH/generic/THTensorLapack.c | 177 ++
lib/TH/generic/THTensorLapack.h | 3 +
lib/TH/generic/THTensorMath.c | 175 +-
lib/TH/generic/THTensorMath.h | 25 +-
lib/TH/generic/simd/simd.h | 12 +-
lib/TH/vector/VSX.c | 2063 +++++++++++++-------
lib/luaT/README.md | 64 +-
test/test.lua | 38 +-
38 files changed, 2033 insertions(+), 1046 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 095a721..f3d1d47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ INCLUDE_DIRECTORIES(BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/lib/luaT")
LINK_DIRECTORIES("${LUA_LIBDIR}")
SET(src DiskFile.c File.c MemoryFile.c PipeFile.c Storage.c Tensor.c Timer.c utils.c init.c TensorOperator.c TensorMath.c random.c Generator.c)
-SET(luasrc init.lua File.lua Tensor.lua CmdLine.lua FFI.lua Tester.lua TestSuite.lua ${CMAKE_CURRENT_BINARY_DIR}/paths.lua test/test.lua)
+SET(luasrc init.lua File.lua Tensor.lua CmdLine.lua FFInterface.lua Tester.lua TestSuite.lua ${CMAKE_CURRENT_BINARY_DIR}/paths.lua test/test.lua)
# Necessary do generate wrapper
ADD_TORCH_WRAP(tensormathwrap TensorMath.lua)
diff --git a/CmdLine.lua b/CmdLine.lua
index 23e9969..6436350 100644
--- a/CmdLine.lua
+++ b/CmdLine.lua
@@ -227,9 +227,9 @@ end
function CmdLine:help(arg)
io.write('Usage: ')
if arg then io.write(arg[0] .. ' ') end
- io.write('[options] ')
+ io.write('[options]')
for i=1,#self.arguments do
- io.write('<' .. strip(self.arguments[i].key) .. '>')
+ io.write(' <' .. strip(self.arguments[i].key) .. '>')
end
io.write('\n')
diff --git a/FFI.lua b/FFInterface.lua
similarity index 97%
rename from FFI.lua
rename to FFInterface.lua
index 333e7b5..cb8bd33 100644
--- a/FFI.lua
+++ b/FFInterface.lua
@@ -1,3 +1,7 @@
+-- if this causes issues, you may need to:
+-- luarocks remove --force ffi
+-- and follow instructions to install
+-- https://github.com/facebook/luaffifb
local ok, ffi = pcall(require, 'ffi')
local function checkArgument(condition, fn, ud, msg, level)
diff --git a/README.md b/README.md
index 96f0dd8..1a51c31 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
* Reporting bugs: [torch7](https://github.com/torch/torch7/issues) [nn](https://github.com/torch/nn/issues) [cutorch](https://github.com/torch/cutorch/issues) [cunn](https://github.com/torch/cutorch/issues) [optim](https://github.com/torch/optim/issues) [threads](https://github.com/torch/threads/issues)
* Hanging out with other developers and users (strictly no install issues, no large blobs of text): [Gitter Chat](https://gitter.im/torch/torch7)
-<a name="torch.reference.dok"/>
+<a name="torch.reference.dok"></a>
# Torch Package Reference Manual #
__Torch__ is the main package in [Torch7](http://torch.ch) where data
@@ -16,7 +16,7 @@ over these are defined. Additionally, it provides many utilities for
accessing files, serializing objects of arbitrary types and other
useful utilities.
-<a name="torch.overview.dok"/>
+<a name="torch.overview.dok"></a>
## Torch Packages ##
* Tensor Library
@@ -36,7 +36,7 @@ useful utilities.
* [Random](doc/random.md) defines a random number generator package with various distributions.
* Finally useful [utility](doc/utility.md) functions are provided for easy handling of torch tensor types and class inheritance.
-<a name="torch.links.dok"/>
+<a name="torch.links.dok"></a>
## Useful Links ##
* [Community packages](https://github.com/torch/torch7/wiki/Cheatsheet)
diff --git a/TensorMath.lua b/TensorMath.lua
index 53838ae..45e07c6 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -616,7 +616,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
cname("sum"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("prod",
cname("prodall"),
@@ -625,7 +626,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
cname("prod"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"min", "max"}) do
wrap(name,
@@ -636,7 +638,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
{{name=Tensor, default=true, returned=true},
{name="IndexTensor", default=true, returned=true, noreadadd=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
end
for _,name in ipairs({"cmin", "cmax"}) do
@@ -719,21 +722,24 @@ wrap("topk",
{name="IndexTensor", default=true, returned=true, noreadadd=true},
{name=Tensor},
{name="long"},
- {name="index", default=lastdim(3)}})
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
wrap("mode",
cname("mode"),
{{name=Tensor, default=true, returned=true},
{name="IndexTensor", default=true, returned=true, noreadadd=true},
{name=Tensor},
- {name="index", default=lastdim(3)}})
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
wrap("median",
cname("median"),
{{name=Tensor, default=true, returned=true},
{name="IndexTensor", default=true, returned=true, noreadadd=true},
{name=Tensor},
- {name="index", default=lastdim(3)}})
+ {name="index", default=lastdim(3)},
+ {name="boolean", default=true, invisible=true}})
wrap("tril",
cname("tril"),
@@ -1083,7 +1089,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
cname("mean"),
{{name=Tensor, default=true, returned=true},
{name=Tensor},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
for _,name in ipairs({"var", "std"}) do
wrap(name,
@@ -1094,7 +1101,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name="index"},
- {name="boolean", default=false}})
+ {name="boolean", default=false},
+ {name="boolean", default=true, invisible=true}})
end
wrap("histc",
cname("histc"),
@@ -1121,7 +1129,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
{{name=Tensor, default=true, returned=true},
{name=Tensor},
{name=real},
- {name="index"}})
+ {name="index"},
+ {name="boolean", default=true, invisible=true}})
wrap("renorm",
cname("renorm"),
diff --git a/doc/tensor.md b/doc/tensor.md
index 5809dc1..d18af99 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -4,14 +4,14 @@
The `Tensor` class is probably the most important class in
`Torch`. Almost every package depends on this class. It is *__the__*
class for handling numeric data. As with pretty much anything in
-[Torch7](./../index.md), tensors are
+[Torch7](./index.md), tensors are
[serializable](file.md#torch.File.serialization).
__Multi-dimensional matrix__
-A `Tensor` is a potentially multi-dimensional matrix. The number of
-dimensions is unlimited that can be created using
-[LongStorage](storage.md) with more dimensions.
+A `Tensor` is a multi-dimensional matrix. The number of
+dimensions is unlimited (up to what can be created using
+[LongStorage](storage.md)).
Example:
```lua
@@ -554,7 +554,7 @@ false
<a name="torch.Tensor.double"></a>
<a name="torch.Tensor.float"></a>
-Convenience methods for the [type](#torch.type) method. For e.g.,
+Convenience methods for the [type](#torch.type) method. For example:
```lua
x = torch.Tensor(3):fill(3.14)
> x
@@ -1627,7 +1627,7 @@ y:maskedCopy(mask, x)
[torch.DoubleTensor of dimension 2x4]
```
-Note how the dimensions of the above `x`, `mask` and `y' do not match,
+Note how the dimensions of the above `x`, `mask` and `y` do not match,
but the number of elements do.
<a name="torch.Tensor.maskedFill"></a>
@@ -1663,8 +1663,8 @@ but the number of elements do.
Each of these methods returns a `LongTensor` corresponding to the indices of the
given search operation.
-<a name="torch.Tensor.nonzero"/>
-### [LongTensor] nonzero(tensor)
+<a name="torch.Tensor.nonzero"></a>
+### [LongTensor] nonzero(tensor) ###
Finds and returns a `LongTensor` corresponding to the *subscript* indices of all
non-zero elements in `tensor`.
@@ -1741,7 +1741,7 @@ These methods returns a Tensor which is created by replications of the
original tensor.
<a name="torch.expand"></a>
-#### [result] expand([result,] sizes) ####
+### [result] expand([result,] sizes) ###
`sizes` can either be a `torch.LongStorage` or numbers. Expanding a tensor
does not allocate new memory, but only creates a new view on the existing tensor where
@@ -1835,12 +1835,12 @@ i=0; y:apply(function() i=i+1;return i end)
```
<a name="torch.Tensor.expandAs"></a>
-#### [result] expandAs([result,] tensor) ####
+### [result] expandAs([result,] tensor) ###
This is equivalent to `self:expand(tensor:size())`
<a name="torch.repeatTensor"></a>
-#### [Tensor] repeatTensor([result,] sizes) ####
+### [Tensor] repeatTensor([result,] sizes) ###
`sizes` can either be a `torch.LongStorage` or numbers. Repeating a tensor allocates
new memory, unless `result` is provided, in which case its memory is
@@ -1879,7 +1879,7 @@ x = torch.rand(5)
```
<a name="torch.squeeze"></a>
-#### [Tensor] squeeze([dim]) ####
+### [Tensor] squeeze([dim]) ###
Removes all singleton dimensions of the tensor.
If `dim` is given, squeezes only that particular dimension of the tensor.
diff --git a/init.lua b/init.lua
index 517f4c8..0f3cfbb 100644
--- a/init.lua
+++ b/init.lua
@@ -155,7 +155,7 @@ torch.setdefaulttensortype('torch.DoubleTensor')
require('torch.Tensor')
require('torch.File')
require('torch.CmdLine')
-require('torch.FFI')
+require('torch.FFInterface')
require('torch.Tester')
require('torch.TestSuite')
require('torch.test')
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 8aeb204..c4e6694 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -25,8 +25,8 @@ ENDIF()
######################################################################
IF(MSVC)
- # MSVC now supports C99 since VS2013/VS2015
- SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
+ # MSVC now supports C99 since VS2013/VS2015, however the standard version switch is not provided yet
+ # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
ELSE(MSVC)
# enable gnu99 and not c99 because we use
# gnu extensions like posix_memalign
@@ -212,7 +212,7 @@ ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
IF(C_AVX_FOUND)
IF(MSVC)
SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
- SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX_FLAGS}")
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX ${C_AVX_FLAGS}")
ELSE(MSVC)
SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
@@ -222,7 +222,7 @@ ENDIF(C_AVX_FOUND)
IF(C_AVX2_FOUND)
IF(MSVC)
- SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX2_FLAGS}")
+ SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX2 ${C_AVX2_FLAGS}")
ELSE(MSVC)
SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
ENDIF(MSVC)
@@ -359,7 +359,14 @@ INSTALL(FILES
THFilePrivate.h
${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
THGenerateAllTypes.h
+ THGenerateDoubleType.h
+ THGenerateFloatType.h
THGenerateHalfType.h
+ THGenerateLongType.h
+ THGenerateIntType.h
+ THGenerateShortType.h
+ THGenerateCharType.h
+ THGenerateByteType.h
THGenerateFloatTypes.h
THGenerateIntTypes.h
THLapack.h
diff --git a/lib/TH/README.md b/lib/TH/README.md
index c646ce9..4ac26c1 100644
--- a/lib/TH/README.md
+++ b/lib/TH/README.md
@@ -1,7 +1,11 @@
Environment variables control the disabling of certain explicit SIMD optimizations.
```
+x64 options:
TH_NO_AVX2=1 # disable AVX2 codepaths
TH_NO_AVX=1 # disable AVX codepaths
TH_NO_SSE=1 # disable SSE codepaths
-```
\ No newline at end of file
+
+ppc64le options:
+TH_NO_VSX=1 # disable VSX codepaths
+```
diff --git a/lib/TH/THGenerateAllTypes.h b/lib/TH/THGenerateAllTypes.h
index 778da9d..5b9508d 100644
--- a/lib/TH/THGenerateAllTypes.h
+++ b/lib/TH/THGenerateAllTypes.h
@@ -2,10 +2,16 @@
#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
#endif
-#define THGenerateAllTypes
+#ifndef THGenerateManyTypes
+#define THAllLocalGenerateManyTypes
+#define THGenerateManyTypes
+#endif
#include "THGenerateFloatTypes.h"
#include "THGenerateIntTypes.h"
-#undef THGenerateAllTypes
+#ifdef THAllLocalGenerateManyTypes
+#undef THAllLocalGenerateManyTypes
+#undef THGenerateManyTypes
#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THGenerateByteType.h b/lib/TH/THGenerateByteType.h
new file mode 100644
index 0000000..71ce7c4
--- /dev/null
+++ b/lib/TH/THGenerateByteType.h
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateByteType.h"
+#endif
+
+#define real unsigned char
+#define accreal long
+#define Real Byte
+#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
+#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
+#define THInf UCHAR_MAX
+#define TH_REAL_IS_BYTE
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_BYTE
+#undef TH_CONVERT_REAL_TO_ACCREAL
+#undef TH_CONVERT_ACCREAL_TO_REAL
+
+#ifndef THGenerateManyTypes
+#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THGenerateCharType.h b/lib/TH/THGenerateCharType.h
new file mode 100644
index 0000000..158dd0e
--- /dev/null
+++ b/lib/TH/THGenerateCharType.h
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateCharType.h"
+#endif
+
+#define real char
+#define accreal long
+#define Real Char
+#define THInf CHAR_MAX
+#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
+#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
+#define TH_REAL_IS_CHAR
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_CHAR
+#undef TH_CONVERT_REAL_TO_ACCREAL
+#undef TH_CONVERT_ACCREAL_TO_REAL
+
+#ifndef THGenerateManyTypes
+#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THGenerateFloatTypes.h b/lib/TH/THGenerateDoubleType.h
similarity index 54%
copy from lib/TH/THGenerateFloatTypes.h
copy to lib/TH/THGenerateDoubleType.h
index 639c1f9..fffee60 100644
--- a/lib/TH/THGenerateFloatTypes.h
+++ b/lib/TH/THGenerateDoubleType.h
@@ -1,24 +1,7 @@
#ifndef TH_GENERIC_FILE
-#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
+#error "You must define TH_GENERIC_FILE before including THGenerateDoubleType.h"
#endif
-#define real float
-#define accreal double
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Float
-#define THInf FLT_MAX
-#define TH_REAL_IS_FLOAT
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef accreal
-#undef real
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_FLOAT
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
-
#define real double
#define accreal double
#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
@@ -36,6 +19,6 @@
#undef TH_CONVERT_REAL_TO_ACCREAL
#undef TH_CONVERT_ACCREAL_TO_REAL
-#ifndef THGenerateAllTypes
+#ifndef THGenerateManyTypes
#undef TH_GENERIC_FILE
#endif
diff --git a/lib/TH/THGenerateFloatTypes.h b/lib/TH/THGenerateFloatType.h
similarity index 54%
copy from lib/TH/THGenerateFloatTypes.h
copy to lib/TH/THGenerateFloatType.h
index 639c1f9..a31b50c 100644
--- a/lib/TH/THGenerateFloatTypes.h
+++ b/lib/TH/THGenerateFloatType.h
@@ -1,5 +1,5 @@
#ifndef TH_GENERIC_FILE
-#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
+#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h"
#endif
#define real float
@@ -19,23 +19,6 @@
#undef TH_CONVERT_REAL_TO_ACCREAL
#undef TH_CONVERT_ACCREAL_TO_REAL
-#define real double
-#define accreal double
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Double
-#define THInf DBL_MAX
-#define TH_REAL_IS_DOUBLE
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef accreal
-#undef real
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_DOUBLE
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
-
-#ifndef THGenerateAllTypes
+#ifndef THGenerateManyTypes
#undef TH_GENERIC_FILE
#endif
diff --git a/lib/TH/THGenerateFloatTypes.h b/lib/TH/THGenerateFloatTypes.h
index 639c1f9..be5ea84 100644
--- a/lib/TH/THGenerateFloatTypes.h
+++ b/lib/TH/THGenerateFloatTypes.h
@@ -1,41 +1,17 @@
#ifndef TH_GENERIC_FILE
-#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
+#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
-#define real float
-#define accreal double
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Float
-#define THInf FLT_MAX
-#define TH_REAL_IS_FLOAT
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef accreal
-#undef real
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_FLOAT
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
+#ifndef THGenerateManyTypes
+#define THFloatLocalGenerateManyTypes
+#define THGenerateManyTypes
+#endif
-#define real double
-#define accreal double
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Double
-#define THInf DBL_MAX
-#define TH_REAL_IS_DOUBLE
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef accreal
-#undef real
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_DOUBLE
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
+#include "THGenerateFloatType.h"
+#include "THGenerateDoubleType.h"
-#ifndef THGenerateAllTypes
+#ifdef THFloatLocalGenerateManyTypes
+#undef THFloatLocalGenerateManyTypes
+#undef THGenerateManyTypes
#undef TH_GENERIC_FILE
#endif
diff --git a/lib/TH/THGenerateHalfType.h b/lib/TH/THGenerateHalfType.h
index c42cb3f..47ff1e8 100644
--- a/lib/TH/THGenerateHalfType.h
+++ b/lib/TH/THGenerateHalfType.h
@@ -20,6 +20,6 @@
#undef TH_CONVERT_REAL_TO_ACCREAL
#undef TH_CONVERT_ACCREAL_TO_REAL
-#ifndef THGenerateAllTypes
+#ifndef THGenerateManyTypes
#undef TH_GENERIC_FILE
#endif
diff --git a/lib/TH/THGenerateIntType.h b/lib/TH/THGenerateIntType.h
new file mode 100644
index 0000000..1562b9e
--- /dev/null
+++ b/lib/TH/THGenerateIntType.h
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateIntType.h"
+#endif
+
+#define real int
+#define accreal long
+#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
+#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
+#define Real Int
+#define THInf INT_MAX
+#define TH_REAL_IS_INT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_INT
+#undef TH_CONVERT_REAL_TO_ACCREAL
+#undef TH_CONVERT_ACCREAL_TO_REAL
+
+#ifndef THGenerateManyTypes
+#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THGenerateIntTypes.h b/lib/TH/THGenerateIntTypes.h
index 1d991ca..9931fb1 100644
--- a/lib/TH/THGenerateIntTypes.h
+++ b/lib/TH/THGenerateIntTypes.h
@@ -2,92 +2,19 @@
#error "You must define TH_GENERIC_FILE before including THGenerateIntTypes.h"
#endif
-#define real unsigned char
-#define accreal long
-#define Real Byte
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define THInf UCHAR_MAX
-#define TH_REAL_IS_BYTE
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef real
-#undef accreal
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_BYTE
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
-
-
-#define real char
-#define accreal long
-#define Real Char
-#define THInf CHAR_MAX
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define TH_REAL_IS_CHAR
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef real
-#undef accreal
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_CHAR
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
-
-#define real short
-#define accreal long
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Short
-#define THInf SHRT_MAX
-#define TH_REAL_IS_SHORT
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef real
-#undef accreal
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_SHORT
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
-
-#define real int
-#define accreal long
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Int
-#define THInf INT_MAX
-#define TH_REAL_IS_INT
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef real
-#undef accreal
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_INT
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
+#ifndef THGenerateManyTypes
+#define THIntLocalGenerateManyTypes
+#define THGenerateManyTypes
+#endif
-#define real long
-#define accreal long
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
-#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
-#define Real Long
-#define THInf LONG_MAX
-#define TH_REAL_IS_LONG
-#line 1 TH_GENERIC_FILE
-#include TH_GENERIC_FILE
-#undef real
-#undef accreal
-#undef Real
-#undef THInf
-#undef TH_REAL_IS_LONG
-#undef TH_CONVERT_REAL_TO_ACCREAL
-#undef TH_CONVERT_ACCREAL_TO_REAL
+#include "THGenerateByteType.h"
+#include "THGenerateCharType.h"
+#include "THGenerateShortType.h"
+#include "THGenerateIntType.h"
+#include "THGenerateLongType.h"
-#ifndef THGenerateAllTypes
+#ifdef THIntLocalGenerateManyTypes
+#undef THIntLocalGenerateManyTypes
+#undef THGenerateManyTypes
#undef TH_GENERIC_FILE
#endif
diff --git a/lib/TH/THGenerateLongType.h b/lib/TH/THGenerateLongType.h
new file mode 100644
index 0000000..75f90e1
--- /dev/null
+++ b/lib/TH/THGenerateLongType.h
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateLongType.h"
+#endif
+
+#define real long
+#define accreal long
+#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
+#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
+#define Real Long
+#define THInf LONG_MAX
+#define TH_REAL_IS_LONG
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_LONG
+#undef TH_CONVERT_REAL_TO_ACCREAL
+#undef TH_CONVERT_ACCREAL_TO_REAL
+
+#ifndef THGenerateManyTypes
+#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THGenerateShortType.h b/lib/TH/THGenerateShortType.h
new file mode 100644
index 0000000..047e51a
--- /dev/null
+++ b/lib/TH/THGenerateShortType.h
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateShortType.h"
+#endif
+
+#define real short
+#define accreal long
+#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
+#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
+#define Real Short
+#define THInf SHRT_MAX
+#define TH_REAL_IS_SHORT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_SHORT
+#undef TH_CONVERT_REAL_TO_ACCREAL
+#undef TH_CONVERT_ACCREAL_TO_REAL
+
+#ifndef THGenerateManyTypes
+#undef TH_GENERIC_FILE
+#endif
diff --git a/lib/TH/THMath.h b/lib/TH/THMath.h
index b96083f..004e4fe 100644
--- a/lib/TH/THMath.h
+++ b/lib/TH/THMath.h
@@ -17,5 +17,20 @@ static inline double TH_lerp(double a, double b, double weight) {
return a + weight * (b-a);
}
-#endif // _THMATH_H
+static inline float TH_sigmoidf(float value) {
+ return 1.0f / (1.0f + expf(-value));
+}
+
+static inline float TH_fracf(float x) {
+ return x - truncf(x);
+}
+
+static inline float TH_rsqrtf(float x) {
+ return 1.0f / sqrtf(x);
+}
+static inline float TH_lerpf(float a, float b, float weight) {
+ return a + weight * (b-a);
+}
+
+#endif // _THMATH_H
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index 9c48e77..cea0f95 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -65,3 +65,40 @@ TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t
}
return copy;
}
+
+TH_API void THLongStorage_calculateExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, THLongStorage *sizes, long **esz, long **est) {
+ ptrdiff_t ndim = THLongStorage_size(sizes);
+ long numUnsqueezed = ndim - tensorDim;
+
+ long *expandedSizes = THAlloc(sizeof(long)*ndim);
+ long *expandedStrides = THAlloc(sizeof(long)*ndim);
+
+ for (long i = numUnsqueezed; i < ndim; ++i) {
+ expandedSizes[i] = tensorSizes[i - numUnsqueezed];
+ expandedStrides[i] = tensorStrides[i - numUnsqueezed];
+ }
+
+ for (long i = numUnsqueezed - 1; i > -1; --i) {
+ expandedSizes[i] = 1;
+ expandedStrides[i] = expandedSizes[i+1] * expandedStrides[i+1];
+ }
+
+ // create a new geometry for the tensor
+ for (long i = 0; i < ndim; ++i) {
+ long size = expandedSizes[i];
+ long targetSize = THLongStorage_data(sizes)[i];
+ if (size == 1) {
+ if (targetSize != 1) {
+ expandedSizes[i] = targetSize;
+ expandedStrides[i] = 0;
+ }
+ } else if (size != targetSize) {
+ THFree(expandedSizes);
+ THFree(expandedStrides);
+ THError("The expanded size of the tensor (%d) must match the existing size (%d) at \
+ non-singleton dimension %ld.", targetSize, size, i);
+ }
+ }
+ *esz = expandedSizes;
+ *est = expandedStrides;
+}
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index df80229..f81926c 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -30,5 +30,6 @@ typedef struct {
TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size);
TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement);
+TH_API void THLongStorage_calculateExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, THLongStorage *sizes, long **esz, long **est);
#endif
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 3e6ed6e..86672b9 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -34,7 +34,7 @@
TYPE *TENSOR##_data = NULL; \
long *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \
long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \
- int TENSOR##_contiguous = ALLOW_CONTIGUOUS; \
+ int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \
TENSOR##_n = (TENSOR->nDimension ? 1 : 0); \
for(TENSOR##_i = 0; TENSOR##_i < TENSOR->nDimension; TENSOR##_i++) \
TENSOR##_n *= TENSOR->size[TENSOR##_i]; \
@@ -61,7 +61,7 @@
TENSOR##_dim = 1; \
for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; TENSOR##_i--) \
{ \
- if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM) \
+ if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \
TENSOR##_dim++; \
} \
/* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \
@@ -69,7 +69,7 @@
TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \
TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \
TH_TENSOR_dim_index = TENSOR##_dim-1; \
- TENSOR##_dimOffset = &TENSOR##_counter[DIM]; \
+ TENSOR##_dimOffset = (DIM == TENSOR->nDimension-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->nDimension-1]; \
TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \
/* TENSOR##_counter tracks where we are in the storage. The offset into the */ \
@@ -79,9 +79,9 @@
TENSOR##_counter[TENSOR##_i] = 0; \
} \
for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; --TENSOR##_i) { \
- if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM) { \
+ if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \
TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \
- if (TENSOR##_i < DIM) \
+ if (DIM != TENSOR->nDimension-1 && TENSOR##_i < DIM) \
TENSOR##_dimOffset--; \
} else { \
--TH_TENSOR_dim_index; \
diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
index f84ce89..a14abe8 100644
--- a/lib/TH/cmake/FindSSE.cmake
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -73,7 +73,7 @@ SET(AVX2_CODE "
int main()
{
- __m256i a;
+ __m256i a = {0};
a = _mm256_abs_epi16(a);
return 0;
}
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 371df4d..b04931f 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -83,8 +83,13 @@ void THBlas_(scal)(long n, real a, real *x, long incx)
#endif
{
long i;
- for(i = 0; i < n; i++)
- x[i*incx] *= a;
+ for(i = 0; i < n; i++) {
+ if (a == 0) {
+ x[i*incx] = 0;
+ } else {
+ x[i*incx] *= a;
+ }
+ }
}
}
diff --git a/lib/TH/generic/THLapack.c b/lib/TH/generic/THLapack.c
index 7a9321b..148ae26 100644
--- a/lib/TH/generic/THLapack.c
+++ b/lib/TH/generic/THLapack.c
@@ -17,6 +17,8 @@ TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int
TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info);
TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
+TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
+TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);
TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info);
TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);
@@ -138,6 +140,20 @@ void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info)
THError("getrf : Lapack library not found in compile time\n");
#endif
}
+
+void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+ dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info);
+#else
+ sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info);
+#endif
+#else
+ THError("getrs : Lapack library not found in compile time\n");
+#endif
+}
+
/* Matrix Inverse */
void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info)
{
diff --git a/lib/TH/generic/THLapack.h b/lib/TH/generic/THLapack.h
index da9df91..b464dd2 100644
--- a/lib/TH/generic/THLapack.h
+++ b/lib/TH/generic/THLapack.h
@@ -16,6 +16,7 @@ TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, rea
TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info);
/* LU decomposition */
TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info);
+TH_API void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info);
/* Matrix Inverse */
TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info);
diff --git a/lib/TH/generic/THTensor.c b/lib/TH/generic/THTensor.c
index 7d88bef..eab7231 100644
--- a/lib/TH/generic/THTensor.c
+++ b/lib/TH/generic/THTensor.c
@@ -21,14 +21,14 @@ int THTensor_(nDimension)(const THTensor *self)
long THTensor_(size)(const THTensor *self, int dim)
{
THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor",
- dim+1, THTensor_(nDimension)(self));
+ dim+TH_INDEX_BASE, THTensor_(nDimension)(self));
return self->size[dim];
}
long THTensor_(stride)(const THTensor *self, int dim)
{
- THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", dim+1,
- THTensor_(nDimension)(self));
+ THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor",
+ dim+TH_INDEX_BASE, THTensor_(nDimension)(self));
return self->stride[dim];
}
@@ -285,6 +285,23 @@ void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, lon
THTensor_(resizeNd)(self, 5, size, NULL);
}
+THTensor* THTensor_(newExpand)(THTensor *tensor, THLongStorage *sizes) {
+ THArgCheck(THLongStorage_size(sizes) >= THTensor_(nDimension)(tensor), 1, "the number of sizes provided \
+ must be greater or equal to the number of dimensions in the tensor");
+ THArgCheck(THTensor_(nDimension)(tensor) > 0, 0, "can't expand an empty tensor");
+
+ long *expandedSizes;
+ long *expandedStrides;
+ THLongStorage_calculateExpandGeometry(tensor->size, tensor->stride, THTensor_(nDimension)(tensor), sizes, &expandedSizes, &expandedStrides);
+
+ THTensor *result = THTensor_(new)();
+ THTensor_(setStorageNd)(result, THTensor_(storage)(tensor), THTensor_(storageOffset)(tensor), THLongStorage_size(sizes), expandedSizes, expandedStrides);
+ THFree(expandedSizes);
+ THFree(expandedStrides);
+
+ return result;
+}
+
void THTensor_(set)(THTensor *self, THTensor *src)
{
if(self != src)
diff --git a/lib/TH/generic/THTensor.h b/lib/TH/generic/THTensor.h
index 2fac0a8..8754796 100644
--- a/lib/TH/generic/THTensor.h
+++ b/lib/TH/generic/THTensor.h
@@ -69,6 +69,7 @@ TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long fir
TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_);
TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_);
TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size);
+TH_API THTensor *THTensor_(newExpand)(THTensor *tensor, THLongStorage *size);
TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index fe82cc0..d0196c9 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -938,4 +938,181 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co
THTensor_(free)(work);
}
+void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, THTensor *a)
+{
+ THArgCheck(THTensor_(nDimension)(a) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(a));
+
+ if (ra_ != a) {
+ THTensor_(resizeAs)(ra_, a);
+ THTensor_(copy)(ra_, a);
+ }
+
+ int m = a->size[1];
+ int n = a->size[2];
+ if (m != n) {
+ THError("btrifact is only implemented for square matrices");
+ }
+ long num_batches = THTensor_(size)(a, 0);
+ THTensor *ra__;
+ int lda;
+
+ if (ra_->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = ra_->stride[2];
+ ra__ = ra_;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ THTensor *transp_r_ = THTensor_(newTranspose)(ra_, 1, 2);
+ ra__ = THTensor_(newClone)(transp_r_);
+ THTensor_(free)(transp_r_);
+ THTensor_(transpose)(ra__, NULL, 1, 2);
+ lda = ra__->stride[2];
+ }
+
+ THTensor *ai = THTensor_(new)();
+ THTensor *rai = THTensor_(new)();
+ THIntTensor *rpivoti = THIntTensor_new();
+
+ int info = 0;
+ int *info_ptr = &info;
+ if (rinfo_) {
+ THIntTensor_resize1d(rinfo_, num_batches);
+ info_ptr = THIntTensor_data(rinfo_);
+ }
+
+ THIntTensor_resize2d(rpivots_, num_batches, n);
+
+ long batch = 0;
+ for (; batch < num_batches; ++batch) {
+ THTensor_(select)(ai, a, 0, batch);
+ THTensor_(select)(rai, ra__, 0, batch);
+ THIntTensor_select(rpivoti, rpivots_, 0, batch);
+
+ THLapack_(getrf)(n, n, THTensor_(data)(rai), lda,
+ THIntTensor_data(rpivoti), info_ptr);
+ if (rinfo_) {
+ info_ptr++;
+ } else if (info != 0) {
+ break;
+ }
+ }
+
+ THTensor_(free)(ai);
+ THTensor_(free)(rai);
+ THIntTensor_free(rpivoti);
+
+ if (ra__ != ra_) {
+ THTensor_(freeCopyTo)(ra__, ra_);
+ }
+
+ if (!rinfo_ && info != 0) {
+ THError("failed to factorize batch element %ld (info == %d)", batch, info);
+ }
+}
+
+void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots)
+{
+ THArgCheck(THTensor_(nDimension)(atf) == 3, 1, "expected 3D tensor, got %dD",
+ THTensor_(nDimension)(atf));
+ THArgCheck(THTensor_(nDimension)(b) == 3 ||
+ THTensor_(nDimension)(b) == 2, 4, "expected 2D or 3D tensor");
+ THArgCheck(THTensor_(size)(atf, 0) ==
+ THTensor_(size)(b, 0), 3, "number of batches must be equal");
+ THArgCheck(THTensor_(size)(atf, 1) ==
+ THTensor_(size)(atf, 2), 3, "A matrices must be square");
+ THArgCheck(THTensor_(size)(atf, 1) ==
+ THTensor_(size)(b, 1), 3, "dimensions of A and b must be equal");
+
+ if (rb_ != b) {
+ THTensor_(resizeAs)(rb_, b);
+ THTensor_(copy)(rb_, b);
+ }
+
+ long num_batches = atf->size[0];
+ long n = atf->size[1];
+ int nrhs = rb_->nDimension > 2 ? rb_->size[2] : 1;
+
+ int lda, ldb;
+ THTensor *atf_;
+ THTensor *rb__;
+
+ // correct ordering of A
+ if (atf->stride[1] == 1) {
+ // column ordered, what BLAS wants
+ lda = atf->stride[2];
+ atf_ = atf;
+ } else {
+ // not column ordered, need to make it such (requires copy)
+ // it would be nice if we could use the op(A) flags to automatically
+ // transpose A if needed, but this leads to unpredictable behavior if the
+ // user clones A_tf later with a different ordering
+ THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2);
+ atf_ = THTensor_(newClone)(transp_r_);
+ THTensor_(free)(transp_r_);
+ THTensor_(transpose)(atf_, NULL, 1, 2);
+ lda = atf_->stride[2];
+ }
+
+ // correct ordering of B
+ if (rb_->stride[1] == 1) {
+ // column ordered
+ if (rb_->nDimension == 2 || rb_->size[2] == 1) {
+ ldb = n;
+ } else {
+ ldb = rb_->stride[2];
+ }
+ rb__ = rb_;
+ } else {
+ // make column ordered
+ if (rb_->nDimension > 2) {
+ THTensor *transp_r_ = THTensor_(newTranspose)(rb_, 1, 2);
+ rb__ = THTensor_(newClone)(transp_r_);
+ THTensor_(free)(transp_r_);
+ THTensor_(transpose)(rb__, NULL, 1, 2);
+ ldb = rb__->stride[2];
+ } else {
+ rb__ = THTensor_(newClone)(rb_);
+ ldb = n;
+ }
+ }
+
+ THTensor *ai = THTensor_(new)();
+ THTensor *rbi = THTensor_(new)();
+ THIntTensor *pivoti = THIntTensor_new();
+
+ if (!THIntTensor_isContiguous(pivots)) {
+ THError("Error: rpivots_ is not contiguous.");
+ }
+
+ for (long batch = 0; batch < num_batches; ++batch) {
+ THTensor_(select)(ai, atf_, 0, batch);
+ THTensor_(select)(rbi, rb__, 0, batch);
+ THIntTensor_select(pivoti, pivots, 0, batch);
+
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+ int info;
+ THLapack_(getrs)('N', n, nrhs, THTensor_(data)(ai), lda,
+ THIntTensor_data(pivoti), THTensor_(data)(rbi),
+ ldb, &info);
+ if (info != 0) {
+ THError("Error: Nonzero info.");
+ }
+#else
+ THError("Unimplemented");
+#endif
+ }
+
+ THTensor_(free)(ai);
+ THTensor_(free)(rbi);
+ THIntTensor_free(pivoti);
+
+ if (atf_ != atf) {
+ THTensor_(free)(atf_);
+ }
+
+ if (rb__ != rb_) {
+ THTensor_(freeCopyTo)(rb__, rb_);
+ }
+}
+
#endif
diff --git a/lib/TH/generic/THTensorLapack.h b/lib/TH/generic/THTensorLapack.h
index 1a19977..29db83b 100644
--- a/lib/TH/generic/THTensorLapack.h
+++ b/lib/TH/generic/THTensorLapack.h
@@ -19,4 +19,7 @@ TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau);
TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans);
TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol);
+TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, THTensor *a);
+TH_API void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots);
+
#endif
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index b95d81f..1dc1bc7 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -715,7 +715,10 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
rp[i] = (value == 0)? NAN : tp[i] - value * floor(tp[i] / value);
#else
- rp[i] = tp[i] - value * (tp[i] / value); // There is no NAN for integers
+ // There is no NAN for integers
+ rp[i] = tp[i] % value;
+ if (rp[i] * value < 0)
+ rp[i] += value;
#endif
}
} else {
@@ -723,7 +726,8 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
#else
// There is no NAN for integers
- TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data - value * (*t_data / value););
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data % value;
+ if (*r__data * value < 0) *r__data += value;);
#endif
}
}
@@ -991,7 +995,10 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
rp[i] = (sp[i] == 0)? NAN : tp[i] - sp[i] * floor(tp[i] / sp[i]);
#else
- rp[i] = tp[i] - sp[i] * (tp[i] / sp[i]); // There is no NAN for integers
+ // There is no NAN for integers
+ rp[i] = tp[i] % sp[i];
+ if (rp[i] * sp[i] < 0)
+ rp[i] += sp[i];
#endif
}
} else {
@@ -999,7 +1006,8 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = (*src_data == 0)? NAN : *t_data - *src_data * floor(*t_data / *src_data););
#else
// There is no NAN for integers
- TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data - *src_data * (*t_data / *src_data););
+ TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data % *src_data;
+ if (*r__data * *src_data < 0) *r__data += *src_data;);
#endif
}
@@ -1357,7 +1365,10 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor
THTensor_(copy)(r_, t);
}
- if(beta != 1)
+ if(beta == 0) {
+ THTensor_(zero)(r_);
+ }
+ else if(beta != 1)
THTensor_(mul)(r_, r_, beta);
if(r_->stride[0] == 1)
@@ -1474,7 +1485,7 @@ ptrdiff_t THTensor_(numel)(THTensor *t)
return THTensor_(nElement)(t);
}
-void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
{
THLongStorage *dim;
@@ -1543,9 +1554,14 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
THTensor_(free)(tempValues_);
THLongTensor_free(tempIndices_);
}
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(values_, values_, dimension);
+ THLongTensor_squeeze1d(indices_, indices_, dimension);
+ }
}
-void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
{
THLongStorage *dim;
@@ -1611,10 +1627,15 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
*tempIndices__data = *tempIndices__dimOffset;
});
}
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(values_, values_, dimension);
+ THLongTensor_squeeze1d(indices_, indices_, dimension);
+ }
}
-void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim)
{
THLongStorage *dim;
@@ -1644,9 +1665,13 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;);
THTensor_(free)(temp_);
}
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(r_, r_, dimension);
+ }
}
-void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim)
{
THLongStorage *dim;
@@ -1676,6 +1701,10 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;);
THTensor_(free)(temp_);
}
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(r_, r_, dimension);
+ }
}
void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
@@ -2244,7 +2273,7 @@ static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements,
#undef REAL_SWAP
#undef BOTH_SWAP
-void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
{
THLongStorage *dim;
THTensor *temp_;
@@ -2302,9 +2331,13 @@ void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
THTensor_(free)(temp_);
THLongTensor_free(tempi_);
+ if (!keepdim) {
+ THTensor_(squeeze1d)(values_, values_, dimension);
+ THLongTensor_squeeze1d(indices_, indices_, dimension);
+ }
}
-void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension)
+void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim)
{
THLongStorage *dim;
THTensor *temp_;
@@ -2344,9 +2377,13 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t,
THTensor_(free)(temp_);
THLongTensor_free(tempi_);
+ if (!keepdim) {
+ THTensor_(squeeze1d)(values_, values_, dimension);
+ THLongTensor_squeeze1d(indices_, indices_, dimension);
+ }
}
-void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
{
long t_size_dim, k;
@@ -2355,7 +2392,7 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i
t_size_dim = THTensor_(size)(t, dimension);
k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */
- THTensor_(kthvalue)(values_, indices_, t, k+1, dimension);
+ THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim);
}
void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted)
@@ -2686,7 +2723,7 @@ TENSOR_IMPLEMENT_LOGICAL(ne,!=)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs)
#endif /* long only part */
-#if defined(TH_REAL_IS_INT)
+#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT)
LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
#endif /* int only part */
@@ -2709,54 +2746,62 @@ TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0)
/* floating point only now */
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-LAB_IMPLEMENT_BASIC_FUNCTION(log,log)
-LAB_IMPLEMENT_BASIC_FUNCTION(log1p,log1p)
-LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_sigmoid)
-LAB_IMPLEMENT_BASIC_FUNCTION(exp,exp)
-LAB_IMPLEMENT_BASIC_FUNCTION(cos,cos)
-LAB_IMPLEMENT_BASIC_FUNCTION(acos,acos)
-LAB_IMPLEMENT_BASIC_FUNCTION(cosh,cosh)
-LAB_IMPLEMENT_BASIC_FUNCTION(sin,sin)
-LAB_IMPLEMENT_BASIC_FUNCTION(asin,asin)
-LAB_IMPLEMENT_BASIC_FUNCTION(sinh,sinh)
-LAB_IMPLEMENT_BASIC_FUNCTION(tan,tan)
-LAB_IMPLEMENT_BASIC_FUNCTION(atan,atan)
-LAB_IMPLEMENT_BASIC_FUNCTION(tanh,tanh)
-LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,pow)
-LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,sqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_rsqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(ceil,ceil)
-LAB_IMPLEMENT_BASIC_FUNCTION(floor,floor)
-LAB_IMPLEMENT_BASIC_FUNCTION(round,round)
-LAB_IMPLEMENT_BASIC_FUNCTION(abs,fabs)
-LAB_IMPLEMENT_BASIC_FUNCTION(trunc,trunc)
-LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_frac)
+#if defined (TH_REAL_IS_FLOAT)
+#define TH_MATH_NAME(fn) fn##f
+#else
+#define TH_MATH_NAME(fn) fn
+#endif
+
+LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log))
+LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma))
+LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p))
+LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid))
+LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp))
+LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos))
+LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos))
+LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh))
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin))
+LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin))
+LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh))
+LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan))
+LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan))
+LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh))
+LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,TH_MATH_NAME(pow))
+LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil))
+LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor))
+LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round))
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))
+LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc))
+LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac))
LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
-LAB_IMPLEMENT_BASIC_FUNCTION(cinv, 1.0 / )
+LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / )
+
void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty)
{
THTensor_(resizeAs)(r_, tx);
- TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = atan2(*tx_data,*ty_data););
+ TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data););
}
void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
{
THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match");
THTensor_(resizeAs)(r_, a);
- TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_lerp(*a_data, *b_data, weight););
+ TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight););
}
-void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim)
{
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
dimension + TH_INDEX_BASE);
- THTensor_(sum)(r_, t, dimension);
+ THTensor_(sum)(r_, t, dimension, keepdim);
THTensor_(div)(r_, r_, t->size[dimension]);
}
-void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
+void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
{
THLongStorage *dim;
@@ -2785,7 +2830,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
sum2 /= t_size;
sum2 -= sum*sum;
sum2 = (sum2 < 0 ? 0 : sum2);
- *r__data = (real)sqrt(sum2);
+ *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
}
else
{
@@ -2793,11 +2838,15 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
sum2 /= t_size-1;
sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum;
sum2 = (sum2 < 0 ? 0 : sum2);
- *r__data = (real)sqrt(sum2);
+ *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
});
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(r_, r_, dimension);
+ }
}
-void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag)
+void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
{
THLongStorage *dim;
@@ -2836,9 +2885,13 @@ void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag)
sum2 = (sum2 < 0 ? 0 : sum2);
*r__data = (real)sum2;
});
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(r_, r_, dimension);
+ }
}
-void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension)
+void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim)
{
THLongStorage *dim;
@@ -2861,9 +2914,15 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension)
TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
accreal sum = 0;
long i;
- for(i = 0; i < t_size; i++)
- sum += pow(fabs(t_data[i*t_stride]), value);
- *r__data = pow(sum, 1.0/value);)
+ for(i = 0; i < t_size; i++) {
+ sum += TH_MATH_NAME(pow)(
+ TH_MATH_NAME(fabs)(t_data[i*t_stride]), value);
+ }
+ *r__data = TH_MATH_NAME(pow)(sum, 1.0/value);)
+ }
+
+ if (!keepdim) {
+ THTensor_(squeeze1d)(r_, r_, dimension);
}
}
@@ -2874,14 +2933,14 @@ accreal THTensor_(normall)(THTensor *tensor, real value)
TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;);
return sum;
} else if(value == 1) {
- TH_TENSOR_APPLY(real, tensor, sum += fabs(*tensor_data););
+ TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data););
return sum;
} else if(value == 2) {
TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;);
return sqrt(sum);
} else {
- TH_TENSOR_APPLY(real, tensor, sum += pow(fabs(*tensor_data), value););
- return pow(sum, 1.0/value);
+ TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value););
+ return TH_MATH_NAME(pow)(sum, 1.0/value);
}
}
@@ -2913,7 +2972,7 @@ void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension,
} else if (value == 2) {
TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;);
} else {
- TH_TENSOR_APPLY(real, rowS, norm += pow(fabs(*rowS_data), value););
+ TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value););
}
norm = pow(norm, 1/value);
@@ -2939,8 +2998,9 @@ accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
{
real sum = 0;
TH_TENSOR_APPLY2(real, tensor, real, src,
- sum += pow(fabs(*tensor_data - *src_data), value);)
- return pow(sum, 1.0/value);
+ sum += TH_MATH_NAME(pow)(
+ TH_MATH_NAME(fabs)(*tensor_data - *src_data), value););
+ return TH_MATH_NAME(pow)(sum, 1.0/value);
}
accreal THTensor_(meanall)(THTensor *tensor)
@@ -2998,12 +3058,12 @@ void THTensor_(logspace)(THTensor *r_, real a, real b, long n)
if(n == 1) {
TH_TENSOR_APPLY(real, r_,
- *r__data = pow(10.0, a);
+ *r__data = TH_MATH_NAME(pow)(10.0, a);
i++;
);
} else {
TH_TENSOR_APPLY(real, r_,
- *r__data = pow(10.0, a + i*(b-a)/((real)(n-1)));
+ *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1)));
i++;
);
}
@@ -3091,6 +3151,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minval
);
}
+#undef TH_MATH_NAME
#endif /* floating point only part */
#undef IS_NONZERO
#endif
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index ff994ed..a3cf410 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -69,13 +69,13 @@ TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha,
TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain);
TH_API ptrdiff_t THTensor_(numel)(THTensor *t);
-TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension);
-TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension);
-TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim);
+TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim);
TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension);
TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension);
TH_API void THTensor_(sign)(THTensor *r_, THTensor *t);
@@ -132,7 +132,7 @@ TH_API void THTensor_(geTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
TH_API void THTensor_(neTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
-#if defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
TH_API void THTensor_(abs)(THTensor *r_, THTensor *t);
#endif
@@ -140,6 +140,7 @@ TH_API void THTensor_(abs)(THTensor *r_, THTensor *t);
TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t);
TH_API void THTensor_(log)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(lgamma)(THTensor *r_, THTensor *t);
TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t);
TH_API void THTensor_(exp)(THTensor *r_, THTensor *t);
TH_API void THTensor_(cos)(THTensor *r_, THTensor *t);
@@ -164,10 +165,10 @@ TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t);
TH_API void THTensor_(frac)(THTensor *r_, THTensor *t);
TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight);
-TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension);
-TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag);
-TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag);
-TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension);
+TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
+TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
+TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim);
TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
index 19d41b1..83c4c56 100644
--- a/lib/TH/generic/simd/simd.h
+++ b/lib/TH/generic/simd/simd.h
@@ -78,7 +78,13 @@ static inline uint32_t detectHostSIMDExtensions()
static inline uint32_t detectHostSIMDExtensions()
{
- return SIMDExtension_VSX;
+ uint32_t hostSimdExts = SIMDExtension_DEFAULT;
+ char *evar;
+
+ evar = getenv("TH_NO_VSX");
+ if (evar == NULL || strncmp(evar, "1", 2) != 0)
+ hostSimdExts = SIMDExtension_VSX;
+ return hostSimdExts;
}
#else //PPC64 without VSX
@@ -89,7 +95,7 @@ static inline uint32_t detectHostSIMDExtensions()
}
#endif
-
+
#else // x86
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
@@ -146,7 +152,7 @@ static inline uint32_t detectHostSIMDExtensions()
evar = getenv("TH_NO_SSE");
if (evar == NULL || strncmp(evar, "1", 2) != 0)
- TH_NO_SSE = 0;
+ TH_NO_SSE = 0;
if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) {
hostSimdExts |= SIMDExtension_SSE;
}
diff --git a/lib/TH/vector/VSX.c b/lib/TH/vector/VSX.c
index 14f14a7..ce5bb38 100644
--- a/lib/TH/vector/VSX.c
+++ b/lib/TH/vector/VSX.c
@@ -1,14 +1,10 @@
#ifdef __PPC64__
-
#include <altivec.h>
#include <stddef.h>
//--------------------------------------------------------------------------------------------------
-// THDoubleVector_fill_VSX was tested on Power8:
-//
-// Unrolling 128 elements is 20% faster than unrolling 64 elements.
-// Unrolling 64 elements is faster than unrolling any lesser number of elements.
+// THDoubleVector_fill_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n)
{
@@ -103,28 +99,36 @@ static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n
//--------------------------------------------------------------------------------------------------
-// THDoubleVector_add_VSX was tested on Power8:
-//
-// Max speedup achieved when unrolling 24 elements.
-// When unrolling 32 elements, the performance was the same as for 24.
-// When unrolling 16 elements, performance was not as good as for 24.
-// Unrolling 24 elements was 43% faster than unrolling 4 elements (2.8 sec vs 4.0 sec).
-// Unrolling 24 elements was about 8% faster than unrolling 16 elements (2.8 sec vs 3.0 sec).
+// THDoubleVector_cadd_VSX:
//--------------------------------------------------------------------------------------------------
-static void THDoubleVector_add_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
+static void THDoubleVector_cadd_VSX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
- vector double c_fp64vec2;
+
+ double val[2] = {c, c};
+ vector double c_fp64vec2 = vec_xl(0, val);
+
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
- double val[2] = {c, c};
- c_fp64vec2 = vec_xl(0, val);
for (i = 0; i <= n-24; i += 24)
{
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
@@ -138,31 +142,110 @@ static void THDoubleVector_add_VSX(double *y, const double *x, const double c, c
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
+ y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2);
+ y4_fp64vec2 = vec_madd(y4_fp64vec2, c_fp64vec2, x4_fp64vec2);
+ y5_fp64vec2 = vec_madd(y5_fp64vec2, c_fp64vec2, x5_fp64vec2);
+ y6_fp64vec2 = vec_madd(y6_fp64vec2, c_fp64vec2, x6_fp64vec2);
+ y7_fp64vec2 = vec_madd(y7_fp64vec2, c_fp64vec2, x7_fp64vec2);
+ y8_fp64vec2 = vec_madd(y8_fp64vec2, c_fp64vec2, x8_fp64vec2);
+ y9_fp64vec2 = vec_madd(y9_fp64vec2, c_fp64vec2, x9_fp64vec2);
+ y10_fp64vec2 = vec_madd(y10_fp64vec2, c_fp64vec2,x10_fp64vec2);
+ y11_fp64vec2 = vec_madd(y11_fp64vec2, c_fp64vec2,x11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, z+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, z+(i+10));
+ vec_xst(y6_fp64vec2, 0, z+(i+12));
+ vec_xst(y7_fp64vec2, 0, z+(i+14));
+ vec_xst(y8_fp64vec2, 0, z+(i+16));
+ vec_xst(y9_fp64vec2, 0, z+(i+18));
+ vec_xst(y10_fp64vec2, 0, z+(i+20));
+ vec_xst(y11_fp64vec2, 0, z+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
- y4_fp64vec2 = vec_xl(0, y+(i+8 ));
- y5_fp64vec2 = vec_xl(0, y+(i+10));
- y6_fp64vec2 = vec_xl(0, y+(i+12));
- y7_fp64vec2 = vec_xl(0, y+(i+14));
- y8_fp64vec2 = vec_xl(0, y+(i+16));
- y9_fp64vec2 = vec_xl(0, y+(i+18));
- y10_fp64vec2 = vec_xl(0, y+(i+20));
- y11_fp64vec2 = vec_xl(0, y+(i+22));
- y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
- y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
- y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
- y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
- y4_fp64vec2 = vec_madd(c_fp64vec2, x4_fp64vec2, y4_fp64vec2 );
- y5_fp64vec2 = vec_madd(c_fp64vec2, x5_fp64vec2, y5_fp64vec2 );
- y6_fp64vec2 = vec_madd(c_fp64vec2, x6_fp64vec2, y6_fp64vec2 );
- y7_fp64vec2 = vec_madd(c_fp64vec2, x7_fp64vec2, y7_fp64vec2 );
- y8_fp64vec2 = vec_madd(c_fp64vec2, x8_fp64vec2, y8_fp64vec2 );
- y9_fp64vec2 = vec_madd(c_fp64vec2, x9_fp64vec2, y9_fp64vec2 );
- y10_fp64vec2 = vec_madd(c_fp64vec2, x10_fp64vec2, y10_fp64vec2);
- y11_fp64vec2 = vec_madd(c_fp64vec2, x11_fp64vec2, y11_fp64vec2);
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] + c* y[i];
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_adds_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_adds_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ double val[2] = {c, c};
+ vector double c_fp64vec2 = vec_xl(0, val);
+
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+
+ for (i = 0; i <= n-24; i += 24)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_add(x4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_add(x5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_add(x6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_add(x7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_add(x8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_add(x9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_add(x10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_add(x11_fp64vec2, c_fp64vec2);
+
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -184,15 +267,10 @@ static void THDoubleVector_add_VSX(double *y, const double *x, const double c, c
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
-
- y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
- y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
- y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
- y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
+ y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -202,157 +280,159 @@ static void THDoubleVector_add_VSX(double *y, const double *x, const double c, c
for (; i <= n-2; i += 2)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = (c * x[i]) + y[i];
+ y[i] = x[i] +c;
}
-static void THDoubleVector_diff_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_cmul_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_cmul_VSX(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
ptrdiff_t i;
- vector double xz0_fp64vec2, xz1_fp64vec2, xz2_fp64vec2, xz3_fp64vec2, xz4_fp64vec2, xz5_fp64vec2, xz6_fp64vec2, xz7_fp64vec2;
- vector double xz8_fp64vec2, xz9_fp64vec2, xz10_fp64vec2, xz11_fp64vec2;
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
for (i = 0; i <= n-24; i += 24)
{
- xz0_fp64vec2 = vec_xl(0, x+(i ));
- xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
- xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
- xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
- xz4_fp64vec2 = vec_xl(0, x+(i+8 ));
- xz5_fp64vec2 = vec_xl(0, x+(i+10));
- xz6_fp64vec2 = vec_xl(0, x+(i+12));
- xz7_fp64vec2 = vec_xl(0, x+(i+14));
- xz8_fp64vec2 = vec_xl(0, x+(i+16));
- xz9_fp64vec2 = vec_xl(0, x+(i+18));
- xz10_fp64vec2 = vec_xl(0, x+(i+20));
- xz11_fp64vec2 = vec_xl(0, x+(i+22));
-
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
- y4_fp64vec2 = vec_xl(0, y+(i+8 ));
- y5_fp64vec2 = vec_xl(0, y+(i+10));
- y6_fp64vec2 = vec_xl(0, y+(i+12));
- y7_fp64vec2 = vec_xl(0, y+(i+14));
- y8_fp64vec2 = vec_xl(0, y+(i+16));
- y9_fp64vec2 = vec_xl(0, y+(i+18));
- y10_fp64vec2 = vec_xl(0, y+(i+20));
- y11_fp64vec2 = vec_xl(0, y+(i+22));
-
- xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
- xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
- xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
- xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
- xz4_fp64vec2 = vec_sub(xz4_fp64vec2, y4_fp64vec2 );
- xz5_fp64vec2 = vec_sub(xz5_fp64vec2, y5_fp64vec2 );
- xz6_fp64vec2 = vec_sub(xz6_fp64vec2, y6_fp64vec2 );
- xz7_fp64vec2 = vec_sub(xz7_fp64vec2, y7_fp64vec2 );
- xz8_fp64vec2 = vec_sub(xz8_fp64vec2, y8_fp64vec2 );
- xz9_fp64vec2 = vec_sub(xz9_fp64vec2, y9_fp64vec2 );
- xz10_fp64vec2 = vec_sub(xz10_fp64vec2, y10_fp64vec2);
- xz11_fp64vec2 = vec_sub(xz11_fp64vec2, y11_fp64vec2);
-
- vec_xst(xz0_fp64vec2, 0, z+(i ));
- vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
- vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
- vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
- vec_xst(xz4_fp64vec2, 0, z+(i+8 ));
- vec_xst(xz5_fp64vec2, 0, z+(i+10));
- vec_xst(xz6_fp64vec2, 0, z+(i+12));
- vec_xst(xz7_fp64vec2, 0, z+(i+14));
- vec_xst(xz8_fp64vec2, 0, z+(i+16));
- vec_xst(xz9_fp64vec2, 0, z+(i+18));
- vec_xst(xz10_fp64vec2, 0, z+(i+20));
- vec_xst(xz11_fp64vec2, 0, z+(i+22));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+ y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
+ y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
+ y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
+ y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
+ y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
+ y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
+ y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
+ y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, z+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, z+(i+10));
+ vec_xst(y6_fp64vec2, 0, z+(i+12));
+ vec_xst(y7_fp64vec2, 0, z+(i+14));
+ vec_xst(y8_fp64vec2, 0, z+(i+16));
+ vec_xst(y9_fp64vec2, 0, z+(i+18));
+ vec_xst(y10_fp64vec2, 0, z+(i+20));
+ vec_xst(y11_fp64vec2, 0, z+(i+22));
}
for (; i <= n-8; i += 8)
{
- xz0_fp64vec2 = vec_xl(0, x+(i ));
- xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
- xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
- xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
-
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
-
- xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
- xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
- xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
- xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
-
- vec_xst(xz0_fp64vec2, 0, z+(i ));
- vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
- vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
- vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
}
for (; i <= n-2; i += 2)
{
- xz0_fp64vec2 = vec_xl(0, x+(i ));
- y0_fp64vec2 = vec_xl(0, y+(i ));
- xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
- vec_xst(xz0_fp64vec2, 0, z+(i ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, z+(i ));
}
for (; i < n; i++)
- z[i] = x[i] - y[i];
+ z[i] = x[i] * y[i];
}
-static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t n)
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_muls_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
- vector double c_fp64vec2;
double val[2] = {c, c};
- c_fp64vec2 = vec_xl(0, val);
+ vector double c_fp64vec2 = vec_xl(0, val);
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
- vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2, y12_fp64vec2, y13_fp64vec2, y14_fp64vec2, y15_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
- for (i = 0; i <= n-32; i += 32)
+ for (i = 0; i <= n-24; i += 24)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
- y4_fp64vec2 = vec_xl(0, y+(i+8 ));
- y5_fp64vec2 = vec_xl(0, y+(i+10));
- y6_fp64vec2 = vec_xl(0, y+(i+12));
- y7_fp64vec2 = vec_xl(0, y+(i+14));
- y8_fp64vec2 = vec_xl(0, y+(i+16));
- y9_fp64vec2 = vec_xl(0, y+(i+18));
- y10_fp64vec2 = vec_xl(0, y+(i+20));
- y11_fp64vec2 = vec_xl(0, y+(i+22));
- y12_fp64vec2 = vec_xl(0, y+(i+24));
- y13_fp64vec2 = vec_xl(0, y+(i+26));
- y14_fp64vec2 = vec_xl(0, y+(i+28));
- y15_fp64vec2 = vec_xl(0, y+(i+30));
-
- y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
- y4_fp64vec2 = vec_mul(y4_fp64vec2, c_fp64vec2);
- y5_fp64vec2 = vec_mul(y5_fp64vec2, c_fp64vec2);
- y6_fp64vec2 = vec_mul(y6_fp64vec2, c_fp64vec2);
- y7_fp64vec2 = vec_mul(y7_fp64vec2, c_fp64vec2);
- y8_fp64vec2 = vec_mul(y8_fp64vec2, c_fp64vec2);
- y9_fp64vec2 = vec_mul(y9_fp64vec2, c_fp64vec2);
- y10_fp64vec2 = vec_mul(y10_fp64vec2, c_fp64vec2);
- y11_fp64vec2 = vec_mul(y11_fp64vec2, c_fp64vec2);
- y12_fp64vec2 = vec_mul(y12_fp64vec2, c_fp64vec2);
- y13_fp64vec2 = vec_mul(y13_fp64vec2, c_fp64vec2);
- y14_fp64vec2 = vec_mul(y14_fp64vec2, c_fp64vec2);
- y15_fp64vec2 = vec_mul(y15_fp64vec2, c_fp64vec2);
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_mul(x4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_mul(x5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_mul(x6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_mul(x7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_mul(x8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_mul(x9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_mul(x10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_mul(x11_fp64vec2, c_fp64vec2);
+
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -366,22 +446,18 @@ static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t
vec_xst(y9_fp64vec2, 0, y+(i+18));
vec_xst(y10_fp64vec2, 0, y+(i+20));
vec_xst(y11_fp64vec2, 0, y+(i+22));
- vec_xst(y12_fp64vec2, 0, y+(i+24));
- vec_xst(y13_fp64vec2, 0, y+(i+26));
- vec_xst(y14_fp64vec2, 0, y+(i+28));
- vec_xst(y15_fp64vec2, 0, y+(i+30));
}
for (; i <= n-8; i += 8)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
+ y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -390,16 +466,19 @@ static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t
}
for (; i <= n-2; i += 2)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
- vec_xst(y0_fp64vec2, 0, y+(i ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * c;
+ y[i] = c * x[i];
}
-static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n)
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_cdiv_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_cdiv_VSX(double *z, const double *x, const double *y, const ptrdiff_t n)
{
ptrdiff_t i;
@@ -437,31 +516,31 @@ static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
- y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
- y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
- y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
- y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
- y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
- y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
- y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
- y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
-
- vec_xst(y0_fp64vec2, 0, y+(i ));
- vec_xst(y1_fp64vec2, 0, y+(i+2 ));
- vec_xst(y2_fp64vec2, 0, y+(i+4 ));
- vec_xst(y3_fp64vec2, 0, y+(i+6 ));
- vec_xst(y4_fp64vec2, 0, y+(i+8 ));
- vec_xst(y5_fp64vec2, 0, y+(i+10));
- vec_xst(y6_fp64vec2, 0, y+(i+12));
- vec_xst(y7_fp64vec2, 0, y+(i+14));
- vec_xst(y8_fp64vec2, 0, y+(i+16));
- vec_xst(y9_fp64vec2, 0, y+(i+18));
- vec_xst(y10_fp64vec2, 0, y+(i+20));
- vec_xst(y11_fp64vec2, 0, y+(i+22));
+ y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2);
+ y4_fp64vec2 = vec_div(x4_fp64vec2, y4_fp64vec2);
+ y5_fp64vec2 = vec_div(x5_fp64vec2, y5_fp64vec2);
+ y6_fp64vec2 = vec_div(x6_fp64vec2, y6_fp64vec2);
+ y7_fp64vec2 = vec_div(x7_fp64vec2, y7_fp64vec2);
+ y8_fp64vec2 = vec_div(x8_fp64vec2, y8_fp64vec2);
+ y9_fp64vec2 = vec_div(x9_fp64vec2, y9_fp64vec2);
+ y10_fp64vec2 = vec_div(x10_fp64vec2, y10_fp64vec2);
+ y11_fp64vec2 = vec_div(x11_fp64vec2, y11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, z+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, z+(i+10));
+ vec_xst(y6_fp64vec2, 0, z+(i+12));
+ vec_xst(y7_fp64vec2, 0, z+(i+14));
+ vec_xst(y8_fp64vec2, 0, z+(i+16));
+ vec_xst(y9_fp64vec2, 0, z+(i+18));
+ vec_xst(y10_fp64vec2, 0, z+(i+20));
+ vec_xst(y11_fp64vec2, 0, z+(i+22));
}
for (; i <= n-8; i += 8)
{
@@ -475,44 +554,133 @@ static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+ y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2);
- vec_xst(y0_fp64vec2, 0, y+(i ));
- vec_xst(y1_fp64vec2, 0, y+(i+2 ));
- vec_xst(y2_fp64vec2, 0, y+(i+4 ));
- vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y0_fp64vec2, 0, z+(i ));
+ vec_xst(y1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, z+(i+6 ));
}
for (; i <= n-2; i += 2)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
x0_fp64vec2 = vec_xl(0, x+(i ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
- vec_xst(y0_fp64vec2, 0, y+(i ));
+ y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, z+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * x[i];
+ z[i] = x[i] / y[i];
}
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_divs_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_divs_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ double val[2] = {c, c};
+ vector double c_fp64vec2 = vec_xl(0, val);
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+ for (i = 0; i <= n-24; i += 24)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
-static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
-{
- ptrdiff_t i;
-
- float val[4] = {c, c, c, c};
- vector float fp32vec4 = vec_xl(0, val);
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2);
+
- for (i = 0; i <= n-256; i += 256)
- {
- vec_xst(fp32vec4, 0, x+(i ));
- vec_xst(fp32vec4, 0, x+(i+4 ));
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = x[i] / c;
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_fill_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ float val[4] = {c, c, c, c};
+ vector float fp32vec4 = vec_xl(0, val);
+
+ for (i = 0; i <= n-256; i += 256)
+ {
+ vec_xst(fp32vec4, 0, x+(i ));
+ vec_xst(fp32vec4, 0, x+(i+4 ));
vec_xst(fp32vec4, 0, x+(i+8 ));
vec_xst(fp32vec4, 0, x+(i+12 ));
vec_xst(fp32vec4, 0, x+(i+16 ));
@@ -594,25 +762,42 @@ static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
}
-static void THFloatVector_add_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_cadd_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
- vector float c_fp32vec4;
+
+ float val[4] = {c, c, c, c};
+ vector float c_fp32vec4 = vec_xl(0, val);
+
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
- float val[4] = {c, c, c, c};
- c_fp32vec4 = vec_xl(0, val);
for (i = 0; i <= n-48; i += 48)
{
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16 ));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
- x3_fp32vec4 = vec_xl(0, x+(i+12));
- x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
+ x4_fp32vec4 = vec_xl(0, x+(i+16 ));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
@@ -621,31 +806,108 @@ static void THFloatVector_add_VSX(float *y, const float *x, const float c, const
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
+ y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4);
+ y4_fp32vec4 = vec_madd(y4_fp32vec4, c_fp32vec4, x4_fp32vec4);
+ y5_fp32vec4 = vec_madd(y5_fp32vec4, c_fp32vec4, x5_fp32vec4);
+ y6_fp32vec4 = vec_madd(y6_fp32vec4, c_fp32vec4, x6_fp32vec4);
+ y7_fp32vec4 = vec_madd(y7_fp32vec4, c_fp32vec4, x7_fp32vec4);
+ y8_fp32vec4 = vec_madd(y8_fp32vec4, c_fp32vec4, x8_fp32vec4);
+ y9_fp32vec4 = vec_madd(y9_fp32vec4, c_fp32vec4, x9_fp32vec4);
+ y10_fp32vec4 = vec_madd(y10_fp32vec4, c_fp32vec4, x10_fp32vec4);
+ y11_fp32vec4 = vec_madd(y11_fp32vec4, c_fp32vec4, x11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
+ vec_xst(y4_fp32vec4, 0, z+(i+16 ));
+ vec_xst(y5_fp32vec4, 0, z+(i+20));
+ vec_xst(y6_fp32vec4, 0, z+(i+24));
+ vec_xst(y7_fp32vec4, 0, z+(i+28));
+ vec_xst(y8_fp32vec4, 0, z+(i+32));
+ vec_xst(y9_fp32vec4, 0, z+(i+36));
+ vec_xst(y10_fp32vec4, 0, z+(i+40));
+ vec_xst(y11_fp32vec4, 0, z+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
- y4_fp32vec4 = vec_xl(0, y+(i+16));
- y5_fp32vec4 = vec_xl(0, y+(i+20));
- y6_fp32vec4 = vec_xl(0, y+(i+24));
- y7_fp32vec4 = vec_xl(0, y+(i+28));
- y8_fp32vec4 = vec_xl(0, y+(i+32));
- y9_fp32vec4 = vec_xl(0, y+(i+36));
- y10_fp32vec4 = vec_xl(0, y+(i+40));
- y11_fp32vec4 = vec_xl(0, y+(i+44));
+ y3_fp32vec4 = vec_xl(0, y+(i+12 ));
- y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
- y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
- y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
- y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
- y4_fp32vec4 = vec_madd(c_fp32vec4, x4_fp32vec4, y4_fp32vec4 );
- y5_fp32vec4 = vec_madd(c_fp32vec4, x5_fp32vec4, y5_fp32vec4 );
- y6_fp32vec4 = vec_madd(c_fp32vec4, x6_fp32vec4, y6_fp32vec4 );
- y7_fp32vec4 = vec_madd(c_fp32vec4, x7_fp32vec4, y7_fp32vec4 );
- y8_fp32vec4 = vec_madd(c_fp32vec4, x8_fp32vec4, y8_fp32vec4 );
- y9_fp32vec4 = vec_madd(c_fp32vec4, x9_fp32vec4, y9_fp32vec4 );
- y10_fp32vec4 = vec_madd(c_fp32vec4, x10_fp32vec4, y10_fp32vec4);
- y11_fp32vec4 = vec_madd(c_fp32vec4, x11_fp32vec4, y11_fp32vec4);
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
+
+ y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] + c* y[i];
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_adds_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_adds_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ float val[4] = {c, c, c, c};
+ vector float c_fp32vec4 = vec_xl(0, val);
+
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4);
+ y4_fp32vec4 = vec_add(x4_fp32vec4, c_fp32vec4);
+ y5_fp32vec4 = vec_add(x5_fp32vec4, c_fp32vec4);
+ y6_fp32vec4 = vec_add(x6_fp32vec4, c_fp32vec4);
+ y7_fp32vec4 = vec_add(x7_fp32vec4, c_fp32vec4);
+ y8_fp32vec4 = vec_add(x8_fp32vec4, c_fp32vec4);
+ y9_fp32vec4 = vec_add(x9_fp32vec4, c_fp32vec4);
+ y10_fp32vec4 = vec_add(x10_fp32vec4, c_fp32vec4);
+ y11_fp32vec4 = vec_add(x11_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -667,15 +929,10 @@ static void THFloatVector_add_VSX(float *y, const float *x, const float c, const
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
-
- y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
- y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
- y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
- y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
+ y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -685,159 +942,157 @@ static void THFloatVector_add_VSX(float *y, const float *x, const float c, const
for (; i <= n-4; i += 4)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = (c * x[i]) + y[i];
+ y[i] = c + x[i];
}
-
-
-static void THFloatVector_diff_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_cmul_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_cmul_VSX(float *z, const float *y, const float *x, const ptrdiff_t n)
+{
ptrdiff_t i;
- vector float xz0_fp32vec4, xz1_fp32vec4, xz2_fp32vec4, xz3_fp32vec4, xz4_fp32vec4, xz5_fp32vec4, xz6_fp32vec4, xz7_fp32vec4;
- vector float xz8_fp32vec4, xz9_fp32vec4, xz10_fp32vec4, xz11_fp32vec4;
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
for (i = 0; i <= n-48; i += 48)
{
- xz0_fp32vec4 = vec_xl(0, x+(i ));
- xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
- xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
- xz3_fp32vec4 = vec_xl(0, x+(i+12));
- xz4_fp32vec4 = vec_xl(0, x+(i+16));
- xz5_fp32vec4 = vec_xl(0, x+(i+20));
- xz6_fp32vec4 = vec_xl(0, x+(i+24));
- xz7_fp32vec4 = vec_xl(0, x+(i+28));
- xz8_fp32vec4 = vec_xl(0, x+(i+32));
- xz9_fp32vec4 = vec_xl(0, x+(i+36));
- xz10_fp32vec4 = vec_xl(0, x+(i+40));
- xz11_fp32vec4 = vec_xl(0, x+(i+44));
-
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
- y4_fp32vec4 = vec_xl(0, y+(i+16));
- y5_fp32vec4 = vec_xl(0, y+(i+20));
- y6_fp32vec4 = vec_xl(0, y+(i+24));
- y7_fp32vec4 = vec_xl(0, y+(i+28));
- y8_fp32vec4 = vec_xl(0, y+(i+32));
- y9_fp32vec4 = vec_xl(0, y+(i+36));
- y10_fp32vec4 = vec_xl(0, y+(i+40));
- y11_fp32vec4 = vec_xl(0, y+(i+44));
-
- xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
- xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
- xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
- xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
- xz4_fp32vec4 = vec_sub(xz4_fp32vec4, y4_fp32vec4 );
- xz5_fp32vec4 = vec_sub(xz5_fp32vec4, y5_fp32vec4 );
- xz6_fp32vec4 = vec_sub(xz6_fp32vec4, y6_fp32vec4 );
- xz7_fp32vec4 = vec_sub(xz7_fp32vec4, y7_fp32vec4 );
- xz8_fp32vec4 = vec_sub(xz8_fp32vec4, y8_fp32vec4 );
- xz9_fp32vec4 = vec_sub(xz9_fp32vec4, y9_fp32vec4 );
- xz10_fp32vec4 = vec_sub(xz10_fp32vec4, y10_fp32vec4);
- xz11_fp32vec4 = vec_sub(xz11_fp32vec4, y11_fp32vec4);
-
- vec_xst(xz0_fp32vec4, 0, z+(i ));
- vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
- vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
- vec_xst(xz3_fp32vec4, 0, z+(i+12));
- vec_xst(xz4_fp32vec4, 0, z+(i+16));
- vec_xst(xz5_fp32vec4, 0, z+(i+20));
- vec_xst(xz6_fp32vec4, 0, z+(i+24));
- vec_xst(xz7_fp32vec4, 0, z+(i+28));
- vec_xst(xz8_fp32vec4, 0, z+(i+32));
- vec_xst(xz9_fp32vec4, 0, z+(i+36));
- vec_xst(xz10_fp32vec4, 0, z+(i+40));
- vec_xst(xz11_fp32vec4, 0, z+(i+44));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12 ));
+ y4_fp32vec4 = vec_xl(0, y+(i+16 ));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
+ x4_fp32vec4 = vec_xl(0, x+(i+16 ));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+ y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
+ y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
+ y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
+ y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
+ y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
+ y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
+ y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
+ y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
+ vec_xst(y4_fp32vec4, 0, z+(i+16 ));
+ vec_xst(y5_fp32vec4, 0, z+(i+20));
+ vec_xst(y6_fp32vec4, 0, z+(i+24));
+ vec_xst(y7_fp32vec4, 0, z+(i+28));
+ vec_xst(y8_fp32vec4, 0, z+(i+32));
+ vec_xst(y9_fp32vec4, 0, z+(i+36));
+ vec_xst(y10_fp32vec4, 0, z+(i+40));
+ vec_xst(y11_fp32vec4, 0, z+(i+44));
}
for (; i <= n-16; i += 16)
{
- xz0_fp32vec4 = vec_xl(0, x+(i ));
- xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
- xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
- xz3_fp32vec4 = vec_xl(0, x+(i+12));
-
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
-
- xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
- xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
- xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
- xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
-
- vec_xst(xz0_fp32vec4, 0, z+(i ));
- vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
- vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
- vec_xst(xz3_fp32vec4, 0, z+(i+12));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12 ));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
}
for (; i <= n-4; i += 4)
{
- xz0_fp32vec4 = vec_xl(0, x+(i ));
- y0_fp32vec4 = vec_xl(0, y+(i ));
- xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
- vec_xst(xz0_fp32vec4, 0, z+(i ));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, z+(i ));
}
for (; i < n; i++)
- z[i] = x[i] - y[i];
+ z[i] = y[i] * x[i];
}
-static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_muls_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
-
- vector float c_fp32vec4;
float val[4] = {c, c, c, c};
- c_fp32vec4 = vec_xl(0, val);
+ vector float c_fp32vec4 = vec_xl(0, val);
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
- vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4, y12_fp32vec4, y13_fp32vec4, y14_fp32vec4, y15_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
- for (i = 0; i <= n-64; i += 64)
+
+ for (i = 0; i <= n-48; i += 48)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
- y4_fp32vec4 = vec_xl(0, y+(i+16));
- y5_fp32vec4 = vec_xl(0, y+(i+20));
- y6_fp32vec4 = vec_xl(0, y+(i+24));
- y7_fp32vec4 = vec_xl(0, y+(i+28));
- y8_fp32vec4 = vec_xl(0, y+(i+32));
- y9_fp32vec4 = vec_xl(0, y+(i+36));
- y10_fp32vec4 = vec_xl(0, y+(i+40));
- y11_fp32vec4 = vec_xl(0, y+(i+44));
- y12_fp32vec4 = vec_xl(0, y+(i+48));
- y13_fp32vec4 = vec_xl(0, y+(i+52));
- y14_fp32vec4 = vec_xl(0, y+(i+56));
- y15_fp32vec4 = vec_xl(0, y+(i+60));
-
- y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
- y4_fp32vec4 = vec_mul(y4_fp32vec4, c_fp32vec4);
- y5_fp32vec4 = vec_mul(y5_fp32vec4, c_fp32vec4);
- y6_fp32vec4 = vec_mul(y6_fp32vec4, c_fp32vec4);
- y7_fp32vec4 = vec_mul(y7_fp32vec4, c_fp32vec4);
- y8_fp32vec4 = vec_mul(y8_fp32vec4, c_fp32vec4);
- y9_fp32vec4 = vec_mul(y9_fp32vec4, c_fp32vec4);
- y10_fp32vec4 = vec_mul(y10_fp32vec4, c_fp32vec4);
- y11_fp32vec4 = vec_mul(y11_fp32vec4, c_fp32vec4);
- y12_fp32vec4 = vec_mul(y12_fp32vec4, c_fp32vec4);
- y13_fp32vec4 = vec_mul(y13_fp32vec4, c_fp32vec4);
- y14_fp32vec4 = vec_mul(y14_fp32vec4, c_fp32vec4);
- y15_fp32vec4 = vec_mul(y15_fp32vec4, c_fp32vec4);
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4);
+ y4_fp32vec4 = vec_mul(x4_fp32vec4, c_fp32vec4);
+ y5_fp32vec4 = vec_mul(x5_fp32vec4, c_fp32vec4);
+ y6_fp32vec4 = vec_mul(x6_fp32vec4, c_fp32vec4);
+ y7_fp32vec4 = vec_mul(x7_fp32vec4, c_fp32vec4);
+ y8_fp32vec4 = vec_mul(x8_fp32vec4, c_fp32vec4);
+ y9_fp32vec4 = vec_mul(x9_fp32vec4, c_fp32vec4);
+ y10_fp32vec4 = vec_mul(x10_fp32vec4, c_fp32vec4);
+ y11_fp32vec4 = vec_mul(x11_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -851,22 +1106,18 @@ static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
vec_xst(y9_fp32vec4, 0, y+(i+36));
vec_xst(y10_fp32vec4, 0, y+(i+40));
vec_xst(y11_fp32vec4, 0, y+(i+44));
- vec_xst(y12_fp32vec4, 0, y+(i+48));
- vec_xst(y13_fp32vec4, 0, y+(i+52));
- vec_xst(y14_fp32vec4, 0, y+(i+56));
- vec_xst(y15_fp32vec4, 0, y+(i+60));
}
for (; i <= n-16; i += 16)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
+ y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -875,17 +1126,19 @@ static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
}
for (; i <= n-4; i += 4)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * c;
+ y[i] = c * x[i];
}
-
-static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_cdiv_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n)
{
ptrdiff_t i;
@@ -898,8 +1151,8 @@ static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
for (i = 0; i <= n-48; i += 48)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4));
+ y2_fp32vec4 = vec_xl(0, y+(i+8));
y3_fp32vec4 = vec_xl(0, y+(i+12));
y4_fp32vec4 = vec_xl(0, y+(i+16));
y5_fp32vec4 = vec_xl(0, y+(i+20));
@@ -913,8 +1166,8 @@ static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
- x3_fp32vec4 = vec_xl(0, x+(i+12));
- x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
+ x4_fp32vec4 = vec_xl(0, x+(i+16 ));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
@@ -923,123 +1176,174 @@ static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
- y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
- y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
- y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
- y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
- y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
- y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
- y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
- y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
-
- vec_xst(y0_fp32vec4, 0, y+(i ));
- vec_xst(y1_fp32vec4, 0, y+(i+4 ));
- vec_xst(y2_fp32vec4, 0, y+(i+8 ));
- vec_xst(y3_fp32vec4, 0, y+(i+12));
- vec_xst(y4_fp32vec4, 0, y+(i+16));
- vec_xst(y5_fp32vec4, 0, y+(i+20));
- vec_xst(y6_fp32vec4, 0, y+(i+24));
- vec_xst(y7_fp32vec4, 0, y+(i+28));
- vec_xst(y8_fp32vec4, 0, y+(i+32));
- vec_xst(y9_fp32vec4, 0, y+(i+36));
- vec_xst(y10_fp32vec4, 0, y+(i+40));
- vec_xst(y11_fp32vec4, 0, y+(i+44));
+ y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
+ y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4);
+ y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4);
+ y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4);
+ y4_fp32vec4 = vec_div(x4_fp32vec4, y4_fp32vec4);
+ y5_fp32vec4 = vec_div(x5_fp32vec4, y5_fp32vec4);
+ y6_fp32vec4 = vec_div(x6_fp32vec4, y6_fp32vec4);
+ y7_fp32vec4 = vec_div(x7_fp32vec4, y7_fp32vec4);
+ y8_fp32vec4 = vec_div(x8_fp32vec4, y8_fp32vec4);
+ y9_fp32vec4 = vec_div(x9_fp32vec4, y9_fp32vec4);
+ y10_fp32vec4 = vec_div(x10_fp32vec4, y10_fp32vec4);
+ y11_fp32vec4 = vec_div(x11_fp32vec4, y11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
+ vec_xst(y4_fp32vec4, 0, z+(i+16 ));
+ vec_xst(y5_fp32vec4, 0, z+(i+20));
+ vec_xst(y6_fp32vec4, 0, z+(i+24));
+ vec_xst(y7_fp32vec4, 0, z+(i+28));
+ vec_xst(y8_fp32vec4, 0, z+(i+32));
+ vec_xst(y9_fp32vec4, 0, z+(i+36));
+ vec_xst(y10_fp32vec4, 0, z+(i+40));
+ vec_xst(y11_fp32vec4, 0, z+(i+44));
}
for (; i <= n-16; i += 16)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y3_fp32vec4 = vec_xl(0, y+(i+12 ));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
- x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x3_fp32vec4 = vec_xl(0, x+(i+12 ));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+ y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
+ y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4);
+ y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4);
+ y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4);
- vec_xst(y0_fp32vec4, 0, y+(i ));
- vec_xst(y1_fp32vec4, 0, y+(i+4 ));
- vec_xst(y2_fp32vec4, 0, y+(i+8 ));
- vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y0_fp32vec4, 0, z+(i ));
+ vec_xst(y1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, z+(i+12 ));
}
for (; i <= n-4; i += 4)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
x0_fp32vec4 = vec_xl(0, x+(i ));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
- vec_xst(y0_fp32vec4, 0, y+(i ));
+ y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, z+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * x[i];
+ z[i] = x[i] / y[i];
}
+//--------------------------------------------------------------------------------------------------
+// THFloatVector_divs_VSX:
+//--------------------------------------------------------------------------------------------------
+static void THFloatVector_divs_VSX(float *y, const float*x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ float val[4] = {c, c, c, c};
+ vector float c_fp64vec2 = vec_xl(0, val);
+
+ vector float y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector float y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector float x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector float x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+12 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+16 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+20));
+ x6_fp64vec2 = vec_xl(0, x+(i+24));
+ x7_fp64vec2 = vec_xl(0, x+(i+28));
+ x8_fp64vec2 = vec_xl(0, x+(i+32));
+ x9_fp64vec2 = vec_xl(0, x+(i+36));
+ x10_fp64vec2 = vec_xl(0, x+(i+40));
+ x11_fp64vec2 = vec_xl(0, x+(i+44));
+
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2);
+
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+12 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+16 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+20));
+ vec_xst(y6_fp64vec2, 0, y+(i+24));
+ vec_xst(y7_fp64vec2, 0, y+(i+28));
+ vec_xst(y8_fp64vec2, 0, y+(i+32));
+ vec_xst(y9_fp64vec2, 0, y+(i+36));
+ vec_xst(y10_fp64vec2, 0, y+(i+40));
+ vec_xst(y11_fp64vec2, 0, y+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+12 ));
+
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+12 ));
+
+
+
+
+
+ }
+ for (; i <= n-4; i += 4)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = x[i] / c;
+}
//------------------------------------------------
//
// Testing for correctness and performance
//
-// If you want to run these tests, compile this
-// file with -DRUN_VSX_TESTS on a Power machine,
-// and then run the executable that is generated.
-//
-//------------------------------------------------
-//
-// Example passing run (from a Power8 machine):
-//
-// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
-// $ ./vsxtest
-//
-// standardDouble_fill() test took 0.34604 seconds
-// THDoubleVector_fill_VSX() test took 0.15663 seconds
-// All assertions PASSED for THDoubleVector_fill_VSX() test.
-//
-// standardFloat_fill() test took 0.32901 seconds
-// THFloatVector_fill_VSX() test took 0.07830 seconds
-// All assertions PASSED for THFloatVector_fill_VSX() test.
-//
-// standardDouble_add() test took 0.51602 seconds
-// THDoubleVector_add_VSX() test took 0.31384 seconds
-// All assertions PASSED for THDoubleVector_add_VSX() test.
-//
-// standardFloat_add() test took 0.39845 seconds
-// THFloatVector_add_VSX() test took 0.14544 seconds
-// All assertions PASSED for THFloatVector_add_VSX() test.
-//
-// standardDouble_diff() test took 0.48219 seconds
-// THDoubleVector_diff_VSX() test took 0.31708 seconds
-// All assertions PASSED for THDoubleVector_diff_VSX() test.
+// If you want to run these tests, compile this
+// file with -DRUN_VSX_TESTS on a Power machine,
+// and then run the executable that is generated.
//
-// standardFloat_diff() test took 0.60340 seconds
-// THFloatVector_diff_VSX() test took 0.17083 seconds
-// All assertions PASSED for THFloatVector_diff_VSX() test.
+//------------------------------------------------
//
-// standardDouble_scale() test took 0.33157 seconds
-// THDoubleVector_scale_VSX() test took 0.19075 seconds
-// All assertions PASSED for THDoubleVector_scale_VSX() test.
+// Example passing run (from a Power8 machine):
//
-// standardFloat_scale() test took 0.33008 seconds
-// THFloatVector_scale_VSX() test took 0.09741 seconds
-// All assertions PASSED for THFloatVector_scale_VSX() test.
+// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
+// $ ./vsxtest
//
-// standardDouble_mul() test took 0.50986 seconds
-// THDoubleVector_mul_VSX() test took 0.30939 seconds
-// All assertions PASSED for THDoubleVector_mul_VSX() test.
+// TODO
//
-// standardFloat_mul() test took 0.40241 seconds
-// THFloatVector_mul_VSX() test took 0.14346 seconds
-// All assertions PASSED for THFloatVector_mul_VSX() test.
//
// Finished runnning all tests. All tests PASSED.
//
@@ -1055,8 +1359,10 @@ static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
-void test_THDoubleVector_fill_VSX();
+//--------------------------------------------------------------------------------------------------
+// Standard implementations:
+//--------------------------------------------------------------------------------------------------
static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
@@ -1069,52 +1375,76 @@ static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
x[i] = c;
}
-static void standardDouble_add(double *y, const double *x, const double c, const ptrdiff_t n)
+static void standardDouble_cadd(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] + c * y[i];
+}
+
+static void standardFloat_cadd(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] + c * y[i];
+}
+
+static void standardDouble_adds(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] = c + x[i];
+}
+
+static void standardFloat_adds(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] = c + x[i];
+}
+
+static void standardDouble_cmul(double *z, const double *x, const double *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] += c * x[i];
+ z[i] = x[i] * y[i];
}
-static void standardFloat_add(float *y, const float *x, const float c, const ptrdiff_t n)
+static void standardFloat_cmul(float *z, const float *x, const float *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] += c * x[i];
+ z[i] = x[i] * y[i];
}
-static void standardDouble_diff(double *z, const double *x, const double *y, const ptrdiff_t n)
+static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- z[i] = x[i] - y[i];
+ y[i] = c * x[i];
}
-static void standardFloat_diff(float *z, const float *x, const float *y, const ptrdiff_t n)
+static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- z[i] = x[i] - y[i];
+ y[i] = c * x[i];
}
-static void standardDouble_scale(double *y, const double c, const ptrdiff_t n)
+static void standardDouble_cdiv(double *z, const double *x, const double *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= c;
+ z[i] = x[i] / y[i];
}
-static void standardFloat_scale(float *y, const float c, const ptrdiff_t n)
+static void standardFloat_cdiv(float *z, const float *x, const float *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= c;
+ z[i] = x[i] / y[i];
}
-static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n)
+static void standardDouble_divs(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= x[i];
+ y[i] = x[i] / c;
}
-static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n)
+static void standardFloat_divs(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= x[i];
+ y[i] = x[i] / c;
}
double randDouble()
@@ -1138,6 +1468,10 @@ int near(double a, double b)
return 1;
}
+
+//--------------------------------------------------------------------------------------------------
+// Standard tests:
+//--------------------------------------------------------------------------------------------------
void test_THDoubleVector_fill_VSX()
{
clock_t start, end;
@@ -1279,71 +1613,208 @@ void test_THFloatVector_fill_VSX()
free(x_optimized);
}
-void test_THDoubleVector_add_VSX()
+
+void test_THDoubleVector_cadd_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
- double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
- double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
- double c = (double)randDouble();
+ double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
- double yVal = randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
+ y[i] = randDouble();
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_cadd() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_cadd_VSX() test.\n\n");
+
+  free(y);
+ free(z_standard);
+ free(z_optimized);
+ free(x);
+}
+
+void test_THFloatVector_cadd_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ y[i] = (float)randDouble();
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_cadd() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
}
+ printf("All assertions PASSED for THFloatVector_cadd_VSX() test.\n\n");
+  free(y);
+ free(z_standard);
+ free(z_optimized);
+ free(x);
+}
+
+void test_THDoubleVector_adds_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ x[i] = randDouble();
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
- standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
- standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardDouble_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardDouble_adds() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
- THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THDoubleVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THDoubleVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardDouble_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THDoubleVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardDouble_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THDoubleVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardDouble_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THDoubleVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardDouble_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THDoubleVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardDouble_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardDouble_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THDoubleVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardDouble_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THDoubleVector_add_VSX() test.\n\n");
+ printf("All assertions PASSED for THDoubleVector_adds_VSX() test.\n\n");
free(y_standard);
@@ -1352,7 +1823,7 @@ void test_THDoubleVector_add_VSX()
}
-void test_THFloatVector_add_VSX()
+void test_THFloatVector_adds_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
@@ -1364,59 +1835,54 @@ void test_THFloatVector_add_VSX()
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
- {
x[i] = (float)randDouble();
- float yVal = (float)randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
- }
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
- standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
- standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardFloat_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardFloat_adds() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
- THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THFloatVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THFloatVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardFloat_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THFloatVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardFloat_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THFloatVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardFloat_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THFloatVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardFloat_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THFloatVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardFloat_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardFloat_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THFloatVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardFloat_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THFloatVector_add_VSX() test.\n\n");
+ printf("All assertions PASSED for THFloatVector_adds_VSX() test.\n\n");
free(y_standard);
@@ -1424,24 +1890,22 @@ void test_THFloatVector_add_VSX()
free(x);
}
-void test_THDoubleVector_diff_VSX()
+
+void test_THDoubleVector_cmul_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
- double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
y[i] = randDouble();
- double zVal = randDouble();
- z_standard[i] = zVal;
- z_optimized[i] = zVal;
}
@@ -1449,74 +1913,69 @@ void test_THDoubleVector_diff_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardDouble_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardDouble_cmul() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THDoubleVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THDoubleVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardDouble_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THDoubleVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardDouble_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THDoubleVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardDouble_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THDoubleVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardDouble_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THDoubleVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardDouble_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardDouble_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THDoubleVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardDouble_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
- printf("All assertions PASSED for THDoubleVector_diff_VSX() test.\n\n");
+ printf("All assertions PASSED for THDoubleVector_cmul_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
- free(y);
free(x);
}
-
-void test_THFloatVector_diff_VSX()
+void test_THFloatVector_cmul_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
- float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
y[i] = (float)randDouble();
- float zVal = (float)randDouble();
- z_standard[i] = zVal;
- z_optimized[i] = zVal;
}
@@ -1524,71 +1983,68 @@ void test_THFloatVector_diff_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardFloat_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardFloat_cmul() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THFloatVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THFloatVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardFloat_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THFloatVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardFloat_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THFloatVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardFloat_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THFloatVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardFloat_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THFloatVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardFloat_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardFloat_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THFloatVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardFloat_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
- printf("All assertions PASSED for THFloatVector_diff_VSX() test.\n\n");
+ printf("All assertions PASSED for THFloatVector_cmul_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
- free(y);
free(x);
}
-
-void test_THDoubleVector_scale_VSX()
+void test_THDoubleVector_muls_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
- double yVal = randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
+ x[i] = randDouble();
}
@@ -1596,69 +2052,69 @@ void test_THDoubleVector_scale_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
- standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardDouble_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
- THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THDoubleVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THDoubleVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardDouble_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THDoubleVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardDouble_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THDoubleVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardDouble_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THDoubleVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardDouble_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THDoubleVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardDouble_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THDoubleVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THDoubleVector_scale_VSX() test.\n\n");
+ printf("All assertions PASSED for THDoubleVector_muls_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
+ free(x);
}
-
-void test_THFloatVector_scale_VSX()
+void test_THFloatVector_muls_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
- float c = (float)randDouble();
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
- float yVal = (float)randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
+ x[i] = (float)randDouble();
}
@@ -1666,54 +2122,197 @@ void test_THFloatVector_scale_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
- standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardFloat_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
- THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THFloatVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THFloatVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardFloat_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THFloatVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardFloat_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THFloatVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardFloat_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THFloatVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardFloat_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THFloatVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardFloat_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THFloatVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardFloat_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THFloatVector_scale_VSX() test.\n\n");
+ printf("All assertions PASSED for THFloatVector_muls_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
+ free(x);
+}
+
+
+
+void test_THDoubleVector_cdiv_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ y[i] = randDouble();
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_cdiv_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(x);
+}
+
+void test_THFloatVector_cdiv_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ y[i] = (float)randDouble();
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_cdiv_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(x);
}
-void test_THDoubleVector_mul_VSX()
+void test_THDoubleVector_divs_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
@@ -1721,14 +2320,12 @@ void test_THDoubleVector_mul_VSX()
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
- double yVal = randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
}
@@ -1736,47 +2333,48 @@ void test_THDoubleVector_mul_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardDouble_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardDouble_divs() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THDoubleVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THDoubleVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardDouble_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THDoubleVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardDouble_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THDoubleVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardDouble_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THDoubleVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardDouble_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THDoubleVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardDouble_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardDouble_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THDoubleVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardDouble_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THDoubleVector_mul_VSX() test.\n\n");
+ printf("All assertions PASSED for THDoubleVector_divs_VSX() test.\n\n");
free(y_standard);
@@ -1784,8 +2382,7 @@ void test_THDoubleVector_mul_VSX()
free(x);
}
-
-void test_THFloatVector_mul_VSX()
+void test_THFloatVector_divs_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
@@ -1793,14 +2390,12 @@ void test_THFloatVector_mul_VSX()
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
- float yVal = (float)randDouble();
- y_standard[i] = yVal;
- y_optimized[i] = yVal;
}
@@ -1808,47 +2403,48 @@ void test_THFloatVector_mul_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
- printf("standardFloat_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+ printf("standardFloat_divs() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
- printf("THFloatVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+ printf("THFloatVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardFloat_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THFloatVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardFloat_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THFloatVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardFloat_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THFloatVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardFloat_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THFloatVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardFloat_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardFloat_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THFloatVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardFloat_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
- printf("All assertions PASSED for THFloatVector_mul_VSX() test.\n\n");
+ printf("All assertions PASSED for THFloatVector_divs_VSX() test.\n\n");
free(y_standard);
@@ -1857,7 +2453,9 @@ void test_THFloatVector_mul_VSX()
}
-
+//--------------------------------------------------------------------------------------------------
+// Run tests:
+//--------------------------------------------------------------------------------------------------
int main()
{
printf("\n");
@@ -1891,17 +2489,24 @@ int main()
test_THDoubleVector_fill_VSX();
test_THFloatVector_fill_VSX();
- test_THDoubleVector_add_VSX();
- test_THFloatVector_add_VSX();
+ test_THDoubleVector_cadd_VSX();
+ test_THFloatVector_cadd_VSX();
+
+ test_THDoubleVector_adds_VSX();
+ test_THFloatVector_adds_VSX();
+
+ test_THDoubleVector_cmul_VSX();
+ test_THFloatVector_cmul_VSX();
+
+ test_THDoubleVector_muls_VSX();
+ test_THFloatVector_muls_VSX();
- test_THDoubleVector_diff_VSX();
- test_THFloatVector_diff_VSX();
+ test_THDoubleVector_cdiv_VSX();
+ test_THFloatVector_cdiv_VSX();
- test_THDoubleVector_scale_VSX();
- test_THFloatVector_scale_VSX();
+ test_THDoubleVector_divs_VSX();
+ test_THFloatVector_divs_VSX();
- test_THDoubleVector_mul_VSX();
- test_THFloatVector_mul_VSX();
printf("Finished runnning all tests. All tests PASSED.\n");
diff --git a/lib/luaT/README.md b/lib/luaT/README.md
index f28d143..3550e4e 100644
--- a/lib/luaT/README.md
+++ b/lib/luaT/README.md
@@ -1,4 +1,4 @@
-<a name="luat.dok"/>
+<a name="luat.dok"></a>
# Lua Torch C API #
luaT provides an API to interface Lua and C in Torch packages. It defines a
@@ -9,19 +9,19 @@ It additionally provides few functions that `luaL` should have defined, and
defines several functions similar to `luaL` ones for better type error printing when using
`luaT` classes.
-<a name="luat.memory.dok"/>
+<a name="luat.memory.dok"></a>
## Memory functions ##
Classical memory allocation functions which generate a Lua error in case of
problem.
-<a name="luaT_alloc"/>
+<a name="luaT_alloc"></a>
### void* luaT_alloc(lua_State *L, long size) ###
Allocates `size` bytes, and return a pointer on the allocated
memory. A Lua error will be generated if running out of memory.
-<a name="luaT_realloc"/>
+<a name="luaT_realloc"></a>
### void* luaT_realloc(lua_State *L, void *ptr, long size) ###
Realloc `ptr` to `size` bytes. `ptr` must have been previously
@@ -29,7 +29,7 @@ allocated with [luaT_alloc](#luaT_alloc) or
[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
functions. A Lua error will be generated if running out of memory.
-<a name="luaT_free"/>
+<a name="luaT_free"></a>
### void luaT_free(lua_State *L, void *ptr) ###
Free memory allocated at address `ptr`. The memory must have been
@@ -37,7 +37,7 @@ previously allocated with [luaT_alloc](#luaT_alloc) or
[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
functions.
-<a name="luat.classcreate"/>
+<a name="luat.classcreate"></a>
## Class creation and basic handling ##
A `luaT` class is basically either a Lua _table_ or _userdata_ with
@@ -48,7 +48,7 @@ another class, then the metatable will itself have a metatable
corresponding to the _parent metatable_: the metatables are cascaded
according to the class inheritance. Multiple inheritance is not supported.
-<a name="luat.operatoroverloading"/>
+<a name="luat.operatoroverloading"></a>
### Operator overloading ###
The metatable of a `luaT` object contains `Lua` operators like
@@ -67,7 +67,7 @@ metaclass, these operators must follow a particular scheme:
Other metaclass operators like `__tostring__`, `__add__`, etc... do not have any particular constraint.
-<a name="luat_newlocalmetatable"/>
+<a name="luat_newlocalmetatable"></a>
### const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx) ###
This function creates a new metatable, which is the Lua way to define a new
@@ -100,13 +100,13 @@ methods in Lua.
The return value is the value returned by [luaT_typenameid](#luat_typenameid).
-<a name="luat_newmetatable"/>
+<a name="luat_newmetatable"></a>
### const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) ###
Same as [luaT_newlocalmetatable](#luat_newmetatable), but where the
constructor table is assigned in the global namespace (`moduleidx = 0`).
-<a name="luat_pushmetatable"/>
+<a name="luat_pushmetatable"></a>
### int luaT_pushmetatable(lua_State *L, const name *tname) ###
Push the metatable with type name `tname` on the stack, if `tname` is a
@@ -115,7 +115,7 @@ valid Torch class name (previously registered with luaT_newmetatable).
On success, returns 1. If `tname` is invalid, nothing is pushed and it
returns 0.
-<a name="luat_typenameid"/>
+<a name="luat_typenameid"></a>
### const char* luaT_typenameid(lua_State *L, const char *tname) ###
If `tname` is a valid Torch class name, then returns a unique string (the
@@ -125,40 +125,40 @@ running. The returned string shall not be freed.
If `tname` is an invalid class name, returns NULL.
-<a name="luat_typename"/>
+<a name="luat_typename"></a>
### const char* luaT_typename(lua_State *L, int ud) ###
Returns the typename of the object at index `ud` on the stack. If it is
not a valid Torch object, returns NULL.
-<a name="luat_pushudata"/>
+<a name="luat_pushudata"></a>
### void luaT_pushudata(lua_State *L, void *udata, const char *tname) ###
Given a C structure `udata`, push a userdata object on the stack with
metatable corresponding to `tname`. Obviously, `tname` must be a valid
Torch name registered with [luaT_newmetatable](#luat_newmetatable).
-<a name="luat_toudata"/>
+<a name="luat_toudata"></a>
### void *luaT_toudata(lua_State *L, int ud, const char *tname) ###
Returns a pointer to the original C structure previously pushed on the
stack with [luaT_pushudata](#luat_pushudata), if the object at index
`ud` is a valid Torch class name. Returns NULL otherwise.
-<a name="luat_isudata"/>
+<a name="luat_isudata"></a>
### int luaT_isudata(lua_State *L, int ud, const char *tname) ###
Returns 1 if the object at index `ud` on the stack is a valid Torch class name `tname`.
Returns 0 otherwise.
-<a name="luat_getfield"/>
+<a name="luat_getfield"></a>
### Checking fields of a table ###
This functions check that the table at the given index `ud` on the Lua
stack has a field named `field`, and that it is of the specified type.
These function raises a Lua error on failure.
-<a name="luat_getfieldcheckudata"/>
+<a name="luat_getfieldcheckudata"></a>
## void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) ##
Checks that the field named `field` of the table at index `ud` is a
@@ -166,99 +166,99 @@ Torch class name `tname`. Returns the pointer of the C structure
previously pushed on the stack with [luaT_pushudata](#luat_pushudata) on
success. The function raises a Lua error on failure.
-<a name="luat_getfieldchecklightudata"/>
+<a name="luat_getfieldchecklightudata"></a>
## void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) ##
Checks that the field named `field` of the table at index `ud` is a
lightuserdata. Returns the lightuserdata pointer on success. The function
raises a Lua error on failure.
-<a name="luat_getfieldcheckint"/>
+<a name="luat_getfieldcheckint"></a>
## int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) ##
Checks that the field named `field` of the table at index `ud` is an
int. Returns the int value pointer on success. The function raises a Lua
error on failure.
-<a name="luat_getfieldcheckstring"/>
+<a name="luat_getfieldcheckstring"></a>
## const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) ##
Checks that the field named `field` of the table at index `ud` is a
string. Returns a pointer to the string on success. The function raises a
Lua error on failure.
-<a name="luat_getfieldcheckboolean"/>
+<a name="luat_getfieldcheckboolean"></a>
## int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) ##
Checks that the field named `field` of the table at index `ud` is a
boolean. On success, returns 1 if the boolean is `true`, 0 if it is
`false`. The function raises a Lua error on failure.
-<a name="luat_getfieldchecktable"/>
+<a name="luat_getfieldchecktable"></a>
## void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) ##
Checks that the field named `field` of the table at index `ud` is a
table. On success, push the table on the stack. The function raises a Lua
error on failure.
-<a name="luat_typerror"/>
+<a name="luat_typerror"></a>
### int luaT_typerror(lua_State *L, int ud, const char *tname) ###
Raises a `luaL_argerror` (and returns its value), claiming that the
object at index `ud` on the stack is not of type `tname`. Note that
this function does not check the type, it only raises an error.
-<a name="luat_checkboolean"/>
+<a name="luat_checkboolean"></a>
### int luaT_checkboolean(lua_State *L, int ud) ###
Checks that the value at index `ud` is a boolean. On success, returns 1
if the boolean is `true`, 0 if it is `false`. The function raises a Lua
error on failure.
-<a name="luat_optboolean"/>
+<a name="luat_optboolean"></a>
### int luaT_optboolean(lua_State *L, int ud, int def) ###
Checks that the value at index `ud` is a boolean. On success, returns 1
if the boolean is `true`, 0 if it is `false`. If there is no value at
index `ud`, returns `def`. In any other cases, raises an error.
-<a name="luat_registeratname"/>
+<a name="luat_registeratname"></a>
### void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) ###
This function assume a table is on the stack. It creates a table field
`name` in the table (if this field does not exist yet), and fill up
`methods` in this table field.
-<a name="luat_classrootname"/>
+<a name="luat_classrootname"></a>
### const char *luaT_classrootname(const char *tname) ###
Assuming `tname` is of the form `A.b.c`, returns 'c'. The returned value
shall not be freed. It is a pointer inside `tname` string.
-<a name="luat_classmodulename"/>
+<a name="luat_classmodulename"></a>
### int luaT_classmodulename(const char *tname, char *parent_name) ###
Alias to `luaT_fullparentname ` for ensuring backwards compatibility;
use of `luaT_fullparentname` is preferred.
-<a name="luat_fullparentname"/>
+<a name="luat_fullparentname"></a>
### int luaT_fullparentname(const char *tname, char *parent_name) ###
Returns a 0-1 valued integer indicating whether `tname` has a parent module.
Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A.b`.
+<a name="luat_outerparentname"></a>
+<a name="luat_classmodulename"></a>
### int luaT_outerparentname(const char *tname, char *parent_name) ###
Returns a 0-1 valued integer indicating whether `tname` has a parent module.
Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A`.
+<a name="luat_innerparentname"></a>
+<a name="luat_classmodulename"></a>
### int luaT_innerparentname(const char *tname, char *parent_name) ###
Returns a 0-1 valued integer indicating whether `tname` has a parent module.
Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `b`.
-<a name="luat_stackdump"/>
+<a name="luat_stackdump"></a>
### void luaT_stackdump(lua_State *L) ###
This function print outs the state of the Lua stack. It is useful for debug
diff --git a/test/test.lua b/test/test.lua
index 6221854..2abf016 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -416,6 +416,19 @@ function torchtest.max() -- torch.max([resval, resind,] x [,dim])
local res1val = torch.max(m1)
mytester:assert(res1val ~= res1val, 'error in torch.max - NaNs')
end
+
+ -- dim == nDim -1
+ local a = torch.Tensor({{1,2},{3,4}}):select(2, 1)
+ local aval, aind = torch.max(a, 1)
+ mytester:assert(aval[1] == 3)
+ mytester:assert(aind[1] == 2)
+
+ local b = torch.Tensor({{{1,2},{3,4}},{{5,6},{7,8}}}):select(3, 1)
+ local bval, bind = torch.max(b, 2)
+ mytester:assert(bval[1][1] == 3)
+ mytester:assert(bind[1][1] == 2)
+ mytester:assert(bval[2][1] == 7)
+ mytester:assert(bind[2][1] == 2)
end
function torchtest.min() -- torch.min([resval, resind,] x [,dim])
@@ -806,14 +819,17 @@ function torchtest.div()
for k,t in ipairs(types) do
- local m1 = torch.randn(10,10):type(t)
+ local m1 = torch.Tensor(10,10):uniform(0,10):type(t)
local res1 = m1:clone()
res1[{ {},3 }]:div(2)
local res2 = m1:clone()
for i = 1,m1:size(1) do
- res2[{ i,3 }] = res2[{ i,3 }] / 2
+ local ok = pcall(function() res2[{ i,3 }] = res2[{ i,3 }] / 2 end)
+ if not ok then
+ res2[{ i,3 }] = torch.floor(res2[{ i,3 }] / 2)
+ end
end
local err = (res1-res2):double():abs():max()
@@ -1273,8 +1289,8 @@ function torchtest.cdiv()
-- [res] torch.cdiv([res,] tensor1, tensor2)
-- contiguous
- local m1 = torch.randn(10, 10, 10):type(t)
- local m2 = torch.randn(10, 10 * 10):type(t)
+ local m1 = torch.Tensor(10, 10, 10):uniform(0,10):type(t)
+ local m2 = torch.Tensor(10, 10 * 10):uniform(0,10):type(t)
m2[m2:eq(0)] = 2
local sm1 = m1[{4, {}, {}}]
local sm2 = m2[{4, {}}]
@@ -1283,7 +1299,10 @@ function torchtest.cdiv()
for i = 1,sm1:size(1) do
for j = 1, sm1:size(2) do
local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] / sm2[idx1d]
+ local ok = pcall(function() res2[i][j] = sm1[i][j] / sm2[idx1d] end)
+ if not ok then
+ res2[i][j] = torch.floor(sm1[i][j] / sm2[idx1d])
+ end
end
end
local err = res1:clone():zero()
@@ -1305,8 +1324,8 @@ function torchtest.cdiv()
mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous' .. ' ' .. t)
-- non-contiguous
- local m1 = torch.randn(10, 10, 10):type(t)
- local m2 = torch.randn(10 * 10, 10 * 10):type(t)
+ local m1 = torch.Tensor(10, 10, 10):uniform(0,10):type(t)
+ local m2 = torch.Tensor(10 * 10, 10 * 10):uniform(0,10):type(t)
m2[m2:eq(0)] = 2
local sm1 = m1[{{}, 4, {}}]
local sm2 = m2[{{}, 4}]
@@ -1315,7 +1334,10 @@ function torchtest.cdiv()
for i = 1,sm1:size(1) do
for j = 1, sm1:size(2) do
local idx1d = (((i-1)*sm1:size(1)))+j
- res2[i][j] = sm1[i][j] / sm2[idx1d]
+ local ok = pcall(function() res2[i][j] = sm1[i][j] / sm2[idx1d] end)
+ if not ok then
+ res2[i][j] = torch.floor(sm1[i][j] / sm2[idx1d])
+ end
end
end
local err = res1:clone():zero()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git
More information about the debian-science-commits
mailing list