add Eigen-3.4

2026-06-08 08:13:55 +00:00 · 2021-11-11 21:06:39 -08:00
parent a84071ee67
commit e3b98615f1
42 changed files with 2060 additions and 1434 deletions
--- a/examples/ThirdPartyLibs/Eigen/Core
+++ b/examples/ThirdPartyLibs/Eigen/Core
@@ -0,0 +1,385 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_H
+#define EIGEN_CORE_H
+
+// first thing Eigen does: stop the compiler from reporting useless warnings.
+#include "src/Core/util/DisableStupidWarnings.h"
+
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the compiler/OS/arch detections and define most defaults.
+#include "src/Core/util/Macros.h"
+
+// This detects SSE/AVX/NEON/etc. and configure alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
+
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+  #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+  #include <hip/hip_runtime.h>
+#endif
+
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
+// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
+// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5)
+  #pragma GCC optimize ("-fno-ipa-cp-clone")
+#endif
+
+// Prevent ICC from specializing std::complex operators that silently fail
+// on device. This allows us to use our own device-compatible specializations
+// instead.
+#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
+    && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
+#endif
+#include <complex>
+
+// this include file manages BLAS and MKL related macros
+// and inclusion of their respective header files
+#include "src/Core/util/MKL_support.h"
+
+
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+  #define EIGEN_HAS_GPU_FP16
+#endif
+
+#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
+  #define EIGEN_HAS_GPU_BF16
+#endif
+
+#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
+  #define EIGEN_HAS_OPENMP
+#endif
+
+#ifdef EIGEN_HAS_OPENMP
+#include <omp.h>
+#endif
+
+// MSVC for windows mobile does not have the errno.h file
+#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
+#define EIGEN_HAS_ERRNO
+#endif
+
+#ifdef EIGEN_HAS_ERRNO
+#include <cerrno>
+#endif
+#include <cstddef>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <functional>
+#include <sstream>
+#ifndef EIGEN_NO_IO
+  #include <iosfwd>
+#endif
+#include <cstring>
+#include <string>
+#include <limits>
+#include <climits> // for CHAR_BIT
+// for min/max:
+#include <algorithm>
+
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
+// for std::is_nothrow_move_assignable
+#ifdef EIGEN_INCLUDE_TYPE_TRAITS
+#include <type_traits>
+#endif
+
+// for outputting debug info
+#ifdef EIGEN_DEBUG_ASSIGN
+#include <iostream>
+#endif
+
+// required for __cpuid, needs to be included after cmath
+// also required for _BitScanReverse on Windows on ARM
+#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
+  #include <intrin.h>
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+  #undef min
+  #undef max
+  #undef isnan
+  #undef isinf
+  #undef isfinite
+  #include <CL/sycl.hpp>
+  #include <map>
+  #include <memory>
+  #include <utility>
+  #include <thread>
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
+  #endif
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
+  #endif
+#endif
+
+
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
+#endif
+
+namespace Eigen {
+
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
+// ensure QNX/QCC support
+using std::size_t;
+// gcc 4.6.0 wants std:: for ptrdiff_t
+using std::ptrdiff_t;
+
+}
+
+/** \defgroup Core_Module Core module
+  * This is the main module of Eigen providing dense matrix and vector support
+  * (both fixed and dynamic size) with all the features corresponding to a BLAS library
+  * and much more...
+  *
+  * \code
+  * #include <Eigen/Core>
+  * \endcode
+  */
+
+#include "src/Core/util/Constants.h"
+#include "src/Core/util/Meta.h"
+#include "src/Core/util/ForwardDeclarations.h"
+#include "src/Core/util/StaticAssert.h"
+#include "src/Core/util/XprHelper.h"
+#include "src/Core/util/Memory.h"
+#include "src/Core/util/IntegralConstant.h"
+#include "src/Core/util/SymbolicIndex.h"
+
+#include "src/Core/NumTraits.h"
+#include "src/Core/MathFunctions.h"
+#include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
+// Generic half float support
+#include "src/Core/arch/Default/Half.h"
+#include "src/Core/arch/Default/BFloat16.h"
+#include "src/Core/arch/Default/TypeCasting.h"
+#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
+
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX512/PacketMath.h"
+  #include "src/Core/arch/AVX512/TypeCasting.h"
+  #include "src/Core/arch/AVX512/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX512/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_SSE
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/SSE/Complex.h"
+#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+  #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/MathFunctions.h"
+  #include "src/Core/arch/AltiVec/Complex.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/TypeCasting.h"
+  #include "src/Core/arch/NEON/MathFunctions.h"
+  #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+  #include "src/Core/arch/SVE/PacketMath.h"
+  #include "src/Core/arch/SVE/TypeCasting.h"
+  #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_ZVECTOR
+  #include "src/Core/arch/ZVector/PacketMath.h"
+  #include "src/Core/arch/ZVector/MathFunctions.h"
+  #include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+  #include "src/Core/arch/MSA/PacketMath.h"
+  #include "src/Core/arch/MSA/MathFunctions.h"
+  #include "src/Core/arch/MSA/Complex.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/Core/arch/GPU/PacketMath.h"
+  #include "src/Core/arch/GPU/MathFunctions.h"
+  #include "src/Core/arch/GPU/TypeCasting.h"
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+  #include "src/Core/arch/SYCL/SyclMemoryModel.h"
+  #include "src/Core/arch/SYCL/InteropHeaders.h"
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+  #include "src/Core/arch/SYCL/PacketMath.h"
+  #include "src/Core/arch/SYCL/MathFunctions.h"
+  #include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
+#endif
+
+#include "src/Core/arch/Default/Settings.h"
+// This file provides generic implementations valid for scalar as well
+#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
+
+#include "src/Core/functors/TernaryFunctors.h"
+#include "src/Core/functors/BinaryFunctors.h"
+#include "src/Core/functors/UnaryFunctors.h"
+#include "src/Core/functors/NullaryFunctors.h"
+#include "src/Core/functors/StlFunctors.h"
+#include "src/Core/functors/AssignmentFunctors.h"
+
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#ifdef EIGEN_CUDACC
+#include "src/Core/arch/CUDA/Complex.h"
+#endif
+
+#include "src/Core/util/IndexedViewHelper.h"
+#include "src/Core/util/ReshapedHelper.h"
+#include "src/Core/ArithmeticSequence.h"
+#ifndef EIGEN_NO_IO
+  #include "src/Core/IO.h"
+#endif
+#include "src/Core/DenseCoeffsBase.h"
+#include "src/Core/DenseBase.h"
+#include "src/Core/MatrixBase.h"
+#include "src/Core/EigenBase.h"
+
+#include "src/Core/Product.h"
+#include "src/Core/CoreEvaluators.h"
+#include "src/Core/AssignEvaluator.h"
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
+                                // at least confirmed with Doxygen 1.5.5 and 1.5.6
+  #include "src/Core/Assign.h"
+#endif
+
+#include "src/Core/ArrayBase.h"
+#include "src/Core/util/BlasUtil.h"
+#include "src/Core/DenseStorage.h"
+#include "src/Core/NestByValue.h"
+
+// #include "src/Core/ForceAlignedAccess.h"
+
+#include "src/Core/ReturnByValue.h"
+#include "src/Core/NoAlias.h"
+#include "src/Core/PlainObjectBase.h"
+#include "src/Core/Matrix.h"
+#include "src/Core/Array.h"
+#include "src/Core/CwiseTernaryOp.h"
+#include "src/Core/CwiseBinaryOp.h"
+#include "src/Core/CwiseUnaryOp.h"
+#include "src/Core/CwiseNullaryOp.h"
+#include "src/Core/CwiseUnaryView.h"
+#include "src/Core/SelfCwiseBinaryOp.h"
+#include "src/Core/Dot.h"
+#include "src/Core/StableNorm.h"
+#include "src/Core/Stride.h"
+#include "src/Core/MapBase.h"
+#include "src/Core/Map.h"
+#include "src/Core/Ref.h"
+#include "src/Core/Block.h"
+#include "src/Core/VectorBlock.h"
+#include "src/Core/IndexedView.h"
+#include "src/Core/Reshaped.h"
+#include "src/Core/Transpose.h"
+#include "src/Core/DiagonalMatrix.h"
+#include "src/Core/Diagonal.h"
+#include "src/Core/DiagonalProduct.h"
+#include "src/Core/Redux.h"
+#include "src/Core/Visitor.h"
+#include "src/Core/Fuzzy.h"
+#include "src/Core/Swap.h"
+#include "src/Core/CommaInitializer.h"
+#include "src/Core/GeneralProduct.h"
+#include "src/Core/Solve.h"
+#include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
+#include "src/Core/TriangularMatrix.h"
+#include "src/Core/SelfAdjointView.h"
+#include "src/Core/products/GeneralBlockPanelKernel.h"
+#include "src/Core/products/Parallelizer.h"
+#include "src/Core/ProductEvaluators.h"
+#include "src/Core/products/GeneralMatrixVector.h"
+#include "src/Core/products/GeneralMatrixMatrix.h"
+#include "src/Core/SolveTriangular.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular.h"
+#include "src/Core/products/SelfadjointMatrixVector.h"
+#include "src/Core/products/SelfadjointMatrixMatrix.h"
+#include "src/Core/products/SelfadjointProduct.h"
+#include "src/Core/products/SelfadjointRank2Update.h"
+#include "src/Core/products/TriangularMatrixVector.h"
+#include "src/Core/products/TriangularMatrixMatrix.h"
+#include "src/Core/products/TriangularSolverMatrix.h"
+#include "src/Core/products/TriangularSolverVector.h"
+#include "src/Core/BandMatrix.h"
+#include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"
+
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+  #include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+#endif
+
+#include "src/Core/BooleanRedux.h"
+#include "src/Core/Select.h"
+#include "src/Core/VectorwiseOp.h"
+#include "src/Core/PartialReduxEvaluator.h"
+#include "src/Core/Random.h"
+#include "src/Core/Replicate.h"
+#include "src/Core/Reverse.h"
+#include "src/Core/ArrayWrapper.h"
+#include "src/Core/StlIterators.h"
+
+#ifdef EIGEN_USE_BLAS
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
+#endif // EIGEN_USE_BLAS
+
+#ifdef EIGEN_USE_MKL_VML
+#include "src/Core/Assign_MKL.h"
+#endif
+
+#include "src/Core/GlobalFunctions.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CORE_H
--- a/examples/ThirdPartyLibs/Eigen/Dense
+++ b/examples/ThirdPartyLibs/Eigen/Dense
@@ -0,0 +1,7 @@
+#include "Core"
+#include "LU"
+#include "Cholesky"
+#include "QR"
+#include "SVD"
+#include "Geometry"
+#include "Eigenvalues"
--- a/examples/ThirdPartyLibs/Eigen/Geometry
+++ b/examples/ThirdPartyLibs/Eigen/Geometry
@@ -0,0 +1,59 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_MODULE_H
+#define EIGEN_GEOMETRY_MODULE_H
+
+#include "Core"
+
+#include "SVD"
+#include "LU"
+#include <limits>
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Geometry_Module Geometry module
+  *
+  * This module provides support for:
+  *  - fixed-size homogeneous transformations
+  *  - translation, scaling, 2D and 3D rotations
+  *  - \link Quaternion quaternions \endlink
+  *  - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3)
+  *  - orthognal vector generation (\ref MatrixBase::unitOrthogonal)
+  *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
+  *  - \link AlignedBox axis aligned bounding boxes \endlink
+  *  - \link umeyama least-square transformation fitting \endlink
+  *
+  * \code
+  * #include <Eigen/Geometry>
+  * \endcode
+  */
+
+#include "src/Geometry/OrthoMethods.h"
+#include "src/Geometry/EulerAngles.h"
+
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"
+
+// Use the SSE optimized version whenever possible.
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_GEOMETRY_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/Householder
+++ b/examples/ThirdPartyLibs/Eigen/Householder
@@ -0,0 +1,29 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HOUSEHOLDER_MODULE_H
+#define EIGEN_HOUSEHOLDER_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Householder_Module Householder module
+  * This module provides Householder transformations.
+  *
+  * \code
+  * #include <Eigen/Householder>
+  * \endcode
+  */
+
+#include "src/Householder/Householder.h"
+#include "src/Householder/HouseholderSequence.h"
+#include "src/Householder/BlockHouseholder.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_HOUSEHOLDER_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/IterativeLinearSolvers
+++ b/examples/ThirdPartyLibs/Eigen/IterativeLinearSolvers
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+
+#include "SparseCore"
+#include "OrderingMethods"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup IterativeLinearSolvers_Module IterativeLinearSolvers module
+  *
+  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
+  * Those solvers are accessible via the following classes:
+  *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-square problems,
+  *  - BiCGSTAB for general square matrices.
+  *
+  * These iterative solvers are associated with some preconditioners:
+  *  - IdentityPreconditioner - not really useful
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
+  *
+  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
+  *
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
+  */
+
+#include "src/IterativeLinearSolvers/SolveWithGuess.h"
+#include "src/IterativeLinearSolvers/IterativeSolverBase.h"
+#include "src/IterativeLinearSolvers/BasicPreconditioners.h"
+#include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
+#include "src/IterativeLinearSolvers/BiCGSTAB.h"
+#include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/Jacobi
+++ b/examples/ThirdPartyLibs/Eigen/Jacobi
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_JACOBI_MODULE_H
+#define EIGEN_JACOBI_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup Jacobi_Module Jacobi module
+  * This module provides Jacobi and Givens rotations.
+  *
+  * \code
+  * #include <Eigen/Jacobi>
+  * \endcode
+  *
+  * In addition to listed classes, it defines the two following MatrixBase methods to apply a Jacobi or Givens rotation:
+  *  - MatrixBase::applyOnTheLeft()
+  *  - MatrixBase::applyOnTheRight().
+  */
+
+#include "src/Jacobi/Jacobi.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_JACOBI_MODULE_H
+
--- a/examples/ThirdPartyLibs/Eigen/LU
+++ b/examples/ThirdPartyLibs/Eigen/LU
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LU_MODULE_H
+#define EIGEN_LU_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup LU_Module LU module
+  * This module includes %LU decomposition and related notions such as matrix inversion and determinant.
+  * This module defines the following MatrixBase methods:
+  *  - MatrixBase::inverse()
+  *  - MatrixBase::determinant()
+  *
+  * \code
+  * #include <Eigen/LU>
+  * \endcode
+  */
+
+#include "src/misc/Kernel.h"
+#include "src/misc/Image.h"
+#include "src/LU/FullPivLU.h"
+#include "src/LU/PartialPivLU.h"
+#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/LU/PartialPivLU_LAPACKE.h"
+#endif
+#include "src/LU/Determinant.h"
+#include "src/LU/InverseImpl.h"
+
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
+  #include "src/LU/arch/InverseSize4.h"
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_LU_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/OrderingMethods
+++ b/examples/ThirdPartyLibs/Eigen/OrderingMethods
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ORDERINGMETHODS_MODULE_H
+#define EIGEN_ORDERINGMETHODS_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** 
+  * \defgroup OrderingMethods_Module OrderingMethods module
+  *
+  * This module is currently for internal use only
+  * 
+  * It defines various built-in and external ordering methods for sparse matrices. 
+  * They are typically used to reduce the number of elements during 
+  * the sparse matrix decomposition (LLT, LU, QR).
+  * Precisely, in a preprocessing step, a permutation matrix P is computed using 
+  * those ordering methods and applied to the columns of the matrix. 
+  * Using for instance the sparse Cholesky decomposition, it is expected that 
+  * the nonzeros elements in LLT(A*P) will be much smaller than that in LLT(A).
+  * 
+  * 
+  * Usage : 
+  * \code
+  * #include <Eigen/OrderingMethods>
+  * \endcode
+  * 
+  * A simple usage is as a template parameter in the sparse decomposition classes : 
+  * 
+  * \code 
+  * SparseLU<MatrixType, COLAMDOrdering<int> > solver;
+  * \endcode 
+  * 
+  * \code 
+  * SparseQR<MatrixType, COLAMDOrdering<int> > solver;
+  * \endcode
+  * 
+  * It is possible as well to call directly a particular ordering method for your own purpose, 
+  * \code 
+  * AMDOrdering<int> ordering;
+  * PermutationMatrix<Dynamic, Dynamic, int> perm;
+  * SparseMatrix<double> A; 
+  * //Fill the matrix ...
+  * 
+  * ordering(A, perm); // Call AMD
+  * \endcode
+  * 
+  * \note Some of these methods (like AMD or METIS), need the sparsity pattern 
+  * of the input matrix to be symmetric. When the matrix is structurally unsymmetric, 
+  * Eigen computes internally the pattern of \f$A^T*A\f$ before calling the method.
+  * If your matrix is already symmetric (at leat in structure), you can avoid that
+  * by calling the method with a SelfAdjointView type.
+  * 
+  * \code
+  *  // Call the ordering on the pattern of the lower triangular matrix A
+  * ordering(A.selfadjointView<Lower>(), perm);
+  * \endcode
+  */
+
+#include "src/OrderingMethods/Amd.h"
+#include "src/OrderingMethods/Ordering.h"
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ORDERINGMETHODS_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/PaStiXSupport
+++ b/examples/ThirdPartyLibs/Eigen/PaStiXSupport
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PASTIXSUPPORT_MODULE_H
+#define EIGEN_PASTIXSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <pastix_nompi.h>
+#include <pastix.h>
+}
+
+#ifdef complex
+#undef complex
+#endif
+
+/** \ingroup Support_modules
+  * \defgroup PaStiXSupport_Module PaStiXSupport module
+  * 
+  * This module provides an interface to the <a href="http://pastix.gforge.inria.fr/">PaSTiX</a> library.
+  * PaSTiX is a general \b supernodal, \b parallel and \b opensource sparse solver.
+  * It provides the two following main factorization classes:
+  * - class PastixLLT : a supernodal, parallel LLt Cholesky factorization.
+  * - class PastixLDLT: a supernodal, parallel LDLt Cholesky factorization.
+  * - class PastixLU : a supernodal, parallel LU factorization (optimized for a symmetric pattern).
+  * 
+  * \code
+  * #include <Eigen/PaStiXSupport>
+  * \endcode
+  *
+  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+  * This wrapper resuires PaStiX version 5.x compiled without MPI support.
+  * The dependencies depend on how PaSTiX has been compiled.
+  * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
+  *
+  */
+
+#include "src/PaStiXSupport/PaStiXSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_PASTIXSUPPORT_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/PardisoSupport
+++ b/examples/ThirdPartyLibs/Eigen/PardisoSupport
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARDISOSUPPORT_MODULE_H
+#define EIGEN_PARDISOSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+#include <mkl_pardiso.h>
+
+/** \ingroup Support_modules
+  * \defgroup PardisoSupport_Module PardisoSupport module
+  *
+  * This module brings support for the Intel(R) MKL PARDISO direct sparse solvers.
+  *
+  * \code
+  * #include <Eigen/PardisoSupport>
+  * \endcode
+  *
+  * In order to use this module, the MKL headers must be accessible from the include paths, and your binary must be linked to the MKL library and its dependencies.
+  * See this \ref TopicUsingIntelMKL "page" for more information on MKL-Eigen integration.
+  * 
+  */
+
+#include "src/PardisoSupport/PardisoSupport.h"
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_PARDISOSUPPORT_MODULE_H
--- a/examples/ThirdPartyLibs/Eigen/src/Core/Assign_MKL.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/Assign_MKL.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/Block.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/Block.h
@@ -260,19 +260,19 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    }

    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const
    {
      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
    }

    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
    {
      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
    }

    template<int LoadMode>
-    inline PacketScalar packet(Index index) const
+    EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const
    {
      return m_xpr.template packet<Unaligned>
              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -280,7 +280,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    }

    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val)
    {
      m_xpr.template writePacket<Unaligned>
         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
--- a/examples/ThirdPartyLibs/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/PartialReduxEvaluator.h
@@ -54,12 +54,17 @@ struct packetwise_redux_traits
 /* Value to be returned when size==0 , by default let's return 0 */
 template<typename PacketType,typename Func>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+PacketType packetwise_redux_empty_value(const Func& ) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}

 /* For products the default is 1 */
 template<typename PacketType,typename Scalar>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
+  return pset1<PacketType>(Scalar(1));
+}

 /* Perform the actual reduction */
 template<typename Func, typename Evaluator,
--- a/examples/ThirdPartyLibs/Eigen/src/Core/Stride.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/Stride.h
@@ -38,10 +38,14 @@ namespace Eigen {
  * \include Map_general_stride.cpp
  * Output: \verbinclude Map_general_stride.out
  *
-  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
  * not allowed).
  *
+  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompile==1),
+  * the inner stride is the pointer increment between two consecutive elements,
+  * regardless of storage layout.
+  *
  * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
  */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AVX/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AVX/Complex.h
@@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<fl

 template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
 {
-  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
 }

 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AVX512/Complex.h
@@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<fl

 template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
 {
-  return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
 }

 template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -127,20 +127,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }

-EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std::complex<float>* from1)
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
  Packet4f res0, res1;
 #ifdef __VSX__
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0));
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
 #ifdef _BIG_ENDIAN
  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #else
  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #endif
 #else
-  *reinterpret_cast<std::complex<float> *>(&res0) = *from0;
-  *reinterpret_cast<std::complex<float> *>(&res1) = *from1;
+  *reinterpret_cast<std::complex<float> *>(&res0) = from0;
+  *reinterpret_cast<std::complex<float> *>(&res1) = from1;
  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
 #endif
  return Packet2cf(res0);
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -9,22 +9,8 @@ namespace Eigen {

 namespace internal {

-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
-EIGEN_STRONG_INLINE void gemm_extra_col(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index row,
-  Index col,
-  Index remaining_rows,
-  Index remaining_cols,
-  const Packet& pAlpha);
-
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_extra_row(
+EIGEN_ALWAYS_INLINE void gemm_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
@@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row(
  const Packet& pAlpha,
  const Packet& pMask);

-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_unrolled_col(
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_extra_cols(
  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
+  const Scalar* blockA,
+  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
-  Index& row,
-  Index rows,
+  Index strideB,
+  Index offsetB,
  Index col,
-  Index remaining_cols,
-  const Packet& pAlpha);
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);

 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);

 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_col(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index col,
-  Index remaining_rows,
-  Index remaining_cols,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag);
-
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
@@ -91,123 +64,88 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row(
  const Packet& pMask);

 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
+  const Scalar* blockA,
+  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
-  Index& row,
-  Index rows,
+  Index offsetB,
  Index col,
-  Index remaining_cols,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
  const Packet& pAlphaReal,
-  const Packet& pAlphaImag);
+  const Packet& pAlphaImag,
+  const Packet& pMask);

 template<typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);

-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index col);

-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
-
-template<typename Packet>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);

 template<typename Packet, int N>
 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);

-const static Packet16uc p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
-                                                     16, 17, 18, 19,
-                                                      4,  5,  6,  7,
-                                                     20, 21, 22, 23};
-
-const static Packet16uc p16uc_SETCOMPLEX32_SECOND = {  8,  9, 10, 11,
-                                                      24, 25, 26, 27,
-                                                      12, 13, 14, 15,
-                                                      28, 29, 30, 31};
-//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
-const static Packet16uc p16uc_SETCOMPLEX64_FIRST = {  0,  1,  2,  3,  4,  5,  6,  7,
-                                                     16, 17, 18, 19, 20, 21, 22, 23};
-
-//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
-const static Packet16uc p16uc_SETCOMPLEX64_SECOND = {  8,  9, 10, 11, 12, 13, 14, 15,
-                                                      24, 25, 26, 27, 28, 29, 30, 31};
-
-
 // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
-template<typename Packet, typename Packetc>
-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+template<typename Packet, typename Packetc, int N>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
 {
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST);
-  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST);
+  acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]);
+  if (N > 1) {
+    acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]);
+  }
+  if (N > 2) {
+    acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]);
+  }
+  if (N > 3) {
+    acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]);
+  }

-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
-  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
+  acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]);
+  if (N > 1) {
+    acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]);
+  }
+  if (N > 2) {
+    acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]);
+  }
+  if (N > 3) {
+    acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]);
+  }
 }

-template<typename Packet, typename Packetc>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+template<typename Packet, typename Packetc, int N>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
 {
-  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
+  bcouple_common<Packet, Packetc, N>(taccReal, taccImag, acc1, acc2);

  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
-  acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
-  acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
-  acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
+  if (N > 1) {
+    acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
+  }
+  if (N > 2) {
+    acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
+  }
+  if (N > 3) {
+    acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
+  }

-  acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
-  acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
-  acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
-  acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
-}
-
-template<typename Packet, typename Packetc>
-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
-{
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
-
-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
-}
-
-template<typename Packet, typename Packetc>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
-{
-  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
-
-  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
-
-  acc2.packet[0] = padd<Packetc>(tRes.packet[1], acc2.packet[0]);
-}
-
-template<>
-EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
-{
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST);
-  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST);
-
-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
-  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
-}
-
-template<>
-EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd, 1>& acc1, PacketBlock<Packet1cd, 1>& acc2)
-{
-  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
-
-  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
+  acc2.packet[0] = padd<Packetc>(tRes.packet[0+N], acc2.packet[0]);
+  if (N > 1) {
+    acc2.packet[1] = padd<Packetc>(tRes.packet[1+N], acc2.packet[1]);
+  }
+  if (N > 2) {
+    acc2.packet[2] = padd<Packetc>(tRes.packet[2+N], acc2.packet[2]);
+  }
+  if (N > 3) {
+    acc2.packet[3] = padd<Packetc>(tRes.packet[3+N], acc2.packet[3]);
+  }
 }

 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
 #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H

-#pragma GCC target("cpu=power10")
+#pragma GCC target("cpu=power10,htm")

 #ifdef __has_builtin
 #if !__has_builtin(__builtin_vsx_assemble_pair)
@@ -30,37 +30,37 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
 }

 template<typename DataMapper, typename Index, typename Packet, const Index accCols>
-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
 {
  PacketBlock<Packet, 4> result;
  __builtin_mma_disassemble_acc(&result.packet, acc);

  PacketBlock<Packet, 4> tRes;
-  bload<DataMapper, Packet, Index, accCols, 0, ColMajor>(tRes, data, i, j);
+  bload<DataMapper, Packet, Index, accCols, ColMajor, false, 4>(tRes, data, i, 0);

-  bscale<Packet>(tRes, result, alpha);
+  bscale<Packet, 4>(tRes, result, alpha);

-  data.template storePacketBlock<Packet, 4>(i, j, tRes);
+  data.template storePacketBlock<Packet, 4>(i, 0, tRes);
 }

-template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC, int N>
-EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
+template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
 {
  PacketBlock<Packet, 4> resultReal, resultImag;
  __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
  __builtin_mma_disassemble_acc(&resultImag.packet, accImag);

  PacketBlock<Packetc, 8> tRes;
-  bload<DataMapper, Packetc, Index, accColsC, N, ColMajor>(tRes, data, i, j);
+  bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4>(tRes, data, i, 0);

  PacketBlock<Packet,4> taccReal, taccImag;
  bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);

  PacketBlock<Packetc, 4> acc1, acc2;
-  bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
+  bcouple<Packet, Packetc, 4>(taccReal, taccImag, tRes, acc1, acc2);

-  data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
-  data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
+  data.template storePacketBlock<Packetc, 4>(i, 0, acc1);
+  data.template storePacketBlock<Packetc, 4>(i + accColsC, 0, acc2);
 }

 // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
@@ -125,7 +125,7 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag
 template<typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
 {
-  rhsV = ploadRhs<Scalar, Packet>((const Scalar*)(rhs));
+  rhsV = ploadRhs<Scalar, Packet>(rhs);
 } 

 template<>
@@ -184,12 +184,11 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
  }

 #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
-  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
  MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
  MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
  MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
-  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \
-  MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9);
+  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7);

 #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
  type rhsV0; \
@@ -222,7 +221,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)

 #define MICRO_MMA_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
-    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }
@@ -238,21 +237,19 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)

 #define MICRO_MMA_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
-    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
+    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, res, pAlpha, &accZero##iter); \
  }

 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)

 template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
+EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
-  Index offsetA,
  Index& row,
-  Index col,
  const Packet& pAlpha)
 {
  const Scalar* rhs_ptr = rhs_base;
@@ -278,11 +275,84 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
  row += unroll_factor*accCols;
 }

+template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemmMMA_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_MMA_UNROLL 7
+  while(row + MAX_MMA_UNROLL*accCols <= rows) {
+    gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_MMA_UNROLL > 7
+    case 7:
+      gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 6
+    case 6:
+      gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 5
+    case 5:
+      gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 4
+    case 4:
+      gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 3
+    case 3:
+      gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 2
+    case 2:
+      gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+#if MAX_MMA_UNROLL > 1
+    case 1:
+      gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_MMA_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
 template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
 void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
      const Index remaining_rows = rows % accCols;
-      const Index remaining_cols = cols % accRows;

      if( strideA == -1 ) strideA = depth;
      if( strideB == -1 ) strideB = depth;
@@ -293,79 +363,10 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
      Index col = 0;
      for(; col + accRows <= cols; col += accRows)
      {
-        const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
-        const Scalar* lhs_base = blockA;
-
-        Index row = 0;
-#define MAX_MMA_UNROLL 7
-        while(row + MAX_MMA_UNROLL*accCols <= rows) {
-          gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-        }
-        switch( (rows-row)/accCols ) {
-#if MAX_MMA_UNROLL > 7
-          case 7:
-            gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 6
-          case 6:
-            gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 5
-          case 5:
-            gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 4
-          case 4:
-            gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 3
-          case 3:
-            gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 2
-          case 2:
-            gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-#if MAX_MMA_UNROLL > 1
-          case 1:
-            gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
-            break;
-#endif
-          default:
-            break;
-        }
-#undef MAX_MMA_UNROLL
-
-        if(remaining_rows > 0)
-        {
-          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
-        }
+        gemmMMA_cols<Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
      }

-      if(remaining_cols > 0)
-      {
-        const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
-        const Scalar* lhs_base = blockA;
-
-        for(; col < cols; col++)
-        {
-          Index row = 0;
-
-          gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
-
-          if (remaining_rows > 0)
-          {
-            gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
-          }
-          rhs_base++;
-        }
-      }
+      gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
 }

 #define accColsC (accCols / 2)
@@ -373,21 +374,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
 #define advanceCols ((RhsIsReal) ? 1 : 2)

 // PEEL_COMPLEX_MMA loop factor.
-#define PEEL_COMPLEX_MMA 7
+#define PEEL_COMPLEX_MMA 3

 #define MICRO_COMPLEX_MMA_UNROLL(func) \
-  func(0) func(1) func(2) func(3) func(4)
+  func(0) func(1) func(2) func(3)

 #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
-    lhs_ptr_real##iter += accCols; \
    if(!LhsIsReal) { \
-      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
-      lhs_ptr_imag##iter += accCols; \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
+    lhs_ptr_real##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
@@ -400,8 +400,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,

 #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
  if (PEEL_COMPLEX_MMA > peel) { \
-    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
-    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
    ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
    if(!RhsIsReal) { \
      ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
@@ -409,20 +409,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_MMA_UNROLL(func2); \
-    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
-  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
-  type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \
+  type rhsV0, rhsV1, rhsV2, rhsV3; \
+  type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9);
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3);

 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
  type rhsV0, rhsVi0; \
@@ -459,15 +456,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,

 #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
-    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
-    if(!LhsIsReal) { \
-      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
-    } else { \
-      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
-    } \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
-    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
  }

 #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
@@ -475,45 +466,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
 #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
-    if(!LhsIsReal) { \
-      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
-    } \
  }

 #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)

 #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
-    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC, 0>(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
+    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC>(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
  }

 #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)

 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
-  Index offsetA,
  Index strideB,
  Index& row,
-  Index col,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag)
 {
  const Scalar* rhs_ptr_real = rhs_base;
-  const Scalar* rhs_ptr_imag;
+  const Scalar* rhs_ptr_imag = NULL;
+  const Index imag_delta = accCols*strideA;
  if(!RhsIsReal) {
    rhs_ptr_imag = rhs_base + accRows*strideB;
  } else {
    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  }
-  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
-  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
-  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
-  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4;
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
+  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;

  MICRO_COMPLEX_MMA_SRC_PTR
  MICRO_COMPLEX_MMA_DST_PTR
@@ -537,11 +523,70 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
  row += unroll_factor*accCols;
 }

+template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
+  const DataMapper& res,
+  const Scalar* blockA,
+  const Scalar* blockB,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index offsetB,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_COMPLEX_MMA_UNROLL 4
+  while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
+    gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_MMA_UNROLL > 4
+    case 4:
+      gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 3
+    case 3:
+      gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 2
+    case 2:
+      gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 1
+    case 1:
+      gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_COMPLEX_MMA_UNROLL
+
+  if(remaining_rows > 0)
+  {
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
      const Index remaining_rows = rows % accCols;
-      const Index remaining_cols = cols % accRows;

      if( strideA == -1 ) strideA = depth;
      if( strideB == -1 ) strideB = depth;
@@ -556,64 +601,10 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS
      Index col = 0;
      for(; col + accRows <= cols; col += accRows)
      {
-        const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
-        const Scalar* lhs_base = blockA;
-        Index row = 0;
-
-#define MAX_COMPLEX_MMA_UNROLL 4
-        while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
-          gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
-        }
-        switch( (rows-row)/accCols ) {
-#if MAX_COMPLEX_MMA_UNROLL > 4
-          case 4:
-            gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
-            break;
-#endif
-#if MAX_COMPLEX_MMA_UNROLL > 3
-          case 3:
-            gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
-            break;
-#endif
-#if MAX_COMPLEX_MMA_UNROLL > 2
-          case 2:
-            gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
-            break;
-#endif
-#if MAX_COMPLEX_MMA_UNROLL > 1
-          case 1:
-            gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
-            break;
-#endif
-          default:
-            break;
-        }
-#undef MAX_COMPLEX_MMA_UNROLL
-
-        if(remaining_rows > 0)
-        {
-          gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
-        }
+        gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
      }

-      if(remaining_cols > 0)
-      {
-        const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
-        const Scalar* lhs_base = blockA;
-
-        for(; col < cols; col++)
-        {
-          Index row = 0;
-
-          gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
-
-          if (remaining_rows > 0)
-          {
-            gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
-          }
-          rhs_base++;
-        }
-      }
+      gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
 }

 #undef accColsC
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/AltiVec/PacketMath.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/CUDA/Complex.h
@@ -11,13 +11,24 @@
 #ifndef EIGEN_COMPLEX_CUDA_H
 #define EIGEN_COMPLEX_CUDA_H

-// clang-format off
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, GCC and older versions of clang do
 // not treat them as device functions and thus Eigen functors making use of
 // these operators fail to compile. Here, we manually specialize these
 // operators and functors for complex types when building for CUDA to enable
 // their use on-device.
+//
+// NOTES:
+//  - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device,
+//    since they are already specialized in the standard. Using them will result
+//    in silent kernel failures.
+//  - Compiling with MSVC and using +=,-=,*=,/=(std::complex<Scalar>) will lead
+//    to duplicate definition errors, since these are already specialized in
+//    Visual Studio's <complex> header (contrary to the standard).  This is
+//    preferable to removing such definitions, which will lead to silent kernel
+//    failures.
+//  - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior
+//    to the first inclusion of <complex>.

 #if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
    
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/Default/BFloat16.h
@@ -251,12 +251,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const
    output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
    return output;
  }
-  const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
-#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-  output.value = p[0];
-#else
-  output.value = p[1];
-#endif
+  output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
  return output;
 }

@@ -462,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
-    float result = 0;
-    unsigned short* q = reinterpret_cast<unsigned short*>(&result);
-#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    q[0] = h.value;
-#else
-    q[1] = h.value;
-#endif
-    return result;
+    return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
 }
 // --- standard functions ---

--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -493,9 +493,10 @@ ptranspose(PacketBlock<double2,2>& kernel) {

 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)

-// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
-// its corresponding packet_traits<Eigen::half> must be visible on host.
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
+// on device. There is no benefit to using them on the host anyways, since they are
+// emulated.
+#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)

 typedef ulonglong2 Packet4h2;
 template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
@@ -526,42 +527,9 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
  };
 };

-namespace {
-// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-  return __halves2half2(a, b);
-#else
-  // Round-about way since __halves2half2 is a __device__ function.
-  return __floats2half2_rn(__half2float(a), __half2float(b));
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-  return __low2half(a);
-#else
-  return __float2half(__low2float(a));
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-  return __high2half(a);
-#else
-  return __float2half(__high2float(a));
-#endif
-}
-} // namespace
-
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __half2half2(from);
-#else
-  const float f = __half2float(from);
-  return __floats2half2_rn(f, f);
-#endif
 }

 template <>
@@ -576,8 +544,6 @@ pset1<Packet4h2>(const Eigen::half& from) {
  return r;
 }

-// We now need this visible on both host and device.
-// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
 namespace {

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
@@ -585,11 +551,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
-  return combine_half(from[0], from[1]);
+  return __halves2half2(from[0], from[1]);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
-  return combine_half(from[0], from[0]);
+  return __halves2half2(from[0], from[0]);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
@@ -599,8 +565,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
                                                   const half2& from) {
-  to[0] = get_half2_low(from);
-  to[1] = get_half2_high(from);
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
 }


@@ -610,7 +576,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
  // Input is guaranteed to be properly aligned.
  return __ldg(reinterpret_cast<const half2*>(from));
 #else
-  return combine_half(*(from+0), *(from+1));
+  return __halves2half2(*(from+0), *(from+1));
 #endif
 }

@@ -619,31 +585,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
 #if defined(EIGEN_GPU_HAS_LDG)
  return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
-  return combine_half(*(from+0), *(from+1));
+  return __halves2half2(*(from+0), *(from+1));
 #endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
                                                    Index stride) {
-  return combine_half(from[0*stride], from[1*stride]);
+  return __halves2half2(from[0*stride], from[1*stride]);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
    Eigen::half* to, const half2& from, Index stride) {
-  to[stride*0] = get_half2_low(from);
-  to[stride*1] = get_half2_high(from);
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
-  return get_half2_low(a);
+  return __low2half(a);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
-  return combine_half(result1, result2);
+  return __halves2half2(result1, result2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
@@ -658,12 +624,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
-  __half a1 = get_half2_low(kernel.packet[0]);
-  __half a2 = get_half2_high(kernel.packet[0]);
-  __half b1 = get_half2_low(kernel.packet[1]);
-  __half b2 = get_half2_high(kernel.packet[1]);
-  kernel.packet[0] = combine_half(a1, b1);
-  kernel.packet[1] = combine_half(a2, b2);
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
@@ -671,88 +637,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
  float f = __half2float(a) + 1.0f;
-  return combine_half(a, __float2half(f));
+  return __halves2half2(a, __float2half(f));
 #endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
                                                    const half2& a,
                                                    const half2& b) {
-  half mask_low = get_half2_low(mask);
-  half mask_high = get_half2_high(mask);
-  half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
-  half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
-  return combine_half(result_low, result_high);
+  half mask_low = __low2half(mask);
+  half mask_high = __high2half(mask);
+  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
+  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
+  return __halves2half2(result_low, result_high);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
                                                    const half2& b) {
  half true_half = half_impl::raw_uint16_to_half(0xffffu);
  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
-  return combine_half(eq1, eq2);
+  return __halves2half2(eq1, eq2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
                                                    const half2& b) {
  half true_half = half_impl::raw_uint16_to_half(0xffffu);
  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
-  return combine_half(eq1, eq2);
+  return __halves2half2(eq1, eq2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
                                                 const half2& b) {
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
-  return combine_half(result1, result2);
+  return __halves2half2(result1, result2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
                                                const half2& b) {
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
-  return combine_half(result1, result2);
+  return __halves2half2(result1, result2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
                                                 const half2& b) {
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
-  return combine_half(result1, result2);
+  return __halves2half2(result1, result2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
                                                    const half2& b) {
-  half a1 = get_half2_low(a);
-  half a2 = get_half2_high(a);
-  half b1 = get_half2_low(b);
-  half b2 = get_half2_high(b);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
-  return combine_half(result1, result2);
+  return __halves2half2(result1, result2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
@@ -851,9 +817,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
-  __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
-  return combine_half(r1, r2);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
@@ -862,9 +828,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
-  __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
-  return combine_half(r1, r2);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
@@ -885,7 +851,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
 #endif
 }

@@ -897,7 +863,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
 #endif
 }

@@ -1068,10 +1034,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
  Packet4h2 r;
  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
-  p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
-  p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
-  p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
+  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
+  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
+  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
+  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
  return r;
 }

@@ -1152,12 +1118,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose_half(half2& f0, half2& f1) {
-  __half a1 = get_half2_low(f0);
-  __half a2 = get_half2_high(f0);
-  __half b1 = get_half2_low(f1);
-  __half b2 = get_half2_high(f1);
-  f0 = combine_half(a1, b1);
-  f1 = combine_half(a2, b2);
+  __half a1 = __low2half(f0);
+  __half a2 = __high2half(f0);
+  __half b1 = __low2half(f1);
+  __half b2 = __high2half(f1);
+  f0 = __halves2half2(a1, b1);
+  f1 = __halves2half2(a2, b2);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
@@ -1254,10 +1220,10 @@ plset<Packet4h2>(const Eigen::half& a) {
  float f = __half2float(a);
  Packet4h2 r;
  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = combine_half(a, __float2half(f + 1.0f));
-  p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
+  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
+  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
+  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
+  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
  return r;
 #endif
 }
@@ -1477,9 +1443,9 @@ template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
    const Packet4h2& a) {
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = combine_half(predux_max(a_alias[0]),
+  half2 m0 = __halves2half2(predux_max(a_alias[0]),
                            predux_max(a_alias[1]));
-  half2 m1 = combine_half(predux_max(a_alias[2]),
+  half2 m1 = __halves2half2(predux_max(a_alias[2]),
                            predux_max(a_alias[3]));
  __half first  = predux_max(m0);
  __half second = predux_max(m1);
@@ -1496,9 +1462,9 @@ template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
    const Packet4h2& a) {
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = combine_half(predux_min(a_alias[0]),
+  half2 m0 = __halves2half2(predux_min(a_alias[0]),
                            predux_min(a_alias[1]));
-  half2 m1 = combine_half(predux_min(a_alias[2]),
+  half2 m1 = __halves2half2(predux_min(a_alias[2]),
                            predux_min(a_alias[3]));
  __half first  = predux_min(m0);
  __half second = predux_min(m1);
@@ -1652,9 +1618,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
-  __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
-  return combine_half(r1, r2);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
 }

 template<>
@@ -1664,14 +1630,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
-  __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
-  return combine_half(r1, r2);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
 }

-// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
-
-#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)

 #undef EIGEN_GPU_HAS_LDG
 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -15,8 +15,7 @@ namespace Eigen {
 namespace internal {

 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))

 template <>
 struct type_casting_traits<Eigen::half, float> {
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -155,7 +155,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif EIGEN_ARCH_ARM32
+#elif EIGEN_ARCH_ARM
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
  // by default no explicit prefetching
@@ -3918,8 +3918,6 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {

 template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }

-#endif // EIGEN_ARCH_ARM64
-
 // Do we have an fp16 types and supporting Neon intrinsics?
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 typedef float16x4_t Packet4hf;
@@ -4580,6 +4578,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>&
 }
 #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC

+#endif // EIGEN_ARCH_ARM64
+
 } // end namespace internal

 } // end namespace Eigen
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/SSE/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/SSE/Complex.h
@@ -106,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<fl

 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
-  Packet2cf res;
-#ifdef EIGEN_VECTORIZE_SSE3
-  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
-#else
-  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
-  res.v = _mm_movelh_ps(res.v, res.v);
-#endif
-  return res;
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet2cf(_mm_set_ps(im, re, im, re));
 }

 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/SSE/PacketMath.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/ZVector/Complex.h
@@ -91,8 +91,18 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float>  type;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+};

 /* Forward declaration */
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
@@ -150,7 +160,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res;
+  EIGEN_ALIGN16 std::complex<double> res;
  pstore<std::complex<double> >(&res, a);

  return res;
@@ -195,7 +205,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
  pstore<std::complex<float> >(res, a);

  return res[0];
@@ -225,14 +235,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo

 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
--- a/examples/ThirdPartyLibs/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -91,8 +91,8 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);

 static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO_ = { numext::bit_cast<double>0x8000000000000000ull),
-                              numext::bit_cast<double>0x8000000000000000ull) };
+static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
+                              numext::bit_cast<double>(0x8000000000000000ull) };

 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
@@ -358,7 +358,7 @@ pbroadcast4<Packet2d>(const double *a,

 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 int ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
@@ -368,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f

 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
 return pload<Packet2d>(af);
@@ -376,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const dou

 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 int ai[4];
  pstore<int>((int *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
@@ -386,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const

 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -460,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int    x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 {
@@ -639,7 +639,7 @@ pbroadcast4<Packet4f>(const float *a,

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 float ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
@@ -649,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa

 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  float EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 float ai[4];
  pstore<float>((float *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
@@ -785,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
  return p;
 }

-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -943,7 +943,7 @@ pbroadcast4<Packet4f>(const float *a,

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
+  EIGEN_ALIGN16 float af[4];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  af[2] = from[2*stride];
@@ -953,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa

 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
+  EIGEN_ALIGN16 float af[4];
  pstore<float>((float*)af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -978,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { r
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
 {
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/BlasUtil.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/BlasUtil.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/ConfigureVectorization.h
@@ -438,11 +438,11 @@
  #include <arm_fp16.h>
 #endif

-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380))
+#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
  // We can use the optimized fp16 to float and float to fp16 conversion routines
  #define EIGEN_HAS_FP16_C

-  #if defined(EIGEN_COMP_CLANG)
+  #if EIGEN_COMP_CLANG
    // Workaround for clang: The FP16C intrinsics for clang are included by
    // immintrin.h, as opposed to emmintrin.h as suggested by Intel:
    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -4,6 +4,7 @@
 #ifdef _MSC_VER
  // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
  // 4101 - unreferenced local variable
+  // 4127 - conditional expression is constant
  // 4181 - qualifier applied to reference type ignored
  // 4211 - nonstandard extension used : redefined extern to static
  // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
@@ -19,7 +20,7 @@
  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
    #pragma warning( push )
  #endif
-  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)

 #elif defined __INTEL_COMPILER
  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -93,6 +94,13 @@
  #pragma diag_suppress 2735
  #pragma diag_suppress 2737
  #pragma diag_suppress 2739
+  #pragma diag_suppress 2976
+  #pragma diag_suppress 2979
+  // Disable the "// __device__ annotation is ignored on a function(...) that is
+  //              explicitly defaulted on its first declaration" message.
+  // The __device__ annotation seems to actually be needed in some cases,
+  // otherwise resulting in kernel runtime errors.
+  #pragma diag_suppress 2977
 #endif

 #else
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/IntegralConstant.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/IntegralConstant.h
@@ -138,7 +138,7 @@ template<int N,int Default> struct get_fixed_value<FixedInt<N>,Default> {
  static const int value = N;
 };

-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 template<int N,int Default> struct get_fixed_value<FixedInt<N> (*)(),Default> {
  static const int value = N;
 };
@@ -154,7 +154,7 @@ struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
 };

 template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
 #endif

@@ -166,7 +166,7 @@ template<typename T, int DynamicKey=Dynamic, typename EnableIf=void> struct clea
 // Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
 template<typename T, int DynamicKey> struct cleanup_index_type<T,DynamicKey,typename internal::enable_if<internal::is_integral<T>::value>::type> { typedef Index type; };

-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 // In c++98/c++11, fix<N> is a pointer to function that we better cleanup to a true FixedInt<N>:
 template<int N, int DynamicKey> struct cleanup_index_type<FixedInt<N> (*)(), DynamicKey> { typedef FixedInt<N> type; };
 #endif
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/MKL_support.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/MKL_support.h
--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/Macros.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/Macros.h
@@ -1131,7 +1131,16 @@ namespace Eigen {
      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
    #elif EIGEN_ARCH_ARM_OR_ARM64
      // General, NEON.
-      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+      // Clang doesn't like "r",
+      //    error: non-trivial scalar-to-vector conversion, possible invalid
+      //           constraint for vector type
+      // GCC < 5 doesn't like "g",
+      //    error: 'asm' operand requires impossible reload
+      #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0)
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,w" (X));
+      #else
+        #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+      #endif
    #elif EIGEN_ARCH_i386_OR_x86_64
      // General, SSE.
      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
@@ -1216,7 +1225,7 @@ namespace Eigen {
 * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
 */
 #if EIGEN_HAS_CXX11
-#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default;
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
 #else
 #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
 #endif
@@ -1241,12 +1250,12 @@ namespace Eigen {
 */
 #if EIGEN_HAS_CXX11
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
-    Derived() = default; \
-    ~Derived() = default;
+    EIGEN_DEVICE_FUNC Derived() = default; \
+    EIGEN_DEVICE_FUNC ~Derived() = default;
 #else
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
-    Derived() {}; \
-    /* ~Derived() {}; */
+    EIGEN_DEVICE_FUNC Derived() {}; \
+    /* EIGEN_DEVICE_FUNC ~Derived() {}; */
 #endif


--- a/examples/ThirdPartyLibs/Eigen/src/Core/util/Meta.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Core/util/Meta.h
--- a/examples/ThirdPartyLibs/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/examples/ThirdPartyLibs/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -440,9 +440,8 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal
 template<typename MatrixType, int Size, bool IsComplex>
 struct tridiagonalization_inplace_selector
 {
-  typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
  typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  template<typename DiagonalType, typename SubDiagonalType>
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
  static EIGEN_DEVICE_FUNC
      void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ)
  {
--- a/examples/ThirdPartyLibs/Eigen/src/SVD/BDCSVD.h
+++ b/examples/ThirdPartyLibs/Eigen/src/SVD/BDCSVD.h
@@ -27,6 +27,10 @@
 #define eigen_internal_assert(X) assert(X);
 #endif

+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+#include <iostream>
+#endif
+
 namespace Eigen {

 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -172,7 +176,7 @@ public:

  void setSwitchSize(int s) 
  {
-    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
+    eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3.");
    m_algoswap = s;
  }
 
@@ -404,7 +408,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
 //@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
 // lastCol + 1 - firstCol is the size of the submatrix.
 //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
-//@param firstRowW : Same as firstRowW with the column.
+//@param firstColW : Same as firstRowW with the column.
 //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
@@ -899,7 +903,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
      eigen_internal_assert(fLeft<Literal(0));

-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
+#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
      RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
 #endif

@@ -1242,8 +1246,8 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
 #endif
  {
    // Check for total deflation
-    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
-    bool total_deflation = (col0.tail(length-1).array()<considerZero).all();
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+    const bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all();
    
    // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
    // First, compute the respective permutation.
--- a/examples/ThirdPartyLibs/Eigen/src/misc/lapacke.h
+++ b/examples/ThirdPartyLibs/Eigen/src/misc/lapacke.h
--- a/examples/ThirdPartyLibs/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/examples/ThirdPartyLibs/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -30,15 +30,40 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
  *
  * \sa max()
  */
-EIGEN_MAKE_CWISE_BINARY_OP(min,min)
+template <int NaNPropagation, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const OtherDerived &other) const
+{
+  return (min<PropagateFast>)(other);
+}

 /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
  *
  * \sa max()
  */
+template <int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived,
+    const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 min
 #else
@@ -46,7 +71,20 @@ min
 #endif
 (const Scalar &other) const
 {
-  return (min)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (min<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived,
+    const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const Scalar &other) const
+{
+  return (min<PropagateFast>)(Derived::PlainObject::Constant(rows(), cols(), other));
 }

 /** \returns an expression of the coefficient-wise max of \c *this and \a other
@@ -56,14 +94,39 @@ min
  *
  * \sa min()
  */
-EIGEN_MAKE_CWISE_BINARY_OP(max,max)
+template <int NaNPropagation, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const OtherDerived &other) const
+{
+  return (max<PropagateFast>)(other);
+}

 /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
  *
  * \sa min()
  */
+template <int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived,
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived,
                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 max
@@ -72,7 +135,20 @@ max
 #endif
 (const Scalar &other) const
 {
-  return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (max<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const Scalar &other) const
+{
+  return (max<PropagateFast>)(Derived::PlainObject::Constant(rows(), cols(), other));
 }

 /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
--- a/examples/ThirdPartyLibs/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/examples/ThirdPartyLibs/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -72,23 +72,39 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
  *
  * \sa class CwiseBinaryOp, max()
  */
-template<typename OtherDerived>
+template<int NaNPropagation, typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
 cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return cwiseMin<PropagateFast>(other);
 }

 /** \returns an expression of the coefficient-wise min of *this and scalar \a other
  *
  * \sa class CwiseBinaryOp, min()
  */
+template<int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const ConstantReturnType>
 cwiseMin(const Scalar &other) const
 {
-  return cwiseMin(Derived::Constant(rows(), cols(), other));
+  return cwiseMin<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,PropagateFast>, const Derived, const ConstantReturnType>
+cwiseMin(const Scalar &other) const
+{
+  return cwiseMin<PropagateFast>(Derived::Constant(rows(), cols(), other));
 }

 /** \returns an expression of the coefficient-wise max of *this and \a other
@@ -98,23 +114,39 @@ cwiseMin(const Scalar &other) const
  *
  * \sa class CwiseBinaryOp, min()
  */
-template<typename OtherDerived>
+template<int NaNPropagation, typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
 cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
+
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const OtherDerived>
+cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return cwiseMax<PropagateFast>(other);
 }

 /** \returns an expression of the coefficient-wise max of *this and scalar \a other
  *
  * \sa class CwiseBinaryOp, min()
  */
+template<int NaNPropagation>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const ConstantReturnType>
 cwiseMax(const Scalar &other) const
 {
-  return cwiseMax(Derived::Constant(rows(), cols(), other));
+  return cwiseMax<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,PropagateFast>, const Derived, const ConstantReturnType>
+cwiseMax(const Scalar &other) const
+{
+  return cwiseMax<PropagateFast>(Derived::Constant(rows(), cols(), other));
 }